def vis_att(pages_idx, query, alpha, wiki, vocab, idx):
    rows = [prm.root_page.title()]
    for pageidx in pages_idx[:-1]:
        if pageidx != -1:
            rows.append(wiki.get_article_title(pageidx).decode('utf-8', 'ignore').title())
        else:
            break
    #rows.append('Stop')
    rows = rows[::-1]

    columns = []
    for word in wordpunct_tokenize(query):
        if word.lower() in vocab:
            columns.append(str(word))
    columns = columns[:prm.max_words_query * prm.n_consec]

    alpha = alpha[:len(rows), :len(columns)]
    alpha = alpha[::-1]

    fig, ax = plt.subplots(figsize=(27, 10))

    # Advanced color controls.
    norm = matplotlib.colors.Normalize(0, 1)
    im = ax.pcolor(alpha, cmap=plt.cm.gray, edgecolors='w', norm=norm)
    fig.colorbar(im)

    ax.set_xticks(np.arange(0, len(columns)) + 0.5)
    ax.set_yticks(np.arange(0, len(rows)) + 0.5)
    ax.tick_params(axis='x', which='minor', pad=15)

    # Position the tick labels for the x and y axes.
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()
    ax.axis('tight')  # correct a pyplot bug that adds extra white columns.
    plt.xticks(rotation=90)
    fig.subplots_adjust(bottom=0.2)
    fig.subplots_adjust(left=0.2)

    # Labels for each cell along both axes.
    ax.set_xticklabels(columns, minor=False, fontsize=18)
    ax.set_yticklabels(rows, minor=False, fontsize=18)

    plt.savefig('vis' + str(idx) + '.svg')
    plt.close()
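The snippet above relies on several module-level imports plus a project-wide parameter module referred to as prm. A plausible preamble (an assumption for illustration; the parameter module name is hypothetical) would be:

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from nltk.tokenize import wordpunct_tokenize
import parameters as prm  # hypothetical module providing root_page, max_words_query, n_consec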
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)

    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i, :len(bow[0])] = bow[0]
        mask[i, :len(bow[1])] = bow[1]

    return out, mask
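BOW2 delegates to a BOW helper that is not part of this listing. Judging from how its return value is used (bow[0] written into an int32 array, bow[1] into a float32 mask), a minimal sketch of a compatible helper, offered as an assumption only, could be:

def BOW(words, vocab):
    # Sketch: count in-vocabulary words and return (index array, count array).
    counts = {}
    for word in words:
        if word in vocab:
            counts[vocab[word]] = counts.get(vocab[word], 0.) + 1.
    return (np.asarray(list(counts.keys()), dtype=np.int32),
            np.asarray(list(counts.values()), dtype=np.float32))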
def Word2Vec_encode(texts, wemb):
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)

    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i, :] += wemb[word]
                n += 1.
        out[i, :] /= max(1., n)

    return out
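Word2Vec_encode averages the embeddings of the known words in each text. A small usage sketch (the embedding dict is a toy assumption):

wemb = {'cats': np.ones(prm.dim_emb), 'dogs': np.zeros(prm.dim_emb)}  # toy embeddings
vecs = Word2Vec_encode(['cats and dogs', 'unknown words only'], wemb)
# vecs[0] is the mean of the 'cats' and 'dogs' vectors; vecs[1] stays all zeros.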
def _generate_phrases(self, sentences):
    """Method to generate contender phrases given the sentences of the text
    document.

    :param sentences: List of strings where each string represents a
                      sentence which forms the text.
    :return: Set of string tuples where each tuple is a collection of
             words forming a contender phrase.
    """
    phrase_list = set()
    # Create contender phrases from sentences.
    for sentence in sentences:
        word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
        phrase_list.update(self._get_phrase_list_from_words(word_list))
    return phrase_list
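The helper _get_phrase_list_from_words is not shown here. In RAKE-style keyword extraction it typically splits the token list at stopwords and punctuation; a rough sketch of that behavior (an assumption, including the self.stopwords attribute) might look like:

def _get_phrase_list_from_words(self, word_list):
    # Sketch: break the token sequence into candidate phrases at stopwords/punctuation.
    phrases, current = set(), []
    for word in word_list:
        if word in self.stopwords or not word.isalnum():
            if current:
                phrases.add(tuple(current))
            current = []
        else:
            current.append(word)
    if current:
        phrases.add(tuple(current))
    return phrases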
def _on_start(self, utterance):
    # Do all on-start work:
    # clear the chart data structures and the agenda.
    self.agenda.clear()
    tokenized_utterance = tokenizer(utterance)
    self.utter_len = self.settings.utter_len = len(tokenized_utterance)
    self.left_buckets = [set() for _ in xrange(self.utter_len + 1)]
    self.right_buckets = [set() for _ in xrange(self.utter_len + 1)]
    self.initialize_agenda(tokenized_utterance)
    # Buckets are indexed by dot positions, so there are utter_len + 1 of them.
    # self._print_buckets()
def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
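This is the kind of block reader used with NLTK's stream-backed corpus views. A quick standalone check with an in-memory stream (purely illustrative):

from io import StringIO
stream = StringIO(u'The quick brown fox.\nJumps over the lazy dog!\n')
print(read_wordpunct_block(stream))
# ['The', 'quick', 'brown', 'fox', '.', 'Jumps', 'over', 'the', 'lazy', 'dog', '!']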
def score(self, sentence):
    # Track both positive and negative scores for the sentence.
    pos_score, neg_score = 0., 0.
    # Assuming no contextual forms are used for Arabic.
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    tokens = tokenize(sentence.lower())
    term_count = 0
    # Nested while loops are used here to accommodate early termination of the
    # inner loop and to advance the outer index by the number of tokens
    # consumed by the matched sub-phrase.
    i = 0
    while i < len(tokens):
        matched = False
        j = min(self.max_len, len(tokens) - i)
        # Check phrase lengths up to `max_len`.
        while j > 0 and (i + j) <= len(tokens):
            sub_tokens = tokens[i:i + j]
            sub_word = ' '.join(sub_tokens)
            # If a match exists for the phrase, update scores and counts.
            if sub_word in self.lookup:
                sub_word_scores = self.lookup[sub_word]
                pos_score += sub_word_scores[0]
                neg_score += sub_word_scores[1]
                term_count += 1
                matched = True
                i += j
                break
            j -= 1
        # If nothing matched, skip this token.
        if not matched:
            i += 1

    # If no terms matched, or the scores are equal, return a neutral score.
    if pos_score == neg_score:
        return 0.5
    # If the sentence is more positive than negative, use the positive word sense.
    elif pos_score > neg_score:
        return 0.5 + pos_score / term_count / 2
    # If the sentence is more negative than positive, use the negative word sense.
    else:
        return 0.5 - neg_score / term_count / 2
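A minimal harness to exercise score() outside its original class; the lookup table, max_len, and the ensure_package_path stub below are assumptions for illustration only:

def ensure_package_path():
    pass  # stub standing in for the project's helper

class _LexiconScorer(object):
    score = score  # reuse the method defined above

    def __init__(self, lookup, max_len):
        self.lookup = lookup    # phrase -> (positive weight, negative weight)
        self.max_len = max_len  # longest phrase length, in tokens

scorer = _LexiconScorer({'very good': (0.9, 0.0), 'bad': (0.0, 0.7)}, max_len=2)
print(scorer.score('The food was very good'))  # 0.95, i.e. net positive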
def create_keyword_regex(keyword):
    print 'create_keyword_regex'
    # import nltk
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    print 'tokenize ==> %s' % (keyword)
    tokens = tokenize(keyword)
    pattern = '\\s+'.join(tokens)
    pattern = '\\b%s\\b' % pattern
    print 'compile pattern ==> %s' % (pattern)
    return re.compile(pattern, re.I | re.UNICODE)
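The resulting regex matches the keyword case-insensitively with arbitrary whitespace between its tokens. For example (assuming ensure_package_path is available in the surrounding module):

regex = create_keyword_regex('machine learning')
# regex.pattern == '\\bmachine\\s+learning\\b'
print(bool(regex.search('Machine   Learning rocks')))  # True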
def tokenize(text, filter_stopwords=False, lowercase=True):
    words = wordpunct_tokenize(text)
    if filter_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return words
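Note that the lowercase flag is accepted but unused in this snippet, and STOPWORDS is expected at module level. A quick illustration with a toy stopword set:

STOPWORDS = set(['the', 'is', 'on'])  # toy stand-in for the module's real stopword list
print(tokenize('The cat is on the mat.', filter_stopwords=True))
# ['The', 'cat', 'mat', '.']  -- 'The' survives because matching is case-sensitive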
def text2idx2(texts, vocab, dim, use_mask=False):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    if use_mask:
        out = -np.ones((len(texts), dim), dtype=np.int32)
        mask = np.zeros((len(texts), dim), dtype=np.float32)
    else:
        out = -2 * np.ones((len(texts), dim), dtype=np.int32)

    out_lst = []
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)[:dim]
        for j, word in enumerate(words):
            if word in vocab:
                out[i, j] = vocab[word]
            else:
                out[i, j] = -1  # Unknown words
        out_lst.append(words)
        if use_mask:
            mask[i, :j] = 1.

    if use_mask:
        return out, mask, out_lst
    else:
        return out, out_lst
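A short usage example with a toy vocabulary (padding cells stay at -2, unknown words become -1):

vocab = {'the': 0, 'cat': 1, 'sat': 2}
out, words = text2idx2(['the cat sat', 'the dog sat'], vocab, 4)
# out[0] -> [0, 1, 2, -2]    out[1] -> [0, -1, 2, -2]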
def get_syllables(sonnet):
    from nltk.tokenize import wordpunct_tokenize
    tokens = [wordpunct_tokenize(s) for s in sonnet]
    punct = set(['.', ',', '!', ':', ';'])
    filtered = [[w for w in sentence if w not in punct] for sentence in tokens]
    last = [sentence[len(sentence) - 1] for sentence in filtered]
    syllables = [[(word, len(pron), pron) for (word, pron) in cmu_dict if word == w] for w in last]
    return syllables
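The snippet iterates over a module-level cmu_dict of (word, pronunciation) pairs. One plausible way to build it, not shown in the original, is from NLTK's CMU Pronouncing Dictionary:

from nltk.corpus import cmudict   # requires nltk.download('cmudict')
cmu_dict = cmudict.entries()      # list of (word, [phonemes]) pairs
syllables = get_syllables(['Shall I compare thee to a summer day'])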
def compute_idx(pages_path_in, pages_path_out, vocab):

    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    os.remove(pages_path_out) if os.path.exists(pages_path_out) else None

    # Save to HDF5.
    fout = h5py.File(pages_path_out, 'a')

    if prm.att_doc:
        shape = (f['text'].shape[0], prm.max_segs_doc, prm.max_words)
    else:
        shape = (f['text'].shape[0], prm.max_words)

    idxs = fout.create_dataset('idx', shape=shape, dtype=np.int32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type.lower() == 'section' or prm.att_segment_type.lower() == 'subsection':
                segs = ['']
                for line in text.split('\n'):
                    if prm.att_segment_type == 'section':
                        line = line.replace('===', '')
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line.lower() + '\n'
            elif prm.att_segment_type.lower() == 'sentence':
                segs = tokenizer.tokenize(text.lower().decode('ascii', 'ignore'))
            elif prm.att_segment_type.lower() == 'word':
                segs = wordpunct_tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter. Valid options are "section", "subsection", "sentence", or "word".')

            segs = segs[:prm.max_segs_doc]
            idxs_, _ = utils.text2idx2(segs, vocab, prm.max_words)
            idxs[i, :len(idxs_), :] = idxs_
            mask[i] = len(idxs_)
        else:
            idx, _ = utils.text2idx2([text.lower()], vocab, prm.max_words)
            idxs[i, :] = idx[0]

        i += 1
        #if i > 3000:
        #    break
        print 'processing article', i, 'time', time.time() - st

    f.close()
    fout.close()
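A quick sketch of reading back the datasets written above (names match the create_dataset calls; shapes depend on prm.att_doc):

fin = h5py.File(pages_path_out, 'r')
idxs = fin['idx'][:]   # (n_articles, max_segs_doc, max_words) with attention, else (n_articles, max_words)
mask = fin['mask'][:]  # per-article segment count, filled only when prm.att_doc is set
fin.close()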
def get_candidates(qatp):

    print 'loading data...'
    idf = pkl.load(open(prm.idf_path, "rb"))
    wk = wiki.Wiki(prm.pages_path)

    print 'creating vocabulary...'
    vocab = {}
    for q, _, _, _ in qatp:
        words = wordpunct_tokenize(q.lower())
        for word in words:
            if word in idf:
                vocab[word] = {}

    print 'creating inverted index...'
    i = 0
    for text in wk.get_text_iter():
        if i % 10000 == 0:
            print 'article', i
        words = wordpunct_tokenize(text.lower())
        for word in words:
            if word in vocab:
                vocab[word][i] = 0
        #if i > 500000:
        #    break
        i += 1

    print 'selecting pages...'
    candidates = []
    for i, [q, _, _, _] in enumerate(qatp):
        st = time.time()
        words = wordpunct_tokenize(q.lower())
        scores = {}
        for word in words:
            if word in vocab:
                if len(vocab[word]) < 100000:
                    for pageid in vocab[word].keys():
                        if pageid not in scores:
                            scores[pageid] = 0.
                        scores[pageid] += idf[word]
        idxs = np.argsort(np.asarray(scores.values()))[::-1]
        pages = scores.keys()

        if len(idxs) == 0:
            print 'error question:', q

        c = OrderedDict()
        for idx in idxs[:prm.max_candidates]:
            c[pages[idx]] = 0
        candidates.append(c)
        print 'sample ' + str(i) + ' time ' + str(time.time() - st)
        #if i > 10000:
        #    break

    return candidates
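get_candidates only runs inside its original project, since it loads prm.idf_path and the Wiki dump, but the expected interface can be sketched: qatp is assumed to be an iterable of 4-tuples whose first element is the query text, and the result is one OrderedDict of candidate page ids per query.

qatp = [('who wrote on the origin of species', None, None, None)]  # hypothetical input
candidates = get_candidates(qatp)
# candidates[0] holds up to prm.max_candidates page ids, ranked by summed IDF of matching query words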