def keyword_extractor(data):
    try:
        #np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        #result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))
        # TODO: this is duplicated work and should be improved
        words = word_tokenize(strip_tags(text))
        tagged = pos_tag(words)
        cleaned = filter_insignificant(tagged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []
    return result
# TODO: could definitely be better if we knew where the content is
Python word_tokenize() usage examples
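Before the project-specific snippets, here is a minimal, self-contained sketch of word_tokenize itself (it assumes nltk is installed and the punkt tokenizer data has been downloaded):

from nltk.tokenize import sent_tokenize, word_tokenize

text = "NLTK ships a sentence splitter and a word tokenizer. They work well together."
for sentence in sent_tokenize(text):
    print(word_tokenize(sentence))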
def create_batch(self, sentence_li):
    """Create a batch for a list of sentences."""
    embeddings_batch = []
    for sen in sentence_li:
        embeddings = []
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            embeddings.append(self.embdict.tok2emb.get(tok))
        if len(tokens) < self.max_sequence_length:
            pads = [np.zeros(self.embedding_dim) for _ in range(self.max_sequence_length - len(tokens))]
            embeddings = pads + embeddings
        else:
            embeddings = embeddings[-self.max_sequence_length:]
        embeddings = np.asarray(embeddings)
        embeddings_batch.append(embeddings)
    embeddings_batch = np.asarray(embeddings_batch)
    return embeddings_batch
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def extractFeatures(self, article, n, customStopWords=None):
    # article is passed in as a (text, title) tuple
    text = article[0]    # extract the text
    title = article[1]   # extract the title
    sentences = sent_tokenize(text)                            # split the text into sentences
    word_sent = [word_tokenize(s.lower()) for s in sentences]  # split sentences into words
    # calculate word frequencies using the member function defined above
    self._freq = self._compute_frequencies(word_sent, customStopWords)
    if n < 0:
        # a negative n means no feature (word) selection: return all features
        return nlargest(len(self._freq.keys()),
                        self._freq, key=self._freq.get)
    else:
        # otherwise the caller asked for a subset, so return only the n largest
        # features, i.e. the most important words (important == frequent, minus stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def summarize(self, article, n):
    text = article[0]
    title = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]
##############################################################################
# TEST
def similarity(c1, c2):
    '''Stop words are words like "it" and "the" that carry little meaning on
    their own, so they are removed before comparing the two texts.'''
    stop_words = list(stopwords.words("english"))
    # Remove stop words from both texts
    c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]
    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words
    similarity_between_words = 0
    for key, val in total_words.items():
        # a count greater than 1 means the two articles share this word
        if total_words[key] > 1:
            similarity_between_words += 1
    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
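A hypothetical usage sketch; it assumes the project's dedupe helper is importable and that the NLTK stopwords and punkt data have been downloaded:

a = "The central bank raised interest rates again on Tuesday."
b = "Interest rates were raised by the central bank this week."
print(similarity(a, b))   # larger values mean more shared (non-stopword) vocabulary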
dont_run_me_run_the_other_script_instead.py (project: punctuator2, author: ottokart)
def process_line(line):
    tokens = word_tokenize(line)
    output_tokens = []
    for token in tokens:
        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())
    return untokenize(" ".join(output_tokens) + " ")
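INS_PUNCTS, EOS_PUNCTS, NUM, is_number and untokenize are defined elsewhere in punctuator2 and are not part of this listing. A hypothetical minimal stand-in for the constants, purely for illustration (the real values in the project may differ):

EOS_PUNCTS = {".": ".PERIOD", "?": "?QUESTIONMARK", "!": "!EXCLAMATIONMARK"}   # sentence-ending punctuation
INS_PUNCTS = {",": ",COMMA", ";": ";SEMICOLON", ":": ":COLON"}                 # in-sentence punctuation
NUM = "<NUM>"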
def check_sent(s):
    count = 0
    for r in s:
        #words = word_tokenize(r)
        #for w in words:
        for w in r:
            if type(w) != str:
                print(w)
                count += 1
                continue
            if w in inv_words or w in oov_words_in_train:
                continue
            if w not in word2vec:
                count += 1
                oov_words_in_train.add(w)
            else:
                inv_words[w] = word2vec.vocab[w].index
    return count
def preprocess_questions(examples, nlp='nltk'):
    if nlp == 'nltk':
        from nltk.tokenize import word_tokenize
    print('Example of generated tokens after preprocessing some questions:')
    for i, ex in enumerate(examples):
        s = ex['question']
        if nlp == 'nltk':
            ex['question_words'] = word_tokenize(str(s).lower())
        elif nlp == 'mcb':
            ex['question_words'] = tokenize_mcb(s)
        else:
            ex['question_words'] = tokenize(s)
        if i < 10:
            print(ex['question_words'])
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done) \r" % (i, len(examples), i*100.0/len(examples)))
            sys.stdout.flush()
    return examples
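A hypothetical call, assuming each example is a dict with a 'question' key and that sys is imported at module level:

examples = [{'question': 'What color is the cat?'}, {'question': 'How many dogs are there?'}]
examples = preprocess_questions(examples, nlp='nltk')
print(examples[0]['question_words'])   # ['what', 'color', 'is', 'the', 'cat', '?']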
def summarize(self, text, n):
    """
    Return a list of n sentences
    which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
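Both summarizers above call a _compute_frequencies helper that is not part of this listing. A minimal sketch of what that method typically looks like in this classic frequency-summarizer pattern, assuming the NLTK stopwords corpus is available (the project's actual helper may normalize or filter differently):

from collections import defaultdict
from string import punctuation
from nltk.corpus import stopwords

def _compute_frequencies(self, word_sent, customStopWords=None):
    """Normalized frequency of each non-stopword token (illustrative sketch)."""
    stop = set(stopwords.words('english')) | set(punctuation)
    if customStopWords:
        stop |= set(customStopWords)
    freq = defaultdict(int)
    for sentence in word_sent:
        for word in sentence:
            if word not in stop:
                freq[word] += 1
    max_freq = float(max(freq.values()))
    return {w: c / max_freq for w, c in freq.items()}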
def load_jacana(fname, regexen):
    samples = []
    with open(fname, 'rt') as inp:
        for line in inp:
            line = line.strip()
            if line.startswith('<Q> '):
                qorig = line[len('<Q> '):]
                q = word_tokenize(qorig)
            else:
                l = line.split(' ')
                label = int(l[0])
                kwweight = float(l[1])
                aboutkwweight = float(l[2])
                text = word_tokenize(' '.join(l[3:]))
                toklabels = regex_overlap(text, regexen[qorig])
                samples.append({'qtext': ' '.join(q), 'label': label,
                                'atext': ' '.join(text),
                                'kwweight': kwweight, 'aboutkwweight': aboutkwweight,
                                'toklabels': ' '.join([str(0+tl) for tl in toklabels])})
    return samples
def load_sts(dsfile, skip_unlabeled=True):
    """Load a dataset in the STS tsv format."""
    s0 = []
    s1 = []
    labels = []
    with codecs.open(dsfile, encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            label, s0x, s1x = line.split('\t')
            if label == '':
                if skip_unlabeled:
                    continue
                else:
                    labels.append(-1.)
            else:
                labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
def load_quora(dsfile):
    """Load a dataset in the Quora csv format."""
    s0 = []
    s1 = []
    labels = []
    with open(dsfile, encoding='utf8') as csvfile:
        f = csv.reader(csvfile)
        firstline = True
        for line in f:
            if firstline:
                firstline = False
                continue
            s0x = line[3]
            s1x = line[4]
            label = line[5]
            labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
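A hypothetical usage sketch for the two loaders above; the file paths are placeholders:

s0, s1, y = load_sts('data/sts/sts-train.tsv')     # tab-separated: label, sentence A, sentence B
q0, q1, qy = load_quora('data/quora/train.csv')    # csv with the sentence pair in columns 3-4 and the label in column 5
print(len(s0), y[:5])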
def make_word_feature(df, embeddings):
    # Vectorize the merchant description with word embeddings, currently by
    # averaging the vectors of the words in each merchant name. There are other
    # ways to combine word vectors into a sentence vector:
    # http://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence
    merchants = df.merchant.tolist()
    veclen = len(embeddings['food'])
    word_feature = np.zeros((len(merchants), veclen))
    for idx, merchant in enumerate(merchants):
        num_known = 0
        try:
            words = tokenize.word_tokenize(merchant)
            words = [word.lower() for word in words]
            for word in words:
                wordvec = embeddings[word]
                word_feature[idx, :] += wordvec
                num_known += 1
        except:
            pass
        word_feature[idx, :] = word_feature[idx, :] / float(max(num_known, 1))
    return word_feature
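A hypothetical usage sketch. `embeddings` can be any dict-like mapping from word to vector (a plain dict of numpy arrays or a gensim KeyedVectors object); note it must contain a 'food' key, since the function uses that entry to infer the vector length, and the snippet assumes `from nltk import tokenize` at module level as in the source file:

import numpy as np
import pandas as pd

df = pd.DataFrame({'merchant': ['Coffee Shop', 'Gas Food']})
embeddings = {'food': np.zeros(3), 'coffee': np.ones(3), 'shop': np.full(3, 2.0), 'gas': np.ones(3)}
print(make_word_feature(df, embeddings))   # one averaged embedding row per merchant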
def predict(testSet, PP, PN, positive_probabilities, negative_probabilities, unseen_pos_prob, unseen_neg_prob):
    predicted_class = []
    for review in testSet:
        negative_probab = math.log10(PN)
        positive_probab = math.log10(PP)
        review_words = word_tokenize(review)
        for w in review_words:
            if w in negative_probabilities:
                negative_probab = negative_probab + math.log10(negative_probabilities[w])
            else:
                negative_probab = negative_probab + math.log10(unseen_neg_prob)
            if w in positive_probabilities:
                positive_probab = positive_probab + math.log10(positive_probabilities[w])
            else:
                positive_probab = positive_probab + math.log10(unseen_pos_prob)
        if negative_probab > positive_probab:
            result = '-'
        else:
            result = '+'
        predicted_class.append(result)
    return predicted_class
def create_vocab(self, dataset_path, vocab_path, max_vocab_size):
    print("generating vocab from dataset at {}".format(dataset_path))
    all_words = []
    for dataset in ["snli_1.0_train.jsonl", "snli_1.0_dev.jsonl", "snli_1.0_test.jsonl"]:
        for line in open(os.path.join(dataset_path, dataset), "r").readlines():
            data = json.loads(line)
            all_words += word_tokenize(data["sentence1"].lower())
            all_words += word_tokenize(data["sentence2"].lower())
    counter = Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    words = ["PAD"] + ["UNK"] + list(words)
    word_to_id = dict(zip(words[:max_vocab_size], range(max_vocab_size)))
    with open(vocab_path, "w") as file:
        for word, id in word_to_id.items():
            file.write("{}\t{}\n".format(word, id))
    print("vocab of size {} written to {}, with PAD token == 0, UNK token == 1".format(max_vocab_size, vocab_path))
def getFreqWords(directoryPath):
    files = getListOfFilesInDir(directoryPath, "*")    # get list of files in directory
    allWords = []
    count = 0
    if MAX_FILES_PER_CLASS > 0 and MAX_FILES_PER_CLASS < len(files):
        files = random.sample(files, MAX_FILES_PER_CLASS)
    for ifile, fi in enumerate(files):                 # for each file in the current class:
        with open(fi) as f:
            content = f.read()
            words = word_tokenize(content.decode('utf-8'))
            words = [w.lower() for w in words if w.lower() not in stop]
            words = list(set(words))
            allWords += words
            count += 1
    #print allWords
    C = Counter(allWords)
    C = sorted(C.items(), key=itemgetter(1), reverse=True)
    for c in C:
        if c[1] > 0.05 * float(count):
            print c[0], c[1] / float(count)
def prepro_question(imgs, params):
    # preprocess all the questions
    print 'example processed tokens:'
    for i, img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10: print txt
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done) \r" % (i, len(imgs), i*100.0/len(imgs)))
            sys.stdout.flush()
    return imgs
sentiment_featureset.py (project: tensorflow-neural-networks, author: vipul-sharma20)
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with open(fi, 'r') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l)
                lexicon += list(all_words)
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    """
    This is done in the tutorial.
    Seems like a brute-force method of removing stopwords.
    TODO: use the NLTK stopwords list to remove stop words?
    """
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
sentiment_featureset.py (project: tensorflow-neural-networks, author: vipul-sharma20)
def sample_handling(sample, lexicon, classification):
    featureset = []
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
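A hypothetical bit of glue code in the style of the original tutorial; the file names are placeholders, and it assumes `hm_lines` and `lemmatizer = WordNetLemmatizer()` are defined at module level as in the source file:

lexicon = create_lexicon('pos.txt', 'neg.txt')
features = sample_handling('pos.txt', lexicon, [1, 0]) \
         + sample_handling('neg.txt', lexicon, [0, 1])
print(len(lexicon), len(features))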
def _avgrank_corp(inp_dir, hdv_vocab, num=5000):
    cnt, vocab = Counter(), []
    # Count all words in the corpus
    for (root, dirs, files) in os.walk(inp_dir):
        files = [f for f in files if not f[0] == '.']
        for f in files:
            filepath = os.path.join(root, f)
            with codecs.open(filepath, 'r', encoding="utf-8") as f:
                tok_txt = word_tokenize(f.read())
                for word in tok_txt:
                    cnt[word] += 1
    for word in hdv_vocab:
        if word in cnt.keys():
            del cnt[word]
    for word in cnt.most_common(num):
        try:
            vocab.append(str(word[0]))
        except:
            continue
    return vocab
create_sentiment_featuresets.py (project: kaggle-youtube-8m, author: liufuyang)
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with io.open(fi, 'r', encoding='utf-8') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l.lower())
                lexicon += list(all_words)
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
create_sentiment_featuresets.py (project: kaggle-youtube-8m, author: liufuyang)
def sample_handling(sample, lexicon, classification):
    featureset = []
    with io.open(sample, 'r', encoding='utf-8') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
def custom_tokenizer(sentence, delimiters=['|', ','], remove_puncs=True, get_unique=False):
    # tokens = re.split('(\W)', sentence)
    for delimiter in delimiters:
        sentence = re.sub(re.escape(delimiter), " " + delimiter + " ", sentence)
    tokens = word_tokenize(sentence)
    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))
    if remove_puncs:
        tokens = [token for token in tokens if
                  not ((len(token.strip()) == 1) and bool(re.search(r"[^a-zA-Z0-9]", token)))]
    tokens = [token for token in tokens if (not bool(re.search(r"\s", token)) and token != '')]
    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))
    return tokens
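For example (assuming the NLTK punkt data is available):

print(custom_tokenizer("red|green, blue"))
# ['red', 'green', 'blue'] -- delimiters are padded with spaces before tokenizing,
# then single punctuation tokens and whitespace-only tokens are filtered out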
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        escaped_tok = re.escape(tok)
        m = re.search(escaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest of the string
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
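A small usage sketch; note this only works as long as the tokenizer does not rewrite tokens (NLTK turns straight double quotes into `` and '', for instance, and those rewritten tokens would not be found in the original string):

text = "Dr. Smith visited Paris. He liked it."
for tok, (start, end) in offset_tokenize(text):
    assert text[start:end] == tok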
def prepro_question(imgs, params):
    # preprocess all the questions
    print 'example processed tokens:'
    for i, img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10: print txt
        if i % 100 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done) \r" % (i, len(imgs), i*100.0/len(imgs)))
            sys.stdout.flush()
    return imgs
def extract_chunks(sent, chunkGram=r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    try:
        tagged = pos_tag(word_tokenize(sent))
        # Maybe actually better if possessives aren't included.
        # At least one proper noun (NNP) should be included in the noun chunk;
        # a single NNP is probably not enough information to identify a data source.
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunks = []
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
            chunk = ""
            for leaf in subtree.leaves():
                chunk += leaf[0] + ' '
            chunks.append(chunk.strip())
        return chunked, chunks
    except Exception as e:
        print(str(e))
def train_model(documents, labels, sample_size=.3, verbose=True):
    if verbose:
        print('starting to generate training data...', end='', flush=True)
    labeled_feature_set = list()
    for n, doc in enumerate(documents):
        feature = word_tokenize(' '.join(doc))
        label = labels[n]
        resampled = resample(feature, label, sample_size)
        labeled_feature_set += resampled
    if verbose:
        print('done', flush=True)
        print('training model...this may take a few minutes.',
              flush=True, end='')
    trained_model = NaiveBayesClassifier.train(iter(labeled_feature_set))
    if verbose:
        print('done', flush=True)
    return trained_model