def get_user_to_word_proportion(user_to_text, word):
"""
Maps each user to the proportion of their words that are the specified word.
"""
user_to_word_proportion = {}
for user in user_to_text:
lm = LanuageModel(user_to_text[user])
n_tokens = len(lm.lowercase_tokens)
if n_tokens > 0:
fd = nltk.FreqDist(lm.lowercase_tokens)
user_to_word_proportion[user] = fd[word] / float(n_tokens)
else:
user_to_word_proportion[user] = 0.0
print 'Finished user {}'.format(user.encode('utf-8'))
return user_to_word_proportion
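For reference, a minimal self-contained sketch of the same per-user proportion idea, using a plain lowercase split instead of the project's LanuageModel class (the sample user_to_text dict is invented for illustration):

import nltk

def word_proportions_sketch(user_to_text, word):
    # Proportion of each user's tokens that equal `word` (simple lowercase split).
    proportions = {}
    for user, text in user_to_text.items():
        tokens = text.lower().split()
        fd = nltk.FreqDist(tokens)
        proportions[user] = fd[word] / len(tokens) if tokens else 0.0
    return proportions

print(word_proportions_sketch({'alice': 'the cat sat on the mat'}, 'the'))  # {'alice': 0.333...}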
Python FreqDist() example source code
def most_frequent_Brown_Corpus_words():
import nltk
import nltk.corpus
words = []
for word in nltk.corpus.brown.words():
if word not in [
",",
".",
"``",
"''",
";",
"?",
"--",
")",
"(",
":",
"!"
]:
words.append(word.lower())
frequencies_words = nltk.FreqDist(words).most_common()
words_most_frequent = [word[0] for word in frequencies_words]
return words_most_frequent
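The punctuation list above can be expressed more compactly; a rough (not exactly equivalent) sketch, assuming the Brown corpus has been downloaded via nltk.download('brown') and using str.isalpha() instead of an explicit punctuation list:

import nltk

def most_frequent_brown_words_sketch():
    # Count lowercase alphabetic tokens only, then return them ordered by frequency.
    # Note: isalpha() also drops hyphenated and numeric tokens, unlike the list above.
    fd = nltk.FreqDist(w.lower() for w in nltk.corpus.brown.words() if w.isalpha())
    return [word for word, _ in fd.most_common()]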
def _calculate_word_scores(self, phrase_list):
"""Scores words according to frequency and tendency to appear in multi-word key phrases"""
word_freq = nltk.FreqDist()
word_multiplier = nltk.FreqDist()
for phrase in phrase_list:
# Give a higher score if word appears in multi-word candidates
multi_word = min(2, len(filter(lambda x: not is_numeric(x), phrase)))
for word in phrase:
# Normalize by taking the stem
word_freq[stem(word)] += 1
word_multiplier[stem(word)] += multi_word
for word in word_freq.keys():
word_multiplier[word] = word_multiplier[word] / float(word_freq[word]) # Take average
word_scores = {}
for word in word_freq.keys():
word_scores[word] = word_freq[word] * word_multiplier[word]
return word_scores
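A toy illustration of the frequency-times-multiplier scoring above, with the stemming and numeric checks stripped out (the phrases and helper names here are made up for illustration):

import nltk

def word_scores_sketch(phrase_list):
    word_freq = nltk.FreqDist()
    word_multiplier = nltk.FreqDist()
    for phrase in phrase_list:
        multi_word = min(2, len(phrase))  # crude proxy: reward words seen in longer phrases
        for word in phrase:
            word_freq[word] += 1
            word_multiplier[word] += multi_word
    scores = {}
    for w in word_freq:
        avg_multiplier = word_multiplier[w] / word_freq[w]
        scores[w] = word_freq[w] * avg_multiplier
    return scores

print(word_scores_sketch([['deep', 'learning'], ['learning']]))  # {'deep': 2.0, 'learning': 3.0}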
Source: 1-train-CBOW.py, project: Deep-Learning-with-Theano, author: PacktPublishing
def build_dictionary(words, max_df=5):
word_freq = [[unkown_token, -1], [pad_token, 0]]
word_freq.extend(nltk.FreqDist(itertools.chain(words)).most_common())
word_freq = OrderedDict(word_freq)
word2idx = {unkown_token: 0, pad_token: 1}
idx2word = {0: unkown_token, 1: pad_token}
idx = 2
for w in word_freq:
f = word_freq[w]
if f >= max_df:
word2idx[w] = idx
idx2word[idx] = w
idx += 1
else:
word2idx[w] = 0 # map rare words to the unknown token
word_freq[unkown_token] += 1 # increment the number of unknown tokens
return word2idx, idx2word, word_freq
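A condensed, self-contained version of the same vocabulary-building pattern (the special tokens and the threshold of 2 are illustrative, not taken from the project):

import itertools
import nltk

UNK, PAD = '<unk>', '<pad>'

def build_vocab_sketch(tokenized_texts, min_count=2):
    # Count every token across all texts, then keep only sufficiently frequent ones.
    freq = nltk.FreqDist(itertools.chain(*tokenized_texts))
    word2idx = {UNK: 0, PAD: 1}
    for word, count in freq.most_common():
        if count >= min_count:
            word2idx[word] = len(word2idx)
    idx2word = {i: w for w, i in word2idx.items()}
    return word2idx, idx2word

w2i, i2w = build_vocab_sketch([['a', 'b', 'a'], ['b', 'c']])
print(w2i)  # 'a' and 'b' get indices; 'c' is too rare and maps to <unk>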
def get_corpus_of_most_active_users(n_users=5):
tweets = []
texts = []
with open(DATASET_PATH) as f:
for line in f:
tweets.append(json.loads(line)['user']['screen_name'])
texts.append((json.loads(line)['user']['screen_name'], json.loads(line)['text']))
users = nltk.FreqDist(tweets).most_common(n_users)
dict = {}
for user, tweet in texts:
if user in dict:
dict[user] = " ".join([dict[user],tweet])
else:
dict[user] = tweet
corpus = [dict[name] for name, _ in users]
user_names = [name for name, _ in users]
return corpus, user_names
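The core of the function above is just a FreqDist over screen names; a stripped-down sketch with an in-memory list standing in for the parsed tweet file:

import nltk

# Hypothetical (user, text) pairs standing in for the parsed dataset.
pairs = [('ann', 'hello'), ('bob', 'hi'), ('ann', 'more'), ('ann', 'text')]

top_users = [name for name, _ in nltk.FreqDist(u for u, _ in pairs).most_common(2)]
corpus = [' '.join(t for u, t in pairs if u == name) for name in top_users]
print(top_users, corpus)  # ['ann', 'bob'] ['hello more text', 'hi']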
def profile(self, text):
''' Create FreqDist of trigrams within text '''
from nltk import word_tokenize, FreqDist, trigrams
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
fingerprint = FreqDist()
for t in tokens:
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
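Since FreqDist is a Counter subclass, the in/else bookkeeping above can be collapsed; a compact, self-contained sketch of the same character-trigram fingerprint (the padding characters here are chosen arbitrarily):

from nltk import FreqDist, trigrams

def trigram_fingerprint_sketch(text, start='<', end='>'):
    fingerprint = FreqDist()
    for token in text.split():  # plain split to avoid needing punkt tokenizer data
        for tri in trigrams(start + token + end):
            fingerprint[''.join(tri)] += 1  # FreqDist defaults missing keys to 0
    return fingerprint

print(trigram_fingerprint_sketch('hello hello').most_common(3))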
def load_data():
global N, words, labels
posts = corpus.xml_posts()[:10000]
freqs = [ FreqDist(post.text) for post in posts ]
words = list(set(word
for dist in freqs
for word in dist.keys()
if word not in ENGLISH_STOP_WORDS and
word not in punctuation))
labels = list(set([ post.get('class') for post in posts ]))
data = []
N = len(words)
for post, dist in zip(posts, freqs):
V = Vol(1, 1, N, 0.0)
for i, word in enumerate(words):
V.w[i] = dist.freq(word)
data.append((V, labels.index(post.get('class'))))
return data
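The V.w[i] = dist.freq(word) line relies on FreqDist.freq() returning a relative frequency rather than a raw count; a quick standalone check of that behaviour:

from nltk import FreqDist

fd = FreqDist(['spam', 'spam', 'eggs', 'ham'])
print(fd['spam'], fd.freq('spam'))  # 2 0.5  -> raw count vs. proportion of all samples
print(fd.freq('missing'))           # 0.0   -> unseen words contribute a zero feature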
def load_data():
global N, words
freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
words = list(set(word
for dist in freqs
for word in dist.keys()
if word not in ENGLISH_STOP_WORDS and
word not in punctuation))
data = []
N = len(words)
for dist in freqs:
x = volumize(dist)
data.append((x, x.w))
return data
def test():
gt = GetTweets()
documents = gt.get_hashtag('ferguson', count=20)
documents += gt.get_hashtag('police', count=21)
print 'Query:', documents[-1]
tokenizer = RegexpTokenizer('\w+')
vols = []
for doc in documents:
samples = []
for token in tokenizer.tokenize(doc):
word = token.lower()
if word not in ENGLISH_STOP_WORDS and word not in punctuation:
samples.append(word)
vols.append(volumize(FreqDist(samples)))
vectors = [ doc_code(v) for v in vols[:-1] ]
query_vec = doc_code(vols[-1])
sims = [ cos(v, query_vec) for v in vectors ]
m = max(sims)
print m, documents[sims.index(m)]
def load_data():
global N, words
freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
words = list(set(word
for dist in freqs
for word in dist.keys()
if word not in ENGLISH_STOP_WORDS and
word not in punctuation))
data = []
N = len(words)
for dist in freqs:
V = Vol(1, 1, N, 0.0)
for i, word in enumerate(words):
V.w[i] = dist.freq(word)
data.append((V, V.w))
return data
def tokenize_sentences(self):
# tokenize the sentences into words and count the word frequencies
# get most common words, build index_to_word and word_to_index vectors
self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in
self.sentences]
word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
print("Found %d unique word tokens." % len(word_freq.items()))
vocab = word_freq.most_common(self.vocabulary_size - 1)
self.index_to_word = [x[0] for x in vocab]
self.index_to_word.append(self.unknown_token)
self.word_to_index = dict(
[(w, i) for i, w in enumerate(self.index_to_word)])
print("Using vocabulary size %d." % self.vocabulary_size)
print(
"The least frequent word is '%s' appearing %d times." % (
vocab[-1][0], vocab[-1][1]))
# replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(self.tokenized_sentences):
self.tokenized_sentences[i] = [
w if w in self.word_to_index else self.unknown_token for w in
sent]
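A self-contained sketch of the vocabulary truncation and unknown-token replacement performed above (the sentences, vocabulary size, and UNK marker are made up for illustration):

import itertools
import nltk

UNK = 'UNKNOWN_TOKEN'
tokenized = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]

freq = nltk.FreqDist(itertools.chain(*tokenized))
vocab = [w for w, _ in freq.most_common(3)]  # keep only the 3 most frequent words
vocab.append(UNK)
replaced = [[w if w in vocab else UNK for w in sent] for sent in tokenized]
print(replaced)  # one of 'cat'/'dog' falls outside the top 3 and becomes UNKNOWN_TOKEN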
def load_words(num_words):
words = get_words_from_nltk()
fdist = nltk.FreqDist(words)
fdistmc = fdist.most_common()
nd = OrderedDict()
nda = []
occurences = set([wt[1] for wt in fdistmc])
occurences = sorted(occurences, key=int, reverse=True)
for idx in occurences:
nd[idx] = sorted([wt[0] for wt in fdistmc if wt[1] == idx])
for key, val in nd.items():
nda += val
words = nda[:num_words]
return words
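What the OrderedDict loop above achieves is a sort by descending count with ties broken alphabetically; an equivalent one-liner, shown here on a toy word list:

import nltk

words = ['b', 'a', 'a', 'c', 'b', 'd']
fd = nltk.FreqDist(words)
ordered = [w for w, c in sorted(fd.items(), key=lambda wc: (-wc[1], wc[0]))]
print(ordered[:3])  # ['a', 'b', 'c'] -- highest counts first, ties alphabetical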
def statistics_by_aspect():
filename = "aspects_train.csv"
words_dist = nltk.ConditionalFreqDist()
sample_sizes = nltk.FreqDist()
samples_stream = get_samples_stream(filename)
for aspect,words in samples_stream:
sample_sizes[aspect] += 1
for word in words:
words_dist[aspect][word] += 1
for category,dist in words_dist.iteritems():
print "\n------- Category: {}".format(category)
print dist.most_common(20)
total_samples = sample_sizes.N()
print "\ntotally {} samples".format(total_samples)
for aspect, count in sample_sizes.iteritems():
print "aspect[{}] has {} samples, {:.2f}%".format(aspect,count, count*100.0/total_samples)
def save_topics(model,filename):
with open(filename,"wt") as outf:
# ---------- write each topic and words' contribution
topics = model.show_topics(num_topics=-1, log=False, formatted=True)
for topic in topics:
# topic[0]: topic number
# topic[1]: topic description
outf.write("\n############# TOPIC {} #############\n".format(topic[0]))
outf.write(topic[1]+"\n")
# ---------- words statistics in all topics
outf.write("\n\n\n****************** KEY WORDS ******************\n")
topics = model.show_topics(num_topics=-1, log=False, formatted=False)
keywords = (word for (_,words) in topics for (word,score) in words)
fdist = nltk.FreqDist(keywords)
for index,(w,c) in enumerate( fdist.most_common(100) ):
outf.write("{}-th keyword: <{},{}>\n".format(index+1,w,c))
def createPopularWords(combined, lowerBound, upperBound):
allWords = []
for message in combined:
for word in message[0]:
allWords.append(word)
allWords = nltk.FreqDist(allWords)
# grab the top several thousand words, ignoring the lowerBound most popular
# grabbing more words leads to more accurate predictions, at the cost of both memory and compute time
# ignoring the x most popular words is an easy method for handling stop words that are specific to this dataset, rather than just the English language overall
popularWords = []
wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
for pair in wordsToUse:
popularWords.append(pair[0])
return popularWords
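The lowerBound/upperBound logic above is just a window into most_common(); a toy demonstration:

import nltk

tokens = ['the', 'the', 'the', 'cat', 'cat', 'mat']
fd = nltk.FreqDist(tokens)
# Skip the 1 most frequent token (a crude stop-word filter) and keep the next 2.
popular = [w for w, _ in fd.most_common(3)[1:3]]
print(popular)  # ['cat', 'mat']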
# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document
def get_word_counts(input_str, limit = 100):
input_str = PreprocessManager.remove_non_ascii(input_str)
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = EnglishStemmer()
tokenized_text = CountVectorizer().build_tokenizer()(input_str.lower())
tokenized_text = [word for word in tokenized_text if len(word) > 1] # Filter some small words
#tokenized_text = [word for word in tokenized_text if not word.isnumeric()]
filtered_words = [word for word in tokenized_text if word not in stopwords.words('english')]
stemmed_list = [wordnet_lemmatizer.lemmatize(w) for w in filtered_words]
# Calculate frequency distribution
frequency_dist = nltk.FreqDist(stemmed_list)
# Output the top `limit` most common words
result = dict()
for word, frequency in frequency_dist.most_common(limit):
# print(u'{};{}'.format(word, frequency))
result[word] = frequency
return result
# This function simply splits the text into its words.
def analysis(reviews_collection_text):
with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
raw_data = f.read()
with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
comments = f.readlines()
data = raw_data.replace('\n', ' ')
data_lower = data.lower()
tokens_with_punc = word_tokenize(data_lower)
tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
print("--- Most frequent tokens ---\n",
FreqDist(tokens_with_punc).most_common(15))
print("--- Tokens without punctuation ---\n",
FreqDist(tokens).most_common(15))
stop = set(stopwords.words('english'))
words = [word for word in tokens if word not in stop]
print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
tagged = pos_tag(words)
nouns = [word for word, pos in tagged if (pos == 'NN')]
print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
adjts = [word for word, pos in tagged if (pos == 'JJ')]
print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
avgld = sum(lxdst) / len(comments)
print("--- Average lexical density ---\n", avgld)
def plot_common_tokens(self, n_tokens):
# Remove common stopwords
fd = nltk.FreqDist(w for w in self.alpha_tokens if w not in s)
fd.plot(n_tokens)
def get_user_to_word_count(user_to_text, word):
user_to_word_count = {}
for user in user_to_text:
lm = LanuageModel(user_to_text[user])
fd = nltk.FreqDist(lm.lowercase_tokens)
user_to_word_count[user] = fd[word]
return user_to_word_count
def sentence2vec(self, sentence):
if len(self.features) == 0:
self.load_feature_model()
seg_list = jieba.cut(sentence, False)
freq_dist = nltk.FreqDist(seg_list)
local_list = []
for each in self.features:
local_list.append(freq_dist[each])
return local_list
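The feature-lookup loop above amounts to a bag-of-words count vector over a fixed feature list; a self-contained sketch without jieba (the feature list is invented):

import nltk

features = ['good', 'bad', 'price']

def sentence_to_vector_sketch(tokens):
    # FreqDist returns 0 for unseen features, so the vector is always len(features) long.
    fd = nltk.FreqDist(tokens)
    return [fd[f] for f in features]

print(sentence_to_vector_sketch(['good', 'good', 'value']))  # [2, 0, 0]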
def get_freq_dist(self, seg_list):
freq_dist = []
for each in seg_list:
freq_dist.append(nltk.FreqDist(each))
return freq_dist
def _remove_uncommon_words(cls, tokenized_corpus, vocabulary_size):
word_count = nltk.FreqDist( itertools.chain(*tokenized_corpus) )
word_count = [cls.WORD_COUNT_ITEM(word=word, count=count) for word, count in word_count.items()]
word_count = sorted(word_count, key=lambda item: (item.count, item.word), reverse=True)
most_common_words = [word_count_item.word for word_count_item in word_count[:vocabulary_size - \
cls.NUMBER_OF_WORDS_TO_ADD_IN_MANUALLY + 1]]
tokenized_corpus = [
[word if word in most_common_words else cls.UNKNOWN_TOKEN for word in sentence]\
for sentence in tokenized_corpus
]
return tokenized_corpus
def parse_text(filename, vocabulary_size=9000, type="word"):
with open(filename, 'rb') as f:
txt = f.read()
if type == "word":
sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
# sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq.items()))
vocab = word_freq.most_common(vocabulary_size-1)
index = [sentence_start_token, sentence_end_token, unknown_token] + [x[0] for x in vocab]
word_to_index = dict([(w,i) for i,w in enumerate(index)])
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
for i, sent in enumerate(tokenized_sentences):
tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
X_train = np.asarray([ [0]+[word_to_index[w] for w in sent] for sent in tokenized_sentences])
y_train = np.asarray([ [word_to_index[w] for w in sent]+[1] for sent in tokenized_sentences])
# X_train, y_train = [], []
# for sent in tokenized_sentences:
# l = len(sent) - 1
# X_train.append(coo_matrix((np.ones( (l) ), ( range(l), [word_to_index[w] for w in sent[:-1]] )), shape=(l, vocabulary_size )).toarray())
# y_train.append( [word_to_index[w] for w in sent[1:] ] )
else:
sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
index = ['^','$'] + list(set(txt.decode('utf-8')))
char_to_index = dict([(w,i) for i,w in enumerate(index)])
X_train = np.asarray([ [0]+[ char_to_index[w] for w in sent] for sent in sentences])
y_train = np.asarray([ [ char_to_index[w] for w in sent]+[1] for sent in sentences])
return X_train, y_train, index