def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()
    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1
        for sent in self._sent_tokenizer.tokenize(para):
            counts['sents'] += 1
            for word in self._word_tokenizer.tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1
    # Compute the number of files and categories in the corpus
    n_fileids = len(self._resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self._resolve(fileids, categories)))
    # Return data structure with information
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()
    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1
        for sent in para:
            counts['sents'] += 1
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1
    # Compute the number of files and categories in the corpus
    n_fileids = len(self._resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self._resolve(fileids, categories)))
    # Return data structure with information
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
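# For orientation, a minimal standalone sketch (not from any of the projects above)
# of the counting pattern both describe() variants rely on: nltk.FreqDist behaves
# like a collections.Counter.
import nltk

fd = nltk.FreqDist()
for word in "the cat sat on the mat".split():
    fd[word] += 1

print(fd['the'])           # 2
print(fd.most_common(2))   # [('the', 2), ('cat', 1)]
print(len(fd))             # 5 distinct tokens, i.e. the vocabulary size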
def scoreFunction(wholetext):
    """Get text, find the most common words and compare them with known
    stopwords. Return a dictionary of scores per language."""
    dictiolist = {}
    scorelist = {}
    # These are the available languages with stopwords from NLTK
    NLTKlanguages = ["dutch", "finnish", "german", "italian", "portuguese",
                     "spanish", "turkish", "danish", "english", "french",
                     "hungarian", "norwegian", "russian", "swedish"]
    # Placeholder for languages with custom (non-NLTK) stopword lists
    FREElanguages = []
    languages = NLTKlanguages + FREElanguages
    # Fill the dictionary of languages, to avoid unnecessary function calls
    for lang in NLTKlanguages:
        dictiolist[lang] = stopwords.words(lang)
    # Split all the text into tokens and convert to lowercase. In a
    # decent version of this, I'd also clean the unicode
    tokens = word_tokenize(wholetext)
    tokens = [t.lower() for t in tokens]
    # Determine the frequency distribution of words, looking for the
    # most common words
    freq_dist = FreqDist(tokens)
    # This is the only interesting piece, and not by much. Pick a
    # language, and check if each of the 20 most common words is in
    # the language stopwords. If it's there, add 1 to this language
    # for each word matched. So the maximal score is 20. Why 20? No
    # specific reason, it looks like a good number of words.
    top_words = [w for w, _ in freq_dist.most_common(20)]
    for lang in languages:
        scorelist[lang] = 0
        for word in top_words:
            if word in dictiolist[lang]:
                scorelist[lang] += 1
    return scorelist
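# A hypothetical smoke test of scoreFunction above (assumes the snippet's own imports,
# i.e. nltk.corpus.stopwords, nltk.tokenize.word_tokenize and nltk.FreqDist, plus the
# downloaded 'stopwords' and 'punkt' corpora); exact counts depend on the input text.
sample = ("This is a short English paragraph full of the usual little words "
          "that the stopword lists are built from, so it should score highest "
          "for English.")
scores = scoreFunction(sample)
print(max(scores, key=scores.get))   # expected: 'english'
print(scores['english'])             # how many of the 20 most common tokens are English stopwords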
def calc_frequencies(words, words_n=50, lang='german'):
    words = [word for word in words if len(word) > 1]
    words = [word for word in words if not word.isnumeric()]
    words = [word.lower() for word in words]
    # words = [word for word in words if word not in all_stopwords]
    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer(lang)
    # words = [stemmer.stem(word) for word in words]
    fdist = nltk.FreqDist(words)
    return fdist.most_common(words_n)
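# A hypothetical call to calc_frequencies above (assumes nltk plus its 'punkt'
# tokenizer data; the German sentence is just illustrative input):
text = "Der Hund jagt die Katze und die Katze jagt die Maus."
words = nltk.word_tokenize(text, language='german')
print(calc_frequencies(words, words_n=3))
# e.g. [('die', 3), ('jagt', 2), ('katze', 2)]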
def occurencecount():
    # Ask the user to input a word
    word = raw_input("Enter a word : ")
    # Create a list of files which we will be looking into for matches
    fileList = ['Text1.txt', 'Text2.txt', 'Text3.txt', 'Text4.txt']
    # Open the files one by one, read them and find the occurrence count inside each file
    for filename in fileList:
        # Open the file
        fp_text = codecs.open(filename, 'r', 'utf-8')
        # Read all the words inside the file
        words_text = word_tokenize(fp_text.read())
        # Find the number of occurrences of each word using the built-in method from NLTK
        fd_text = FreqDist(words_text)
        # Print out the number of occurrences for that specific word
        print("Number of occurrences in " + filename + " : " + str(fd_text[word]))
def get_words(tweets):
    """Given a set of tweets, return the most frequently-used words."""
    tweets = filter(lambda x: not(x.is_rt), tweets)
    tokenized = [nltk.word_tokenize(handle_strip(t.tweet_text))
                 for t in tweets]
    words = [item for sublist in tokenized for item in sublist]
    longwords = filter(lambda x: len(x) > 6, words)
    lcwords = map(lambda x: x.lower(), longwords)
    fdist = nltk.FreqDist(lcwords)
    common = fdist.most_common(100)
    common = filter(lambda x: x[1] > 4, common)
    common = map(lambda x: [x[0], 6 + int(x[1]/3)], common)
    return common
def make_data(file_name):
    '''Returns tuple of dataframes used in analysis:
    core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df'''
    # realDonaldTrump_master_tweet_list.json
    # TODO: fix so strings aren't written to file and we can just load it as json.
    with open(file_name) as tfile:
        lines = tfile.readlines()
    raw_tweets_data = [eval(t) for t in lines]
    analyzer = TextAnalyzer(raw_tweets_data)
    english_stopwords = stopwords.words("english")
    core_tweet_df = analyzer.make_tweet_df(
        with_pos_tags=False,
        columns_to_filter=['id', 'created_at', 'text', 'retweet_count', 'favorite_count'])
    # get list of tweets as text
    tweets_list = core_tweet_df.text.tolist()
    pos_df = analyzer.make_pos_df(tweets_list, make_csv=False)
    adj_df = pos_df[pos_df.pos_tag == 'JJ']
    adj_df = analyzer.make_word_frequency_df(adj_df, 'word', make_csv=False)
    # Calculate word frequencies among other words in the data set. Can't merge with pos
    # because certain words have many parts of speech.
    word_frequency_df = analyzer.make_word_frequency_df(pos_df, 'word', make_csv=False)
    # Most common hashtags and total unique hashtags.
    all_hashtags = []
    for i in raw_tweets_data:
        all_hashtags.extend([d['text'] for d in i['entities']['hashtags']])
    fd = FreqDist(all_hashtags)
    hash_df = pd.DataFrame([{'hashtag': x,
                             'abs_frequency': y,
                             'rel_frequency_pct': float(y) / len(all_hashtags) * 100}
                            for x, y in fd.most_common()])
    return core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df
async def zipf(self, message, users):
    source_user = message.author.name
    source_user = source_user.strip('@').split('#')[0]
    target_users = [user.strip('@').split('#')[0] for user in users.split()]
    if len(users) == 0:
        target_users = [source_user]
    if users == '*':
        if message.server is not None:
            target_users = [member.name for member in message.server.members]
            target_users = [user for user in target_users
                            if self.check_nickname_valid(user.lower()) is None]
    image_file_name = self.quotes_file_name(source_user.lower())[:-4] + '.png'
    pylab.title('Word frequencies')
    for user in target_users:
        quotes_file = codecs.open(self.quotes_file_name(user.lower()), 'r', encoding='utf-8')
        lines = quotes_file.readlines()
        quotes_file.close()
        if len(lines) < 20:
            continue
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        tokens = self.filter_to_english_words(tokenizer.tokenize(str(lines)))
        if len(tokens) < 200:
            continue
        freq = nltk.FreqDist(tokens)
        self.plot_word_frequencies(freq, user)
    pylab.legend()
    pylab.savefig(image_file_name)
    pylab.gcf().clear()
    await self.client.send_file(message.channel, image_file_name)
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [x[0] for x in vocab]
    # word2index
    word2index = dict([(w, i) for i, w in enumerate(index2word)])
    return index2word, word2index, freq_dist
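# Hypothetical usage of index_ above. UNK is an external constant in the original
# snippet; here it is assumed to be a plain sentinel string.
UNK = 'unk'
sentences = [['hello', 'world'], ['hello', 'there']]
index2word, word2index, freq_dist = index_(sentences, vocab_size=10)
print(index2word)                           # ['_', 'unk', 'hello', 'world', 'there']
print([word2index.get(w, word2index[UNK])   # encode, falling back to UNK
       for w in ['hello', 'unseen']])       # e.g. [2, 1]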
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    vocab = [item for item in vocab if item[1] > 1]
    # index2word
    index2word = ['_'] + ['UNK'] + list(POS_TAGS.keys()) + [x[0] for x in vocab]
    # word2index
    word2index = dict([(w, i) for i, w in enumerate(index2word)])
    return index2word, word2index, freq_dist
def test():
    global N, words, network
    print 'In testing.'
    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer(r'\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)
    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)
    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)
    pred = network.forward(V).w
    topics = []
    while len(topics) != 5:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]
        if topic in gettysburg_tokens:
            topics.append(topic)
        del pred[topic_idx]
    print 'Topics of the Gettysburg Address:'
    print topics
def getFeat(self, line):
    listItem = [0] * self.noFeat
    fileFreqDist = nltk.FreqDist(SVM.tokenize(line))
    i = 0
    for key in self.trainKeys:
        if key in fileFreqDist:
            listItem[i] = fileFreqDist.get(key)
        i = i + 1
    return listItem
def main():
    freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
    vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
    for w in vocab:
        print w
def take_some_analysis(file_dir):
    context_length = []
    utterance_length = []
    dist = nltk.FreqDist()
    for c, u in utterance_generator(file_dir):
        c_tokens = nltk.word_tokenize(c)
        u_tokens = nltk.word_tokenize(u)
        # record token counts and update the word frequency distribution
        context_length.append(len(c_tokens))
        utterance_length.append(len(u_tokens))
        dist.update(c_tokens + u_tokens)
    cl_array = np.array(context_length)
    ul_array = np.array(utterance_length)
    print("max length of context is %d" % cl_array.max())
    print("max length of utterance is %d" % ul_array.max())
    print("mean length of context is %f" % cl_array.mean())
    print("mean length of utterance is %f" % ul_array.mean())
    sub_abs = np.abs(cl_array - ul_array)
    print("max,min,mean of abs(context_length - utterance_length) is %f,%f,%f" % (
        np.max(sub_abs), np.min(sub_abs), np.mean(sub_abs)))
    print("most common words :")
    print(dist.most_common(10))
def preprocess_data(self):
    # Read the data and append SENTENCE_START and SENTENCE_END tokens
    print "Reading CSV file..."
    with open('data/reddit-comments-2015-08.csv', 'rb') as f:
        reader = csv.reader(f, skipinitialspace=True)
        reader.next()
        # Split full comments into sentences
        sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
    print "Parsed %d sentences." % (len(sentences))
    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print "Found %d unique word tokens." % len(word_freq.items())
    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(self.vocabulary_size - 1)
    self.index_to_word = [x[0] for x in vocab]
    self.index_to_word.append(unknown_token)
    self.word_to_index = dict([(w, i) for i, w in enumerate(self.index_to_word)])
    print "Using vocabulary size %d." % self.vocabulary_size
    print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])
    # Replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [w if w in self.word_to_index else unknown_token for w in sent]
    print "\nExample sentence: '%s'" % sentences[0]
    print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]
    # Create the training data
    # tokenized_words = [item for sublist in tokenized_sentences for item in sublist]
    # self.X_train = np.asarray([self.word_to_index[w] for w in tokenized_words[:-1]])
    # self.Y_train = np.asarray([self.word_to_index[w] for w in tokenized_words[1:]])
    self.X_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
    self.Y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
def checkSentenceSanity(sentence):
    """Checks the sanity of the sentence. If the sentence is, for example, all uppercase, it is rejected."""
    caseDist = nltk.FreqDist()
    for token in sentence:
        caseDist[getCasing(token)] += 1
    if caseDist.most_common(1)[0][0] != 'allLower':
        return False
    return True
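# getCasing is not shown in the snippet above; a hypothetical minimal version that is
# consistent with how checkSentenceSanity uses it ('allLower' being one of its labels):
def getCasing(token):
    if token.islower():
        return 'allLower'
    if token.isupper():
        return 'allUpper'
    if token[:1].isupper():
        return 'initialUpper'
    return 'other'

print(checkSentenceSanity(['THIS', 'IS', 'SHOUTING']))                # False
print(checkSentenceSanity(['a', 'perfectly', 'normal', 'sentence']))  # True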
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
# Source: keyphrase_extraction.py (project: text-analytics-with-python, author: dipanjanS)
def get_top_ngrams(corpus, ngram_val=1, limit=5):
    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)
    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq)
                     for text, freq in sorted_ngrams]
    return sorted_ngrams
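# flatten_corpus and compute_ngrams are helpers from the surrounding project and are
# not shown here; hypothetical stand-ins consistent with how get_top_ngrams calls them
# (itemgetter is already required by the function above):
import nltk
from operator import itemgetter

def flatten_corpus(corpus):
    # Join a list of documents into a single string.
    return ' '.join(corpus)

def compute_ngrams(tokens, ngram_val):
    # nltk.ngrams yields tuples of ngram_val consecutive tokens.
    return list(nltk.ngrams(tokens, ngram_val))

docs = ["the quick brown fox", "the quick red fox"]
print(get_top_ngrams(docs, ngram_val=2, limit=3))
# e.g. [('the quick', 2), ('quick brown', 1), ('brown fox', 1)]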
# Source: extract_samples_for_sentiments.py (project: OpinionMining728, author: stasi009)
def sample_split(dbname, num_train, num_test):
    client = MongoClient()
    db = client[dbname]
    sentisent_collection = db.sentiment_sentences
    ################## load and count
    aspect_dist = nltk.FreqDist()
    sentiment_dist = nltk.FreqDist()
    all_samples = []
    cursor = sentisent_collection.aggregate([{'$sample': {'size': num_train + num_test}}])
    for index, d in enumerate(cursor):
        sent = Sentence.from_dict(d)
        all_samples.append((sent.words, sent.sentiment))
        aspect_dist[sent.aspect] += 1
        sentiment_dist[int(sent.sentiment)] += 1
    client.close()
    ################## show statistics
    for k in aspect_dist:
        print '[{}]: {}'.format(k, aspect_dist.freq(k))
    for k in sentiment_dist:
        print '[{}]: {}'.format(k, sentiment_dist.freq(k))
    ################## shuffle
    random.shuffle(all_samples)
    ################## split
    def __dump(filename, data):
        with open(filename, "wb") as outf:
            cPickle.dump(data, outf)
    __dump("sentidata_train_raw.pkl", all_samples[:num_train])
    __dump("sentidata_test_raw.pkl", all_samples[num_train:])