Example source code for the Python class FreqDist()
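
The snippets below are collected from different open-source projects and show nltk.FreqDist() in a range of settings: corpus statistics, language detection, vocabulary building, and feature extraction. As a quick orientation, here is a minimal, self-contained sketch of the FreqDist calls those snippets rely on; the sample sentence is illustrative, and nltk.word_tokenize assumes the NLTK tokenizer data has already been downloaded.

import nltk

tokens = nltk.word_tokenize("the quick brown fox jumps over the lazy dog")
fdist = nltk.FreqDist(tokens)

print(fdist['the'])          # raw count of a single token
print(fdist.most_common(3))  # [(token, count), ...] for the 3 most frequent tokens
print(fdist.N())             # total number of tokens counted
print(fdist.freq('the'))     # relative frequency, i.e. count / N()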

corpus.py (project: minke, author: DistrictDataLabs)
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in self._sent_tokenizer.tokenize(para):
                counts['sents'] += 1

                for word in self._word_tokenizer.tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        }
corpus.py (project: minke, author: DistrictDataLabs)
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in para:
                counts['sents'] += 1

                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        }
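
Assuming a corpus reader that mixes in the describe() method above (the reader class name and path below are hypothetical stand-ins for the ones minke actually provides), the returned dictionary can be consumed roughly like this:

# Hypothetical reader and path; minke defines its own corpus reader classes.
reader = MyCorpusReader('/path/to/corpus')
stats = reader.describe()
print("{files} files, {words} words, vocabulary of {vocab}".format(**stats))
print("lexical diversity: {lexdiv:0.3f}".format(**stats))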
text_tools.py (project: QProb, author: quant-trade)
def scoreFunction(wholetext):
    """Get text, find most common words and compare with known
    stopwords. Return dictionary of values"""

    dictiolist = {}
    scorelist = {}
    # These are the available languages with stopwords from NLTK
    NLTKlanguages=["dutch","finnish","german","italian", "portuguese",
        "spanish","turkish","danish","english", "french","hungarian",
        "norwegian","russian","swedish"]

    # Placeholder for additional stopword lists not shipped with NLTK
    FREElanguages = []
    languages = NLTKlanguages + FREElanguages

    # Fill the dictionary of languages, to avoid unnecessary function calls
    for lang in NLTKlanguages:
        dictiolist[lang] = stopwords.words(lang)

    # Split all the text in tokens and convert to lowercase. In a
    # decent version of this, I'd also clean the unicode
    tokens = word_tokenize(wholetext)
    tokens = [t.lower() for t in tokens]

    # Determine the frequency distribution of words, looking for the
    # most common words
    freq_dist = FreqDist(tokens)

    # This is the only interesting piece, and not by much. Pick a
    # language and check whether each of the 20 most common words is in
    # that language's stopwords. Add 1 to the language for each word
    # matched, so the maximal score is 20. Why 20? No specific reason;
    # it just looks like a good number of words.
    for lang in languages:
        scorelist[lang] = 0
        # most_common(20) replaces the old freq_dist.keys()[0:20] idiom,
        # which relied on NLTK 2.x keys() being sorted by frequency
        for word, _ in freq_dist.most_common(20):
            if word in dictiolist[lang]:
                scorelist[lang] += 1
    return scorelist
word_cluster.py (project: PolBotCheck, author: codeforfrankfurt)
def calc_frequencies(words, words_n=50, lang='german'):
    words = [word for word in words if len(word) > 1]
    words = [word for word in words if not word.isnumeric()]
    words = [word.lower() for word in words]
    # words = [word for word in words if word not in all_stopwords]
    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer(lang)
    # words = [stemmer.stem(word) for word in words]

    fdist = nltk.FreqDist(words)
    return fdist.most_common(words_n)
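
A short usage sketch for calc_frequencies; the German sentence is illustrative, and nltk.word_tokenize again assumes the NLTK tokenizer data is available.

# Illustrative call; any pre-tokenized word list works here.
words = nltk.word_tokenize("Der schnelle braune Fuchs springt über den faulen Hund",
                           language='german')
print(calc_frequencies(words, words_n=5, lang='german'))  # [(word, count), ...]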
religion_project.py (project: religion_project, author: 000384832)
def occurencecount():

    # Ask the user to input a word
    word = raw_input("Enter a word : ")

    # Create a list of file which we will be looking into for matches
    fileList = ['Text1.txt', 'Text2.txt', 'Text3.txt', 'Text4.txt']

    # Open the files one by one, read them and find the occurrence count inside each file
    for filename in fileList:

        # Open the file
        fp_text = codecs.open(filename, 'r', 'utf-8')

        # Read all the words inside the file
        words_text = word_tokenize(fp_text.read())

        # Find the number of occurrences of each word using NLTK's built-in FreqDist
        fd_text = FreqDist(words_text)

        # Print out the number of occurrences of that specific word
        print("Number of occurrences in " + filename + " : " + str(fd_text[word]))
app.py (project: may142016, author: ftrain)
def get_words(tweets):
    """Given a set of tweets, return the most frequently-used words."""
    tweets = filter(lambda x: not(x.is_rt), tweets)
    tokenized = [nltk.word_tokenize(handle_strip(t.tweet_text))
                 for t in tweets]
    words = [item for sublist in tokenized for item in sublist]
    longwords = filter(lambda x: len(x) > 6, words)
    lcwords = map(lambda x: x.lower(), longwords)
    fdist = nltk.FreqDist(lcwords)
    common = fdist.most_common(100)
    common = filter(lambda x: x[1] > 4, common)
    common = map(lambda x: [x[0], 6 + int(x[1]/3)], common)
    return common
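
get_words is written in the Python 2 idiom where filter and map return lists; under Python 3 they return lazy iterators, so an equivalent comprehension-based version would look roughly like the sketch below (handle_strip is the same helper the original relies on).

def get_words_py3(tweets):
    """Python-3-friendly variant of get_words using comprehensions."""
    tweets = [t for t in tweets if not t.is_rt]
    words = [w for t in tweets
             for w in nltk.word_tokenize(handle_strip(t.tweet_text))]
    lcwords = [w.lower() for w in words if len(w) > 6]
    fdist = nltk.FreqDist(lcwords)
    common = [(w, c) for w, c in fdist.most_common(100) if c > 4]
    return [[w, 6 + c // 3] for w, c in common]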
analysis.py (project: tweet_analyzer, author: atandy)
def make_data(file_name):
    '''Returns Tuple of dataframes used in analysis:
    core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df'''
    #realDonaldTrump_master_tweet_list.json

    #TODO: fix so strings aren't written to file and we can just load it as json.
    with open(file_name) as tfile:
        lines = tfile.readlines()
    raw_tweets_data =  [eval(t) for t in lines]

    analyzer = TextAnalyzer(raw_tweets_data)
    english_stopwords = stopwords.words("english")

    core_tweet_df = analyzer.make_tweet_df(
        with_pos_tags=False,
        columns_to_filter=['id', 'created_at', 'text', 'retweet_count', 'favorite_count'])

    # get list of tweets as text
    tweets_list = core_tweet_df.text.tolist()
    pos_df = analyzer.make_pos_df(tweets_list, make_csv=False)
    adj_df = pos_df[pos_df.pos_tag=='JJ']
    adj_df = analyzer.make_word_frequency_df(adj_df, 'word', make_csv=False)

    # Calculate word frequencies among all words in the data set. Can't merge with the
    # POS data because certain words have many parts of speech.
    word_frequency_df = analyzer.make_word_frequency_df(pos_df, 'word', make_csv=False)


    #Most common hashtags and total unique hashtags.
    all_hashtags = []
    for i in raw_tweets_data:
        all_hashtags.extend([d['text'] for d in i['entities']['hashtags']])
    fd = FreqDist(all_hashtags)

    hash_df = pd.DataFrame([
        {'hashtag': x,
         'abs_frequency': y,
         'rel_frequency_pct': float(y) / len(all_hashtags) * 100}
        for x, y in fd.most_common()])

    return core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df
quotes.py (project: GLaDOS2, author: TheComet)
async def zipf(self, message, users):
        source_user = message.author.name
        source_user = source_user.strip('@').split('#')[0]

        target_users = [user.strip('@').split('#')[0] for user in users.split()]
        if len(users) == 0:
            target_users = [source_user]

        if users == '*':
            if message.server is not None:
                target_users = [member.name for member in message.server.members]
        target_users = [user for user in target_users if self.check_nickname_valid(user.lower()) is None]

        image_file_name = self.quotes_file_name(source_user.lower())[:-4] + '.png'
        pylab.title('Word frequencies')
        for user in target_users:
            quotes_file = codecs.open(self.quotes_file_name(user.lower()), 'r', encoding='utf-8')
            lines = quotes_file.readlines()
            quotes_file.close()

            if len(lines) < 20:
                continue

            tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
            tokens = self.filter_to_english_words(tokenizer.tokenize(str(lines)))
            if len(tokens) < 200:
                continue
            freq = nltk.FreqDist(tokens)
            self.plot_word_frequencies(freq, user)

        pylab.legend()
        pylab.savefig(image_file_name)
        pylab.gcf().clear()

        await self.client.send_file(message.channel, image_file_name)
data.py (project: augmented_seq2seq, author: suriyadeepan)
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist
data.py (project: augmented_seq2seq, author: suriyadeepan)
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    vocab = [ item for item in vocab if item[1] > 1 ]
    # index2word
    index2word = ['_'] + ['UNK'] + list(POS_TAGS.keys()) + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist
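
A minimal usage sketch for this second index_ variant; it needs the module-level POS_TAGS mapping from data.py, so the empty placeholder below is an assumption, as is the toy corpus.

# Placeholder for the POS_TAGS mapping that data.py defines elsewhere.
POS_TAGS = {}

tokenized_sentences = [['hello', 'world'], ['hello', 'again'], ['hello', 'world']]
index2word, word2index, freq_dist = index_(tokenized_sentences, vocab_size=1000)
print(index2word)              # ['_', 'UNK', 'hello', 'world'] -- 'again' occurs only once
print(word2index['hello'])     # 2
print(freq_dist.most_common(2))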
topics.py (project: PyTrafficCar, author: liyuming1978)
def test():
    global N, words, network

    print 'In testing.'

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer(r'\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = list(network.forward(V).w)
    topics = []
    while len(topics) != 5:
        # Take the index of the strongest remaining activation. Masking it out
        # (instead of deleting it) keeps pred aligned with the words list.
        topic_idx = max(range(len(pred)), key=lambda i: pred[i])
        topic = words[topic_idx]

        if topic in gettysburg_tokens:
            topics.append(topic)

        pred[topic_idx] = float('-inf')

    print 'Topics of the Gettysburg Address:'
    print topics
SVM.py (project: codenn, author: sriniiyer)
def getFeat(self, line):
        listItem = [0]*self.noFeat
        fileFreqDist = nltk.FreqDist(SVM.tokenize(line))

        for i, key in enumerate(self.trainKeys):
            # 'key in fileFreqDist' replaces the Python-2-only has_key() call
            if key in fileFreqDist:
                listItem[i] = fileFreqDist[key]
        return listItem
print_english_words.py (project: adversarial-squad, author: robinjia)
def main():
  freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
  vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
  for w in vocab:
    print w
mdc_generator.py (project: DeepBot, author: IgorWang)
def take_some_analysis(file_dir):
    context_length = []
    utterance_length = []

    dist = nltk.FreqDist()

    for c, u in utterance_generator(file_dir):
        c_tokens = nltk.word_tokenize(c)
        u_tokens = nltk.word_tokenize(u)
        # record the token lengths of the context and the utterance
        context_length.append(len(c_tokens))
        utterance_length.append(len(u_tokens))

        dist.update(c_tokens + u_tokens)

    cl_array = np.array(context_length)
    ul_array = np.array(utterance_length)

    print("most length of context is %d" % cl_array.max())
    print("most length of utterance is %d" % ul_array.max())
    print("mean length of context is %f" % cl_array.mean())
    print("mean length of utterance is %f" % ul_array.mean())

    sub_abs = np.abs(cl_array - ul_array)
    print("max,min,mean of abs(context_length -utterance_length) is %f,%f,%f" % (
        np.max(sub_abs), np.min(sub_abs), np.mean(sub_abs)))

    print("most common words :")
    print(dist.most_common(10))
preprocess.py (project: RNNPythonTutorial, author: eublefar)
def preprocess_data(self):
        # Read the data and append SENTENCE_START and SENTENCE_END tokens
        print "Reading CSV file..."
        with open('data/reddit-comments-2015-08.csv', 'rb') as f:
            reader = csv.reader(f, skipinitialspace=True)
            reader.next()
            # Split full comments into sentences
            sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
            # Append SENTENCE_START and SENTENCE_END
            sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
        print "Parsed %d sentences." % (len(sentences))

        # Tokenize the sentences into words
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print "Found %d unique words tokens." % len(word_freq.items())

        # Get the most common words and build index_to_word and word_to_index vectors
        vocab = word_freq.most_common(self.vocabulary_size-1)
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(unknown_token)
        self.word_to_index = dict([(w,i) for i,w in enumerate(self.index_to_word)])

        print "Using vocabulary size %d." % self.vocabulary_size
        print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in self.word_to_index else unknown_token for w in sent]

        print "\nExample sentence: '%s'" % sentences[0]
        print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

        # Create the training data
        #tokenized_words = [item for sublist in tokenized_sentences for item in sublist]
        #self.X_train = np.asarray([self.word_to_index[w] for w in tokenized_words[:-1]])
        #self.Y_train = np.asarray([self.word_to_index[w] for w in tokenized_words[1:]])
        self.X_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        self.Y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
TrainFunctions.py (project: truecaser, author: nreimers)
def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is for example all uppercase, it is recjected"""
    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    if caseDist.most_common(1)[0][0] != 'allLower':        
        return False

    return True
sentiment.py (project: RottenCrawler, author: kevin940726)
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
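
This snippet follows an old tutorial pattern from NLTK 2.x, where FreqDist.keys() came back sorted by descending frequency; on current NLTK versions keys() is just insertion-ordered, so a frequency-ordered feature list would look more like the following sketch.

def get_word_features_sorted(wordlist):
    # Explicitly frequency-ordered variant for current NLTK versions.
    fdist = nltk.FreqDist(wordlist)
    return [word for word, count in fdist.most_common()]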
keyphrase_extraction.py (project: text-analytics-with-python, author: dipanjanS)
def get_top_ngrams(corpus, ngram_val=1, limit=5):

    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)

    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(), 
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) 
                     for text, freq in sorted_ngrams]

    return sorted_ngrams
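
get_top_ngrams depends on two helpers, flatten_corpus and compute_ngrams, that are not part of this excerpt; a minimal sketch of what they might look like follows (only the names and call signatures come from the snippet above, the bodies are assumptions).

import nltk

def flatten_corpus(corpus):
    # Join a list of documents into a single text string.
    return ' '.join(corpus)

def compute_ngrams(tokens, ngram_val):
    # nltk.ngrams yields tuples of ngram_val consecutive tokens.
    return list(nltk.ngrams(tokens, ngram_val))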
extract_samples_for_sentiments.py (project: OpinionMining728, author: stasi009)
def sample_split(dbname,num_train,num_test):
    client = MongoClient()
    db = client[dbname]
    sentisent_collection = db.sentiment_sentences

    ################## load and count
    aspect_dist = nltk.FreqDist()
    sentiment_dist = nltk.FreqDist()

    all_samples = []
    cursor = sentisent_collection.aggregate([ { '$sample': { 'size': num_train  + num_test } } ])
    for index,d in enumerate(cursor):
        sent = Sentence.from_dict(d)
        all_samples.append( (sent.words,sent.sentiment) )

        aspect_dist[sent.aspect] +=1
        sentiment_dist[int(sent.sentiment)] +=1
    client.close()

    ################## show statistics
    for k in aspect_dist:
        print '[{}]: {}'.format(k,aspect_dist.freq(k))

    for k in sentiment_dist:
        print '[{}]: {}'.format(k,sentiment_dist.freq(k))

    ################## shuffle
    random.shuffle(all_samples)

    ################## split
    def __dump(filename,data):
        with open(filename,"wb") as outf:
            cPickle.dump(data,outf)

    __dump("sentidata_train_raw.pkl",all_samples[:num_train])
    __dump("sentidata_test_raw.pkl",all_samples[num_train:])

