def create_word_scores(posWords, negWords, posTag, negTag):
    import itertools
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures  # needed for chi_sq below
    posWords = list(itertools.chain(*posWords))  # flatten the nested positive word lists into one list
    negWords = list(itertools.chain(*negWords))  # same for the negative word lists
    word_fd = FreqDist()  # overall frequency of every word
    cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on the class tag
    for word in posWords:
        word_fd[word] += 1  # FreqDist.inc() from NLTK 2 is gone; increment with += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1
    pos_word_count = cond_word_fd[posTag].N()  # total tokens in the positive class
    neg_word_count = cond_word_fd[negTag].N()  # total tokens in the negative class
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():  # items(): iteritems() is Python 2 only
        # chi-squared association of the word with each class
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of both scores
    return word_scores  # dict: word -> informativeness score
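A minimal usage sketch (the nested token lists and the 'pos'/'neg' tags below are illustrative assumptions, not data from the original project):

pos = [['great', 'movie'], ['great', 'fun']]   # hypothetical tokenized positive docs
neg = [['awful', 'movie'], ['boring']]         # hypothetical tokenized negative docs
scores = create_word_scores(pos, neg, 'pos', 'neg')
# keep the most class-discriminative words as classifier features
best = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:1000]
print(best[:5])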
Example source code for Python's probability() module
def generate_squad_vocab(path, vocabulary_size=30000):
    import json
    import itertools
    from nltk.probability import FreqDist
    from nltk.tokenize import word_tokenize
    with open(path) as f:
        d = json.load(f)
    tokenized_sentences = []
    for reading in d['data']:
        for paragraph in reading['paragraphs']:
            sentence = paragraph['context'].lower()
            tokenized_sentences.append(word_tokenize(sentence))
            for question in paragraph['qas']:
                sentence = question['question'].lower()  # TODO: check later whether to add the answer as well
                tokenized_sentences.append(word_tokenize(sentence))
    word_freq = FreqDist(itertools.chain(*tokenized_sentences))
    print('total unique words:', len(word_freq))
    full_vocab = word_freq.most_common(len(word_freq))  # every word, most frequent first
    with open('vocab_full.txt', 'w') as vocab:
        for w in full_vocab:
            vocab.write(w[0] + '\t' + str(w[1]) + '\n')
    shortened_vocab = word_freq.most_common(vocabulary_size - 1)  # top words only
    with open('vocab.txt', 'w') as vocab:
        for w in shortened_vocab:
            vocab.write(w[0] + '\t' + str(w[1]) + '\n')
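A hypothetical invocation, assuming a local copy of the SQuAD training JSON (the file name below is a placeholder) and that NLTK's punkt tokenizer models are available:

import nltk
nltk.download('punkt')  # word_tokenize needs the punkt models
generate_squad_vocab('train-v1.1.json', vocabulary_size=30000)  # path is an assumption
# writes vocab_full.txt (all words) and vocab.txt (the top vocabulary_size-1 words)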
extractFeatures_org02.py (project: weibo_scrawler_app, author: coolspiderghy)
def create_word_scores(posWords, negWords):
    # imports for the corpus counting below
    import itertools
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures  # needed for chi_sq below
    def count_fd(valueWords, tag):
        Words = list(itertools.chain(*valueWords))  # flatten the nested word lists into one list
        word_fd = FreqDist()  # overall word frequencies
        cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on the class tag
        for word in Words:
            word_fd[word] += 1
            cond_word_fd[tag][word] += 1
        word_count = cond_word_fd[tag].N()  # total token count under this tag
        return word_fd, cond_word_fd, tag, word_count
"""
def count_fd(valueWords,tag):
Words[0] = list(itertools.chain(*valueWords)) #????????????
word_fd = FreqDist() #?????????
cond_word_fd = ConditionalFreqDist() #????????????????????
for word in Words[0]:
word_fd[word] += 1#word_fd.inc(word)
cond_word_fd[tag[0]][word]+= 1#cond_word_fd['pos'].inc(word)
for word in Words[1]:
word_fd[word] += 1#word_fd.inc(word)
cond_word_fd[tag[1]][word]+= 1#cond_word_fd['pos'].inc(word)
word_count[0] = cond_word_fd[tag[0]].N() #???
word_count[1] = cond_word_fd[tag[1]].N() #???
return word_fd,cond_word_fd,tag,word_count[0],word_count[1]
"""
    # total token count over both classes (index 3 of count_fd's return tuple is word_count)
    total_word_count = count_fd(posWords, 'pos')[3] + count_fd(negWords, 'neg')[3]
    # get word scores: chi-squared informativeness of every word within one class
    def all_word_scores(total_word_count, *args):  # args[0] is a count_fd(...) result tuple
        word_fd, cond_word_fd, tag, word_count = args[0][0], args[0][1], args[0][2], args[0][3]
        word_score = []
        for word, freq in word_fd.items():  # items(): iteritems() is Python 2 only
            # chi-squared association between the word and this class
            score = BigramAssocMeasures.chi_sq(cond_word_fd[tag][word], (freq, word_count), total_word_count)
            word_score.append((word, score))
        return word_score
    word_scores = {}
    for word_score in all_word_scores(total_word_count, count_fd(posWords, 'pos')):
        word_scores.setdefault(word_score[0], word_score[1])
    for word_score in all_word_scores(total_word_count, count_fd(negWords, 'neg')):
        word_scores.setdefault(word_score[0], word_score[1])
    return word_scores  # dict: word -> informativeness score
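A minimal, hypothetical call (the token lists are made up; in weibo_scrawler_app the inputs would come from tokenized posts):

pos = [['happy', 'day'], ['happy', 'great']]  # hypothetical positive documents
neg = [['sad', 'day'], ['terrible']]          # hypothetical negative documents
scores = create_word_scores(pos, neg)
top_words = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:100]

Note that, unlike the first create_word_scores above, this version records only one score per word: setdefault keeps the positive-class score for words that occur in both classes instead of summing the two.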