def create_word_scores(posWords, negWords, posTag, negTag):
    import itertools
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures  # needed for chi_sq below
    posWords = list(itertools.chain(*posWords))  # flatten the nested positive word lists into one list
    negWords = list(itertools.chain(*negWords))  # same for the negative word lists
    word_fd = FreqDist()  # overall frequency of every word
    cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on the class tag
    for word in posWords:
        word_fd[word] += 1  # FreqDist.inc() from NLTK 2 is gone; increment with += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1
    pos_word_count = cond_word_fd[posTag].N()  # total tokens in the positive class
    neg_word_count = cond_word_fd[negTag].N()  # total tokens in the negative class
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():  # items(): iteritems() is Python 2 only
        # chi-squared association of the word with each class
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of both scores
    return word_scores  # dict: word -> informativeness score
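A minimal usage sketch (the nested token lists and the 'pos'/'neg' tags below are illustrative assumptions, not data from the original project):

pos = [['great', 'movie'], ['great', 'fun']]   # hypothetical tokenized positive docs
neg = [['awful', 'movie'], ['boring']]         # hypothetical tokenized negative docs
scores = create_word_scores(pos, neg, 'pos', 'neg')
# keep the most class-discriminative words as classifier features
best = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:1000]
print(best[:5])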
Example source code for Python's probability() module
def generate_squad_vocab(path, vocabulary_size=30000):
    import json
    import itertools
    from nltk.probability import FreqDist
    from nltk.tokenize import word_tokenize
    with open(path) as f:
        d = json.load(f)
    tokenized_sentences = []
    for reading in d['data']:
        for paragraph in reading['paragraphs']:
            sentence = paragraph['context'].lower()
            tokenized_sentences.append(word_tokenize(sentence))
            for question in paragraph['qas']:
                sentence = question['question'].lower()  # TODO: check later whether to add the answer as well
                tokenized_sentences.append(word_tokenize(sentence))
    word_freq = FreqDist(itertools.chain(*tokenized_sentences))
    print('total unique words:', len(word_freq))
    full_vocab = word_freq.most_common(len(word_freq))  # every word, most frequent first
    with open('vocab_full.txt', 'w') as vocab:
        for w in full_vocab:
            vocab.write(w[0] + '\t' + str(w[1]) + '\n')
    shortened_vocab = word_freq.most_common(vocabulary_size - 1)  # top words only
    with open('vocab.txt', 'w') as vocab:
        for w in shortened_vocab:
            vocab.write(w[0] + '\t' + str(w[1]) + '\n')
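A hypothetical invocation, assuming a local copy of the SQuAD training JSON (the file name below is a placeholder) and that NLTK's punkt tokenizer models are available:

import nltk
nltk.download('punkt')  # word_tokenize needs the punkt models
generate_squad_vocab('train-v1.1.json', vocabulary_size=30000)  # path is an assumption
# writes vocab_full.txt (all words) and vocab.txt (the top vocabulary_size-1 words)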
extractFeatures_org02.py (project: weibo_scrawler_app, author: coolspiderghy)
def create_word_scores(posWords, negWords):
    # imports for the corpus counting below
    import itertools
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures  # needed for chi_sq below
    def count_fd(valueWords, tag):
        Words = list(itertools.chain(*valueWords))  # flatten the nested word lists into one list
        word_fd = FreqDist()  # overall word frequencies
        cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on the class tag
        for word in Words:
            word_fd[word] += 1
            cond_word_fd[tag][word] += 1
        word_count = cond_word_fd[tag].N()  # total token count under this tag
        return word_fd, cond_word_fd, tag, word_count
"""
def count_fd(valueWords,tag):
Words[0] = list(itertools.chain(*valueWords)) #????????????
word_fd = FreqDist() #?????????
cond_word_fd = ConditionalFreqDist() #????????????????????
for word in Words[0]:
word_fd[word] += 1#word_fd.inc(word)
cond_word_fd[tag[0]][word]+= 1#cond_word_fd['pos'].inc(word)
for word in Words[1]:
word_fd[word] += 1#word_fd.inc(word)
cond_word_fd[tag[1]][word]+= 1#cond_word_fd['pos'].inc(word)
word_count[0] = cond_word_fd[tag[0]].N() #???
word_count[1] = cond_word_fd[tag[1]].N() #???
return word_fd,cond_word_fd,tag,word_count[0],word_count[1]
"""
    # total token count over both classes (index 3 of count_fd's return tuple is word_count)
    total_word_count = count_fd(posWords, 'pos')[3] + count_fd(negWords, 'neg')[3]
    # get word scores: chi-squared informativeness of every word within one class
    def all_word_scores(total_word_count, *args):  # args[0] is a count_fd(...) result tuple
        word_fd, cond_word_fd, tag, word_count = args[0][0], args[0][1], args[0][2], args[0][3]
        word_score = []
        for word, freq in word_fd.items():  # items(): iteritems() is Python 2 only
            # chi-squared association between the word and this class
            score = BigramAssocMeasures.chi_sq(cond_word_fd[tag][word], (freq, word_count), total_word_count)
            word_score.append((word, score))
        return word_score
    word_scores = {}
    for word_score in all_word_scores(total_word_count, count_fd(posWords, 'pos')):
        word_scores.setdefault(word_score[0], word_score[1])
    for word_score in all_word_scores(total_word_count, count_fd(negWords, 'neg')):
        word_scores.setdefault(word_score[0], word_score[1])
    return word_scores  # dict: word -> informativeness score
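A minimal, hypothetical call (the token lists are made up; in weibo_scrawler_app the inputs would come from tokenized posts):

pos = [['happy', 'day'], ['happy', 'great']]  # hypothetical positive documents
neg = [['sad', 'day'], ['terrible']]          # hypothetical negative documents
scores = create_word_scores(pos, neg)
top_words = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:100]

Note that, unlike the first create_word_scores above, this version records only one score per word: setdefault keeps the positive-class score for words that occur in both classes instead of summing the two.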