def tokenize(text):
    """Return the stemmed, lower-cased ASCII tokens of ``text``.

    ``text`` may be a byte string (decoded as UTF-8) or a text string.
    Every character that cannot be represented in ASCII is replaced by
    ``'?'``; the result is stripped and lower-cased before tokenizing.

    Tokens are either runs of word characters, apostrophes and hyphens
    (so a word like "don't" stays in one piece) or single punctuation
    characters. Each token is passed through ``porter.stem``. Stop words
    are not removed.

    NOTE(review): ``porter`` is not defined in this function; it is
    presumably a module-level stemmer object exposing ``stem`` - confirm.

    Returns a list of stemmed tokens (empty for empty/blank input).
    """
    # Bytes are decoded explicitly instead of relying on a bare ``except``;
    # undecodable bytes become U+FFFD, which the ASCII step turns into '?'.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'replace')
    # Round-trip through ASCII so the value stays text: the str regex below
    # cannot be applied to a bytes object.
    text = text.encode('ascii', 'replace').decode('ascii').strip().lower()
    # split punctuation into separate tokens, but keep apostrophes and
    # hyphens inside words
    return [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
# Example source code for the Python trigrams() function
def posTrigramsScore(trigrams,category,pos_tags_trigrams,labels):
    """Build a dictionary mapping each POS-tag trigram to its score.

    ``subList(pos_tags_trigrams, labels, category)`` is evaluated once and
    its result is handed, together with the full ``pos_tags_trigrams``, to
    ``score`` for every item of ``trigrams``.

    NOTE(review): ``subList`` presumably keeps only the POS-tag trigrams
    whose label matches ``category`` - confirm against its definition.

    Returns a dict keyed by trigram; a trigram that occurs more than once
    in ``trigrams`` keeps the score of its last occurrence.
    """
    #POS-tag trigrams restricted to the requested category
    in_category = subList(pos_tags_trigrams,labels,category)
    #one score per trigram
    return {
        gram: score(gram,category,in_category,pos_tags_trigrams)
        for gram in trigrams
    }
#calculate bigrams of every item of the list l
def getBigrams(l):
    """Return a list holding, for each item of ``l``, the list of its bigrams.

    Every item is passed to ``bigrams`` and the result is materialized
    with ``list``; the output preserves the order of ``l``.
    """
    return [list(bigrams(item)) for item in l]
#calculate trigrams of every item of the list l
def getTrigrams(l):
    """Return a list holding, for each item of ``l``, the list of its trigrams.

    Every item is passed to ``trigrams`` and the result is materialized
    with ``list``; the output preserves the order of ``l``.
    """
    return [list(trigrams(item)) for item in l]
#calculate pos tag score