def extract_bigrams(self, text):
    """Return frequent word bigrams found in ``text``.

    The text is cleaned with ``self.remove_return_lines_and_quotes``,
    lower-cased and tokenized. English stop words, a handful of
    punctuation tokens and tokens of two characters or fewer are dropped.
    Bigrams that occur at least twice are ranked by PMI, at most 1000 of
    the best are considered, and a bigram is kept unless the POS tagger
    labels it with one of the excluded tags (verb forms, adverb,
    pronouns, preposition, determiner, conjunction).

    Args:
        text: Raw input text.

    Returns:
        A list of strings, each made of the two tokens of a bigram joined
        by a single space, in the order produced by the PMI ranking.
    """
    text = self.remove_return_lines_and_quotes(text)

    # Load the stop words once; a set makes the per-token membership test
    # O(1) instead of a linear scan over a list.
    stop = set(stopwords.words('english'))
    stop.update(
        ['(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...'])

    # NOTE(review): PorterStemmer.stem() is meant for a single word, but it
    # receives the whole text here, so presumably only the tail of the
    # string is affected. Kept as-is to preserve the current output;
    # confirm whether per-token stemming was intended.
    stemmed_text = PorterStemmer().stem(text)

    # Single pass: drop very short tokens and stop words.
    tokens = [
        token
        for token in nltk.word_tokenize(stemmed_text.lower())
        if len(token) > 2 and token not in stop
    ]

    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(2)  # ignore bigrams seen fewer than 2 times
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    top_bigrams = finder.nbest(bigram_measures.pmi, 1000)

    excluded_tags = {
        'VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ',
        'PRP', 'IN', 'DT', 'CC', 'PRP$',
    }

    bigrams = []
    for pair in top_bigrams:
        # NOTE(review): the joined bigram is tagged as one token, so the
        # tag describes the two-word string as a whole rather than either
        # word. Kept to preserve behavior; verify this is intended.
        phrase, tag = nltk.pos_tag([" ".join(pair)])[0]
        if tag not in excluded_tags:
            bigrams.append(phrase)
    return bigrams
评论列表
文章目录