Python PorterStemmer() class usage examples
def __init__(self, ignore_stopwords=False):
    # Initialise both parents: the language-specific stemmer and the original Porter stemmer.
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
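This constructor simply delegates to NLTK's Porter implementation. For reference, a minimal sketch of using the stemmer directly through the public nltk.stem API:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# The Porter algorithm strips common suffixes; results are stems, not dictionary words.
print(stemmer.stem("running"))   # 'run'
print(stemmer.stem("flies"))     # 'fli'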
def bag_of_words(list_of_strings, remove_puncs=True, remove_digits=True, remove_alnums=True):
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()
    # Empty bag of words
    bag_of_words = []
    # Tokenize each string (custom_tokenizer is a project-local helper)
    for string in tqdm(list_of_strings):
        string_tokens = custom_tokenizer(string, remove_puncs=remove_puncs, get_unique=True)
        bag_of_words.extend(string_tokens)
    # Filter tokens: keep alphabetic tokens only, or just drop numbers (isNumber is a project-local helper)
    if remove_alnums:
        bag_of_words = [bag for bag in bag_of_words if bag.isalpha()]
    elif remove_digits:
        bag_of_words = [bag for bag in bag_of_words if not isNumber(bag)]
    bag_of_words.sort()
    # Stem and lemmatize the data
    bag_of_words_stemmed = []
    for word in bag_of_words:
        try:
            bag_of_words_stemmed.append(porter.stem(lmtz.lemmatize(word)))
        except Exception:
            bag_of_words_stemmed.append(word)
    bag_of_words = list(bag_of_words_stemmed)
    # Remove stop words
    stop = set(stopwords.words('english'))
    print('Removing stop words...')
    bag_of_words = [bag.strip().lower() for bag in bag_of_words if bag.strip().lower() not in stop]
    bow_counter = Counter(bag_of_words)
    bow_counter = OrderedDict(sorted(bow_counter.items()))
    return bow_counter
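For context, a minimal, self-contained sketch of the same pipeline (tokenize, lemmatize, stem, drop stop words, count), using nltk.word_tokenize in place of the project-specific custom_tokenizer helper:

from collections import Counter, OrderedDict
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def tiny_bag_of_words(strings):
    porter, lmtz = PorterStemmer(), WordNetLemmatizer()
    stop = set(stopwords.words('english'))
    words = []
    for s in strings:
        # Keep alphabetic tokens only, mirroring the remove_alnums branch above
        words.extend(t for t in word_tokenize(s.lower()) if t.isalpha())
    stemmed = [porter.stem(lmtz.lemmatize(w)) for w in words]
    counts = Counter(w for w in stemmed if w not in stop)
    return OrderedDict(sorted(counts.items()))

print(tiny_bag_of_words(["The cats are running.", "A cat ran home."]))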
Source file: lda_model_calculator.py (project: moviegeek, author: practical-recommender-systems)
def build_lda_model(self, data, docs, n_topics=5):
    texts = []
    tokenizer = RegexpTokenizer(r'\w+')
    for d in data:
        raw = d.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = self.remove_stopwords(tokens)
        stemmed_tokens = stopped_tokens
        # stemmer = PorterStemmer()
        # stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]
        texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                         num_topics=n_topics)
    index = similarities.MatrixSimilarity(corpus)
    self.save_lda_model(lda_model, corpus, dictionary, index)
    self.save_similarities(index, docs)
    return dictionary, texts, lda_model
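The gensim calls above are standard; here is a standalone sketch of the same dictionary → bag-of-words → LdaModel → MatrixSimilarity flow on toy documents (assumes a gensim 3.x-style API):

from gensim import corpora, models, similarities
from nltk.tokenize import RegexpTokenizer

docs = ["the cat sat on the mat", "dogs and cats are pets", "the stock market fell today"]
tokenizer = RegexpTokenizer(r'\w+')
texts = [tokenizer.tokenize(d.lower()) for d in docs]

dictionary = corpora.Dictionary(texts)               # token <-> id mapping
corpus = [dictionary.doc2bow(t) for t in texts]      # bag-of-words vectors
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(corpus, num_features=len(dictionary))

# Topic mixture for the first document and its similarity to every document in the index.
print(lda[corpus[0]])
print(list(index[corpus[0]]))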
def extract_bigrams(self, text):
    text = self.remove_return_lines_and_quotes(text)
    bigrams = []
    st = PorterStemmer()
    more_stop_words = [
        '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
    stop = stopwords.words('english') + more_stop_words
    # Note: stem() is applied to the whole text here, and the result is then tokenized.
    tokens = st.stem(text)
    tokens = nltk.word_tokenize(tokens.lower())
    tokens = [i for i in tokens if i not in stop]
    tokens = [word for word in tokens if len(word) > 2]
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(2)
    top_bigrams = finder.nbest(bigram_measures.pmi, 1000)
    for bg in top_bigrams:
        bg = " ".join(bg)
        tag = nltk.pos_tag([bg])[0]
        if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
            bigrams.append(tag[0])
    return bigrams
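The collocation step on its own, as a self-contained sketch of PMI-ranked bigram extraction with nltk.collocations (the method above adds stemming, stop-word and POS filtering around it):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = ("new york is a big city and new york has many people "
          "because the big city never sleeps").split()
measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)               # keep bigrams that occur at least twice
print(finder.nbest(measures.pmi, 5))      # e.g. [('new', 'york'), ('big', 'city'), ...]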
Source file: build_wiki_classifier.py (project: wikipedia_classifier, author: LouisFoucard)
def stem_tokens(tokens, stemmer=PorterStemmer()):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
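Example usage, assuming PorterStemmer has been imported from nltk.stem as in the snippets above:

from nltk.stem import PorterStemmer

print(stem_tokens(["running", "flies", "caresses"]))   # ['run', 'fli', 'caress']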
def k_tokenizer(text):
    # Drop non-ASCII characters, then strip hyphens.
    text = text.encode('ascii', errors='ignore').decode('ascii').replace('-', '')
    # We should use a better way to remove non-English words.
    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)
    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]
    # Merge multi-word expressions such as "ios 9" into single tokens.
    mwe_tokenizer = MWETokenizer([('ios', '9'), ])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)
    # We might want to tokenize by sentence, tag each sentence, and aggregate the results.
    # train -> train_NN train_V
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc = []
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)
    # porter = PorterStemmer()
    # final_doc = []
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))
    return final_doc
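The POS-aware lemmatization at the heart of k_tokenizer can be shown in isolation; a minimal sketch that maps Penn Treebank tags to WordNet POS tags before lemmatizing (requires the NLTK tagger and WordNet data):

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def to_wordnet_pos(treebank_tag):
    # Default to NOUN, as the function above does for unknown tags.
    return {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}.get(treebank_tag[:1], wordnet.NOUN)

lemmatizer = WordNetLemmatizer()
tagged = nltk.pos_tag(["the", "trains", "were", "running", "late"])
print([lemmatizer.lemmatize(tok, to_wordnet_pos(tag)) for tok, tag in tagged])
# With verb POS information, 'running' lemmatizes to 'run' and 'were' to 'be'.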
def get_encoded_vector(list_of_words, new_string):
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()
    # Make sure the special tokens exist in the vocabulary
    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')
    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')
    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')
    tokens = text_to_word_sequence(new_string, lower=True, split=" ")
    # Stem and lemmatize the data
    token_stemmed = []
    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except Exception:
            token_stemmed.append(token)
    tokens = list(token_stemmed)
    out = []
    all_unknown_words = True
    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)
    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out
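A hypothetical usage sketch, assuming get_encoded_vector and its dependencies (NLTK's PorterStemmer/WordNetLemmatizer and Keras's text_to_word_sequence) are importable; the vocabulary below is invented purely for illustration:

vocab = ['cat', 'dog', 'run']   # hypothetical, already stemmed/lemmatized vocabulary
encoded = get_encoded_vector(vocab, "The dogs were running")
print(encoded)
# Each entry is an index into vocab (extended with START_SEQ / UNKNOWN_WORDS / END_SEQ),
# so the vector starts with the START_SEQ index and ends with the END_SEQ index.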