def preprocessor_data(data, ids, test=0):
    """Turn labelled text lines into stemmed token lists plus integer labels.

    Each line is lower-cased and split on whitespace. The first token is
    parsed as the integer label; the remaining tokens are filtered against
    the English stopword list and stemmed with the Lancaster stemmer.

    Args:
        data: Iterable of strings shaped like ``"<int label> <word> ..."``.
        ids: Mapping that is indexed with every kept stem when ``test == 0``.
            NOTE(review): presumably a defaultdict-style vocabulary that
            registers an entry on first access -- confirm against the
            caller; a plain dict would raise KeyError for an unseen stem.
        test: When equal to 0 (the default) each kept stem is looked up in
            ``ids``; any other value leaves ``ids`` untouched.

    Returns:
        A ``(data_in_preprocessed, labels)`` tuple: one list of stems per
        input line, and the matching list of integer labels.

    Raises:
        IndexError: If a line contains no tokens at all.
        ValueError: If the first token of a line is not an integer.
    """
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()
    register_ids = test == 0
    data_in_preprocessed = []
    labels = []
    for line in data:
        # str.lower() returns a new string; the result must be kept, otherwise
        # capitalised stopwords ("The", "And", ...) slip past the filter below.
        tokens = line.lower().split()
        label, words = tokens[0], tokens[1:]
        labels.append(int(label))
        words_preprocessed = []
        for word in words:
            if word in stopwords_set:
                continue
            lemmatized = stemmer.stem(word)
            if register_ids:
                # Bare lookup kept on purpose: it is the only interaction
                # with ``ids`` in this function (see the Args note).
                ids[lemmatized]
            words_preprocessed.append(lemmatized)
        data_in_preprocessed.append(words_preprocessed)
    return data_in_preprocessed, labels
# Example source code using the Python class LancasterStemmer()
def getFeature(word_list):
    """Count occurrences of each Lancaster stem among the non-stopwords.

    Args:
        word_list: Iterable of word strings.

    Returns:
        A plain dict mapping each stem to the number of times it was
        produced by a word for which ``isStopWords`` returned a falsy value.
    """
    lancaster = stem.LancasterStemmer()
    counts = {}
    for token in word_list:
        if isStopWords(token):
            continue
        key = lancaster.stem(token)
        counts[key] = counts.get(key, 0) + 1
    return counts
def stem_text(text):
    """Return *text* with every token replaced by its Lancaster stem.

    The text is tokenized with ``tokenize_text``, each token is stemmed,
    and the stems are joined back together with single spaces.
    """
    from nltk.stem import LancasterStemmer

    stemmer = LancasterStemmer()
    return ' '.join(stemmer.stem(tok) for tok in tokenize_text(text))
def preprocessor_words(words):
    """Drop English stopwords from *words* and stem the remainder.

    Args:
        words: Iterable of word strings.

    Returns:
        A list of Lancaster stems, in input order, for every word that is
        not in the English stopword list.
    """
    english_stopwords = set(stopwords.words('english'))
    lancaster = stem.LancasterStemmer()
    return [
        lancaster.stem(token)
        for token in words
        if token not in english_stopwords
    ]