def preprocessor_data(data, ids, test=0):
    """Lower-case, tokenize, stop-word-filter and stem labelled text lines.

    Every non-blank line is split on whitespace; the first token is parsed
    as an integer label and the remaining tokens are treated as words.
    Words found in the English stop-word list are dropped and the rest are
    reduced to their stems.

    Args:
        data: Iterable of strings, one labelled example per item, in the
            form ``"<label> <word> <word> ..."``.
        ids: Mapping that is indexed with every kept stem when
            ``test == 0``. The looked-up value is discarded, so this only
            has an effect if the mapping creates entries on access.
            NOTE(review): presumably a ``defaultdict``-style vocabulary
            index -- confirm against the caller. A plain ``dict`` would
            raise ``KeyError`` on the first unseen stem.
        test: ``0`` (the default) to index ``ids`` with each kept stem;
            any other value leaves ``ids`` untouched.

    Returns:
        A tuple ``(data_in_preprocessed, labels)`` of two parallel lists:
        the list of stems kept for each line, and the integer label of
        each line. Blank lines contribute to neither list.

    Raises:
        ValueError: If the first token of a line is not an integer.
    """
    # NOTE(review): `stopwords` and `stem` are expected to be provided by
    # the surrounding module (they look like nltk.corpus.stopwords and
    # nltk.stem) -- confirm against the file's imports.
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()
    register_ids = test == 0

    data_in_preprocessed = []
    labels = []
    for line in data:
        # str.lower() returns a new string, so its result has to be used;
        # lower-casing first lets capitalised stop words match the list.
        tokens = line.lower().split()
        if not tokens:
            # Blank / whitespace-only line: there is no label to parse.
            continue

        label, *words = tokens
        labels.append(int(label))

        stems = []
        for word in words:
            if word in stopwords_set:
                continue
            stemmed = stemmer.stem(word)
            if register_ids:
                # Lookup kept only for its possible side effect on `ids`
                # (see the docstring); the value itself is not needed.
                ids[stemmed]
            stems.append(stemmed)
        data_in_preprocessed.append(stems)

    return data_in_preprocessed, labels
评论列表
文章目录