import nltk
from nltk.corpus import stopwords
from string import punctuation

def preprocess(content):
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    words_set = []
    for tweet in content:
        words_set += word_tokenizer.tokenize(tweet['twitter_content'])
    # deduplicate the tokens collected from all tweets
    words_set = list(set(words_set))
    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    # keep only alphabetic words that are neither punctuation nor stopwords
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and (word not in non_words) and (word not in stop_words):
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))
    nltk_words_set = list(set(nltk.corpus.words.words()))
    # full training vocabulary: lemmatized tweet words plus the NLTK word corpus
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
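A minimal usage sketch, assuming each item in content is a dict with a 'twitter_content' key holding the raw tweet text (the sample tweets below are hypothetical); the stopwords, wordnet, and words corpora need to be downloaded once before calling preprocess:

# one-time downloads for the NLTK resources used above
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

# sample input shape assumed from the loop over content
sample_content = [
    {'twitter_content': 'Cats are running faster than dogs!'},
    {'twitter_content': 'I love natural language processing :)'},
]
vocabulary = preprocess(sample_content)
print(len(vocabulary))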