def preprocess(tweet):
    """Run a raw tweet through the cleaning pipeline and return the result.

    The tweet is shallow-copied and lower-cased, then passed through each
    cleaning helper in a fixed order; every helper receives the value
    returned by the previous one.

    Args:
        tweet: The raw tweet. It must support ``.lower()``; presumably a
            ``str`` -- confirm against callers.

    Returns:
        Whatever ``remove_empty_sentences`` returns for the cleaned tweet.
        The inline notes suggest tokenized text -- TODO confirm against the
        helper definitions.
    """
    # NOTE(review): if ``tweet`` is always a ``str`` this shallow copy is
    # presumably redundant (strings are immutable); kept to preserve the
    # existing behavior for any other input type.
    text = copy.copy(tweet).lower()

    # The order of the steps is significant and mirrors the original code.
    # NOTE(review): compounds are split before links are removed -- verify
    # that hyphenated URLs are still recognized by ``remove_links``.
    pipeline = (
        # Remove some emoticons the TweetTokenizer does not know.
        remove_emoticons,
        # Split contractions like "he's" -> "he s", using the imported
        # contractions dictionary.
        split_contractions,
        # Split compounds like "next-level" -> "next level".
        split_compounds,
        # Remove links.
        remove_links,
        # Remove all special characters (per the original note, this step
        # yields tokenized text -- TODO confirm).
        remove_special_characters,
        # Drop empty sentences from the result.
        remove_empty_sentences,
    )
    for step in pipeline:
        text = step(text)
    return text
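# NOTE: the two lines that stood here ("comment list", "article table of
# contents") appear to be web-page navigation residue, not part of the code.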