def preprocess(post):
    """Normalize a raw post into a single-spaced, ASCII-only string.

    Example::

        {(romeo and juliet 2013),(romeo and juliet),(douglas booth),(hailee steinfeld)}
        -> romeo and juliet 2013 romeo and juliet douglas booth hailee steinfeld

    The steps run in this order:

    1. Everything matched by ``PUNCTUATION`` is replaced with a space.
    2. Everything matched by ``EMOTICON`` is replaced with the placeholder
       token ``_emoticon_``, padded with spaces on both sides.
    3. The text is transliterated with ``unidecode``.
    4. Everything matched by ``WHITESPACE`` is replaced with one space and
       the ends are stripped.

    ``PUNCTUATION``, ``EMOTICON`` and ``WHITESPACE`` are defined outside this
    function; they are presumably compiled regular expressions (only their
    ``.sub`` method is used here) -- verify against their definitions.

    Args:
        post: Raw text. It is passed through ``utils.to_unicode`` before any
            substitution is applied.

    Returns:
        The cleaned text, passed through ``utils.to_unicode`` once more.
    """
    # NOTE: the original debugging ``print post`` was removed here. It wrote
    # every raw input to stdout and, on Python 2, could raise
    # UnicodeEncodeError for non-ASCII posts when stdout is not UTF-8.

    # Replace punctuation with spaces so adjacent words stay separated.
    post = PUNCTUATION.sub(' ', utils.to_unicode(post))

    # Insert the placeholder after punctuation stripping, so the placeholder
    # text itself is never passed through PUNCTUATION.
    post = EMOTICON.sub(' _emoticon_ ', post)

    # Transliterate special characters to their closest ASCII form.
    ascii_post = unidecode(post)
    if isinstance(ascii_post, bytes):
        # Python 2: a byte string comes back, so turn it into text again.
        # On Python 3 the value is already text and has no ``.decode``.
        ascii_post = ascii_post.decode('ascii', 'ignore')

    # Collapse each whitespace match into one space and trim both ends.
    ascii_post = WHITESPACE.sub(' ', ascii_post).strip()
    return utils.to_unicode(ascii_post)
评论列表
文章目录