def preprocessing(content):
    """Strip digits, punctuation and newlines from ``content``.

    The text goes through three passes:

    1. ASCII digits are removed.
    2. Every character in ``string.punctuation`` is removed.
    3. Newlines and the entries of ``remove_punc`` are removed, then
       ``parsing.strip_multiple_whitespaces`` is applied on a best-effort
       basis.

    Args:
        content: Text to clean.

    Returns:
        The cleaned text. If the whitespace step raises, a warning is
        printed and the text is returned as it stood before that step.
    """
    # NOTE(review): the '?' entries look like CJK punctuation that was lost
    # to an encoding round-trip; as written they are no-ops because ASCII
    # '?' is already removed in pass 2. Restore the intended characters
    # from the original source if it is available.
    remove_punc = ('? ? ? ? ? ? ? ? ? —').split(' ')

    # Pass 1: remove numbers only (a word-character pattern was once
    # considered here to drop English letters as well).
    # The LOCALE flag is not used: Python 3 rejects it on str patterns and
    # it has no effect on digit matching. '[0-9]+' keeps the ASCII-only
    # meaning '\d' had without re.UNICODE and avoids zero-width matches.
    digits = re.compile(r'[0-9]+')
    content = digits.sub("", content)

    # Pass 2: remove ASCII punctuation.
    ascii_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    content = ascii_punctuation.sub("", content)

    # Pass 3: remove newlines and the extra punctuation listed above, then
    # collapse runs of whitespace.
    content = content.replace('\n', '')
    for punc in remove_punc:
        content = content.replace(punc, '')
    try:
        # `parsing` comes from the enclosing module (presumably
        # gensim.parsing -- TODO confirm).
        content = parsing.strip_multiple_whitespaces(content)
    except Exception:
        # Best-effort: keep the partially cleaned text rather than failing,
        # but do not swallow KeyboardInterrupt/SystemExit.
        print('Warning : failed to strip whitespaces @ ')
    return content
评论列表
文章目录