def _preprocessing(self, content):
    """Normalize raw text before further processing.

    The steps are applied in this order:

    1. Remove every line break (``\\r\\n``, ``\\r`` or ``\\n``); the
       surrounding text is joined with nothing in between.
    2. Replace each run of non-breaking spaces with a single space.
    3. Collapse each run of two or more whitespace characters into a
       single space.
    4. Strip ASCII whitespace from both ends.
    5. Only when both ``self.word_delimiter`` and ``self.tag_delimiter``
       are truthy: strip word-delimiter characters from both ends, then
       replace delimiter characters that occur as literal content with
       ``constant.ESCAPE_WORD_DELIMITER`` / ``constant.ESCAPE_TAG_DELIMITER``.
    6. Replace curly single/double quotation marks with ``'`` and ``"``.

    Args:
        content: The text to clean up.

    Returns:
        The normalized text.
    """
    # Remove line breaks.
    content = re.sub(r"[\r\n]+", "", content)
    # Convert one or multiple non-breaking spaces to a single space.
    # The replacement must be a literal " ": "\s" is only meaningful in a
    # pattern and is rejected ("bad escape") as a replacement template.
    content = re.sub(r"\xa0+", " ", content)
    # Convert multiple whitespace characters to only one space.
    content = re.sub(r"\s{2,}", " ", content)
    # Trim whitespace from the start and the end of the text.
    content = content.strip(string.whitespace)

    if self.word_delimiter and self.tag_delimiter:
        word_delim = self.word_delimiter
        tag_delim = self.tag_delimiter

        # Trim word delimiter characters from the start and end of the text.
        content = content.strip(word_delim)

        # Both the search text and its substitute are literal strings, so
        # plain str.replace is used: running re.escape() on a replacement
        # template would leak backslashes into the result.

        # <word><word><tag>: the second word delimiter is content, so it
        # is swapped for its escape token.
        content = content.replace(
            word_delim + word_delim + tag_delim,
            word_delim + constant.ESCAPE_WORD_DELIMITER + tag_delim,
        )
        # <tag><tag>: the first tag delimiter is content, so it is swapped
        # for its escape token.
        content = content.replace(
            tag_delim + tag_delim,
            constant.ESCAPE_TAG_DELIMITER + tag_delim,
        )

    # Replace typographic quotation marks with the standard ASCII ones
    # (no backslash in the replacement, otherwise it ends up in the text).
    content = re.sub(r"[\u2018\u2019]", "'", content)
    content = re.sub(r"[\u201c\u201d]", '"', content)
    return content
评论列表
文章目录