def _load(self):
    """Read every ``*.txt`` file of the corpus directory into memory.

    Files are visited in sorted path order. Each file is read as UTF-8,
    passed through ``self._preprocessing`` and stored as a ``Text`` built
    from the file path, its base name and the cleaned content.
    """
    # Escape the directory so glob metacharacters in it are taken literally.
    pattern = os.path.join(glob.escape(self.corpus_directory), "*.txt")
    for file_path in sorted(glob.glob(pattern)):
        # Read content from text file
        with open(file_path, "r", encoding="utf8") as handle:
            raw_content = handle.read()
        # Preprocessing
        cleaned = self._preprocessing(raw_content)
        # Create the text instance and add it to the corpus
        entry = Text(file_path, os.path.basename(file_path), cleaned)
        self.__corpus.append(entry)
# Example source code for Python's escape() (python类escape()的实例源码)
def glob_escape(pathname):
    """Escape all special characters in *pathname*.

    Every match of the module-level ``_magic_check`` pattern is wrapped in
    square brackets (``[x]``). The drive part returned by
    ``os.path.splitdrive`` is passed through unchanged.
    """
    # Split the drive off first so it is never rewritten.
    prefix, remainder = os.path.splitdrive(pathname)
    return prefix + _magic_check.sub(r'[\1]', remainder)
def _preprocessing(self, content):
    """Normalise raw corpus text before it is stored.

    Steps, in order: drop line breaks, turn non-breaking spaces into
    spaces, collapse whitespace runs to one space, trim surrounding
    whitespace, protect literal delimiter characters (only when both
    delimiters are configured) and normalise curly quotation marks.

    Args:
        content: Raw text read from a corpus file.

    Returns:
        The cleaned text.
    """
    # Remove new line
    content = re.sub(r"(\r\n|\r|\n)+", "", content)

    # Convert one or multiple non-breaking space to space.
    # The replacement has to be a literal space: "\s" is not a valid escape
    # in a replacement template and raises re.error on Python 3.7+.
    content = re.sub(r"\xa0+", " ", content)

    # Convert multiple spaces to only one space
    content = re.sub(r"\s{2,}", " ", content)

    # Trim whitespace from starting and ending of text
    content = content.strip(string.whitespace)

    if self.word_delimiter and self.tag_delimiter:
        word_delimiter = self.word_delimiter
        tag_delimiter = self.tag_delimiter

        # Trim word delimiter from starting and ending of text
        content = content.strip(word_delimiter)

        # Convert special characters (word and tag delimiter) in the text's
        # content to their escape characters. Plain str.replace is used
        # because both sides are literals; running re.escape() over a
        # replacement string would leak backslashes into the output.
        content = content.replace(
            word_delimiter * 2 + tag_delimiter,
            word_delimiter + constant.ESCAPE_WORD_DELIMITER + tag_delimiter,
        )
        content = content.replace(
            tag_delimiter * 2,
            constant.ESCAPE_TAG_DELIMITER + tag_delimiter,
        )

    # Replace distinct quotation marks with standard quotation marks.
    # The replacements are plain characters; a template such as r"\'" would
    # insert the backslash as well.
    content = re.sub(r"\u2018|\u2019", "'", content)
    content = re.sub(r"\u201c|\u201d", '"', content)

    return content
def get_token_list(self, index):
    """Return the (word, tag) pairs parsed from the text stored at *index*.

    An empty list is returned when either delimiter is unset or when the
    selected text has no content. Empty tokens and space tokens become
    ``(constant.SPACEBAR, constant.PAD_TAG_INDEX)``.
    """
    # Both delimiters are required to parse the content.
    if not (self.word_delimiter and self.tag_delimiter):
        return []

    # Get content by index
    body = self.__corpus[index].content

    # Empty file
    if not body:
        return []

    pairs = []
    # Split each word by word delimiter
    for raw_token in body.split(self.word_delimiter):
        if raw_token != "" and raw_token != constant.SPACEBAR:
            # Word: split word and tag by tag delimiter
            fields = raw_token.split(self.tag_delimiter)
            word = fields[0]
            # NOTE(review): the tag is read from the second-to-last field, so
            # a token with no tag delimiter raises IndexError - confirm the
            # expected token layout.
            tag = fields[-2]
            # Replace escape character to proper character
            word = word.replace(constant.ESCAPE_WORD_DELIMITER,
                                self.word_delimiter)
            tag = tag.replace(constant.ESCAPE_TAG_DELIMITER,
                              self.tag_delimiter)
        else:
            # Empty or Spacebar
            word = constant.SPACEBAR
            tag = constant.PAD_TAG_INDEX
        pairs.append((word, tag))

    return pairs
def glob_escape(pathname):
    """Escape all special characters in *pathname*.

    Each match of the module-level ``_magic_check`` pattern is replaced by
    the same text enclosed in square brackets. The drive component is kept
    as returned by ``os.path.splitdrive``.
    """
    drive, tail = os.path.splitdrive(pathname)
    # Only the part after the drive is rewritten.
    escaped_tail = _magic_check.sub(r'[\1]', tail)
    return drive + escaped_tail
def check_escape(self, arg, expected):
    """Assert that ``glob.escape`` turns *arg* into *expected*.

    The check runs twice: once on the values as given and once on their
    ``os.fsencode`` forms.
    """
    # Values as given.
    escaped = glob.escape(arg)
    self.assertEqual(escaped, expected)
    # Filesystem-encoded counterparts.
    escaped_encoded = glob.escape(os.fsencode(arg))
    self.assertEqual(escaped_encoded, os.fsencode(expected))
def glob_escape(pathname):
    """Escape all special characters in *pathname*.

    Every match of the module-level ``_magic_check`` pattern is wrapped in
    square brackets (``[x]``). The drive part returned by
    ``os.path.splitdrive`` is passed through unchanged.
    """
    # Split the drive off first so it is never rewritten.
    prefix, remainder = os.path.splitdrive(pathname)
    return prefix + _magic_check.sub(r'[\1]', remainder)
def glob_escape(pathname):
    """Escape all special characters in *pathname*.

    Each match of the module-level ``_magic_check`` pattern is replaced by
    the same text enclosed in square brackets. The drive component is kept
    as returned by ``os.path.splitdrive``.
    """
    drive, tail = os.path.splitdrive(pathname)
    # Only the part after the drive is rewritten.
    escaped_tail = _magic_check.sub(r'[\1]', tail)
    return drive + escaped_tail
def check_escape(self, arg, expected):
    """Verify ``glob.escape(arg)`` against *expected*.

    The comparison is made for the arguments as passed and then for the
    results of ``os.fsencode`` on both of them.
    """
    actual = glob.escape(arg)
    self.assertEqual(actual, expected)
    # Repeat with the os.fsencode forms of the input and the expectation.
    actual_encoded = glob.escape(os.fsencode(arg))
    self.assertEqual(actual_encoded, os.fsencode(expected))
def glob_escape(pathname):
    """Escape all special characters in *pathname*.

    Every match of the module-level ``_magic_check`` pattern is wrapped in
    square brackets (``[x]``). The drive part returned by
    ``os.path.splitdrive`` is passed through unchanged.
    """
    # Split the drive off first so it is never rewritten.
    prefix, remainder = os.path.splitdrive(pathname)
    return prefix + _magic_check.sub(r'[\1]', remainder)
def search_file(root_dir, base_name):
    """Recursively search *root_dir* for files related to *base_name*.

    ``base_name`` is glob-escaped and split at its last dot. When the
    extension (lower-cased) is in ``VIDEO_FILES_EXT`` the pattern is
    ``*<stem>*<ext>``; otherwise it is ``*<stem>*``. The pattern is matched
    at any depth below ``root_dir``.

    Args:
        root_dir: Directory to search. It is used as given and is not
            glob-escaped here.
        base_name: File name to look for.

    Returns:
        List of matching paths, in the order yielded by ``glob.iglob``.
    """
    # This won't work with python < 3.5 (recursive glob)
    base_name = glob.escape(base_name)
    beforext, _, ext = base_name.rpartition('.')
    if not beforext:
        # No dot at all, or only a leading dot: rpartition leaves the stem
        # empty, and "*" + "" + "*" would collapse to "**", matching every
        # file under root_dir. Treat the whole name as the stem instead.
        beforext, ext = base_name, ''

    if ext.lower() in VIDEO_FILES_EXT:
        protected_path = os.path.join(root_dir, "**", "*" + beforext + "*" + ext)
    else:
        protected_path = os.path.join(root_dir, "**", "*" + beforext + "*")

    log.debug("Searching %r", protected_path)
    found = []
    for filename in glob.iglob(protected_path, recursive=True):
        log.debug("Found: %s", filename)
        found.append(filename)
    return found