# -*- coding: utf-8 -*-
import re

from nltk.tokenize import sent_tokenize


def pre_processing(tokenizer, truecaser, info):
    # SPLIT ON TABS/NEWLINES (non-space whitespace), keeping the separators so
    # the original layout can be restored: even indices hold text, odd indices
    # hold the separators themselves.
    source_file_t = re.split('([\t\n\r\f\v]+)', info['src'])
    # SENTENCE TOKENIZE each text chunk
    for i in range(len(source_file_t)):
        if i % 2 == 0:
            source_file_t[i] = sent_tokenize(source_file_t[i])
    # TOKENIZATION
    if info['tok']:
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    try:
                        source_file_t[j][i] = str(
                            tokenizer.tokenize(source_file_t[j][i],
                                               return_str=True).encode('utf-8'))
                    except NameError:
                        # Fallback when no tokenizer is available:
                        # just detach the trailing period with a space.
                        source_file_t[j][i] = str(
                            ' '.join(source_file_t[j][i].split('.') + ['.']))
    # TRUECASING: recase only the first token of each sentence
    if info['tc']:
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    words = source_file_t[j][i].split(' ')
                    source_file_t[j][i] = str(
                        (truecasing(truecaser, words[0]).decode('utf-8')
                         + " " + ' '.join(words[1:]).decode('utf-8')).encode('utf-8'))
                    print(source_file_t[j][i])
    # NEITHER TOKENIZATION NOR TRUECASING: keep the sentences as-is
    if not (info['tc'] or info['tok']):
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    try:
                        source_file_t[j][i] = str(source_file_t[j][i].encode('utf-8'))
                    except NameError:
                        source_file_t[j][i] = str(
                            ' '.join(source_file_t[j][i].split('.') + ['.']))
    return source_file_t
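
A minimal call sketch, assuming a Moses-style tokenizer from sacremoses and a truecasing model loaded elsewhere (the `truecasing` helper and the truecaser loader are outside this snippet; `load_truecase_model` below is a hypothetical placeholder). The 'src' field holds the raw source text, and 'tok'/'tc' toggle tokenization and truecasing:

from sacremoses import MosesTokenizer

tokenizer = MosesTokenizer(lang='en')
truecaser = load_truecase_model('truecase-model.en')  # hypothetical loader, not part of this snippet
info = {'src': u"hello world.\tthis is a test.", 'tok': True, 'tc': False}
segments = pre_processing(tokenizer, truecaser, info)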