def tokenize(str_stream, eos=True, remove_punct=False):
"""
Given a str or str_stream (f.read()) convert the str to a list of sentences,
e.g.: [[word, word], [word, word, ...], ...]
:param str_stream: a str or a str_stream
:param eos: wether turns '.' into <eos> tag
:param remove_punct: wether to remove punctuations: ':', ';', '--', ',', "'"
:return: a list of sentences, each sentence is a list of words (str)
"""
    # lazy import because importing nltk is slow
    import nltk
    try:
        nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        print('punkt resource not found, downloading it via nltk.download("punkt")...')
        nltk.download('punkt')
    tokens = [nltk.word_tokenize(t) for t in nltk.sent_tokenize(str_stream.lower())]
    # get POS tags (before numbers are collapsed, so the tagger sees the raw tokens)
    tokens_tags = nltk.pos_tag_sents(tokens, tagset='universal')
    pos_tags = []
    for token_tags in tokens_tags:
        _, tags = zip(*token_tags)
        pos_tags.append(tags)
    # replace numeric tokens with the placeholder 'N'
    tokens = [['N' if isfloat(t) else t for t in sublist] for sublist in tokens]
    if eos:
        for token in tokens:
            # replace the sentence-final token (the period, for well-formed text) with <eos>
            token[-1] = '<eos>'
    if remove_punct:
        tokens = [[t for t in sublist if t not in __punct_set] for sublist in tokens]
    return tokens, pos_tags
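
# Note: isfloat and __punct_set are module-level helpers referenced above but not
# shown in this snippet. A minimal sketch of plausible definitions, assuming
# isfloat() tests whether a token parses as a number and __punct_set holds the
# punctuation listed in the docstring:

__punct_set = {':', ';', '--', ',', "'"}

def isfloat(s):
    """Return True if s parses as a float (covers ints too, e.g. '42', '3.14')."""
    try:
        float(s)
        return True
    except ValueError:
        return False

# Example usage (hypothetical corpus file):
# with open('corpus.txt') as f:
#     sentences, pos_tags = tokenize(f.read(), eos=True, remove_punct=True)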