import string  # needed at module level for string.punctuation

def add_token(self, token_string, token_pos=None):
    # Get the lemma string:
    if all(x in string.punctuation for x in token_string):
        # Punctuation tokens are their own lemma, with a fixed POS tag:
        token_pos = "PUNCT"
        lemma = token_string
    else:
        try:
            # Use the current lemmatizer to assign the token to a lemma:
            lemma = self._lemmatize(
                token_string, self._pos_translate(token_pos)).lower()
        except Exception:
            # Fall back to the lower-cased token if lemmatization fails:
            lemma = token_string.lower()

    # Get the word id, creating a new word entry if necessary:
    word_dict = {self.word_lemma: lemma, self.word_label: token_string}
    if token_pos and self.arguments.use_nltk:
        word_dict[self.word_pos] = token_pos
    word_id = self.table(self.word_table).get_or_insert(word_dict, case=True)

    # Store the new token in the corpus table:
    return self.add_token_to_corpus(
        {self.corpus_word_id: word_id,
         self.corpus_sentence: self._sentence_id,
         self.corpus_file_id: self._file_id})
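
# Minimal usage sketch (an assumption, not from the original source):
# `builder` stands for an instance of the corpus-builder class that defines
# add_token(), with its sentence and file ids already set. The POS tags are
# taken here from NLTK's default tagger; punctuation needs no special
# handling by the caller, since add_token() detects it itself.
import nltk

sentence = "The quick brown fox jumps over the lazy dog."
for token_string, token_pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
    builder.add_token(token_string, token_pos)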