def __init__(self,
             lang='en',
             lower=True,
             lemmatize=False,
             remove_punct=True,
             remove_digits=True,
             remove_stop_words=False,
             exclude_oov=False,
             exclude_pos_tags=None,
             exclude_entities=('PERSON',)):
    """Encodes text into `(samples, words)`

    Args:
        lang: The spacy language to use. (Default value: 'en')
        lower: Lower cases the tokens if True. (Default value: True)
        lemmatize: Lemmatizes words when set to True. This also makes the word lower case
            irrespective of the `lower` setting. (Default value: False)
        remove_punct: Removes punct words if True. (Default value: True)
        remove_digits: Removes digit words if True. (Default value: True)
        remove_stop_words: Removes stop words if True. (Default value: False)
        exclude_oov: Exclude words that are out of spacy embedding's vocabulary.
            By default, GloVe 1 million, 300 dim are used. You can override spacy vocabulary with a custom
            embedding to change this. (Default value: False)
        exclude_pos_tags: A list of parts of speech tags to exclude. Can be any of spacy.parts_of_speech.IDS
            `None` or an empty collection is stored as an empty set. (Default value: None)
        exclude_entities: An iterable of entity types to be excluded.
            Supported entity types can be found here: https://spacy.io/docs/usage/entity-recognition#entity-types
            `None` or an empty collection is stored as an empty set. (Default value: ('PERSON',))
    """
    # `lang` and `lower` are forwarded to the parent initializer rather than stored here.
    super(WordTokenizer, self).__init__(lang, lower)

    self.lemmatize = lemmatize
    self.remove_punct = remove_punct
    self.remove_digits = remove_digits
    self.remove_stop_words = remove_stop_words
    self.exclude_oov = exclude_oov

    # Copy the caller-supplied collections into fresh sets so the instance never
    # aliases the argument (or the default); falsy values collapse to an empty set.
    self.exclude_pos_tags = set(exclude_pos_tags or [])
    self.exclude_entities = set(exclude_entities or [])
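# Page-navigation residue from the source web page (not part of the code):
# "Comment list" / "Table of contents"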