def __init__(self, language='en'):
    """
    Create a Parser object that uses a cached Spacy language model.

    Spacy offers the languages listed at https://spacy.io/usage/models.
    The language model must be downloaded first
    (e.g. ``python -m spacy download en``).

    :param language: Language to parse (en/de/es/pt/fr/it/nl)
    :type language: str
    :raises ValueError: if ``language`` is not an accepted language code
    """
    # We only load spacy if a Parser is created (to allow ReadTheDocs to
    # build the documentation easily).
    import spacy
    accepted_languages = ['en', 'de', 'es', 'pt', 'fr', 'it', 'nl']
    # Raise explicitly instead of using `assert`, which is stripped
    # when Python runs with the -O flag.
    if language not in accepted_languages:
        raise ValueError("Language for parser (%s) not in accepted languages: %s"
                         % (language, str(accepted_languages)))
    self.language = language
    # Models are cached on the class so repeated Parser() instances for the
    # same language share a single (expensive) spacy.load call.
    if language not in Parser.languageModels:
        Parser.languageModels[language] = spacy.load(language, disable=['ner'])
    self.nlp = Parser.languageModels[language]
# Example source snippets for python io() usage (translated scraped page header)
def __init__(self,
             lang='en',
             lower=True,
             lemmatize=False,
             remove_punct=True,
             remove_digits=True,
             remove_stop_words=False,
             exclude_oov=False,
             exclude_pos_tags=None,
             # Immutable tuple default: a mutable list default would be shared
             # across all calls. Behavior is unchanged (it is converted to a
             # set below), and callers may still pass any iterable.
             exclude_entities=('PERSON',)):
    """Encodes text into `(samples, words)`

    Args:
        lang: The spacy language to use. (Default value: 'en')
        lower: Lower cases the tokens if True. (Default value: True)
        lemmatize: Lemmatizes words when set to True. This also makes the word lower case
            irrespective if the `lower` setting. (Default value: False)
        remove_punct: Removes punct words if True. (Default value: True)
        remove_digits: Removes digit words if True. (Default value: True)
        remove_stop_words: Removes stop words if True. (Default value: False)
        exclude_oov: Exclude words that are out of spacy embedding's vocabulary.
            By default, GloVe 1 million, 300 dim are used. You can override spacy vocabulary with a custom
            embedding to change this. (Default value: False)
        exclude_pos_tags: A list of parts of speech tags to exclude. Can be any of spacy.parts_of_speech.IDS
            (Default value: None)
        exclude_entities: A list of entity types to be excluded.
            Supported entity types can be found here: https://spacy.io/docs/usage/entity-recognition#entity-types
            (Default value: ('PERSON',))
    """
    super(WordTokenizer, self).__init__(lang, lower)
    self.lemmatize = lemmatize
    self.remove_punct = remove_punct
    self.remove_digits = remove_digits
    self.remove_stop_words = remove_stop_words
    self.exclude_oov = exclude_oov
    # Normalize to sets for O(1) membership tests; `or []` maps None/empty
    # inputs to an empty set.
    self.exclude_pos_tags = set(exclude_pos_tags or [])
    self.exclude_entities = set(exclude_entities or [])
def __init__(self,
             lang='en',
             lower=True,
             lemmatize=False,
             remove_punct=True,
             remove_digits=True,
             remove_stop_words=False,
             exclude_oov=False,
             exclude_pos_tags=None,
             # Immutable tuple default: a mutable list default would be shared
             # across all calls. Behavior is unchanged (the parent converts it
             # to a set), and callers may still pass any iterable.
             exclude_entities=('PERSON',)):
    """Encodes text into `(samples, sentences, words)`

    Args:
        lang: The spacy language to use. (Default value: 'en')
        lower: Lower cases the tokens if True. (Default value: True)
        lemmatize: Lemmatizes words when set to True. This also makes the word lower case
            irrespective if the `lower` setting. (Default value: False)
        remove_punct: Removes punct words if True. (Default value: True)
        remove_digits: Removes digit words if True. (Default value: True)
        remove_stop_words: Removes stop words if True. (Default value: False)
        exclude_oov: Exclude words that are out of spacy embedding's vocabulary.
            By default, GloVe 1 million, 300 dim are used. You can override spacy vocabulary with a custom
            embedding to change this. (Default value: False)
        exclude_pos_tags: A list of parts of speech tags to exclude. Can be any of spacy.parts_of_speech.IDS
            (Default value: None)
        exclude_entities: A list of entity types to be excluded.
            Supported entity types can be found here: https://spacy.io/docs/usage/entity-recognition#entity-types
            (Default value: ('PERSON',))
    """
    # Delegate all option handling to the word-level tokenizer base class.
    super(SentenceWordTokenizer, self).__init__(lang,
                                                lower,
                                                lemmatize,
                                                remove_punct,
                                                remove_digits,
                                                remove_stop_words,
                                                exclude_oov,
                                                exclude_pos_tags,
                                                exclude_entities)
def ensure_proper_language_model(nlp):
    # type: (Optional[Language]) -> None
    """Validate that a spacy language model was loaded successfully.

    Raises an exception when ``nlp`` is missing entirely or is a stub
    object that spacy did not load from disk.
    """
    if nlp is None:
        raise Exception("Failed to load spacy language model. "
                        "Loading the model returned 'None'.")
    # Spacy sets `path` to None when the model was not read from disk;
    # such an `nlp` object is an unusable stub.
    if nlp.path is None:
        message = ("Failed to load spacy language model for lang '{}'. "
                   "Make sure you have downloaded the correct model "
                   "(https://spacy.io/docs/usage/).").format(nlp.lang)
        raise Exception(message)
def ensure_proper_language_model(nlp):
    # type: (Optional[Language]) -> None
    """Check that ``nlp`` is a usable spacy model; raise if it is invalid."""
    if nlp is None:
        raise Exception(
            "Failed to load spacy language model. Loading the model returned 'None'.")
    if nlp.path is not None:
        # Model was loaded from disk — nothing more to verify.
        return
    # Spacy leaves `path` as None when it could not load the model from
    # disk, in which case `nlp` is an unusable stub.
    raise Exception(
        "Failed to load spacy language model for lang '{}'. ".format(nlp.lang)
        + "Make sure you have downloaded the correct model (https://spacy.io/docs/usage/).")
def get_tokenizer(tokenizer):
    """Resolve a tokenizer specification into a tokenizing callable.

    Args:
        tokenizer: Either a callable (returned unchanged) or one of the
            strings "spacy", "moses", "revtok", "subword".

    Returns:
        A callable mapping a string to a list of token strings.

    Raises:
        ValueError: if ``tokenizer`` is a string not in the supported set.
        ImportError/AttributeError/LookupError: re-raised (after printing an
            installation hint) when the backing package is missing or broken.
    """
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        # ImportError and AttributeError previously had two byte-identical
        # handlers; merged into one clause.
        except (ImportError, AttributeError):
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            # decap=True makes revtok lower-case while recording caps markers.
            return lambda x: revtok.tokenize(x, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))