def __init__(self, fname, processes=None, lemmatize=utils.HAS_PATTERN, dictionary=None, filter_namespaces=('0',)):
"""
Initialize the corpus. Unless a dictionary is provided, this scans the
corpus once, to determine its vocabulary.
If `pattern` package is installed, use fancier shallow parsing to get
token lemmas. Otherwise, use simple regexp tokenization. You can override
this automatic logic by forcing the `lemmatize` parameter explicitly.
"""
self.fname = fname
self.filter_namespaces = filter_namespaces
self.metadata = False
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
self.lemmatize = lemmatize
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
else:
self.dictionary = dictionary
评论列表
文章目录