def run(self):

    # assemble the feature extraction settings: token n-gram sizes,
    # blacklisted features, and the minimum token frequency
    features = {'tokens': {'n_list': self.ngrams.split(), 'blackfeats': self.blackfeats.split(), 'mt': self.minimum_token_frequency}}

    # read and format the tokenized documents (optionally lowercased)
    documents = [[doc] for doc in format_tokdoc(self.in_tokenized().path, self.lowercase)]

    # extract the token features
    ft = featurizer.Featurizer(documents, features)
    ft.fit_transform()
    instances, vocabulary = ft.return_instances(['tokens'])

    # write the sparse feature matrix (its CSR components) and the vocabulary to the output files
    numpy.savez(self.out_features().path, data=instances.data, indices=instances.indices, indptr=instances.indptr, shape=instances.shape)
    vocabulary = list(vocabulary)
    with open(self.out_vocabulary().path, 'w', encoding='utf-8') as vocab_out:
        vocab_out.write('\n'.join(vocabulary))
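Because the feature matrix is stored as its raw CSR components (data, indices, indptr, shape) rather than as a pickled object, a consumer has to reassemble it as a scipy sparse matrix. A minimal sketch of how the two output files could be read back, assuming placeholder paths features.npz and vocabulary.txt for the actual task outputs:

import numpy
from scipy.sparse import csr_matrix

# rebuild the sparse feature matrix from the saved CSR components
loader = numpy.load('features.npz')  # placeholder for self.out_features().path
instances = csr_matrix(
    (loader['data'], loader['indices'], loader['indptr']),
    shape=tuple(loader['shape'])
)

# read the vocabulary back, one feature per line
with open('vocabulary.txt', 'r', encoding='utf-8') as vocab_in:  # placeholder for self.out_vocabulary().path
    vocabulary = vocab_in.read().strip().split('\n')

# each column of the matrix corresponds to one vocabulary entry
assert instances.shape[1] == len(vocabulary)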
# When the input is a directory with tokenized documents