# module-level imports needed by this method; featurizer, format_tokdoc and
# keyfunc are assumed to be imported/defined elsewhere in this module
from os import listdir

import numpy

def run(self):
    # settings for the token-ngram features: ngram sizes to extract,
    # features to blacklist, and the minimum token frequency
    features = {'tokens': {'n_list': self.ngrams.split(), 'blackfeats': self.blackfeats.split(), 'mt': self.minimum_token_frequency}}
    # read in the tokenized files and put them in the right format for the featurizer
    documents = []
    for infile in sorted(listdir(self.in_tokdir().path), key=keyfunc):
        documents.append(format_tokdoc(self.in_tokdir().path + '/' + infile, self.lowercase))
    # extract features; to prevent ngrams across sentences, the featurizer is applied per document
    ft = featurizer.Featurizer(documents, features)
    ft.fit_transform()
    instances, vocabulary = ft.return_instances(['tokens'])
    # write output: the sparse instance matrix is stored as its CSR components
    numpy.savez(self.out_features().path, data=instances.data, indices=instances.indices, indptr=instances.indptr, shape=instances.shape)
    with open(self.out_vocabulary().path, 'w', encoding='utf-8') as vocab_out:
        vocab_out.write('\n'.join(vocabulary))
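
# a minimal sketch (an assumption, not part of the original task) of how the
# saved output could be read back in: numpy.savez stores the CSR components
# of the instance matrix, so scipy can rebuild it; the file paths here are
# hypothetical placeholders
from scipy import sparse

loader = numpy.load('features.npz')
instances = sparse.csr_matrix(
    (loader['data'], loader['indices'], loader['indptr']),
    shape=tuple(loader['shape']))
with open('vocabulary.txt', 'r', encoding='utf-8') as vocab_in:
    vocabulary = vocab_in.read().strip().split('\n')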
# when the input is a file with frogged documents