def _vectorize(self, corpus, fit):
    """Turn a corpus into one feature matrix, one column block per feature.

    Every feature named in ``self.chosenFeatures`` is looked up in
    ``self.featureInfo``; its ``'func'`` entry is called on the corpus and
    the result is passed through a ``DictVectorizer`` and, when
    ``self.tfidf`` is set and the feature is not flagged ``'never_tfidf'``,
    through a ``TfidfTransformer`` as well. The per-feature matrices are
    then joined with ``hstack`` in the order of ``self.chosenFeatures``.

    Args:
        corpus: The ``kindred.Corpus`` to extract features from.
        fit: If true, fresh vectorizers/transformers are created, stored in
            ``self.dictVectorizers`` / ``self.tfidfTransformers`` and
            fitted on this corpus. If false, the stored ones are reused.

    Returns:
        The result of ``hstack`` over the per-feature matrices.
    """
    assert isinstance(corpus, kindred.Corpus)

    blocks = []
    for name in self.chosenFeatures:
        assert name in self.featureInfo
        info = self.featureInfo[name]
        extract = info['func']
        skipTfidf = info['never_tfidf']

        featureDicts = extract(corpus)
        hasContent = any(len(entry) > 0 for entry in featureDicts)

        if fit:
            # A feature that produced nothing for every item gets no
            # vectorizer at all, so it contributes no columns.
            if not hasContent:
                continue

            vectorizer = DictVectorizer()
            self.dictVectorizers[name] = vectorizer

            # Register the tf-idf stage before fitting anything so both
            # objects are stored prior to the first fit_transform call.
            weigher = None
            if self.tfidf and not skipTfidf:
                weigher = TfidfTransformer()
                self.tfidfTransformers[name] = weigher

            matrix = vectorizer.fit_transform(featureDicts)
            if weigher is not None:
                matrix = weigher.fit_transform(matrix)
            blocks.append(matrix)
        else:
            # Features with no stored vectorizer (never fitted, or skipped
            # as empty during fitting) are left out here as well.
            if name not in self.dictVectorizers:
                continue

            applyTfidf = self.tfidf and not skipTfidf
            matrix = self.dictVectorizers[name].transform(featureDicts)
            if applyTfidf:
                matrix = self.tfidfTransformers[name].transform(matrix)
            blocks.append(matrix)

    return hstack(blocks)
评论列表
文章目录