def transform(self, X, **transform_params):
    """Vectorize each document in X against the lexicon.

    Returns a sparse (len(X), len(self.lexicon)) matrix where row i marks
    which lexicon ngrams (unigrams + bigrams) occur in document i, optionally
    multiplied by the lexicon polarity (self.polarity) and/or the in-document
    token counts (self.weightedcount).

    NOTE(review): assumes self.lexicon is a pandas DataFrame with "ngram" and
    "polarity" columns — confirm against the fitting code.
    """
    # Hoist loop invariants: the tokenizer is stateless and the lexicon does
    # not change per document. The original rebuilt both on every iteration
    # (the set() rebuild made matching O(n_docs * lexicon_size)).
    tokenizer = TreebankWordTokenizer()
    lexicon_ngrams = self.lexicon["ngram"]
    lexicon_set = set(lexicon_ngrams)
    n_features = len(self.lexicon)

    features = np.empty((len(X), n_features))
    for docid, doc in enumerate(X):
        if self.preprocessor is not None:
            doc = self.preprocessor(doc)
        tokens = tokenizer.tokenize(doc)
        # candidate ngrams for lexicon matching: unigrams + bigrams
        bigrams = [" ".join(pair) for pair in ngrams(tokens, 2)]
        tokencounts = Counter(tokens + bigrams)
        # Counter.keys() is a set-like view, so intersect it directly
        match = tokencounts.keys() & lexicon_set
        if match:
            # occurrence indicator vector aligned with the lexicon
            ovec = csr_matrix(lexicon_ngrams.map(lambda w: w in match))
            # polarity vector
            pvec = csr_matrix(self.lexicon["polarity"])
            # per-document count vector aligned with the lexicon
            cvec = csr_matrix(
                lexicon_ngrams.map(lambda w: tokencounts[w] if w in match else 0)
            )
            if self.polarity:
                vector = ovec.multiply(pvec)
                if self.weightedcount:
                    vector = vector.multiply(cvec)
            else:
                vector = ovec.multiply(cvec) if self.weightedcount else ovec
            vector = vector.todense()
        else:
            # features was allocated with np.empty (uninitialized memory),
            # so an all-zero row must be written explicitly on no match
            vector = np.zeros(n_features)
        features[docid] = vector
    return csr_matrix(features)
# NOTE(review): the two lines below were non-code residue from a web scrape
# ("comment list" / "article table of contents"); commented out so the module parses.
# 评论列表
# 文章目录