LexiconVectorizer.py file source code

Language: Python
Project: hatespeech    Author: lukovnikov

The transform method below maps each input document onto a lexicon-sized feature vector, weighting each matched lexicon entry by occurrence, polarity, and/or term count.
import numpy as np
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
from scipy.sparse import csr_matrix


def transform(self, X, **transform_params):
    # Dense n x m feature matrix:
    #   n : number of docs
    #   m : size of lexicon
    features = np.empty((len(X), len(self.lexicon)))

    # Set of lexicon entries, hoisted out of the loop for fast membership tests.
    lexicon_ngrams = set(self.lexicon["ngram"])

    for docid, doc in enumerate(X):
        if self.preprocessor is not None:
            doc = self.preprocessor(doc)

        # Candidate terms are the document's unigrams plus its bigrams,
        # since the lexicon may contain space-joined two-word entries.
        tokens = TreebankWordTokenizer().tokenize(doc)
        bigrams = [" ".join(pair) for pair in ngrams(tokens, 2)]
        doctokens = tokens + bigrams

        tokencounts = Counter(doctokens)
        match = set(tokencounts.keys()) & lexicon_ngrams

        if len(match) > 0:
            # Binary occurrence vector over the lexicon.
            occurrences = self.lexicon["ngram"].map(lambda w: w in match)
            ovec = csr_matrix(occurrences.values)
            # Polarity score per lexicon entry.
            pvec = csr_matrix(self.lexicon["polarity"].values)
            # Term counts aligned with the lexicon order.
            counts = self.lexicon["ngram"].map(
                lambda w: tokencounts[w] if w in match else 0)
            cvec = csr_matrix(counts.values)

            if self.polarity:
                if self.weightedcount:
                    vector = ovec.multiply(pvec).multiply(cvec)
                else:
                    vector = ovec.multiply(pvec)
            else:
                if self.weightedcount:
                    vector = ovec.multiply(cvec)
                else:
                    vector = ovec
            vector = vector.todense()
        else:
            # Rows must be zeroed explicitly: np.empty leaves
            # uninitialized (non-zero) memory behind.
            vector = np.zeros(len(self.lexicon))

        features[docid] = vector

    return csr_matrix(features)
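
For context, a minimal usage sketch follows. It assumes the enclosing LexiconVectorizer class accepts the attributes referenced above (lexicon, preprocessor, polarity, weightedcount) as constructor arguments, and that the lexicon is a pandas DataFrame with "ngram" and "polarity" columns, as the transform body implies; the constructor signature and the toy lexicon are illustrative, not taken from the project source.

import pandas as pd

# Toy lexicon: one polarity score per unigram or space-joined bigram.
lexicon = pd.DataFrame({
    "ngram":    ["hate", "love", "so bad"],
    "polarity": [-1.0,    1.0,   -0.5],
})

# Hypothetical constructor call; the argument names mirror the attributes
# used in transform() but are not confirmed by the excerpt.
vec = LexiconVectorizer(lexicon=lexicon, preprocessor=str.lower,
                        polarity=True, weightedcount=True)

docs = ["I love this", "It is so bad, I hate it"]
X = vec.transform(docs)   # scipy csr_matrix of shape (2, 3)
print(X.toarray())

With polarity and weightedcount both enabled, each cell holds polarity times the in-document count of the matched lexicon entry, so the second row above has nonzero entries for both "hate" and the bigram "so bad".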