test_nlp_proccessing.py 文件源码-python代码片段

test_nlp_proccessing.py 文件源码

python

阅读 23 收藏 0 点赞 0 评论 0

项目：Informed-Finance-Canary 作者: Darthone 项目源码文件源码

def tfidf(corpus, corpusKeys):
    """Extract the top TF-IDF n-grams of each document and pass them to ``vader``.

    A ``TfidfVectorizer`` over 2- and 3-word n-grams is fitted on ``corpus``.
    English stop words, the token ``'000'`` and the decimal strings
    ``'0'`` .. ``'9998'`` are discarded before the n-grams are built.

    Args:
        corpus: iterable of raw document strings.
        corpusKeys: sequence aligned with ``corpus``; ``corpusKeys[i]`` is
            handed to ``vader`` together with the terms of document ``i``
            (presumably an article identifier -- confirm against the caller).

    Returns:
        A list with one ``(vader(corpusKeys[i], termList), termList)`` tuple
        per document, where ``termList`` holds at most ten n-grams, each with
        a trailing ``'.'``, ordered by descending TF-IDF weight. Only n-grams
        that actually occur in the document (non-zero weight) are included,
        so short documents may yield fewer than ten terms.
    """
    # Discard stop words and bare numbers - saves on processing.
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    stopset.extend(str(x) for x in range(9999))
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2, 3))

    # Keep the document/term matrix sparse: densifying it costs
    # n_documents * n_ngrams floats, almost all of them zero.
    weights = vectorizer.fit_transform(corpus).tocsr()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    records = []
    for i in range(weights.shape[0]):
        # CSR layout: the stored entries of row i live in [indptr[i], indptr[i + 1]).
        start, end = weights.indptr[i], weights.indptr[i + 1]
        scored = [
            (col, weight)
            for col, weight in zip(weights.indices[start:end], weights.data[start:end])
            if weight > 0
        ]
        # Highest weight first; ties fall back to vocabulary order, which is
        # what a stable descending sort over the full term list produced.
        scored.sort(key=lambda pair: (-pair[1], pair[0]))

        # Top terms for this document, each terminated with a period.
        termList = [terms[col] + '.' for col, _ in scored[:10]]

        # Pair the vader() result for this document with its term list.
        records.append((vader(corpusKeys[i], termList), termList))
    return records