def tfidf(corpus, corpusKeys, top_n=10):
    """Collect the highest-weighted TF-IDF n-grams of each document and score them.

    A ``TfidfVectorizer`` restricted to 2- and 3-word n-grams is fitted on
    ``corpus``.  For every document the ``top_n`` n-grams with the largest
    TF-IDF weight are selected, a ``'.'`` is appended to each, and the
    resulting list is passed to ``vader`` together with the document's key.

    Args:
        corpus: Raw documents, in the form accepted by
            ``TfidfVectorizer.fit_transform``.
        corpusKeys: Indexed in parallel with ``corpus``; ``corpusKeys[i]`` is
            handed to ``vader`` for document ``i`` (presumably an article
            ID -- confirm against the caller).
        top_n: Number of n-grams kept per document.  Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A list with one ``(vader_result, term_list)`` tuple per document, in
        corpus order, where ``vader_result`` is whatever ``vader`` returns
        and ``term_list`` is the list of selected n-grams (each ending in
        ``'.'``), highest weight first.

    Raises:
        IndexError: If ``corpusKeys`` is a sequence with fewer entries than
            ``corpus`` has documents.
    """
    # Local import: the module's import block is not visible from this function.
    import heapq

    # Stop words: the English list (presumably NLTK's -- confirm) plus '000'
    # and the decimal strings '0' .. '9998'.
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    stopset.extend(str(number) for number in range(9999))
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2, 3))

    # Document x n-gram weight matrix.  It is left in the form returned by
    # fit_transform and densified one row at a time below, instead of
    # materialising the full dense matrix up front.
    weights = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases that lack it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    records = []
    for i in range(weights.shape[0]):
        doc_weights = weights[i].toarray().ravel()
        # nlargest(n, it, key=k) is equivalent to sorted(it, key=k,
        # reverse=True)[:n], so ties still resolve in vocabulary order.
        # NOTE(review): a document with fewer than top_n non-zero n-grams is
        # padded with zero-weight terms, exactly as before -- confirm whether
        # vader should really receive terms absent from the document.
        top_terms = heapq.nlargest(
            top_n, zip(terms, doc_weights), key=lambda pair: pair[1]
        )
        # Terms gathered from the tfidf vectorizer, each terminated with '.'
        termList = [term + '.' for term, _ in top_terms]
        # Pair the vader result for this key with the terms it was given
        records.append((vader(corpusKeys[i], termList), termList))
    return records
test_nlp_proccessing.py 文件源码
python
阅读 23
收藏 0
点赞 0
评论 0
评论列表
文章目录