from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, SparsePCA
from termcolor import colored


def new(stop_words=None, decomposition='SVD', n_components=5):
    # Prepare the vectoriser engine:
    # unigram, bigram & trigram features weighted by TF-IDF
    idf = TfidfVectorizer(
        ngram_range=(1, 3),
        stop_words=stop_words
    )

    # Prepare the normaliser
    norm = Normalizer(norm='max')
    print(colored('Texthasher model created', 'yellow'))

    # Prepare dimensionality reduction
    if decomposition and n_components:
        if decomposition == 'LDA':  # Results in a non-negative matrix
            reducer = LatentDirichletAllocation(  # TF-IDF --> topic-term
                n_components=n_components,
                max_doc_update_iter=20,
                max_iter=8
            )
            return [idf, norm, reducer]
        elif decomposition == 'SVD':
            reducer = TruncatedSVD(   # Best for small datasets,
                n_components,         # a nightmare for large ones:
                n_iter=8              # painfully slow
            )
            return [idf, norm, reducer]
        elif decomposition == 'PCA':
            # When using IncrementalPCA, remember to always keep:
            #   n_samples > n_components > batch_size
            # reducer = IncrementalPCA(n_components)
            # Sparse -> dense conversion greedily consumes a large
            # amount of memory, so it stays disabled here:
            # to_dense = SparseToDense()
            # return [idf, norm, to_dense, reducer]
            reducer = SparsePCA(n_components)
            return [idf, norm, reducer]
        return [idf, norm]
    else:
        return [idf, norm]
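
# A minimal usage sketch, assuming the stages returned by new() are meant to
# be chained in order; scikit-learn's make_pipeline does that directly.
# The sample documents and the `hasher` / `vectors` names below are
# hypothetical placeholders, not part of the original module.
from sklearn.pipeline import make_pipeline

docs = [
    'the quick brown fox jumps over the lazy dog',
    'a fast auburn fox leapt over a sleepy hound',
    'stock markets fell sharply on weak earnings',
]

# Build the TF-IDF -> max-norm -> SVD pipeline and fit it in one pass
hasher = make_pipeline(*new(decomposition='SVD', n_components=2))
vectors = hasher.fit_transform(docs)
print(vectors.shape)  # (3, 2): one 2-dimensional vector per document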