def buildVectorizer(classes, examples, parameters):
    """Vectorize the training examples and optionally trim the feature set.

    Args:
        classes: Class labels. Passed to ``Vectorizer`` and, when feature
            selection is enabled, used as the target for the chi-squared
            scoring.
        examples: Training examples, passed to ``Vectorizer``.
        parameters: Option mapping (presumably a dict of strings -- confirm
            against the caller). Recognised keys:

            * ``featureChoice`` -- forwarded to ``Vectorizer`` unchanged
              (``None`` when absent).
            * ``tfidf`` -- enabled only when the value is the string
              ``"True"``.
            * ``doFeatureSelection`` -- enabled only when the value is the
              string ``"True"``.
            * ``featureSelectPerc`` -- percentage of features to keep,
              converted with ``int`` (10 when absent).

    Returns:
        A tuple ``(vectors, vectorizer, featureSelector)``. With feature
        selection enabled, ``vectors`` is the trimmed matrix wrapped in
        ``coo_matrix`` and ``featureSelector`` is the fitted
        ``SelectPercentile``; otherwise ``vectors`` is the untouched output
        of ``vectorizer.getTrainingVectors()`` and ``featureSelector`` is
        ``None``.
    """
    featureChoice = parameters.get("featureChoice")
    # Flags arrive as text, so only the exact string "True" switches them on.
    doFeatureSelection = parameters.get("doFeatureSelection") == "True"
    tfidf = parameters.get("tfidf") == "True"
    featureSelectPerc = int(parameters.get("featureSelectPerc", 10))

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("Starting vectorizer...")
    vectorizer = Vectorizer(classes, examples, featureChoice, tfidf)
    vectors = vectorizer.getTrainingVectors()
    print("Vectors of size: %s" % (vectors.shape,))

    if not doFeatureSelection:
        return vectors, vectorizer, None

    print("Trimming training vectors...")
    # Imported lazily so scikit-learn is only needed when selection is on.
    from sklearn.feature_selection import SelectPercentile, chi2

    # `percentile` must be passed by keyword: recent scikit-learn releases
    # reject it as a positional argument.
    featureSelector = SelectPercentile(chi2, percentile=featureSelectPerc)
    vectorsTrimmed = coo_matrix(featureSelector.fit_transform(vectors, classes))
    print("Trimmed training vectors of size: %s" % (vectorsTrimmed.shape,))
    return vectorsTrimmed, vectorizer, featureSelector
# Web-page navigation residue ("comment list", "table of contents");
# not part of the code.