def TL(data_path='./data/data.csv'):
    """Train a logistic-regression URL classifier from a labelled CSV.

    Reads a CSV whose rows are (url, label) pairs — column 0 the URL,
    column 1 the label ('good' or 'bad') — builds TF-IDF features with
    the custom ``getTokens`` tokenizer, trains a LogisticRegression on
    an 80/20 split, prints the held-out accuracy, and returns the
    fitted (vectorizer, model) pair.

    Args:
        data_path: Path to the labelled-URLs CSV. Defaults to
            './data/data.csv' for backward compatibility.

    Returns:
        tuple: (fitted TfidfVectorizer, fitted LogisticRegression)
    """
    # read_csv already returns a DataFrame — no need to wrap it again.
    # on_bad_lines='skip' replaces error_bad_lines=False, which is
    # deprecated since pandas 1.3 and removed in pandas 2.0; sep is
    # passed by keyword (the positional form is deprecated too).
    allurlsdata = pd.read_csv(data_path, sep=',', on_bad_lines='skip')
    allurlsdata = np.array(allurlsdata)  # 2-D array of [url, label] rows
    random.shuffle(allurlsdata)          # shuffle in place so the split is random

    y = [d[1] for d in allurlsdata]       # labels ('good' or 'bad')
    corpus = [d[0] for d in allurlsdata]  # URLs, one per label

    # TF-IDF vector per URL, using our customized tokenizer
    vectorizer = TfidfVectorizer(tokenizer=getTokens)
    X = vectorizer.fit_transform(corpus)

    # 80/20 train/test split; fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    # Held-out accuracy; ~0.98 on the original dataset per the author.
    print(lgs.score(X_test, y_test))
    return vectorizer, lgs
# Provenance note: this snippet was extracted from the source of
# AIserver.py (scraped-article metadata — view/favorite/like/comment
# counters — removed because it is not valid Python).