def TL(data_path='./data/data.csv'):
    """Train a logistic-regression URL classifier from a labelled CSV.

    Reads a CSV whose rows are (url, label) pairs — column 0 the URL,
    column 1 the label ('good' or 'bad') — builds TF-IDF features with
    the custom ``getTokens`` tokenizer, trains a LogisticRegression on
    an 80/20 split, prints the held-out accuracy, and returns the
    fitted (vectorizer, model) pair.

    Args:
        data_path: Path to the labelled-URLs CSV. Defaults to
            './data/data.csv' for backward compatibility.

    Returns:
        tuple: (fitted TfidfVectorizer, fitted LogisticRegression)
    """
    # read_csv already returns a DataFrame — no need to wrap it again.
    # on_bad_lines='skip' replaces error_bad_lines=False, which is
    # deprecated since pandas 1.3 and removed in pandas 2.0; sep is
    # passed by keyword (the positional form is deprecated too).
    allurlsdata = pd.read_csv(data_path, sep=',', on_bad_lines='skip')
    allurlsdata = np.array(allurlsdata)  # 2-D array of [url, label] rows
    random.shuffle(allurlsdata)          # shuffle in place so the split is random

    y = [d[1] for d in allurlsdata]       # labels ('good' or 'bad')
    corpus = [d[0] for d in allurlsdata]  # URLs, one per label

    # TF-IDF vector per URL, using our customized tokenizer
    vectorizer = TfidfVectorizer(tokenizer=getTokens)
    X = vectorizer.fit_transform(corpus)

    # 80/20 train/test split; fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    # Held-out accuracy; ~0.98 on the original dataset per the author.
    print(lgs.score(X_test, y_test))
    return vectorizer, lgs
# Provenance note: this snippet was extracted from the source of
# AIserver.py (scraped-article metadata — view/favorite/like/comment
# counters — removed because it is not valid Python).