def predict():
    fnn = joblib.load(PKL)
    data_dir = 'E:/????/??????/1 ???/captcha_master1/captcha_master/worddata/'
    predictValue = []
    for fr in os.listdir(data_dir):
        dataset = []
        f = data_dir + fr
        if f.rfind(u'.DS_Store') == -1 and f.rfind(u'Thumbs.db') == -1:
            data = np.loadtxt(f, delimiter=',')
            # data.reshape((1, 2500))
            for item in data:
                dataset.append(int(item))
            # print(len(dataset))
            out = fnn.activate(dataset)
            out = out.argmax()
            iconset = ['3', 'c', 'd', 'e', 'f', 'h', 'j', 'k', 'l', 'm', 'n', 'w', 'x', 'y']
            for y, word in enumerate(iconset):
                if out == y:
                    print(word)
                    predictValue.append(word)
    print(u'Predicted captcha: %s' % (''.join(predictValue)))
def test():
    DS = loadPybrainData()
    train, test = DS.splitWithProportion(0.1)
    fnn = joblib.load(PKL)
    # Run the trained network on the whole test set
    output = fnn.activateOnDataset(test)
    # ann.activate(onedata) would give the activation for a single sample instead
    outputs = []
    target = []
    count = 0
    for out in output:
        outs = out.argmax()
        outputs.append(outs)
    for tar in test['target']:
        ta = tar.argmax()
        target.append(ta)
    for i in range(0, len(target)):
        if outputs[i] == target[i]:
            count += 1
    right = count / len(target)  # per-character accuracy
    rate = right ** 4  # a captcha has four characters, so whole-captcha accuracy is the fourth power
    print("Whole-captcha accuracy: %.4f%%" % (rate * 100))
    v = Validator()
    print(u'Mean squared error:', v.MSE(output, test['target']))  # MSE between network outputs and test targets
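# Both functions above assume a network has already been trained and dumped to PKL.
# The following is a minimal, hypothetical sketch of that training step, assuming a
# PyBrain feed-forward network and the same loadPybrainData() helper; the layer sizes
# (2500 inputs for 50x50 pixel glyphs, 14 outputs for the 14-character iconset),
# learning rate and epoch count are illustrative guesses, not values from the original project.
def train_and_save(pkl_path=PKL, hidden=100, epochs=50):
    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.supervised.trainers import BackpropTrainer
    ds = loadPybrainData()
    fnn = buildNetwork(2500, hidden, 14, bias=True)
    trainer = BackpropTrainer(fnn, ds, learningrate=0.01, momentum=0.1)
    trainer.trainUntilConvergence(maxEpochs=epochs)
    # Persist the trained network so predict()/test() can joblib.load(PKL) it later.
    joblib.dump(fnn, pkl_path)
    return fnn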
def load_trained_model(self, classifier):
    filename = '{}.pkl'.format(classifier.__name__.lower())
    path = os.path.join(self.data_path, filename)
    # palliative: this outputs a model too large for joblib
    if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
        model = classifier()
        model.fit(self.dataset)
    else:
        if os.path.isfile(path):
            model = joblib.load(path)
        else:
            model = classifier()
            model.fit(self.dataset)
            joblib.dump(model, path)
    return model
def loadcolumn(filename, col=4, skip=1, floats=True):
    pred = []
    op = open(filename, 'r')
    if skip == 1:
        op.readline()  # skip the header line
    for line in op:
        line = line.replace('\n', '')
        sps = line.split(',')
        # load the requested column from every row
        if floats:
            pred.append(float(sps[col]))
        else:
            pred.append(str(sps[col]))
    op.close()
    return pred

# functions to manipulate pickles
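# The comment above announces pickle helpers that are cut off in this excerpt. As a
# purely hypothetical illustration of what such helpers commonly look like, a pair of
# thin wrappers around joblib (the names dump_pickle/load_pickle are mine, not from
# the original file):
from sklearn.externals import joblib

def dump_pickle(obj, path):
    # Persist any picklable object (model, array, list of predictions) to disk.
    joblib.dump(obj, path)

def load_pickle(path):
    # Load an object previously saved with dump_pickle.
    return joblib.load(path)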
def is_target(screen_name, disable_targeting, model_file='cluster.pkl'):
    """
    Returns a boolean for whether the user should be selected, according
    to the label returned by a prediction from a pretrained clustering
    algorithm.
    """
    if disable_targeting:
        return True
    else:
        auth = tweepy.OAuthHandler(credentials.consumer_key,
                                   credentials.consumer_secret)
        auth.set_access_token(credentials.access_token,
                              credentials.access_token_secret)
        api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
        user_array = numpy.array([api.get_user(screen_name=screen_name)])
        model = joblib.load(model_file)
        # predict() returns an array of labels; take the single element
        cluster_label = model.predict(user_array)
        return cluster_label[0] == 1
def get_cache_file(model_id, index, cache_dir='', suffix='csv'):
    # Index identification trick:
    # if the sums of the first 20 index values match, treat them as the same index.
    if index is None:
        raise IOError
    if len(index) < 20:
        sum_index = sum(index)
    else:
        sum_index = sum(index[:20])
    return "{0}{1}_{2}.{3}".format(cache_dir,
                                   model_id,
                                   sum_index,
                                   suffix)
##def saving_fit(learner, X, y, index):
##    import os
##    pkl_file = "{0}_{1}_{2}.pkl".format(learner.id, min(index), max(index))
##    try:
##        learner = joblib.load(pkl_file)
##        print("**** learner is loaded from {0} ****".format(pkl_file))
##    except IOError:
##        learner.fit(X, y)
##        joblib.dump(learner, pkl_file)
##    return learner
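# A short, hypothetical usage sketch of the index trick described above: two index
# sequences that share the same first 20 values resolve to the same cache file, so
# cached out-of-fold predictions can be reused (model id and cache_dir are made up).
idx_a = list(range(100))           # full training index
idx_b = list(range(20)) + [999]    # different tail, same first 20 values
path_a = get_cache_file('xgb1', idx_a, cache_dir='cache/')
path_b = get_cache_file('xgb1', idx_b, cache_dir='cache/')
assert path_a == path_b            # both resolve to 'cache/xgb1_190.csv'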
def KmeansWrapper(true_k, data, load=False):
    from sklearn.externals import joblib
    modelName = 'doc_cluster.%s.plk' % true_k
    if load:
        km = joblib.load(modelName)
        labels = km.labels_
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    # max_iter=1000,
                    n_init=10,
                    n_jobs=-1,
                    random_state=0,
                    verbose=0)
        km.fit_predict(data)
        labels = km.labels_
        joblib.dump(km, modelName)
    return labels, km.cluster_centers_
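# Brief usage sketch, assuming X is a precomputed document-term matrix (the cluster
# count is arbitrary): the first call fits KMeans and dumps it to 'doc_cluster.5.plk',
# a later call with load=True reuses that pickled model instead of refitting.
labels, centers = KmeansWrapper(true_k=5, data=X)
labels, centers = KmeansWrapper(true_k=5, data=X, load=True)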
def __init__(self, is_training=True, is_skip=False, batch_size=100, is_TopKloss=True,
             word2vec_model='./model/word2vec/ourword2vec.pkl'):
    # word2vec_model='/media/wwt/860G/model/word2vec/cn.cbow.bin'
    # self.model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model, binary=True, unicode_errors='ignore')
    self.word2vec = pkl.load(open(word2vec_model, 'r'))
    self.batch_size = batch_size
    self.weight_decay = 0.0005
    self.endpoint = {}
    self.is_skip = is_skip
    self.is_TopKloss = is_TopKloss
    self.is_training = is_training
    self.keep_prob = 0.5 if is_training else 1.0
    self.build_input()
    # self.build_matchnet()
    # self.build_classify()
    # self.build_crossEnt_class()
    # self.loss_weight = 0.
    self.build_unite()
    if is_training:
        # self.build_summary()
        # self.build_summary_crossEnt()
        self.build_summary_unite()
def generate_fishervector(sample_set, ica_model='./model/ICA/ica_ourword2vec.model',
                          gmm_model_path='./model/GMM/gmm_ourword2vec.model', max_num=30000):
    ica = joblib.load(ica_model)
    gmm_model = pkl.load(open(gmm_model_path, 'r'))
    centers = gmm_model[0].shape[0]
    dims = gmm_model[1].shape[1]
    fishervector = np.zeros([len(sample_set), centers * dims * 2]) + 0.00001
    for i, v in enumerate(sample_set):
        words = v.strip().split(' ')
        words = words[:min(len(words), max_num + 200)]
        vectors = []
        for j in words:
            try:
                vectors.append(word2vec_model[j])
            except:
                pass  # print 'Not found %s' % j
        if len(vectors) > 0:
            vectors = vectors[:min(len(vectors), max_num)]
            fishervector[i] = yael.ynumpy.fisher(gmm_model,
                                                 ica.transform(np.array(vectors)).astype(np.float32),
                                                 include='mu sigma')
    print 'mean vectors is', fishervector.mean(0)
    return fishervector
def init_state(indata, test=False):
    close = indata['close'].values
    diff = np.diff(close)
    diff = np.insert(diff, 0, 0)
    sma15 = SMA(indata, timeperiod=15)
    sma60 = SMA(indata, timeperiod=60)
    rsi = RSI(indata, timeperiod=14)
    atr = ATR(indata, timeperiod=14)
    #--- Preprocess data
    xdata = np.column_stack((close, diff, sma15, close - sma15, sma15 - sma60, rsi, atr))
    xdata = np.nan_to_num(xdata)
    if test == False:
        scaler = preprocessing.StandardScaler()
        xdata = np.expand_dims(scaler.fit_transform(xdata), axis=1)
        joblib.dump(scaler, 'data/scaler.pkl')
    elif test == True:
        scaler = joblib.load('data/scaler.pkl')
        # Reuse the scaler fitted on training data; transform() (not fit_transform())
        # keeps the test features on the same scale as training.
        xdata = np.expand_dims(scaler.transform(xdata), axis=1)
    state = xdata[0:1, 0:1, :]
    return state, xdata, close

# Take Action
def model_loader(model_dict, adv=None, rd=None, rev=None):
    """
    Returns a classifier object if it already exists. Returns None otherwise.
    """
    if adv is None:
        adv_mag = None
    # Note: adv_mag is only assigned when adv is None; if adv is given, the NameError
    # raised below is swallowed by the broad except and None is returned.
    print('Loading model...')
    abs_path_m = resolve_path_m(model_dict)
    try:
        clf = joblib.load(abs_path_m + get_svm_model_name(model_dict, adv, adv_mag, rd, rev) +
                          '.pkl')
    except BaseException:
        clf = None
    return clf
#------------------------------------------------------------------------------#
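# A hypothetical usage sketch of the load-or-train pattern this helper enables; the
# train_svm name and the training data are illustrative, only model_loader,
# resolve_path_m and get_svm_model_name come from the snippet above.
clf = model_loader(model_dict)
if clf is None:
    clf = train_svm(model_dict, X_train, y_train)   # hypothetical training helper
    abs_path_m = resolve_path_m(model_dict)
    # Dump under the same name the loader expects, so the next run finds it.
    joblib.dump(clf, abs_path_m + get_svm_model_name(model_dict, None, None, None, None) + '.pkl')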
def vect_(self):
    if not hasattr(self, '_vect') or self._vect is None:
        mid = self.dsid
        mid_dir = self.cache_dir / mid
        if not mid_dir.exists():
            raise ValueError(('Vectorizer model id {} ({}) '
                              'not found in the cache {}!')
                             .format(mid, mid_dir, self.cache_dir))
        fname = mid_dir / 'vectorizer'
        if self.pars_['use_hashing']:
            self._vect = joblib.load(str(fname))
        else:
            # this is much faster in Python 3 as cPickle is used
            # (only works if no numpy arrays are used)
            with fname.open('rb') as fh:
                self._vect = pickle.load(fh)
    return self._vect
def _show(args):
    cache_dir = _parse_cache_dir(args.cache_dir)
    p = PipelineFinder.by_id(mid=args.mid, cache_dir=cache_dir)
    print(p)
    print(' * model_id: {}'.format(args.mid))
    print(' * model_type: {}'.format(list(p.keys())[-1]))
    print(' * file_path: {}'.format(p.get_path()))
    try:
        pars = joblib.load(os.path.join(p.get_path(), 'pars'))
        for key, val in pars.items():
            val_str = str(val)
            if len(val_str) > 30 and not isinstance(val, dict):
                continue
            print(' * {}: {}'.format(key, val_str))
    except:
        pass
def test_get_feature_extraction(app, hashed, weighting):
    norm_alpha = 0.5
    dsid, _, _ = get_features_cached(app, hashed=hashed, weighting=weighting,
                                     norm_alpha=norm_alpha)
    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)
    assert dict2type(data, collapse_lists=True) == {
        'analyzer': 'str',
        'ngram_range': ['int'], 'stop_words': 'str',
        'n_jobs': 'int', 'chunk_size': 'int',
        'data_dir': 'str', 'n_samples': 'int',
        'n_features': 'int', 'weighting': 'str',
        'norm_alpha': 'float', 'use_hashing': 'bool',
        'filenames': ['str'], 'max_df': 'float', 'min_df': 'float',
        'parse_email_headers': 'bool', 'n_samples_processed': 'int',
        'preprocess': []}
    assert data['use_hashing'] == hashed
    assert data['weighting'] == weighting
    assert data['norm_alpha'] == norm_alpha
    vect = joblib.load(os.path.join(CACHE_DIR, 'ediscovery_cache', dsid, 'vectorizer'))
    assert (data['use_hashing'] is True) == ('hashing' in type(vect).__name__.lower())
def predict_function():
    x_list = []
    line_list = []
    line_dict = {}
    predict_doc = joblib.load('logreg.pkl')
    feature_doc = joblib.load("word_vec.pkl")
    y_train, x_train = get_feature()
    line = "bad bad good good"
    line_list = line.split()
    for line in x_train:
        for key in line:
            line_dict[key] = 0
    line_dict.update(dict(Counter(line_list)))
    for a in sorted(line_dict.items(), key=lambda x: x[1]):
        print(a)
    x_list.append(line_dict)
    print(x_list)
    exit()
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    for pred, prob in zip(pred, prob):
        print(pred, prob)
def from_file(cls, path):
    """
    Load a classifier from file.
    :param path: path to the serialized vector classifier
    :return: the new classifier instance
    """
    # Create a new classifier instance
    classifier = cls()
    # Load the classifier
    classifier.vector_classifier = joblib.load(path)
    # Return the classifier
    return classifier

# -----------------------------------------------------------------
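# For symmetry, a hypothetical to_file counterpart showing how the vector_classifier
# would be persisted in the first place; this method is a sketch meant to live in the
# same class, not taken from the original code.
def to_file(self, path):
    """
    Save the underlying vector classifier so it can later be restored with from_file().
    :param path: destination path for the serialized classifier
    """
    from sklearn.externals import joblib
    joblib.dump(self.vector_classifier, path)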
# Source: BidirectionNet_lstm.py, project Sohu-LuckData-Image-Text-Matching-Competition (author: WeitaoVan)
def __init__(self, is_training=True, is_skip=False, batch_size=100, is_TopKloss=True,
             word2vec_model='/media/wwt/860G/data/souhu_data/fusai/train/word2vec_11w.pkl'):
    # word2vec_model='/media/wwt/860G/model/word2vec/cn.cbow.bin'
    # self.model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model, binary=True, unicode_errors='ignore')
    self.word2vec = pkl.load(open(word2vec_model, 'r'))
    self.batch_size = batch_size
    self.weight_decay = 0.000001
    self.endpoint = {}
    self.is_skip = is_skip
    self.is_TopKloss = is_TopKloss
    self.is_training = is_training
    self.keep_prob = 0.5 if is_training else 1.0
    self.build_input()
    # self.build_matchnet()
    # self.build_classify()
    # self.build_crossEnt_class()
    self.loss_weight = 0.
    self.build_unite(self.loss_weight)
    if is_training:
        # self.build_summary()
        # self.build_summary_crossEnt()
        self.build_summary_unite()
def load_movielens(version):
    data_home = get_data_dirs()[0]
    if version == "100k":
        path = os.path.join(data_home, "movielens100k", "movielens100k.pkl")
    elif version == "1m":
        path = os.path.join(data_home, "movielens1m", "movielens1m.pkl")
    elif version == "10m":
        path = os.path.join(data_home, "movielens10m", "movielens10m.pkl")
    else:
        raise ValueError("Invalid version of movielens.")
    # FIXME: make downloader
    if not os.path.exists(path):
        raise ValueError("Download dataset using 'make download-movielens%s' at"
                         " project root." % version)
    X = load(path)
    return X
def predict_song(wavfile):
    sampling_rate, song_array = scipy.io.wavfile.read(wavfile)
    song_array[song_array == 0] = 1
    ceps, mspec, spec = mfcc(song_array)
    base_wav, ext = os.path.splitext(wavfile)
    data_wav = base_wav + ".ceps"
    np.save(data_wav, ceps)
    # features
    X = []
    Y = []
    ceps = np.load(data_wav + ".npy")
    num_ceps = len(ceps)
    X.append(np.mean(ceps[int(num_ceps * 1 / 10): int(num_ceps * 9 / 10)], axis=0))
    # prediction
    # print predict_file(X)
    genre_list = ["country", "hiphop", "metal", "pop", "reggae", "rock"]
    clf = joblib.load('./analyser/ml_utils/genre_classify/model_ceps.pkl')
    index = clf.predict(X)
    return genre_list[index[0]]
def backtesting_with_lstm():
    model = get_loaded_model()
    df = pd.read_csv('btc_etc.csv').rename(columns={
        'Close': 'close',
        'Date time': 'datetime',
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Volume': 'volume'
    })
    ds = DataSeries(df)
    scaler = joblib.load(open('scaler.sav', 'rb'))
    look_back = 1
    bt = NNBT(ds, 1000.0, model, look_back, scaler)
    bt.run()
    print('Profit: ${:.2f}'.format(bt.get_profit()))
def use_model_to_predict(test_df, model):
    test_df.drop(['label'], axis=1, inplace=True)
    print 'Fix Missing App Count Value...'
    model_miss = joblib.load('XGB_missing.model')
    test_df = fix_missing_appcounts(test_df, model_miss)
    '''print 'Fix Missing Age Value...'
    model_age = joblib.load('XGB_age.model')
    test_df = fix_missing_age(test_df, model_age)'''
    test_df.drop(['marriageStatus', 'haveBaby', 'sitesetID', 'positionType'], axis=1, inplace=True)
    print 'Done'
    print test_df.info()
    print test_df.describe()
    print test_df.isnull().sum()
    test_np = test_df.as_matrix()
    X = test_np[:, 1:]
    print 'Use Model To Predict...'
    predicts = model.predict(X)
    result = pd.DataFrame({'instanceID': test_df['instanceID'].as_matrix(), 'prob': predicts})
    # print predicts  #, predicts.min(axis=0), predicts.max(axis=0), predicts.sum(axis=1)
    return result
def generate_submission(test_dataset, sess, eval_prediction, eval_data_node):
    test_labels = eval_in_batches(test_dataset, sess, eval_prediction, eval_data_node)
    test_labels *= 96.0
    test_labels = test_labels.clip(0, 96)
    lookup_table = pd.read_csv(FLOOKUP)
    values = []
    cols = joblib.load('data/cols.pkl')
    for index, row in lookup_table.iterrows():
        values.append((
            row['RowId'],
            test_labels[row.ImageId - 1][np.where(cols == row.FeatureName)[0][0]],
        ))
    submission = pd.DataFrame(values, columns=('RowId', 'Location'))
    submission.to_csv('data/submission.csv', index=False)
def make_submission(test_labels):
    test_labels *= 96.0
    test_labels = test_labels.clip(0, 96)
    lookup_table = pd.read_csv(FLOOKUP)
    values = []
    cols = joblib.load('data/cols.pkl')
    for index, row in lookup_table.iterrows():
        values.append((
            row['RowId'],
            test_labels[row.ImageId - 1][np.where(cols == row.FeatureName)[0][0]],
        ))
    submission = pd.DataFrame(values, columns=('RowId', 'Location'))
    submission.to_csv('data/submission.csv', index=False)
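# Both submission helpers above rely on a 'data/cols.pkl' array that maps each output
# column to a keypoint name. A hypothetical sketch of how that array is typically
# produced during training (the CSV layout, keypoint columns followed by an 'Image'
# column, is assumed from the Kaggle facial-keypoints format this code appears to use):
import pandas as pd
from sklearn.externals import joblib

train_df = pd.read_csv('data/training.csv')
cols = train_df.columns[:-1].values      # keypoint column names, e.g. 'left_eye_center_x'
joblib.dump(cols, 'data/cols.pkl')       # read back later by the submission code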
def splity(im):
    s = ''
    w, h = im.size
    pix = im.load()
    for j in xrange(h):
        num = 0
        for i in xrange(w):
            # print pix[i,j]
            if pix[i, j] == BLACK:
                num += 1
        if num > 0:
            s += '1'
        else:
            s += '0'
    # print s
    start = s.find('1')
    end = s.rfind('1')
    return im.crop((0, start, w, end))
def __init__(self, s_date):
    prev_bd = int(s_date[:6]) - 1
    prev_ed = int(s_date[9:15]) - 1
    if prev_bd % 100 == 0: prev_bd -= 98
    if prev_ed % 100 == 0: prev_ed -= 98
    pred_s_date = "%d01_%d01" % (prev_bd, prev_ed)
    prev_model = '../model/tflearn/lstm/%s' % pred_s_date
    self.model_dir = '../model/tflearn/lstm/%s' % s_date
    tf.reset_default_graph()
    tflearn.init_graph(gpu_memory_fraction=0.1)
    input_layer = tflearn.input_data(shape=[None, 30, 23], name='input')
    lstm1 = tflearn.lstm(input_layer, 23, dynamic=True, name='lstm1')
    dense1 = tflearn.fully_connected(lstm1, 1, name='dense1')
    output = tflearn.single_unit(dense1)
    regression = tflearn.regression(output, optimizer='adam', loss='mean_square',
                                    metric='R2', learning_rate=0.001)
    self.estimators = tflearn.DNN(regression)
    if os.path.exists('%s/model.tfl' % prev_model):
        self.estimators.load('%s/model.tfl' % prev_model)
def load_vocab(self, vocab):
    self.vocab = self.load('model/vocab.pkl')
    self.max_length = self.load('model/max_length.pkl')
    if self.vocab != None and self.max_length != None:
        return
    vocab_temp, self.max_length = utils.load_data2list_string(vocab)
    # vocab_temp = sorted(vocab_temp, key=lambda s: len(s.split()), reverse=True)
    vocab_temp = filter(lambda s: len(s.split()) > 1, vocab_temp)  # remove words that have only one syllable
    vocab_temp_clone = map(lambda s: s.replace(u' ', u'_'), vocab_temp)
    self.vocab = {i: {} for i in xrange(1, self.max_length + 1)}
    for i in xrange(len(vocab_temp)):
        s = vocab_temp[i]
        ss = vocab_temp_clone[i]
        w = s.split()[0]
        length = vocab_temp[i].count(u' ')
        try:
            self.vocab[length][w].update({s: ss})
        except:
            self.vocab[length].update({w: {s: ss}})
    print('size of vocab = %d' % (len(vocab_temp)))
    self.save_model(self.vocab, 'model/vocab.pkl')
    self.save_model(self.max_length, 'model/max_length.pkl')
def classify_comment(comment):
    """Classify the comment.

    :param comment: should have a message attribute.
    """
    global _comment_pipeline
    from sklearn.externals import joblib
    model_is_not_loaded = _comment_pipeline is None
    if model_is_not_loaded:
        import pygameweb.comment.classifier_train
        import pygameweb.config
        model_fname = pygameweb.config.Config.COMMENT_MODEL
        _comment_pipeline = joblib.load(model_fname)
    return _comment_pipeline.predict([comment.message])[0]
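# Short usage sketch: any object with a message attribute will do; the FakeComment
# class below is illustrative only.
class FakeComment:
    def __init__(self, message):
        self.message = message

label = classify_comment(FakeComment('Buy cheap widgets now!!!'))
print(label)   # whatever label the persisted pipeline predicts, e.g. spam / not spam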