Example source code using the Python class TfidfVectorizer()

reuters_classifier.py (project: ml-projects, author: saopayne)
def represent(documents):

    train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

    # Tokenization
    vectorizer = TfidfVectorizer(tokenizer=tokenize)

    # Learn and transform train documents
    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)

    # Transform multilabel labels
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])

    return vectorised_train_documents, train_labels, vectorised_test_documents, test_labels
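A minimal usage sketch, assuming the NLTK Reuters corpus is downloaded and `tokenize` is defined elsewhere in the project: the document IDs come from `reuters.fileids()`, and the returned sparse matrices and binarized labels can feed any multilabel-capable classifier, for example a one-vs-rest linear SVM.

# Hypothetical usage; `represent` and `tokenize` come from the snippet above.
from nltk.corpus import reuters
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

X_train, y_train, X_test, y_test = represent(reuters.fileids())
classifier = OneVsRestClassifier(LinearSVC())  # one binary SVM per Reuters category
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))        # subset accuracy on the test split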
ContentTest.py (project: newsrecommender, author: Newsrecommender)
def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
        """
        Define a binary CountVectorizer (Feature Presence) using n-grams and min and max document frequency
        :param ngram_range: n-grams are created for all numbers within this range
        :param min_df: min document frequency of features
        :param max_df: max document frequency of features
        :return:
        """
        if self.is_weight == 'FP':#Feature Presence
            vectorizer = CountVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')

        if self.is_weight == 'TF-IDF':  # TF-IDF weighting
            vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')
        return vectorizer
nlp_utils.py (project: search_relevance, author: rmanak)
def getTFV(token_pattern = token_pattern,
           norm = tfidf__norm,
           max_df = tfidf__max_df,
           min_df = tfidf__min_df,
           ngram_range = (1, 1),
           vocabulary = None,
           stop_words = 'english'):
    tfv =TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=None, 
                         strip_accents='unicode', analyzer='word', 
                         token_pattern=token_pattern,
                         ngram_range=ngram_range, use_idf=True, 
                         smooth_idf=True, sublinear_tf=True,
                         stop_words = stop_words, norm=norm, vocabulary=vocabulary)
    return tfv   
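A brief usage sketch: the defaults `token_pattern`, `tfidf__norm`, `tfidf__max_df` and `tfidf__min_df` in the signature are module-level constants in the original file, so explicit keyword arguments are passed here instead.

# Hypothetical call with explicit arguments rather than the module-level defaults.
tfv = getTFV(token_pattern=r"(?u)\b\w\w+\b", norm="l2",
             max_df=0.95, min_df=2, ngram_range=(1, 2))
X = tfv.fit_transform(["first product title", "second product title"])
print(X.shape)  # (2, number of extracted n-gram features)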


#========= CountVectorizer =========#
utils_data_cleaning.py (project: auto_ml, author: doordash)
def __init__(self, column_descriptions=None):
        self.column_descriptions = column_descriptions
        self.text_col_indicators = set(['text', 'nlp'])

        self.text_columns = {}
        for key, val in self.column_descriptions.items():
            if val in self.text_col_indicators:
                self.text_columns[key] = TfidfVectorizer(
                    # If we have any documents that cannot be decoded properly, just ignore them and keep going as planned with everything else
                    decode_error='ignore'
                    # Try to strip accents from characters. Using unicode is slightly slower but more comprehensive than 'ascii'
                    , strip_accents='unicode'
                    # Can also choose 'character', which will likely increase accuracy, at the cost of much more space, generally
                    , analyzer='word'
                    # Remove commonly found english words ('it', 'a', 'the') which do not typically contain much signal
                    , stop_words='english'
                    # Convert all characters to lowercase
                    , lowercase=True
                    # Ignore terms that appear in more than max_df of all documents
                    # In this case, drop any word that shows up in over 90% of all documents
                    , max_df=0.9
                    # Consider only the most frequently occurring 3000 words, after taking into account all the other filtering going on
                    , max_features=3000
                )
topic_modeling.py (project: glassdoor-analysis, author: THEdavehogue)
def fit_tfidf(self, df):
        '''
        Function to fit a TF-IDF matrix to a corpus of text

        INPUT:
            df: df with 'lemmatized_text' to analyze
        '''
        self.tfidf = TfidfVectorizer(input='content',
                                     use_idf=True,
                                     lowercase=True,
                                     max_features=self.tfidf_max_features,
                                     max_df=self.tfidf_max_df,
                                     min_df=self.tfidf_min_df)
        self.tfidf_matrix = self.tfidf.fit_transform(
            df['lemmatized_text']).toarray()
        self.tfidf_features = np.array(self.tfidf.get_feature_names())
        self.tfidf_reverse_lookup = {
            word: idx for idx, word in enumerate(self.tfidf_features)}
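A hypothetical follow-up, assuming `tm` is an instance of the surrounding class whose `fit_tfidf(df)` has already been called on a DataFrame with a 'lemmatized_text' column; the reverse lookup maps a term to its column in the dense TF-IDF matrix.

# Hypothetical inspection of the fitted attributes; `tm` is an instance of the class above.
idx = tm.tfidf_reverse_lookup.get('manager')
if idx is not None:
    term_scores = tm.tfidf_matrix[:, idx]  # TF-IDF weight of 'manager' per document
    print(term_scores.argmax())            # index of the document where it weighs most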
utils.py (project: deeppavlov, author: deepmipt)
def create_vectorizer_selector(train_data, train_labels, model_file,
                               ngram_list=[1], max_num_features_list=[100],
                               analyzer_type_list=['word']):
    """Call creation and save of vectorizers and selectors including special cases.

    Args:
        train_data: list of train text samples
        train_labels:  list of train labels
        model_file: model filename
        ngram_list: list of ranges of n-grams
        max_num_features_list: list of maximum number of features to select
        analyzer_type_list: list of analyzer types for TfidfVectorizer 'word' or 'char'

    Returns:
        nothing
    """
    for i in range(len(ngram_list)):
        ngrams_selection(train_data, train_labels, 'general_' + str(i), model_file,
                         ngram_range_=(ngram_list[i], ngram_list[i]),
                         max_num_features=max_num_features_list[i],
                         analyzer_type=analyzer_type_list[i])
    you_are_data = ngrams_you_are(train_data)
    ngrams_selection(you_are_data, train_labels, 'special', model_file,
                     ngram_range_=(1,1), max_num_features=100)
    return
datasets.py (project: sef, author: passalis)
def load_20ng_dataset_bow():
    """
    Loads the 20NG dataset
    :return:
    """

    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')

    # Convert data to tf-idf

    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95)
    train_data = vectorizer.fit_transform(newsgroups_train.data)
    test_data = vectorizer.transform(newsgroups_test.data)
    train_data = train_data.todense()
    test_data = test_data.todense()
    train_labels = newsgroups_train.target
    test_labels = newsgroups_test.target

    return train_data, train_labels, test_data, test_labels
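A short usage sketch for the loader (scikit-learn and the 20 Newsgroups download are assumed to be available); `np.asarray` avoids passing the `numpy.matrix` objects returned by `.todense()` straight into an estimator.

import numpy as np
from sklearn.linear_model import LogisticRegression

# Hypothetical downstream use of the dense 20NG matrices.
train_data, train_labels, test_data, test_labels = load_20ng_dataset_bow()
clf = LogisticRegression(max_iter=1000)
clf.fit(np.asarray(train_data), train_labels)
print(clf.score(np.asarray(test_data), test_labels))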
utils_data_cleaning.py (project: auto_ml, author: doordash)
def fit(self, X_df, y=None):

        # See if we should fit TfidfVectorizer or not
        for key in X_df.columns:

            if key in self.text_columns:
                X_df[key].fillna('nan', inplace=True)
                text_col = X_df[key].astype(str, raise_on_error=False)
                self.text_columns[key].fit(text_col)

                col_names = self.text_columns[key].get_feature_names()

                # Make weird characters play nice, or just ignore them :)
                for idx, word in enumerate(col_names):
                    try:
                        col_names[idx] = str(word)
                    except:
                        col_names[idx] = 'non_ascii_word_' + str(idx)

                col_names = ['nlp_' + key + '_' + str(word) for word in col_names]

                self.text_columns[key].cleaned_feature_names = col_names

        return self
_model.py (project: probablyPOTUS, author: jjardel)
def train(self, train_size=0.8, k_folds=5):

        # retrieve data from DB and pre-process
        self._get_data()

        # perform train/test split
        self._get_train_test_split(train_size=train_size)

        # define text pre-processing pipeline
        text_pipeline = Pipeline([
            ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
            ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
        ])

        # define pipeline for pre-processing of numeric features
        numeric_pipeline = Pipeline([
            ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
            ('scaler', MinMaxScaler())
        ])

        # combine both steps into a single pipeline
        pipeline = Pipeline([
            ('features', FeatureUnion([
                ('text_processing', text_pipeline),
                ('num_processing', numeric_pipeline)
            ])),
            ('clf', self._estimator)
        ])

        self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
        gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

        X = self.data.iloc[self.train_inds_, :]
        y = self.data[LABEL].values[self.train_inds_]

        gs.fit(X, y)

        self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

        self.gs_ = gs
        self.model_ = gs.best_estimator_
data.py (project: geomdn, author: afshinrahimi)
def tfidf(self):
        #keep both hashtags and mentions
        #token_pattern=r'(?u)@?#?\b\w\w+\b'
        #remove hashtags and mentions
        #token_pattern = r'(?u)(?<![#@])\b\w+\b'
        #just remove mentions and remove hashsign from hashtags
        #token_pattern = r'(?u)(?<![@])\b\w+\b'
        #remove mentions but keep hashtags with their sign
        #token_pattern = r'(?u)(?<![@])#?\b\w\w+\b'
        #remove multple occurrences of a character after 2 times yesss => yess
        #re.sub(r"(.)\1+", r"\1\1", s)
        self.vectorizer = TfidfVectorizer(tokenizer=self.tokenizer, token_pattern=self.token_pattern, use_idf=self.idf, 
                                    norm=self.norm, binary=self.btf, sublinear_tf=self.subtf, 
                                    min_df=self.mindf, max_df=self.maxdf, ngram_range=(1, 1), stop_words=self.stops, 
                                     vocabulary=self.vocab, encoding=self.encoding, dtype='float32')
        logging.info(self.vectorizer)
        self.X_train = self.vectorizer.fit_transform(self.df_train.text.values)
        self.X_dev = self.vectorizer.transform(self.df_dev.text.values)
        self.X_test = self.vectorizer.transform(self.df_test.text.values)
        logging.info("training    n_samples: %d, n_features: %d" % self.X_train.shape)
        logging.info("development n_samples: %d, n_features: %d" % self.X_dev.shape)
        logging.info("test        n_samples: %d, n_features: %d" % self.X_test.shape)
kmeans.py (project: PPRE, author: MaoYuwei)
def loadDataset():
    '''Load the dataset from df_vec.csv'''
    df = pd.read_csv('df_vec.csv')
    # print df.shape
    X = np.array(df.iloc[:, 1:])
    y = np.array(df.iloc[:, 0])
    # print y
    # bet_list = list(df.iloc[:, 0])
    # dataset = []
    # for bet in bet_list:
    #     s, bet = bet.split(':')
    #     dataset.append(bet)

    # print dataset
    # print X
    # print y
    return X, y


# def transform(dataset, n_features=1000):
#     vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
#     X = vectorizer.fit_transform(dataset)
#     print X
#     # print vectorizer
#     return X, vectorizer
pipeline_factory.py (project: UrbanSearch, author: urbansearchTUD)
def get_binary(self):
        return Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
            ('feat_select', SelectPercentile(percentile=10)),
            ('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
                                                      average=False,
                                                      class_weight=None,
                                                      epsilon=0.1,
                                                      eta0=0.0,
                                                      fit_intercept=True,
                                                      l1_ratio=0.15,
                                                      learning_rate='optimal',
                                                      loss='log',
                                                      n_iter=10,
                                                      n_jobs=1,
                                                      penalty='l2',
                                                      power_t=0.5,
                                                      random_state=None,
                                                      shuffle=True,
                                                      verbose=0,
                                                      warm_start=False
            )))
        ])
pipeline_factory.py (project: UrbanSearch, author: urbansearchTUD)
def get_sgdc(self):
        return Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
            ('feat_select', SelectPercentile(percentile=10)),
            ('clf', SGDClassifier(alpha=0.0001,
                                  average=False,
                                  class_weight=None,
                                  epsilon=0.1,
                                  eta0=0.0,
                                  fit_intercept=True,
                                  l1_ratio=0.15,
                                  learning_rate='optimal',
                                  loss='log',
                                  n_iter=10,
                                  n_jobs=1,
                                  penalty='l2',
                                  power_t=0.5,
                                  random_state=None,
                                  shuffle=True,
                                  verbose=0,
                                  warm_start=False))
        ])
rank_verbs.py (project: StrepHit, author: Wikidata)
def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
    """ Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.

        :param str verb_token: Surface form of a verb, e.g., *born*
        :param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
         used to transform verbs into vectors
        :return: cosine similarity score
        :rtype: ndarray
    """
    verb_token_vector = vectorizer.transform([verb_token])
    # Here the linear kernel is the same as the cosine similarity, but faster
    # cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    scores = linear_kernel(verb_token_vector, tf_idf_matrix)
    logger.debug("Corpus-wide TF/IDF scores for '%s': %s" % (verb_token, scores))
    logger.debug("Average TF/IDF score for '%s': %f" % (verb_token, average(scores)))
    return scores
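A minimal sketch of how the two arguments are prepared, assuming the vectorizer was fit on the same verb corpus that produced `tf_idf_matrix` and that the function above is importable from its module (which also provides `logger` and `average`).

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical corpus of verb surface forms.
corpus = ['born', 'died', 'founded', 'married']
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(corpus)
scores = get_similarity_scores('born', vectorizer, tf_idf_matrix)
print(scores.argmax())  # index of the most similar corpus entry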
util.py (project: topic-ensemble, author: derekgreene)
def preprocess_simple( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True ):
    """
    Preprocess a list containing text documents stored as strings, where the documents have already been tokenized and are separated by whitespace
    """
    token_pattern = re.compile(r"[\s\-]+", re.U)

    def custom_tokenizer( s ):
        return [x.lower() for x in token_pattern.split(s) if (len(x) >= min_term_length) ]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
    X = tfidf.fit_transform(docs)
    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[ v[term] ] = term
    return (X,terms)
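A short usage sketch: the documents are expected to be whitespace-tokenized strings, so the custom tokenizer only lower-cases and filters short tokens, and the returned `terms` list maps matrix columns back to terms.

# Hypothetical usage of the preprocessing helper above.
docs = ["machine learning topic models", "topic models for short documents"]
X, terms = preprocess_simple(docs, stopwords=["for"], min_df=1)
print(X.shape, terms[:5])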
cluster_manager.py (project: texta, author: texta-tk)
def _vectorize_documents(self,method='tfidf',max_features=100):
        stop_words = []

        try:
            for lexicon_id in self.params['cluster_lexicons']:
                lexicon = Lexicon.objects.get(id=int(lexicon_id))
                words = Word.objects.filter(lexicon=lexicon)
                stop_words+=[word.wrd for word in words]
        except:
            KeyError

        if method == 'count':
            vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
        if method == 'tfidf':
            vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)

        document_vectors = vectorizer.fit_transform(self.documents)
        document_vectors = document_vectors.toarray()

        return document_vectors,vectorizer.get_feature_names()
similar_posts.py (project: hugo_similar_posts, author: elbaulp)
def generateTfIdfVectorizer(data, stop='english', max_df=0.08, min_df=8):
    tokenizer = tokenizer_snowball if stop != 'english' else tokenizer_porter

    tfidf = TfidfVectorizer(strip_accents=None,
                            max_df=max_df,
                            min_df=min_df,
                            lowercase=True,
                            stop_words=stop,
                            sublinear_tf=True,
                            tokenizer=tokenizer,
                            analyzer='word',
                            max_features=16,
                            preprocessor=preprocessor)
    X = tfidf.fit_transform(data)
    print('%d Features: %s' %
          (len(tfidf.get_feature_names()), tfidf.get_feature_names()))

    return X
similar_posts.py (project: hugo_similar_posts, author: elbaulp)
def gridSearch(data, params, true_k):

    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])
    gsTfIdf = GridSearchCV(
        lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Content_Based.py (project: newsrecommender, author: Newsrecommender)
def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
        """
        Define a binary CountVectorizer (Feature Presence) using n-grams and min and max document frequency
        :param ngram_range: n-grams are created for all numbers within this range
        :param min_df: min document frequency of features
        :param max_df: max document frequency of features
        :return:
        """
        if self.is_weight == 'FP':#Feature Presence
            vectorizer = CountVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')

        if self.is_weight == 'TF-IDF':  # TF-IDF weighting
            vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')
        return vectorizer
word_cluster.py (project: PolBotCheck, author: codeforfrankfurt)
def get_word_clouds(tweets, users, words_n=50, lang='english'):
    default_stopwords = set(nltk.corpus.stopwords.words(lang))
    stopwords_file = '../data/stopwords.txt'
    custom_stopwords = set(open(stopwords_file, 'r').read().splitlines())
    all_stopwords = default_stopwords | custom_stopwords

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=list(all_stopwords))
    X = vectorizer.fit_transform(tweets)
    terms = vectorizer.get_feature_names()

    word_cloud_per_person = {}
    for doc in range(len(tweets)):
        feature_index = X[doc, :].nonzero()[1]
        tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
        doc_terms = []
        for word, score in [(terms[i], score) for (i, score) in tfidf_scores]:
            doc_terms.append((word, score))
        important_terms = [(word, score) for word, score in sorted(doc_terms, key=lambda x: x[1], reverse=True)][:words_n]
        word_cloud_per_person[users[doc]] = important_terms
    return word_cloud_per_person
training_data_factory.py (project: sfsf, author: jorisvanzundert)
def delegate_create( self, top, bottom, sample_size=1000, source=sfsf_config.EPUB ):
        top_sellers, bottom_sellers = top, bottom
        if source == sfsf_config.EPUB:
            training_data_top = self.sample_epubs( top_sellers, sample_size )
            training_data_bottom = self.sample_epubs( bottom_sellers, sample_size )
        else:
            training_data_top = self.sample_txts( top_sellers, sample_size )
            training_data_bottom = self.sample_txts( bottom_sellers, sample_size )
        training_samples_top = [ sample for training_data in training_data_top for sample in training_data[1] ]
        training_samples_bottom = [ sample for training_data in training_data_bottom for sample in training_data[1] ]
        isbns = [ training_data[0] for training_data in training_data_top for sample in training_data[1] ] + [ training_data[0] for training_data in training_data_bottom for sample in training_data[1] ]
        y_narr = numpy.array( [1] * len( training_samples_top ) + [0] * len( training_samples_bottom ) )
        vect = TfidfVectorizer( tokenizer = MorePunctuationTokenizer() )
        x_tdm = vect.fit_transform( training_samples_top + training_samples_bottom )
        print( 'Created training data', ':' )
        print( 'x shape', ':', x_tdm.shape )
        print( 'y shape', ':', y_narr.shape )
        # TODO: make a nicer return structure
        return { 'x': x_tdm, 'y': y_narr, 'vectorizer': vect, 'isbns': isbns }
SentiCR.py (project: SentiCR, author: senticr)
def create_model_from_training_data(self):
        training_comments=[]
        training_ratings=[]
        print("Training classifier model..")
        for sentidata in self.training_data:
            comments = preprocess_text(sentidata.text)
            training_comments.append(comments)
            training_ratings.append(sentidata.rating)

        # discard stopwords, apply stemming, and discard words present in less than 3 comments
        self.vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem, sublinear_tf=True, max_df=0.5,
                                     stop_words=mystop_words, min_df=3)
        X_train = self.vectorizer.fit_transform(training_comments).toarray()
        Y_train = np.array(training_ratings)

        #Apply SMOTE to improve ratio of the minority class
        smote_model = SMOTE(ratio=0.5, random_state=None, k=None, k_neighbors=15, m=None, m_neighbors=15, out_step=.0001,
                   kind='regular', svm_estimator=None, n_jobs=1)

        X_resampled, Y_resampled=smote_model.fit_sample(X_train, Y_train)

        model=self.get_classifier()
        model.fit(X_resampled, Y_resampled)

        return model
build.py (project: atap, author: foxbook)
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)
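A brief usage sketch; `TextNormalizer` and `identity` come from the same chapter's code, and the estimator here is only an example.

from sklearn.linear_model import SGDClassifier

# Hypothetical usage of the pipeline factory above.
model = create_pipeline(SGDClassifier(), reduction=False)
# model.fit(documents, labels)   # documents: pre-tokenized texts, labels: target classes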
train_classifier.py (project: data_programming, author: kep1616)
def construct_tf_idf_matrix(data, store=False):

    print ("TF-IDF Normalized Matrix Construction...")

    vectorizer = TfidfVectorizer(stop_words='english')
    print(data)
    training_data = vectorizer.fit_transform(data)

    print ("Done Constructing Matrix")
    print(training_data.toarray())
    if store:
        print ("Pickling Trained Transformer...")
        pickle.dump(vectorizer, open(path_config.TRANSFORMER_PICKLING_FILE, 'wb'))
        print ("Pickling Done.")

    return training_data
random_forest.py (project: MLAB_Intuit, author: rykard95)
def rf_categorize(email):
    # get training corpus
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # vectorize corpus
    labels = [row[0] for row in emails]
    data = [row[1] for row in emails]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    X = X.toarray()

    # vectorize input
    email_vector = vectorizer.transform([email])

    # create random forest and return prediction
    forest = RandomForestClassifier(n_estimators = int(sqrt(len(X[0])))+1)
    forest.fit(X, labels)
    return forest.predict(email_vector)[0]
04_sent.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
03_clean.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
AIserver.py (project: Using-machine-learning-to-detect-malicious-URLs, author: faizann24)
def TL():
    allurls = './data/data.csv' #path to our all urls file
    allurlscsv = pd.read_csv(allurls,',',error_bad_lines=False) #reading file
    allurlsdata = pd.DataFrame(allurlscsv)  #converting to a dataframe

    allurlsdata = np.array(allurlsdata) #converting it into an array
    random.shuffle(allurlsdata) #shuffling

    y = [d[1] for d in allurlsdata] #all labels 
    corpus = [d[0] for d in allurlsdata]    #all urls corresponding to a label (either good or bad)
    vectorizer = TfidfVectorizer(tokenizer=getTokens)   #get a vector for each url but use our customized tokenizer
    X = vectorizer.fit_transform(corpus)    #get the X vector

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   #split into training and testing set 80/20 ratio

    lgs = LogisticRegression()  #using logistic regression
    lgs.fit(X_train, y_train)
    print(lgs.score(X_test, y_test))    # print the score; it comes out to about 98%
    return vectorizer, lgs
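A hypothetical prediction step using the objects returned by TL(); the candidate URLs below are placeholders, and the data file is assumed to be in place.

# Hypothetical scoring of unseen URLs with the fitted vectorizer and model.
vectorizer, lgs = TL()
candidates = ['wikipedia.org/wiki/Machine_learning', 'paypal-login.example/verify.php']
X_candidates = vectorizer.transform(candidates)
print(lgs.predict(X_candidates))  # labels such as 'good' / 'bad', per the training data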
tools.py (project: document_classification, author: scotthlee)
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
        #choosing the particular flavor of vectorizer
        if method == 'counts':
            vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
        elif method == 'tfidf':
            vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')

        #fitting the vectorizer and converting the counts to an array
        full_fit = vectorizer.fit_transform(df[x_name])
        full_counts = full_fit.toarray()
        self.vocabulary_ = vectorizer.vocabulary_

        #passing the attributes up to the class instance
        self.data = df
        if sparse:
            full_counts = csr_matrix(full_counts)
        self.X = full_counts
        if y_name != None:
            self.y = np.array(df[y_name])
        return

    #splits the data into training and test sets; either called from process()
    #or on its own when your text is already vectorized and divided into x and y
textfeatures.py (project: spice-hate_speech_detection, author: futurice)
def bag_of_words(messages, model=None, weighting=''):

    # TODO: Add stemmming or baseform here
    messages, stemmings2baseform =  texttools.stemming_messages_snowball(messages)

    # Create new model for extrating text features if None is given
    if model is None:
        if weighting == 'tfidf':
            model = TfidfVectorizer()
        else:
            model = CountVectorizer()
        model.fit(messages)

    # Extract features
    x = model.transform(messages)

    return x

