Python CountVectorizer() class: example source code

File: scikitre.py (project: IBRel, author: lasigeBioTM)
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                                  #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                                  #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.NuSVC(nu=0.01 ))
                                   #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                                  ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
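
A minimal, self-contained sketch of the same pipeline idea, using a made-up toy corpus and omitting the IBRel-specific pieces (corpus handling, word2vec clusters, generate_data): character n-grams built inside word boundaries feeding a MultinomialNB classifier.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# toy candidate-pair contexts with binary relation labels (made-up data)
texts = ["proteinA interacts with proteinB",
         "geneX is located near geneY",
         "proteinC binds proteinD",
         "the weather was sunny today"]
labels = [True, False, True, False]

# char_wb n-grams are generated only inside word boundaries
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), max_df=0.7)),
    ('clf', MultinomialNB(alpha=0.01, fit_prior=False)),
])
text_clf.fit(texts, labels)
print(text_clf.predict(["geneZ binds proteinE"]))
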
File: tfidf_feature.py (project: TextClassification, author: mosu027)
def tfidf_feature(xtrain, xtest, stopwords_path):
    """
    tf-idf feature
    """
    xtrain = [" ".join(word) for word in xtrain]
    xtest = [" ".join(word) for word in xtest]
    stopwords = codecs.open(stopwords_path, 'r', encoding='utf-8').readlines()
    stopwords = [word.strip("\n") for word in stopwords]
    vectorizer_train = CountVectorizer(analyzer='word', stop_words=stopwords, min_df=5)
    count_train = vectorizer_train.fit_transform(xtrain)
    vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
    count_test = vectorizer_test.fit_transform(xtest)

    transformer = TfidfTransformer()
    tfidf_train = transformer.fit(count_train).transform(count_train)
    tfidf_test = transformer.fit(count_test).transform(count_test)

    return tfidf_train.toarray(), tfidf_test.toarray()
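
The key pattern here is reusing the training vocabulary for the test split via CountVectorizer(vocabulary=...). Note that the excerpt refits the TfidfTransformer on the test counts, so train and test end up with different IDF weights; the hedged sketch below, on a toy corpus without a stop-word file, keeps the IDF fitted on the training split for both.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

xtrain = ["the cat sat on the mat", "dogs chase cats", "the mat is red"]
xtest = ["the dog sat on a mat"]

vectorizer_train = CountVectorizer(analyzer='word')
count_train = vectorizer_train.fit_transform(xtrain)
# reuse the training vocabulary so that test columns line up with train columns
vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
count_test = vectorizer_test.transform(xtest)

transformer = TfidfTransformer()
tfidf_train = transformer.fit_transform(count_train)
tfidf_test = transformer.transform(count_test)  # keeps the IDF weights fitted on the training split
print(tfidf_train.shape, tfidf_test.shape)
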
File: nGramClassifier.py (project: flexmatcher, author: biggorilla-gh)
def __init__(self, ngram_range=(1, 1), analyzer='word', count=True,
                 n_features=200):
        """Initializes the classifier.

        Args:
            ngram_range (tuple): Pair of ints specifying the range of ngrams.
            analyzer (string): Determines which type of analyzer is used.
                Setting it to 'word' treats each word as a unit of language,
                while 'char' treats each character as a unit of language.
            count (boolean): Determines whether features are counts of
                n-grams or binary values encoding whether the n-gram is
                present.
            n_features (int): Maximum number of features used.
        """
        # checking what type of vectorizer to create
        if count:
            self.vectorizer = CountVectorizer(analyzer=analyzer,
                                              ngram_range=ngram_range,
                                              max_features=n_features)
        else:
            self.vectorizer = HashingVectorizer(analyzer=analyzer,
                                                ngram_range=ngram_range,
                                                n_features=n_features)
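
A short sketch, on made-up documents, of how the two branches behave: CountVectorizer learns a vocabulary and caps it with max_features, while HashingVectorizer is stateless and hashes n-grams into a fixed number of buckets (n_features).

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

docs = ["flexible schema matching", "matching columns with ngrams"]

count_vec = CountVectorizer(analyzer='word', ngram_range=(1, 2), max_features=200)
X_count = count_vec.fit_transform(docs)      # counts; the fitted vocabulary_ is stored

hash_vec = HashingVectorizer(analyzer='word', ngram_range=(1, 2), n_features=200)
X_hash = hash_vec.transform(docs)            # stateless: tokens are hashed into 200 buckets
print(X_count.shape, X_hash.shape)
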
File: evaluate_nbsvm.py (project: million-post-corpus, author: OFAI)
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    fe = CountVectorizer(
        preprocessor=normalize,
        tokenizer=micro_tokenize,
        binary=True,
    )
    predictor = NBSVM_predictor(
        kernel=conf.SVM_KERNEL,
        class_weight=conf.SVM_CLWEIGHT,
        C=conf.SVM_C,
    )
    fe.fit(txt_train)
    X = fe.transform(txt_train)
    predictor.fit(X, y_train)
    X_test = fe.transform(txt_test)
    y_pred = predictor.predict(X_test)

    return y_pred
File: evaluate_bow.py (project: million-post-corpus, author: OFAI)
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    fe = CountVectorizer(
        preprocessor=normalize,
        tokenizer=micro_tokenize,
        binary=True,
    )
    predictor = SVC(
        kernel=conf.SVM_KERNEL,
        class_weight=conf.SVM_CLWEIGHT,
        C=conf.SVM_C,
        random_state=conf.SEED,
    )
    fe.fit(txt_train)
    X = fe.transform(txt_train)
    predictor.fit(X, y_train)
    X_test = fe.transform(txt_test)
    y_pred = predictor.predict(X_test)

    return y_pred
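
Both evaluate functions follow the same fit-on-train / transform-on-test pattern. Below is a self-contained sketch with stand-ins for the project's normalize, micro_tokenize and conf settings; the kernel, class weight and C values are illustrative, not the corpus' real configuration.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

def normalize(text):        # stand-in for the project's preprocessor
    return text.lower()

def micro_tokenize(text):   # stand-in for the project's tokenizer
    return text.split()

txt_train = ["great article", "total nonsense", "very informative piece", "pure spam"]
y_train = [1, 0, 1, 0]
txt_test = ["informative article"]

fe = CountVectorizer(preprocessor=normalize, tokenizer=micro_tokenize, binary=True)
X = fe.fit_transform(txt_train)
predictor = SVC(kernel='linear', class_weight='balanced', C=1.0, random_state=42)
predictor.fit(X, y_train)
print(predictor.predict(fe.transform(txt_test)))
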
File: softtfidf.py (project: LLString, author: mitll)
def compute_VwS(self,s):
        """ Compute V(w,S) as defined by Cohen et al.'s IJCAI03 paper """
        # Get term-frequency vectors and vocab list for string
        cv = CountVectorizer(min_df = 0.0, token_pattern=u'(?u)\\b\\w+\\b')
        tf = cv.fit_transform([s]); tf = tf.tocsr()
        vocab = cv.vocabulary_

        # Compute V(w,S) for string
        vprime_ws = dict()
        vprime_ws_norm = 0
        for w in vocab:
            if w in self.CORPUS_VOCAB:
                vprime_ws[w] = math.log(tf[0,vocab[w]]+1)*self.LOG_IDF[self.CORPUS_VOCAB[w]]
            else:
                vprime_ws[w] = math.log(tf[0,vocab[w]]+1)*self.OOV_IDF_VAL  # if not in vocab, default to OOV_IDF_VAL
            vprime_ws_norm += vprime_ws[w]**2
        vprime_ws_norm = math.sqrt(vprime_ws_norm)

        return (vocab,vprime_ws,vprime_ws_norm)
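
A sketch of the same V'(w, S) computation outside the class, with made-up stand-ins for the instance attributes CORPUS_VOCAB, LOG_IDF and OOV_IDF_VAL (the values are purely illustrative).

import math
from sklearn.feature_extraction.text import CountVectorizer

CORPUS_VOCAB = {'soft': 0, 'string': 1}   # word -> index into LOG_IDF
LOG_IDF = [1.5, 0.7]                      # illustrative log-IDF values
OOV_IDF_VAL = 2.0                         # fallback IDF for out-of-vocabulary words

s = "soft tfidf string string"
cv = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
tf = cv.fit_transform([s]).tocsr()
vocab = cv.vocabulary_

vprime_ws = {}
for w in vocab:
    idf = LOG_IDF[CORPUS_VOCAB[w]] if w in CORPUS_VOCAB else OOV_IDF_VAL
    vprime_ws[w] = math.log(tf[0, vocab[w]] + 1) * idf
vprime_ws_norm = math.sqrt(sum(v ** 2 for v in vprime_ws.values()))
print(vocab, vprime_ws, vprime_ws_norm)
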
File: bow_to_npy.py (project: SpectralLDA-MXNet, author: Mega-DatA-Lab)
def bow_to_npy(vocabulary_fname, bow_fname, npy_fname):
    ''' Vectorize bag-of-words dump and save in NumPy file

    PARAMETERS
    -----------
    vocabulary_fname: str or Path
        Vocabulary text file name, with one word on each line.
    bow_fname: str or Path
        Bag-of-words .txt.gz file name. When uncompressed,
        each line represents a document with only lower-case words
        separated by space.
    npy_fname: str or Path
        NumPy .npy file name to write the word count vectors into.
    '''
    with Path(vocabulary_fname).open('r') as vocabulary_file:
        vocabulary = [line.strip() for line in vocabulary_file]

    vectorizer = CountVectorizer(vocabulary=vocabulary)
    with gzip.open(bow_fname, 'rt') as bow_file:
        word_counts = vectorizer.transform(bow_file)

    np.save(npy_fname, word_counts)
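
Two details worth noting: a CountVectorizer built with a fixed vocabulary can call transform() directly without being fitted, and its input may be any iterable of strings, here the open gzip file handle. A file-free sketch with a made-up vocabulary:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vocabulary = ['cat', 'dog', 'mat']          # illustrative fixed vocabulary
docs = ["the cat sat on the mat", "dog dog dog"]

vectorizer = CountVectorizer(vocabulary=vocabulary)
word_counts = vectorizer.transform(docs)    # works without fitting: the vocabulary is fixed
print(word_counts.toarray())                # [[1 0 1] [0 3 0]]
# np.save pickles the sparse matrix inside a 0-d object array;
# scipy.sparse.save_npz is the more usual choice for sparse count matrices
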
File: test_concept_count_vectorizer.py (project: Quadflor, author: quadflor)
def test_read_files(self):
        docs = ['Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET', 'consectetur adipisici elit']
        thesaurus = {'13542-1': {'prefLabel': ['ipsum'], 'broader': ['0b'], 'related': ['0r'],
                                 'narrower': ['0n'], 'altLabel': []},
                     '13542-4': {'prefLabel': ['dolor'], 'broader': ['1b'], 'related': ['1r'],
                                 'narrower': ['1n'], 'altLabel': ['amet']},
                     }
        vocabulary = {'13542-1': 1, '13542-4': 0}
        fnames = []
        for doc in docs:
            file = NamedTemporaryFile(mode='w', delete=False)
            fnames.append(file.name)
            print(doc, file=file)
        cf = ConceptAnalyzer(thesaurus, input='filename')
        counter = CountVectorizer(analyzer=cf.analyze, vocabulary=vocabulary, input='filename')
        res = counter.fit_transform(fnames).todense()
        np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
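
The interesting part is that analyzer accepts any callable, so each document can be mapped to an arbitrary token sequence, here thesaurus concept IDs. A simplified sketch of the same mechanism with in-memory strings instead of temporary files (ConceptAnalyzer belongs to the Quadflor project and is replaced by a trivial stand-in):

from sklearn.feature_extraction.text import CountVectorizer

concept_map = {'ipsum': '13542-1', 'dolor': '13542-4', 'amet': '13542-4'}

def concept_analyzer(doc):
    # map each known word to its concept id; unknown words yield no token
    return [concept_map[w] for w in doc.lower().split() if w in concept_map]

vocabulary = {'13542-1': 1, '13542-4': 0}
counter = CountVectorizer(analyzer=concept_analyzer, vocabulary=vocabulary)
docs = ['Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET', 'consectetur adipisici elit']
print(counter.fit_transform(docs).todense())   # [[0 1] [2 1] [0 0]]
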
File: similarity.py (project: learn-to-select-data, author: sebastianruder)
def get_topic_distributions(examples, vectorizer, lda_model):
    """
    Retrieve the topic distributions of a collection of documents.
    :param examples: a list of tokenised documents
    :param vectorizer: the CountVectorizer used for transforming the documents
    :param lda_model: the trained LDA model
    :return: an array of shape (num_examples, num_topics) containing the topic
             distribution of each example
    """
    vectorized_corpus = vectorizer.transform(examples)
    gensim_corpus = gensim.matutils.Sparse2Corpus(vectorized_corpus,
                                                  documents_columns=False)
    topic_representations = []
    for doc in gensim_corpus:
        topic_representations.append(
            [topic_prob for (topic_id, topic_prob) in
             lda_model.get_document_topics(doc, minimum_probability=0.)])
    return np.array(topic_representations)
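
A sketch of how the vectorizer and lda_model passed into this helper could be built, assuming gensim and a toy corpus (the documents and num_topics are illustrative):

import gensim
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

examples = ["the cat sat on the mat", "dogs chase cats all day",
            "stock markets fell sharply today"]

vectorizer = CountVectorizer()
vectorized_corpus = vectorizer.fit_transform(examples)

# scikit-learn produces documents as rows, hence documents_columns=False
gensim_corpus = gensim.matutils.Sparse2Corpus(vectorized_corpus, documents_columns=False)
id2word = {idx: word for word, idx in vectorizer.vocabulary_.items()}
lda_model = gensim.models.LdaModel(gensim_corpus, num_topics=2, id2word=id2word)

topic_representations = [
    [prob for (_, prob) in lda_model.get_document_topics(doc, minimum_probability=0.)]
    for doc in gensim_corpus]
print(np.array(topic_representations))   # shape (num_examples, num_topics)
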


# PRE-TRAINED WORD EMBEDDINGS METHODS
File: PreprocessManager.py (project: GitHub-Recommender, author: himangshunits)
def get_word_counts(input_str, limit = 100):
        input_str = PreprocessManager.remove_non_ascii(input_str)
        wordnet_lemmatizer = WordNetLemmatizer()
        snowball_stemmer = EnglishStemmer()
        tokenized_text = CountVectorizer().build_tokenizer()(input_str.lower())
        tokenized_text = [word for word in tokenized_text if len(word) > 1]  # Filter some small words
        #tokenized_text = [word for word in tokenized_text if not word.isnumeric()]
        filtered_words = [word for word in tokenized_text if word not in stopwords.words('english')]
        stemmed_list = [wordnet_lemmatizer.lemmatize(w) for w in filtered_words]
        # Calculate frequency distribution
        frequency_dist = nltk.FreqDist(stemmed_list)

        # Output the most frequent words, up to limit (default 100)
        result = dict()
        for word, frequency in frequency_dist.most_common(limit):
            # print(u'{};{}'.format(word, frequency))
            result[word] = frequency
        return result
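
The line worth remembering is CountVectorizer().build_tokenizer(), which exposes scikit-learn's default regexp tokenizer as a plain callable:

from sklearn.feature_extraction.text import CountVectorizer

# build_tokenizer() returns the default regexp tokenizer so it can be reused
# outside of any vectorization step
tokenize = CountVectorizer().build_tokenizer()
print(tokenize("GitHub recommends Python-based projects!".lower()))
# ['github', 'recommends', 'python', 'based', 'projects']
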



    # This function just splits the words and gives the words that's all!
File: category.py (project: feature_engineering, author: webeng)
def getModels(self):
        with open(self.data_path + '/categories.pkl', 'rb') as f:
            categories = cPickle.load(f)

        with open(self.data_path + '/category_map.pkl', 'rb') as f:
            category_map = cPickle.load(f)

        with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f:
            clf = cPickle.load(f)

        count_vect = CountVectorizer()
        with open(self.data_path + '/count_vect.pkl', 'rb') as f:
            count_vect = cPickle.load(f)

        tfidf_transformer = TfidfTransformer()
        with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f:
            tfidf_transformer = cPickle.load(f)

        with open(self.data_path + '/tree.pkl', 'rb') as f:
            tree = cPickle.load(f)

        return categories, category_map, clf, count_vect, tfidf_transformer, tree
File: centroid_w2v.py (project: text-summarizer, author: gaetangate)
def get_topic_idf(self, sentences):
        vectorizer = CountVectorizer()
        sent_word_matrix = vectorizer.fit_transform(sentences)

        transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
        tfidf = transformer.fit_transform(sent_word_matrix)
        tfidf = tfidf.toarray()

        centroid_vector = tfidf.sum(0)
        centroid_vector = np.divide(centroid_vector, centroid_vector.max())
        # print(centroid_vector.max())

        feature_names = vectorizer.get_feature_names()
        word_list = []
        for i in range(centroid_vector.shape[0]):
            if centroid_vector[i] > self.topic_threshold:
                # print(feature_names[i], centroid_vector[i])
                word_list.append(feature_names[i])

        return word_list
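
A condensed sketch of the centroid computation on a toy corpus; note that get_feature_names() was removed in scikit-learn 1.2, so the sketch uses get_feature_names_out() instead (the threshold value is made up):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

sentences = ["the cat sat on the mat", "the cat chased the dog", "stocks fell sharply"]
topic_threshold = 0.3   # illustrative value

vectorizer = CountVectorizer()
sent_word_matrix = vectorizer.fit_transform(sentences)
transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
tfidf = transformer.fit_transform(sent_word_matrix).toarray()

centroid_vector = tfidf.sum(0)
centroid_vector = np.divide(centroid_vector, centroid_vector.max())

feature_names = vectorizer.get_feature_names_out()
print([feature_names[i] for i in range(centroid_vector.shape[0])
       if centroid_vector[i] > topic_threshold])
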
File: nlp_utils.py (project: search_relevance, author: rmanak)
def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))



########## Stemmer + CountVectorizer wrapper #############
File: nlp_utils.py (project: search_relevance, author: rmanak)
def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
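
Both nlp_utils.py snippets rely on the same subclassing trick: override build_analyzer() and wrap the parent analyzer in a stemming generator. A self-contained sketch assuming NLTK's SnowballStemmer stands in for the excerpt's stemmer object:

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

stemmer = SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

vect = StemmedCountVectorizer()
vect.fit_transform(["running runs ran", "the runner keeps running"])
print(sorted(vect.vocabulary_))   # stemmed vocabulary: ['keep', 'ran', 'run', 'runner', 'the']
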


########## Defaults TF-IDF & Count Vectorizers ########


#======== TF-IDF Vectorizer =========#
File: nb_classification.py (project: linkedin_recommend, author: duggalr2)
def train_test():
    """Identify accuracy via training set"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)  # creates vocab set and dtm for each raw document!
    X_test_dtm = vect.transform(X_test)

    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)  # make class predictions for X_test_dtm
    # w = list(X_test)
    return metrics.accuracy_score(y_test, y_pred_class)

# print(train_test())
File: classifier.py (project: geocoder-ie, author: devgateway)
def __init__(self):
        self.clf = LinearSVC()
        self.scores = []
        self.vectorizer = CountVectorizer(token_pattern=r'[A-z]+',  stop_words=english_stops,
                                          ngram_range=(1, 1))
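
One gotcha in this excerpt: the character class [A-z] also matches the ASCII characters that sit between 'Z' and 'a' ([, \, ], ^, _ and the backtick), so it is not equivalent to [A-Za-z]. A short demonstration (english_stops from the project is omitted):

import re
from sklearn.feature_extraction.text import CountVectorizer

text = "geo_coder test [brackets]"
print(re.findall(r'[A-z]+', text))      # ['geo_coder', 'test', '[brackets]']
print(re.findall(r'[A-Za-z]+', text))   # ['geo', 'coder', 'test', 'brackets']

vectorizer = CountVectorizer(token_pattern=r'[A-Za-z]+', ngram_range=(1, 1))
print(vectorizer.fit_transform([text]).toarray())
print(vectorizer.get_feature_names_out())
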
File: abb1t.py (project: Abb1t, author: k-freeman)
def create_speech(self):
        self.speech = dict.fromkeys(self.archives,[]) 
        #blacklist=[] # ids to be ignored, not implemented yet
        self.vectorizer = dict.fromkeys(self.archives,[])
        self.mat = dict.fromkeys(self.archives,[])
        for key in self.speech:
            self.speech[key]=[[],[]] # messages / ids / (maybe timestamps?)
            self.vectorizer[key]=CountVectorizer(min_df=1)
            if key >=0:
                continue # why create dictionaries for private messages right now...
            logfile="{}.gz".format(os.path.join(self.logpath,str(key)))
            try:
                ziplines=gzip.open(logfile).read().decode("utf-8").strip("\r\n").split("\n")[-15000:]
            except IOError:
                print("{} not found".format(logfile))
                continue
            prev_id = -1
            for msg_line in ziplines:
                msg = Msg(json.loads(msg_line))
                text=msg.get_text()
                chat_id=msg.get_chat_id()
                if (key != chat_id):
                    input("Error in your logfile (key {} / chat {})!".format(key,chat_id))
                sent_id=msg.get_sent_id()
                if text and text[0] not in ["/","!"]  and msg.get_edit_date()==0 and not self.is_blacklisted(text) and (not self.find_name(text)) and chat_id and sent_id: #sadly, @like will come through
                    if sent_id == prev_id:
                        self.speech[key][0][-1]+="\n{}".format(text)
                    else:
                        self.speech[key][0].append(text)
                        self.speech[key][1].append(sent_id)
                    prev_id = sent_id
            if self.speech[key][0]:
                self.mat[key]=self.vectorizer[key].fit_transform(self.speech[key][0])
File: lda_utils.py (project: LDA-REST, author: valentinarho)
def compute_tf(data, stopwords_list, language, use_lemmer=True, min_df=2, max_df=0.8):
    """
    Compute the tf matrix for the provided data
    :param language: 'en' or 'it'
    :param data:
    :param stopwords_list:
    :param use_lemmer:
    :param min_df:
    :param max_df:
    :return:
    """
    lemmer_tokenizer = None

    if use_lemmer:
        if language == 'it':
            lemmer_tokenizer = LemNormalizeIt
        else:
            lemmer_tokenizer = LemNormalize

    min_df = min_df if len(data) > min_df else 1
    max_df = max_df if max_df * len(data) >= min_df else 1.0

    # tf
    tf_vectorizer = CountVectorizer(tokenizer=lemmer_tokenizer,
                                    max_df=max_df, min_df=min_df,
                                    max_features=None,
                                    stop_words=stopwords_list,
                                    token_pattern="[a-zA-Z]{3,}")

    try:
        tf = tf_vectorizer.fit_transform(data)
        tf_features_names = tf_vectorizer.get_feature_names()
    except Exception:
        logging.warning('The computed tf matrix is empty. Check stopwords.')
        tf = []
        tf_features_names = []

    return tf, tf_features_names
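
A usage sketch without lemmatization (LemNormalize and LemNormalizeIt belong to the LDA-REST project and are not reproduced); the min_df/max_df clamping above simply keeps the two thresholds consistent on very small corpora, as mirrored here:

from sklearn.feature_extraction.text import CountVectorizer

data = ["natural language processing with python",
        "topic models for short documents",
        "python topics and language models"]
min_df, max_df = 2, 0.8

# same clamping as above: keep the two thresholds meaningful on tiny corpora
min_df = min_df if len(data) > min_df else 1
max_df = max_df if max_df * len(data) >= min_df else 1.0

tf_vectorizer = CountVectorizer(stop_words=['and', 'for', 'with'],
                                min_df=min_df, max_df=max_df,
                                token_pattern="[a-zA-Z]{3,}")
tf = tf_vectorizer.fit_transform(data)
print(tf.shape, list(tf_vectorizer.get_feature_names_out()))
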
File: textprocess.py (project: scik-learn-learn-Chinese-text-classider, author: chapzq77)
def voc_count_bag(self):
        if (self.wordbag_path == "" or self.vocabulary_count_bag_name == "" or self.stopword_path == ""):
            print("wordbag_path, vocabulary_count_bag_name and stopword_path can not be empty.")
            return
        file_obj = open(self.wordbag_path + self.trainset_name, 'rb')
        self.data_set = pickle.load(file_obj)
        file_obj.close()
        # copy the training-set metadata into vocabulary_count_bag
        self.vocabulary_count_bag.target_name = self.data_set.target_name
        self.vocabulary_count_bag.label = self.data_set.label
        self.vocabulary_count_bag.filenames = self.data_set.filenames
        corpus = self.data_set.content
        stopword_list = self.getstopword(self.stopword_path)
        # build the term-count matrix, dropping stop words and capping the vocabulary size
        vectorizer = CountVectorizer(stop_words=stopword_list, max_df=500, min_df=1, max_features=10000)
        y = vectorizer.fit_transform(corpus)
        self.vocabulary_count_bag.vcm = y
        self.vocabulary_count_bag.vcm_sum = y.toarray().sum(axis=0)
        self.vocabulary_count_bag.vocabulary = vectorizer.get_feature_names()
        if not os.path.exists(self.wordbag_path):
            os.makedirs(self.wordbag_path)
        file_obj1 = open(self.wordbag_path + self.vocabulary_count_bag_name, 'wb')
        pickle.dump(self.vocabulary_count_bag, file_obj1)
        file_obj1.close()
        print("The vocabulary count bag has been saved to wordbag_path under the file name vocabulary_count_bag_name.")
        print("#######################################")

File: text_classifier.py (project: textar, author: datosgobar)
def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
        """Definido en la declaracion de la clase.

        Attributes:
            texts (list of str): Textos a clasificar.
            ids (list of str): Identificadores únicos para cada texto (debe
                tener la misma longitud que `texts`).
            vocabulary (list): Opcional. Vocabulario a tener en cuenta para la
                vectorización de los textos. Default: usa todas las palabras
                presentes en los textos, salvo los ES_stopwords.txt.
            encoding (str): Codificación de los textos en `texts` y en `ids`.
        """
        this_dir, this_filename = os.path.split(__file__)
        es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
                                   header=None, encoding='utf-8')
        es_stopwords = list(np.squeeze(es_stopwords.values))
        self._check_id_length(ids)
        self.vectorizer = CountVectorizer(
            input='content', encoding=encoding, decode_error='strict',
            strip_accents='ascii', lowercase=True, preprocessor=None,
            tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
            analyzer='word', max_df=0.8, min_df=1, max_features=None,
            vocabulary=vocabulary, binary=False)

        self.transformer = TfidfTransformer()
        self.ids = None  # Keeps an ordered list of text ids.
        self.term_mat = None  # Matrix of term counts per text.
        self.tfidf_mat = None  # Matrix of term relevance (tf-idf).
        self.reload_texts(texts, ids)

