def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
"""
Define a binary CountVectorizer (Feature Presence) using n-grams and min and max document frequency
:param ngram_range: n-grams are created for all numbers within this range
:param min_df: min document frequency of features
:param max_df: max document frequency of features
:return:
"""
if self.is_weight == 'FP':#Feature Presence
vectorizer = CountVectorizer(ngram_range=ngram_range,
tokenizer=self.tokenize,
min_df=min_df,
max_df=max_df,
binary=True,
stop_words='english')
if self.is_weight == 'TF-IDF':#TF-IDF
vectorizer = TfidfVectorizer(ngram_range=ngram_range,
tokenizer=self.tokenize,
min_df=min_df,
max_df=max_df,
binary=True,
stop_words='english')
return vectorizer
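A minimal usage sketch, assuming a small host object that supplies the is_weight flag and tokenize method the snippet expects (the FeatureBuilder class below is hypothetical):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class FeatureBuilder:
    def __init__(self, is_weight='FP'):
        self.is_weight = is_weight  # 'FP' or 'TF-IDF'

    def tokenize(self, text):
        # stand-in tokenizer; the original class supplies its own
        return text.lower().split()

    get_vectorizer = get_vectorizer  # reuse the function above as a method

docs = ["a good movie", "a bad movie", "good good plot"]
vec = FeatureBuilder(is_weight='FP').get_vectorizer(min_df=1)
print(vec.fit_transform(docs).shape)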
Python CountVectorizer() example source code
def getTFV(token_pattern = token_pattern,
norm = tfidf__norm,
max_df = tfidf__max_df,
min_df = tfidf__min_df,
ngram_range = (1, 1),
vocabulary = None,
stop_words = 'english'):
tfv =TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=None,
strip_accents='unicode', analyzer='word',
token_pattern=token_pattern,
ngram_range=ngram_range, use_idf=True,
smooth_idf=True, sublinear_tf=True,
stop_words = stop_words, norm=norm, vocabulary=vocabulary)
return tfv
#========= CountVectorizer =========#
Source file: papyrus_summary_extraction_tool.py
Project: Papyrus--simple-but-effective-text-summarization-tool
Author: RebeccaMerrett
def function_2(text):
paragraphs = text.split('\n\n')
count_vect = CountVectorizer()
bow_matrix = count_vect.fit_transform(paragraphs)
normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
similarity_graph = normalized_matrix * normalized_matrix.T #term frequency/inverse doc frequency applied
nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
scores = nx.pagerank(nx_graph) #TextRank applied
ranked = sorted(((scores[i],s) for i,s in enumerate(paragraphs)), reverse=True) #Sorts all paragraphs from highest to lowest scores
ten_percent = int(round(10.00/100.00 * len(ranked)))
ten_percent_high_scores = ranked[0:ten_percent]
summary = [x[1] for x in ten_percent_high_scores] #Takes top 10%, so the paragraphs with the highest scores (does not disturb the rank order)
return "\n\n".join(summary)
#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
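A quick way to exercise the summarizer above (a sketch; it assumes the snippet's own imports, i.e. networkx as nx plus CountVectorizer and TfidfTransformer, and an older networkx, since from_scipy_sparse_matrix was removed in networkx 3.0):

paragraph = "Paragraph %d discusses topic %s in a few short sentences."
sample_text = "\n\n".join(paragraph % (i, "AB"[i % 2]) for i in range(20))
print(function_2(sample_text))  # roughly the top 10% of paragraphs (2 of 20 here)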
def getBOW(token_pattern = token_pattern,
max_df = bow__max_df,
min_df = bow__min_df,
ngram_range = (1, 1),
vocabulary = None,
stop_words = 'english'):
bow =CountVectorizer(min_df=min_df, max_df=max_df, max_features=None,
strip_accents='unicode', analyzer='word',
token_pattern=token_pattern,
ngram_range=ngram_range,
stop_words = stop_words, vocabulary=vocabulary)
return bow
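Both factories read module-level defaults (token_pattern, tfidf__norm, tfidf__max_df, tfidf__min_df, bow__max_df, bow__min_df) defined elsewhere in the original file. A sketch that sidesteps those globals by passing everything explicitly (the values here are illustrative assumptions):

docs = ["the quick brown fox", "the lazy dog", "quick quick fox"]
pattern = r"(?u)\b\w\w+\b"  # scikit-learn's own default token pattern
tfv = getTFV(token_pattern=pattern, norm='l2', max_df=1.0, min_df=1)
bow = getBOW(token_pattern=pattern, max_df=1.0, min_df=1)
print(tfv.fit_transform(docs).shape, bow.fit_transform(docs).shape)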
########################################################
# ------------------------------
# Simple text cleaning using
#
# -replacement dict
#
# or
#
# -WordReplacer object
#--------------------------------
def predict_job(job_list):
"""Assign a classification to a url"""
# TODO: Add case where len is 1 or 0....
job_list = [job for j in job_list for job in j]
new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
new_job_list = [' '.join(job) for job in new_job_list]
vect = CountVectorizer()
x_series = pd.Series(X)  # X and y are module-level training data defined elsewhere in the source file
X_train_dtm = vect.fit_transform(x_series)
y_train = pd.Series(y)
job_list_series = pd.Series(new_job_list)
job_list_dtm = vect.transform(job_list_series)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred = nb.predict(job_list_dtm)
# for i in range(len(job_list)):
# print(job_list[i], y_pred[i])
return y_pred
# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
def run(self):
all_file_names = []
all_labels = []
for n, folder_name in enumerate(os.listdir(self.in_txtdir().path)):
full_folder_name = self.in_txtdir().path+'/'+folder_name
if os.path.isfile(full_folder_name):
continue
for file_name in os.listdir(full_folder_name):
all_labels.append(n)
all_file_names.append(full_folder_name+'/'+file_name)
vectorizer = CountVectorizer(input='filename')
vector = vectorizer.fit_transform(all_file_names)
numpy.save(self.out_npy().path,vector)
numpy.save('labels',numpy.array(all_labels)) #Where and how do we want to save this?
#This is just to test the tasks above
def gen_lstm_status(screen_name, timeline, short_url, depth):
# Create a vector of words and their frequency in on the user's timeline.
# Experimentation shows that requiring a word to appear in at least 4 * depth
# posts (min_df counts documents, not total occurrences) gives good results.
with open("stopwords.txt", 'r') as stopwords_file:
stopwords = [line.strip() for line in stopwords_file]
processed_timeline_text = [preprocess_post(post) for post in timeline]
vectorizer = CountVectorizer(min_df=4*depth, stop_words=stopwords)
X = vectorizer.fit_transform(processed_timeline_text)
vocab = vectorizer.get_feature_names()
topic = random.choice(vocab)
# Generates a status using a helper bash script.
proc = subprocess.Popen([NN_SAMPLE_COMMAND, topic], stdout=subprocess.PIPE)
status = topic + " " + proc.stdout.read().split("\n")[-2].strip()
return "@" + screen_name + " " + status + " " + short_url
def count_features(self,X,verbose=False):
'''
For each sample in X, count how often each feature in self.columns appears
on the decision path of every tree in self.estimators_.
X: a DataFrame whose columns include self.columns.
Returns a list of DataFrames (one per estimator), indexed like X, with
self.columns as columns.
'''
result=[]
for i,estimator in enumerate(self.estimators_):
tmp=pd.Series(estimator.apply(X[self.columns]))
tmp.index=X.index
tmp=tmp.map(lambda xx: ' '.join([yy[0] for yy in self.paths[i][xx]]))
vect=CountVectorizer(vocabulary=self.columns,lowercase=False)
tmp=vect.transform(tmp).toarray()
tmp=pd.DataFrame(tmp)
vocabulary_inverse={vect.vocabulary_[key]:key for key in vect.vocabulary_}
tmp.columns=[vocabulary_inverse[k] for k in range(tmp.shape[1])]
tmp.index=X.index
tmp.index.name=X.index.name
tmp=tmp.fillna(0)
result.append(tmp.copy())
if verbose:
print('Done:',i)
return result
def textToTokens(text):
"""Converts input string to a corpus of tokenized sentences.
Assumes that the sentences are divided by newlines (but will ignore empty sentences).
You can use this to try out your own datasets, but it is not needed for reading the homework data.
"""
corpus = []
sents = text.split("\n")
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit(sents)
tokenizer = count_vect.build_tokenizer()
for s in sents:
toks = tokenizer(s)
if len(toks) > 0:
corpus.append(toks)
return corpus
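For example:

text = "The quick brown fox.\nJumps over the lazy dog.\n\n"
print(textToTokens(text))
# [['The', 'quick', 'brown', 'fox'], ['Jumps', 'over', 'the', 'lazy', 'dog']]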
def _vectorize_documents(self,method='tfidf',max_features=100):
stop_words = []
try:
for lexicon_id in self.params['cluster_lexicons']:
lexicon = Lexicon.objects.get(id=int(lexicon_id))
words = Word.objects.filter(lexicon=lexicon)
stop_words+=[word.wrd for word in words]
except KeyError:
    pass
if method == 'count':
vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
if method == 'tfidf':
vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
document_vectors = vectorizer.fit_transform(self.documents)
document_vectors = document_vectors.toarray()
return document_vectors,vectorizer.get_feature_names()
def __init__(self, match_fn=TermMatch, binary=True, dtype=np.bool_,
**cv_params):
"""initializes a Matching object
:match_fn: A matching function of signature `docs, query`
-> indices of matching docs
:binary: Store only binary term occurrences.
:dtype: Data type of internal feature matrix
:cv_params: Parameter for the count vectorizer such as lowercase=True
"""
# RetrievalBase.__init__(self)
self._match_fn = match_fn
self._vect = CountVectorizer(binary=binary, dtype=dtype,
**cv_params)
def is_embedded(sentence, embedding, analyzer):
"""
>>> embedding = ["a", "b", "c"]
>>> queries = ["a b c", "a", "b", "c", "a b c d", "d", "a b c" ]
>>> analyzer = lambda x: x.split()
>>> [query for query in queries if is_embedded(query, embedding, analyzer)]
['a b c', 'a', 'b', 'c', 'a b c']
>>> analyzer = CountVectorizer().build_analyzer()
>>> [query for query in queries if is_embedded(query, embedding, analyzer)]
['a b c', 'a', 'b', 'c', 'a b c']
"""
for word in analyzer(sentence):
if word not in embedding:
print("Dropping:", sentence, file=sys.stderr)
return False
return True
def train_feature_finder(self, training_db, clf):
training_sentences = []
c = 0
training_classes = []
self.class_names = []
self.vectorizer = CountVectorizer(analyzer = "word", \
tokenizer = None, \
preprocessor = None, \
stop_words = None, \
max_features = 500)
for key, value in training_db.items():
training_sentences += value
training_classes += [c for i in range(len(value))]
c+=1
self.class_names.append(key)
train_data_features = self.vectorizer.fit_transform(training_sentences)
train_data_features = train_data_features.toarray()
clf = clf.fit( train_data_features, training_classes)
return clf
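A minimal sketch of how this method could be driven (the host class here is a hypothetical stand-in; any scikit-learn classifier works as clf):

from sklearn.naive_bayes import MultinomialNB

class FeatureFinder:
    train_feature_finder = train_feature_finder  # reuse the method above

training_db = {"greeting": ["hello there", "hi, how are you"],
               "farewell": ["bye for now", "see you later"]}
finder = FeatureFinder()
clf = finder.train_feature_finder(training_db, MultinomialNB())
print(finder.class_names)  # ['greeting', 'farewell']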
def getDatas(dataset_dir_name):
movie_reviews = load_files(dataset_dir_name)
doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
#word_tokenizer: build_tokenizer() returns CountVectorizer's default tokenizer; Chinese text is segmented separately with getChList below
vectorizer = CountVectorizer(binary = True, decode_error = u'ignore')
word_tokenizer = vectorizer.build_tokenizer()
#segment each document into a list of terms
doc_terms_list_train = list(getChList(doc_str) for doc_str in doc_str_list_train)
doc_terms_list_test = list(getChList(doc_str) for doc_str in doc_str_list_test)
return vectorizer, doc_str_list_train, doc_str_list_test,doc_class_list_train, doc_class_list_test, doc_terms_list_train
def run():
py2neo.authenticate("localhost:7474","neo4j","neo4j1")
graph = Graph("http://localhost:7474/db/data/")
result=graph.data('''MATCH (n:Product)-[r:BELONGS_TO]->(c:Category) WITH n, rand() AS number RETURN n.name,n.description,n.catName order by number limit 3000''')
st = ""
for x in result:
p=','.join(str(val).strip(string.punctuation) for (key,val) in x.items())
st=st + p
p=""
vectorizer = CountVectorizer(strip_accents='ascii')
tokenizer = vectorizer.build_tokenizer()
preprocessor = vectorizer.build_preprocessor()
tokens = set()
for item in tokenizer(st):
tokens.add(preprocessor(item))
with codecs.open(path_config.PERSONAL_WORD_DICTIONARY_FILE, mode='wb', encoding='utf-8') as f:
for token in tokens:
f.write(token + '\n')
def word_unigrams():
preprocessor = TextCleaner(lowercase=True,
filter_urls=True,
filter_mentions=True,
filter_hashtags=True,
alphabetic=True,
strip_accents=True,
filter_rt=True)
vectorizer = CountVectorizer(min_df=2,
stop_words=get_stopwords(),
preprocessor=preprocessor,
ngram_range=(1, 1))
pipeline = Pipeline([('vect', vectorizer),
('tfidf', TfidfTransformer(sublinear_tf=True)),
('scale', Normalizer())])
return ('word_unigrams', pipeline)
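The returned (name, pipeline) tuple is shaped for scikit-learn's FeatureUnion; a sketch of the typical wiring (TextCleaner and get_stopwords are project-specific and must be importable for word_unigrams itself to run):

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression

features = FeatureUnion([word_unigrams()])  # further (name, pipeline) blocks can be appended
model = Pipeline([('features', features), ('clf', LogisticRegression())])
# model.fit(texts, labels); model.predict(new_texts)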
def get_data():
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vectorizer = CountVectorizer()
categories = ['alt.atheism', 'talk.religion.misc',
'comp.graphics', 'sci.space']
# Train set
newsgroups_train = fetch_20newsgroups(subset='train',
categories=categories, shuffle=True)
X_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
# Test set
newsgroups_test = fetch_20newsgroups(subset='test',
categories=categories, shuffle=True)
X_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target
return X_train, y_train, X_test, y_test
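A baseline classifier on the returned splits (downloads the 20 newsgroups data on first use):

from sklearn.naive_bayes import MultinomialNB

X_train, y_train, X_test, y_test = get_data()
clf = MultinomialNB().fit(X_train, y_train)
print("test accuracy: %.3f" % clf.score(X_test, y_test))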
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
#choosing the particular flavor of vectorizer
if method == 'counts':
vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
elif method == 'tfidf':
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')
#fitting the vectorizer and converting the counts to an array
full_fit = vectorizer.fit_transform(df[x_name])
full_counts = full_fit.toarray()
self.vocabulary_ = vectorizer.vocabulary_
#passing the attributes up to the class instance
self.data = df
if sparse:
full_counts = csr_matrix(full_counts)
self.X = full_counts
if y_name is not None:
self.y = np.array(df[y_name])
return
#splits the data into training and test sets; either called from process()
#or on its own when your text is already vectorized and divided into x and y
def new(n_feature=128):
vectorizer = CountVectorizer(
encoding='utf-8',
ngram_range=(1,1), # Unigram only
max_features=n_feature,
binary=True
)
# Fill the gap (missing expected tags)
# ---
# Hypothesis: Some tags are somehow related so
# we smoothen the missing values with matrix factorisation.
smoother = NMF(n_components=n_feature)
# Binarise the vector's individual values
binariser = Binarizer(copy=True)
# Count vectoriser => NMF as smoother => Binariser
print(colored('Taghasher model created','yellow'))
return [vectorizer,smoother,binariser]
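A sketch of how the three returned stages might chain together on toy tag strings (assumes the module's own imports: CountVectorizer, sklearn's NMF and Binarizer, and termcolor's colored; reconstructing the smoothed matrix as W times the NMF components is one plausible reading of the "smoother" comment above, not necessarily the project's exact usage):

tags = ["rock indie", "rock metal", "jazz blues", "indie jazz"]
vectoriser, smoother, binariser = new(n_feature=4)
X = vectoriser.fit_transform(tags)  # binary tag-incidence matrix
W = smoother.fit_transform(X)       # low-rank document factors
X_smooth = binariser.fit_transform(W @ smoother.components_)  # back to 0/1
print(X_smooth.shape)               # (4, 4)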
def bag_of_words(messages, model=None, weighting=''):
# TODO: Add stemming or baseform here
messages, stemmings2baseform = texttools.stemming_messages_snowball(messages)
# Create new model for extrating text features if None is given
if model is None:
if weighting == 'tfidf':
model = TfidfVectorizer()
else:
model = CountVectorizer()
model.fit(messages)
# Extract features
x = model.transform(messages)
return x
def test_build(self):
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
corpus = CorpusFromScikit(
X=X_counts,
y=newsgroups_train.target,
feature_vocabulary=count_vectorizer.vocabulary_,
category_names=newsgroups_train.target_names,
raw_texts=newsgroups_train.data
).build()
self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
self.assertEqual(corpus
.get_term_freq_df()
.assign(score=corpus.get_scaled_f_scores('alt.atheism'))
.sort_values(by='score', ascending=False).index.tolist()[:5],
['atheism', 'atheists', 'islam', 'atheist', 'belief'])
self.assertGreater(len(corpus.get_texts()[0]), 5)
def test_build(self):
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
term_doc_mat = TermDocMatrixFromScikit(
X=X_counts,
y=newsgroups_train.target,
feature_vocabulary=count_vectorizer.vocabulary_,
category_names=newsgroups_train.target_names).build()
self.assertEqual(term_doc_mat.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
self.assertEqual(term_doc_mat
.get_term_freq_df()
.assign(score=term_doc_mat.get_scaled_f_scores('alt.atheism'))
.sort_values(by='score', ascending=False).index.tolist()[:5],
['atheism', 'atheists', 'islam', 'atheist', 'belief'])
def make_lda(self, nt, iterations):
# '''
# description: sets important attributes and creates lda model
# params: nt-number of topics for lda
# iterations: number of iterations for lda
# dim: 2d or 3d graph
# threshold: minimum percentage of the maximum topic in a document which can be included in a "cluster"
# '''
self.nt = nt
self.cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = self.cvectorizer.fit_transform(self.descriptions)
# train an LDA model
self.lda_model = lda.LDA(n_topics=nt, n_iter=iterations)
self.X_topics_original = self.lda_model.fit_transform(cvz)
#initialize current stuff
self.X_topics_current = self.X_topics_original
self.titles_current = self.titles_original
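The two steps inside make_lda can be reproduced standalone (a sketch; assumes the lda package, whose LDA takes n_topics and n_iter exactly as in the snippet):

import lda
from sklearn.feature_extraction.text import CountVectorizer

descriptions = ["space rockets and orbits", "orbital mechanics of rockets",
                "guitar chords and scales", "minor scales on guitar"] * 5
cvz = CountVectorizer(min_df=5, stop_words='english').fit_transform(descriptions)
doc_topic = lda.LDA(n_topics=2, n_iter=100).fit_transform(cvz)
print(doc_topic.shape)  # (20, 2), one topic distribution per description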
Source file: feature_extraction.py
Project: political-ad-classifier
Author: BoudhayanBanerjee
def countvectorizer(inputpath=None, text=None):
"""
docstring
"""
vectorizer = CountVectorizer(min_df=1)
if inputpath:
filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
corpus = []
for file in filenames:
with open(file, 'r') as f:
data = f.read()
corpus.append(data)
if text:
corpus = text
X = vectorizer.fit_transform(corpus)
print(X.toarray())
print(vectorizer.get_feature_names())
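Note that text must be a list of documents; a bare string would be iterated character by character and fail with an empty vocabulary. For example:

countvectorizer(text=["red apples", "green apples", "red or green"])
# prints the 3 x 4 count matrix and ['apples', 'green', 'or', 'red']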
def build_feature_matrix(documents, feature_type='frequency'):
feature_type = feature_type.lower().strip()
if feature_type == 'binary':
vectorizer = CountVectorizer(binary=True, min_df=1,
ngram_range=(1, 1))
elif feature_type == 'frequency':
vectorizer = CountVectorizer(binary=False, min_df=1,
ngram_range=(1, 1))
elif feature_type == 'tfidf':
vectorizer = TfidfVectorizer(min_df=1,
ngram_range=(1, 1))
else:
raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
feature_matrix = vectorizer.fit_transform(documents).astype(float)
return vectorizer, feature_matrix
def build_feature_matrix(documents, feature_type='frequency',
ngram_range=(1, 1), min_df=0.0, max_df=1.0):
feature_type = feature_type.lower().strip()
if feature_type == 'binary':
vectorizer = CountVectorizer(binary=True, min_df=min_df,
max_df=max_df, ngram_range=ngram_range)
elif feature_type == 'frequency':
vectorizer = CountVectorizer(binary=False, min_df=min_df,
max_df=max_df, ngram_range=ngram_range)
elif feature_type == 'tfidf':
vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
ngram_range=ngram_range)
else:
raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
feature_matrix = vectorizer.fit_transform(documents).astype(float)
return vectorizer, feature_matrix
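Either variant is used the same way; a quick sketch against the parameterized version above:

docs = ["the cat sat", "the cat ran", "a dog barked"]
vectorizer, X = build_feature_matrix(docs, feature_type='tfidf')
print(X.shape)
vectorizer, X = build_feature_matrix(docs, feature_type='binary', min_df=1)
print(X.toarray())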
def getTFIDF():
"""
:return: the tf-idf weight matrix and the list of source texts
"""
corpus, textList = getFenCiWords()
vectorizer = CountVectorizer()  # builds the term-count matrix: a[i][j] is the count of word j in document i
transformer = TfidfTransformer()  # converts raw counts to tf-idf weights
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # inner fit_transform builds counts, outer one computes tf-idf
word = vectorizer.get_feature_names()  # all terms in the bag-of-words model
weight = tfidf.toarray()  # a[i][j] is the tf-idf weight of word j in document i
print("%d documents, %d words" % (len(weight), len(word)))
return weight, textList
# for i in range(len(weight)):  # print every word's tf-idf weight per document:
#     print("------- tf-idf weights for document", i, "------")  # outer loop walks documents,
#     for j in range(len(word)):  # inner loop walks the words
#         print(word[j], weight[i][j])
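The same two-step weighting works on any corpus, independent of the project's getFenCiWords segmenter; for example:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ["smart phones are popular", "phones with big screens",
          "popular apps for phones"]
counts = CountVectorizer().fit_transform(corpus)   # a[i][j]: count of term j in doc i
tfidf = TfidfTransformer().fit_transform(counts)   # reweight counts by tf-idf
print(tfidf.shape)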
def __init__(self, corpus, pairtype, relations, modelname="mil_classifier.model", test=False, ner="goldstandard",
generate=True):
super(MILClassifier, self).__init__()
self.modelname = modelname
self.pairtype = pairtype
self.pairs = {} # (e1.normalized, e2.normalized) => (e1, e2)
self.instances = {} # bags of instances (e1.normalized, e2.normalized) -> all instances with these two entities
self.labels = {} # (e1.normalized, e2.normalized) => label (-1/1)
self.bag_labels = [] # ordered list of labels for each bag
self.bag_pairs = [] # ordered list of pair labels (e1.normalized, e2.normalized)
self.data = [] # ordered list of bags, each is a list of feature vectors
self.predicted = [] # ordered list of predictions for each bag
self.resultsfile = None
self.examplesfile = None
self.ner_model = ner
self.vectorizer = CountVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b')
self.corpus = corpus
#self.vectorizer = TfidfVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b', max_features=)
#self.classifier = misvm.MISVM(kernel='linear', C=1.0, max_iters=20)
self.classifier = misvm.sMIL(kernel='linear', C=1)
#self.classifier = misvm.MissSVM(kernel='linear', C=100) #, max_iters=20)
#if generate:
# self.generateMILdata(test=test, pairtype=pairtype, relations=relations)