def test_bag_of_words_for_series():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    series = XSeries(dataset.data[:10])
    assert series.data_type == str
    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )
    transformed_series = tokenizer_transformer.fit_transform(series)
    # print(transformed_series)
    bag_transform = BagOfWordsTransformer()
    transformed_series = bag_transform.fit_transform(transformed_series)
    # print(transformed_series)
    assert type(transformed_series) == XDataFrame
def load_20ng_dataset_bow():
    """
    Loads the 20NG dataset as dense tf-idf features.
    :return: train data, train labels, test data, test labels
    """
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')
    # Convert data to tf-idf
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95)
    train_data = vectorizer.fit_transform(newsgroups_train.data)
    test_data = vectorizer.transform(newsgroups_test.data)
    train_data = train_data.todense()
    test_data = test_data.todense()
    train_labels = newsgroups_train.target
    test_labels = newsgroups_test.target
    return train_data, train_labels, test_data, test_labels
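A minimal usage sketch of the loader above (assuming it is importable); each row of the dense matrices is one document, each column one tf-idf term:

    train_data, train_labels, test_data, test_labels = load_20ng_dataset_bow()
    print(train_data.shape, test_data.shape)    # (n_train_docs, n_terms), (n_test_docs, n_terms)
    print(train_labels.shape, test_labels.shape)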
def demo_command(args):
    def create_data_file(partition, filename, samples):
        data = pandas.DataFrame(
            {TEXT_NAME: partition.data,
             LABEL_NAME: [partition.target_names[target] for target in partition.target]}).dropna()[:samples]
        data.to_csv(filename, index=False)
        return filename

    os.makedirs(args.directory, exist_ok=True)
    print("Download a portion of the 20 Newsgroups data and create train.csv and test.csv.")
    newsgroups_train = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
    newsgroups_test = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"))
    train_filename = create_data_file(newsgroups_train, os.path.join(args.directory, "train.csv"), 1000)
    test_filename = create_data_file(newsgroups_test, os.path.join(args.directory, "test.csv"), 100)
    model_directory = os.path.join(args.directory, "model")
    print("Train a model.\n")
    cmd = "train bow %s --save-model %s --epochs 5 --logging progress\n" % (
        train_filename, model_directory)
    print("mycroft " + cmd)
    default_main(cmd.split())
    print("\nEvaluate it on the test data.\n")
    cmd = "evaluate %s %s\n" % (model_directory, test_filename)
    print("mycroft " + cmd)
    default_main(cmd.split())
    print("\n(Note that there is not enough training data here to generate accurate predictions.)")
def get_data():
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    vectorizer = CountVectorizer()
    categories = ['alt.atheism', 'talk.religion.misc',
                  'comp.graphics', 'sci.space']
    # Train set
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          categories=categories, shuffle=True)
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    y_train = newsgroups_train.target
    # Test set
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         categories=categories, shuffle=True)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_test = newsgroups_test.target
    return X_train, y_train, X_test, y_test
def test_build(self):
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
def test_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    term_doc_mat = TermDocMatrixFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names).build()
    self.assertEqual(term_doc_mat.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(term_doc_mat
                     .get_term_freq_df()
                     .assign(score=term_doc_mat.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
def newsgroups_class_distrib():
    from sklearn.datasets import fetch_20newsgroups
    ngroup_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=None)
    ngroup_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=None)
    test_data = ngroup_test.data
    train_data = ngroup_train.data
    test_groups = ngroup_test.target
    train_groups = ngroup_train.target
    n = 2000
    train_groups = train_groups[:n]
    test_groups = test_groups[:n]
    plt.figure()
    # `normed=True` was removed in matplotlib 3.x; `density=True` is the equivalent.
    plt.hist(train_groups, 20, density=True, range=(0, 19))
    plt.title("train groups")
    plt.figure()
    plt.hist(test_groups, 20, density=True, range=(0, 19))
    plt.title("test groups")
    plt.show()
def newsgroups(*, path=None, key=None, limit=None):
    """
    Return a list of newsgroup messages from the 20 newsgroups dataset.

    Arguments:
    - path(str): Unused in this case. The dataset is managed by sklearn.
    - key(str): Unused.
    - limit(int): If given, return at most this many messages.
    """
    # This is going to download the dataset the first time we
    # run this function. Ideally we can populate these datasets
    # ahead of time.
    from sklearn.datasets import fetch_20newsgroups
    if limit:
        return fetch_20newsgroups(subset='train').data[:limit]
    return fetch_20newsgroups(subset='train').data
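A quick usage sketch of the loader above (assuming it is importable); `limit` keeps only the first few messages:

    messages = newsgroups(limit=5)
    print(len(messages))         # 5
    print(messages[0][:200])     # first 200 characters of the first message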
def test_bag_of_words_for_series_pipeline():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    n = 100
    series = XSeries(dataset.data[:n])
    assert series.data_type == str
    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )
    # series = tokenizer_transformer.transform(series)
    Y = np.random.binomial(1, 0.5, n)
    pipeline = PipeLineChain([
        ('preprocessing', XSeriesTransformer(
            transform_function=lambda text: text.lower().translate(translator).strip().split()
        )),
        ('extractor', BagOfWordsTransformer()),
        ('pca', PCA(n_components=10)),
        # ('svc', LinearSVC())
    ])
    pipeline = pipeline.fit(series)
    transformed_series = pipeline.transform(series)
    # print(transformed_series)
def setUp(self):
    """Load the test data (20 Newsgroups corpus)."""
    newsdata = fetch_20newsgroups(data_home="./data/")
    self.ids = [str(i) for i in range(len(newsdata.target))]
    self.texts = newsdata.data
    self.labels = [newsdata.target_names[idx] for idx in newsdata.target]
    self.tc = TextClassifier(self.texts, self.ids)
def case1():
    # Python 2 / old scikit-learn style (print statements, sklearn.cross_validation).
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)
    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10, :10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10, :10].toarray()
    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(x_tfidf, news.target, test_size=0.3, random_state=233)
    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)  # was `mmb.fit(...)`, a NameError in the original
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
def __init__(self, cfg=None):
    super().__init__()
    self.__dataset__ = fetch_20newsgroups(subset=cfg['subset'], categories=cfg['categories'],
                                          shuffle=cfg['shuffle'], random_state=cfg['random_state'])
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from the 20 newsgroups corpus.
    :param subset: 'train', 'test' or 'all'
    :param categories: list of newsgroup names to keep (None for all)
    :param shuffle: whether to shuffle the documents
    :param random_state: integer seed used when shuffling
    :return: the data and labels of the newsgroups
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
    return datasets
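A short usage sketch (assuming the helper above is importable); it fetches two categories and inspects the returned bunch:

    datasets = get_datasets_20newsgroup(subset='train',
                                        categories=['sci.space', 'rec.autos'])
    print(len(datasets.data), len(datasets.target))
    print(datasets.target_names)   # the selected category names, sorted alphabetically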
def main():
    # parameters
    num_features = 400  # vocabulary size
    # load data
    print "loading 20 newsgroups dataset..."
    categories = ['rec.autos', 'rec.sport.hockey', 'comp.graphics', 'sci.space']
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=0, categories=categories, remove=('headers', 'footers', 'quotes'))
    train_corpus = dataset.data  # a list of the training documents in the selected categories
    train_labels = dataset.target
    toc = time()
    print "elapsed time: %.4f sec" % (toc - tic)
    # tf-idf vectorizer
    tfidf = TfidfVectorizer(max_df=0.5, max_features=num_features,
                            min_df=2, stop_words='english', use_idf=True)
    X_tfidf = tfidf.fit_transform(train_corpus).toarray()
    # append document labels
    train_labels = train_labels.reshape(-1, 1)
    X_all = np.hstack([train_labels, X_tfidf])
    # distribute the data
    sc = SparkContext('local', 'log_reg')
    rdd = sc.parallelize(X_all)
    labeled_corpus = rdd.map(parse_doc)
    train_RDD, test_RDD = labeled_corpus.randomSplit([8, 2], seed=0)
    # distributed logistic regression
    print "training logistic regression..."
    model = LogisticRegressionWithLBFGS.train(train_RDD, regParam=1, regType='l1', numClasses=len(categories))
    # evaluate the model on test data
    labels_and_preds = test_RDD.map(lambda p: (p.label, model.predict(p.features)))
    test_err = labels_and_preds.filter(lambda (v, p): v != p).count() / float(test_RDD.count())
    print "log-reg test error: ", test_err
    # model.save(sc, './log_reg_lbfgs_model')
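`parse_doc` is defined elsewhere in the original script. A plausible minimal version, stated as an assumption rather than the project's actual code: the first column of each row is the label and the remaining columns are the tf-idf features, wrapped in an MLlib LabeledPoint.

    from pyspark.mllib.regression import LabeledPoint

    def parse_doc(row):
        # row[0] holds the class label, row[1:] the tf-idf feature vector
        return LabeledPoint(row[0], row[1:])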
def fetch_data(self, subset='train', categories=None):
    """Fetch a 20 Newsgroups subset and cache it on this object.

    Arguments:
        subset -> string -- which subset to load: train / test / all
    """
    rand = np.random.mtrand.RandomState(8675309)
    data = fetch_20newsgroups(subset=subset,
                              categories=categories,
                              shuffle=True,
                              random_state=rand)
    self.data[subset] = data
def load_newsgroups():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.datasets import fetch_20newsgroups
    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    vectorizer = TfidfVectorizer(max_features=2000, dtype=np.float64, sublinear_tf=True)
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    x = np.asarray(x_sparse.todense())
    y = newsgroups.target
    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data
def download_articles(name, categories, subset):
    data = {}
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())
    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data[str(len(data))] = {'text': line, 'label': newsgroups_data['target_names'][newsgroups_data['target'][i]]}
    print(len(data))
    raw_data_dir = os.path.join('..', 'data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_to_json(data, os.path.join(raw_data_dir, subset + '.json'))
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names,
                 data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])
    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))
    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch.

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')
    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames))
def test_train_model():
    data = fetch_20newsgroups(
        random_state=42,
        categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'])
    limit = 200
    if limit is not None:
        data['target'] = data['target'][:limit]
        data['data'] = data['data'][:limit]
    n_domains = int(len(data['target']) / 5)
    docs = [
        {
            'html': '\n'.join('<p>{}</p>'.format(t) for t in text.split('\n')),
            'url': 'http://example-{}.com/{}'.format(n % n_domains, n),
            'relevant': {'sci.space': True, 'sci.med': None}.get(
                data['target_names'][target], False),
        }
        for n, (text, target) in enumerate(zip(data['data'], data['target']))]
    result = train_model(docs)
    pprint(attr.asdict(result.meta))
    assert lst_as_dict(result.meta.advice) == [
        {'kind': 'Notice',
         'text': "The quality of the classifier is very good, ROC AUC is 0.96. "
                 "You can label more pages if you want to improve quality, "
                 "but it's better to start crawling "
                 "and check the quality of crawled pages.",
         },
    ]
    assert lst_as_dict(result.meta.description) == [
        {'heading': 'Dataset',
         'text': '200 documents, 159 labeled across 40 domains.'},
        {'heading': 'Class balance',
         'text': '33% relevant, 67% not relevant.'},
        {'heading': 'Metrics', 'text': ''},
        {'heading': 'Accuracy', 'text': '0.881 ± 0.122'},
        {'heading': 'ROC AUC', 'text': '0.964 ± 0.081'}]
    assert len(result.meta.weights['pos']) > 0
    assert len(result.meta.weights['neg']) > 0
    assert isinstance(result.model, BaseModel)
    assert hasattr(result.model, 'predict_proba')
def main():
    """
    Cluster the newsgroups dataset and measure against labels.

    In this script, we're doing a grid search over various
    TFIDF representations of the newsgroups dataset, looking
    for one that clusters well without supervision.

    We measure the quality of that unsupervised representation
    by how well its clusters match up with the actual
    supervised labels of the newsgroups dataset.
    """
    newsgroups = fetch_20newsgroups(
        subset='train',
        categories=CATEGORIES,
        shuffle=True
    )
    print("Loaded data")
    gridsearch = GridSearchCV(
        Pipeline([
            ('vec', TfidfVectorizer()),
            ('cluster', ClusteringWithSupervision(
                cluster_instance=MiniBatchKMeans()))
        ]),
        {
            'vec__stop_words': (None, 'english')
        }
    )
    print("Defined pipeline. Beginning fit.")
    gridsearch.fit(newsgroups.data, newsgroups.target)
    print_best_worst(gridsearch.cv_results_)
    best_estimator = gridsearch.best_estimator_
    predicted = best_estimator.predict(newsgroups.data)
    print(
        classification_report(
            newsgroups.target,
            predicted,
            target_names=newsgroups.target_names))
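`ClusteringWithSupervision` is not shown above. A minimal sketch of what such a wrapper might look like, stated as an assumption rather than the project's actual class: fit the clusterer, label each cluster with the majority true class, and predict through that cluster-to-label map so the clustering can be scored like a classifier.

    import numpy as np
    from sklearn.base import BaseEstimator, ClassifierMixin, clone

    class ClusteringWithSupervisionSketch(BaseEstimator, ClassifierMixin):
        def __init__(self, cluster_instance=None):
            self.cluster_instance = cluster_instance

        def fit(self, X, y):
            self.cluster_instance_ = clone(self.cluster_instance)
            clusters = self.cluster_instance_.fit_predict(X)
            y = np.asarray(y)
            # Map each cluster id to the most common true label inside it.
            self.cluster_to_label_ = {
                c: np.bincount(y[clusters == c]).argmax()
                for c in np.unique(clusters)
            }
            return self

        def predict(self, X):
            clusters = self.cluster_instance_.predict(X)
            return np.array([self.cluster_to_label_[c] for c in clusters])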
def load_20news(setName):
    newsgroups_subset = fetch_20newsgroups(subset=setName, remove=('headers', 'footers'))  # , 'quotes'
    totalLineNum = 0
    readDocNum = 0
    print "Loading 20 newsgroup %s data..." % setName
    setDocNum = len(newsgroups_subset.data)
    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []
    catNum = len(newsgroups_subset.target_names)
    cats_docsWords = [[] for i in xrange(catNum)]
    cats_docNames = [[] for i in xrange(catNum)]
    emptyFileNum = 0
    for d, text in enumerate(newsgroups_subset.data):
        if d % 50 == 49 or d == setDocNum - 1:
            print "\r%d %d\r" % (d + 1, totalLineNum),
        text = text.encode("utf-8")
        lines = text.split("\n")
        if len(text) == 0 or len(lines) == 0:
            emptyFileNum += 1
            continue
        readDocNum += 1
        totalLineNum += len(lines)
        catID = newsgroups_subset.target[d]
        category = newsgroups_subset.target_names[catID]
        text = " ".join(lines)
        wordsInSentences, wc = extractSentenceWords(text)
        filename = newsgroups_subset.filenames[d]
        filename = os.path.basename(filename)
        orig_docs_words.append(wordsInSentences)
        orig_docs_name.append(filename)
        orig_docs_cat.append(catID)
        cats_docsWords[catID].append(wordsInSentences)
        cats_docNames[catID].append(filename)
    print "Done. %d docs read, %d empty docs skipped. Totally %d lines" % (readDocNum, emptyFileNum, totalLineNum)
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
        cats_docsWords, cats_docNames, newsgroups_subset.target_names
def initializeData(data):
    # graphics_train = fetch_20newsgroups(subset=dataSet,
    #     categories=categories, shuffle=True, random_state=42)
    wnl = WordNetLemmatizer()
    stop_words = text.ENGLISH_STOP_WORDS
    data = data
    # List of dicts; each element maps a term to its count in one document
    termDictList = []
    # Dictionary mapping each term to the number of documents that contain it
    termDocCountDict = {}
    # Set of retained terms
    termSet = set()
    # List of ints; each element is the total number of retained terms in one tokenized document
    termCountList = []
    # get the document frequency for each term
    for i in range(len(data)):
        document = data[i].lower()
        words = set(word_tokenize(document))
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term not in termDocCountDict:
                        termDocCountDict[term] = 0
                    termDocCountDict[term] += 1
    # build termDictList and termSet, keeping terms whose document frequency is in [110, 11000]
    for i in range(len(data)):
        termDict = {}
        termCount = 0
        document = data[i].lower()
        words = word_tokenize(document)
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term in termDocCountDict:
                        if termDocCountDict[term] >= 110 and termDocCountDict[term] <= 11000:
                            termSet.add(term)
                            termCount += 1
                            # fill in termDict
                            if term not in termDict:
                                termDict[term] = 0
                            termDict[term] += 1
                        else:
                            del termDocCountDict[term]
        termDictList.append(termDict)
        termCountList.append(termCount)
    return (termDictList, termCountList, termDocCountDict, termSet)
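A hypothetical usage sketch of initializeData, assuming NLTK's word_tokenize and WordNetLemmatizer and sklearn.feature_extraction.text are imported as the snippet expects:

    docs = fetch_20newsgroups(subset='train').data
    termDictList, termCountList, termDocCountDict, termSet = initializeData(docs)
    print(len(termDictList), len(termSet))   # one dict per document, plus the retained vocabulary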
def main():
    """
    Train a classifier on the 20 newsgroups dataset.

    The purpose of this is mostly trying to figure out how
    to turn text into really good vector representations
    for classification... which are also hopefully good
    vector representations for unsupervised learning too.
    """
    # We don't really use our interfaces for iterating over datasets...
    # but maybe we will in the future.
    train = fetch_20newsgroups(
        subset='train',
        # categories=CATEGORIES,
        shuffle=True
    )
    test = fetch_20newsgroups(
        subset='test',
        # categories=CATEGORIES,
        shuffle=True
    )
    print("Loaded data.", len(set(train.target)), "classes.")
    glove_vectors = glove_simple()
    print("Loaded word vectors")
    pipeline = Pipeline([
        # ('vec', TfidfVectorizer()),
        ('vec', WordVectorSum(vector_dict=glove_vectors)),
        # ('svd', TruncatedSVD()),
        ('fit', SGDClassifier())
    ])
    print("Defined pipeline. Beginning fit.")
    gridsearch = GridSearchCV(
        pipeline,
        {
            # 'vec__stop_words': ('english',),
            # 'svd__n_components': (2, 100, 500, 1000),
            # 'vec__min_df': (1, 0.01, 0.1, 0.4),
            # 'vec__max_df': (0.5, 0.75, 0.9, 1.0),
            # 'vec__max_features': (100, 1000, 10000)
        }
    )
    gridsearch.fit(train.data, train.target)
    print("Completed fit. Beginning prediction")
    predicted = gridsearch.predict(test.data)
    print("Completed prediction.")
    accuracy = np.mean(predicted == test.target)
    print("Accuracy was", accuracy)
    print("Best params", gridsearch.best_params_)
    print_best_worst(gridsearch.cv_results_)
    print(
        classification_report(
            test.target,
            predicted,
            target_names=test.target_names))
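`WordVectorSum` and `glove_simple` are not defined in this snippet. A minimal sketch of the kind of transformer `WordVectorSum` appears to be, stated as an assumption rather than the project's actual implementation: represent each document as the sum of the GloVe vectors of its tokens, ignoring out-of-vocabulary words.

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin

    class WordVectorSumSketch(BaseEstimator, TransformerMixin):
        def __init__(self, vector_dict=None):
            self.vector_dict = vector_dict

        def fit(self, X, y=None):
            # Infer the vector dimensionality from any entry in the dictionary.
            self.dim_ = len(next(iter(self.vector_dict.values())))
            return self

        def transform(self, X):
            out = np.zeros((len(X), self.dim_))
            for i, doc in enumerate(X):
                for token in doc.lower().split():
                    vec = self.vector_dict.get(token)
                    if vec is not None:
                        out[i] += vec
            return out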