features.py 文件源码-python代码片段

features.py 文件源码

python

阅读 37 收藏 0 点赞 0 评论 0

项目：AlphaPy 作者: ScottFreeLLC 项目源码文件源码

def cvectorize(f, c, n):
    r"""Vectorize a text column with character n-gram counts and TF-IDF.

    Missing values in the column are replaced by ``BSEP``, character
    n-grams are counted with the Count Vectorizer, and the counts are
    then re-weighted with the TF-IDF Transformer.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe containing the column ``c``. The dataframe is not
        modified by this function.
    c : str
        Name of the text column in the dataframe ``f``.
    n : int
        The upper bound of the n-gram range; character n-grams of
        length 1 through ``n`` are extracted.

    Returns
    -------
    new_features : numpy.ndarray
        The TF-IDF weighted features, converted to a dense array.

    References
    ----------
    To use count vectorization and TF-IDF, you can find more
    information here [TFE]_.

    .. [TFE] http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

    """
    # Fill missing values on a new Series rather than in place, so the
    # caller's dataframe is never mutated as a hidden side effect.
    fc = f[c].fillna(BSEP)
    # ngram_range is documented as a (min_n, max_n) tuple; a list is
    # rejected by scikit-learn versions that validate parameters.
    cvect = CountVectorizer(ngram_range=(1, n), analyzer='char')
    cfeat = cvect.fit_transform(fc)
    tfidf_transformer = TfidfTransformer()
    # Densify the sparse TF-IDF matrix before returning it.
    new_features = tfidf_transformer.fit_transform(cfeat).toarray()
    return new_features


#
# Function apply_treatment
#