Vectorizer.py 文件源码

python
阅读 30 收藏 0 点赞 0 评论 0

项目:TextClassification 作者: AlgorTroy 项目源码 文件源码
def bag_of_words(list_of_strings, remove_puncs=True, remove_digits=True, remove_alnums=True):
    """Build an alphabetically ordered token-frequency table for a corpus.

    Every string is tokenized with ``custom_tokenizer`` (called with
    ``get_unique=True``), the tokens are filtered, lemmatized, stemmed,
    lower-cased and stripped of stop words, and the surviving tokens are
    counted.

    Args:
        list_of_strings: Iterable of strings to tokenize.
        remove_puncs: Forwarded to ``custom_tokenizer`` as ``remove_puncs``.
        remove_digits: When true (and ``remove_alnums`` is false), drop
            tokens for which ``isNumber`` returns true.
        remove_alnums: When true, keep only tokens for which
            ``str.isalpha()`` is true. Takes precedence over
            ``remove_digits``.

    Returns:
        An ``OrderedDict`` mapping each normalized token to its count, with
        keys in sorted order.

    NOTE(review): because the tokenizer is asked for unique tokens, the
    counts presumably behave as per-string (document) frequencies rather
    than raw occurrence counts -- confirm against ``custom_tokenizer``.
    """
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()

    # Gather the tokens of every string into one flat list.
    tokens = []
    for string in tqdm(list_of_strings):
        tokens.extend(custom_tokenizer(string, remove_puncs=remove_puncs, get_unique=True))

    # The two filters are mutually exclusive: the alphabetic-only filter
    # wins, and the numeric filter is applied only when it is disabled.
    if remove_alnums:
        tokens = [tok for tok in tokens if tok.isalpha()]
    elif remove_digits:
        tokens = [tok for tok in tokens if not isNumber(tok)]

    stop = set(stopwords.words('english'))
    print('Removing Stop words...')

    # Count the raw tokens first so each distinct token is lemmatized and
    # stemmed once instead of once per occurrence.
    raw_counts = Counter(tokens)
    bow_counter = Counter()

    for tok, count in raw_counts.items():
        # Drop stop words BEFORE stemming: a stemmed stop word (e.g. Porter
        # turns "because" into "becaus") no longer matches the stop list
        # and would otherwise leak into the result.
        if tok.strip().lower() in stop:
            continue

        try:
            normalized = porter.stem(lmtz.lemmatize(tok))
        except Exception:
            # Best effort: keep the raw token if NLTK cannot process it.
            normalized = tok

        normalized = normalized.strip().lower()

        # Keep the post-stem check as well, so tokens that reduce to a stop
        # word are still removed.
        if normalized in stop:
            continue

        bow_counter[normalized] += count

    return OrderedDict(sorted(bow_counter.items()))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号