def analyze(content):
    # content arrives as a single string; extract only the nouns with the morphological analyzer
    nouns = t.nouns(str(content))
    # drop unwanted nouns (a short Korean stopword list)
    trash = ["??", "????", "??", "??", "??", "??", "?????"]
    nouns = [noun for noun in nouns if noun not in trash]
    ko = nltk.Text(nouns, name="??")
    # keep the 100 most frequent nouns in ranking
    ranking = ko.vocab().most_common(100)
    tmpData = dict(ranking)
    # build the word cloud from the frequency table
    wordcloud = WordCloud(font_path="/Library/Fonts/AppleGothic.ttf",
                          relative_scaling=0.2,
                          background_color="white").generate_from_frequencies(tmpData)
    # display the word cloud with matplotlib
    plt.figure(figsize=(16, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
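# A minimal usage sketch for analyze() above, not part of the original snippet.
# It assumes `t` is a KoNLPy morphological analyzer (e.g. Okt), that nltk,
# wordcloud, and matplotlib are installed, and that the AppleGothic font path
# used above exists on the machine. "article.txt" is a placeholder path.
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from konlpy.tag import Okt

t = Okt()  # the tagger the snippet calls as t.nouns(...)
analyze(open("article.txt", encoding="utf-8").read())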
def guess_by_frequency(self):
    input_data = None
    words = None
    to_replace = {}
    try:
        with open(os.path.join(os.path.dirname(__file__), "Lingvo/wordlist.txt"), 'r') as words_file:
            input_data = words_file.read().split()
        words = self.text.split()
    except FileNotFoundError:
        logging.critical("Wordlist could not be found.")
        return False
    frequencies = nltk.FreqDist(words).most_common(len(words))
    # Choose a replacement for every word that still contains unknown characters.
    for elem in frequencies:
        word = elem[0]
        if word in to_replace or '?' not in word:
            continue
        for sample_word in input_data:
            if check_similarity(word, sample_word):
                to_replace[word] = sample_word
                break
    # Apply the replacements.
    for i in range(len(words)):
        if words[i] in to_replace:
            words[i] = to_replace[words[i]]
    text = nltk.Text(words)
    self.text = text.name[:-3]  # nltk.Text derives its name from the leading tokens plus "..."
    return True
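# check_similarity() is not defined in this snippet. A plausible sketch of the
# helper it relies on, stated as an assumption rather than the original code:
# '?' marks an unrecovered character, and a dictionary word matches when the
# lengths agree and every already-known character lines up.
def check_similarity(word, sample_word):
    if len(word) != len(sample_word):
        return False
    return all(known == '?' or known == candidate
               for known, candidate in zip(word, sample_word))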
def analyze(content, url, title):
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = LancasterStemmer()
    stop_token = ['The', 'can', 's', 'I', 't', 'am', 'are']
    texts = []
    content_tokens = word_tokenize(content)
    title_tokens = word_tokenize(title)
    content_text = nltk.Text(content_tokens)
    tokens = tokenizer.tokenize(content)
    tokens = [i for i in tokens if not i.isdigit()]                       # remove all numbers
    stopped_tokens = [i for i in tokens if i not in en_stop]              # remove English stopwords
    stopped_tokens = [i for i in stopped_tokens if i not in stop_token]   # remove the extra stop tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]          # stem the remaining tokens
    texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=1,
                                               id2word=dictionary, passes=20)
    topics = ldamodel.show_topic(0, 3)
    # topics = ldamodel.print_topics(num_topics=1, num_words=3)[0]
    Rtopic = []
    for topicTuple in topics:
        topic, rate = topicTuple
        Rtopic.append(topic)
    if len(Rtopic) == 0:
        Rtopic.append("Not English")
        Rtopic.append("Maybe Chinese?")
    return (Rtopic, url, title)
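# Hedged usage sketch for analyze(content, url, title) above. The snippet omits
# its imports; the set below is an assumption based on the names it uses
# (nltk, gensim, and the stop_words package), and the URL/title are placeholders.
import nltk
import gensim
from gensim import corpora
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from stop_words import get_stop_words

topics, url, title = analyze("Some English page text about machine learning ...",
                             "https://example.com", "Example title")
print(topics)  # the three highest-weight words of the single LDA topic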
def Text(str1):
    if not isinstance(str1, list):
        str1 = word_tokenize(str(str1))
    return nltk.Text(str1)
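# Quick usage sketch (not from the source): the helper accepts either a raw
# string or an already-tokenized list and always returns an nltk.Text object,
# so the usual nltk.Text methods are available on the result.
sentence = Text("the quick brown fox jumps over the lazy dog")
print(sentence.vocab().most_common(3))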
def tokenize(text):
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()
    for token in nltk.word_tokenize(text):
        if token in string.punctuation:
            continue
        yield stem.stem(token)
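# Example run of the generator above. The snippet relies on nltk and the string
# module being imported at module level, and on nltk's punkt tokenizer data
# being available.
print(list(tokenize("The cats are running quickly!")))
# punctuation is dropped and every token is lowercased and stemmed, e.g. 'running' -> 'run'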
# The corpus object
def sklearn_frequency_vectorize(corpus):
    # The Scikit-Learn frequency vectorize method
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(corpus)
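# Small demonstration with an invented three-document corpus: the result is a
# sparse document-term matrix with one row per document and one column per term.
docs = ["the quick brown fox", "the lazy dog", "the quick dog"]
X = sklearn_frequency_vectorize(docs)
print(X.shape)  # (3, 6): three documents, six distinct terms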
def sklearn_one_hot_vectorize(corpus):
    # The Sklearn one hot vectorize method
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer
    freq = CountVectorizer()
    vectors = freq.fit_transform(corpus)
    print(len(vectors.toarray()[0]))
    onehot = Binarizer()
    vectors = onehot.fit_transform(vectors.toarray())
    print(len(vectors[0]))
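# Demonstration of the one-hot variant (invented two-document corpus): Binarizer
# clips every non-zero count to 1, so the two printed vocabulary sizes match and
# only the cell values change.
sklearn_one_hot_vectorize(["dog dog dog", "dog cat"])  # prints 2 and 2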
def nltk_tfidf_vectorize(corpus):
    from nltk.text import TextCollection
    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)
    for doc in corpus:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
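# The nltk TextCollection variant is a generator: it yields one {term: tf-idf}
# dict per document, computed over the stemmed tokens produced by tokenize() above.
docs = ["the quick brown fox", "the lazy dog", "the quick dog"]
for scores in nltk_tfidf_vectorize(docs):
    print(scores)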
def sklearn_tfidf_vectorize(corpus):
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer()
    return tfidf.fit_transform(corpus)
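# The scikit-learn tf-idf variant returns a sparse matrix shaped like the
# frequency version, with raw counts replaced by l2-normalised tf-idf weights
# (TfidfVectorizer's defaults).
weights = sklearn_tfidf_vectorize(["the quick brown fox", "the lazy dog"])
print(weights.shape)  # (2, 6)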