def analyze(content):
    # content arrives as a single string; extract only the nouns with the morphological analyzer
    nouns = t.nouns(str(content))
    # drop unwanted nouns (a short Korean stopword list)
    trash = ["??", "????", "??", "??", "??", "??", "?????"]
    nouns = [noun for noun in nouns if noun not in trash]
    ko = nltk.Text(nouns, name="??")
    # keep the 100 most frequent nouns in ranking
    ranking = ko.vocab().most_common(100)
    tmpData = dict(ranking)
    # build the word cloud from the frequency table
    wordcloud = WordCloud(font_path="/Library/Fonts/AppleGothic.ttf",
                          relative_scaling=0.2,
                          background_color="white").generate_from_frequencies(tmpData)
    # display the word cloud with matplotlib
    plt.figure(figsize=(16, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
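# A minimal usage sketch for analyze() above, not part of the original snippet.
# It assumes `t` is a KoNLPy morphological analyzer (e.g. Okt), that nltk,
# wordcloud, and matplotlib are installed, and that the AppleGothic font path
# used above exists on the machine. "article.txt" is a placeholder path.
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from konlpy.tag import Okt

t = Okt()  # the tagger the snippet calls as t.nouns(...)
analyze(open("article.txt", encoding="utf-8").read())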
def guess_by_frequency(self):
    input_data = None
    words = None
    to_replace = {}
    try:
        with open(os.path.join(os.path.dirname(__file__), "Lingvo/wordlist.txt"), 'r') as words_file:
            input_data = words_file.read().split()
        words = self.text.split()
    except FileNotFoundError:
        logging.critical("Wordlist could not be found.")
        return False
    frequencies = nltk.FreqDist(words).most_common(len(words))
    # Choose a replacement for every word that still contains unknown characters.
    for elem in frequencies:
        word = elem[0]
        if word in to_replace or '?' not in word:
            continue
        for sample_word in input_data:
            if check_similarity(word, sample_word):
                to_replace[word] = sample_word
                break
    # Apply the replacements.
    for i in range(len(words)):
        if words[i] in to_replace:
            words[i] = to_replace[words[i]]
    text = nltk.Text(words)
    self.text = text.name[:-3]  # nltk.Text derives its name from the leading tokens plus "..."
    return True
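# check_similarity() is not defined in this snippet. A plausible sketch of the
# helper it relies on, stated as an assumption rather than the original code:
# '?' marks an unrecovered character, and a dictionary word matches when the
# lengths agree and every already-known character lines up.
def check_similarity(word, sample_word):
    if len(word) != len(sample_word):
        return False
    return all(known == '?' or known == candidate
               for known, candidate in zip(word, sample_word))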
def analyze(content, url, title):
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = LancasterStemmer()
    stop_token = ['The', 'can', 's', 'I', 't', 'am', 'are']
    texts = []
    content_tokens = word_tokenize(content)
    title_tokens = word_tokenize(title)
    content_text = nltk.Text(content_tokens)
    tokens = tokenizer.tokenize(content)
    tokens = [i for i in tokens if not i.isdigit()]                       # remove all numbers
    stopped_tokens = [i for i in tokens if i not in en_stop]              # remove English stopwords
    stopped_tokens = [i for i in stopped_tokens if i not in stop_token]   # remove the extra stop tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]          # stem the remaining tokens
    texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=1,
                                               id2word=dictionary, passes=20)
    topics = ldamodel.show_topic(0, 3)
    # topics = ldamodel.print_topics(num_topics=1, num_words=3)[0]
    Rtopic = []
    for topicTuple in topics:
        topic, rate = topicTuple
        Rtopic.append(topic)
    if len(Rtopic) == 0:
        Rtopic.append("Not English")
        Rtopic.append("Maybe Chinese?")
    return (Rtopic, url, title)
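# Hedged usage sketch for analyze(content, url, title) above. The snippet omits
# its imports; the set below is an assumption based on the names it uses
# (nltk, gensim, and the stop_words package), and the URL/title are placeholders.
import nltk
import gensim
from gensim import corpora
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from stop_words import get_stop_words

topics, url, title = analyze("Some English page text about machine learning ...",
                             "https://example.com", "Example title")
print(topics)  # the three highest-weight words of the single LDA topic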
def Text(str1):
    if not isinstance(str1, list):
        str1 = word_tokenize(str(str1))
    return nltk.Text(str1)
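# Quick usage sketch (not from the source): the helper accepts either a raw
# string or an already-tokenized list and always returns an nltk.Text object,
# so the usual nltk.Text methods are available on the result.
sentence = Text("the quick brown fox jumps over the lazy dog")
print(sentence.vocab().most_common(3))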
def tokenize(text):
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()
    for token in nltk.word_tokenize(text):
        if token in string.punctuation:
            continue
        yield stem.stem(token)
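# Example run of the generator above. The snippet relies on nltk and the string
# module being imported at module level, and on nltk's punkt tokenizer data
# being available.
print(list(tokenize("The cats are running quickly!")))
# punctuation is dropped and every token is lowercased and stemmed, e.g. 'running' -> 'run'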
# The corpus object
def sklearn_frequency_vectorize(corpus):
    # The Scikit-Learn frequency vectorize method
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(corpus)
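# Small demonstration with an invented three-document corpus: the result is a
# sparse document-term matrix with one row per document and one column per term.
docs = ["the quick brown fox", "the lazy dog", "the quick dog"]
X = sklearn_frequency_vectorize(docs)
print(X.shape)  # (3, 6): three documents, six distinct terms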
def sklearn_one_hot_vectorize(corpus):
    # The Sklearn one hot vectorize method
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer
    freq = CountVectorizer()
    vectors = freq.fit_transform(corpus)
    print(len(vectors.toarray()[0]))
    onehot = Binarizer()
    vectors = onehot.fit_transform(vectors.toarray())
    print(len(vectors[0]))
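# Demonstration of the one-hot variant (invented two-document corpus): Binarizer
# clips every non-zero count to 1, so the two printed vocabulary sizes match and
# only the cell values change.
sklearn_one_hot_vectorize(["dog dog dog", "dog cat"])  # prints 2 and 2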
def nltk_tfidf_vectorize(corpus):
    from nltk.text import TextCollection
    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)
    for doc in corpus:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
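# The nltk TextCollection variant is a generator: it yields one {term: tf-idf}
# dict per document, computed over the stemmed tokens produced by tokenize() above.
docs = ["the quick brown fox", "the lazy dog", "the quick dog"]
for scores in nltk_tfidf_vectorize(docs):
    print(scores)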
def sklearn_tfidf_vectorize(corpus):
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer()
    return tfidf.fit_transform(corpus)
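# The scikit-learn tf-idf variant returns a sparse matrix shaped like the
# frequency version, with raw counts replaced by l2-normalised tf-idf weights
# (TfidfVectorizer's defaults).
weights = sklearn_tfidf_vectorize(["the quick brown fox", "the lazy dog"])
print(weights.shape)  # (2, 6)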