def getTextConfidence(self, text):
if self.typeOfSim == 'jaccard':
intend_confidenceList = []
for i in self.know_words:
intend_confidenceList.append(jaccard_compare(text, i))
if len(self.know_words) > 0:
return max(intend_confidenceList)
else:
return 0
elif self.typeOfSim == 'gensim':
try:
from gensim import corpora, models, similarities
except Exception as e:
# the gensim import is required below, so re-raise after logging
print(e)
raise
dictionary = corpora.Dictionary(self.know_words_remove_stopwords)
corpus = [dictionary.doc2bow(doc) for doc in self.know_words_remove_stopwords]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
new_doc = text
vec_bow = dictionary.doc2bow(new_doc.lower().split())
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
most_sim = sims[0]
return most_sim[1]
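# A minimal, self-contained sketch of the same gensim LSI similarity flow used in the
# 'gensim' branch above, on toy data; `known_docs` and `query` are illustrative names
# only and are not part of the original class.
def lsi_confidence_sketch():
    from gensim import corpora, models, similarities
    known_docs = [["open", "the", "door"], ["close", "the", "window"]]
    query = "please open the door"
    dictionary = corpora.Dictionary(known_docs)
    corpus = [dictionary.doc2bow(doc) for doc in known_docs]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = index[lsi[dictionary.doc2bow(query.lower().split())]]
    # (best matching document index, its cosine similarity in LSI space)
    return max(enumerate(sims), key=lambda item: item[1])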
def get_similarity(query, ans_list):
s_length = len(ans_list)
Corp = ans_list
# build a dictionary over the candidate answers
dictionary = corpora.Dictionary(Corp)
# convert each answer into its bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in Corp]
lsi = models.LsiModel(corpus)
corpus_lsi = lsi[corpus]
vec_bow = dictionary.doc2bow(query)
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(corpus_lsi)
sims = index[vec_lsi]
similarity = list(sims)
# print(similarity)
end_length = len(similarity)
if s_length != end_length:
print('warning: similarity vector length does not match the number of answers')
return similarity
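# A hedged usage sketch for get_similarity() above, assuming `corpora`, `models` and
# `similarities` are imported from gensim at module level as in the original project;
# the toy query and answers below are illustrative only.
def get_similarity_demo():
    query = ["how", "do", "i", "reset", "my", "password"]
    ans_list = [["click", "reset", "password", "on", "the", "login", "page"],
                ["contact", "support", "for", "billing", "questions"]]
    scores = get_similarity(query, ans_list)
    # one LSI similarity score per candidate answer, sorted best-first
    return sorted(enumerate(scores), key=lambda item: -item[1])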
def getCorpus():
documents = []
txtNames = glob.glob("original/*.txt")
for fileName in txtNames:
fp = open(fileName)
buf = fp.readline()
fp.close()
documents.append(buf)
stoplist = set('for a of the and to in at'.split())
texts = [[word for word in document.translate(str.maketrans('', '', string.punctuation)).lower().split() if word not in stoplist]
for document in documents]
# The dictionary and corpus are not used below; they are only saved to disk for later reuse
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=10, no_above=0.7, keep_n=50000)
dictionary.save('tmp/imdb.dict')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('tmp/imdb.mm', corpus)
return texts
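# A small follow-up sketch (not part of the original snippet) showing how the artifacts
# that getCorpus() writes under tmp/ can be loaded back later with gensim.
def load_imdb_artifacts():
    from gensim import corpora
    dictionary = corpora.Dictionary.load('tmp/imdb.dict')
    corpus = corpora.MmCorpus('tmp/imdb.mm')
    # the corpus streams (token_id, count) vectors; the dictionary maps ids back to words
    return dictionary, corpus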
def corpus2bow(self, tokenized_corpus=default_documents):
"""returns (vocab,corpus_in_bow)
Convert the tokenized corpus into bag-of-words (BOW) form.
Arguments:
tokenized_corpus -- list of tokenized documents (each a list of tokens)
Return:
vocab -- {'human': 0, ... 'minors': 11}
corpus_in_bow -- [[(0, 1), (1, 1), (2, 1)]...]
"""
dictionary = corpora.Dictionary(tokenized_corpus)
# vocabulary: token -> integer id mapping
vocab = dictionary.token2id
# corpus as bag-of-words vectors
corpus_in_bow = [dictionary.doc2bow(text) for text in tokenized_corpus]
return (vocab, corpus_in_bow)
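# A standalone sketch of what corpus2bow() returns, using two toy documents in place of
# the project's default_documents; the values shown in the comments are illustrative.
def corpus2bow_sketch():
    from gensim import corpora
    tokenized_corpus = [["human", "computer", "interaction"],
                        ["graph", "of", "trees", "and", "graph", "minors"]]
    dictionary = corpora.Dictionary(tokenized_corpus)
    vocab = dictionary.token2id  # e.g. {'computer': 0, 'human': 1, ...}
    corpus_in_bow = [dictionary.doc2bow(text) for text in tokenized_corpus]  # e.g. [[(0, 1), (1, 1), (2, 1)], ...]
    return vocab, corpus_in_bow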
def build_id2word(self, fname=None, save_to=None):
# read words.csv file
if not fname:
fname = self.words_fname or click.prompt('words file')
fname = self.__dest(fname)
assert os.path.isfile(fname), 'No such file: %s' % fname
if save_to:
self.id2word_fname = self.__dest(save_to)
else:
self.id2word_fname = LdaUtils.change_ext(fname, 'id2word')
# if there is no id2word file or the user wants to rebuild, build .id2word
if not os.path.isfile(self.id2word_fname) or click.confirm('There already is an id2word file. Do you want to rebuild?'):
print('start building id2word')
start = time()
id2word = corpora.Dictionary(LdaUtils.filter_words(LdaUtils.iter_csv(fname, -1).split()))
id2word.save(self.id2word_fname) # save
print('building id2word takes: %s' % LdaUtils.human_readable_time(time() - start))
self.id2word = corpora.Dictionary.load(self.id2word_fname)
return self.id2word
def get_tfidf(documents): # compute dense TF-IDF vectors with gensim
documents = [document.text.split() for document in documents]
dictionary = corpora.Dictionary(documents)
n_items = len(dictionary)
corpus = [dictionary.doc2bow(text) for text in documents]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
ds = []
for doc in corpus_tfidf:
d = [0] * n_items
for index, value in doc:
d[index] = value
ds.append(d)
return ds
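# A hedged usage sketch for get_tfidf() above: it expects objects exposing a `.text`
# attribute, so a minimal stand-in Document class is defined here purely for illustration.
def get_tfidf_demo():
    class Document(object):
        def __init__(self, text):
            self.text = text
    docs = [Document("the cat sat on the mat"),
            Document("the dog barked at the cat")]
    # returns one dense TF-IDF vector (a list of floats, one slot per vocabulary term) per document
    return get_tfidf(docs)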
def fit(self, documents):
'''
parameters:
documents: list of strings, each represents a document
'''
# tokens, dictionary, corpus for LDA
self.tokens = self.preProcessCorpus(documents)
self.dictionary = corpora.Dictionary(self.tokens)
self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]
self.lda = self.getLDA(dictionary=self.dictionary,
corpus=self.corpus,
num_topics=self.num_topics,
random_state=self.random_state)
self.num_dominant_topics=min(10, self.num_topics)
self.dominant_topic_ids = self.getDominantTopics(self.corpus,
self.lda,
self.num_dominant_topics)
def fitAndPredict(self):
corpus = self.trainingSet+self.testSet
dictionary = corpora.Dictionary(corpus)
corpus = [dictionary.doc2bow(text) for text in corpus]
text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
if PCA_Applied:
pca = PCA(n_components=PCA_nComponents)
text_matrix = pca.fit_transform(text_matrix)
classifier = LogisticRegression()
classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
print('Logistic:')
print(classification_report(self.testLabel, pred_labels))
classifier = SVC()
classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
print('SVM:')
print(classification_report(self.testLabel, pred_labels))
def fitAndPredict(self):
corpus = self.trainingSet+self.testSet
dictionary = corpora.Dictionary(corpus)
corpus = [dictionary.doc2bow(text) for text in corpus]
model = models.TfidfModel(corpus)
corpus = [text for text in model[corpus]]
text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
if PCA_Applied:
pca = PCA(n_components=PCA_nComponents)
text_matrix = pca.fit_transform(text_matrix)
classifier = LogisticRegression()
classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
print('Logistic:')
print(classification_report(self.testLabel, pred_labels))
classifier = SVC()
classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
print('SVM:')
print(classification_report(self.testLabel, pred_labels))
def transfer_corpus(sents):
words_dict = invert_dict(corpora.Dictionary.load('words.dict'))
max_length = 40
sentence = numpy.zeros(shape=(len(sents), max_length),dtype=numpy.int32)
label = numpy.zeros(shape=(len(sents), max_length), dtype=numpy.int32)
lengths = []
for i in range(len(sents)):
current_sent = sents[i]
words = []
labels = []
lengths.append(len(current_sent))
for item in current_sent:
words.append(words_dict[item[0]])
labels.append(label_str[item[1]])
# pad every sentence to max_length; 28782 is used as the padding word id and 8 as the padding label id
sentence[i] = numpy.asarray(words + (max_length - len(current_sent)) * [28782], dtype=numpy.int32)
label[i] = numpy.asarray(labels + (max_length - len(current_sent)) * [8], dtype=numpy.int32)
return sentence,label,numpy.asarray(lengths,dtype=numpy.int32)
# train = train_ + valid_ = 16551
# test = test = 3327
def build_dictionary(generator, min_freq=5):
dictionary_path = os.path.join(DATA_PATH, DICT_NAME)
if os.path.exists(dictionary_path) and os.path.isfile(dictionary_path):
print("Delete dictionary and rebuild")
os.remove(dictionary_path)
dictionary = corpora.Dictionary(c + u for c, u in generator)
# collect the ids of tokens that appear in fewer than min_freq documents
filter_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if
docfreq < min_freq]
dictionary.filter_tokens(filter_ids)
dictionary.compactify()
dictionary.add_documents([_START_VOCAB])
pickle.dump(dictionary, open(dictionary_path, 'wb'))
print("SVAE dictionary to %s" % (dictionary_path))
return dictionary
def corpus2dict15(corpusfiles, lowercase=True):
""" From a given corpus, create a gensim dictionary for mapping words to ints, important: WMT15 data is already tokenized! """
corpus = list()
corpus.append(["PADDING"]) #has word index 0
corpus.append(["UNKNOWN"]) #has word index 1
for cf in corpusfiles:
if cf is not None: #source can be none
# lowercasing is disabled here so the huge lookup table keeps all word forms seen in pretraining
# if lowercase:
# corpus.extend([l.lower().split() for l in codecs.open(cf,"r","utf8").readlines()])
# else:
# corpus.extend([l.split() for l in codecs.open(cf,"r","utf8").readlines()])
corpus.extend([l.split() for l in codecs.open(cf,"r","utf8").readlines()])
wordDictionary = corpora.Dictionary(corpus)
#print "... build word dictionary with vocabulary size =", len(wordDictionary)
return wordDictionary
def train_lda_model_gensim(corpus, total_topics=2):
norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
dictionary = corpora.Dictionary(norm_tokenized_corpus)
mapped_corpus = [dictionary.doc2bow(text)
for text in norm_tokenized_corpus]
tfidf = models.TfidfModel(mapped_corpus)
corpus_tfidf = tfidf[mapped_corpus]
lda = models.LdaModel(corpus_tfidf,
id2word=dictionary,
iterations=1000,
num_topics=total_topics)
return lda
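# A minimal standalone LDA sketch mirroring the training call above, but run on toy
# pre-tokenized documents so it does not depend on the project's normalize_corpus().
def lda_sketch(total_topics=2):
    from gensim import corpora, models
    tokenized = [["apple", "banana", "fruit"],
                 ["goal", "football", "match"],
                 ["fruit", "juice", "apple"],
                 ["football", "league", "goal"]]
    dictionary = corpora.Dictionary(tokenized)
    bow_corpus = [dictionary.doc2bow(text) for text in tokenized]
    lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=total_topics, passes=10)
    # each entry is (topic_id, 'weight*"word" + weight*"word" + ...')
    return lda.print_topics(num_topics=total_topics, num_words=3)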
def reduce_dict(weibo_test):
dictionary = None
if not os.path.exists(path_tmp):
os.makedirs(path_tmp)
# if the dictionary has not been built yet, build it from the raw documents
if not os.path.exists(path_dictionary):
dictionary = corpora.Dictionary()
files = os_path.LoadFiles(path_doc_root)
for i, msg in enumerate(files):
catg = msg[0]
file = msg[1]
file = convert_doc_to_wordlist(file, cut_all=False)
dictionary.add_documents([file])
# drop tokens that appear in fewer than 5 documents
small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 5]
dictionary.filter_tokens(small_freq_ids)
dictionary.compactify()
dictionary.save(path_dictionary)
else:
# presumably the dictionary built in a previous run should be reused rather than passing None on
dictionary = corpora.Dictionary.load(path_dictionary)
svm_tfidf.reduce_tfidf(dictionary, weibo_test)
def createDictionary(extraLabel=""):
# TODO for the report: note the optimization done on the dictionary - it was ~700,000 words, now ~90,000
dic = Dictionary()
# note: corpora.Dictionary expects an iterable of token lists; Dictionary() is presumably the project's own document wrapper
d = corpora.Dictionary(dic)
d.filter_extremes(no_below=10, no_above=0.6, keep_n=None)
d.compactify()
# add the visual terms as words in the vocabulary too
d.add_documents([get_visual_terms_labels(config)])
extraLabel = extraLabel+"_"+config.dictionary_label
fName = 'data/dics/%s_%s.dict' % (pretty_current_time(), extraLabel)
d.save(fName+'.bin')
d.save_as_text(fName+'.txt')
setLastDictFileName(fName+'.bin')
logger.info('Dict created and saved to %s. Size: %i' % (fName, len(d)))
return d
def generate_training_data(self, options):
"""
build the dictionary and topic list from the training split and
generate (x_train, y_train) and (x_test, y_test)
"""
input_table = InputTable(options['threashold'])
(training, test) = input_table.fetch_data(options['ratio_test'],
options['seed'])
word_vecs_train = self.convert_to_word_vecs(training)
topic_vecs_train = self.convert_to_topic_vecs(training)
word_vecs_test = self.convert_to_word_vecs(test)
topic_vecs_test = self.convert_to_topic_vecs(test)
# use dictionary and topic_types of training set
dictionary = corpora.Dictionary(word_vecs_train)
all_topics = list(set(topic_vecs_train))
x_train = self.adjust_x_format(dictionary, word_vecs_train)
y_train = self.adjust_y_format(all_topics, topic_vecs_train)
x_test = self.adjust_x_format(dictionary, word_vecs_test)
y_test = self.adjust_y_format(all_topics, topic_vecs_test)
return (x_train, y_train, x_test, y_test, dictionary, all_topics)
def create_vocabulary(input_stream, vocab_size, sentence_to_tokens_fn=None):
t0 = time.time()
print(" [*] Creating a new vocabulary...")
if not sentence_to_tokens_fn:
sentence_to_tokens_fn = default_sentence_to_tokens
docs = []
lines = []
for line in input_stream:
rline = line.strip()
tokens = sentence_to_tokens_fn(rline)
if '##########' not in tokens and len(rline) > 0:
lines += [token.lower() for token in tokens if token.lower() not in cachedStopWords]
elif '##########' in tokens:
docs.append(lines)
lines = []
limit = np.abs(vocab_size - 4)  # reserve room for special tokens (presumably 4 of them)
vocab = corpora.Dictionary(docs)
vocab.filter_extremes(no_below=1, no_above=0.7, keep_n=limit)
print(" [*] Tokenize : %.4fs" % (time.time() - t0))
return vocab
def get_similarity(query, ans_list):
s_length = len(ans_list)
Corp = ans_list
# build a dictionary over the candidate answers
dictionary = corpora.Dictionary(Corp)
# convert each answer into its bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in Corp]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
vec_bow = dictionary.doc2bow(query)
vec_tfidf = tfidf[vec_bow]
index = similarities.MatrixSimilarity(corpus_tfidf)
sims = index[vec_tfidf]
similarity = list(sims)
# print(similarity)
end_length = len(similarity)
if s_length != end_length:
print('warning: similarity vector length does not match the number of answers')
return similarity
def load_dict(self, infile):
self.dict = corpora.Dictionary.load(infile)
def save_corpus(self, corpusfile, dictfile):
dictionary = corpora.Dictionary(self.lines)
corpus = [dictionary.doc2bow(line) for line in self.lines]
dictionary.save(dictfile)
corpora.MmCorpus.serialize(corpusfile, corpus)
def create_dictionary(self):
"""
Utility method to generate gensim-style Dictionary directly from
the corpus and vocabulary data.
"""
dictionary = Dictionary()
# replace dfs with defaultdict to avoid downstream KeyErrors
# uci vocabularies may contain terms that are not used in the document data
dictionary.dfs = defaultdict(int)
dictionary.id2token = self.id2word
dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))
dictionary.num_docs = self.num_docs
dictionary.num_nnz = self.num_nnz
for docno, doc in enumerate(self):
if docno % 10000 == 0:
logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))
for word, count in doc:
dictionary.dfs[word] += 1
dictionary.num_pos += count
return dictionary
def buildTokenDictionary(self):
"""
Map every token in self.segResponses to an integer id.
"""
self.tokenDictionary = corpora.Dictionary(self.segResponses)
logging.info("?????????%s" % str(self.tokenDictionary))
def getWordFreq(lib_texts):
from gensim import corpora, models, similarities
dictionary = corpora.Dictionary(lib_texts)
corpus = [dictionary.doc2bow(text) for text in lib_texts]
# each document becomes a sparse list of (token_id, frequency) pairs
return corpus
def train_by_lsi(lib_texts):
"""
Train an LSI model over lib_texts and build a similarity index.
"""
from gensim import corpora, models, similarities
# uncomment to enable logging and watch training progress
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
dictionary = corpora.Dictionary(lib_texts)
corpus = [dictionary.doc2bow(text) for text in lib_texts] # doc2bow(): convert a list of words into sparse (word_id, word_frequency) pairs
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
# train an LSI model with 10 latent topics
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
index = similarities.MatrixSimilarity(lsi[corpus]) # index is a gensim.similarities.docsim.MatrixSimilarity instance
return (index, dictionary, lsi)
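# A hedged usage sketch for train_by_lsi(): lib_texts is assumed to be a list of token
# lists, and the query string here is a placeholder.
def lsi_query_demo(lib_texts, query="machine learning tutorial"):
    index, dictionary, lsi = train_by_lsi(lib_texts)
    vec_bow = dictionary.doc2bow(query.lower().split())
    sims = index[lsi[vec_bow]]
    # library documents sorted from most to least similar to the query
    return sorted(enumerate(sims), key=lambda item: -item[1])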
# query handling -- compute the similarity between a target document and every document in the library
def query_tag(id2word, model, split_word):
# id2word = corpora.Dictionary.load(path+'.id2word')
# model = LdaMulticore.load(path+'.lda')
bow = id2word.doc2bow(split_word)
if len(bow) == 0:
return None
gamma, _ = model.inference([bow])
topic_dist = gamma[0] / sum(gamma[0]) # normalize distribution
# [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)]
return topic_dist
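# A sketch of how query_tag() appears to be meant to be used, following the two
# commented-out load calls above; `model_prefix` is a placeholder path, not from the source.
def query_tag_demo(model_prefix, sentence):
    from gensim import corpora
    from gensim.models import LdaMulticore
    id2word = corpora.Dictionary.load(model_prefix + '.id2word')
    model = LdaMulticore.load(model_prefix + '.lda')
    # returns a normalized topic distribution, or None if no token is in the dictionary
    return query_tag(id2word, model, sentence.lower().split())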
def main(argv):
cli_parser = make_cli_parser()
opts, args = cli_parser.parse_args(argv)
if len(args) != 2:
cli_parser.error("Please provide an input/output file")
if not os.path.isfile(args[1]+'.lda'):
if os.path.isfile(args[1]+'.bow2mm') and os.path.isfile(args[1]+'.id2word'):
id2word = corpora.Dictionary.load(args[1]+'.id2word')
else :
id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
# ignore words that appear in fewer than 5 documents or in more than 20% of documents
# filtering is disabled here: it can leave some document vectors empty, which causes problems downstream
# id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)
# save dictionary
id2word.save(args[1]+'.id2word')
# save doc2bow vector
corpora.MmCorpus.serialize(args[1]+'.bow2mm', iter_doc2bow(args[0], opts.numlines, id2word))
mm_corpus = corpora.MmCorpus(args[1]+'.bow2mm')
model=LdaMulticore(mm_corpus, id2word=id2word, num_topics=opts.numtopics, workers=opts.numprocs, passes=opts.numepochs)
model.save(args[1]+'.lda')
infile = open(args[0])
outfile = open(args[1]+'.csv', "w")
out_csvfile = csv.writer(outfile, delimiter =',')
in_csvfile = csv.reader(infile, delimiter=',')
for row in in_csvfile:
if row[0] == '0':  # csv.reader yields strings, so compare against the string '0'
break
processed_post = preprocess(row[3]).split()
if len(processed_post) == 0: # skip empty documents (nothing to tag)
continue
topic_dist = query_tag(id2word, model, processed_post)
if topic_dist is None: # doc2bow produced no known tokens
continue
result_list = row[1:3]
result_list.extend(topic_dist)
out_csvfile.writerow(result_list)
infile.close()
outfile.close()
#print query_tag(id2word, model, "Hello über, world is awesome!")
def load_model_and_dictionary(self):
self.tfidf_model = models.TfidfModel.load('../../temp_results/tfidf_model')
self.dictionary = corpora.Dictionary.load('../../temp_results/tfidf_dictionary')
print ("Dictionary & Model Loaded Successfully")