def get_tag(sentence, config):
"""
Get semantic tag of sentence.
"""
iquestion = sentence.format(**config)
try:
keywords = analyse.extract_tags(iquestion, topK=1)
keyword = keywords[0]
except IndexError:
keyword = iquestion
tags = synonym_cut(keyword, 'wf') # tuple list
if tags:
tag = tags[0][1]
if not tag:
tag = keyword
else:
tag = keyword
return tag
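# Usage sketch for get_tag: `sentence` may contain str.format placeholders
# filled from `config`. The template and config below are illustrative only.
demo_config = {"name": "iPhone"}
print(get_tag("{name}多少钱", demo_config))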
Python analyse() example source code
def set_stop_words(self, stop_words_path):
'''
@summary: set the stop words used when cutting
---------
@param stop_words_path: path to a UTF-8 stop words file, one word per line
---------
@result:
'''
abs_path = _get_abs_path(stop_words_path)
if not os.path.isfile(abs_path):
raise Exception("jieba: file does not exist: " + abs_path)
content = open(abs_path, 'rb').read().decode('utf-8')
for line in content.splitlines():
self._stop_words.add(line)
jieba.analyse.set_stop_words(stop_words_path) # the analyse module keeps its own stop word list, so set it as well
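# A standalone sketch of the same idea with jieba.analyse directly;
# ./stop_words.txt is a hypothetical UTF-8 file with one word per line:
import jieba.analyse

jieba.analyse.set_stop_words('./stop_words.txt')
print(jieba.analyse.extract_tags('今天天气不错，适合出去走走', topK=3))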
def text_rank():
db = query_DB()
stop_words = load_stopwords()
for sample in db.get_one():
author = sample[3]
title = sample[1]
content = sample[2]
reply_number = sample[-1]
if author == 'mikki' or author == u'??':
continue
if reply_number >= 3:
title_seg = jieba.analyse.textrank(title, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
for word, weight in title_seg:
weight *= 0.7 * (float(reply_number) / max_reply)
db.write_textrank(word, weight)
#content_seg = jieba.analyse.textrank(content,topK=8,withWeight=True,allowPOS=('ns','n','vn','v'))
#for word,weight in content_seg:
#weight *= 0.3 * (float(reply_number) / max_reply)
#db.write_textrank(word,weight)
def extract_tags(key_word, a_name):
'''
Build the tag string for a product: cut the product name with jieba.cut,
keep the first 8 words that also appear in jieba.analyse.extract_tags(a_name),
put the search keyword first, and return at most 5 tags joined by spaces.
'''
cut_tags = [tag for tag in jieba.cut(a_name)][:8]
analyse_tags = jieba.analyse.extract_tags(a_name)
tags = [tag for tag in cut_tags if tag in analyse_tags]
# key_word may already be one of the tags; remove it so it is only inserted once, at the front
try:
tags.remove(key_word)
except ValueError:
pass
tags.insert(0, key_word)
if len(tags) > 5:
tags = tags[:5]
return ' '.join(tags)
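# Usage sketch with a hypothetical keyword and product name; the result is
# at most five space-separated tags with key_word guaranteed to come first:
print(extract_tags('手机', 'Apple iPhone 8 Plus 64GB 全网通 手机'))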
def loadDataFromCutFile(self,totalnum):
doc = []
cut = Cut()
for i in range(1,totalnum):
line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
if not line:
break
data = json.loads(line)
keyword = analyse.extract_tags(data['content'], topK=20)
seg = " ".join(keyword)
print(seg)
doc.append(seg)
return doc
# calculate TF-IDF
def __call__(self, question) :
# print(question.questionSentence)
qSentence = question.questionSentence
# question.wordsToken = list(jieba.cut(qSentence))
question.wordsToken, question.posToken = getPosToken(qSentence)
assert len(question.wordsToken) == len(question.posToken)
# print 'Length words Token = %d'%(len(question.wordsToken))
# print 'Length pos token = %d'%(len(question.posToken))
question.keyWordToken = list(jieba.analyse.extract_tags(qSentence, topK=5))
# print ' '.join(question.keyWordToken)
# dependency = parser.parse(words).next()
# print '/'.join(question.wordsToken)
# for word, flag in question.posToken:
# print('%s %s'%(word, flag))
question.questionType, question.answerType = getQuestionType(question.questionSentence)
question.getAnswerTemp()
# my_print(question.answerTemp)
# print question.answerRe
def cut_with_weight(self, sentence):
"""
Cut word string with weight
@sentence: word string
return list or None
["word1`weight1", "word2`weight2" ...]
"""
try:
top_k = 2147483647  # effectively unlimited: keep every word
seg_list = jieba.analyse.extract_tags(sentence, topK=top_k, withWeight=True)
return [item[0] + '`' + str(item[1]) for item in seg_list]
except Exception as e:
logger.error('cut sentence:[%s] exception:[%s]' % (sentence, str(e)))
return None
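# A standalone sketch of the "word`weight" format this method produces
# (the sample sentence is illustrative):
import jieba.analyse

for word, weight in jieba.analyse.extract_tags('自然语言处理很有趣', topK=5, withWeight=True):
    print('%s`%s' % (word, weight))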
def jieba_example():
raw = "????S5????,123,?,?"
raw_seq = jieba.cut(raw)
raw_seq_list = jieba.lcut(raw)
raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
raw_with_ictclas = pseg.cut(raw)
for word, flag in raw_with_ictclas:
print(word, flag)
def get_hot_words(text):
jieba.analyse.set_stop_words(STOPWORDS_PATH)
jieba.load_userdict(USER_CORPUS)
df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=()))
print(df)
df.to_excel('./hotwords/DM.xlsx', 'DM')
def detail(info_hash):
conn, curr = sphinx_conn()
querysql = 'SELECT * FROM film WHERE info_hash=%s'
curr.execute(querysql, info_hash)
result = curr.fetchone()
sphinx_close(curr, conn)
#hash=Search_Hash.query.filter_by(id=id).first()
if not result:
return redirect(url_for('index'))
fenci_list=jieba.analyse.extract_tags(result['name'], 8)
tags=Search_Tags.query.order_by(Search_Tags.id.desc()).limit(20)
form=SearchForm()
return render_template('detail.html',form=form,tags=tags,hash=result,fenci_list=fenci_list,sitename=sitename)
def jieba_textrank(data, topK=20, withWeight=False, allowPOS=('nz', 'nt', 'ns', 'nr', 'n', 'vn')):
'''
Extract keywords with TextRank.
topK: how many keywords to return (default 20).
withWeight: accepted for symmetry, but weights are always stripped from the joined result.
allowPOS: POS tags allowed in the result.
'''
keyword_list = []
for w in jieba.analyse.textrank(data, topK=topK, withWeight=True, allowPOS=allowPOS):
keyword_list.append(w[0])
keyword = '/'.join(keyword_list)
return keyword
def jieba_tfidf(data, topK=20, withWeight=False, allowPOS=('nz', 'nt', 'ns', 'nr', 'n', 'vn')):
'''
Extract keywords with TF-IDF.
topK: how many keywords to return (default 20).
withWeight: must stay False here, since the plain keywords are joined into a single string.
allowPOS: POS tags allowed in the result.
'''
temp_result = jieba.analyse.extract_tags(
data, topK, withWeight, allowPOS)
temp_result = '/'.join(temp_result)
return temp_result
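# Usage sketch comparing the two extractors on a hypothetical headline;
# both return the keywords joined by '/':
news = '央行宣布下调存款准备金率，市场流动性有望改善'
print(jieba_textrank(news))
print(jieba_tfidf(news))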
def synonym_cut(sentence, pattern="wf"):
"""Cut the sentence into a synonym vector tag.
??????????????
If a word in this sentence was not found in the synonym dictionary,
it will be marked with default value of the word segmentation tool.
????????????????????????
Args:
pattern: 'w'-??, 'k'-??????'t'-?????, 'wf'-????, 'tf-?????'?
"""
sentence = sentence.rstrip(tone_words)
synonym_vector = []
if pattern == "w":
result = list(jieba.cut(sentence))
synonym_vector = [item for item in result if item not in punctuation_all]
elif pattern == "k":
synonym_vector = analyse.extract_tags(sentence, topK=1)
elif pattern == "t":
synonym_vector = analyse.extract_tags(sentence, topK=10)
elif pattern == "wf":
result = posseg.cut(sentence)
for item in result:
if item.word not in punctuation_all:
if len(item.flag) < 4:
item.flag = list(posseg.cut(item.word))[0].flag
synonym_vector.append((item.word, item.flag))
elif pattern == "tf":
result = posseg.cut(sentence)
tags = analyse.extract_tags(sentence, topK=10)
for item in result:
if item.word in tags:
synonym_vector.append((item.word, item.flag))
return synonym_vector
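# Usage sketch for the different patterns (output depends on the loaded
# synonym dictionary and tokenizer; the sentence is illustrative):
print(synonym_cut('今天天气怎么样', 'w'))   # plain word list
print(synonym_cut('今天天气怎么样', 'k'))   # top-1 keyword
print(synonym_cut('今天天气怎么样', 'wf'))  # [(word, flag), ...]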
def page_tags(request, pk):
import jieba.analyse
page = Page.objects.get(pk=pk)
tags = jieba.analyse.extract_tags(page.content)
return render(request, 'tags.html', {'title': 'Tags',
'page': page, 'tags': tags})
def extractTextRankKeywords(self, doc_str, window=5):
    ''' Extract keywords with TextRank.
    Reference: http://www.letiantian.me/2014-12-01-text-rank/
    Note: window is kept for API compatibility but jieba.analyse.textrank does not use it.
    '''
    keywords = jieba.analyse.textrank(doc_str, withWeight=True)
    return keywords
def initTfidfKeywords(self, idf_file=None):
    ''' Initialise TF-IDF keyword extraction; an optional custom IDF file replaces jieba's built-in IDF table '''
    self.words_idf = {}
    if idf_file is not None:
        jieba.analyse.set_idf_path(idf_file)
        '''
        for line in codecs.open(idf_file, 'r', 'utf-8'):
            word, idf_value = line.strip().split()
            self.words_idf[word] = float(idf_value)
        '''
def extractTfidfKeywords(self, doc_str):
keywords = jieba.analyse.extract_tags(doc_str, withWeight=True)
return keywords
def get_top_words(top, filename):
    topK = top
    content = open(filename, 'rb').read().decode('utf-8')
    tags = jieba.analyse.extract_tags(content, topK=topK)
    return tags
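# Usage sketch; ./article.txt is a hypothetical UTF-8 text file:
print(get_top_words(10, './article.txt'))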
def cut_for_keyword(self, text, with_weight=False, top_keyword_count=None):
'''
@summary: extract keywords
---------
@param text: the text to analyse
@param with_weight: whether to return weights; if True each item is a (keyword, weight) pair
@param top_keyword_count: return the top N keywords; None returns them all
---------
@result:
'''
result = jieba.analyse.extract_tags(text, topK=top_keyword_count, withWeight=with_weight)
return result
def extractKeyWordByTFIDF(self, sentence):
    wordList = []
    if self.conf["threshold"]:
        threshold = self.conf["threshold"]
        tmpList = jieba.analyse.extract_tags(sentence, topK=self.conf["topK"], withWeight=True, allowPOS=self.conf["allowPOS"])
        for pair in tmpList:
            if pair[1] >= threshold:
                wordList.append(pair[0])
    else:
        wordList = list(jieba.analyse.extract_tags(sentence, topK=self.conf["topK"], withWeight=self.conf["withWeight"], allowPOS=self.conf["allowPOS"]))
    return wordList
def extractKeyWordByTextRank(self, sentence):
    wordList = []
    if self.conf["threshold"]:
        threshold = self.conf["threshold"]
        tmpList = jieba.analyse.textrank(sentence, topK=self.conf["topK"], withWeight=True, allowPOS=self.conf["allowPOS"])
        for pair in tmpList:
            if pair[1] >= threshold:
                wordList.append(pair[0])
    else:
        wordList = list(jieba.analyse.textrank(sentence, topK=self.conf["topK"], withWeight=self.conf["withWeight"], allowPOS=self.conf["allowPOS"]))
    return wordList
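# A sketch of the conf dict both methods above expect; the keys are inferred
# from the code, the values are illustrative:
conf = {
    "threshold": 0.1,                    # minimum weight to keep; falsy disables filtering
    "topK": 20,                          # number of candidate keywords
    "withWeight": False,                 # only consulted when threshold filtering is off
    "allowPOS": ('ns', 'n', 'vn', 'v'),  # POS tags passed through to jieba
}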
def __get_model_answer(self, question):
tag1 = jieba.analyse.extract_tags(question, 3)
tag2 = jieba.analyse.textrank(question, 3)
keywords = []
for tag in tag1:
keywords.append(tag)
for tag in tag2:
if tag not in tag1:
keywords.append(tag)
tr4w = TextRank4Keyword()
tr4w.analyze(text=question, lower=True, window=2)
for item in tr4w.get_keywords(20, word_min_len=1):
if item.word not in keywords:
keywords.append(item.word)
kstr = ""
for k in keywords:
if len(k) != 1:
kstr = kstr + "AND" + k
else:
if k not in kstr:
kstr = kstr + "AND" + k
# print(k)
estr = kstr[3:]
print(estr)
q = self.__parser.parse(estr)
results = self.__searcher.search(q)
return results
def keywords_extract(question):
jieba.analyse.set_stop_words(stopwords)
rv = jieba.analyse.extract_tags(question, topK=10, withWeight=True)
return rv
def participle(content):
    tags = jieba.analyse.extract_tags(content, topK=topK)
    print(tags)
    return '/'.join(tags)
def analyse_tfidf():
text = request.values.get('text', "text")
topK = request.values.get("topK", default="20")
if topK in [str(x) for x in range(3,41)]:
topK = int(topK)
else:
topK = 20
withWeight = request.values.get("withWeight", default="0")
if withWeight in ['0', '1']:
withWeight = bool(int(withWeight))
else:
withWeight = True
result = list(jieba.analyse.extract_tags(text, topK=topK, withWeight=withWeight))
return jsonify(text=text, topK=topK, withWeight=withWeight, result=result)
def analyse_textrank():
text = request.values.get('text', "text")
topK = request.values.get("topK", default="20")
if topK in [str(x) for x in range(3,41)]:
topK = int(topK)
else:
topK = 20
withWeight = request.values.get("withWeight", default="0")
if withWeight in ['0', '1']:
withWeight = bool(int(withWeight))
else:
withWeight = True
result = list(jieba.analyse.textrank(text, topK=topK, withWeight=withWeight))
return jsonify(text=text, topK=topK, withWeight=withWeight, result=result)
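# Client-side sketch for the two views above (assumes they are routed at
# /tfidf and /textrank on a local Flask app; both URLs are hypothetical):
import requests

r = requests.post('http://localhost:5000/tfidf',
                  data={'text': '今天天气不错', 'topK': '5', 'withWeight': '1'})
print(r.json())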
def get_keywords(self, all_text):
kw_list = jieba.analyse.extract_tags(all_text, topK=10, withWeight=False, allowPOS=())
# return set(kw_list)
for kw in kw_list:
print(kw)
def test_if_has_keyword(self, weibo_text):
    content = weibo_text
    tags = jieba.analyse.extract_tags(content, topK=self.topK)
    for tag in tags:
        if tag in self.mingan_list:
            print("6666666")
            print(content)
            print(tag)
            return True
    # only report failure after every tag has been checked
    print("no")
    return False
def get_keywords(self, content):
    # nouns only via TextRank; allowPOS needs a tuple, hence ('n',)
    tags = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('n',))
    tags = [tag for tag in tags if len(tag) > 2]
    return tags
def insert_into_reverse_dict(self, hash_val, text):
"""
????: ??????
????:
@hash: ??text????
@text: ??text
????: ??????????????20%?, ???????, ????????.
"""
word_num = 0
weight_avg = 0
weight_total = 0
word_list = []
weight_list = []
# extract weighted keywords
word_with_weight = jieba.analyse.extract_tags(text, withWeight=True)
for word, weight in word_with_weight:
word_num += 1
weight_total += float(weight)
if word_num > 0:
weight_avg = weight_total / word_num
# extract_tags returns words sorted by weight, so stop at the first one below the threshold
for word, weight in word_with_weight:
if weight < (self.rate * weight_avg):
break
word_list.append(word)
weight_list.append(weight)
# generate the key list for the inverted index
list_len = len(word_list)
key_list = self.gen_key_list(word_list, weight_list, list_len, self.word_max_len)
for key in key_list:
self.reverse_dict.add(key, 100, hash_val)  # map key -> hash
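# gen_key_list is not shown in this snippet. A hypothetical sketch, assuming
# each key joins a combination of at most max_len of the kept words:
from itertools import combinations

def gen_key_list_sketch(word_list, weight_list, list_len, max_len):
    keys = []
    for n in range(1, min(list_len, max_len) + 1):
        for combo in combinations(word_list, n):
            keys.append(''.join(combo))
    return keys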