Example source code using Python's analyse()
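All of the snippets below are built on jieba's keyword-extraction module, jieba.analyse, which offers a TF-IDF extractor (extract_tags) and a TextRank extractor (textrank). A minimal sketch of the shared API, using a made-up sample sentence:

# -*- coding: utf-8 -*-
import jieba.analyse

text = u"小米手机性价比很高,拍照效果也不错"  # hypothetical sample text

# TF-IDF: top 5 keywords with their weights
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, weight)

# TextRank: top 5 keywords, limited to the listed parts of speech
for word in jieba.analyse.textrank(text, topK=5, allowPOS=('ns', 'n', 'vn', 'v')):
    print(word)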

semantic.py (project: chat, author: Decalogue)
from jieba import analyse

def get_tag(sentence, config):
    """
    Get semantic tag of sentence.
    """
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # list of (word, flag) tuples
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag
cut_text.py (project: internet-content-detection, author: liubo0621)
def set_stop_words(self, stop_words_path):
        '''
        @summary: set stop words
        ---------
        @param stop_words_path: path of the stop-words file
        ---------
        @result:
        '''

        abs_path = _get_abs_path(stop_words_path)
        if not os.path.isfile(abs_path):
            raise Exception("jieba: file does not exist: " + abs_path)

        with open(abs_path, 'rb') as f:
            content = f.read().decode('utf-8')
        for line in content.splitlines():
            self._stop_words.add(line)

        jieba.analyse.set_stop_words(stop_words_path)  # the analyse module keeps its own stop-word list, so set it as well
analysis.py (project: my_bit_v1, author: iSawyer)
import jieba.analyse

def text_rank():
    db = query_DB()
    stop_words = load_stopwords()
    for sample in db.get_one():
        author = sample[3]
        title = sample[1]
        content = sample[2]
        reply_number = sample[-1]
        if author == 'mikki' or author == u'??':
            continue
        if reply_number >= 3:
            title_seg = jieba.analyse.textrank(title, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
            for word, weight in title_seg:
                weight *= 0.7 * (float(reply_number) / max_reply)
                db.write_textrank(word, weight)

        #content_seg = jieba.analyse.textrank(content,topK=8,withWeight=True,allowPOS=('ns','n','vn','v'))
        #for word,weight in content_seg:
            #weight *= 0.3 * (float(reply_number) / max_reply)
            #db.write_textrank(word,weight)
search.py (project: Commodity-analysis, author: buhuipao)
import jieba
import jieba.analyse

def extract_tags(key_word, a_name):
    '''
    Take the first eight tokens of the product name in their original order and
    keep only those that also appear in the extracted keywords; the words near
    the front of a JD product title tend to matter more, so this keeps tags
    that are both important and in their original order. The search keyword is
    then put first and the result is capped at five tags.
    '''
    cut_tags = [tag for tag in jieba.cut(a_name)][:8]
    analyse_tags = jieba.analyse.extract_tags(a_name)
    tags = [tag for tag in cut_tags if tag in analyse_tags]
    # make sure the search keyword sits at the front of the tag list
    try:
        tags.remove(key_word)
    except ValueError:
        pass
    tags.insert(0, key_word)
    if len(tags) > 5:
        tags = tags[:5]
    return ' '.join(tags)
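A hypothetical call, with an invented key word and product title, to show the intended shape of the result:

# Hypothetical usage; both arguments below are made up.
tags = extract_tags(u"手机", u"小米手机 红米Note4X 全网通版 4GB内存")
print(tags)  # the search keyword first, then up to four title tags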
InverseIndex.py (project: NewsSpider, author: lzjqsdd)
def loadDataFromCutFile(self,totalnum):
        doc = []
        cut = Cut()
        for i in range(1,totalnum):
            line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
            if not line:
                break
            data = json.loads(line)
            keyword = analyse.extract_tags(data['content'],topK=20)
            seg = " ".join(keyword)
            print(seg)
            doc.append(seg)
        return doc


    #calculate tf-idf
question.py (project: QA, author: KiddoZhu)
def __call__(self, question) :
        # print(question.questionSentence)
        qSentence = question.questionSentence
        # question.wordsToken = list(jieba.cut(qSentence))
        question.wordsToken, question.posToken = getPosToken(qSentence)
        assert len(question.wordsToken) == len(question.posToken)
        # print 'Length words Token = %d'%(len(question.wordsToken))
        # print 'Length pos token = %d'%(len(question.posToken))
        question.keyWordToken = list(jieba.analyse.extract_tags(qSentence, topK=5))
        # print ' '.join(question.keyWordToken)
        # dependency = parser.parse(words).next()
        # print '/'.join(question.wordsToken)
        # for word, flag in question.posToken:
        #   print('%s %s'%(word, flag))
        question.questionType, question.answerType = getQuestionType(question.questionSentence)
        question.getAnswerTemp()
        # my_print(question.answerTemp)
        # print question.answerRe
word_segment.py (project: http_server, author: chenguolin)
def cut_with_weight(self, sentence):
        """
        Cut word string with weight

        @sentence: word string

        return list or None
        ["word1`weight1", "word2`weight2" ...]
        """
        try:
            top_k = 2147483647  # INT_MAX, i.e. no limit on the number of tags
            seg_list = jieba.analyse.extract_tags(sentence, topK=top_k, withWeight=True)
            return [item[0].encode('utf-8')+'`'+str(item[1]) for item in seg_list]
        except Exception as e:
            logger.error('cut sentence:[%s] exception:[%s]' % (sentence, str(e)))
            return None
feature_extractor.py (project: CNKICrawler, author: roliygu)
import jieba
import jieba.analyse
import jieba.posseg as pseg

def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
hot_words.py (project: LagouJob, author: EclipseXuLu)
import jieba
import jieba.analyse
import pandas as pd

def get_hot_words(text):
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)
    df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=()))
    print(df)
    df.to_excel('./hotwords/DM.xlsx', 'DM')
manage.py (project: zsky, author: wenguonideshou)
def detail(info_hash):
    conn,curr = sphinx_conn()
    querysql='SELECT * FROM film WHERE info_hash=%s'
    curr.execute(querysql, (info_hash,))
    result=curr.fetchone()
    sphinx_close(curr,conn)
    #hash=Search_Hash.query.filter_by(id=id).first()
    if not result:
        return redirect(url_for('index'))        
    fenci_list=jieba.analyse.extract_tags(result['name'], 8)
    tags=Search_Tags.query.order_by(Search_Tags.id.desc()).limit(20)
    form=SearchForm()
    return render_template('detail.html',form=form,tags=tags,hash=result,fenci_list=fenci_list,sitename=sitename)
jieba_call.py (project: HtmlExtract-Python, author: xinyi-spark)
import jieba.analyse

def jieba_textrank(data, topK=20, withWeight=False, allowPOS=('nz', 'nt', 'ns', 'nr', 'n', 'vn')):
    '''
    Extract keywords with TextRank. topK is the number of keywords to return
    (default 20), withWeight controls whether to return weights, and allowPOS
    restricts the allowed parts of speech.
    '''
    keyword_list = []
    # weights are requested internally, but only the words are kept
    for w in jieba.analyse.textrank(data, topK=topK, withWeight=True, allowPOS=allowPOS):
        keyword_list.append(w[0])
    keyword = '/'.join(keyword_list)
    return keyword
jieba_call.py (project: HtmlExtract-Python, author: xinyi-spark)
import jieba.analyse

def jieba_tfidf(data, topK=20, withWeight=False, allowPOS=('nz', 'nt', 'ns', 'nr', 'n', 'vn')):
    '''
    Extract keywords with TF-IDF. topK is the number of keywords to return
    (default 20), withWeight controls whether to return weights, and allowPOS
    restricts the allowed parts of speech.
    '''
    temp_result = jieba.analyse.extract_tags(
        data, topK, withWeight, allowPOS)
    temp_result = '/'.join(temp_result)
    return temp_result
semantic.py (project: chat, author: Decalogue)
import jieba
from jieba import analyse, posseg

def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with the default flag of the word segmentation tool.

    Args:
        pattern: 'w' for plain words, 'k' for the top keyword, 't' for the
            top 10 keywords, 'wf' for (word, flag) pairs, 'tf' for the top
            10 keywords with their flags.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        # if item.word not in punctuation_all]
        # Modified 2017-04-27
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
views.py (project: search, author: twd2)
def page_tags(request, pk):
    import jieba.analyse
    page = Page.objects.get(pk=pk)
    tags = jieba.analyse.extract_tags(page.content)
    return render(request, 'tags.html', {'title': 'Tags',
                                         'page': page, 'tags': tags})
keyword_extraction.py (project: finance_news_analysis, author: pskun)
def extarctTextRankKeywords(self, doc_str, window=5):
        ''' Extract keywords with TextRank.
            Reference: http://www.letiantian.me/2014-12-01-text-rank/
        '''
        # note: the window parameter is currently unused
        keywords = jieba.analyse.textrank(doc_str, withWeight=True)
        return keywords
keyword_extraction.py (project: finance_news_analysis, author: pskun)
def initTfidfKeywords(self, idf_file=None):
        ''' For TF-IDF keyword extraction, a custom IDF corpus can be loaded here. '''
        self.words_idf = {}
        if idf_file is not None:
            jieba.analyse.set_idf_path(idf_file)
            '''
            for line in codecs.open(idf_file, 'r', 'utf-8'):
                word, idf_value = line.strip().split()
                self.words_idf[word] = float(idf_value)
            pass
            '''
keyword_extraction.py (project: finance_news_analysis, author: pskun)
def extractTfidfKeywords(self, doc_str):
        keywords = jieba.analyse.extract_tags(doc_str, withWeight=True)
        return keywords
tfidf_top.py (project: Malicious_Domain_Whois, author: h-j-13)
import jieba.analyse

def get_top_words(top, filename):
    topK = top
    with open(filename, 'rb') as f:
        content = f.read()
    tags = jieba.analyse.extract_tags(content, topK=topK)
    # items = str(tags).replace('u\'', '\'').decode("unicode-escape")
    return tags
cut_text.py (project: internet-content-detection, author: liubo0621)
def cut_for_keyword(self, text, with_weight = False, top_keyword_count = None):
        '''
        @summary: extract keywords
        ---------
        @param text: the text to analyse
        @param with_weight: whether to return weights; if True, items are (keyword, word_weight) pairs
        @param top_keyword_count: return only the top N keywords; None returns all
        ---------
        @result:
        '''
        result = jieba.analyse.extract_tags(text, topK=top_keyword_count, withWeight=with_weight)
        return result
WordExtractor.py (project: Rnews, author: suemi994)
def extractKeyWordByTFIDF(self,sentence):
        wordList=[]
        if self.conf["threshold"]:
            threshold=self.conf["threshold"]
            tmpList=jieba.analyse.extract_tags(sentence,topK=self.conf["topK"],withWeight=True,allowPOS=self.conf["allowPOS"])
            for pair in tmpList:
                if pair[1]>=threshold:
                    wordList.append(pair[0])
        else:
            wordList=list(jieba.analyse.extract_tags(sentence,topK=self.conf["topK"],withWeight=self.conf["withWeight"],allowPOS=self.conf["allowPOS"]))
        return wordList
WordExtractor.py (project: Rnews, author: suemi994)
def extractKeyWordByTextRank(self,sentence):
        wordList=[]
        if self.conf["threshold"]:
            threshold=self.conf["threshold"]
            tmpList=jieba.analyse.textrank(sentence,topK=self.conf["topK"],withWeight=True,allowPOS=self.conf["allowPOS"])
            for pair in tmpList:
                if pair[1]>=threshold:
                    wordList.append(pair[0])
        else:
            wordList=list(jieba.analyse.textrank(sentence,topK=self.conf["topK"],withWeight=self.conf["withWeight"],allowPOS=self.conf["allowPOS"]))
        return wordList
Chatbot.py (project: AIZooService, author: zhanglbjames)
def __get_model_answer(self, question):
        tag1 = jieba.analyse.extract_tags(question, 3)
        tag2 = jieba.analyse.textrank(question, 3)
        keywords = []

        for tag in tag1:
            keywords.append(tag)
        for tag in tag2:
            if tag not in tag1:
                keywords.append(tag)

        tr4w = TextRank4Keyword()
        tr4w.analyze(text=question, lower=True, window=2)
        for item in tr4w.get_keywords(20, word_min_len=1):
            if item.word not in keywords:
                keywords.append(item.word)

        kstr = ""
        for k in keywords:
            if len(k) != 1:
                kstr = kstr + "AND" + k
            else:
                if k not in kstr:
                    kstr = kstr + "AND" + k
                    # print(k)
        estr = kstr[3:]
        print(estr)
        q = self.__parser.parse(estr)
        results = self.__searcher.search(q)
        return results
keywords_extract.py (project: wende, author: h404bi)
import jieba.analyse

def keywords_extract(question):
    jieba.analyse.set_stop_words(stopwords)
    rv = jieba.analyse.extract_tags(question, topK=10, withWeight=True)

    return rv
simplyParticiple.py (project: test_jieba, author: donttal)
import jieba.analyse

def participle(content):
    tags = jieba.analyse.extract_tags(content, topK=topK)  # topK is a module-level setting
    print(tags)
    return '/'.join(tags)
main.py (project: jieba-GAE, author: liantian-cn)
def analyse_tfidf():
    text = request.values.get('text', "text")
    topK = request.values.get("topK", default="20")
    if topK in [str(x) for x in range(3, 41)]:
        topK = int(topK)
    else:
        topK = 20
    withWeight = request.values.get("withWeight", default="0")
    if withWeight in ['0', '1']:
        withWeight = bool(int(withWeight))
    else:
        withWeight = True

    result = list(jieba.analyse.extract_tags(text, topK=topK, withWeight=withWeight))
    return jsonify(text=text, topK=topK, withWeight=withWeight, result=result)
main.py (project: jieba-GAE, author: liantian-cn)
def analyse_textrank():
    text = request.values.get('text', "text")
    topK = request.values.get("topK", default="20")
    if topK in [str(x) for x in range(3, 41)]:
        topK = int(topK)
    else:
        topK = 20
    withWeight = request.values.get("withWeight", default="0")
    if withWeight in ['0', '1']:
        withWeight = bool(int(withWeight))
    else:
        withWeight = True
    result = list(jieba.analyse.textrank(text, topK=topK, withWeight=withWeight))
    return jsonify(text=text, topK=topK, withWeight=withWeight, result=result)
process_stuff.py (project: momoCrawler, author: njames741)
def get_keywords(self, all_text):
        kw_list = jieba.analyse.extract_tags(all_text, topK=10, withWeight=False, allowPOS=())
        # return set(kw_list)
        for kw in kw_list:
            print(kw)
fenci.py (project: SinaWeiboSpider, author: SuperSaiyanSSS)
def test_if_has_keyword(self, weibo_text):
        content = weibo_text
        tags = jieba.analyse.extract_tags(content, topK=self.topK)

        for tag in tags:
            if tag in self.mingan_list:
                print("6666666")
                print(content)
                print(tag)
                return True
            else:
                print("no")
        return False
keywords.py (project: appledaily_hk_hot_keyword_pipeline, author: howawong)
def get_keywords(self, content):
        tags = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('n',))
        tags = [tag for tag in tags if len(tag) > 2]
        return tags
storage.py (project: aibot, author: Qiware)
def insert_into_reverse_dict(self, hash_val, text):
        """
        Purpose: insert the text into the reverse (inverted) dictionary.
        Args:
            @hash_val: hash of the text
            @text: the text itself
        Logic: keep only words whose weight is at least 20% of the average
            weight (controlled by self.rate), drop the rest, and index the
            text under the surviving keys.
        """
        word_num = 0
        weight_avg = 0
        weight_total = 0

        word_list = []
        weight_list = []

        # extract weighted keywords
        word_with_weight = jieba.analyse.extract_tags(text, withWeight=True)
        for word, weight in word_with_weight:
            word_num += 1
            weight_total += float(weight)
        if word_num > 0:
            weight_avg = weight_total / word_num
        for word, weight in word_with_weight:
            if weight < (self.rate * weight_avg):
                break
            word_list.append(word)
            weight_list.append(weight)

        # build the reverse-index keys
        list_len = len(word_list)
        key_list = self.gen_key_list(word_list, weight_list, list_len, self.word_max_len)
        for key in key_list:
            self.reverse_dict.add(key, 100, hash_val)  # map key -> hash

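The relative-weight filter above is easy to exercise on its own. A minimal sketch, assuming rate=0.2 to match the 20% cutoff described in the docstring (the helper name is invented here):

import jieba.analyse

def filter_by_relative_weight(text, rate=0.2):
    # keep only words whose TF-IDF weight reaches `rate` times the average weight
    pairs = jieba.analyse.extract_tags(text, withWeight=True)
    if not pairs:
        return []
    avg = sum(weight for _, weight in pairs) / len(pairs)
    return [word for word, weight in pairs if weight >= rate * avg]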
