import jieba.posseg as pseg

def cut(contents):
    # POS-tag each line and rejoin the words with single spaces
    split_contents = []
    for line in contents:
        res = pseg.cut(line.strip())
        split_line = ' '.join([w.word for w in res])
        split_contents.append(split_line)
    return split_contents
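A minimal usage sketch for this helper, assuming `jieba.posseg` is imported as `pseg` in the same module; the sample sentences are arbitrary:

lines = ["我来到北京清华大学", "他来到了网易杭研大厦"]
for tokenized in cut(lines):
    print(tokenized)  # each line comes back as space-separated words, e.g. "我 来到 北京 清华大学"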
Example source code for Python's cut()
def testDefaultCut(self):
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
        result = list(result)
        assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testDefaultCut", file=sys.stderr)
def testCutAll(self):
    for content in test_contents:
        result = jieba.cut(content, cut_all=True)
        assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutAll error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testCutAll", file=sys.stderr)
def testSetDictionary(self):
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
        result = list(result)
        assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
def testPosseg(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
def testDefaultCut_NOHMM(self):
    for content in test_contents:
        result = jieba.cut(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
        result = list(result)
        assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testDefaultCut_NOHMM", file=sys.stderr)
def cuttest(test_sent):
    result = pseg.cut(test_sent, HMM=False)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')
    print("")
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                    if the POS of w is not in this list, it will be filtered.
        - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                    if False, return a list of words
    """
    self.pos_filt = frozenset(allowPOS)
    g = UndirectWeightedGraph()
    cm = defaultdict(int)
    words = tuple(self.tokenizer.cut(sentence))
    for i, wp in enumerate(words):
        if self.pairfilter(wp):
            for j in xrange(i + 1, i + self.span):
                if j >= len(words):
                    break
                if not self.pairfilter(words[j]):
                    continue
                if allowPOS and withFlag:
                    cm[(wp, words[j])] += 1
                else:
                    cm[(wp.word, words[j].word)] += 1
    for terms, w in cm.items():
        g.addEdge(terms[0], terms[1], w)
    nodes_rank = g.rank()
    if withWeight:
        tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags
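The same algorithm is exposed through jieba's bundled helper, so a hedged quick-start (the input text is arbitrary):

import jieba.analyse

text = "线程是程序执行时的最小单位，它是进程的一个执行流。"
# top 5 keywords limited to nouns and verbs, returned with their TextRank weights
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, round(weight, 4))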
def get_hot_noun_counts(source_file):
    f = open(source_file, "r")
    data = f.read()
    # matches chat-log headers such as '2016-06-24 15:42:52 <nickname>(40**21)'
    re_pat = r'[\d-]{10}\s[\d:]{7,8}\s+[^\n]+\d{5,11}\)'
    # li = re.findall(re_pat, data)
    li_content = re.split(re_pat, data)
    s = ""
    for l in li_content:
        s = s + l
    seg_list = pseg.cut(s.strip())
    # collect every place-name token (POS flag 'ns')
    lists = []
    for w in seg_list:
        if w.flag == "ns":
            lists.append(w.word)
    seg_list_norepeat = set(lists)
    # count the occurrences of each distinct place name
    word_set = {}
    for seg in seg_list_norepeat:
        count = 0
        for ss in lists:
            if ss == seg:
                count += 1
        word_set[seg] = count
    word_tuple_sort = sorted(word_set.items(), key=lambda e: e[1], reverse=True)
    return word_tuple_sort
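The nested counting loop above is quadratic in the number of extracted place names; collections.Counter produces the same ranking in one pass. A sketch under that assumption, where `words` plays the role of the `lists` variable built above:

from collections import Counter

def count_hot_nouns(words):
    # most_common() returns (word, count) pairs sorted by count, descending,
    # matching the tuple list returned by get_hot_noun_counts()
    return Counter(words).most_common()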
def cut(self, text, cut_all=False):
    '''
    @summary: segment text into words
    ---------
    @param text: the text to segment
    @param cut_all: True for full mode, False for accurate mode (default).
        Full mode scans out every word the dictionary can form, which is fast
        but cannot resolve ambiguity; accurate mode splits the sentence as
        precisely as possible and is better suited to text analysis.
    ---------
    @result: list of words with stop words removed
    '''
    result = list(jieba.cut(text, cut_all=cut_all))
    result = self.__del_stop_key(result)
    return result
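A usage sketch for this method; `Segmenter` is a placeholder name, since the snippet does not show which class defines cut():

seg = Segmenter()  # hypothetical class that defines cut() and __del_stop_key()
print(seg.cut("我来到北京清华大学"))                # accurate mode, stop words removed
print(seg.cut("我来到北京清华大学", cut_all=True))  # full mode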
def __is_clause_pattern3(self, the_clause, seg_result):
    for a_phrase in self.__phrase_dict:
        keys = a_phrase.keys()
        to_compile = a_phrase["key"].replace("……", "[\u4e00-\u9fa5]*")
        if "start" in keys:
            to_compile = to_compile.replace("*", "{" + a_phrase["start"] + "," + a_phrase["end"] + "}")
        if "head" in keys:
            to_compile = a_phrase["head"] + to_compile
        match = re.compile(to_compile).search(the_clause)
        if match is not None:
            can_continue = True
            pos = [flag for word, flag in posseg.cut(match.group())]
            if "between_tag" in keys:
                if a_phrase["between_tag"] not in pos and len(pos) > 2:
                    can_continue = False
            if can_continue:
                for i in range(len(seg_result)):
                    if seg_result[i].word in match.group():
                        try:
                            if seg_result[i + 1].word in match.group():
                                return self.__emotional_word_analysis(
                                    a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                    [x for x, y in seg_result], i)
                        except IndexError:
                            return self.__emotional_word_analysis(
                                a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                [x for x, y in seg_result], i)
    return ""
def extract_keyword_by_thulac(self):
    sents = []
    comm_list = self.dao.get_hotel_comments()
    # split every hotel comment into sentences
    for comm in comm_list:
        sents.extend(normal.get_sentences(comm[2]))
    print "length of sentences:%d" % len(sents)
    # POS-tag each sentence with THULAC; tokens come back as "word_tag"
    pos_sents = []
    for sent in sents:
        try:
            pos_sents.append(map(lambda x: x.split("_"), self.thu.cut(sent.encode("utf-8"))))
        except:
            print sent
            continue
    print "length of pos_sents:%d" % len(pos_sents)
    # count how often each noun (tag "n") appears
    print "counting"
    noun_dict = {}
    for pos_sent in pos_sents:
        for word in pos_sent:
            if word[1] == "n":
                if word[0] not in noun_dict:
                    noun_dict[word[0]] = 1
                else:
                    noun_dict[word[0]] = noun_dict[word[0]] + 1
    a = sorted(noun_dict.iteritems(), key=lambda asd: asd[1], reverse=True)
    return a
def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    """Segment a piece of text and return the words as a list.
    Keyword arguments:
    lower -- whether to lower-case the words (relevant for English)
    use_stop_words -- if True, drop any word found in self.stop_words
    use_speech_tags_filter -- if True, keep only words whose POS flag is in
                              self.default_speech_tag_filter; otherwise keep all words
    """
    text = util.as_text(text)
    jieba_result = pseg.cut(text)
    if use_speech_tags_filter == True:
        jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]
    # drop punctuation/whitespace tokens (POS flag 'x')
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]
    if lower:
        word_list = [word.lower() for word in word_list]
    if use_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]
    return word_list
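The core of segment(), dropping tokens whose POS flag is 'x' and lower-casing the rest, also works standalone with jieba.posseg; a minimal sketch with a made-up stop-word set:

import jieba.posseg as pseg

def simple_segment(text, stop_words=frozenset()):
    # keep non-punctuation tokens, lower-cased, that are not stop words
    words = [w.word.strip().lower() for w in pseg.cut(text) if w.flag != 'x']
    return [w for w in words if w and w not in stop_words]

print(simple_segment("这是一个测试句子。"))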
def seg(self, sentence):
    words = list()
    tags = list()
    for item in pseg.cut(sentence):
        words.append(item.word)
        tags.append(item.flag)
    return words, tags
def jieba_cut():
    # segment the positive-word dictionary pos_all_dict.txt
    fp_pos = open("hownet/pos_all_dict.txt", "r")                               # source word list
    fp_pos_cut = codecs.open('hownet/pos_all_cut.txt', "w+", encoding='UTF-8')  # segmented output
    contents = fp_pos.readlines()
    for content in contents:
        word = content.decode("utf-8")  # bytes -> unicode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)  # drop the '/x' tag of the trailing newline token
        fp_pos_cut.write(str_tag)
    fp_pos.close()
    fp_pos_cut.close()
    # segment the negative-word dictionary neg_all_dict.txt
    fp_neg = open("hownet/neg_all_dict.txt", "r")                               # source word list
    fp_neg_cut = codecs.open('hownet/neg_all_cut.txt', "w+", encoding='UTF-8')  # segmented output
    contents = fp_neg.readlines()
    for content in contents:
        word = content.decode("utf-8")  # bytes -> unicode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)  # drop the '/x' tag of the trailing newline token
        fp_neg_cut.write(str_tag)
    fp_neg.close()
    fp_neg_cut.close()
# segment the crawled Weibo posts, keeping only the nouns
def handel_weibo_data():
    # read the crawled Weibo data and extract the noun tokens of each post
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    weibo_data = []  # one list of nouns per post, e.g. [[...], [...], [...]]
    for line in fp.readlines():
        contents = []
        line = line.strip()
        line = line.decode('utf-8')  # bytes -> unicode
        seg_lines = pseg.cut(line)   # POS-tag the post
        for seg_line in seg_lines:
            # keep common nouns and proper nouns (person, place, organization, other)
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)
        weibo_data.append(contents)
    fp.close()
    return weibo_data
def segmentation(sentence):
    seg_list = jieba.cut(sentence)
    seg_result = []
    for w in seg_list:
        seg_result.append(w)
    # print seg_result[:]
    return seg_result
# build a custom analyzer for the vectorizer: strip punctuation, then re-segment
def build_analyzer(self):
    def analyzer(doc):
        words = pseg.cut(doc)
        new_doc = ''.join(w.word for w in words if w.flag != 'x')
        words = jieba.cut(new_doc)
        return words
    return analyzer
# compute TF-IDF weights using this analyzer
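Presumably the analyzer above is plugged into scikit-learn's TF-IDF vectorizer; a sketch of that hookup, where the subclass name and corpus are made up:

import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer

class ChineseTfidfVectorizer(TfidfVectorizer):
    # same idea as build_analyzer() above: strip punctuation (flag 'x'), then re-segment
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            return jieba.cut(new_doc)
        return analyzer

corpus = ["我来到北京清华大学", "他来到了网易杭研大厦"]
matrix = ChineseTfidfVectorizer().fit_transform(corpus)
print(matrix.shape)  # (2, number_of_terms)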