Python lcut() usage examples (source code)
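jieba.lcut() segments a Chinese string and returns the tokens as a plain Python list (jieba.cut() returns a generator instead). A minimal standalone example before the project snippets below:

import jieba

print(jieba.lcut('我来到北京清华大学'))                  # accurate mode (default)
print(jieba.lcut('我来到北京清华大学', cut_all=True))    # full mode: every word the dictionary can find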

Bernoulli_NaiveBayes_News_Classifier.py (project: Text-Classifier, author: daniellaah)
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """????????????
    Args:
        news_folder/
            ??/
            ??/
            ??/
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    stopwords = get_stopwords(stopwords_file)
    feature_words_dict = {}
    # use parallel segmentation to speed up cutting
    jieba.enable_parallel(4)
    for news_class in news_classes:
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                for word in word_list:
                    if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords:
                        if word in feature_words_dict:
                            feature_words_dict[word] += 1
                        else:
                            feature_words_dict[word] = 1
    jieba.disable_parallel()
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x:x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
Bernoulli_NaiveBayes_News_Classifier.py (project: Text-Classifier, author: daniellaah)
def get_probability(news_folder, feature_words):
    """????, prob_matrix, prob_classes
    Args:
        news_folder/
            ??/
            ??/
            ??/
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = [] # element: ([word1, word2, ...], news_class)
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    num_of_all_news = 0
    prob_classes = {}
    for cls in news_classes:
        prob_classes[cls] = 0
    # use parallel segmentation to speed up cutting
    jieba.enable_parallel(4)
    for news_class in news_classes:
        prob_count = {}
        for word in feature_words:
            prob_count[word] = 1 # add-one (Laplace) smoothing
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                for word in prob_count.keys():
                    if word in word_list:
                        prob_count[word] += 1
        news_nums = len(news_list)
        num_of_all_news += news_nums
        prob_classes[news_class] = news_nums
        for word in prob_count.keys():
            prob_matrix.loc[word, news_class] = prob_count[word] / (news_nums + 2)  # Laplace-smoothed P(word | class)
    jieba.disable_parallel()
    for cls in prob_classes.keys():
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes
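
The counts above start at 1 and the denominator is news_nums + 2: add-one (Laplace) smoothing for a Bernoulli feature, which has two outcomes (the word either appears in an article of the class or it does not). A worked micro-example with made-up numbers:

# suppose a class has 8 articles and a feature word occurs in 3 of them
count = 1 + 3             # add-one smoothing: the count starts at 1
prob = count / (8 + 2)    # one pseudo-article per outcome, giving 0.4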
Bernoulli_NaiveBayes_News_Classifier.py (project: Text-Classifier, author: daniellaah)
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    word_list = set(jieba.lcut(content))
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls])
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
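
Taken together, the three functions above form a small Bernoulli Naive Bayes text classifier. A hypothetical call sequence (the folder name and the article text are placeholders; news_folder must contain one subfolder per category):

feature_words = get_feature_words('news_data', size=1000)
prob_matrix, prob_classes = get_probability('news_data', feature_words)
print(predict_with_content(prob_matrix, prob_classes, feature_words, '一段待分类的新闻正文'))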
Bernoulli_NaiveBayes_Model.py (project: Text-Classifier, author: daniellaah)
def words_extract(news_folder):
    """??????????
    Args:
        news_folder/
            ??/
            ??/
            ??/
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = [] # element: ([word1, word2, ...], news_class)

    jieba.enable_parallel(4)
    # segment the articles in each category folder
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list, news_class))  # element: ([word1, word2, ...], news_class)
    jieba.disable_parallel()
    return data_list
Multinomial_NaiveBayes_News_Classifier.py (project: Text-Classifier, author: daniellaah)
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """????????????
    Args:
        news_folder/
            ??/
            ??/
            ??/
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    stopwords = get_stopwords(stopwords_file)
    feature_words_dict = {}
    # use parallel segmentation to speed up cutting
    jieba.enable_parallel(4)
    for news_class in news_classes:
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                for word in word_list:
                    if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords:
                        if word in feature_words_dict:
                            feature_words_dict[word] += 1
                        else:
                            feature_words_dict[word] = 1
    jieba.disable_parallel()
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x:x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
Multinomial_NaiveBayes_News_Classifier.py (project: Text-Classifier, author: daniellaah)
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    word_list = jieba.lcut(content)
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls] * word_list.count(word))
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
data_preprocess.py (project: Neural-Headline-Generator-CN, author: QuantumLiu)
def cut(text,custom_words=['FLOAT','TIME','DATE','EOS']):
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words=jieba.lcut(text)
    return words
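
jieba.add_word() registers an entry in jieba's in-memory dictionary so the segmenter keeps it as a single token, which is why the placeholders above ('FLOAT', 'TIME', 'DATE', 'EOS') are added before cutting. A small sketch with a made-up phrase and sentence:

import jieba

jieba.add_word('云计算平台')                      # keep the whole phrase as one token
print(jieba.lcut('我们把服务迁移到了云计算平台'))  # without add_word it is usually split into '云计算' + '平台'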
process_stuff.py (project: momoCrawler, author: njames741)
def if_contains(self, one_page_des):
        kw_dict_high_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
        kw_dict_low_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
        # kw_dict = {u'??'}
        # kw_dict = {u'???'}
        kw_dict = kw_dict_high_ratio  # assumption: use one of the keyword sets above (the original choice is not shown)
        seg_list = jieba.lcut(one_page_des, cut_all=False)
        for item in seg_list:
            if item in kw_dict:
                # print 'contains a keyword'
                return 1
        # print 'no keyword found'
        return 0
text_clustering.py (project: text_clustering, author: WennieZhi)
def tokenize(sentence):
    cn_sent = get_cnstr(sentence)
    term_list = jieba.lcut(cn_sent, cut_all=False)
    final_term_list = [term for term in term_list if len(term)>1 and is_cn_char(term)]
    return final_term_list
process.py (project: RobotWriter, author: Moicen)
def process(file_name):

    content = read(file_name)

    words = jieba.lcut(content, cut_all=False)
    words = words + ['\n']
    vocab = set(words)
    word2int = { w: i for i, w in enumerate(vocab)}
    int2word = dict(enumerate(vocab))

    data = np.array([word2int[c] for c in words], dtype=np.int32)

    return data, word2int, int2word, vocab
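
word2int and int2word are inverse mappings over the vocabulary, so the integer array can be decoded back into tokens. A usage sketch (the file path is a placeholder, and read() is assumed to return the file's text as one string):

data, word2int, int2word, vocab = process('corpus.txt')
first_tokens = [int2word[i] for i in data[:20]]    # decode the first 20 ids back to words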
data_utils.py (project: deeplearning4chatbot, author: liangjz92)
def jieba_tokenizer(sentence):
    sentence = sentence.replace("^", " ")
    # segment the sentence with jieba
    return jieba.lcut(sentence)
data_utils.py (project: deeplearning4chatbot, author: liangjz92)
def jieba_tokenizer(self,sentence):
        return jieba.lcut(sentence)
data_utils.py (project: deeplearning4chatbot, author: liangjz92)
def jieba_tokenizer(self,sentence):
        return jieba.lcut(sentence)
seq2seq.py (project: dynamic-seq2seq, author: yanwii)
def segement(self, strs):
        return jieba.lcut(strs)
FOOD_command.py (project: slack_emoji_bot, author: linnil1)
def init(self):
        # cut
        self.img = []
        if os.path.exists(self.food_dir):
            self.imgs = json.loads(open(self.food_dir).read())
            for img in self.imgs:
                img['jieba'] = (jieba.lcut(img['title']))
        open(self.food_dir, "w").write(json.dumps(self.imgs))

        # build
        self.jieba_dic = {}
        for img in self.imgs:
            for jiba in img['jieba']:
                self.jieba_dic[jiba] = img
FOOD_command.py (project: slack_emoji_bot, author: linnil1)
def wordSearch(self, text):
        textarr = jieba.lcut(text)
        self.colorPrint("Jieba cut", textarr)
        for t in textarr:
            if t in self.jieba_dic:
                return self.jieba_dic[t]
        raise ValueError("not found")
FOOD_command.py (project: slack_emoji_bot, author: linnil1)
def imageAdd(self, img):
        self.colorPrint("Add Foods", img)
        img['jieba'] = (jieba.lcut(img['title']))
        for jiba in img['jieba']:
            self.jieba_dic[jiba] = img
        self.img.append(img)
        open(self.food_dir, "w").write(json.dumps(self.imgs))
train_and_predict.py (project: baidu-ner-contest, author: bojone)
def mycut(s):
    result = []
    j = 0
    s = re_replace.sub(' ', s)
    for i in not_cuts.finditer(s):
        result.extend(jieba.lcut(s[j:i.start()], HMM=False))
        if s[i.start()] in [u'?', u'“']:
            result.extend([s[i.start()], s[i.start()+1:i.end()-1], s[i.end()-1]])
        else:
            result.append(s[i.start():i.end()])
        j = i.end()
    result.extend(jieba.lcut(s[j:], HMM=False))
    return result
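
HMM=False disables jieba's hidden-Markov new-word discovery, so only dictionary entries are emitted; not_cuts and re_replace are regular expressions defined elsewhere in that project, used to protect spans such as quoted text from normal cutting. A small illustration of the HMM switch, using a sentence from jieba's own documentation:

import jieba

text = '李小福是创新办主任'
print(jieba.lcut(text, HMM=True))     # the HMM can join the unseen name '李小福'
print(jieba.lcut(text, HMM=False))    # dictionary-only: unseen names tend to break into single characters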

