def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """Collect the `size` most frequent candidate feature words from the corpus.
    Args:
        news_folder/
            <one subfolder per news class, each holding plain-text news files>
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    stopwords = get_stopwords(stopwords_file)
    feature_words_dict = {}
    # enable jieba's parallel segmentation
    jieba.enable_parallel(4)
    for news_class in news_classes:
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            for word in word_list:
                # keep words longer than one character that are neither ASCII tokens nor stopwords
                if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords:
                    if word in feature_words_dict:
                        feature_words_dict[word] += 1
                    else:
                        feature_words_dict[word] = 1
    jieba.disable_parallel()
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x: x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
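`get_stopwords` is used above but not included in this excerpt; below is a minimal sketch of such a helper, assuming a UTF-8 stopword file with one word per line (the file format is an assumption, not taken from the original project). The function above also relies on `import os`, `import re` and `import jieba`.

def get_stopwords(stopwords_file):
    # assumed format: one stopword per line, UTF-8 encoded; a set gives O(1) membership tests
    with open(stopwords_file, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}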
Source: Bernoulli_NaiveBayes_News_Classifier.py (Project: Text-Classifier, Author: daniellaah)
def get_probability(news_folder, feature_words):
    """Estimate the word and class probabilities; returns prob_matrix, prob_classes.
    Args:
        news_folder/
            <one subfolder per news class, each holding plain-text news files>
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "<class label>"), not used in this function
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    num_of_all_news = 0
    prob_classes = {}
    for cls in news_classes:
        prob_classes[cls] = 0
    # enable jieba's parallel segmentation
    jieba.enable_parallel(4)
    for news_class in news_classes:
        prob_count = {}
        for word in feature_words:
            prob_count[word] = 1  # Laplace (add-one) smoothing
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            for word in prob_count.keys():
                if word in word_list:
                    prob_count[word] += 1
        news_nums = len(news_list)
        num_of_all_news += news_nums
        prob_classes[news_class] = news_nums
        for word in prob_count.keys():
            prob_matrix.loc[word, news_class] = prob_count[word] / (news_nums + 2)  # Laplace smoothing
    jieba.disable_parallel()
    for cls in prob_classes.keys():
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes
Source: Bernoulli_NaiveBayes_News_Classifier.py (Project: Text-Classifier, Author: daniellaah)
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    # Bernoulli model: only the presence or absence of each feature word matters
    word_list = set(jieba.lcut(content))
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls])
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
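A minimal sketch of how the three Bernoulli helpers above might be chained; the folder name `train_data` and the sample text are hypothetical, and the snippets assume `os`, `re`, `jieba`, `numpy as np` and `pandas as pd` are imported.

# hypothetical corpus layout: train_data/<class name>/<news file>
feature_words = get_feature_words("train_data", size=1000, stopwords_file="stopwords.txt")
prob_matrix, prob_classes = get_probability("train_data", feature_words)

# classify one raw document (sample text is made up for illustration)
sample = "球队在昨晚的比赛中逆转取胜"
print(predict_with_content(prob_matrix, prob_classes, feature_words, sample))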
def words_extract(news_folder):
    """Segment every news file and return (word_list, class label) pairs.
    Args:
        news_folder/
            <one subfolder per news class, each holding plain-text news files>
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder)
                      if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "<class label>")
    # enable jieba's parallel segmentation
    jieba.enable_parallel(4)
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list, news_class))  # element: ([word1, word2, ...], "<class label>")
    jieba.disable_parallel()
    return data_list
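A short usage sketch for `words_extract`; the folder name is hypothetical and the tuple structure follows the comment in the function.

# hypothetical corpus folder with one subfolder per news class
data_list = words_extract("train_data")
word_list, news_class = data_list[0]
print(news_class, word_list[:10])  # class label and the first ten tokens of the first article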
Source: Multinomial_NaiveBayes_News_Classifier.py (Project: Text-Classifier, Author: daniellaah)
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """Collect the `size` most frequent candidate feature words from the corpus.
    Args:
        news_folder/
            <one subfolder per news class, each holding plain-text news files>
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    stopwords = get_stopwords(stopwords_file)
    feature_words_dict = {}
    # enable jieba's parallel segmentation
    jieba.enable_parallel(4)
    for news_class in news_classes:
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            for word in word_list:
                # keep words longer than one character that are neither ASCII tokens nor stopwords
                if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords:
                    if word in feature_words_dict:
                        feature_words_dict[word] += 1
                    else:
                        feature_words_dict[word] = 1
    jieba.disable_parallel()
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x: x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
Source: Multinomial_NaiveBayes_News_Classifier.py (Project: Text-Classifier, Author: daniellaah)
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    # multinomial variant: the class-conditional probability of a present word is
    # multiplied by its count in the document before taking the log
    word_list = jieba.lcut(content)
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls] * word_list.count(word))
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
Source: data_preprocess.py (Project: Neural-Headline-Generator-CN, Author: QuantumLiu)
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
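A small usage sketch for `cut`; the placeholders `FLOAT`, `TIME`, `DATE` and `EOS` are registered with `jieba.add_word` so the segmenter keeps them as single tokens. The sample sentence is made up, and jieba's parallel mode requires a POSIX platform.

import jieba

# FLOAT/DATE/EOS stand in for normalized numbers, dates and sentence boundaries
print(cut("DATE 气温升到 FLOAT 度 EOS"))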
def if_contains(self, one_page_des):
    kw_dict_high_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
    kw_dict_low_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
    # kw_dict = {u'??'}
    # kw_dict = {u'???'}
    seg_list = jieba.lcut(one_page_des, cut_all=False)
    for item in seg_list:
        if item in kw_dict:
            # print 'contains a keyword'
            return 1
    # print 'no keyword found'
    return 0
def tokenize(sentence):
    cn_sent = get_cnstr(sentence)
    term_list = jieba.lcut(cn_sent, cut_all=False)
    final_term_list = [term for term in term_list if len(term) > 1 and is_cn_char(term)]
    return final_term_list
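`get_cnstr` and `is_cn_char` are not defined in this excerpt. Below is a minimal sketch of what such helpers might do, judging only from how `tokenize` uses them; both implementations are assumptions, not the original author's code.

import re

def get_cnstr(sentence):
    # assumption: keep only the CJK characters of the input
    return ''.join(re.findall(u'[\u4e00-\u9fff]+', sentence))

def is_cn_char(term):
    # assumption: True when every character of the term is a CJK character
    return all(u'\u4e00' <= ch <= u'\u9fff' for ch in term)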
def process(file_name):
    content = read(file_name)
    words = jieba.lcut(content, cut_all=False)
    words = words + ['\n']
    vocab = set(words)
    word2int = {w: i for i, w in enumerate(vocab)}
    int2word = dict(enumerate(vocab))
    data = np.array([word2int[c] for c in words], dtype=np.int32)
    return data, word2int, int2word, vocab
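`read` is not shown in this excerpt; the sketch below assumes it simply returns the file contents, and the file name is hypothetical.

import numpy as np
import jieba

def read(file_name):
    # assumed helper: return the raw text of the file
    with open(file_name, encoding='utf-8') as f:
        return f.read()

data, word2int, int2word, vocab = process("corpus.txt")
print(len(vocab), int2word[int(data[0])])  # vocabulary size and the first token of the corpus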
def jieba_tokenizer(sentence):
    sentence = sentence.replace("^", " ")
    # treat '^' as whitespace before segmenting
    return jieba.lcut(sentence)
def jieba_tokenizer(self, sentence):
    return jieba.lcut(sentence)

def jieba_tokenizer(self, sentence):
    return jieba.lcut(sentence)

def segement(self, strs):
    return jieba.lcut(strs)
def init(self):
    # cut
    self.img = []
    if os.path.exists(self.food_dir):
        self.imgs = json.loads(open(self.food_dir).read())
        for img in self.imgs:
            img['jieba'] = (jieba.lcut(img['title']))
        open(self.food_dir, "w").write(json.dumps(self.imgs))
    # build
    self.jieba_dic = {}
    for img in self.imgs:
        for jiba in img['jieba']:
            self.jieba_dic[jiba] = img
def wordSearch(self, text):
    textarr = jieba.lcut(text)
    self.colorPrint("Jieba cut", textarr)
    for t in textarr:
        if t in self.jieba_dic:
            return self.jieba_dic[t]
    raise ValueError("not found")
def imageAdd(self, img):
    self.colorPrint("Add Foods", img)
    img['jieba'] = (jieba.lcut(img['title']))
    for jiba in img['jieba']:
        self.jieba_dic[jiba] = img
    self.img.append(img)
    open(self.food_dir, "w").write(json.dumps(self.imgs))
def mycut(s):
    result = []
    j = 0
    s = re_replace.sub(' ', s)
    for i in not_cuts.finditer(s):
        result.extend(jieba.lcut(s[j:i.start()], HMM=False))
        if s[i.start()] in [u'?', u'“']:
            result.extend([s[i.start()], s[i.start()+1:i.end()-1], s[i.end()-1]])
        else:
            result.append(s[i.start():i.end()])
        j = i.end()
    result.extend(jieba.lcut(s[j:], HMM=False))
    return result
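`re_replace` and `not_cuts` are compiled regular expressions that this excerpt does not define. Below is a sketch of plausible definitions, judging only from how `mycut` uses them (normalizing whitespace and protecting quoted spans from segmentation); both patterns are assumptions, not the original author's.

import re
import jieba

re_replace = re.compile(u'[ \t\u3000]+')       # assumption: collapse runs of whitespace
not_cuts = re.compile(u'“[^”]*”|《[^》]*》')    # assumption: spans that must not be segmented

print(mycut(u'他说“你好 世界”然后离开了'))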