Python usage examples of jieba.enable_parallel()

import os
import jieba

def words_extract(news_folder):
    """Segment every news article under news_folder into word lists.

    Args:
        news_folder/
            <class_1>/
            <class_2>/
            ...
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder)
                      if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "class_label")
    jieba.enable_parallel(4)  # segment with 4 worker processes
    # walk every class subfolder and segment each news file in it
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list, news_class))  # ([word1, word2, ...], "class_label")
    jieba.disable_parallel()
    return data_list
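A minimal usage sketch, assuming a hypothetical train/ directory laid out as in the docstring, with one subfolder per news class:

data = words_extract("train")  # "train" is a placeholder path, not from the source
words, label = data[0]
print(label, words[:10])       # class of the first article and its first ten tokens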
Source: Bernoulli_NaiveBayes_News_Classifier.py (project: Text-Classifier, author: daniellaah)
import os
import re
import jieba

def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """Collect the most frequent candidate feature words across all classes.

    Args:
        news_folder/
            <class_1>/
            <class_2>/
            ...
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    stopwords = get_stopwords(stopwords_file)  # helper defined elsewhere in the source file
    feature_words_dict = {}
    # segment with 4 worker processes
    jieba.enable_parallel(4)
    for news_class in news_classes:
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            # keep multi-character words that are not alphanumeric and not stopwords
            for word in word_list:
                if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords:
                    if word in feature_words_dict:
                        feature_words_dict[word] += 1
                    else:
                        feature_words_dict[word] = 1
    jieba.disable_parallel()
    # sort by frequency, descending, and keep at most the top `size` words
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x: x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
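A sketch of how the selected feature words could be combined with words_extract to build the binary presence/absence vectors a Bernoulli model consumes; to_bernoulli_vector and the "train" path are hypothetical, not part of the source:

feature_words = sorted(get_feature_words("train", size=1000))

def to_bernoulli_vector(word_list, feature_words):
    present = set(word_list)  # membership, not frequency: Bernoulli features are 0/1
    return [1 if w in present else 0 for w in feature_words]

dataset = [(to_bernoulli_vector(words, label_words), label)
           for label_words, label in words_extract("train")]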
Source: Bernoulli_NaiveBayes_News_Classifier.py (project: Text-Classifier, author: daniellaah)
import os
import jieba
import pandas as pd

def get_probability(news_folder, feature_words):
    """Estimate word and class probabilities.

    Returns prob_matrix (P(word present | class)) and prob_classes (P(class)).

    Args:
        news_folder/
            <class_1>/
            <class_2>/
            ...
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    num_of_all_news = 0
    prob_classes = {}
    for cls in news_classes:
        prob_classes[cls] = 0
    # segment with 4 worker processes
    jieba.enable_parallel(4)
    for news_class in news_classes:
        prob_count = {}
        for word in feature_words:
            prob_count[word] = 1  # Laplace smoothing: start every document count at 1
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            for word in prob_count.keys():
                if word in word_list:
                    prob_count[word] += 1
        news_nums = len(news_list)
        num_of_all_news += news_nums
        prob_classes[news_class] = news_nums
        for word in prob_count.keys():
            # Laplace smoothing: +2 in the denominator (word present / word absent)
            prob_matrix.loc[word, news_class] = prob_count[word] / (news_nums + 2)
    jieba.disable_parallel()
    for cls in prob_classes.keys():
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes
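One way these two return values could drive prediction: a minimal sketch of Bernoulli Naive Bayes scoring in log space. The classify function is my own addition, not part of the source file:

import math

def classify(word_list, prob_matrix, prob_classes):
    present = set(word_list)
    scores = {}
    for cls, prior in prob_classes.items():
        score = math.log(prior)
        for word in prob_matrix.index:
            p = prob_matrix.loc[word, cls]
            # multiply in p if the word is present, (1 - p) if it is absent
            score += math.log(p) if word in present else math.log(1 - p)
        scores[cls] = score
    return max(scores, key=scores.get)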
import codecs
from os import path
import jieba

def save_jieba_result(file_name):
    # parallel segmentation is available but left disabled here
    # jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), file_name)
    print(dirs)
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    cut_text = " ".join(jieba.cut(comment_text))
    with codecs.open('pjl_jieba.txt', 'w', encoding='utf-8') as f:
        f.write(cut_text)
Source: data_preprocess.py (project: Neural-Headline-Generator-CN, author: QuantumLiu)
import jieba

def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    # 32 worker processes; the custom placeholder tokens are registered so
    # jieba keeps them as single words instead of splitting them
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
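A quick sketch of calling it (the sample sentence is illustrative only):

tokens = cut("DATE 股市上涨 FLOAT 点 EOS")
print(tokens)  # placeholder tokens such as 'DATE' and 'EOS' survive as single words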
import codecs
import jieba
import pandas as pd

def get_all_keywords(file_name):
    word_lists = []  # every token in the file
    jieba.enable_parallel(8)
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()  # read line by line
        for List in Lists:
            cut_list = list(jieba.cut(List))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists)  # deduplicate the tokens
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print(u"Found %d distinct words" % length)
    information = pd.read_excel('/Users/huazi/Desktop/zhanlang2.xlsx')
    world_number_list = []
    word_copy = []
    # keep multi-character words that occur more than three times
    for w in word_lists_set:
        if len(w) == 1:
            continue
        count = word_lists.count(w)
        if count > 3:
            world_number_list.append(count)
            word_copy.append(w)
    information['key'] = word_copy
    information['count'] = world_number_list
    information.to_excel('sun_2.xlsx')
# Save the jieba segmentation result
import codecs
from os import path
import jieba

def save_jieba_result():
    # segment the comments in parallel with 4 worker processes
    jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), '../pjl_comment.txt')
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    # join the jieba tokens with spaces so downstream tools (e.g. a word cloud) can split them
    cut_text = " ".join(jieba.cut(comment_text))
    with codecs.open('pjl_jieba.txt', 'a', encoding='utf-8') as f:
        f.write(cut_text)
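The space-joined output is the format word-cloud libraries expect; a sketch of feeding it to the wordcloud package (the font path is an assumption you would need to adapt for your system):

import codecs
from wordcloud import WordCloud

with codecs.open('pjl_jieba.txt', encoding='utf-8') as f:
    text = f.read()
# a CJK-capable font is required for Chinese output; this path is a placeholder
wc = WordCloud(font_path='msyh.ttf', width=800, height=600).generate(text)
wc.to_file('pjl_wordcloud.png')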
import jieba

def words_split(corpus_path):
    with open(corpus_path, 'r') as f:
        content = f.read()
    jieba.load_userdict('data/userdict.txt')  # load a user-defined dictionary
    jieba.enable_parallel(4)  # parallel segmentation
    seg_list = jieba.cut(content, cut_all=False)  # accurate (non-full) mode
    return seg_list
# Initialization
def __init__(self, n_core=16):
    self.rootdir = os.getcwd()
    # load the stopword list and strip trailing newlines
    self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
    self.STOP_WORDS_LIST = set(re.sub('\n', '', item) for item in self.STOP_WORDS_LIST)
    jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
    self.n_CORE = n_core
    # leave one core free for the main process
    jieba.enable_parallel(self.n_CORE - 1)
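One caveat that applies to every snippet above: jieba's parallel mode is built on Python's multiprocessing and, per jieba's documentation, is not supported on Windows. A defensive sketch, assuming the NotImplementedError jieba raises on unsupported platforms:

import jieba

try:
    jieba.enable_parallel(4)
except NotImplementedError:
    # e.g. on Windows; fall back to single-process segmentation
    pass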