def WordBreak():
    # Segment the corpus with jieba; relies on module-level `path`, `data`, `logger` and `import jieba`.
    logger.info("running WordBreak in " + path + data)
    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    with open(inputfile, 'r') as infile, open(outputfile, 'w') as output:
        for line in infile:
            seg_list = jieba.cut(line)
            output.write(u' '.join(seg_list))
            i = i + 1
            if i % 10000 == 0:
                logger.info("Cut " + str(i) + " articles")
    logger.info("Finished. Saved " + str(i) + " articles in " + outputfile)
Python cut() example source code

Source: process_corpus.py (project: question-classification-cnn-rnn-attention, author: sefira)
def sentenceToIndex(sentence, word2idx, maxLen):
    """
    Convert a sentence into a list of word indices for embedding lookup.
    :param sentence: the input sentence
    :param word2idx: mapping from word to index
    :param maxLen: maximum sentence length; the result is padded or truncated to this length
    :return: list of word indices of length maxLen
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match(r"\d+", word):
                index[i] = num
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index
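A minimal usage sketch, assuming `jieba` and `re` are importable and using a small hypothetical word2idx vocabulary; in practice word2idx is built from the training corpus.

import re
import jieba

# hypothetical toy vocabulary; "UNKNOWN" and "NUM" are the special entries the function expects
word2idx = {"UNKNOWN": 0, "NUM": 1, u"你好": 2, u"世界": 3}

# out-of-vocabulary words map to UNKNOWN, digit tokens map to NUM, the rest stays padding
print(sentenceToIndex(u"你好世界 2024", word2idx, maxLen=5))
# e.g. [2, 3, 0, 1, 0], depending on how jieba segments the input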
def bm25(p, titles, answers, scores):
    original_titles = copy.deepcopy(titles)
    titles = [remove_punctuation_re(title) for title in titles]
    answers = [remove_punctuation_re(answer) for answer in answers]
    p = remove_punctuation_re(p)
    titles = [' '.join(jieba.cut(title)) for title in titles]
    p = ' '.join(jieba.cut(p))
    wordindoc, wordindata, doclen, sumlen = init(titles, False)
    global avglen
    avglen = 1.0 * sumlen / N
    res = search(p, zip(titles, original_titles, answers, scores), wordindoc, wordindata, doclen)
    titles, answers, scores = [], [], []
    for key, _ in res:
        titles.append(key[0])
        answers.append(key[1])
        scores.append(key[2])
    return titles, answers, scores
def get_html_text(url):
    response = requests.get(url)
    origin_text = response.text
    origin_text = re.sub(r'<script.*?>.*?</script>', '', origin_text, flags=re.I | re.M | re.DOTALL)
    origin_text = re.sub(r'<style.*?>.*?</style>', '', origin_text, flags=re.I | re.M | re.DOTALL)
    doc = html.fromstring(origin_text)
    text = doc.xpath('//body//text()')
    text = [i.strip() for i in text if i.strip()]
    text = ' '.join(text)
    seg = jieba.cut(text)
    stopwords = read_stopwords('./utils/stopwords.txt')  # callable read_stopwords()
    seg = [i.strip() for i in seg if i.strip() and not i.strip().isdigit()
           and i.strip() not in stopwords]
    seg = ' '.join(seg)
    return seg
def overlap_index(question, answer, q_len, a_len, stopwords=[]):
    qset = set(cut(question))
    aset = set(cut(answer))
    q_index = np.zeros(q_len)
    a_index = np.zeros(a_len)
    overlap = qset.intersection(aset)
    for i, q in enumerate(cut(question)[:q_len]):
        value = 1
        if q in overlap:
            value = 2
        q_index[i] = value
    for i, a in enumerate(cut(answer)[:a_len]):
        value = 1
        if a in overlap:
            value = 2
        a_index[i] = value
    return q_index, a_index
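A small usage sketch. The slicing inside overlap_index requires `cut` to return a list, so it is assumed here to be a thin wrapper over `jieba.lcut`; `numpy` must also be imported as `np`.

import numpy as np
import jieba

def cut(text):
    # assumed list-returning wrapper, since overlap_index slices cut(...)
    return jieba.lcut(text)

q_idx, a_idx = overlap_index(u"今天天气怎么样", u"今天天气很好", q_len=8, a_len=8)
print(q_idx)  # words shared with the answer are marked 2, other words 1, padding stays 0
print(a_idx)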
def ma_overlap_zi(row):
    question = cut(row["question"])
    answer = cut(row["answer"])
    di_question = []
    di_answer = []
    for w in question:
        for i in range(len(w)):
            di_question.append(w[i])
    for w in answer:
        for i in range(len(w)):
            di_answer.append(w[i])
    di_overlap = set(di_question).intersection(set(di_answer))
    di_weight_p = dict({})
    for k in range(len(di_question)):
        if di_question[k] in di_overlap:
            # print int(100 * ((k + 1) / (len(question) + 1)))
            # alternative weighting: zi_weight[int(100 * ((k + 1) / (len(di_question) + 1)))]
            di_weight_p[di_question[k]] = ((k + 1) / len(di_question)) ** 3.2
    di_weight_all = 0.0
    for k in di_overlap:
        di_weight_all += di_weight_p[k]
    return di_weight_all / (len(di_answer) + 40)
def get_word_count(filename):
    data_source = open(filename, 'r')
    data = data_source.read()
    if data != '':
        temp_result = jieba.cut(data, cut_all=True)
        temp_result = '/'.join(temp_result)
        word_result = temp_result.split('/')
        word_view = {}  # word_view[i] marks whether word i has already been counted for this document
        for i in word_result:
            word_view[i] = 0
            if i not in word_doc:  # word_doc is the module-level document-frequency counter
                word_doc[i] = 0
        for i in word_result:
            if word_view[i] == 0:
                word_view[i] = 1
                word_doc[i] = word_doc[i] + 1
    data_source.close()
def print2file(f, title, responses, marker='', separater=True):
    if marker != '':
        f.write(marker + ' ')
    title_cutted = jieba.cut(title.strip(), cut_all=False)
    for word in title_cutted:
        f.write(word + ' ')
    f.write('\n')
    for response in responses:
        # print(response['Content'])
        # if response['Content'] not in count_response.keys():
        #     count_response[response['Content']] = 0
        # count_response[response['Content']] += 1
        if marker != '':
            f.write(marker + ' ')
        response_cutted = jieba.cut(response['Content'].strip(), cut_all=False)
        for word in response_cutted:
            f.write(word + ' ')
        f.write('\n')
    if separater:
        f.write('===\n')
Source: reviews_preprocessing.py (project: Stock-SentimentAnalysis, author: JoshuaMichaelKing)
def word_tokenization(tick_blog_list):
    '''
    Word tokenization with jieba.
    Returns a list of token lists: [[w1, w2, ...], [w1, ...], ...]
    '''
    count = 0
    seg_list = []
    try:
        for blog in tick_blog_list:
            count += 1
            if blog != '':
                segments = jieba.cut(blog)
                tmp = []
                for seg in segments:
                    tmp.append(seg)
                seg_list.append(tmp)
            else:
                print('Line %d is empty!' % count)
    except IOError as e:
        logging.error('IOError %s' % e)
    finally:
        return seg_list
#-------------------------------------------------------------------------------
def word_tokenization(tick_blog_list):
    '''
    Word tokenization with jieba.
    Returns a list of token lists: [[w1, w2, ...], [w1, ...], ...]
    '''
    count = 0
    seg_list = []
    try:
        for blog in tick_blog_list:
            if blog != '':
                count += 1
                segments = jieba.cut(blog)
                tmp = []
                for seg in segments:
                    tmp.append(seg)
                seg_list.append(tmp)
    except IOError as e:
        logging.error('IOError %s' % e)
    finally:
        return seg_list
def word_segment(line, stop=False, remain_number=True):
    '''
    Segment one line of text with jieba.
    stop: whether to filter out stop words
    remain_number: whether to keep numeric tokens
    '''
    if STOP_WORDS is None:
        load_stopwords()
    seg_list = jieba.cut(line, HMM=True)
    sl = []
    for word in seg_list:
        word = word.strip()
        if len(word) > 0 and word not in PUNCT:
            if stop:
                if word in STOP_WORDS:
                    word = None
            if word is not None and not remain_number:
                if util_func.atof(word) is not None:
                    word = None
            if word is not None:
                sl.append(word)
    return sl
def cut_for_property(self, text):
    '''
    @summary: segment text together with part-of-speech tags
    ---------
    @param text: text to segment
    ---------
    @result: list of tuples [(text1, property1), ..., (textN, propertyN)]
    '''
    words_list = []
    words = pseg.cut(text)
    for word in words:
        if word.word not in self._stop_words:
            words_list.append((word.word, word.flag))
    return words_list
def get_seg_features(string):
    """
    Segment text with jieba.
    Features are encoded in BIES format:
    0 = single-character word (S), 1 = begin (B), 2 = inside (I), 3 = end (E).
    """
    seg_feature = []
    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            seg_feature.extend(tmp)
    return seg_feature
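A quick usage sketch, assuming jieba is installed:

import jieba

print(get_seg_features(u"我爱北京天安门"))
# jieba usually segments this as 我 / 爱 / 北京 / 天安门,
# giving [0, 0, 1, 3, 1, 2, 3]: two single-character words, a 2-character word, a 3-character word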
def get_all_keywords(file_name):
    word_lists = []  # all words in the file
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()
        for li in Lists:
            cut_list = list(jieba.cut(li))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists)  # deduplicated words
    sort_count = []
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print(u'%d distinct words in total' % length)
    k = 1
    for w in word_lists_set:
        sort_count.append(w + u':' + str(word_lists.count(w)) + u" times\n")
        print(u"%d---" % k + w + u":" + str(word_lists.count(w)) + u" times")
        k += 1
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
def Delete_stopwords():
    print 'Removing stop words...'
    f_stop = open('emotion_file/stopwords.txt')  # stop-word list
    f_stop_list = []
    for word in f_stop.readlines():
        f_stop_list.append(word.strip())  # strip the newline so membership tests match jieba tokens
    f_stop.close()
    f_text = open("emotion_file/data_zhuguan.txt", "r")  # subjective-sentence corpus
    f_nostop = codecs.open('emotion_file/data_zhuguan_nostop.txt', 'w', encoding='UTF-8')
    for text in f_text.readlines():  # segment each line and drop stop words
        f_seg_list = list(jieba.cut(text, cut_all=False))  # word segmentation
        for word in f_seg_list:
            if word in f_stop_list:
                print word
            else:
                f_nostop.write(word)
    f_text.close()
    f_nostop.close()
    print "Finished removing stop words..."
    # The polarity data file data_jixing.txt is processed the same way.
Source: data_utils.py (project: LSTM-CRF-For-Named-Entity-Recognition, author: zpppy)
def get_seg_features(string):
    """
    Segment text with jieba.
    Features are encoded in BIES format:
    0 = single-character word (S), 1 = begin (B), 2 = inside (I), 3 = end (E).
    """
    seg_feature = []
    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            # note: extend, not append, so the per-character tags stay flat
            seg_feature.extend(tmp)
    return seg_feature
Source: ContendSplit.py (project: SentimentAnalysis-chinese-master, author: Chenalong)
def jieba_contend_split(contend):
    # punctuation marks that delimit clauses
    punctuation = [u'。', u'/', u'，', u'！', u'？', u' ', u'\'']
    wordSequenceList = []  # result format: [[(id, word), ...], ...]; each sub-list is one clause
    seg_list = jieba.cut(contend)
    segmentedComment = [item for item in seg_list]
    segmentedCommentTuple = list(enumerate(segmentedComment))
    subWordSequenceList = []
    for wordTuple in segmentedCommentTuple:
        if wordTuple[1] in punctuation:
            if subWordSequenceList:
                wordSequenceList.append(subWordSequenceList)
                subWordSequenceList = []
        else:
            subWordSequenceList.append(wordTuple)
    if subWordSequenceList:
        wordSequenceList.append(subWordSequenceList)
    return wordSequenceList
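A brief usage sketch, assuming jieba is installed; the exact clause boundaries depend on how jieba tokenizes the input:

import jieba

clauses = jieba_contend_split(u'东西很好，物流也快！推荐购买。')
for clause in clauses:
    print([word for _, word in clause])
# roughly: ['东西', '很', '好'], then ['物流', '也', '快'], then ['推荐', '购买']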
Source: SentiAnalysis.py (project: SentimentAnalysis-chinese-master, author: Chenalong)
def segByPunc(self):
    # punctuation marks that delimit clauses
    punctuation = [u'。', u'/', u'，', u'！', u'？', u' ', u'\'']
    wordSequenceList = []  # result format: [[(id, word), ...], ...]; each sub-list is one clause
    seg_list = jieba.cut(self.commentSentence)
    segmentedComment = [item for item in seg_list]
    segmentedCommentTuple = list(enumerate(segmentedComment))
    subWordSequenceList = []
    for wordTuple in segmentedCommentTuple:
        if wordTuple[1] in punctuation:
            if subWordSequenceList:
                wordSequenceList.append(subWordSequenceList)
                subWordSequenceList = []
        else:
            subWordSequenceList.append(wordTuple)
    if subWordSequenceList:
        wordSequenceList.append(subWordSequenceList)
    return wordSequenceList
def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        # split on sentence-final punctuation; the delimiters are captured and re-attached below
        sent_splits = iter(re.split(u'(。|！|？|；)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '':
                continue
            toks = []
            # for tok in jieba.cut(sent):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos,
                                tok[:2].lower(),
                                tok.lower(),
                                tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)
def get_result(url_set):
    line_set = []
    for url in url_set:
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        a = soup.select('span.ctt')
        for i in range(len(a)):
            text = re.sub('<[^>]*>', '', a[i].text)
            text = re.sub('??', ' ', text)
            text = re.sub('[\W]+', ' ', text)
            line_set.append(text)
            # print(text)
            # writer.writerow((i, text))
    word_list = [" ".join(jieba.cut(sentence)) for sentence in line_set]
    new_text = ' '.join(word_list)
    wordcloud = WordCloud(font_path="C:/Python34/Lib/site-packages/wordcloud/simhei.ttf",
                          background_color="black").generate(new_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def load_utf8_data_and_labels(positive_data_file, negative_data_file):
    # Load data from files
    positive_data = list(codecs.open(positive_data_file, "r", encoding='utf-8').readlines())
    positive_examples = list()
    for s in positive_data:
        positive_examples.append(" ".join(jieba.cut(s)))
    negative_data = list(codecs.open(negative_data_file, "r", encoding='utf-8').readlines())
    negative_examples = list()
    for s in negative_data:
        negative_examples.append(" ".join(jieba.cut(s)))
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def test(self, input_str):
    '''
    Step 4: segment the input string, build its bag-of-words vector,
    and classify it with the trained SVM model.
    '''
    test_input = input_str
    x_test = np.zeros(self.count + 1)  # bag-of-words feature vector
    after_split = " ".join(jieba.cut(test_input))  # word segmentation
    words = after_split.split(" ")
    for i in words:
        i = i.replace('\n', '')
        i = i.replace('\r', '')
        i = i.replace(' ', '')
        if self.dictionary.__contains__(i.encode('utf-8')):
            x_test[self.dictionary[i.encode('utf-8')]] = 1.
        # else:
        #     print 'Cannot find: ' + i
    # return 1 for the positive class, otherwise 0
    if self.mySVM.predict([x_test]) == 1.:
        return 1
    else:
        return 0
def post_desc_counter():
    """Count word frequencies in the job-post requirement descriptions."""
    # import thulac
    post = open(os.path.join("data", "post_require.txt"),
                "r", encoding="utf-8").read()
    # segmentation with thulac (disabled)
    # thu = thulac.thulac(seg_only=True)
    # thu.cut(post, text=True)
    # segmentation with jieba
    file_path = os.path.join("data", "user_dict.txt")
    jieba.load_userdict(file_path)
    seg_list = jieba.cut(post, cut_all=False)
    counter = dict()
    for seg in seg_list:
        counter[seg] = counter.get(seg, 0) + 1
    counter_sort = sorted(
        counter.items(), key=lambda value: value[1], reverse=True)
    pprint(counter_sort)
    with open(os.path.join("data", "post_pre_desc_counter.csv"),
              "w+", encoding="utf-8") as f:
        f_csv = csv.writer(f)
        f_csv.writerows(counter_sort)
def calculate_similarity(text1, text2):
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word], 2)
        for word in raw2:
            mod2 += math.pow(raw2[word], 2)
        cos = dot_product / math.sqrt(mod1 * mod2)
    else:
        cos = 0
    return cos
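A short usage sketch, assuming the module-level imports the function relies on (`jieba`, `math`, and `Counter` from `collections`):

import math
import jieba
from collections import Counter

score = calculate_similarity(u"今天天气很好", u"今天天气不错")
print(round(score, 3))  # cosine similarity of the two term-frequency vectors, in [0, 1]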
def extract_tags(sentence, topK=20):
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if len(w.strip()) < 2: continue
        if w.lower() in stop_words: continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    freq = [(k, v / total) for k, v in freq.iteritems()]
    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)
    top_tuples = st_list[:topK]
    tags = [a[1] for a in top_tuples]
    return tags
def cut_Text(content, nomial=False):
    """
    :param content: string to segment
    :param nomial: if True, only noun-like words are kept
    :return: a space-joined string of tokens, e.g. 'a b c d'
    """
    if nomial:
        text = ''
        words = pseg.cut(content)
        for word in words:
            if contain(['n'], word.flag):
                text = text + ' ' + word.word
        return text.strip()
    else:
        text = ''
        words = jieba.cut(content)
        for word in words:
            text = text + ' ' + word
        return text.strip()
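The helper `contain` is not shown in this snippet; below is a minimal hypothetical stand-in (check whether the POS flag starts with one of the given prefixes), followed by a usage call. `pseg` is assumed to be `jieba.posseg`.

import jieba
import jieba.posseg as pseg

def contain(prefixes, flag):
    # hypothetical stand-in for the project's helper: does the POS flag start with any prefix?
    return any(flag.startswith(p) for p in prefixes)

print(cut_Text(u'我来到北京清华大学', nomial=True))   # keeps noun-like tokens, e.g. '北京 清华大学'
print(cut_Text(u'我来到北京清华大学', nomial=False))  # keeps every token, space-joined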
def cut_Dataset(data_set, parrel=False, nomial=False):
    """
    :param data_set: bunch-style dataset with a .data list of documents
    :param parrel: if True, cut the dataset in parallel (not available on Windows)
    :param nomial: if True, only noun-like words are kept
    :return: data_set with .data replaced by the segmented documents
    """
    from tqdm import tqdm
    data_cut = []
    start = time.time()
    print('cutting dataset......')
    if parrel:
        p = ThreadPool(9)
        data_cut = p.map(cut_Text, data_set.data)  # collect the results; nomial is not applied here
        p.close()
        p.join()
    else:
        for doc_content in tqdm(data_set.data):
            data_cut.append(cut_Text(doc_content, nomial))
    end = time.time()
    print('cutting runs %0.2f seconds.' % (end - start))
    data_set.data = data_cut