def parse(self, in_file, out_file):
    output_file = open(out_file, 'w')
    with open(in_file, 'r') as file:
        line = file.readline()  # read (and effectively skip) the first line
        i = 0
        for line in file.readlines():
            sentence = ""
            line = line.strip().split('\t')
            for word, flag in pseg.cut(line[1].strip()):
                if flag == 'x':  # 'x' is jieba's tag for punctuation and other non-words
                    continue
                else:
                    sentence = sentence + word + " "
            output_file.write(sentence.strip() + "\n")
            i += 1
            if i % 100 == 0:
                print('Handle lines %d' % i)
    output_file.close()
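The filter above relies on jieba tagging non-word tokens (punctuation, whitespace) with the flag 'x'. For reference, a minimal standalone sketch of the same pattern, assuming a hypothetical tab-separated input file whose text sits in the second column, as the method expects:

```python
import jieba.posseg as pseg

def clean_text(raw):
    # keep every token whose POS flag is not 'x' (jieba's tag for non-words)
    return " ".join(word for word, flag in pseg.cut(raw) if flag != 'x')

with open("corpus.tsv", encoding="utf-8") as fin:   # hypothetical input file
    next(fin)                                        # skip the header line
    for line in fin:
        text = line.rstrip("\n").split("\t")[1]
        print(clean_text(text))
```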
def cut_for_property(self, text):
    '''
    @summary: segment the text and keep each word's part-of-speech tag
    ---------
    @param text: the text to segment
    ---------
    @result: a list like [(text1, property1)...(textN, propertyN)]
    '''
    words_list = []
    words = pseg.cut(text)
    for word in words:
        if word.word not in self._stop_words:
            words_list.append((word.word, word.flag))
    return words_list
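The same (word, flag) collection can be written as a one-line comprehension outside the class; a small sketch with a made-up stop-word set and function name standing in for the class's `_stop_words` and method:

```python
import jieba.posseg as pseg

STOP_WORDS = {"的", "了", "是"}  # hypothetical stop words

def cut_with_flags(text, stop_words=STOP_WORDS):
    # pseg.cut yields pair objects exposing .word and .flag
    return [(w.word, w.flag) for w in pseg.cut(text) if w.word not in stop_words]
```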
def extract_keyword(self):
    sents = []
    comm_list = self.dao.get_hotel_comments()
    # split every comment into individual sentences
    for comm in comm_list:
        sents.extend(normal.get_sentences(comm[2]))
    print "length of sentences:%d" % len(sents)
    # POS-tag every sentence
    pos_sents = []
    for sent in sents:
        pos_sents.append(pseg.cut(sent))
    print "length of pos_sents:%d" % len(pos_sents)
    # count the nouns and sort them by frequency
    print "counting"
    noun_dict = {}
    for pos_sent in pos_sents:
        for key, type in pos_sent:
            if type == "n":
                if key not in noun_dict:
                    noun_dict[key] = 1
                else:
                    noun_dict[key] = noun_dict[key] + 1
    a = sorted(noun_dict.iteritems(), key=lambda asd: asd[1], reverse=True)
    return a
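The noun-counting loop above is equivalent to a `collections.Counter` over the 'n'-tagged tokens; a Python 3 sketch of that shortcut (the function name is made up, not the project's own code):

```python
from collections import Counter
import jieba.posseg as pseg

def count_nouns(sentences):
    # most frequent nouns first, as (word, count) pairs
    counter = Counter(w.word
                      for sent in sentences
                      for w in pseg.cut(sent)
                      if w.flag == "n")
    return counter.most_common()
```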
def handel_weibo(filename):
    fp = open("f://emotion/mysite/Label_extract/weibo_corpus/" + filename, 'r')
    contents = []
    for line in fp.readlines():  # read the corpus line by line
        line = line.strip()
        line = line.decode('utf-8')
        seg_lines = pseg.cut(line)  # POS segmentation
        for seg_line in seg_lines:  # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)  # collect the word
    #print "length:", len(contents)
    fp.close()
    # write the extracted words to a new file, one per line
    fp_handel = open('f://emotion/mysite/Label_extract/weibo_corpus_handel/handel_' + filename, 'w+')
    for content in contents:
        fp_handel.write(content)
        fp_handel.write('\n')
    fp_handel.close()
# 2. read the test weibo corpus and extract its noun-like words
def read_test_list():
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    contents = []
    for line in fp.readlines():  # read the corpus line by line
        line = line.strip()
        line = line.decode('utf-8')
        seg_lines = pseg.cut(line)  # POS segmentation
        for seg_line in seg_lines:  # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)  # collect the word
    fp.close()
    #for w in contents:
    #    print w
    # join the words into one space-separated string
    str_test = ' '.join(contents)
    return str_test
# 5. run TF-IDF over chinese_weibo.txt and keep the top 100 words
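Space-joined token strings such as `str_test` are the input format scikit-learn's `TfidfVectorizer` expects, so the TF-IDF step hinted at in the comment could look roughly like the sketch below (function and variable names are made up, and `get_feature_names_out` assumes a recent scikit-learn):

```python
from sklearn.feature_extraction.text import TfidfVectorizer

def top_tfidf_terms(documents, top_n=100):
    # documents: list of space-joined, pre-segmented strings
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(documents)
    # rank terms by their total TF-IDF weight across the corpus
    scores = tfidf.sum(axis=0).A1
    terms = vectorizer.get_feature_names_out()
    return sorted(zip(terms, scores), key=lambda t: t[1], reverse=True)[:top_n]
```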
def MatchItem(self, input, start, end, muststart, mode=None):
    self.LogIn(input, start, end)
    pos = start
    sword = None  # nothing matched yet
    if end is None:
        end = len(input)
    seg_list = pseg.cut(input[start:end] if self.Len == -1 else input[start:start + self.Len])
    for word, flag in seg_list:
        if self.Pos is None:
            sword = word
            break
        else:
            if flag in self.Pos:
                sword = word
                break
        pos += len(word)  # advance past the non-matching word
    if pos < 0 or (muststart == True and pos != start):
        self.LogOut(None)
        return start + self.Len if self.Len < 0 else tnpy.int_max
    self.LogOut(sword)
    m = tnpy.MatchResult(self, sword, pos)
    m.rstr = sword
    return m
def cut_Text(content, nomial=False):
    """
    :param content: string
    :param nomial: if nomial is True, only noun-like words will remain
    :return: a text whose format is 'a b c d'
    """
    if nomial:
        text = ''
        words = pseg.cut(content)
        for word in words:
            if contain(['n'], word.flag):
                text = text + ' ' + word.word
        return text.strip()
    else:
        text = ''
        words = jieba.cut(content)
        for word in words:
            text = text + ' ' + word
        return text.strip()
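A minimal usage sketch, assuming the module-level helpers referenced above (`jieba`, `pseg`, and the prefix-matching `contain`) are available; the sentence is an arbitrary example:

```python
sentence = "今天的天气非常好"               # any Chinese sentence
print(cut_Text(sentence))                # all words, space separated
print(cut_Text(sentence, nomial=True))   # noun-like words only
```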
def cut_Dataset(data_set, parrel=False, nomial=False):
    """
    :param data_set: bunch of Dataset
    :param parrel: if True, cut the dataset in parallel (not available on Windows)
    :param nomial: if nomial is True, only noun-like words will remain
    :return: data_set after cutting
    """
    from tqdm import tqdm
    data_cut = []
    start = time.time()
    print('cutting dataset......')
    if parrel:
        p = ThreadPool(9)
        data_cut = p.map(cut_Text, data_set.data)  # collect the mapped results
        p.close()
        p.join()
    else:
        for doc_content in tqdm(data_set.data):
            data_cut.append(cut_Text(doc_content, nomial))
    end = time.time()
    print('cutting runs %0.2f seconds.' % (end - start))
    data_set.data = data_cut
def splitWord(self, content):
    segs = pseg.cut(str(content))
    result = []
    for word, type in segs:
        WORD = Word()
        if self.wordtypeDict.has_key(word):
            WORD.setword(word)
            WORD.settype(self.wordtypeDict[word])
            WORD.setfreq(self.wordfreqDict[word])
        else:
            WORD.setword(word)
            WORD.settype(type)
        # print "word ", word
        result.append(WORD)
    return result
def get_word_list(self, text, lower=True, strip_stop_words=True, use_tag_filter=False):
    text = util.as_text(text)
    jieba_result = pseg.cut(text)
    if use_tag_filter:
        jieba_result = [
            w for w in jieba_result if w.flag in self.default_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]
    if lower:
        word_list = [word.lower() for word in word_list]
    if strip_stop_words:
        word_list = [word.strip()
                     for word in word_list if word.strip() not in self.stop_words]
    return word_list
def load(self):
    from gensim.models import Word2Vec
    # in-memory store of candidate sentences
    self.link_database = []
    # load the trained word2vec model
    self.vecmodel = Word2Vec.load(self.model_file)
    log.info('word2vec model loaded')
    log.info('loading sentence file')
    with open(self.txt_file) as fp:
        senten_list = fp.readlines()
    log.debug("senten%s", senten_list)
    for senten_txt in senten_list:
        self.link_database.append(Senten2vec(senten_txt))
    log.info('segmenting the candidate sentences')
    for link in self.link_database:
        link.sentence_word = (set(jieba.cut(link.sentence)))
    for link in self.link_database:
        # keep only words that exist in the word2vec vocabulary (gensim < 4 API)
        link.sentence_vec = {word for word in link.sentence_word if word in self.vecmodel.wv.index2word}
    log.info('sentence vectors prepared')
# compute the similarity between an input sentence and a candidate question's word set
def juziSim_vec(self, intxt, questionWordset, posWeight=None):
    # similarity between the input sentence and a candidate question's word set
    if posWeight == None:
        log.warning('there is no posWeight')
        return 0
    intxtSet = set(list(pseg.cut(intxt)))
    if not len(intxtSet):
        return 0
    simWeight = 0
    totalWeight = 0
    for word, pos in intxtSet:
        if word in self.vecmodel.wv.index2word:
            wordPosWeight = posWeight.get(pos, 1)  # POS-dependent weight, default 1
            totalWeight += wordPosWeight
            wordMaxWeight = 0
            for t in questionWordset:
                # best similarity between this word and any word of the question
                tmp = self.vecmodel.wv.similarity(word, t)
                if wordMaxWeight < tmp:
                    wordMaxWeight = tmp
            simWeight += wordPosWeight * wordMaxWeight
    if totalWeight == 0:
        return 0
    return simWeight / totalWeight
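In effect the score is a POS-weighted average, over the words of `intxt`, of each word's best similarity against the question's word set. A dependency-free sketch of the same scheme, with a plain dict of toy vectors standing in for the gensim model:

```python
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def weighted_sim(words_with_pos, question_words, vectors, pos_weight):
    # words_with_pos: [(word, pos), ...] from the segmented input sentence
    # vectors: dict mapping word -> numpy vector (stands in for model.wv)
    sim_total, weight_total = 0.0, 0.0
    for word, pos in words_with_pos:
        if word not in vectors:
            continue
        w = pos_weight.get(pos, 1)
        best = max((cosine(vectors[word], vectors[q])
                    for q in question_words if q in vectors), default=0.0)
        sim_total += w * best
        weight_total += w
    return sim_total / weight_total if weight_total else 0.0
```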
def __call__(self, question):
    # print(question.questionSentence)
    qSentence = question.questionSentence
    # question.wordsToken = list(jieba.cut(qSentence))
    question.wordsToken, question.posToken = getPosToken(qSentence)
    assert len(question.wordsToken) == len(question.posToken)
    # print 'Length words Token = %d'%(len(question.wordsToken))
    # print 'Length pos token = %d'%(len(question.posToken))
    question.keyWordToken = list(jieba.analyse.extract_tags(qSentence, topK=5))
    # print ' '.join(question.keyWordToken)
    # dependency = parser.parse(words).next()
    # print '/'.join(question.wordsToken)
    # for word, flag in question.posToken:
    #     print('%s %s'%(word, flag))
    question.questionType, question.answerType = getQuestionType(question.questionSentence)
    question.getAnswerTemp()
    # my_print(question.answerTemp)
    # print question.answerRe
def ansFind(wikiList, typeInfo, Ques, obj):
    wordList = convert.solve(Ques)
    keyList = convert.getKeyWords(wordList)
    # drop auxiliary words (u*), punctuation (x*) and prepositions (p*)
    wordList = [w for w in wordList
                if not (w[1].startswith("u") or w[1].startswith("x") or w[1].startswith("p"))]
    sourceList = []
    for i in range(len(wikiList)):
        words = pseg.cut(wikiList[i])
        relevantList = []
        for w in words:
            wordsGroup = [w.word, w.flag]
            relevantList.append(wordsGroup)
        sourceList.append(relevantList)
    typeStr = ansExtract.getTypeStr(typeInfo)
    ansList = ansExtract.check(sourceList, wordList, typeStr, typeInfo, obj)
    return ansDecide.chooseAns(ansList, typeStr, typeInfo, obj)
data_process.py (project: cnn-svm-chinese-text-classification, author: zpppy)
def jiebafenci(all_the_text):
    re = ""
    relist = ""
    words = pseg.cut(all_the_text)
    count = 0
    for w in words:
        flag = w.flag  # POS tag
        tmp = w.word   # the word itself
        #print "org: "+tmp
        # \u4e00-\u9fa5 is the Unicode range of common CJK characters: keep only
        # multi-character words whose POS is not filtered and that start with a Chinese character
        if len(tmp) > 1 and len(flag) > 0 and flag[0] not in flag_list and tmp[0] >= u'\u4e00' and tmp[0] <= u'\u9fa5':
            re = re + " " + w.word
    re = re.replace("\n", " ").replace("\r", " ")
    if len(re) > 40:
        relist = re
        relist = relist + "\n"
    return relist
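The range test keeps only tokens whose first character falls in the common CJK block U+4E00-U+9FA5, so ASCII words and digits are dropped; a quick standalone illustration on hypothetical tokens:

```python
def starts_with_cjk(token):
    # True when the first character is a common CJK ideograph
    return len(token) > 0 and u'\u4e00' <= token[0] <= u'\u9fa5'

print(starts_with_cjk(u"中文"))  # True
print(starts_with_cjk(u"abc"))   # False
print(starts_with_cjk(u"123"))   # False
```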
data_process.py (project: cnn-svm-chinese-text-classification, author: zpppy)
def getTrainData(inpath, outfile):
    i = 0
    for filename in os.listdir(inpath):
        fw = open(outfile + str(i) + ".cut", "w")  # one segmented output file per input document
        i = i + 1
        file_object = open(inpath + "\\" + filename, 'r', encoding='UTF-8')
        try:
            all_the_text = file_object.read()
            #all_the_text = all_the_text.decode("gb2312").encode("utf-8")
            pre_text = jiebafenci(all_the_text)
            pre_text.encode('UTF-8')
            if len(pre_text) > 30:
                fw.write(pre_text)
        except:
            print('@'*20)
            pass
        finally:
            file_object.close()
            fw.close()
#['C000008', 'C000010', 'C000013', 'C000014', 'C000016', 'C000020', 'C000022','C000023', 'C000024']
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)                    # generator of words
    raw_seq_list = jieba.lcut(raw)              # list of words
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)            # (word, flag) pairs
    for word, flag in raw_with_ictclas:
        print word, flag
def jaccard_similarity_score(context1, context2, flag1, flag2):
    #print 'context1', context1
    try:
        if flag1 and len(context1) != 0:
            # re-segment the last element of context1 into (word, flag) pairs
            temp = context1[-1]
            context1.pop()
            context1 += list(pseg.cut(temp))
        if flag2 and len(context2) != 0:
            temp = context2[-1]
            context2.pop()
            context2 += list(pseg.cut(temp))
    except:
        pass
    # build indicator vectors over the union of both contexts
    mySet = set(context1 + context2)
    a1 = []
    a2 = []
    for item in mySet:
        if item in context1:
            a1.append(1)
        else:
            a1.append(0)
        if item in context2:
            a2.append(1)
        else:
            a2.append(0)
    #print sklearn.metrics.jaccard_similarity_score(a1,a2)
    return sklearn.metrics.jaccard_similarity_score(a1, a2)
# contextSim between element[i] and element[j]
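On the indicator vectors built above, every position is 1 in at least one vector, so the positions where both are 1 are exactly the shared items and the intended quantity is |intersection| / |union| of the two contexts, which can also be computed directly. A pure-Python sketch of that quantity on toy token lists (no scikit-learn needed):

```python
def jaccard(context1, context2):
    # |intersection| / |union| of the two token collections
    s1, s2 = set(context1), set(context2)
    union = s1 | s2
    return len(s1 & s2) / len(union) if union else 0.0

print(jaccard(["天气", "很", "好"], ["天气", "不错"]))  # 1 shared / 4 total = 0.25
```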
def parse():
    """parse the comments"""
    import jieba
    import jieba.posseg as pseg
    # Load User's Dictionary
    path_list = os.getcwd().split('/')
    path_list.append("dict.txt")
    dict_path = '/'.join(path_list)
    jieba.load_userdict(dict_path)
    # Dismiss These Flags
    dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f',
               'ud', 'ug', 'uv']
    comments = Comment.query.all()
    for comment in comments:
        word_list = []
        pseg_cut = pseg.cut(comment.body)
        for word, flag in pseg_cut:
            if flag not in dismiss:
                word_list.append(word)
        comment.parsed = '/'.join(word_list)
        db.session.add(comment)
        print "Comment %04d Parsed!" % comment.id
    db.session.commit()
    print "ALL DONE!"
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with the default value of the word segmentation tool.
    Args:
        pattern: 'w' - words only, 'k' - top keyword, 't' - top-10 keywords,
            'wf' - (word, flag) pairs, 'tf' - (word, flag) pairs restricted to the top-10 keywords.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        #     if item.word not in punctuation_all]
        # Modified 2017.4.27
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    # words not found in the synonym dictionary carry a short plain POS tag;
                    # re-tag them by cutting the word on its own
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    """Segment the text and return the resulting words as a list.
    Keyword arguments:
    lower -- whether to lowercase the words
    use_stop_words -- if True, words in the stop-word list are removed from the result
    use_speech_tags_filter -- whether to filter by POS tag; if True, only words whose
        tag is in self.default_speech_tag_filter are kept
    """
    text = util.as_text(text)
    jieba_result = pseg.cut(text)
    if use_speech_tags_filter == True:
        jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]
    # drop punctuation ('x') and empty tokens
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]
    if lower:
        word_list = [word.lower() for word in word_list]
    if use_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]
    return word_list
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                    if the POS of w is not in this list, it will be filtered.
        - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                    if False, return a list of words
    """
    self.pos_filt = frozenset(allowPOS)
    g = UndirectWeightedGraph()
    cm = defaultdict(int)
    words = tuple(self.tokenizer.cut(sentence))
    for i, wp in enumerate(words):
        if self.pairfilter(wp):
            for j in xrange(i + 1, i + self.span):
                if j >= len(words):
                    break
                if not self.pairfilter(words[j]):
                    continue
                if allowPOS and withFlag:
                    cm[(wp, words[j])] += 1
                else:
                    cm[(wp.word, words[j].word)] += 1
    for terms, w in cm.items():
        g.addEdge(terms[0], terms[1], w)
    nodes_rank = g.rank()
    if withWeight:
        tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags
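This method mirrors jieba's built-in TextRank keyword extractor, which application code usually reaches through `jieba.analyse.textrank`; a short usage sketch (the sentence is an arbitrary example):

```python
import jieba.analyse

text = "中文分词是自然语言处理的基础任务之一"   # any Chinese text
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True,
                                           allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, weight)
```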
def cut(filename1, filename2):  # segment every line of filename1 and write the result to filename2
    f = open(filename2, 'w')
    for line in open(filename1):
        res = pseg.cut(line.strip())
        split_line = ' '.join([w.word for w in res]) + '\n'
        f.write(split_line.encode('utf-8'))
    f.close()
    # print '%s split successful' %(filename1)
def main():
    source_path, target_path = sys.argv[1], sys.argv[2]
    source_files, target_files = getFileList(source_path, target_path)
    # print fileList
    for filename1, filename2 in zip(source_files, target_files):
        cut(filename1, filename2)
def cut(contents):  # segment each line of text
    split_contents = []
    for line in contents:
        res = pseg.cut(line.strip())
        split_line = ' '.join([w.word for w in res])
        split_contents.append(split_line)
    return split_contents
def main():
    source_file = 'law_text.txt'
    law_text_list = readFromFile(source_file)
    print len(law_text_list)
    split_contents = cut(law_text_list)
    # cPickle.dump(split_contents,open('split_law_text.pkl','wb'))
    print len(split_contents)
    # for item in law_text_list:
    #     print item
    print law_text_list[1].strip()
    print split_contents[1].strip()