import jieba.posseg as pseg

def cut(contents):
    # POS-tag each line and rejoin the words with single spaces
    split_contents = []
    for line in contents:
        res = pseg.cut(line.strip())
        split_line = ' '.join([w.word for w in res])
        split_contents.append(split_line)
    return split_contents
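A minimal usage sketch for this helper, assuming `jieba.posseg` is imported as `pseg` in the same module; the sample sentences are arbitrary:

lines = ["我来到北京清华大学", "他来到了网易杭研大厦"]
for tokenized in cut(lines):
    print(tokenized)  # each line comes back as space-separated words, e.g. "我 来到 北京 清华大学"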
Example source code for Python's cut()
def testDefaultCut(self):
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
        result = list(result)
        assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testDefaultCut", file=sys.stderr)
def testCutAll(self):
    for content in test_contents:
        result = jieba.cut(content, cut_all=True)
        assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutAll error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testCutAll", file=sys.stderr)
def testSetDictionary(self):
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
        result = list(result)
        assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
def testPosseg(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
def testDefaultCut_NOHMM(self):
    for content in test_contents:
        result = jieba.cut(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
        result = list(result)
        assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testDefaultCut_NOHMM", file=sys.stderr)
def cuttest(test_sent):
    result = pseg.cut(test_sent, HMM=False)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')
    print("")
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                    if the POS of w is not in this list, it will be filtered.
        - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                    if False, return a list of words
    """
    self.pos_filt = frozenset(allowPOS)
    g = UndirectWeightedGraph()
    cm = defaultdict(int)
    words = tuple(self.tokenizer.cut(sentence))
    for i, wp in enumerate(words):
        if self.pairfilter(wp):
            for j in xrange(i + 1, i + self.span):
                if j >= len(words):
                    break
                if not self.pairfilter(words[j]):
                    continue
                if allowPOS and withFlag:
                    cm[(wp, words[j])] += 1
                else:
                    cm[(wp.word, words[j].word)] += 1
    for terms, w in cm.items():
        g.addEdge(terms[0], terms[1], w)
    nodes_rank = g.rank()
    if withWeight:
        tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags
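The same algorithm is exposed through jieba's bundled helper, so a hedged quick-start (the input text is arbitrary):

import jieba.analyse

text = "线程是程序执行时的最小单位，它是进程的一个执行流。"
# top 5 keywords limited to nouns and verbs, returned with their TextRank weights
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, round(weight, 4))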
def get_hot_noun_counts(source_file):
    f = open(source_file, "r")
    data = f.read()
    # matches chat-log headers such as '2016-06-24 15:42:52 <nickname>(40**21)'
    re_pat = r'[\d-]{10}\s[\d:]{7,8}\s+[^\n]+\d{5,11}\)'
    # li = re.findall(re_pat, data)
    li_content = re.split(re_pat, data)
    s = ""
    for l in li_content:
        s = s + l
    seg_list = pseg.cut(s.strip())
    # collect every place-name token (POS flag 'ns')
    lists = []
    for w in seg_list:
        if w.flag == "ns":
            lists.append(w.word)
    seg_list_norepeat = set(lists)
    # count the occurrences of each distinct place name
    word_set = {}
    for seg in seg_list_norepeat:
        count = 0
        for ss in lists:
            if ss == seg:
                count += 1
        word_set[seg] = count
    word_tuple_sort = sorted(word_set.items(), key=lambda e: e[1], reverse=True)
    return word_tuple_sort
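The nested counting loop above is quadratic in the number of extracted place names; collections.Counter produces the same ranking in one pass. A sketch under that assumption, where `words` plays the role of the `lists` variable built above:

from collections import Counter

def count_hot_nouns(words):
    # most_common() returns (word, count) pairs sorted by count, descending,
    # matching the tuple list returned by get_hot_noun_counts()
    return Counter(words).most_common()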
def cut(self, text, cut_all=False):
    '''
    @summary: segment text into words
    ---------
    @param text: the text to segment
    @param cut_all: True for full mode, False for accurate mode (default).
        Full mode scans out every word the dictionary can form, which is fast
        but cannot resolve ambiguity; accurate mode splits the sentence as
        precisely as possible and is better suited to text analysis.
    ---------
    @result: list of words with stop words removed
    '''
    result = list(jieba.cut(text, cut_all=cut_all))
    result = self.__del_stop_key(result)
    return result
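A usage sketch for this method; `Segmenter` is a placeholder name, since the snippet does not show which class defines cut():

seg = Segmenter()  # hypothetical class that defines cut() and __del_stop_key()
print(seg.cut("我来到北京清华大学"))                # accurate mode, stop words removed
print(seg.cut("我来到北京清华大学", cut_all=True))  # full mode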
def __is_clause_pattern3(self, the_clause, seg_result):
    for a_phrase in self.__phrase_dict:
        keys = a_phrase.keys()
        to_compile = a_phrase["key"].replace("……", "[\u4e00-\u9fa5]*")
        if "start" in keys:
            to_compile = to_compile.replace("*", "{" + a_phrase["start"] + "," + a_phrase["end"] + "}")
        if "head" in keys:
            to_compile = a_phrase["head"] + to_compile
        match = re.compile(to_compile).search(the_clause)
        if match is not None:
            can_continue = True
            pos = [flag for word, flag in posseg.cut(match.group())]
            if "between_tag" in keys:
                if a_phrase["between_tag"] not in pos and len(pos) > 2:
                    can_continue = False
            if can_continue:
                for i in range(len(seg_result)):
                    if seg_result[i].word in match.group():
                        try:
                            if seg_result[i + 1].word in match.group():
                                return self.__emotional_word_analysis(
                                    a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                    [x for x, y in seg_result], i)
                        except IndexError:
                            return self.__emotional_word_analysis(
                                a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                [x for x, y in seg_result], i)
    return ""
def extract_keyword_by_thulac(self):
    sents = []
    comm_list = self.dao.get_hotel_comments()
    # split every hotel comment into sentences
    for comm in comm_list:
        sents.extend(normal.get_sentences(comm[2]))
    print "length of sentences:%d" % len(sents)
    # POS-tag each sentence with THULAC; tokens come back as "word_tag"
    pos_sents = []
    for sent in sents:
        try:
            pos_sents.append(map(lambda x: x.split("_"), self.thu.cut(sent.encode("utf-8"))))
        except:
            print sent
            continue
    print "length of pos_sents:%d" % len(pos_sents)
    # count how often each noun (tag "n") appears
    print "counting"
    noun_dict = {}
    for pos_sent in pos_sents:
        for word in pos_sent:
            if word[1] == "n":
                if word[0] not in noun_dict:
                    noun_dict[word[0]] = 1
                else:
                    noun_dict[word[0]] = noun_dict[word[0]] + 1
    a = sorted(noun_dict.iteritems(), key=lambda asd: asd[1], reverse=True)
    return a
def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    """Segment a piece of text and return the words as a list.
    Keyword arguments:
    lower -- whether to lower-case the words (relevant for English)
    use_stop_words -- if True, drop any word found in self.stop_words
    use_speech_tags_filter -- if True, keep only words whose POS flag is in
                              self.default_speech_tag_filter; otherwise keep all words
    """
    text = util.as_text(text)
    jieba_result = pseg.cut(text)
    if use_speech_tags_filter == True:
        jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]
    # drop punctuation/whitespace tokens (POS flag 'x')
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]
    if lower:
        word_list = [word.lower() for word in word_list]
    if use_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]
    return word_list
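The core of segment(), dropping tokens whose POS flag is 'x' and lower-casing the rest, also works standalone with jieba.posseg; a minimal sketch with a made-up stop-word set:

import jieba.posseg as pseg

def simple_segment(text, stop_words=frozenset()):
    # keep non-punctuation tokens, lower-cased, that are not stop words
    words = [w.word.strip().lower() for w in pseg.cut(text) if w.flag != 'x']
    return [w for w in words if w and w not in stop_words]

print(simple_segment("这是一个测试句子。"))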
def seg(self, sentence):
    words = list()
    tags = list()
    for item in pseg.cut(sentence):
        words.append(item.word)
        tags.append(item.flag)
    return words, tags
def jieba_cut():
    # segment the positive-word dictionary pos_all_dict.txt
    fp_pos = open("hownet/pos_all_dict.txt", "r")                               # source word list
    fp_pos_cut = codecs.open('hownet/pos_all_cut.txt', "w+", encoding='UTF-8')  # segmented output
    contents = fp_pos.readlines()
    for content in contents:
        word = content.decode("utf-8")  # bytes -> unicode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)  # drop the '/x' tag of the trailing newline token
        fp_pos_cut.write(str_tag)
    fp_pos.close()
    fp_pos_cut.close()
    # segment the negative-word dictionary neg_all_dict.txt
    fp_neg = open("hownet/neg_all_dict.txt", "r")                               # source word list
    fp_neg_cut = codecs.open('hownet/neg_all_cut.txt', "w+", encoding='UTF-8')  # segmented output
    contents = fp_neg.readlines()
    for content in contents:
        word = content.decode("utf-8")  # bytes -> unicode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)  # drop the '/x' tag of the trailing newline token
        fp_neg_cut.write(str_tag)
    fp_neg.close()
    fp_neg_cut.close()
# segment the crawled Weibo posts, keeping only the nouns
def handel_weibo_data():
    # read the crawled Weibo data and extract the noun tokens of each post
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    weibo_data = []  # one list of nouns per post, e.g. [[...], [...], [...]]
    for line in fp.readlines():
        contents = []
        line = line.strip()
        line = line.decode('utf-8')  # bytes -> unicode
        seg_lines = pseg.cut(line)   # POS-tag the post
        for seg_line in seg_lines:
            # keep common nouns and proper nouns (person, place, organization, other)
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)
        weibo_data.append(contents)
    fp.close()
    return weibo_data
def segmentation(sentence):
    seg_list = jieba.cut(sentence)
    seg_result = []
    for w in seg_list:
        seg_result.append(w)
    # print seg_result[:]
    return seg_result
# build a custom analyzer for the vectorizer: strip punctuation, then re-segment
def build_analyzer(self):
    def analyzer(doc):
        words = pseg.cut(doc)
        new_doc = ''.join(w.word for w in words if w.flag != 'x')
        words = jieba.cut(new_doc)
        return words
    return analyzer
# compute TF-IDF weights using this analyzer
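Presumably the analyzer above is plugged into scikit-learn's TF-IDF vectorizer; a sketch of that hookup, where the subclass name and corpus are made up:

import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer

class ChineseTfidfVectorizer(TfidfVectorizer):
    # same idea as build_analyzer() above: strip punctuation (flag 'x'), then re-segment
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            return jieba.cut(new_doc)
        return analyzer

corpus = ["我来到北京清华大学", "他来到了网易杭研大厦"]
matrix = ChineseTfidfVectorizer().fit_transform(corpus)
print(matrix.shape)  # (2, number_of_terms)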