def get_hot_noun_counts(source_file):
    """Count how often each "ns"-tagged word occurs in a text file.

    The file is read in full, every span matching the message-header
    pattern below is removed, and the remaining text is segmented with
    ``pseg.cut``. Only tokens whose ``flag`` equals ``"ns"`` are counted.

    NOTE(review): ``pseg`` is expected to be in scope at module level
    (presumably ``jieba.posseg``, where "ns" tags place names) -- confirm
    against the file's imports.

    Args:
        source_file: Path of the text file to analyse. It is opened in text
            mode with the platform default encoding, as before.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Words with equal counts keep the order of their first occurrence
        in the text. The list is empty when no "ns" token is found.
    """
    # Imported locally so the block stays self-contained regardless of the
    # file's top-level imports.
    from collections import Counter

    # Context manager guarantees the handle is closed even if read() fails.
    with open(source_file, "r") as f:
        data = f.read()

    # Header spans such as "2016-06-24 15:42:52 name(40**21)": a 10-char
    # date, a 7-8 char time, arbitrary text on the same line, then 5-11
    # digits followed by ")". Presumably chat-export message headers.
    re_pat = r'[\d-]{10}\s[\d:]{7,8}\s+[^\n]+\d{5,11}\)'

    # Dropping the headers in one pass is equivalent to splitting on the
    # pattern and concatenating the pieces (the pattern never matches an
    # empty string and has no capture groups).
    content = re.sub(re_pat, "", data)

    place_words = [
        token.word
        for token in pseg.cut(content.strip())
        if token.flag == "ns"
    ]

    # Counter tallies in a single pass; most_common() with no argument
    # returns every (word, count) pair ordered by descending count.
    return Counter(place_words).most_common()
# Comment list
# Table of contents