def read(self, file_name, POS_tag):
    """Parse ``<d p="...">text</d>`` lines of a file into token records.

    Each line that matches the pattern is split into its comma-separated
    ``p`` attribute fields and its text. The text is segmented with
    ``pseg.cut`` and tokens are dropped when they are in
    ``self.stop_words``, when their part-of-speech flag is in ``POS_tag``,
    or when they are a single character long. A line is kept only if at
    least three tokens survive.

    Args:
        file_name: Path of the file to read.
        POS_tag: Container of part-of-speech flags; tokens carrying one of
            these flags are discarded.

    Returns:
        A list of dicts sorted by ascending ``"time"``, each with the keys
        ``"time"`` (first ``p`` field, parsed as float and truncated to
        int), ``"text"`` (list of kept tokens), ``"lineno"`` (1-based line
        number in the file) and ``"user"`` (seventh ``p`` field).

    Raises:
        ValueError: If the first ``p`` field of a matching line is not
            numeric.
        IndexError: If a matching line has fewer than seven ``p`` fields.
    """
    # Compile once instead of on every line of the input.
    pattern = re.compile('^<d p="(.+)">(.+)</d>')
    # NOTE(review): the user dictionary path is relative to the current
    # working directory -- confirm callers always run from the project root.
    jieba.load_userdict("data/metadata/user_dict.txt")

    records = []
    # The context manager guarantees the file handle is closed, even when
    # a malformed line raises part-way through.
    with open(file_name, "r") as f:
        for line_no, line in enumerate(f, 1):
            m = pattern.match(line)
            if not m:
                continue

            info = m.group(1).split(',')
            time_sec = int(float(info[0]))
            user = info[6]

            # Drop stop words, excluded POS flags and one-character tokens.
            tokens = [
                word
                for word, flag in pseg.cut(m.group(2))
                if word not in self.stop_words
                and flag not in POS_tag
                and len(word) > 1
            ]

            # Keep only lines that still carry at least three tokens.
            if len(tokens) < 3:
                continue

            print(tokens)  # progress/debug output kept from the original
            records.append({
                "time": time_sec,
                "text": tokens,
                "lineno": line_no,
                "user": user,
            })

    lines = sorted(records, key=lambda rec: rec["time"])
    # Call form prints the same text on Python 2 and Python 3.
    print(len(lines))
    return lines
评论列表
文章目录