def read(self, file_name, timelength):
    """Parse a comment file into tokenized records ordered by time.

    Every line of ``file_name`` that matches ``<d p="...">...</d>`` is
    segmented with ``pseg.cut``.  Tokens found in ``self.stop_words`` or
    carrying one of the excluded flags are dropped, and only comments that
    still have more than three tokens are kept.

    Args:
        file_name: Path of the text file to read.
        timelength: Passed through unchanged to ``self.store`` and returned
            to the caller; it is not interpreted here.

    Returns:
        A tuple ``(lines, timelength, vocabulary)`` where ``lines`` is a
        list of dicts with keys ``"time"`` (int), ``"text"`` (list of
        tokens) and ``"lineno"`` (1-based line number in the file), sorted
        by ``"time"``; and ``vocabulary`` is a dict mapping every retained
        token to ``0``.

    Side effects:
        Loads the jieba user dictionary from
        ``data/metadata/user_dict.txt`` and calls
        ``self.store(lines, timelength)``.
    """
    # Compiled once instead of once per input line.
    comment_pattern = re.compile(r'^<d p="(.+)">(.+)</d>')
    # Flags whose tokens are discarded (presumably jieba part-of-speech
    # tags -- confirm against the `pseg` import).  A frozenset gives O(1)
    # membership tests and is built a single time.
    excluded_flags = frozenset([
        "m", "w", "g", "c", "o", "p", "z", "q", "un", "e", "r", "x",
        "d", "t", "h", "k", "y", "u", "s", "uj", "ul", "eng",
    ])

    kept = []
    vocabulary = {}
    # The context manager guarantees the file handle is closed even if
    # parsing or segmentation raises.
    with open(file_name, "r") as f:
        jieba.load_userdict("data/metadata/user_dict.txt")
        for line_no, line in enumerate(f, 1):
            m = comment_pattern.match(line)
            if not m:
                continue
            # The first comma-separated field of the `p` attribute is the
            # timestamp; it is truncated to an integer.
            time_value = int(float(m.group(1).split(',')[0]))
            words = [
                word
                for word, flag in pseg.cut(m.group(2))
                if word not in self.stop_words and flag not in excluded_flags
            ]
            if len(words) <= 3:
                continue
            kept.append({"time": time_value, "text": words, "lineno": line_no})
            # NOTE(review): the original indentation was lost; the
            # vocabulary is assumed to be built only from retained
            # comments -- confirm against the upstream source.
            for word in words:
                vocabulary.setdefault(word, 0)

    lines = sorted(kept, key=lambda entry: entry["time"])
    self.store(lines, timelength)
    return lines, timelength, vocabulary
评论列表
文章目录