ReadBulletScreen.py 文件源码-python代码片段

def read(self,file_name,timelength):
    """Parse a comment file into tokenised records sorted by time.

    Each input line is matched against ``<d p="...">...</d>``. For a
    matching line, the first comma-separated field of the ``p`` attribute
    is parsed as a float and truncated to an int ("time"), and the element
    text is segmented with ``pseg.cut``. Tokens are dropped when they are in
    ``self.stop_words`` or carry one of the excluded part-of-speech flags.
    Only comments with more than three remaining tokens are kept.

    Args:
        file_name: Path of the comment file to read.
        timelength: Passed through unchanged to ``self.store`` and to the
            caller (presumably the video length -- confirm with callers).

    Returns:
        A tuple ``(lines, timelength, vocabulary)`` where ``lines`` is a
        list of dicts with keys ``"time"``, ``"text"`` (list of tokens) and
        ``"lineno"`` (1-based line number in the file), sorted by
        ``"time"``; and ``vocabulary`` maps every kept token to ``0``.

    Raises:
        ValueError: If the first ``p`` field of a matching line is not a
            valid float literal.
    """
    # Compiled once up front rather than once per input line.
    pattern = re.compile(r'^<d p="(.+)">(.+)</d>')
    # Part-of-speech flags whose tokens are discarded; built once so the
    # per-token membership test is O(1).
    excluded_flags = frozenset((
        "m", "w", "g", "c", "o", "p", "z", "q", "un", "e", "r", "x",
        "d", "t", "h", "k", "y", "u", "s", "uj", "ul", "eng",
    ))

    records = []
    vocabulary = {}
    # The context manager guarantees the handle is closed even when parsing
    # or segmentation raises.
    with open(file_name, "r") as f:
        jieba.load_userdict("data/metadata/user_dict.txt")
        for line_no, line in enumerate(f, 1):
            m = pattern.match(line)
            if not m:
                continue
            # Time is parsed before tokenising so a malformed attribute
            # fails fast, exactly as the record literal used to evaluate.
            time = int(float(m.group(1).split(',')[0]))
            words = [
                word for word, flag in pseg.cut(m.group(2))
                if word not in self.stop_words and flag not in excluded_flags
            ]
            if len(words) <= 3:
                continue
            records.append({"time": time, "text": words, "lineno": line_no})
            for word in words:
                vocabulary.setdefault(word, 0)

    # sorted() is stable, so comments with equal time keep file order.
    lines = sorted(records, key=lambda e: e["time"])
    self.store(lines, timelength)
    return lines, timelength, vocabulary