# abb1t.py -- Python source code snippet

def create_speech(self):
        """Build the per-archive message corpus and its bag-of-words matrix.

        For every key in ``self.archives`` this (re)initialises three dicts:

        * ``self.speech[key]``     -- ``[messages, sender_ids]`` (parallel lists)
        * ``self.vectorizer[key]`` -- a fresh ``CountVectorizer(min_df=1)``
        * ``self.mat[key]``        -- an empty list, replaced by the result of
          ``fit_transform`` once at least one message was collected

        Only negative keys are loaded from disk (non-negative keys are private
        chats according to the original author's note). The log is read from
        ``<self.logpath>/<key>.gz``; only its last 15000 lines are used and
        each non-blank line is parsed as JSON and wrapped in ``Msg``.

        A message is kept when it has text, does not start with ``/`` or
        ``!``, has an edit date of 0, is neither blacklisted nor matched by
        ``self.find_name``, and has a truthy chat id and sender id.
        Consecutive kept messages from the same sender are merged into one
        entry separated by newlines.
        """
        # TODO: a blacklist of ids to be ignored is not implemented yet.
        # Build each dict with its own value objects: dict.fromkeys(keys, [])
        # would make every key share one single list instance.
        self.speech = {key: [[], []] for key in self.archives}  # messages / ids / (maybe timestamps?)
        self.vectorizer = {key: CountVectorizer(min_df=1) for key in self.archives}
        self.mat = {key: [] for key in self.archives}

        for key in self.speech:
            if key >= 0:
                continue  # no corpus is built for private messages right now

            logfile = "{}.gz".format(os.path.join(self.logpath, str(key)))
            try:
                # Context manager guarantees the gzip handle is closed.
                with gzip.open(logfile) as log:
                    raw = log.read().decode("utf-8")
            except IOError:
                print("{} not found".format(logfile))
                continue
            ziplines = raw.strip("\r\n").split("\n")[-15000:]

            messages, sender_ids = self.speech[key]
            # None can never equal a kept sender id (those are truthy), so the
            # first kept message always starts a new entry.
            prev_id = None
            for msg_line in ziplines:
                if not msg_line.strip():
                    continue  # empty log or stray blank line: nothing to parse
                msg = Msg(json.loads(msg_line))
                text = msg.get_text()
                chat_id = msg.get_chat_id()
                if key != chat_id:
                    # Pauses until the operator acknowledges; processing of the
                    # line continues afterwards.
                    input("Error in your logfile (key {} / chat {})!".format(key, chat_id))
                sent_id = msg.get_sent_id()

                # Filters are evaluated in the same order as before.
                # Sadly, "@like" will still come through.
                if not text or text[0] in ("/", "!"):
                    continue
                if msg.get_edit_date() != 0:
                    continue
                if self.is_blacklisted(text) or self.find_name(text):
                    continue
                if not (chat_id and sent_id):
                    continue

                if sent_id == prev_id:
                    # Same sender as the previous kept message: merge.
                    messages[-1] += "\n{}".format(text)
                else:
                    messages.append(text)
                    sender_ids.append(sent_id)
                prev_id = sent_id

            if messages:
                self.mat[key] = self.vectorizer[key].fit_transform(messages)