def create_speech(self):
    """Build the per-chat message corpus and fit a bag-of-words matrix on it.

    For every key in ``self.archives`` three dictionaries on the instance are
    (re)initialised:

    * ``self.speech[key]`` -- ``[texts, sender_ids]``, two parallel lists.
    * ``self.vectorizer[key]`` -- a fresh ``CountVectorizer(min_df=1)``.
    * ``self.mat[key]`` -- the result of ``fit_transform`` on the collected
      texts, or an empty list when nothing was collected for that key.

    Only keys below zero are populated from disk. Their log is read from
    ``<self.logpath>/<key>.gz``, decoded as UTF-8 and parsed as one JSON
    document per line; only the last 15000 lines are considered. A log that
    cannot be opened is reported on stdout and skipped. Consecutive accepted
    messages that share a sender id are merged into a single
    newline-joined entry.
    """
    # TODO: a blacklist of ids to be ignored is not implemented yet.
    history_limit = 15000  # only the newest lines of each log are used

    # Every key gets its own containers. dict.fromkeys(keys, []) would hand
    # the *same* list object to all keys, so mutating one would leak to all.
    self.speech = {key: [[], []] for key in self.archives}  # messages / ids / (maybe timestamps?)
    self.vectorizer = {key: CountVectorizer(min_df=1) for key in self.archives}
    self.mat = {key: [] for key in self.archives}

    for key in self.speech:
        if key >= 0:
            # Per the original note, non-negative keys are private messages;
            # no corpus is built for them right now.
            continue
        texts, sender_ids = self.speech[key]
        logfile = "{}.gz".format(os.path.join(self.logpath, str(key)))
        try:
            # The context manager guarantees the gzip handle is closed.
            with gzip.open(logfile) as archive:
                raw = archive.read()
        except IOError:
            print("{} not found".format(logfile))
            continue
        ziplines = raw.decode("utf-8").strip("\r\n").split("\n")[-history_limit:]

        prev_id = -1
        for msg_line in ziplines:
            if not msg_line.strip():
                # An empty log (or a stray blank line) has nothing to parse;
                # json.loads("") would raise.
                continue
            msg = Msg(json.loads(msg_line))
            text = msg.get_text()
            chat_id = msg.get_chat_id()
            if key != chat_id:
                # Blocks until the operator acknowledges the inconsistency.
                input("Error in your logfile (key {} / chat {})!".format(key, chat_id))
            sent_id = msg.get_sent_id()
            # Skip empty texts, texts starting with "/" or "!", edited
            # messages, blacklisted texts, texts for which find_name() is
            # truthy, and entries without chat or sender id.
            usable = (
                text
                and text[0] not in ["/", "!"]
                and msg.get_edit_date() == 0
                and not self.is_blacklisted(text)
                and not self.find_name(text)
                and chat_id
                and sent_id
            )  # sadly, @like will come through
            if not usable:
                continue
            if sent_id == prev_id:
                # Same sender as the previously accepted message: merge.
                texts[-1] += "\n{}".format(text)
            else:
                texts.append(text)
                sender_ids.append(sent_id)
            # NOTE(review): the original indentation was lost; the sender is
            # remembered only for accepted messages so that texts[-1] above
            # always belongs to prev_id -- confirm against the upstream file.
            prev_id = sent_id

        if texts:
            self.mat[key] = self.vectorizer[key].fit_transform(texts)
# Web-page navigation residue (not part of the code): "Comment list"
# and "Table of contents".