# module-level imports this method relies on
import csv
import os
import shutil
from datetime import datetime

import numpy as np
from scipy.sparse import dok_matrix


def discretize(self, time_slice_length):
    self.time_slice_length = time_slice_length
    # clean the data directory
    if os.path.exists('corpus'):
        shutil.rmtree('corpus')
    os.makedirs('corpus')
    # compute the total number of time-slices
    time_delta = (self.end_date - self.start_date)
    time_delta = time_delta.total_seconds() / 60
    self.time_slice_count = int(time_delta // self.time_slice_length) + 1
    self.tweet_count = np.zeros(self.time_slice_count)
    print(' Number of time-slices: %d' % self.time_slice_count)
    # create one empty file per time-slice; the with-block closes each handle
    for time_slice in range(self.time_slice_count):
        with open('corpus/' + str(time_slice), 'w') as dummy_file:
            dummy_file.write('')
    # word frequency matrices: one row per vocabulary entry, one column per
    # time-slice; dok_matrix supports efficient incremental updates
    # (int32 rather than a 16-bit short, so counts cannot overflow at 32,767)
    self.global_freq = dok_matrix((len(self.vocabulary), self.time_slice_count), dtype=np.int32)
    self.mention_freq = dok_matrix((len(self.vocabulary), self.time_slice_count), dtype=np.int32)
    with open(self.source_file_path, 'r') as input_file:
        csv_reader = csv.reader(input_file, delimiter=self.separator)
        header = next(csv_reader)
        text_column_index = header.index('text')
        date_column_index = header.index('date')
        for line in csv_reader:
            # map the tweet to a time-slice based on its timestamp
            tweet_date = datetime.strptime(line[date_column_index], "%Y-%m-%d %H:%M:%S")
            time_delta = (tweet_date - self.start_date)
            time_delta = time_delta.total_seconds() / 60
            time_slice = int(time_delta // self.time_slice_length)
            if time_slice < 0 or time_slice >= self.time_slice_count:
                continue  # ignore tweets outside [start_date, end_date]
            self.tweet_count[time_slice] += 1
            # tokenize the tweet and update word frequencies
            tweet_text = line[text_column_index]
            words = self.tokenize(tweet_text)
            mention = '@' in tweet_text
            for word in set(words):  # count each distinct word once per tweet
                word_id = self.vocabulary.get(word)
                if word_id is not None:
                    self.global_freq[word_id, time_slice] += 1
                    if mention:
                        self.mention_freq[word_id, time_slice] += 1
            # append the raw tweet text to its time-slice file
            with open('corpus/' + str(time_slice), 'a') as time_slice_file:
                time_slice_file.write(tweet_text + '\n')
    # convert to CSR once counts are final, for fast row slicing later
    self.global_freq = self.global_freq.tocsr()
    self.mention_freq = self.mention_freq.tocsr()
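
For context, here is a minimal sketch of how discretize() might be driven. The Corpus class name and its constructor are hypothetical illustrations; only the attributes and methods the snippet actually uses (source_file_path, separator, start_date, end_date, vocabulary, tokenize) are assumed to exist.

from datetime import datetime

# hypothetical construction; the real class may build its vocabulary differently
corpus = Corpus(source_file_path='tweets.csv', separator='\t')
corpus.start_date = datetime(2014, 11, 1, 0, 0, 0)
corpus.end_date = datetime(2014, 11, 30, 23, 59, 59)
corpus.discretize(30)  # partition the corpus into 30-minute time-slices
print(corpus.global_freq.shape)  # (vocabulary size, number of time-slices)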