# corpus.py source file -- Python code snippet

def discretize(self, time_slice_length):
        """Partition the source corpus into fixed-length time-slices.

        Reads the CSV file at ``self.source_file_path`` (delimited by
        ``self.separator``; its header must contain ``text`` and ``date``
        columns, with dates formatted as ``%Y-%m-%d %H:%M:%S``) and, for
        every tweet:

        * increments the tweet counter of the time-slice it falls into;
        * increments, once per distinct in-vocabulary word, the global
          frequency of that word in that time-slice, and also its mention
          frequency when the tweet text contains ``'@'``;
        * appends the raw tweet text to the file ``corpus/<time_slice>``.

        Side effects: the ``corpus`` directory (relative to the current
        working directory) is deleted and recreated, and the attributes
        ``time_slice_length``, ``time_slice_count``, ``tweet_count``,
        ``global_freq`` and ``mention_freq`` are (re)assigned. The two
        frequency matrices are left in CSR format, shaped
        ``(len(self.vocabulary), time_slice_count)``.

        NOTE(review): tweet dates are not validated against
        ``[self.start_date, self.end_date]``; the code appears to assume
        every tweet lies in that range -- confirm against the caller.

        :param time_slice_length: length of one time-slice, in minutes.
        :raises ValueError: if the header lacks a ``text`` or ``date``
            column, or if a date does not match the expected format.
        """
        self.time_slice_length = time_slice_length
        corpus_dir = 'corpus'

        # Start from an empty output directory.
        if os.path.exists(corpus_dir):
            shutil.rmtree(corpus_dir)
        os.makedirs(corpus_dir)

        # Total number of time-slices covering [start_date, end_date].
        span_minutes = (self.end_date - self.start_date).total_seconds() / 60
        self.time_slice_count = int(span_minutes // self.time_slice_length) + 1
        self.tweet_count = np.zeros(self.time_slice_count)
        print('   Number of time-slices: %d' % self.time_slice_count)

        # Create one empty file per time-slice so that slices without any
        # tweet still exist on disk. The context manager closes each handle
        # immediately instead of leaking it until garbage collection.
        for time_slice in range(self.time_slice_count):
            with open(os.path.join(corpus_dir, str(time_slice)), 'w'):
                pass

        # Word frequencies are accumulated in DOK matrices (cheap incremental
        # updates) and converted to CSR once the pass is complete.
        # NOTE(review): counts are stored as np.short; presumably per-slice
        # counts stay within that range -- confirm for large corpora.
        vocabulary_size = len(self.vocabulary)
        self.global_freq = dok_matrix((vocabulary_size, self.time_slice_count), dtype=np.short)
        self.mention_freq = dok_matrix((vocabulary_size, self.time_slice_count), dtype=np.short)
        with open(self.source_file_path, 'r') as input_file:
            csv_reader = csv.reader(input_file, delimiter=self.separator)
            header = next(csv_reader)
            text_column_index = header.index('text')
            date_column_index = header.index('date')
            for line in csv_reader:
                tweet_date = datetime.strptime(line[date_column_index], "%Y-%m-%d %H:%M:%S")
                elapsed_minutes = (tweet_date - self.start_date).total_seconds() / 60
                time_slice = int(elapsed_minutes / self.time_slice_length)
                self.tweet_count[time_slice] += 1

                # Tokenize the tweet and update word frequencies; set() makes
                # each word count at most once per tweet.
                tweet_text = line[text_column_index]
                words = self.tokenize(tweet_text)
                mention = '@' in tweet_text
                for word in set(words):
                    word_id = self.vocabulary.get(word)
                    if word_id is None:
                        # Out-of-vocabulary words are ignored.
                        continue
                    self.global_freq[word_id, time_slice] += 1
                    if mention:
                        self.mention_freq[word_id, time_slice] += 1

                # Append the raw text to the file of its time-slice.
                with open(os.path.join(corpus_dir, str(time_slice)), 'a') as time_slice_file:
                    time_slice_file.write(tweet_text + '\n')
        self.global_freq = self.global_freq.tocsr()
        self.mention_freq = self.mention_freq.tocsr()