import gzip
import numpy as np
from nltk.tokenize import TweetTokenizer

# preprocess_tweet, convert2indices and UNKNOWN_HASHTAG_IDX are assumed to be
# defined elsewhere in this module.


def store_file(f_in, f_out, alphabet_words, alphabet_hashtags, dummy_word_idx, hashtag_fname=None):
    """Tokenizes the gzipped tweet file f_in and stores the index arrays in batches so they fit in memory."""
    tknzr = TweetTokenizer(reduce_len=True)
    counter = 0
    batch_counter = 0
    output = open(f_out, 'wb')
    output_hashtag = open(hashtag_fname, 'wb')
    batch_size = 500000
    tweet_batch = []
    hashtag_batch = []
    with gzip.open(f_in, 'r') as f:
        for tweet in f:
            tweet, hashtags = preprocess_tweet(tweet)
            # only keep tweets with exactly one hashtag; the hashtag becomes the label
            if len(hashtags) == 1:
                ht = hashtags[0]
                alphabet_hashtags.add(ht)
                ht_idx = alphabet_hashtags.get(ht, UNKNOWN_HASHTAG_IDX)
                # remove the hashtag from the tweet text before tokenizing
                tweet = tweet.replace(ht, '')
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweet_batch.append(tweet_tok)
                hashtag_batch.append(ht_idx)
                batch_counter += 1
                for token in tweet_tok:
                    alphabet_words.add(token)
                # flush a full batch to disk and start a new one
                if batch_counter >= batch_size:
                    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
                    np.save(output, tweet_idx)
                    np.save(output_hashtag, hashtag_batch)
                    print 'Saved tweets:', tweet_idx.shape
                    tweet_batch = []
                    hashtag_batch = []
                    batch_counter = 0
            counter += 1
            if (counter % 1000000) == 0:
                print "Elements processed:", counter
    # save the last, possibly partial, batch and close the output files
    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
    np.save(output, tweet_idx)
    np.save(output_hashtag, hashtag_batch)
    output.close()
    output_hashtag.close()
    print len(alphabet_hashtags)
    print len(alphabet_words)
    print 'Saved tweets:', tweet_idx.shape
    return counter
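
Because the batches are written back to back with repeated np.save calls on the same file handle, they can be read back one batch at a time with repeated np.load calls on a single open file. Below is a minimal reader sketch; the generator name and the file path are placeholders, not part of the original code:

import numpy as np

def load_batches(fname):
    # yields one saved batch at a time; np.load fails once the stream is exhausted
    with open(fname, 'rb') as f:
        while True:
            try:
                yield np.load(f)
            except (ValueError, EOFError, IOError):
                break

for batch in load_batches('tweets_train.npy'):  # placeholder path
    print 'batch shape:', batch.shape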