import gzip
import numpy as np
from nltk.tokenize import TweetTokenizer

# preprocess_tweet, convert2indices and UNKNOWN_HASHTAG_IDX are assumed to be
# defined elsewhere in this module.


def store_file(f_in, f_out, alphabet_words, alphabet_hashtags, dummy_word_idx, hashtag_fname=None):
    """Tokenizes the gzipped tweet file f_in and stores the index arrays in batches so they fit in memory."""
    tknzr = TweetTokenizer(reduce_len=True)
    counter = 0
    batch_counter = 0
    output = open(f_out, 'wb')
    output_hashtag = open(hashtag_fname, 'wb')
    batch_size = 500000
    tweet_batch = []
    hashtag_batch = []
    with gzip.open(f_in, 'r') as f:
        for tweet in f:
            tweet, hashtags = preprocess_tweet(tweet)
            # only keep tweets with exactly one hashtag; the hashtag becomes the label
            if len(hashtags) == 1:
                ht = hashtags[0]
                alphabet_hashtags.add(ht)
                ht_idx = alphabet_hashtags.get(ht, UNKNOWN_HASHTAG_IDX)
                # remove the hashtag from the tweet text before tokenizing
                tweet = tweet.replace(ht, '')
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweet_batch.append(tweet_tok)
                hashtag_batch.append(ht_idx)
                batch_counter += 1
                for token in tweet_tok:
                    alphabet_words.add(token)
                # flush a full batch to disk and start a new one
                if batch_counter >= batch_size:
                    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
                    np.save(output, tweet_idx)
                    np.save(output_hashtag, hashtag_batch)
                    print 'Saved tweets:', tweet_idx.shape
                    tweet_batch = []
                    hashtag_batch = []
                    batch_counter = 0
            counter += 1
            if (counter % 1000000) == 0:
                print "Elements processed:", counter
    # save the last, possibly partial, batch and close the output files
    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
    np.save(output, tweet_idx)
    np.save(output_hashtag, hashtag_batch)
    output.close()
    output_hashtag.close()
    print len(alphabet_hashtags)
    print len(alphabet_words)
    print 'Saved tweets:', tweet_idx.shape
    return counter
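
Because the batches are written back to back with repeated np.save calls on the same file handle, they can be read back one batch at a time with repeated np.load calls on a single open file. Below is a minimal reader sketch; the generator name and the file path are placeholders, not part of the original code:

import numpy as np

def load_batches(fname):
    # yields one saved batch at a time; np.load fails once the stream is exhausted
    with open(fname, 'rb') as f:
        while True:
            try:
                yield np.load(f)
            except (ValueError, EOFError, IOError):
                break

for batch in load_batches('tweets_train.npy'):  # placeholder path
    print 'batch shape:', batch.shape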