def _load_cached_tweets(path):
    """
    Reads the tweets saved by a previous run from the tab-separated temporary file
    :param path: the temporary file, holding one "<tweet ID><tab><tweet text>" record per line
    :return: a dictionary of tweet ID to tweet text (empty if the file does not exist)
    """
    cached = {}
    if not os.path.exists(path):
        return cached

    with codecs.open(path, 'r', 'utf-8') as f_in:
        for line in f_in:
            # Split on the first tab only: a record whose text is empty or contains a tab
            # must not abort the resume with an unpacking error
            tweet_id, sep, text = line.rstrip('\r\n').partition('\t')
            tweet_id = tweet_id.strip()

            # Lines without a separator (blank, or cut short by a crash) are skipped,
            # so the matching tweet is simply downloaded again
            if sep and tweet_id:
                cached[tweet_id] = text.rstrip()
    return cached


def _to_single_line(text):
    """
    Replaces tabs and line breaks with spaces so the text fits in one tab-separated record
    :param text: the tweet text
    :return: the text without tabs or line breaks
    """
    return ' '.join(text.replace('\t', ' ').splitlines())


def get_tweets(tweet_ids, consumer_key, consumer_secret, access_token, access_token_secret, nlp,
               temp_file='tweet_temp'):
    """
    Expands tweets from Twitter
    Every downloaded tweet is appended to a temporary file, so that a run which stopped
    (e.g. because the rate limit was exceeded) can be resumed without downloading again.
    :param tweet_ids: the list of tweet IDs to expand (strings, as they are written to the file)
    :param consumer_key: the Twitter API consumer key
    :param consumer_secret: the Twitter API consumer secret
    :param access_token: the Twitter API access token key
    :param access_token_secret: the Twitter API access token secret
    :param nlp: a callable applied to the tweet text; the result is iterated and the lower_
    attribute of every item is used (presumably a spaCy pipeline - TODO confirm)
    :param temp_file: the path of the temporary file (defaults to the original 'tweet_temp')
    :return: a dictionary of tweet ID to tweet text; it also contains every tweet found in the
    temporary file, and maps unavailable tweets to 'TWEET IS NOT AVAILABLE'
    :raises twitter.TwitterError: if the error message contains 'Rate limit exceeded'
    """
    # Tweets saved by a previous run, in case the script stopped working and re-started
    tweets = _load_cached_tweets(temp_file)

    api = twitter.Api(consumer_key=consumer_key, consumer_secret=consumer_secret,
                      access_token_key=access_token, access_token_secret=access_token_secret)
    sleeptime, resettime = reset_sleep_time(api)

    with codecs.open(temp_file, 'a', 'utf-8') as f_out:
        for tweet_id in tweet_ids:
            # We already downloaded this tweet
            if tweet_id in tweets:
                continue

            try:
                curr_tweet = api.GetStatus(tweet_id, include_entities=False)
                text = clean_tweet(' '.join([t.lower_ for t in nlp(curr_tweet.text)]))
            except twitter.TwitterError as err:
                error = str(err)

                # If the rate limit exceeded, this script should be stopped and resumed the next day
                if 'Rate limit exceeded' in error:
                    raise

                # Other error - the tweet is not available :(
                print('Error reading tweet id: %s : %s' % (tweet_id, error))
                text = 'TWEET IS NOT AVAILABLE'

            # Keep the returned text identical to what a resumed run reads back from the file
            tweets[tweet_id] = _to_single_line(text)
            f_out.write('\t'.join((tweet_id, tweets[tweet_id])) + '\n')

            # Flush every record: the file exists to survive an abrupt stop of the script
            f_out.flush()

            time.sleep(sleeptime)
            if time.time() >= resettime:
                sleeptime, resettime = reset_sleep_time(api)

    return tweets
评论列表
文章目录