def searchTweets(self, tag, limit=50000, tfilter=" -filter:retweets", resultType="recent"):
    # Appending tfilter to the query broke tqdm's progress display for an
    # unknown reason, so the concatenation is left disabled for now.
    # tag += tfilter
    try:
        tweets = []
        tweetsObj = tweepy.Cursor(self.API.search,
                                  q=tag,
                                  result_type=resultType,
                                  exclude_replies=True).items(limit)
        pBar = tqdm(tweetsObj, ascii=True, total=limit, desc="Getting Tweets!")
        for cnt, tweet in enumerate(pBar):
            # tqdm advances automatically while being iterated; calling
            # pBar.update(1) here as well would double-count progress.
            if not cnt < limit:
                break
            tweets.append(tweet)
    except tweepy.error.TweepError as et:
        print(et)
    except Exception as e:
        print(e)
    return tweets
# If more than one tweet in the result set comes from the same user, the older ones are deleted; see the sketch below.
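# A minimal sketch of the per-user deduplication described above, assuming
# each element is a tweepy Status exposing .user.id and .id (tweet IDs grow
# over time, so a larger ID means a newer tweet). The function name is
# illustrative, not part of the original code.
def dedupeByUser(tweets):
    newestByUser = {}
    for tweet in tweets:
        kept = newestByUser.get(tweet.user.id)
        if kept is None or tweet.id > kept.id:
            # Keep only the newest tweet seen for this user.
            newestByUser[tweet.user.id] = tweet
    return list(newestByUser.values())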
def rest_tweets(self, query, lang="pt", limit=None):
    """
    Returns all tweets from at most the last 7 days that match the query
    received by this method, as complete tweet objects.
    :param query: should contain all the search words and can include logic
        operators, e.g. "rock OR axe"; it may also constrain the time period
        of the search
        (visit https://dev.twitter.com/rest/public/search to see how to create a query)
    :param lang: the language of the tweets
    :param limit: the maximum number of tweets to fetch
    :return: tweets: a list of all tweets obtained after the request
    """
    tweets = []
    for tweet in tw.Cursor(self.api.search, q=query, lang=lang).items(limit):
        tweets.append(tweet._json)
    return tweets
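# Hypothetical usage of rest_tweets, following the query syntax the docstring
# links to; "client" stands in for whatever object defines the method, and the
# operators (OR, since:, until:) are standard Twitter search operators.
tweets = client.rest_tweets("rock OR axe since:2017-01-01 until:2017-01-07",
                            lang="pt", limit=100)
for tweet in tweets[:5]:
    print(tweet["text"])  # each item is the tweet's raw JSON dict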
def analyzetweets(self, access_token, access_token_secret, mytweets=False, q=None):
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    sentimentlist = []
    subjectivitylist = []
    number = NUMBER_OF_TWEETS
    tweets = (tweepy.Cursor(api.user_timeline).items()
              if mytweets
              else tweepy.Cursor(api.search, q=q).items(number))
    for index, tweet in enumerate(tweets):
        analysis = TextBlob(tweet.text).sentiment
        sentimentlist.append(analysis.polarity)
        subjectivitylist.append(analysis.subjectivity)
        self.update_state(state="RUNNING", meta={"current": index + 1, "total": number})
    sentimentavg = float(sum(sentimentlist) / max(len(sentimentlist), 1))
    subjectivityavg = float(sum(subjectivitylist) / max(len(subjectivitylist), 1))
    return {"current": number, "total": number,
            "subjectivityavg": subjectivityavg, "sentimentavg": sentimentavg}
def save_hashtag(hashtag):
    for status in tweepy.Cursor(api_twitter.search, q=hashtag).items(1000):
        try:
            for media in status.extended_entities['media']:
                print(media['media_url'])
                # media['media_url'] is a plain URL string; its last path
                # segment serves as the local file name (the original
                # mistakenly called .link on the string).
                filename = media['media_url'].split('/')[-1]
                target = os.path.join(os.getcwd(), 'files', 'riko_meme', filename)
                urllib.request.urlretrieve(media['media_url'], target)
        except AttributeError:
            # Statuses without media have no extended_entities attribute.
            pass
def getTimeline(self, limit=50000, resultType="recent"):
    try:
        tweets = []
        tweetsObj = tweepy.Cursor(self.API.home_timeline,
                                  result_type=resultType,
                                  exclude_replies=False).items(limit)
        pBar = tqdm(tweetsObj, ascii=True, total=limit, desc="Getting Tweets!")
        for cnt, tweet in enumerate(pBar):
            # tqdm advances on its own during iteration; no manual update needed.
            if not cnt < limit:
                break
            tweets.append(tweet)
    except tweepy.error.TweepError as et:
        print(et)
    except Exception as e:
        print(e)
    return tweets
def getFriendIds(self, userId, limit=100000):
    if self._byProtected(userId):
        return []
    friendIds = []
    try:
        friends = tweepy.Cursor(self.API.friends_ids,
                                user_id=userId,
                                cursor=-1).items()
        for cnt, friend in enumerate(friends):
            if not cnt < limit:
                break
            friendIds.append(friend)
        return friendIds
    except tweepy.error.TweepError as et:
        print(et)
        return []
def getTweets(self, userId, limit=50):
    tweets = []
    try:
        tweetsObj = tweepy.Cursor(self.API.user_timeline,
                                  user_id=userId,
                                  exclude_replies=True).items(limit)
        for cnt, tweet in enumerate(tweetsObj):
            if not cnt < limit:
                break
            # print(tweet.text.replace("\n", ""))
            tweets.append(tweet)
    except tweepy.error.TweepError as et:
        print(et)
    return tweets
def crawl_user_data(portrait, path):
    api = portrait_api(portrait)
    now = datetime.datetime.now().strftime("%Y%m%d%H%M")
    timeline = [t._json for t in tweepy.Cursor(api.user_timeline,
                                               user_id=portrait.auth_id_str,
                                               count=200,
                                               since_id=portrait.last_tweet_id).items()]
    if timeline:
        with gzip.open('{0}/{1}_{2}.data.gz'.format(path, portrait.auth_id_str, now), 'wt') as f:
            f.write(json.dumps(timeline))
        print('loaded tweets', len(timeline))
    if not portrait.demo_portrait:
        print(portrait.auth_screen_name, 'not a demo portrait. downloading connectivity')
        connectivity = [t for t in tweepy.Cursor(api.friends_ids,
                                                 user_id=portrait.auth_id_str,
                                                 cursor=-1).items()]
        print('loaded friends', len(connectivity))
        with gzip.open('{0}/{1}_{2}.friends.gz'.format(path, portrait.auth_id_str, now), 'wt') as f:
            f.write(json.dumps(connectivity))
    return True
def getFollower(profile):
    i = 0
    l = []
    printColour("\n[*] ", BLUE)
    print("Follower list:\n")
    for user in tweepy.Cursor(api.followers, screen_name=profile, count=200).items():
        try:
            l.append(user.screen_name)
            i = i + 1
        except:
            print("[-] Timeout, sleeping for 15 minutes...")
            time.sleep(15 * 60)
    for user in l:
        printColour("[+] @" + user, GREEN)
        print(" (https://www.twitter.com/" + user + ")\n")
    printColour("\n[*] ", CYAN)
    # Report every collected follower (the original subtracted one by mistake).
    print("Total followers: " + str(len(l)) + "\n")
def getFollowing(profile):
    i = 0
    l = []
    printColour("\n[*] ", BLUE)
    print("Following list:\n")
    for user in tweepy.Cursor(api.friends, screen_name=profile, count=200).items():
        try:
            l.append(user.screen_name)
            i = i + 1
        except:
            print("[-] Timeout, sleeping for 15 minutes...")
            time.sleep(15 * 60)
    for user in l:
        printColour("[+] @" + user, GREEN)
        print(" (https://www.twitter.com/" + user + ")\n")
    printColour("\n[*] ", CYAN)
    # Report every collected account (the original subtracted one by mistake).
    print("Total following: " + str(len(l)) + "\n")
def rogue(s):
    printColour("\n[*] ", BLUE)
    c = 0
    print("Potential rogue profiles:\n")
    pageList = []
    tmp = []
    i = 0
    for page in tweepy.Cursor(api.search_users, q=s, include_entities=False, count=20).pages():
        if c > 30:  # Counter to limit the number of requests
            break
        c += 1
        for result in page:
            if result.screen_name not in tmp:
                i += 1
                tmp.append(result.screen_name)
                printColour("[+] " + result.name + " (@" + result.screen_name + ")", GREEN)
                print("\n")
    printColour("\n[*] ", CYAN)
    print("Total potential rogue profiles: " + str(i) + "\n")
def get_tweets(self, since_id):
"""Looks up metadata for all Trump tweets since the specified ID."""
tweets = []
# Include the first ID by passing along an earlier one.
since_id = str(int(since_id) - 1)
# Use tweet_mode=extended so we get the full text.
for status in Cursor(self.twitter_api.user_timeline,
user_id=TRUMP_USER_ID, since_id=since_id,
tweet_mode="extended").items():
# Use the raw JSON, just like the streaming API.
tweets.append(status._json)
self.logs.debug("Got tweets: %s" % tweets)
return tweets
def save_tweets_with_retweets(screen_name):
timestamp = time.strftime("%d.%m.%Y %H:%M:%S", time.localtime())
print(timestamp)
for tweet in limit_handled(tweepy.Cursor(TWITTER_API.user_timeline, id=screen_name, count=200).items()):
retweets = get_retweets(tweet.id)
db.saveRetweets(tweet, retweets)
def search(self, target, date, maxnum=10):
    '''Collect all the tweets matching the keyword self.target,
    in the range self.date[0] - self.date[1].
    '''
    self.target = target
    self.date = date
    cursor = tweepy.Cursor(self.api.search,
                           q=self.target,
                           since=self.date[0],
                           until=self.date[1],
                           show_user=True)
    return cursor.items(maxnum)
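# Hypothetical usage of search above; "collector" stands in for the class
# instance, and the dates use the YYYY-MM-DD form that Twitter's since/until
# parameters expect.
for tweet in collector.search("earthquake", ("2017-01-01", "2017-01-07"), maxnum=100):
    print(tweet.created_at, tweet.text)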
def search_on_user(api, user_name, search_term):
    """ Searches a term over a user's twitter feed """
    limit.check_remaining_calls(api)
    # ' -RT' removes retweets from the results
    c = tweepy.Cursor(api.search, q=search_term + ' -RT' + ' from:' + user_name, lang="en")
    limit.check_remaining_calls(api)
    list_of_tweets = []
    for tweet in c.items():
        limit.check_remaining_calls(api)
        tweet_text = tweet.text
        regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
        match = re.search(regex, tweet_text)
        if match:
            link = match.group()
            list_of_tweets.append(link)
    # Guard on the collected links, not the raw tweet count: tweets without
    # links would otherwise lead to an IndexError below.
    if not list_of_tweets:
        return 'null'
    return list_of_tweets[0]
def get_user_tweets(self, username, since_id=None):
    """
    Download all tweets for a user.
    The maximum is around 3200 tweets.
    """
    if self.api is None:
        self._authenticate()
    tweets = []
    if since_id:
        cursor = tweepy.Cursor(self.api.user_timeline, screen_name=username, since_id=since_id)
    else:
        cursor = tweepy.Cursor(self.api.user_timeline, screen_name=username)
    for item in cursor.items():
        tweets.append(item)
    return tweets
def get_searched_tweets(self, hashtag, since_id=None):
"""
Search all tweets for a hashtag
"""
if self.api is None:
self._authenticate()
tweets = []
if since_id:
cursor = tweepy.Cursor(self.api.search, q=hashtag, count=100, since_id=since_id)
else:
cursor = tweepy.Cursor(self.api.search, q=hashtag, count=100)
try:
for item in cursor.items():
tweets.append(item)
except tweepy.error.TweepError:
print("Reached Twitter rate limit")
return tweets
def save(self):
try:
print("[search] [search_term: {}]".format(self.screen_name))
i = 0
for page in tweepy.Cursor(self.client.user_timeline,
screen_name=self.screen_name,
count=200).pages(100):
print("{}.".format(i))
i = i + 1
sleep(config.TWITTER_API_DELAY)
self.process_page(page)
except tweepy.error.RateLimitError:
print("[search] [error: rate limit] [{}]".format(self))
sleep(60)
except tweepy.error.TweepError as e:
print("[search] [error: tweepy] [{}]".format(e))
sleep(60)
except:
print("[search] [error: unknown] [{}]".format(sys.exc_info()[0]))
sleep(60)
def limit_handled(cursor: tweepy.Cursor):
"""Wrap cursor access with rate limiting
:param cursor: The cursor to siphon
:returns: Cursor items
"""
while True:
try:
yield cursor.next()
except tweepy.RateLimitError:
time.sleep(15 * 60)
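# Typical usage of limit_handled: wrap a Cursor's item iterator so that
# iteration sleeps through the rate-limit window instead of raising
# (the screen name is illustrative).
for follower in limit_handled(tweepy.Cursor(api.followers, screen_name="example").items()):
    print(follower.screen_name)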
def get_friends(self, callback, pages_limit=0):
api = self._api
user = self._user
if user.friends_count > _FRIENDS_COUNT_MAX_:
logging.warning('The user [%d]-[%s] has too many [%d] friends!'
% (user.id, user.screen_name, user.friends_count))
return
cursor = tweepy.Cursor(api.friends_ids, user_id=user.id, screen_name=user.screen_name)
friends = []
try:
for friends_page in cursor.pages(pages_limit):
friends.extend(friends_page)
if callable(callback):
callback(friends)
except tweepy.TweepError as e:
logging.warning([user.id, user.screen_name, e])
def get_tweets(listOfTweets, keyword, numOfTweets):
    # Iterate through tweets containing the given word, using API search mode
    for tweet in tweepy.Cursor(api.search, q=keyword).items(numOfTweets):
        # Add tweets in this format (str() replaces Python 2's unicode())
        dict_ = {'Screen Name': tweet.user.screen_name,
                 'User Name': tweet.user.name,
                 'Tweet Created At': str(tweet.created_at),
                 'Tweet Text': tweet.text,
                 'User Location': str(tweet.user.location),
                 'Tweet Coordinates': str(tweet.coordinates),
                 'Retweet Count': str(tweet.retweet_count),
                 'Retweeted': str(tweet.retweeted),
                 'Phone Type': str(tweet.source),
                 'Favorite Count': str(tweet.favorite_count),
                 'Favorited': str(tweet.favorited),
                 'Replied': str(tweet.in_reply_to_status_id_str)
                 }
        listOfTweets.append(dict_)
    return listOfTweets
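# A minimal sketch of persisting the dictionaries collected above to CSV with
# the standard library; the output path is illustrative.
import csv

def write_tweets_csv(listOfTweets, path='tweets.csv'):
    fieldnames = ['Screen Name', 'User Name', 'Tweet Created At', 'Tweet Text',
                  'User Location', 'Tweet Coordinates', 'Retweet Count',
                  'Retweeted', 'Phone Type', 'Favorite Count', 'Favorited',
                  'Replied']
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(listOfTweets)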
# Source: py_twitter_scrape.py, from project social-fork-analysis by ethereumproject
def crawl_target(api, target_type, target_list):
for target in target_list:
if target_type == 'user':
statuses = limit_handled(tweepy.Cursor(api.user_timeline,
id=target).items())
elif target_type == 'hashtag':
statuses = limit_handled(tweepy.Cursor(api.search,
target).items())
print('Crawling %s' % target)
for status in statuses:
if status.created_at.timestamp() > catastrophe_period_start:
if not tweet_db.get(bytes(status.id_str, 'utf-8')):
print('Saving tweet: %s' % status.id_str)
write_to_tweet_db(status)
if not user_db.get(bytes(status.author.id_str, 'utf-8')):
print('Saving user: %s' % status.author.id_str)
write_to_user_db(status.author)
else:
print('Reached {time}, on to the next {ttype}'.format(time=status.created_at.strftime('%Y %h %d %H:%M:%S'), ttype=target_type))
break
def _build_file(self):
    self.total_rows = 0
    # Get recent tweets from dril and add them to the new file
    for status in tweepy.Cursor(api.user_timeline, 'dril', since_id=self.since).items():
        self.total_rows += self._process_status(status)
    # Append the contents of the old file to the new file
    try:
        # Context managers ensure both files are closed before the rename below
        with open('data/dril.csv', 'rt', encoding='utf-8') as readFile, \
             open('data/new.csv', 'at', encoding='utf-8') as writeFile:
            read = reader(readFile)
            write = writer(writeFile, delimiter=',', quoting=QUOTE_NONNUMERIC)
            for row in read:
                write.writerow([int(row[0]), row[1]])
                self.total_rows += 1
    except IOError:
        print('Failed to open file (1) [okay if this is the first time running]')
    # Rename the new file to be the old file
    os.rename('data/new.csv', 'data/dril.csv')
def past(name, d):
    # Global variable count, initialized to 0
    global count
    count = 0
    # u stores today's date.
    u = datetime.date.today()
    # The Cursor searches for tweets matching the query 'q=name':
    # 'since' is the starting date (d days ago) and 'until' is today's date.
    # Each tweet matching the query increments count by one.
    for tweet in tweepy.Cursor(api.search, q=name, since=u - datetime.timedelta(d), until=u, lang='en').items():
        count += 1
    # The REST API part ends here.
    # Flask routing follows: localhost with the template index.html
    # (found in the templates folder).
def get_followers_id(user_keys, api, user, f_log, flag_fast):
    dict_followers = {}
    try:
        print('get %s ids followers' % user)
        for page in tweepy.Cursor(api.followers_ids, screen_name=user,
                                  count=5000,
                                  monitor_rate_limit=True,
                                  wait_on_rate_limit=True,
                                  wait_on_rate_limit_notify=True,
                                  retry_count=5,
                                  retry_delay=5).pages():
            for follower_id in page:
                dict_followers[follower_id] = 1
            if flag_fast:
                return dict_followers
    except Exception as e:
        # Log the caught exception itself; the original wrote a fresh,
        # empty TypeError() to the log instead.
        f_log.write('%s, %s tweepy error, method followers/ids, user %s\n'
                    % (time.asctime(), e, user))
    return dict_followers
def get_following_id(user_keys, api, user, f_log, flag_fast):
    dict_following = {}
    try:
        print('get %s ids following' % user)
        for page in tweepy.Cursor(api.friends_ids, screen_name=user,
                                  monitor_rate_limit=True,
                                  wait_on_rate_limit=True,
                                  wait_on_rate_limit_notify=True,
                                  retry_count=5,
                                  retry_delay=5).pages():
            for following_id in page:
                dict_following[following_id] = 1
            if flag_fast:
                return dict_following
    except Exception as e:
        # Log the caught exception itself; the original wrote a fresh,
        # empty TypeError() to the log instead.
        f_log.write('%s, %s tweepy error, method friends/ids, user %s\n'
                    % (time.asctime(), e, user))
    return dict_following
def get_tweets(username: str, num=1):
    # Pass the username through; the original referenced the built-in id().
    statuses = list(tweepy.Cursor(api_twitter.user_timeline, id=username).items(num))
    return statuses
def archive(userid, filename='saved.txt'):
with open(filename, 'a') as save:
for status in tweepy.Cursor(api_twitter.user_timeline, id=userid).items(200):
save.write((html.unescape(encode_tweet(status))))
def get_tweets(api_twitter, username: str, num=1):
    # As above, use the username rather than the built-in id().
    statuses = list(tweepy.Cursor(api_twitter.user_timeline, id=username).items(num))
    return statuses
def del_yesterday_info():
    # Build yesterday's date string
    d = datetime.now() + timedelta(days=-1)
    yesterday = "%s/%s/%s" % (d.year, d.month, d.day)
    api = get_api()
    # Fetch this account's own timeline
    myinfo = api.me()
    tweets = []
    try:
        tweets = tweepy.Cursor(api.user_timeline, id=myinfo.id).items(100)
    except Exception as e:
        log.exception(e)
    # Delete every tweet whose text starts with yesterday's date
    r = re.compile(yesterday)
    for t in tweets:
        # re.match only matches at the beginning of the text, which is where
        # these status updates carry the date.
        if r.match(t.text) is not None:
            try:
                api.destroy_status(t.id)
            except Exception as e:
                log.exception(e)