def google_news_run(keyword, limit=10, year_start=2010, year_end=2011, debug=True, sleep_time_every_ten_articles=0):
    """Collect Google News links for ``keyword``, one page of results at a time.

    Pages are requested at offsets 0, 10, 20, ... for as long as the offset is
    below ``limit``. Every link of a fetched page is kept, so the returned
    list can contain more than ``limit`` entries.

    Args:
        keyword: Search term, forwarded to ``forge_url``.
        limit: Exclusive upper bound on the result offset that is requested.
        year_start: Forwarded to ``forge_url`` (presumably the first year of
            the searched range -- confirm against ``forge_url``).
        year_end: Forwarded to ``forge_url`` (presumably the last year of the
            searched range -- confirm against ``forge_url``).
        debug: When true, log the request URL, the global call counter and
            the sleep notice.
        sleep_time_every_ten_articles: Seconds to pause after each page
            request; ``0`` disables the pause.

    Returns:
        A list with every item returned by ``extract_links`` across the
        fetched pages. Items are indexed as ``item[0]`` = URL,
        ``item[1]`` = title, ``item[2]`` = date when logged.

    Raises:
        Exception: If the very first page (offset 0) yields no links.
        requests.exceptions.RequestException: Any request failure other than
            a timeout propagates to the caller; timeouts are logged and the
            page is skipped.
    """
    num_articles_index = 0
    ua = UserAgent()
    result = []
    while num_articles_index < limit:
        url = forge_url(keyword, num_articles_index, year_start, year_end)
        if debug:
            logging.debug('For Google -> {}'.format(url))
            logging.debug('Total number of calls to Google = {}'.format(NUMBER_OF_CALLS_TO_GOOGLE_NEWS_ENDPOINT))
        headers = {'User-Agent': ua.chrome}
        try:
            response = requests.get(url, headers=headers, timeout=20)
            links = extract_links(response.content)
            if not links:
                if num_articles_index == 0:
                    # An empty first page means the query itself failed,
                    # as opposed to simply running out of results.
                    raise Exception(
                        'No results fetched. Either the keyword is wrong '
                        'or you have been banned from Google. Retry tomorrow '
                        'or change of IP Address.')
                print('No more news to read for keyword {}.'.format(keyword))
                break
            for cur_link in links:
                logging.debug('TITLE = {}, URL = {}, DATE = {}'.format(cur_link[1], cur_link[0], cur_link[2]))
            result.extend(links)
        except requests.exceptions.Timeout:
            # Best effort: a slow page is skipped and the next offset is tried.
            logging.debug('Google news TimeOut. Maybe the connection is too slow. Skipping.')
        num_articles_index += 10
        # Throttling must not depend on the debug flag; only the log line does.
        if sleep_time_every_ten_articles != 0:
            if debug:
                logging.debug('Program is going to sleep for {} seconds.'.format(sleep_time_every_ten_articles))
            time.sleep(sleep_time_every_ten_articles)
    return result
评论列表
文章目录