# Seconds to wait on any single HTTP request, so that one unresponsive page
# cannot stall the whole scrape indefinitely.
_CAPITAL_REQUEST_TIMEOUT = 30


def _capital_attr(tag, attr):
    """Return ``tag[attr]``, or ``None`` if the tag or the attribute is missing."""
    if tag is None:
        return None
    return tag.get(attr)


def _capital_text_after_en_dash(text):
    """Return *text* with everything up to its first en dash removed, trimmed.

    Only the first en dash is treated as a separator, so later en dashes in
    the text are preserved. If there is no en dash the whole text is
    returned (trimmed).
    """
    _, dash, remainder = text.partition('\u2013')
    return (remainder if dash else text).strip()


def get_capital():
    """Scrape the current month's Capital FM news listing and store the articles.

    The listing URL is built from the module-level ``today``. Each
    ``div.entry-information`` on the listing is followed to its article
    page, from which the publication date, an image URL and the body text
    are extracted. Every article is upserted into ``collection`` keyed on
    its link.

    Entries that cannot be processed (no link, article page not fetchable,
    no ``article:published_time`` meta tag) are reported with ``print`` and
    skipped instead of aborting the whole run.

    Returns:
        list[dict]: one dict per stored article with the keys ``category``,
        ``source``, ``title``, ``link``, ``image``, ``content``, ``date``
        and ``date_added``. Empty when ``check_connection`` reports the
        listing URL as unreachable.

    Raises:
        requests.RequestException: if fetching the listing page itself fails.
    """
    capital_url = 'http://www.capitalfm.co.ke/news/{}/{:02}'.format(today.year, today.month)
    articles = []
    if not check_connection(capital_url):
        return articles

    listing = requests.get(capital_url, timeout=_CAPITAL_REQUEST_TIMEOUT)
    soup = BeautifulSoup(listing.text, 'lxml', parse_only=SoupStrainer('div'))

    for entry in soup.select('div.entry-information'):
        anchor = entry.a
        link = _capital_attr(anchor, 'href')
        if not link:
            print('Capital: Skipping entry without a link')
            continue
        title = anchor.get_text()

        try:
            page = requests.get(link, timeout=_CAPITAL_REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # One unreachable article should not discard the rest of the listing.
            print('Capital: Could not fetch {}: {}'.format(link, err))
            continue
        page_soup = BeautifulSoup(
            page.text, 'lxml', parse_only=SoupStrainer(['meta', 'img', 'div']))

        article_date = _capital_attr(
            page_soup.find('meta', property='article:published_time'), 'content')
        if article_date is None:
            print('Capital: Skipping {} (no publication date)'.format(link))
            continue

        # Prefer the Open Graph image; fall back to the first full-size <img>.
        image = (
            _capital_attr(page_soup.find('meta', property='og:image'), 'content')
            or _capital_attr(page_soup.find('img', class_='size-full'), 'src')
            or ''
        )
        if not image:
            print('Capital: No image found')

        # The text before the first en dash is dropped (presumably a dateline
        # prefix -- confirm against real article pages).
        content = _capital_text_after_en_dash(get_content(page_soup, 'entry-content'))

        news_dict = {
            'category': 'news',
            'source': 'capital',
            'title': title,
            'link': link,
            'image': image,
            'content': content,
            'date': article_date,
            # Naive datetime expressed in UTC.
            'date_added': datetime.datetime.utcnow(),
        }
        # NOTE(review): if ``collection`` is a PyMongo collection, ``update`` is
        # deprecated since PyMongo 3 and removed in 4 (``replace_one`` is the
        # successor) -- confirm the driver version before changing this call.
        collection.update({'link': link}, news_dict, upsert=True)
        articles.append(news_dict)

    return articles
# Comment list
# Article table of contents