import datetime

import requests
from bs4 import BeautifulSoup, SoupStrainer

# Assumed to be defined elsewhere in the module:
#   check_connection(url) -> bool   (simple reachability check)
#   collection                      (PyMongo collection the articles are upserted into)


def get_tuko():
    """Scrape the latest headlines from Tuko and upsert them into MongoDB."""
    tuko_url = 'https://www.tuko.co.ke'
    if check_connection(tuko_url):
        response = requests.get(tuko_url)
        # Only <a> tags are needed from the front page, so strain everything else out.
        soup = BeautifulSoup(response.text, 'lxml', parse_only=SoupStrainer('a'))
        tuko = []
        for link in soup.select('a.news__link', limit=6):
            tuko_link = requests.get(link.get('href'))
            # The article page is only mined for <p>, <meta> and <img> tags.
            soup_link = BeautifulSoup(tuko_link.text, 'lxml',
                                      parse_only=SoupStrainer(['p', 'meta', 'img']))
            try:
                article_date = soup_link.find('meta', itemprop='datePublished')['content']
            except (TypeError, KeyError):
                # find() returned None, or the tag has no 'content' attribute.
                print('Tuko: No article date meta')
                continue
            image = ''
            try:
                image = soup_link.find('meta', property='og:image')['content']
            except (TypeError, KeyError):
                try:
                    image = soup_link.find('img', class_='article-image__picture')['src']
                except (TypeError, KeyError):
                    print('Tuko: No image found')
            news_dict = {
                'category': 'news',
                'source': 'tuko',
                'title': link.get_text(),
                'link': link.get('href'),
                'image': image,
                # First three emphasised paragraphs, skipping "READ ALSO" cross-links.
                'content': [link_inner.get_text().strip(' ,.-')
                            for link_inner in soup_link.select('p.align-left > strong', limit=3)
                            if not link_inner.get_text().startswith('READ ALSO')],
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            # Keyed on the article link so repeated runs update rather than duplicate.
            # replace_one() supersedes the update() call removed in PyMongo 4.
            collection.replace_one({'link': link.get('href')}, news_dict, upsert=True)
            tuko.append(news_dict)
        return tuko
    return []  # connection check failed
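

# Minimal usage sketch (assumes 'collection' and 'check_connection' are bound
# at module level as the function above expects):
if __name__ == '__main__':
    for item in get_tuko() or []:
        print('{} -> {}'.format(item['title'], item['link']))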