import datetime
import json

import requests
from bs4 import BeautifulSoup, SoupStrainer

# check_connection(), get_content() and the MongoDB `collection` handle are
# defined elsewhere in the project (see the sketch after the function).


def get_standard():
    """Scrape the latest headlines from The Standard and upsert them into MongoDB."""
    standard_url = 'https://www.standardmedia.co.ke/'
    standard = []
    if check_connection(standard_url):
        response = requests.get(standard_url)
        # Parse only <div> elements to keep the homepage soup small.
        soup = BeautifulSoup(response.text, 'lxml', parse_only=SoupStrainer('div'))
        # Limit to the first 11 headline links in the main column.
        for link in soup.select('.col-xs-8.zero a', limit=11):
            if link.get_text():
                news_title = '{}({})'.format(link.get_text().strip(), link.get('href'))
                standard_link = requests.get(link.get('href'))
                soup_link = BeautifulSoup(standard_link.text, 'lxml',
                                          parse_only=SoupStrainer(['script', 'div']))
                try:
                    # The article metadata (date, image) lives in the JSON-LD block;
                    # escape stray backslashes so json.loads() does not choke on them.
                    data = json.loads(soup_link.find('script', type='application/ld+json')
                                      .text.replace("\\", r"\\"))
                    article_date = data['dateModified']
                    image = data['image']['url']
                    # A bare domain in place of an image URL means no real image.
                    if image == 'https://www.standardmedia.co.ke':
                        image = ''
                except (ValueError, AttributeError):
                    print('Standard: invalid json detected')
                    continue
                # Article bodies sit in either a 'main-article' or a 'story' container.
                try:
                    content = get_content(soup_link, 'main-article')
                except AttributeError:
                    try:
                        content = get_content(soup_link, 'story')
                    except AttributeError:
                        print('Standard: No content found')
                        continue
                news_dict = {
                    'category': 'news',
                    'source': 'standard',
                    'title': link.get_text().strip(),
                    'link': link.get('href'),
                    'image': image,
                    'content': content,
                    'date': article_date,
                    'date_added': datetime.datetime.utcnow()
                }
                # Upsert keyed on the article link so re-runs refresh existing entries.
                collection.replace_one({'link': link.get('href')}, news_dict, upsert=True)
                standard.append(news_dict)
    return standard
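
The function above leans on three names that are defined elsewhere in the project and do not appear in this snippet: check_connection(), get_content() and the MongoDB collection handle. A minimal sketch of plausible definitions follows; the bodies and the connection string are assumptions for illustration, not the project's actual implementations.

import pymongo
import requests


def check_connection(url):
    # Assumed behaviour: report whether the site is reachable before scraping.
    try:
        return requests.head(url, timeout=10).status_code == 200
    except requests.RequestException:
        return False


def get_content(soup, class_name):
    # Assumed behaviour: return the text of the first <div> carrying the given
    # class; raises AttributeError when it is absent, which the caller catches.
    return soup.find('div', class_=class_name).get_text().strip()


# Assumed local MongoDB setup; the real URI, database and collection names will differ.
collection = pymongo.MongoClient('mongodb://localhost:27017')['news']['articles']

With these in place, get_standard() can be called directly; it returns the list of article dictionaries it upserted, or an empty list when the site is unreachable.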