scrape_web.py 文件源码

python
阅读 26 收藏 0 点赞 0 评论 0

项目:kenya-news-scrapper 作者: alfiepoleon 项目源码 文件源码
def get_standard():
    """Scrape the latest headlines from standardmedia.co.ke.

    Fetches up to 11 article links from the homepage, extracts each
    article's title, link, image, date and content (from the embedded
    JSON-LD plus the article body), upserts every article into the
    MongoDB ``collection`` keyed by its link, and returns the list of
    article dicts.  Returns None implicitly when the site is
    unreachable (original behavior preserved).
    """
    standard_url = 'https://www.standardmedia.co.ke/'
    if not check_connection(standard_url):
        # Site unreachable: fall through and return None, as before.
        return

    response = requests.get(standard_url)
    # Parse only <div> tags — the headline anchors all live inside them.
    soup = BeautifulSoup(response.text, 'lxml', parse_only=SoupStrainer('div'))
    articles = []
    for link in soup.select('.col-xs-8.zero a', limit=11):
        if not link.get_text():
            continue
        href = link.get('href')
        article_page = requests.get(href)
        soup_link = BeautifulSoup(article_page.text, 'lxml',
                                  parse_only=SoupStrainer(['script', 'div']))
        try:
            # Escape stray backslashes so the embedded JSON-LD parses.
            data = json.loads(
                soup_link.find('script', type='application/ld+json')
                .text.replace("\\", r"\\"))
            article_date = data['dateModified']
            image = data['image']['url']
            if image == 'https://www.standardmedia.co.ke':
                # The site uses its bare domain as a placeholder image URL.
                image = ''
        except (ValueError, AttributeError):
            # ValueError: malformed JSON; AttributeError: no ld+json script.
            print('Standard: invalid json detected')
            continue
        try:
            content = get_content(soup_link, 'main-article')
        except AttributeError:
            # Some article templates use a different container class.
            try:
                content = get_content(soup_link, 'story')
            except AttributeError:
                print('Standard: No content found')
                continue

        news_dict = {
            'category': 'news',
            'source': 'standard',
            'title': link.get_text().strip(),
            'link': href,
            'image': image,
            'content': content,
            'date': article_date,
            'date_added': datetime.datetime.utcnow()
        }
        # replace_one(..., upsert=True) is the modern equivalent of the
        # deprecated Collection.update(filter, doc, upsert=True), which
        # was removed in pymongo 4.
        collection.replace_one({'link': href}, news_dict, upsert=True)
        articles.append(news_dict)
    return articles
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号