documents.py 文件源码

python
阅读 36 收藏 0 点赞 0 评论 0

项目:openkamer 作者: openkamer 项目源码 文件源码
def get_kamervraag_document_id_and_content(url):
    """Fetch a kamervraag page and extract its document id, body HTML and title.

    The page at ``url`` is downloaded (60 second timeout) and parsed as HTML.

    The document id is taken from the last path segment of the ``href`` of the
    ``technischeInfoHyperlink`` anchor. If that anchor (or its ``href``) is
    absent, the ``content`` of the ``dcterms.identifier`` meta tag is used.

    Args:
        url: Address of the kamervraag page to download.

    Returns:
        A ``(document_id, content_html, title)`` tuple. ``content_html`` is
        the serialized ``main-column`` div (``bytes``, as produced by
        ``lxml.etree.tostring``) or ``''`` if the div is missing. ``title`` is
        the whitespace-normalized heading text or ``''`` if there is no
        heading. When no document id can be found the result is
        ``(None, '', '')``.
    """
    logger.info('get kamervraag document id and content for url: %s', url)
    page = requests.get(url, timeout=60)
    tree = lxml.html.fromstring(page.content)

    document_id = None
    links = tree.xpath('//ul/li/a[@id="technischeInfoHyperlink"]')
    href = links[0].get('href') if links else None
    if href is not None:
        document_id = href.split('/')[-1]
    else:
        # Fall back to the identifier in the page head.
        metas = tree.xpath('/html/head/meta[@name="dcterms.identifier"]')
        if metas:
            document_id = metas[0].get('content')
    if document_id is None:
        # A missing element and a missing attribute are both "not found".
        return None, '', ''
    logger.info('document id: %s', document_id)

    content_html = ''
    main_columns = tree.xpath('//div[@id="main-column"]')
    if main_columns:
        # NOTE: tostring() returns bytes here while the fallback is a str;
        # kept as-is because callers may rely on the current types.
        content_html = lxml.etree.tostring(main_columns[0])

    title = ''
    titles = tree.xpath('//h1[@class="kamervraagomschrijving_kop no-toc"]')
    if titles:
        # Collapse runs of whitespace (line breaks, indentation) to one space.
        title = re.sub(r'\s{2,}', ' ', titles[0].text_content()).strip()
    return document_id, content_html, title
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号