def get_kamervraag_document_id_and_content(url):
    """Download a kamervraag page and extract its document id, content and title.

    The document id is read from the last path segment of the ``href`` of the
    ``technischeInfoHyperlink`` anchor when that anchor is present; otherwise
    it is read from the ``content`` attribute of the ``dcterms.identifier``
    meta tag in the page head.

    Args:
        url: Address of the page to fetch (requested with a 60 second timeout).

    Returns:
        A ``(document_id, content_html, title)`` tuple.

        * ``(None, '', '')`` is returned when neither id source is on the page.
        * ``content_html`` is the ``main-column`` div serialised with
          ``lxml.etree.tostring``, or ``''`` when the page has no such div.
        * ``title`` is the text of the first matching ``h1`` heading with runs
          of two or more whitespace characters collapsed to a single space and
          the ends stripped, or ``''`` when there is no such heading.
    """
    # Lazy %-style arguments: the message text is unchanged, but formatting
    # no longer fails when the logged value is not a string.
    logger.info('get kamervraag document id and content for url: %s', url)
    page = requests.get(url, timeout=60)
    tree = lxml.html.fromstring(page.content)

    elements = tree.xpath('//ul/li/a[@id="technischeInfoHyperlink"]')
    if elements:
        document_id = elements[0].get('href').split('/')[-1]
    else:
        # Fall back to the identifier meta tag when the link is missing.
        elements = tree.xpath('/html/head/meta[@name="dcterms.identifier"]')
        if not elements:
            return None, '', ''
        document_id = elements[0].get('content')
    logger.info('document id: %s', document_id)

    # Evaluate the XPath once and reuse the result.
    content_html = ''
    main_columns = tree.xpath('//div[@id="main-column"]')
    if main_columns:
        content_html = lxml.etree.tostring(main_columns[0])

    title = ''
    titles = tree.xpath('//h1[@class="kamervraagomschrijving_kop no-toc"]')
    if titles:
        # Raw string so ``\s`` is a regex class, not an invalid string escape.
        title = re.sub(r'\s{2,}', ' ', titles[0].text_content()).strip()
    return document_id, content_html, title
评论列表
文章目录