import json

import requests
from lxml import html


def scrape(self):
    # Fetch the Wikipedia page and parse it into an element tree.
    base_url = 'https://en.wikipedia.org'
    response = requests.get(base_url + '/wiki/Cabinet_of_the_United_States')
    tree = html.document_fromstring(response.text)
    # Find the "Cabinet" header cell, then walk up two levels to the
    # enclosing table body and grab all of its rows.
    rows = tree.xpath('//th[text()="Cabinet"]')[0].getparent().getparent().getchildren()
    obj = []
    # Iterate through all rows.
    for row in rows:
        # Retrieve all of the cells in this row.
        data = row.getchildren()
        # Member rows have exactly three cells, the first of which is a
        # <td>; this skips the table's header rows.
        if len(data) == 3 and data[0].tag == 'td':
            # The name cell holds the member's name plus an optional
            # "Acting" label; split on newlines and drop blank entries.
            name_lines = [line for line in data[1].text_content().split('\n') if line != '']
            detail_links = data[1].xpath('div/a')
            obj.append({
                "title": [line for line in data[0].text_content().split('\n') if line != ''][0],
                "seal": 'https:' + data[0].xpath('a/img')[0].attrib['src'],
                "img": 'https:' + data[1].xpath('a/img')[0].attrib['src'],
                "name": name_lines[0],
                "details": base_url + detail_links[0].attrib['href'] if detail_links else None,
                "is_acting": len(name_lines) > 1 and name_lines[1] == 'Acting',
                # Strip surrounding whitespace from the date cell.
                "date_appointed": data[2].text_content().strip(),
            })
    print(json.dumps(obj))
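
The trickiest line above is the `getparent().getparent()` chain, so here is a minimal self-contained sketch of what it does; the miniature HTML below is made up to mirror the structure the code expects (on the real page, the second `getparent()` lands on the table's `tbody`, whose children are likewise the rows):

from lxml import html

doc = html.document_fromstring(
    '<table>'
    '<tr><th>Cabinet</th></tr>'
    '<tr><td>Office</td><td>Officeholder</td><td>Since</td></tr>'
    '</table>'
)
th = doc.xpath('//th[text()="Cabinet"]')[0]
# th -> parent <tr> -> parent table element, whose children are the rows.
rows = th.getparent().getparent().getchildren()
print(len(rows))  # 2: the header row and one member row

Note that this anchoring is fragile by design: if Wikipedia renames the "Cabinet" header or restructures the table, the XPath match (and everything downstream) breaks.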