def get_details(self, response):
    """Build the item for a detail page (second parsing phase).

    Values gathered during the first phase are read from ``response.meta``
    (keys ``name``, ``language``, ``stable_version``, ``rel_date`` and, as a
    fallback, ``license``). The description is taken from the first
    paragraph under the ``mw-content-text`` element. Each row of the first
    three tables under that element is handed to ``self.get_key_value``;
    rows for which it returns a truthy key are added to the item.

    Args:
        response: Response for the detail page. It must expose ``xpath``
            and a ``meta`` mapping holding the first-phase values.

    Returns:
        A dict with the loaded item under ``"item"`` and the language under
        ``"language"``. The language is returned separately so that the
        many-to-many relation can be managed by the consumer.
    """
    self.log('Starting the second parsing phase')
    meta = response.meta
    loader = ItemLoader(item=LibraryOrFrameworkItem(), response=response)

    # Load the values obtained in the first phase.
    loader.add_value('name', meta['name'])
    language = meta['language']
    loader.add_value('stable_release', meta['stable_version'])
    loader.add_value('release_date', meta['rel_date'])

    descr = response.xpath(
        '//*[@id="mw-content-text"]/div/p[1] | //*[@id="mw-content-text"]/p[1]'
    ).extract_first()
    # extract_first() yields None when neither XPath matches; skip the
    # description in that case instead of handing None to cleanhtml().
    if descr is not None:
        loader.add_value('description', cleanhtml(descr))

    license_found = False
    rows = response.xpath(
        '//*[@id="mw-content-text"]/div/table[position()<=3]/tr'
    )
    for row in rows:
        header = row.xpath('./th/a/text() | ./th/text()').extract_first()
        key, value = self.get_key_value(header, row)
        if not key:
            # get_key_value signalled that this row is not of interest.
            continue
        if key == 'license':
            # A license found on the main page takes precedence.
            license_found = True
        loader.add_value(key, value)

    # If the license was not found on the main page, fall back to the one
    # found on the start page.
    if not license_found:
        loader.add_value('license', meta['license'])

    # The language is returned separately in order to manage the
    # many-to-many relation.
    return {
        "item": loader.load_item(),
        "language": language,
    }
# Given a pair (key, elem) obtained during scraping, returns the valid pair (key1, value1)
# to add to the db. If the key is not valid, returns the tuple (None, None).
评论列表
文章目录