lof.py 文件源码-python代码片段

def get_details(self, response):
        """Second parsing phase: build the item from a detail page.

        Combines the values carried over in ``response.meta`` (``name``,
        ``language``, ``stable_version``, ``rel_date`` and, as a fallback,
        ``license``) with the description paragraph and the table rows
        selected from ``response``.

        Returns a dict with two entries: ``"item"`` (the result of
        ``load_item()`` on the loader) and ``"language"``. The language is
        returned separately in order to manage the many-to-many relation.
        """
        self.log('Starting the second parsing phase')
        item_loader = ItemLoader(item=LibraryOrFrameworkItem(), response=response)
        meta = response.meta

        # Values obtained in the first phase are carried in response.meta.
        item_loader.add_value('name', meta['name'])
        language = meta['language']
        item_loader.add_value('stable_release', meta['stable_version'])
        item_loader.add_value('release_date', meta['rel_date'])

        # First paragraph of the content area; the XPath union covers both
        # the ".../div/p[1]" and the ".../p[1]" placement.
        # NOTE(review): extract_first() yields None when nothing matches;
        # confirm that cleanhtml copes with a None argument.
        first_paragraph = response.xpath(
            '//*[@id="mw-content-text"]/div/p[1] | //*[@id="mw-content-text"]/p[1]'
        ).extract_first()
        item_loader.add_value('description', cleanhtml(first_paragraph))

        # Rows of the tables at positions 1-3 under the content div.
        table_rows = response.xpath(
            '//*[@id="mw-content-text"]/div/table[position()<=3]/tr'
        )
        seen_license = False
        for table_row in table_rows:
            header_text = table_row.xpath('./th/a/text() | ./th/text()').extract_first()
            key, value = self.get_key_value(header_text, table_row)
            if not key:
                # get_key_value reported no usable key for this row.
                continue
            # A license found on this page takes precedence over the meta one.
            seen_license = seen_license or key == 'license'
            item_loader.add_value(key, value)

        # No license on this page: fall back to the one found on the start page.
        if not seen_license:
            item_loader.add_value('license', meta['license'])

        return {
            "item": item_loader.load_item(),
            "language": language,
        }

    # Given a pair (key, elem) obtained during the scraping, return the valid pair (key1, value1)
    # to add to the db. If key is not valid, return the tuple (None, None).