sgg_spider.py — file source code

python

Project: czl-scrape — Author: code4romania
def parse_details(self, response):
        # response = get(response.url)

        institution = response.xpath('//h2/text()').extract()[0].strip()
        # logging.warn is deprecated; use logging.warning instead
        logging.warning("scraping: %s - %s" % (response.url, institution))

        for tr in response.xpath('//table[@class="fancy"]/tr'):

            if tr.xpath('td[1]'):
                item = Item()
                # Column names are Romanian: titlu = title, avizare =
                # endorsement, avizori = endorsers, termen = deadline,
                # reavizare = re-endorsement.
                titlu = xtract(tr, 'td[1]//div/text()')
                type_ = xtract(tr, 'td[2]//div//strong/text()')
                consult = xtract(tr, 'td[3]//div/text()')
                avizare = xtract(tr, 'td[4]//div/text()')
                avizori = xtract(tr, 'td[5]//div/text()')
                termen_avize = xtract(tr, 'td[6]//div/text()')
                mfp_mj = xtract(tr, 'td[7]//div/text()')
                reavizare = xtract(tr, 'td[8]//div/text()')
                init_1 = xtract(tr, 'td[9]//a/@href')
                init_2 = xtract(tr, 'td[10]//a/@href')
                final_1 = xtract(tr, 'td[11]//a/@href')
                final_2 = xtract(tr, 'td[12]//a/@href')

                docs = [{"type": "nota", "url": response.urljoin(f)} for f in [init_1, init_2, final_1, final_2] if f]

                item['identifier'] = identify(institution, titlu)
                item['title'] = titlu
                item['type'] = type_
                item['institution'] = "sgg"
                item['date'] = consult
                item['description'] = ""
                item['feedback_days'] = None
                item['contact'] = None
                item['documents'] = docs

                yield item
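The snippet relies on project-level helpers, `xtract` and `identify`, defined elsewhere in czl-scrape. A minimal sketch of `xtract`, assuming it returns the first stripped XPath match or None when the cell is empty (the behavior is inferred from how it is used above, not confirmed by this snippet):

```python
# Sketch of the xtract helper used in parse_details (an assumption: the real
# implementation lives elsewhere in the czl-scrape project).
def xtract(selector, query):
    """Return the first XPath match, stripped, or None if nothing matched."""
    values = selector.xpath(query).extract()
    return values[0].strip() if values else None


# Tiny stand-in selector for demonstration, mimicking the Scrapy Selector
# interface (.xpath(...).extract() returning a list of strings).
class FakeSelector:
    def __init__(self, matches):
        self._matches = matches

    def xpath(self, query):
        return self  # pretend the query produced our canned matches

    def extract(self):
        return self._matches


print(xtract(FakeSelector(["  Titlu proiect  "]), "td[1]//div/text()"))
print(xtract(FakeSelector([]), "td[2]//div/text()"))
```

Returning None for empty cells is what lets the `docs` list comprehension above filter out missing document links with `if f`.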