# registrant_spider.py -- Python source code snippet

def get_first_page(self, response):
        """Schedule a detail request for each domain link on the first result page.

        Every row of the ``sf-grid`` table is searched for the link that wraps
        the ``alt="..."`` image. For each link found, a request to the
        corresponding absolute URL is yielded with ``self.get_domain_name`` as
        its callback.

        Args:
            response: Response of the first listing page. ``response.meta``
                must contain the key ``'registrant'``.

        Yields:
            scrapy.Request: One request per table row that contains a link.
            Its ``meta`` carries the value returned by ``get_cookie()`` under
            ``'cookie'`` and a new ``RwhoisRegistrantItem`` (with
            ``registrant`` filled in) under ``'item'``.

        Raises:
            KeyError: If ``response.meta`` has no ``'registrant'`` entry.
        """
        request_state = self.if_too_many_request(response.body, 'first_page')
        registrant = response.meta['registrant']
        # NOTE(review): the comparison is deliberately left as ``== False``.
        # The return contract of ``if_too_many_request`` is not visible here,
        # and ``not request_state`` would also accept ``None``/``0``.
        if request_state == False:
            page = Selector(text=response.body)
            rows = page.xpath(u'//table[@class="sf-grid" and @id = "sf-grid"]/tr')
            for row in rows:
                hrefs = row.xpath('td[@class = "lf"]/a/img[@alt="..."]/../@href').extract()
                if not hrefs:
                    # Rows without the link (e.g. header rows) used to raise
                    # IndexError and abort the whole page; skip them instead.
                    continue
                # A cookie record is fetched per request, as before.
                # NOTE(review): indexes 1 and 2 are presumably the __cfduid and
                # cf_clearance values -- confirm against get_cookie().
                cookie = get_cookie()
                url = "https://www.benmi.com" + hrefs[0]
                item = RwhoisRegistrantItem()
                item['registrant'] = registrant
                yield scrapy.Request(url, headers=self.head, meta={'cookie': cookie, 'item': item},
                                     cookies={"__cfduid": cookie[1], "cf_clearance": cookie[2],
                                              "BenmiUserInfo2": "Benmi-UN=hahaha321",
                                              "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "},
                                     callback=self.get_domain_name, dont_filter=True)