# android_apps_spider.py -- Python source code snippet

def parse(self, response):
        """Scrape app items from ``response`` and follow the links on the page.

        The network location (``netloc``) of ``response.url`` is tested
        against the keys of ``self.scrape_rules['xpath']`` and
        ``self.scrape_rules['custom_parser']`` with a substring match; in
        each table only the first matching key (in the mapping's iteration
        order) is used:

        * the value of an ``xpath`` rule is passed to
          ``self.parse_xpath(response, rule)``;
        * the value of a ``custom_parser`` rule names an attribute of
          ``custom_parser``, which is called with ``response``.

        Both calls are expected to return an iterable of items.

        Yields:
            The items produced by the matching rules, followed by one
            ``Request`` (handled by this same method) for every
            ``<a href>`` value on the page that forms a usable URL.
        """
        import logging

        response_domain = urlparse(response.url).netloc

        # Items are emitted before any link is followed, so a problem with
        # one of the page's links can never cost us the scraped items.
        for key, rule in self.scrape_rules['xpath'].items():
            if key in response_domain:
                for item in self.parse_xpath(response, rule):
                    yield item
                break

        for key, parser_name in self.scrape_rules['custom_parser'].items():
            if key in response_domain:
                for item in getattr(custom_parser, parser_name)(response):
                    yield item
                break

        for href in Selector(response).xpath('//a/@href').extract():
            try:
                url = urljoin(response.url, href)
                # Each request gets its own empty meta dict so that no
                # mutable state is shared between requests.
                request = Request(url, meta={}, callback=self.parse)
            except ValueError:
                # urljoin() and Request() both reject malformed URLs with
                # ValueError; skip that one link and keep crawling the rest.
                logging.getLogger(__name__).warning(
                        "Skipping malformed link %r found on %s",
                        href, response.url)
                continue
            yield request


    #def parse_appchina(self, response):
    #    appItemList = []
    #    hxs = HtmlXPathSelector(response)
    #    for url in hxs.select(
    #        "//a[@id='pc-download' and @class='free']/@href"
    #        ).extract():
    #        url = urljoin(response.url, url)
    #        log.msg("Catch an application: %s" % url, level=log.INFO)
    #        appItem = AppItem()
    #        appItem['url'] = url
    #        appItemList.append(appItem)
    #    return appItemList