Patent.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:PatentCrawler 作者: will4906 项目源码 文件源码
def parseNotFirstPage(self, response):
        """Parse a search-result page and request the detail of each patent.

        For every element with class ``item`` in the response, the hidden
        ``idHidden``, ``nrdAnHidden`` and ``nrdPnHidden`` inputs are read and
        a ``FormRequest`` to the detail-search URL is yielded.

        :param response: page response; ``response.meta['sipo']`` is passed
            through unchanged to the next callback.
        :returns: generator of ``FormRequest`` objects whose callback is
            ``self.parsePatentDetail`` and whose ``meta`` carries ``sipo``,
            the new ``sipocrawler`` item and a ``lawinfo`` dict holding
            ``nrdAn`` / ``nrdPn``.
        """
        sipo = response.meta['sipo']
        soup = BeautifulSoup(response.body_as_unicode(), 'lxml')
        for item in soup.find_all(attrs={"class": "item"}):
            # ``item`` is already a searchable Tag, so it is queried directly
            # instead of being serialised and parsed a second time.
            patentid = item.find(attrs={'name': 'idHidden'}).get('value')
            nrdAn = item.find(attrs={'name': 'nrdAnHidden'}).get('value')
            nrdPn = item.find(attrs={'name': 'nrdPnHidden'}).get('value')

            sipocrawler = SipoCrawlerItem()
            sipocrawler['patent_id'] = str(patentid)

            # Work on a copy: the template dict lives in shared configuration
            # and must not accumulate per-item values.
            formdata = dict(url_config.detailSearch.get('formdata'))
            # NOTE(review): the 'nrdAn' form field is derived from the patent
            # id, not from the nrdAnHidden input; kept as in the original.
            formdata['nrdAn'] = str(patentid).split('.')[0]
            formdata['cid'] = str(patentid)
            formdata['sid'] = str(patentid)

            yield FormRequest(
                url=url_config.detailSearch.get('url'),
                formdata=formdata,
                callback=self.parsePatentDetail,
                meta={
                    'sipo': sipo,
                    'sipocrawler': sipocrawler,
                    'lawinfo': {'nrdAn': nrdAn, 'nrdPn': nrdPn},
                },
            )

    # (original comment unreadable: its characters were lost to an encoding error)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号