def parseNotFirstPage(self, response):
    """Queue one detail request for every result item on a listing page.

    The response body is parsed with BeautifulSoup; for every element whose
    class is ``item`` the hidden ``idHidden``, ``nrdAnHidden`` and
    ``nrdPnHidden`` inputs are read and a detail-search form request is
    yielded.

    Args:
        response: Response whose ``meta`` contains a ``'sipo'`` entry. That
            value is forwarded unchanged to the next callback.

    Yields:
        FormRequest: a request to ``url_config.detailSearch['url']`` handled
        by ``self.parsePatentDetail``. Its ``meta`` carries ``sipo``, a new
        ``SipoCrawlerItem`` with ``patent_id`` set, and ``lawinfo`` holding
        the ``nrdAn`` / ``nrdPn`` hidden values (``None`` when the input or
        its ``value`` attribute is absent).

    Items without a usable ``idHidden`` value are logged and skipped, so one
    malformed entry no longer aborts the rest of the page.
    """
    import logging

    log = logging.getLogger(__name__)

    def hidden_value(node, field_name):
        # Return the ``value`` of the input named ``field_name`` inside
        # ``node``, or None when no such input exists.
        tag = node.find(attrs={'name': field_name})
        return tag.get('value') if tag is not None else None

    sipo = response.meta['sipo']
    # NOTE(review): newer Scrapy releases deprecate body_as_unicode() in
    # favour of response.text; kept because the targeted Scrapy version is
    # not visible from here.
    soup = BeautifulSoup(response.body_as_unicode(), 'lxml')

    # Loop-invariant configuration, read once.
    detail_url = url_config.detailSearch.get('url')
    base_formdata = url_config.detailSearch.get('formdata') or {}

    for item in soup.find_all(attrs={"class": "item"}):
        # Search the already-parsed subtree directly; serialising the item
        # with prettify() and parsing it a second time adds nothing.
        raw_id = hidden_value(item, 'idHidden')
        if raw_id is None:
            log.warning("Skipping result item without an 'idHidden' value")
            continue
        patent_id = str(raw_id)
        nrdAn = hidden_value(item, 'nrdAnHidden')
        nrdPn = hidden_value(item, 'nrdPnHidden')

        sipocrawler = SipoCrawlerItem()
        sipocrawler['patent_id'] = patent_id

        # Work on a copy: the configured formdata is shared state and must
        # not accumulate per-item values.
        formdata = dict(base_formdata)
        formdata.update({
            # NOTE(review): the form's 'nrdAn' is derived from the patent id
            # (text before the first '.'), not from the nrdAnHidden input;
            # this mirrors the original behaviour - confirm it is intended.
            'nrdAn': patent_id.split('.')[0],
            'cid': patent_id,
            'sid': patent_id,
        })

        yield FormRequest(
            url=detail_url,
            formdata=formdata,
            callback=self.parsePatentDetail,
            meta={
                'sipo': sipo,
                'sipocrawler': sipocrawler,
                'lawinfo': {'nrdAn': nrdAn, 'nrdPn': nrdPn},
            },
        )
# ??????
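# 评论列表 (comment list - web-page residue, not part of the program)
# 文章目录 (table of contents - web-page residue, not part of the program)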