def parse_detail(self, response):
    """Yield a follow-up request for every reference URL listed on the page.

    The text of every ``#work span`` node in *response* is examined; entries
    that start with an http/https/ftp URL containing one of the recognised
    suffixes (.com, .cn, .html, .htm, .asp, .jsp) are requested with
    ``self.parse_reference`` as the callback.

    Side effects on the spider instance:
        * ``self.get_name`` is set to the first run of digits found in
          ``response.url``; it is left untouched when the URL has no digits.
        * ``self.count`` is reset to 0 and then incremented each time the
          generator is resumed after yielding a request.

    Args:
        response: The response for the detail page; must provide ``url``
            and ``css()``.

    Yields:
        One ``Request`` per matched reference URL.
    """
    # Raw strings: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    page_id_pattern = r".*?(\d+)"
    # The dots in the suffixes are escaped so that only a literal "." counts;
    # unescaped, ".com" would also match e.g. "lcom" inside "welcome".
    # The greedy ".*" extends the match to the LAST suffix on the line, and
    # anything after that suffix is dropped from the URL.
    reference_pattern = (
        r"^(?:http|https|ftp)://.*(?:\.com|\.cn|\.html|\.htm|\.asp|\.jsp)"
    )

    page_id_match = re.match(page_id_pattern, response.url)
    if page_id_match:
        self.get_name = page_id_match.group(1)

    span_texts = response.css('#work span::text').extract()
    candidate_matches = (re.match(reference_pattern, text) for text in span_texts)
    reference_urls = [found.group(0) for found in candidate_matches if found]

    self.count = 0
    for reference_url in reference_urls:
        # dont_filter=True: the same reference may be cited by several pages,
        # so the request must not be dropped as a duplicate.
        yield Request(
            url=reference_url,
            dont_filter=True,
            callback=self.parse_reference,
        )
        self.count += 1
reference_news_spider.py 文件源码
python
阅读 25
收藏 0
点赞 0
评论 0
评论列表
文章目录