def parse_kb(self, response):
    """Yield firmware items for the download links on a knowledge-base page.

    The HTML of the first ``<div class="sfdc_richtext">`` is split into
    segments, each segment is re-parsed as its own HTML document, and every
    ``<a>`` whose ``href`` contains ``"cache-www"`` produces one item.

    Args:
        response: The page response. ``response.meta["product"]`` must be
            set by the caller; it is stored as the item's product.

    Yields:
        The loaded ``FirmwareImage`` item for each matching link, populated
        with version, date, url, product and vendor.
    """
    # initial html tokenization to find regions segmented by e.g. "======"
    # or "------"
    # NOTE(review): the separator is the literal two-character string "=-",
    # which does not occur inside a run of only "=" or only "-" -- confirm
    # this matches the pages being scraped.
    rich_text = response.xpath("//div[@class='sfdc_richtext']").extract()
    if not rich_text:
        # Page has no rich-text container: nothing to scrape.
        return

    for entry in (segment.strip() for segment in rich_text[0].split("=-")):
        if not entry:
            # An empty segment cannot contain any links.
            continue

        resp = HtmlResponse(url=response.url, body=entry,
                            encoding=response.encoding)
        # Text of the whole segment; extracted lazily and at most once per
        # segment because it is identical for every link in it.
        text = None

        for link in resp.xpath("//a"):
            hrefs = link.xpath("@href").extract()
            # Anchors without an href (e.g. named anchors) are skipped
            # rather than aborting the whole page with an IndexError.
            if not hrefs or "cache-www" not in hrefs[0]:
                continue
            href = hrefs[0]

            if text is None:
                text = resp.xpath("//text()").extract()
            text_next = link.xpath("following::text()").extract()

            item = FirmwareLoader(item=FirmwareImage(),
                                  response=response,
                                  date_fmt=["%b %d, %Y", "%B %d, %Y",
                                            "%m/%d/%Y"])

            # Prefer a version found in the text after the link; fall back
            # to the text of the whole segment.
            version = FirmwareLoader.find_version_period(text_next)
            if not version:
                version = FirmwareLoader.find_version_period(text)

            item.add_value("version", version)
            item.add_value("date", item.find_date(text))
            item.add_value("url", href)
            item.add_value("product", response.meta["product"])
            item.add_value("vendor", self.name)
            yield item.load_item()
评论列表
文章目录