toysrus.py 文件源码

python
阅读 21 收藏 0 点赞 0 评论 0

项目:alltheplaces 作者: alltheplaces 项目源码 文件源码
def parse(self, response):
        """Yield one ``GeojsonPointItem`` per store marker embedded in the page.

        The page is expected to contain a ``markerData`` JSON fragment whose
        first value is a list of markers.  Each marker carries ``lat``/``lng``
        and an ``info`` HTML snippet from which the name, address, phone,
        store URL and opening hours are extracted.

        Args:
            response: Text response for the store-locator page.

        Yields:
            GeojsonPointItem: One item per marker that has a store link.
            Markers without a link are skipped because ``ref`` is derived
            from the link's last path segment.
        """
        marker_txt = re.findall(r"markerData.*\}", response.text)
        if not marker_txt:
            return

        # The match starts at the bare key name, so the opening `{"` has to be
        # put back to obtain a parseable JSON object.
        markers_json = "{\"" + marker_txt[0]
        markers = next(iter(json.loads(markers_json).values()), None)
        if not markers:
            return

        for marker in markers:
            info = marker["info"]
            marker_response = HtmlResponse(url="", body=info.encode("utf-8"))

            url = marker_response.css("header a").xpath("@href").extract_first()
            if not url:
                # No link means no way to build `ref`; skip this marker rather
                # than abort the whole page with an AttributeError.
                continue

            # Opening hours are embedded as a JSON object inside the info HTML;
            # a store without them is still emitted, just without hours.
            hours = re.findall(r"\{\"label.*\}", info)
            opening_hours = None
            if hours:
                opening_hours = get_hours(json.loads(hours[0])["days"])

            addr_parts = marker_response.css(".address span:not(.phone)::text").extract()
            # The last address line is treated as "City, ST".  Split on the
            # last comma so a comma inside the city name does not raise, and
            # tolerate a line with no comma at all.
            locality = addr_parts[-1] if addr_parts else ""
            if "," in locality:
                city, _, state = locality.rpartition(",")
            else:
                city, state = locality, ""

            yield GeojsonPointItem(lat=marker.get("lat"), lon=marker.get("lng"),
                                   name=marker_response.css("header a::text").extract_first(default=None),
                                   addr_full=", ".join(addr_parts),
                                   city=city.strip(),
                                   state=state.strip(),
                                   country="United States",
                                   phone=marker_response.css(".phone::text").extract_first(),
                                   website=url,
                                   opening_hours=opening_hours,
                                   ref=url.split("/")[-1].split(".")[0])
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号