detail.py 文件源码

python
阅读 36 收藏 0 点赞 0 评论 0

项目:PyCrawler 作者: KillersDeath 项目源码 文件源码
def goodsDetail(detail_url):
    """
    Scrape a goods detail page with XPath and return its fields.

    :param detail_url: URL of the goods detail page.
    :return: dict-like object with the keys ``source_url``, ``name``,
        ``price``, ``type``, ``detail``, ``pics``, ``storage``,
        ``lack_period``, ``created`` and ``updated``.
    :raises IndexError: if one of the required XPath queries matches nothing.
    """
    # defaultdict() without a factory behaves like a plain dict; the type is
    # kept so callers receive the same kind of object as before.
    goods_data = defaultdict()
    goods_data['source_url'] = detail_url

    # getHtmlFromJs is defined elsewhere; its 'content' entry is used as the
    # page markup (presumably the JS-rendered HTML -- confirm).
    body = getHtmlFromJs(detail_url)['content'].encode('utf-8')
    # Pass the bytes as-is and declare their encoding. The markup was just
    # re-encoded to UTF-8, so Scrapy must not guess the encoding from a
    # <meta charset> left over in the page, and str(body) would turn the
    # bytes into their "b'...'" repr on Python 3.
    html = HtmlResponse(url=detail_url, body=body, encoding='utf-8')

    def first(xpath):
        # Return the first match, or fail with a message naming the query.
        matches = html.selector.xpath(xpath).extract()
        if not matches:
            raise IndexError(
                'no match for xpath %r on %s' % (xpath, detail_url))
        return matches[0]

    goods_data['name'] = first('/html/body/div[7]/div[2]/h1/text()')
    goods_data['price'] = first(
        '/html/body/div[7]/div[2]/div[2]/ul/li[1]/label[1]/text()')
    goods_data['type'] = first(
        '/html/body/div[7]/div[2]/div[2]/ul/li[3]/label/text()')
    # Selecting the element itself (no text()) yields the table's markup.
    goods_data['detail'] = first('/html/body/div[9]/div[2]/div[2]/table')

    # Collect every image URL and drop the '!240240' marker from it
    # (presumably a thumbnail-size suffix -- confirm). Images without a
    # src attribute simply contribute nothing.
    srcs = html.selector.xpath(
        '/html/body/div[7]/div[1]/div[2]/div[2]/ul/li/img/@src').extract()
    goods_data['pics'] = '|'.join(
        src.replace('!240240', '') for src in srcs)

    goods_data['storage'] = ''
    goods_data['lack_period'] = ''

    # Read the clock once so both timestamps are guaranteed to be equal.
    now = int(time.time())
    goods_data['created'] = now
    goods_data['updated'] = now

    return goods_data
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号