spider.py 文件源码

python
阅读 26 收藏 0 点赞 0 评论 0

项目:collectors 作者: opentrials 项目源码 文件源码
def __init__(self, conf=None, conn=None, page_from=None, page_to=None):
        """Set up the spider's start URLs and link-following rules.

        Args:
            conf: Stored unchanged on ``self.conf``.
            conn: Stored unchanged on ``self.conn``.
            page_from: Lower page bound; ``'1'`` is used when it is None.
            page_to: Upper page bound; ``'1'`` is used when it is None.
        """
        # Keep the collaborators handed in by the caller
        self.conf, self.conn = conf, conn

        # Fall back to page '1' for any bound that was not supplied
        page_from = '1' if page_from is None else page_from
        page_to = '1' if page_to is None else page_to

        # Entry points of the crawl, built from the listing URL
        listing_url = 'https://upload.umin.ac.jp/cgi-open-bin/ctr_e/index.cgi'
        self.start_urls = _make_start_urls(
            prefix=listing_url,
            page_from=page_from,
        )

        # Links to individual record views are handed to parse_record
        record_rule = Rule(
            LinkExtractor(allow=r'cgi-open-bin/ctr_e/ctr_view.cgi'),
            callback=parse_record,
        )

        # Pagination links are followed after each URL is run through
        # _process_url with both page bounds pre-bound
        # (presumably this restricts crawling to the requested page
        # range -- confirm against _process_url)
        url_filter = partial(_process_url, page_from, page_to)
        page_rule = Rule(
            LinkExtractor(allow=r'page=\d+', process_value=url_filter),
        )

        self.rules = [record_rule, page_rule]

        # Parent initializer runs last, after start_urls and rules exist
        # (NOTE(review): presumably the parent reads self.rules while
        # initializing -- confirm against the base class)
        super(Spider, self).__init__()


# Internal
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号