url.py 文件源码-python代码片段

url.py 文件源码

python

阅读 20 收藏 0 点赞 0 评论 0

项目：hoaxy-backend 作者：IUNetSci 项目源码 | 文件源码

def __init__(self, domains, urls, *args, **kwargs):
        """Set up a PageSpider from a site's domains and its URLs.

        Parameters
        ----------
        domains : list
            A list of domains for the site. Stored as `allowed_domains`
            and handed to the link extractor as `allow_domains`.
        urls : list
            A list of URLs of the site. Stored as `start_urls`.

        Optional keyword arguments (each one is removed from `kwargs`
        before the remainder is forwarded to the parent constructor):

        href_xpaths : list
            A list of XPATH expressions indicating the ancestors of `<a>`
            elements; used as `restrict_xpaths` of the link extractor.
            Defaults to an empty tuple.
        url_regex : string
            URL pattern regular expression. Defaults to None. It is only
            stored on the instance here.

        If you use this spider to store items into a database, these
        additional keywords are required:

        platform_id : int
            The id of a platform instance. Defaults to None.
        session : object
            An instance of SQLAlchemy session. Defaults to None.
        """
        # Spider-specific options and their fallbacks. They are popped so
        # that the parent constructor never receives them.
        own_options = (
            ('session', None),
            ('platform_id', None),
            ('href_xpaths', ()),
            ('url_regex', None),
        )
        for option_name, fallback in own_options:
            setattr(self, option_name, kwargs.pop(option_name, fallback))

        self.allowed_domains = domains
        self.start_urls = urls

        # Only follow links inside the allowed domains, found beneath the
        # configured XPath ancestors, with duplicates dropped.
        extractor_options = {
            'allow_domains': self.allowed_domains,
            'restrict_xpaths': self.href_xpaths,
            'unique': True,
        }
        self.link_extractor = LinkExtractor(**extractor_options)

        super(PageSpider, self).__init__(*args, **kwargs)