Python LinkExtractor() usage examples from open-source projects
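All of the snippets below use Scrapy's LinkExtractor (scrapy.linkextractors.LinkExtractor), either standalone through extract_links(response) or wrapped in a Rule inside a CrawlSpider. As a minimal sketch of the standalone pattern (the URL pattern, spider name, and item fields are made up for illustration and do not come from any of the projects below):

import scrapy
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["http://example.com/"]  # hypothetical start URL

    def parse(self, response):
        # allow= takes a regex (or a list of regexes) matched against each URL;
        # extract_links() returns Link objects with .url and .text attributes
        for link in LinkExtractor(allow=r"/detail/\d+").extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_item)

    def parse_item(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}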

allrecipes_spider.py (project: RecipesScraper, author: brandonmburroughs)
def parse(self, response):
    """Parse the recipe list."""
    recipes = LinkExtractor(allow=r"/recipe/\d+/.*").extract_links(response)
    if len(recipes) > 0:
      for recipe_link in recipes:
        yield scrapy.Request(recipe_link.url, callback=self.parse_item)
link.py (project: Scrapy-BenchCLI, author: Parth-Vader)
def main():
    start = timer()

    url = 'http://scrapinghub.com/'
    link_extractor = LinkExtractor()
    total = 0
    for files in glob.glob('sites/*'):

        f = (io.open(files, "r", encoding="utf-8"))
        html = f.read()

        r3 = HtmlResponse(url=url, body=html, encoding='utf8')
        links = link_extractor.extract_links(r3)
        total = total + len(links)
    end = timer()
    print("\nTotal number of links extracted = {0}".format(total))
    print("Time taken = {0}".format(end - start))
    click.secho("Rate of link extraction : {0} links/second\n".format(
        float(total / (end - start))), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / (end - start)))))
spider.py (project: collectors, author: opentrials)
def __init__(self, conf=None, conn=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make urls
        self.start_urls = [
            'http://www.takedaclinicaltrials.com/browse/?protocol_id=',
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'browse/summary/',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'browse',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()
proxy_spider.py (project: ip_proxy_pool, author: leeyis)
def __init__(self,rule):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.rule = rule
        self.name = rule.name
        self.allowed_domains = rule.allowed_domains.split(',')
        self.start_urls = rule.start_urls.split(',')
        rule_list = []

        # add a "next page" rule when one is configured
        if len(rule.next_page):
            rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))

        rule_list.append(Rule(LinkExtractor(
            allow=rule.allow_url.split(','),
            unique=True),
            follow=True,
            callback='parse_item'))

        self.rules = tuple(rule_list)
        super(ProxySpiderSpider, self).__init__()
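The spider constructors above build their Rule list at runtime from a configuration object and assign self.rules before calling the parent constructor; for a CrawlSpider subclass this ordering matters, because CrawlSpider.__init__ compiles the rules. A minimal sketch of the same pattern with a plain dict standing in for the database-backed rule object (all config keys here are assumptions for illustration):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ConfiguredSpider(CrawlSpider):
    name = "configured"

    def __init__(self, config, *args, **kwargs):
        self.allowed_domains = config["allowed_domains"]  # e.g. ["example.com"]
        self.start_urls = config["start_urls"]            # e.g. ["http://example.com/list"]
        rules = [Rule(LinkExtractor(allow=config["item_pattern"]),
                      callback="parse_item", follow=True)]
        if config.get("next_page_xpath"):
            # optional pagination rule: followed but not parsed
            rules.append(Rule(LinkExtractor(restrict_xpaths=config["next_page_xpath"]),
                              follow=True))
        # rules must be in place before CrawlSpider.__init__ compiles them
        self.rules = tuple(rules)
        super().__init__(*args, **kwargs)

    def parse_item(self, response):
        yield {"url": response.url}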
followall.py (project: Scrapy-BenchCLI, author: Parth-Vader)
def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = 'http://localhost/books.toscrape.com/index.html'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow()
spiders.py (project: undercrawler, author: TeamHG-Memex)
def __init__(self, url, search_terms=None, *args, **kwargs):
        if url.startswith('.') or url.startswith('/'):
            with Path(url).open('rt', encoding='utf8') as f:
                urls = [line.strip() for line in f]
        else:
            urls = [u for u in url.split() if u]
        self.start_urls = [add_http_if_no_scheme(_url) for _url in urls]
        self.search_terms = search_terms
        self._extra_search_terms = None  # lazy-loaded via extra_search_terms
        self._reset_link_extractors()
        self.images_link_extractor = LinkExtractor(
            tags=['img'], attrs=['src'], deny_extensions=[],
            canonicalize=False)
        self.state = {}
        self.use_splash = None  # set up in start_requests
        self._screenshot_dest = None  # type: Path
        # Load headless horseman scripts
        self.lua_source = load_directive('headless_horseman.lua')
        self.js_source = load_directive('headless_horseman.js')
        super().__init__(*args, **kwargs)
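The images_link_extractor above shows that LinkExtractor is not limited to <a href>: tags= and attrs= choose which elements and attributes are scanned, and deny_extensions=[] disables the default filter that would otherwise drop URLs ending in image or other binary extensions. A minimal sketch of the same idea (the extractor names and helper function are illustrative, not taken from undercrawler):

from scrapy.linkextractors import LinkExtractor

# read src= from <img> and <iframe> tags instead of href= from <a>
image_extractor = LinkExtractor(tags=["img"], attrs=["src"],
                                deny_extensions=[], canonicalize=False)
iframe_extractor = LinkExtractor(tags=["iframe"], attrs=["src"],
                                 deny_extensions=[], canonicalize=False)


def image_urls(response):
    # absolute URLs of every <img src=...> on the page
    return [link.url for link in image_extractor.extract_links(response)]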
user.py (project: Spider, author: poluo)
def parse_tag(self, response):
        res = LinkExtractor(allow=('.*/user/.*'), allow_domains='www.reddit.com').extract_links(response)
        for one in res:
            if one.text != 'Click here!':
                path = one.url.replace('https://www.reddit.com', '')
                yield Request(url=one.url, callback=self.parse_user, meta={'cookies': True, 'path': path})

        res = LinkExtractor(allow=('.*/comments/.*'), allow_domains='www.reddit.com').extract_links(response)
        for one in res:
            path = one.url.replace('https://www.reddit.com', '')
            yield Request(url=one.url, callback=self.parse_comment, meta={'cookies': True, 'path': path})

        next_page = response.css(
            '#siteTable > div.nav-buttons > span > span.next-button > a::attr(href)').extract_first()
        if next_page:
            path = next_page.replace('https://www.reddit.com', '')
            yield Request(url=next_page, callback=self.parse_tag, meta={'cookies': True, 'path': path})
        else:
            self.logger.info('No next page in parse_tag')
mirror_spider.py (project: wayback-machine-scraper, author: sangaline)
def __init__(self, domains, directory, allow=(), deny=(), unix=False):
        self.directory = directory
        self.unix = unix
        self.rules = (
            Rule(LinkExtractor(allow=allow, deny=deny), callback='save_page'),
        )

        # parse the allowed domains and start urls
        self.allowed_domains = []
        self.start_urls = []
        for domain in domains:
            url_parts = domain.split('://')
            unqualified_url = url_parts[-1]
            url_scheme = url_parts[0] if len(url_parts) > 1 else 'http'
            full_url = '{0}://{1}'.format(url_scheme, unqualified_url)
            bare_domain = unqualified_url.split('/')[0]
            self.allowed_domains.append(bare_domain)
            self.start_urls.append(full_url)

        super().__init__()
article_spider.py (project: scrapy-demo, author: ParadeTo)
def __init__(self, website):
        self.name = website.spider_name
        self.redis_key = website.spider_name + ":start_urls"

        self.website = website
        self.allowed_domains = website.allow_domains.split(";")
        self.start_urls = website.start_urls.split(";")

        rule_list = []
        rules_to_follow = website.rules_to_follow.split(";")
        rules_to_parse = website.rules_to_parse.split(";")
        rule_list.append(
            Rule(LinkExtractor(allow=rules_to_parse), follow=True, callback='parse_detail')
        )
        rule_list.append(
            Rule(LinkExtractor(allow=rules_to_follow), follow=True)
        )

        self.rules = tuple(rule_list)
        super(ArticleSpider, self).__init__()
newspaper_crawler.py (project: newspaper-scraper-couchbase, author: aleonsan)
def __init__(self, topic=None, newspaper=None, term='', *args, **kwargs):
        self.term = term
        if newspaper:
            sources = [source for source in SOURCE_NEWSPAPERS if newspaper == source['name']]
        else:
            sources = TOPIC_TO_SOURCES.get(topic, SOURCE_NEWSPAPERS)
        self.allowed_domains = [source['allowed_domains'] for source in sources]
        self.start_urls = [source['url'] for source in sources]
        self.rules = []
        for source in sources:
            if topic:
                allowed_domain_regex = (source['allowed_subdomains_regex'][topic],)
            else:
                allowed_domain_regex = (regexsubdomain for topic, regexsubdomain
                                        in source['allowed_subdomains_regex'].items())
            rule = Rule(link_extractor=LinkExtractor(allow=allowed_domain_regex),
                        callback='parse_with_term',
                        cb_kwargs={
                            'term': self.term,
                            'newspaper': newspaper
                        },
                        follow=True)
            self.rules.append(rule)

        return super(NewspaperCrawler, self).__init__(*args, **kwargs)
onionspider.py (project: ahmia-crawler, author: ahmia)
def get_link_extractor(self):
        return LinkExtractor(allow=r'^http://[a-z2-7]{16}.onion',
                             deny=[r'^https://blockchainbdgpzk.onion/address/',
                                   r'^https://blockchainbdgpzk.onion/tx/'],
                             deny_domains=settings.get('FAKE_DOMAINS'))
spiders.py (project: domain-discovery-crawler, author: TeamHG-Memex)
def __init__(self, seeds=None, login_credentials=None, profile=None):
        super().__init__()
        self.le = LinkExtractor(canonicalize=False)
        self.files_le = LinkExtractor(deny_extensions=[], canonicalize=False)
        self.images_le = LinkExtractor(
            tags=['img'], attrs=['src'], deny_extensions=[], canonicalize=False)
        if seeds:
            with Path(seeds).open('rt', encoding='utf8') as f:
                self.start_urls = [url for url in (line.strip() for line in f)
                                   if not url.startswith('#')]
        if login_credentials:
            with Path(login_credentials).open('rt', encoding='utf8') as f:
                self.login_credentials = json.load(f)
        else:
            self.login_credentials = None
        if profile:
            setup_profiling(profile)
spider.py (project: collectors, author: opentrials)
def __init__(self, conf=None, conn=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make urls
        self.start_urls = [
            'http://www.pfizer.com/research/clinical_trials/find_a_trial?recr=0',
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'find_a_trial/NCT\d+',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()
spider.py (project: collectors, author: opentrials)
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make start urls
        self.start_urls = _make_start_urls(
                prefix='http://www.gsk-clinicalstudyregister.com/search',
                date_from=date_from, date_to=date_to)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'study\/\d+'
            ), callback=parse_record),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal
broadSpider.py (project: Broad_Crawler, author: rafacheng)
def extractLinks(self, response):
        retv = []
        link_extractor = LinkExtractor()
        if isinstance(response, HtmlResponse):
            links = link_extractor.extract_links(response)
            for link in links:
                if self.postfix in link.url:
                    retv.append(link.url)
        return retv
test_media_pipeline.py (project: scrapy-cdr, author: TeamHG-Memex)
def __init__(self, url):
        super(Spider, self).__init__()
        self.start_urls = [url]
        self.le = LinkExtractor(canonicalize=False)
        self.files_le = LinkExtractor(
            tags=['a'], attrs=['href'], deny_extensions=[], canonicalize=False)
epicurious_spider.py (project: RecipesScraper, author: brandonmburroughs)
def parse(self, response):
    """Parse the recipe list."""
    recipes = LinkExtractor(
        allow=("/recipes/.*/views")
    ).extract_links(response)
    if len(recipes) > 0:
      for recipe_link in recipes:
        yield scrapy.Request(recipe_link.url, callback=self.parse_item)

      base_url, page = response.url.split("=")
      yield scrapy.Request("{}={}".format(base_url, int(page) + 1),
                           callback=self.parse)
    else:
      print("Finished on {}".format(response.url))
spiders.py (project: undercrawler, author: TeamHG-Memex)
def link_extractor(self):
        return LinkExtractor(allow=self.allowed, unique=False,
                             canonicalize=False)
spiders.py (project: undercrawler, author: TeamHG-Memex)
def iframe_link_extractor(self):
        return LinkExtractor(
            allow=self.allowed, tags=['iframe'], attrs=['src'],
            unique=False, canonicalize=False)
spiders.py (project: undercrawler, author: TeamHG-Memex)
def files_link_extractor(self):
        return LinkExtractor(
            allow=self.allowed,
            tags=['a'],
            attrs=['href'],
            deny_extensions=[],  # allow all extensions
            canonicalize=False,
        )
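The three undercrawler properties above pass unique=False and canonicalize=False so every raw link is kept exactly as it appears in the page, and files_link_extractor additionally passes deny_extensions=[] so that links to downloadable files (.pdf, .zip and so on), which the default extension blacklist would skip, are extracted as well. A small sketch of that last point (the name is illustrative):

from scrapy.linkextractors import LinkExtractor

files_extractor = LinkExtractor(tags=["a"], attrs=["href"],
                                deny_extensions=[],   # keep .pdf, .zip, .docx, ...
                                unique=False,         # keep duplicate links
                                canonicalize=False)   # keep the raw URL form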
tineret.py (project: czl-scrape, author: code4romania)
def parse(self, response):
        articleLinks = LinkExtractor(restrict_css='div.main > div.article')
        pages = articleLinks.extract_links(response)
        for page in pages:
            yield scrapy.Request(page.url, callback=self.parse_article)
user.py (project: Spider, author: poluo)
def parse_comment(self, response):
        # Do not show all comment
        res = LinkExtractor(allow=('.*/user/.*'), allow_domains='www.reddit.com').extract_links(response)
        for one in res:
            path = one.url.replace('https://www.reddit.com', '')
            yield Request(url=one.url, callback=self.parse_user, meta={'cookies': True, 'path': path})
sis_spider.py (project: spiders, author: poodarchu)
def __init__(self, forum_id=58, digit=1, *args, **kwargs):
        self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
        self.rules = [Rule(sle(allow=("/forum/forum-" + str(forum_id) + "-[0-9]{," + str(digit) + "}\.html")), follow=True, callback='parse_1'),]
        super(sisSpider, self).__init__(*args, **kwargs)
bboybattles.py (project: dancedeets-monorepo, author: mikelambert)
def parse(self, response):
        e = LinkExtractor()
        urls = [link.url for link in e.extract_links(response)]
        for url in urls:
            parsed = urlparse.urlsplit(url)
            qs = urlparse.parse_qs(parsed.query)
            if qs and 'Url' in qs:
                event_url = qs['Url'][0]
                yield self.add_url(event_url)
comeon5678.py (project: dancedeets-monorepo, author: mikelambert)
def parse(self, response):
        e = LinkExtractor()
        urls = [link.url for link in e.extract_links(response)]
        for url in urls:
            if response.url != url:
                yield self.addurl(url)
        if urls:
            qs = urlparse.parse_qs(urlparse.urlparse(response.url).query)
            qs = dict((k, v[0]) for (k, v) in qs.iteritems())
            qs['p'] = int(qs['p']) + 1
            url = 'http://comeon5678.com/event/list'
            yield scrapy.Request('%s?%s' % (url, urllib.urlencode(qs)))
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def __init__(self, url):
        self.start_urls = [url]
        self.link_extractor = LinkExtractor()
        self.collected_items = []
        self.visited_urls = []
        self.responses = []
        super(TestSpider, self).__init__()
chsi.py (project: gaokao, author: EasyData)
def parse_jianjie(self, response):

        item = response.meta['item']
        item['intro'] = response.xpath(u'//div[@class="schInfoSubT" and a/@name="2"]/following-sibling::div[1]').extract_first()

        for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="????"]').extract_links(response):
            yield Request(link.url, meta={'item': item}, callback=self.parse_zhuanye)
i2pspider.py (project: ahmia-crawler, author: ahmia)
def get_link_extractor(self):
        return LinkExtractor(allow=r'.i2p',)
spider.py (project: collectors, author: opentrials)
def __init__(self, conf=None, conn=None, page_from=None, page_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Default values
        if page_from is None:
            page_from = '1'
        if page_to is None:
            page_to = '1'

        # Make start urls
        self.start_urls = _make_start_urls(
                prefix='https://upload.umin.ac.jp/cgi-open-bin/ctr_e/index.cgi',
                page_from=page_from)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'cgi-open-bin/ctr_e/ctr_view.cgi',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
                process_value=partial(_process_url, page_from, page_to),
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal
spider.py (project: collectors, author: opentrials)
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make start urls
        self.start_urls = _make_start_urls(
            prefix='http://www.anzctr.org.au/TrialSearch.aspx',
            date_from=date_from, date_to=date_to)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'Trial/Registration/TrialReview.aspx',
                process_value=lambda value: value.replace('http', 'https', 1),
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal
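The last two collectors spiders use process_value, a LinkExtractor hook that receives each raw attribute value before it becomes a Link and may return a rewritten URL or None to drop the link; above it is used to bound pagination and to upgrade review links to https. A minimal sketch of the https rewrite as a named function (the function name and the rule are illustrative, not from the collectors project):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


def force_https(value):
    # rewrite http:// links; returning None instead would drop the link entirely
    if value.startswith("http://"):
        return "https://" + value[len("http://"):]
    return value


review_rule = Rule(LinkExtractor(allow=r"TrialReview\.aspx", process_value=force_https),
                   callback="parse_record")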

