def test_urldefrag(self):
str_cases = [
('http://python.org#frag', 'http://python.org', 'frag'),
('http://python.org', 'http://python.org', ''),
('http://python.org/#frag', 'http://python.org/', 'frag'),
('http://python.org/', 'http://python.org/', ''),
('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
('http://python.org/?q', 'http://python.org/?q', ''),
('http://python.org/p#frag', 'http://python.org/p', 'frag'),
('http://python.org/p?q', 'http://python.org/p?q', ''),
(RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
(RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
]
def _encode(t):
return type(t)(x.encode('ascii') for x in t)
bytes_cases = [_encode(x) for x in str_cases]
for url, defrag, frag in str_cases + bytes_cases:
result = urlparse.urldefrag(url)
self.assertEqual(result.geturl(), url)
self.assertEqual(result, (defrag, frag))
self.assertEqual(result.url, defrag)
self.assertEqual(result.fragment, frag)
def oa_to_standoff(annotations, target_key='target'):
"""Convert OA annotations to Standoff objects."""
standoffs = []
for annotation in annotations:
target = annotation[target_key]
# assume target is current doc, ignore all but fragment.
fragment = urlparse.urldefrag(target)[1]
try:
start_end = fragment.split('=', 1)[1]
start, end = start_end.split(',')
except IndexError:
warn('failed to parse target %s' % target)
start, end = 0, 1
for type_, norm in _parse_body(annotation):
standoffs.append(Standoff(int(start), int(end), type_, norm))
return standoffs
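# A small hedged sketch of the fragment parsing performed above. The
# "#char=start,end" target form is an assumption made for illustration only,
# not necessarily the exact selector format of the source annotations.
try:
    from urlparse import urldefrag          # Python 2, as used above
except ImportError:
    from urllib.parse import urldefrag      # Python 3

target = 'http://example.org/doc.txt#char=10,15'
fragment = urldefrag(target)[1]             # 'char=10,15'
start_end = fragment.split('=', 1)[1]       # '10,15'
start, end = start_end.split(',')           # ('10', '15')
assert (int(start), int(end)) == (10, 15)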
def _pre_visit_url_condense(self, url):
""" Reduce (condense) URLs into some canonical form before
visiting. All occurrences of equivalent URLs are treated as
identical.
All this does is strip the "fragment" component from URLs,
so that http://foo.com/blah.html#baz becomes
http://foo.com/blah.html """
base, frag = urlparse.urldefrag(url)
return base
## URL Filtering functions. These all use information from the
## state of the Crawler to evaluate whether a given URL should be
## used in some context. Return value of True indicates that the
## URL should be used.
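# Hypothetical example of a filter in the style described above (not from the
# original crawler; the function name, parameters and same-host policy are
# made up purely to illustrate the "return True to use the URL" contract).
try:
    from urlparse import urlparse            # Python 2
except ImportError:
    from urllib.parse import urlparse        # Python 3

def _same_host_filter(crawl_host, url):
    """Return True if url points at the host the crawl started from."""
    return urlparse(url).netloc == crawl_host

assert _same_host_filter('foo.com', 'http://foo.com/blah.html') is True
assert _same_host_filter('foo.com', 'http://bar.com/') is False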
def remove_fragment(url):
pure_url, _ = urldefrag(url)
return pure_url
def normalize(seed_url, link):
"""Normalize this URL by removing hash and adding domain
"""
link, _ = urlparse.urldefrag(link) # remove hash to avoid duplicates
return urlparse.urljoin(seed_url, link)
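# Quick usage check of normalize() above (a sketch; it relies on the snippet's
# module-level `import urlparse`, i.e. Python 2 -- use urllib.parse in Python 3).
seed = 'http://example.com/index.html'
# Relative link with a fragment: the hash is dropped and the domain added.
assert normalize(seed, '/about.html#team') == 'http://example.com/about.html'
# Absolute link: only its fragment is removed.
assert normalize(seed, 'http://example.com/a#b') == 'http://example.com/a'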
def getDisplayIdentifier(self):
"""Return the display_identifier if set, else return the claimed_id.
"""
if self.display_identifier is not None:
return self.display_identifier
if self.claimed_id is None:
return None
else:
return urlparse.urldefrag(self.claimed_id)[0]
def normalizeURL(url):
"""Normalize a URL, converting normalization failures to
DiscoveryFailure"""
try:
normalized = urinorm.urinorm(url)
except ValueError as why:
raise DiscoveryFailure('Normalizing identifier: %s' % (why[0],), None)
else:
return urlparse.urldefrag(normalized)[0]
def test_urldefrag(self):
for url, defrag, frag in [
('http://python.org#frag', 'http://python.org', 'frag'),
('http://python.org', 'http://python.org', ''),
('http://python.org/#frag', 'http://python.org/', 'frag'),
('http://python.org/', 'http://python.org/', ''),
('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
('http://python.org/?q', 'http://python.org/?q', ''),
('http://python.org/p#frag', 'http://python.org/p', 'frag'),
('http://python.org/p?q', 'http://python.org/p?q', ''),
(RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
(RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
]:
self.assertEqual(urlparse.urldefrag(url), (defrag, frag))
def normalize_url(url):
# TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
try:
url = urlnorm.norm(url)
url, _ = urldefrag(url)
url = re.sub('[\n\r]', '', url)
url = url.rstrip('/')
return url
except Exception:
return None
def _urljoin(base, url):
"""
Construct a full ("absolute") URL by combining a "base URL" with another
URL. Informally, this uses components of the base URL, in particular the
addressing scheme, the network location and (part of) the path, to provide
missing components in the relative URL.
Additionally, the fragment identifier is preserved according to the HTTP
1.1 bis draft.
@type base: C{bytes}
@param base: Base URL.
@type url: C{bytes}
@param url: URL to combine with C{base}.
@return: An absolute URL resulting from the combination of C{base} and
C{url}.
@see: L{urlparse.urljoin}
@see: U{https://tools.ietf.org/html/draft-ietf-httpbis-p2-semantics-22#section-7.1.2}
"""
base, baseFrag = urldefrag(base)
url, urlFrag = urldefrag(urljoin(base, url))
return urljoin(url, b'#' + (urlFrag or baseFrag))
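# Brief check of the fragment handling described in the docstring (a sketch;
# it assumes the module-level urldefrag/urljoin imports of the original file,
# from urlparse in Python 2 or urllib.parse in Python 3).
assert _urljoin(b'http://example.com/a#frag', b'/b') == b'http://example.com/b#frag'          # base fragment preserved
assert _urljoin(b'http://example.com/a#frag', b'/b#other') == b'http://example.com/b#other'   # relative fragment wins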
def _add_link(self,a_link):
if not self._redis_enable:
task.url_task.extend(a_link)
print('Add link to memory')
return
self._log.debug("putting url into redis %s " % self.name)
for a_l in a_link:
self._r.lpush(self._r.hget(self.name,codes.url),urlparse.urldefrag(a_l)[0])
def handle_link(self):
#download_url=self._r.hget(self.name,codes.url)
a_link=[a.get('href') for a in self._soup.find_all('a') if a.get('href')]
a_link=list(set(a_link))
b_link=[]
for a in a_link:
a=urlparse.urldefrag(a)[0]
if a.startswith('//jandan.net/ooxx') or a.startswith('//wx1.sinaimg.cn'):
print("Putting %s " % (a))
#self._r.lpush(download_url,a)
b_link.append(a)
self._add_link(b_link)
def defrag(self):
if "#" in self:
url, frag = urldefrag(self)
return URIRef(url)
else:
return self
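# Usage sketch, assuming the defrag() above is rdflib's URIRef method
# (URIRef imported from rdflib; the example URIs are made up).
from rdflib import URIRef

assert URIRef('http://example.org/vocab#Thing').defrag() == URIRef('http://example.org/vocab')
assert URIRef('http://example.org/vocab').defrag() == URIRef('http://example.org/vocab')   # no '#': unchanged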
def startElementNS(self, name, qname, attrs):
stack = self.stack
stack.append(ElementHandler())
current = self.current
parent = self.parent
base = attrs.get(BASE, None)
if base is not None:
base, frag = urldefrag(base)
if parent and parent.base:
base = urljoin(parent.base, base)
else:
systemId = self.locator.getPublicId() \
or self.locator.getSystemId()
if systemId:
base = urljoin(systemId, base)
else:
if parent:
base = parent.base
if base is None:
systemId = self.locator.getPublicId() \
or self.locator.getSystemId()
if systemId:
base, frag = urldefrag(systemId)
current.base = base
language = attrs.get(LANG, None)
if language is None:
if parent:
language = parent.language
current.language = language
current.start(name, qname, attrs)