python类urldefrag()的实例源码

test_urlparse.py 文件源码 项目:fast_urlparse 作者: Parsely 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def test_urldefrag(self):
        """urldefrag() must split both str and bytes URLs into (url, fragment)."""
        cases = [
            ('http://python.org#frag', 'http://python.org', 'frag'),
            ('http://python.org', 'http://python.org', ''),
            ('http://python.org/#frag', 'http://python.org/', 'frag'),
            ('http://python.org/', 'http://python.org/', ''),
            ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
            ('http://python.org/?q', 'http://python.org/?q', ''),
            ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
            ('http://python.org/p?q', 'http://python.org/p?q', ''),
            (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
            (RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
        ]
        # Mirror every str case as a bytes case, preserving the tuple type.
        all_cases = cases + [
            type(case)(part.encode('ascii') for part in case) for case in cases
        ]
        for url, defrag, frag in all_cases:
            result = urlparse.urldefrag(url)
            # Check the round-trip, tuple equality, and named-attribute views.
            self.assertEqual(result.geturl(), url)
            self.assertEqual(result, (defrag, frag))
            self.assertEqual(result.url, defrag)
            self.assertEqual(result.fragment, frag)
so2html.py 文件源码 项目:chat 作者: cambridgeltl 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def oa_to_standoff(annotations, target_key='target'):
    """Convert OA (Open Annotation) annotations to Standoff objects.

    Each annotation's target is assumed to point at the current document,
    so only its URL fragment is used; the fragment is expected to carry a
    span as "...=start,end". Targets that cannot be parsed fall back to
    the span (0, 1) after a warning.
    """
    standoffs = []
    for annotation in annotations:
        target = annotation[target_key]
        # assume target is current doc, ignore all but fragment.
        fragment = urlparse.urldefrag(target)[1]
        try:
            start_end = fragment.split('=', 1)[1]
            start, end = start_end.split(',')
        except (IndexError, ValueError):
            # IndexError: no '=' in the fragment. ValueError: the value is
            # not exactly "start,end" (the original only caught IndexError,
            # so a malformed span crashed instead of using the fallback).
            warn('failed to parse target %s' % target)
            start, end = 0, 1
        for type_, norm in _parse_body(annotation):
            standoffs.append(Standoff(int(start), int(end), type_, norm))
    return standoffs
crawler.py 文件源码 项目:oscp 作者: sealmindset 项目源码 文件源码 阅读 37 收藏 0 点赞 0 评论 0
def _pre_visit_url_condense(self, url):

        """ Reduce (condense) URLs into some canonical form before
        visiting.  All occurrences of equivalent URLs are treated as
        identical.

        All this does is strip the \"fragment\" component from URLs,
        so that http://foo.com/blah.html\#baz becomes
        http://foo.com/blah.html """

        base, frag = urlparse.urldefrag(url)
        return base

    ## URL Filtering functions.  These all use information from the
    ## state of the Crawler to evaluate whether a given URL should be
    ## used in some context.  Return value of True indicates that the
    ## URL should be used.
??????.py 文件源码 项目:Python_Study 作者: thsheep 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def remove_fragment(url):
    """Return *url* with any '#fragment' component stripped."""
    return urldefrag(url)[0]
link_crawler.py 文件源码 项目:WebScraping 作者: liinnux 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def normalize(seed_url, link):
    """Resolve *link* against *seed_url*, stripping any '#fragment' so
    that equivalent pages are not crawled twice.
    """
    defragged = urlparse.urldefrag(link)[0]  # drop hash to avoid duplicates
    return urlparse.urljoin(seed_url, defragged)
link_crawler3.py 文件源码 项目:WebScraping 作者: liinnux 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def normalize(seed_url, link):
    """Turn *link* into an absolute URL rooted at *seed_url*, with the
    hash fragment removed so duplicates collapse.
    """
    without_hash, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, without_hash)
process_crawler.py 文件源码 项目:WebScraping 作者: liinnux 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def normalize(seed_url, link):
    """Canonicalise *link*: remove its hash fragment, then resolve it
    relative to *seed_url*.
    """
    # Strip the fragment first so equivalent URLs are not visited twice.
    stripped = urlparse.urldefrag(link)[0]
    return urlparse.urljoin(seed_url, stripped)
threaded_crawler.py 文件源码 项目:WebScraping 作者: liinnux 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def normalize(seed_url, link):
    """Resolve *link* against *seed_url* after dropping its '#fragment',
    so equivalent pages map to one canonical URL.
    """
    defragged_link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, defragged_link)
link_crawler.py 文件源码 项目:WebScraping 作者: liinnux 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def normalize(seed_url, link):
    """Produce the canonical absolute form of *link* relative to
    *seed_url*, with the hash removed to avoid duplicate visits.
    """
    return urlparse.urljoin(seed_url, urlparse.urldefrag(link)[0])
discover.py 文件源码 项目:oa_qian 作者: sunqb 项目源码 文件源码 阅读 41 收藏 0 点赞 0 评论 0
def getDisplayIdentifier(self):
        """Return the display_identifier if set; otherwise the claimed_id
        with its fragment stripped, or None when neither is available.
        """
        if self.display_identifier is not None:
            return self.display_identifier
        if self.claimed_id is not None:
            return urlparse.urldefrag(self.claimed_id)[0]
        return None
discover.py 文件源码 项目:oa_qian 作者: sunqb 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def normalizeURL(url):
    """Normalize a URL and strip its fragment, converting normalization
    failures to DiscoveryFailure.
    """
    try:
        normalized = urinorm.urinorm(url)
    except ValueError as why:
        # `except ValueError, why` is Python-2-only syntax, and `why[0]`
        # only works on py2 exceptions; `as why` + `why.args[0]` is valid
        # on both Python 2.6+ and Python 3.
        raise DiscoveryFailure(
            'Normalizing identifier: %s' % (why.args[0],), None)
    else:
        return urlparse.urldefrag(normalized)[0]
test_urlparse.py 文件源码 项目:oil 作者: oilshell 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def test_urldefrag(self):
        """urldefrag() should split a URL into (defragmented-url, fragment)."""
        cases = [
            ('http://python.org#frag', 'http://python.org', 'frag'),
            ('http://python.org', 'http://python.org', ''),
            ('http://python.org/#frag', 'http://python.org/', 'frag'),
            ('http://python.org/', 'http://python.org/', ''),
            ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
            ('http://python.org/?q', 'http://python.org/?q', ''),
            ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
            ('http://python.org/p?q', 'http://python.org/p?q', ''),
            (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
            (RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
        ]
        for url, expected_url, expected_frag in cases:
            self.assertEqual(urlparse.urldefrag(url),
                             (expected_url, expected_frag))
test_urlparse.py 文件源码 项目:python2-tracer 作者: extremecoders-re 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def test_urldefrag(self):
        """Each URL must defrag into the expected (url, fragment) pair."""
        cases = [
            ('http://python.org#frag', 'http://python.org', 'frag'),
            ('http://python.org', 'http://python.org', ''),
            ('http://python.org/#frag', 'http://python.org/', 'frag'),
            ('http://python.org/', 'http://python.org/', ''),
            ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
            ('http://python.org/?q', 'http://python.org/?q', ''),
            ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
            ('http://python.org/p?q', 'http://python.org/p?q', ''),
            (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
            (RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
        ]
        for url, want_url, want_frag in cases:
            self.assertEqual(urlparse.urldefrag(url), (want_url, want_frag))
url.py 文件源码 项目:krauler 作者: occrp-attic 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def normalize_url(url):
    """Return a canonical form of *url* — normalised, defragmented, with
    embedded CR/LF and any trailing slash removed — or None when the URL
    cannot be normalised.
    """
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        url = urlnorm.norm(url)
        url, _ = urldefrag(url)
        url = re.sub(r'[\n\r]', '', url)  # raw string: regex character class
        url = url.rstrip('/')
        return url
    except Exception:
        # Deliberate best-effort: any malformed URL maps to None. The
        # original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; `except Exception` keeps the fallback while
        # letting those propagate.
        return None
test_urlparse.py 文件源码 项目:pefile.pypy 作者: cloudtracer 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def test_urldefrag(self):
        """urldefrag() returns the URL-without-fragment and the fragment."""
        expected = {
            'http://python.org#frag': ('http://python.org', 'frag'),
            'http://python.org': ('http://python.org', ''),
            'http://python.org/#frag': ('http://python.org/', 'frag'),
            'http://python.org/': ('http://python.org/', ''),
            'http://python.org/?q#frag': ('http://python.org/?q', 'frag'),
            'http://python.org/?q': ('http://python.org/?q', ''),
            'http://python.org/p#frag': ('http://python.org/p', 'frag'),
            'http://python.org/p?q': ('http://python.org/p?q', ''),
            RFC1808_BASE: ('http://a/b/c/d;p?q', 'f'),
            RFC2396_BASE: ('http://a/b/c/d;p?q', ''),
        }
        for url, pair in expected.items():
            self.assertEqual(urlparse.urldefrag(url), pair)
test_urlparse.py 文件源码 项目:ndk-python 作者: gittor 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def test_urldefrag(self):
        """Verify urldefrag() against a table of (url, defrag, frag) rows."""
        table = (
            ('http://python.org#frag', 'http://python.org', 'frag'),
            ('http://python.org', 'http://python.org', ''),
            ('http://python.org/#frag', 'http://python.org/', 'frag'),
            ('http://python.org/', 'http://python.org/', ''),
            ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
            ('http://python.org/?q', 'http://python.org/?q', ''),
            ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
            ('http://python.org/p?q', 'http://python.org/p?q', ''),
            (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
            (RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
        )
        for row in table:
            url, defrag, frag = row
            self.assertEqual(urlparse.urldefrag(url), (defrag, frag))
asycrawler.py 文件源码 项目:asyncmultitasks 作者: willwinworld 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def remove_fragment(url):
    """Strip the '#fragment' part from *url* and return the remainder."""
    without_fragment, _ = urldefrag(url)
    return without_fragment
webspider.py 文件源码 项目:aweasome_learning 作者: Knight-ZXW 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def remove_fragment(url):
    """Return *url* without its fragment identifier, if any."""
    stripped = urldefrag(url)[0]
    return stripped
client.py 文件源码 项目:zenchmarks 作者: squeaky-pl 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def _urljoin(base, url):
    """
    Construct a full ("absolute") URL by combining a "base URL" with another
    URL. Informally, this uses components of the base URL, in particular the
    addressing scheme, the network location and (part of) the path, to provide
    missing components in the relative URL.

    Additionally, the fragment identifier is preserved according to the HTTP
    1.1 bis draft.

    @type base: C{bytes}
    @param base: Base URL.

    @type url: C{bytes}
    @param url: URL to combine with C{base}.

    @return: An absolute URL resulting from the combination of C{base} and
        C{url}.

    @see: L{urlparse.urljoin}

    @see: U{https://tools.ietf.org/html/draft-ietf-httpbis-p2-semantics-22#section-7.1.2}
    """
    base, baseFrag = urldefrag(base)
    url, urlFrag = urldefrag(urljoin(base, url))
    return urljoin(url, b'#' + (urlFrag or baseFrag))
discover.py 文件源码 项目:micro-blog 作者: nickChenyx 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def getDisplayIdentifier(self):
        """Prefer the display_identifier; fall back to the defragmented
        claimed_id, or None when the claimed_id is also unset.
        """
        if self.display_identifier is not None:
            return self.display_identifier
        if self.claimed_id is None:
            return None
        defragged, _ = urlparse.urldefrag(self.claimed_id)
        return defragged
discover.py 文件源码 项目:micro-blog 作者: nickChenyx 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def normalizeURL(url):
    """Normalize a URL and drop its fragment, converting normalization
    failures to DiscoveryFailure.
    """
    try:
        normalized = urinorm.urinorm(url)
    except ValueError as why:
        # Fix: `except ValueError, why` is Python-2-only syntax, and
        # `why[0]` is not valid on py3 exceptions; `as why` with
        # `why.args[0]` behaves identically on Python 2.6+ and 3.
        raise DiscoveryFailure(
            'Normalizing identifier: %s' % (why.args[0],), None)
    else:
        return urlparse.urldefrag(normalized)[0]
webspider.py 文件源码 项目:browser_vuln_check 作者: lcatro 项目源码 文件源码 阅读 38 收藏 0 点赞 0 评论 0
def remove_fragment(url):
    """Drop any fragment identifier from *url* and return the result."""
    url_without_fragment, _fragment = urldefrag(url)
    return url_without_fragment
handler.py 文件源码 项目:yjspider 作者: junyu1991 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def _add_link(self,a_link):
        """Queue the given links for crawling, in memory or in redis."""
        if not self._redis_enable:
            # No redis configured: fall back to the in-memory task list.
            task.url_task.extend(a_link)
            print('Add link to memory')
            return
        self._log.debug("putting url into redis %s " % self.name)
        for link in a_link:
            # Strip the fragment before queueing so equivalent URLs collapse.
            self._r.lpush(self._r.hget(self.name,codes.url),urlparse.urldefrag(link)[0])
test.py 文件源码 项目:yjspider 作者: junyu1991 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def handle_link(self):
        """Collect deduplicated ooxx/sinaimg links from the parsed page
        and queue them via _add_link."""
        hrefs = {a.get('href') for a in self._soup.find_all('a') if a.get('href')}
        accepted = []
        for href in hrefs:
            href = urlparse.urldefrag(href)[0]
            if href.startswith(('//jandan.net/ooxx', '//wx1.sinaimg.cn')):
                print("Putting %s " % (href))
                accepted.append(href)
        self._add_link(accepted)
term.py 文件源码 项目:Meiji 作者: GiovanniBalestrieri 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def defrag(self):
        """Return a URIRef with the fragment removed, or self unchanged
        when there is no fragment."""
        if "#" not in self:
            return self
        return URIRef(urldefrag(self)[0])
rdfxml.py 文件源码 项目:Meiji 作者: GiovanniBalestrieri 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def startElementNS(self, name, qname, attrs):
        # SAX handler: open a new element, resolving its effective xml:base
        # and xml:lang before delegating to the element handler's start().
        stack = self.stack
        stack.append(ElementHandler())
        # NOTE: self.current/self.parent are read after the append —
        # presumably they track the stack top; confirm in the class.
        current = self.current
        parent = self.parent
        base = attrs.get(BASE, None)
        if base is not None:
            # Explicit xml:base on this element: strip its fragment, then
            # resolve it against the parent's base — or against the
            # document's public/system id when there is no parent base.
            base, frag = urldefrag(base)
            if parent and parent.base:
                base = urljoin(parent.base, base)
            else:
                systemId = self.locator.getPublicId() \
                    or self.locator.getSystemId()
                if systemId:
                    base = urljoin(systemId, base)
        else:
            # No xml:base here: inherit the parent's base, falling back to
            # the (defragmented) document id when the parent has none.
            if parent:
                base = parent.base
            if base is None:
                systemId = self.locator.getPublicId() \
                    or self.locator.getSystemId()
                if systemId:
                    base, frag = urldefrag(systemId)
        current.base = base
        language = attrs.get(LANG, None)
        if language is None:
            # xml:lang is inherited from the enclosing element when absent.
            if parent:
                language = parent.language
        current.language = language
        current.start(name, qname, attrs)
webspider.py 文件源码 项目:LinuxBashShellScriptForOps 作者: DingGuodong 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def remove_fragment(url):
    """Return the given URL minus its '#fragment' component."""
    return urldefrag(url).url
term.py 文件源码 项目:prophet 作者: MKLab-ITI 项目源码 文件源码 阅读 35 收藏 0 点赞 0 评论 0
def defrag(self):
        """Strip the fragment, returning a new URIRef; identifiers with
        no '#' are returned unchanged."""
        if "#" not in self:
            return self
        defragged, _ = urldefrag(self)
        return URIRef(defragged)
rdfxml.py 文件源码 项目:prophet 作者: MKLab-ITI 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def startElementNS(self, name, qname, attrs):
        # SAX handler: push a handler for the new element and compute its
        # effective xml:base / xml:lang before calling its start().
        stack = self.stack
        stack.append(ElementHandler())
        # NOTE: current/parent are fetched after the append — presumably
        # properties over the stack; confirm against the class definition.
        current = self.current
        parent = self.parent
        base = attrs.get(BASE, None)
        if base is not None:
            # The element declares its own xml:base: defragment it and
            # resolve it against the parent base, or the document's
            # public/system id when no parent base exists.
            base, frag = urldefrag(base)
            if parent and parent.base:
                base = urljoin(parent.base, base)
            else:
                systemId = self.locator.getPublicId() \
                    or self.locator.getSystemId()
                if systemId:
                    base = urljoin(systemId, base)
        else:
            # No xml:base attribute: inherit from the parent; if still
            # unset, fall back to the defragmented document id.
            if parent:
                base = parent.base
            if base is None:
                systemId = self.locator.getPublicId() \
                    or self.locator.getSystemId()
                if systemId:
                    base, frag = urldefrag(systemId)
        current.base = base
        language = attrs.get(LANG, None)
        if language is None:
            # xml:lang inherits from the enclosing element when absent.
            if parent:
                language = parent.language
        current.language = language
        current.start(name, qname, attrs)
discover.py 文件源码 项目:Hawkeye 作者: tozhengxq 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def getDisplayIdentifier(self):
        """Return display_identifier when it is set; otherwise return the
        claimed_id stripped of its fragment, or None if unset."""
        if self.display_identifier is not None:
            return self.display_identifier
        claimed = self.claimed_id
        return None if claimed is None else urlparse.urldefrag(claimed)[0]


问题


面经


文章

微信
公众号

扫码关注公众号