def test_urldefrag(self):
str_cases = [
('http://python.org#frag', 'http://python.org', 'frag'),
('http://python.org', 'http://python.org', ''),
('http://python.org/#frag', 'http://python.org/', 'frag'),
('http://python.org/', 'http://python.org/', ''),
('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
('http://python.org/?q', 'http://python.org/?q', ''),
('http://python.org/p#frag', 'http://python.org/p', 'frag'),
('http://python.org/p?q', 'http://python.org/p?q', ''),
(RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
(RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
]
def _encode(t):
return type(t)(x.encode('ascii') for x in t)
bytes_cases = [_encode(x) for x in str_cases]
for url, defrag, frag in str_cases + bytes_cases:
result = urlparse.urldefrag(url)
self.assertEqual(result.geturl(), url)
self.assertEqual(result, (defrag, frag))
self.assertEqual(result.url, defrag)
self.assertEqual(result.fragment, frag)
def oa_to_standoff(annotations, target_key='target'):
"""Convert OA annotations to Standoff objects."""
standoffs = []
for annotation in annotations:
target = annotation[target_key]
# assume target is current doc, ignore all but fragment.
fragment = urlparse.urldefrag(target)[1]
try:
start_end = fragment.split('=', 1)[1]
start, end = start_end.split(',')
except IndexError:
warn('failed to parse target %s' % target)
start, end = 0, 1
for type_, norm in _parse_body(annotation):
standoffs.append(Standoff(int(start), int(end), type_, norm))
return standoffs
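# A small hedged sketch of the fragment parsing performed above. The
# "#char=start,end" target form is an assumption made for illustration only,
# not necessarily the exact selector format of the source annotations.
try:
    from urlparse import urldefrag          # Python 2, as used above
except ImportError:
    from urllib.parse import urldefrag      # Python 3

target = 'http://example.org/doc.txt#char=10,15'
fragment = urldefrag(target)[1]             # 'char=10,15'
start_end = fragment.split('=', 1)[1]       # '10,15'
start, end = start_end.split(',')           # ('10', '15')
assert (int(start), int(end)) == (10, 15)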
def _pre_visit_url_condense(self, url):
""" Reduce (condense) URLs into some canonical form before
visiting. All occurrences of equivalent URLs are treated as
identical.
All this does is strip the "fragment" component from URLs,
so that http://foo.com/blah.html#baz becomes
http://foo.com/blah.html """
base, frag = urlparse.urldefrag(url)
return base
## URL Filtering functions. These all use information from the
## state of the Crawler to evaluate whether a given URL should be
## used in some context. Return value of True indicates that the
## URL should be used.
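# Hypothetical example of a filter in the style described above (not from the
# original crawler; the function name, parameters and same-host policy are
# made up purely to illustrate the "return True to use the URL" contract).
try:
    from urlparse import urlparse            # Python 2
except ImportError:
    from urllib.parse import urlparse        # Python 3

def _same_host_filter(crawl_host, url):
    """Return True if url points at the host the crawl started from."""
    return urlparse(url).netloc == crawl_host

assert _same_host_filter('foo.com', 'http://foo.com/blah.html') is True
assert _same_host_filter('foo.com', 'http://bar.com/') is False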
def remove_fragment(url):
pure_url, _ = urldefrag(url)
return pure_url
def normalize(seed_url, link):
"""Normalize this URL by removing hash and adding domain
"""
link, _ = urlparse.urldefrag(link) # remove hash to avoid duplicates
return urlparse.urljoin(seed_url, link)
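# Quick usage check of normalize() above (a sketch; it relies on the snippet's
# module-level `import urlparse`, i.e. Python 2 -- use urllib.parse in Python 3).
seed = 'http://example.com/index.html'
# Relative link with a fragment: the hash is dropped and the domain added.
assert normalize(seed, '/about.html#team') == 'http://example.com/about.html'
# Absolute link: only its fragment is removed.
assert normalize(seed, 'http://example.com/a#b') == 'http://example.com/a'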
def getDisplayIdentifier(self):
"""Return the display_identifier if set, else return the claimed_id.
"""
if self.display_identifier is not None:
return self.display_identifier
if self.claimed_id is None:
return None
else:
return urlparse.urldefrag(self.claimed_id)[0]
def normalizeURL(url):
"""Normalize a URL, converting normalization failures to
DiscoveryFailure"""
try:
normalized = urinorm.urinorm(url)
except ValueError as why:
raise DiscoveryFailure('Normalizing identifier: %s' % (why[0],), None)
else:
return urlparse.urldefrag(normalized)[0]
def test_urldefrag(self):
for url, defrag, frag in [
('http://python.org#frag', 'http://python.org', 'frag'),
('http://python.org', 'http://python.org', ''),
('http://python.org/#frag', 'http://python.org/', 'frag'),
('http://python.org/', 'http://python.org/', ''),
('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
('http://python.org/?q', 'http://python.org/?q', ''),
('http://python.org/p#frag', 'http://python.org/p', 'frag'),
('http://python.org/p?q', 'http://python.org/p?q', ''),
(RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
(RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
]:
self.assertEqual(urlparse.urldefrag(url), (defrag, frag))
def normalize_url(url):
# TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
try:
url = urlnorm.norm(url)
url, _ = urldefrag(url)
url = re.sub('[\n\r]', '', url)
url = url.rstrip('/')
return url
except Exception:
return None
def _urljoin(base, url):
"""
Construct a full ("absolute") URL by combining a "base URL" with another
URL. Informally, this uses components of the base URL, in particular the
addressing scheme, the network location and (part of) the path, to provide
missing components in the relative URL.
Additionally, the fragment identifier is preserved according to the HTTP
1.1 bis draft.
@type base: C{bytes}
@param base: Base URL.
@type url: C{bytes}
@param url: URL to combine with C{base}.
@return: An absolute URL resulting from the combination of C{base} and
C{url}.
@see: L{urlparse.urljoin}
@see: U{https://tools.ietf.org/html/draft-ietf-httpbis-p2-semantics-22#section-7.1.2}
"""
base, baseFrag = urldefrag(base)
url, urlFrag = urldefrag(urljoin(base, url))
return urljoin(url, b'#' + (urlFrag or baseFrag))
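# Brief check of the fragment handling described in the docstring (a sketch;
# it assumes the module-level urldefrag/urljoin imports of the original file,
# from urlparse in Python 2 or urllib.parse in Python 3).
assert _urljoin(b'http://example.com/a#frag', b'/b') == b'http://example.com/b#frag'          # base fragment preserved
assert _urljoin(b'http://example.com/a#frag', b'/b#other') == b'http://example.com/b#other'   # relative fragment wins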
def _add_link(self,a_link):
if not self._redis_enable:
task.url_task.extend(a_link)
print('Add link to memory')
return
self._log.debug("putting url into redis %s " % self.name)
for a_l in a_link:
self._r.lpush(self._r.hget(self.name,codes.url),urlparse.urldefrag(a_l)[0])
def handle_link(self):
#download_url=self._r.hget(self.name,codes.url)
a_link=[a.get('href') for a in self._soup.find_all('a') if a.get('href')]
a_link=list(set(a_link))
b_link=[]
for a in a_link:
a=urlparse.urldefrag(a)[0]
if a.startswith('//jandan.net/ooxx') or a.startswith('//wx1.sinaimg.cn'):
print("Putting %s " % (a))
#self._r.lpush(download_url,a)
b_link.append(a)
self._add_link(b_link)
def defrag(self):
if "#" in self:
url, frag = urldefrag(self)
return URIRef(url)
else:
return self
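# Usage sketch, assuming the defrag() above is rdflib's URIRef method
# (URIRef imported from rdflib; the example URIs are made up).
from rdflib import URIRef

assert URIRef('http://example.org/vocab#Thing').defrag() == URIRef('http://example.org/vocab')
assert URIRef('http://example.org/vocab').defrag() == URIRef('http://example.org/vocab')   # no '#': unchanged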
def startElementNS(self, name, qname, attrs):
stack = self.stack
stack.append(ElementHandler())
current = self.current
parent = self.parent
base = attrs.get(BASE, None)
if base is not None:
base, frag = urldefrag(base)
if parent and parent.base:
base = urljoin(parent.base, base)
else:
systemId = self.locator.getPublicId() \
or self.locator.getSystemId()
if systemId:
base = urljoin(systemId, base)
else:
if parent:
base = parent.base
if base is None:
systemId = self.locator.getPublicId() \
or self.locator.getSystemId()
if systemId:
base, frag = urldefrag(systemId)
current.base = base
language = attrs.get(LANG, None)
if language is None:
if parent:
language = parent.language
current.language = language
current.start(name, qname, attrs)