def strip_uri(self, uri, expected_language=None,
              remove_index=False):
    '''Strip language markers from ``uri`` and rebuild it as an http URI.

    The path is passed through ``self.strip_path`` and the query through
    ``self.strip_query``; separator artifacts left behind in the path are
    then cleaned up.  The rebuilt URI always uses the ``http`` scheme,
    keeps the original netloc unchanged and drops params and fragment.

    Args:
        uri: the URI to process.
        expected_language: if not None, the value that ``self.match`` must
            return for either the path or the query; otherwise the call
            fails.
        remove_index: if true and the stripped query is empty, drop a last
            path segment that starts with ``index`` (together with the
            slash in front of it).

    Returns:
        A tuple ``(stripped_uri, success)``.  ``('', False)`` is returned
        when ``expected_language`` was not matched or when the URI has no
        host.  Otherwise ``success`` is simply ``stripped_uri != uri``, so
        it is also True when only the scheme, params or fragment changed.
    '''
    parsed_uri = urlparse.urlparse(uri)
    original_path = parsed_uri.path

    matched_languages = [self.match(original_path),
                         self.match(parsed_uri.query)]
    if expected_language is not None \
            and expected_language not in matched_languages:
        # We would remove a bit of the URL, but it does not support our
        # hope to find expected_language, e.g. removing /fr/ when we were
        # looking for Italian pages.
        return '', False

    stripped_path = self.strip_path(original_path)

    # Repair stripping artifacts: collapse runs of separators and drop
    # underscores left dangling next to a slash.  Order matters, the
    # patterns are applied one after another.
    for pattern, replacement in ((r'//+', '/'),
                                 (r'__+', '_'),
                                 (r'/_+', '/'),
                                 (r'_/', '/'),
                                 (r'--+', '-')):
        stripped_path = re.sub(pattern, replacement, stripped_path)

    # Keep the trailing-slash state of the original path: drop a slash
    # that only appeared through stripping ...
    if stripped_path.endswith('/') \
            and original_path and not original_path.endswith('/'):
        stripped_path = stripped_path[:-1]
    # ... and put back one that stripping removed.
    if original_path.endswith('/') and not stripped_path.endswith('/'):
        stripped_path += '/'

    stripped_query = self.strip_query(parsed_uri.query)

    # Remove index files from the tail of the path if the query is empty.
    if remove_index and not stripped_query:
        head, _, last_segment = stripped_path.rpartition('/')
        if last_segment.startswith('index'):
            stripped_path = head

    # A URI without a host cannot be rebuilt.  ``hostname`` ignores
    # userinfo (splitting on the last '@') and the port, so odd
    # credentials such as "u@@host" no longer hide a valid host.
    if not parsed_uri.hostname:
        return '', False

    stripped_uri = urlparse.ParseResult(scheme='http',
                                        netloc=parsed_uri.netloc,
                                        path=stripped_path,
                                        params='',
                                        query=stripped_query,
                                        fragment='').geturl()
    return stripped_uri, stripped_uri != uri
languagestripper.py 文件源码
python
阅读 20
收藏 0
点赞 0
评论 0
评论列表
文章目录