languagestripper.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:wmt16-document-alignment-task 作者: christianbuck 项目源码 文件源码
def strip_uri(self, uri, expected_language=None,
                  remove_index=False):
        """Strip language markers from ``uri`` and rebuild it.

        The path and query are passed through ``self.strip_path`` and
        ``self.strip_query``. The rebuilt URI always uses the ``http``
        scheme and carries neither params nor fragment.

        Args:
            uri: URI string to process.
            expected_language: when not None, it must equal the value
                ``self.match`` returns for the path or for the query;
                otherwise the call fails.
            remove_index: when true and the stripped query is empty, a
                final path segment starting with ``index`` is dropped
                together with the slash in front of it.

        Returns:
            A ``(stripped_uri, success)`` tuple. It is ``('', False)`` when
            the expected language was not matched or when no host is left
            in the netloc. Otherwise ``success`` tells whether the rebuilt
            URI differs from ``uri``.
        """
        parts = urlparse.urlparse(uri)
        original_path = parts.path

        found = [self.match(original_path), self.match(parts.query)]
        if not (expected_language is None or expected_language in found):
            # Whatever could be stripped does not belong to the language the
            # caller hoped for, e.g. the URI contains /fr/ while Italian
            # pages were requested.
            return '', False

        path = self.strip_path(original_path)

        # Tidy up separators left dangling where something was cut out.
        # The substitutions are applied one after another in this order.
        for pattern, replacement in ((r'//+', '/'),
                                     (r'__+', '_'),
                                     (r'/_+', '/'),
                                     (r'_/', '/'),
                                     (r'--+', '-')):
            path = re.sub(pattern, replacement, path)

        # Keep the trailing-slash state of the original path.
        if path.endswith('/'):
            if original_path and not original_path.endswith('/'):
                # stripping exposed a slash that was not at the end before
                path = path[:-1]
        elif original_path.endswith('/'):
            # stripping swallowed the final slash, put it back
            path += '/'

        query = self.strip_query(parts.query)

        # An index file at the end of the path is only dropped when no
        # query is left.
        if remove_index and not query:
            head, _, last_segment = path.rpartition('/')
            if last_segment.startswith('index'):
                path = head

        # Make sure a host remains once credentials and port are ignored.
        # NOTE(review): this reduced host is used for validation only; the
        # rebuilt URI below keeps the original netloc, including any
        # user-info and port -- confirm that this is intended.
        host = parts.netloc
        if '@' in host:
            host = host.split('@')[1]
        host = host.partition(':')[0]
        if not host:
            return '', False

        stripped_uri = urlparse.ParseResult(
            'http', parts.netloc, path, '', query, '').geturl()

        # NOTE(review): because the scheme is forced to http and the
        # fragment is dropped, the flag can be True even when no language
        # marker was removed (e.g. for an https input) -- confirm.
        changed = stripped_uri != uri
        return stripped_uri, changed
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号