# utils.py source code - Python code snippet

def extract_real_url_from_embedded_url(embedded_url):
    """
    Recover the URL with its original query string from an "embedded" URL.

    This is the decoding counterpart of embed_real_url_to_embedded_url(): the
    query string was packed into the path as a urlsafe-base64 segment
    (optionally zlib-compressed first, which the original notes call "gzip")
    wrapped in salt marks. It is related to the
    `cdn_redirect_encode_query_str_into_url` setting.

    Only the path/query part is rebuilt; the scheme and host of
    `embedded_url` are kept as-is by this function.

    Examples (salt ``zm24``; the exact pattern is defined by
    ``regex_extract_base64_from_embedded_url`` elsewhere in the module):

    eg: https://cdn.domain.com/a.php_zm24_.cT1zb21ldGhpbmc=._zm24_.css
        ---> https://cdn.domain.com/a.php?q=something    (base64 only)
    eg2: https://cdn.domain.com/a/b/_zm24_.bG92ZT1saXZl._zm24_.jpg
        ---> https://cdn.domain.com/a/b/?love=live       (base64 only)
    eg3: https://cdn.domain.com/a/b/_zm24z_.[long base64 string]._zm24_.jpg
        ---> https://cdn.domain.com/a/b/?love=live[...]  (zlib + base64)
    eg4: https://cdn.domain.com/a
        ---> None (no embedded query string)

    :param embedded_url: a URL that may carry an embedded query string
    :return: the decoded URL if `embedded_url` is an embedded URL whose
        payload decodes cleanly; otherwise None
    :type embedded_url: str
    :rtype: Union[str, None]
    """
    # Cheap pre-check: the closing mark must appear within the last 15
    # characters, so ordinary URLs are rejected without running the regex.
    if '._' + cdn_url_query_encode_salt + '_.' not in embedded_url[-15:]:  # check url mark
        return None

    m = regex_extract_base64_from_embedded_url.search(embedded_url)
    if m is None:
        # The closing mark is present but the full pattern is not: this is
        # not a well-formed embedded URL.
        return None
    b64 = get_group('b64', m)

    # 'https://cdn.domain.com/a.php_zm24_.cT1zb21ldGhpbmc=._zm24_.css'
    # real_request_url_no_query ---> 'https://cdn.domain.com/a.php'
    real_request_url_no_query = embedded_url[:m.start()]

    # The payload comes from the request URL and is therefore untrusted;
    # treat any decoding failure as "not an embedded URL".
    try:
        query_string_byte = base64.urlsafe_b64decode(b64)
        if get_group('gzip', m):
            query_string_byte = zlib.decompress(query_string_byte)
        query_string = query_string_byte.decode(encoding='utf-8')
    except (ValueError, zlib.error):
        # binascii.Error and UnicodeDecodeError are both ValueError subclasses.
        return None

    return urljoin(real_request_url_no_query, '?' + query_string)