def extract_real_url_from_embedded_url(embedded_url):
    """
    Decode a URL whose query string was embedded into its path.

    This is the counterpart of embed_real_url_to_embedded_url(): it takes a
    URL whose query string was packed into the path and rebuilds the URL with
    an ordinary ``?query`` part.
    NOTE(review): the original (non-English) docstring text was lost to
    encoding damage; it also mentioned the
    `cdn_redirect_encode_query_str_into_url` setting -- presumably that
    setting relies on this function; confirm against the config docs.

    Examples from the original docstring (the host is shown as foo.com
    there, but this function itself never changes the host -- it only
    slices ``embedded_url`` and appends the decoded query string):

    eg: https://cdn.domain.com/a.php_zm24_.cT1zb21ldGhpbmc=._zm24_.css
        ---> https://foo.com/a.php?q=something (assume it returns an css) (base64 only)
    eg2: https://cdn.domain.com/a/b/_zm24_.bG92ZT1saXZl._zm24_.jpg
        ---> https://foo.com/a/b/?love=live (assume it returns an jpg) (base64 only)
    eg3: https://cdn.domain.com/a/b/_zm24z_.[some long long base64 encoded string]._zm24_.jpg
        ---> https://foo.com/a/b/?love=live[and a long long query string] (assume it returns an jpg) (gzip + base64)
    eg4: https://cdn.domain.com/a  (no change)
        ---> (no query string): https://foo.com/a (assume it returns an png) (no change)

    :param embedded_url: URL that may carry an embedded query string
    :return: the decoded URL if ``embedded_url`` is an embedded URL with a
        well-formed payload, otherwise None
    :type embedded_url: str
    :rtype: Union[str, None]
    """
    # Cheap pre-check: the closing mark must appear near the end of the URL.
    mark = '._' + cdn_url_query_encode_salt + '_.'
    if mark not in embedded_url[-15:]:  # check url mark
        return None

    m = regex_extract_base64_from_embedded_url.search(embedded_url)
    if m is None:
        # The trailing mark is present but the full embedded pattern is not;
        # treat it as "not an embedded URL" instead of crashing on m.span().
        return None

    b64 = get_group('b64', m)

    # 'https://cdn.domain.com/a.php_zm24_.cT1zb21ldGhpbmc=._zm24_.css'
    # real_request_url_no_query ---> 'https://cdn.domain.com/a.php'
    real_request_url_no_query = embedded_url[:m.start()]

    try:
        query_string_byte = base64.urlsafe_b64decode(b64)
        if get_group('gzip', m):
            # The payload was compressed before being base64-encoded.
            query_string_byte = zlib.decompress(query_string_byte)
        query_string = query_string_byte.decode(encoding='utf-8')
    except (ValueError, zlib.error):
        # binascii.Error and UnicodeDecodeError are both ValueError
        # subclasses. The URL comes from outside, so a payload that is not
        # valid base64 / zlib / UTF-8 means this is not one of our URLs.
        return None

    return urljoin(real_request_url_no_query, '?' + query_string)
# Comment list
# Table of contents