def fixGoogleURL(url):
    """
    Fixes the url extracted from HTML when
    performing a google search

    Strips the redirect wrappers found around result links ("/url?q=",
    "http://www.google.com/url?url=" and "/url?url="), drops everything
    from the first "&" onward when an "&sa=" parameter is present, and
    prepends "https://" when the link does not start with "http".

    :param url: Raw link text taken from the search result HTML (may be None)
    :return: Correctly formatted URL to be used in requests.get, or None
             when the input is empty/None or the link contains neither a
             "/questions/<digits>" nor a ".com/a/<digit>" path
    """
    query_prefix = "/url?q="
    google_header = "http://www.google.com/url?url="
    broken_prefix = "https:///url?url="

    if not url:
        # Nothing to repair; an empty string was already rejected by the
        # path check below, None used to raise TypeError.
        return None

    if "&sa=" in url:
        # Keep only the part before the first "&" (drops "&sa=..." and
        # whatever parameters follow it).
        url = url.split("&")[0]

    if url.startswith(query_prefix):
        url = url[len(query_prefix):]  # Removes the "/url?q=" prefix

    if url.startswith(google_header):
        # Get rid of this header and just keep the wrapped link
        url = url[len(google_header):]

    if not url.startswith("http"):
        url = "https://" + url  # Add the protocol if it doesn't already exist

    # Makes sure that we stay in the questions section of Stack Overflow.
    # Raw strings keep the regex escape "\." from being read as an
    # (invalid) Python string escape.
    is_question = re.search(r"/questions/[0-9]+", url)
    is_answer = re.search(r"\.com/a/[0-9]", url)
    if not (is_question or is_answer):
        return None

    if url.startswith(broken_prefix):
        # Resolves rare bug in which this is a prefix: a link starting with
        # "/url?url=" gets "https://" prepended by the protocol step above.
        url = url[len(broken_prefix):]

    return url
# Comment list
# Table of contents