def get_file_path(uri):
    """Return the file path from a URI, or None if it is not a file:// URI."""
    url = urlsplit(uri)
    if url.scheme.lower() == "file":
        return unquote(url.path)
    return None
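A minimal usage sketch, assuming get_file_path sits in a module with Python 3's urllib.parse imports (on Python 2 the same names come from urlparse and urllib):

from urllib.parse import urlsplit, unquote  # names used by get_file_path

print(get_file_path("file:///tmp/hello%20world.txt"))  # -> /tmp/hello world.txt
print(get_file_path("https://example.com/data.csv"))   # -> None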
def _is_fetching_self(url, method):
    """Checks if the fetch is for the same URL from which it originated.

    Args:
      url: str, The URL being fetched.
      method: value from _VALID_METHODS.

    Returns:
      boolean indicating whether or not it seems that the app is trying to fetch
        itself.
    """
    if (method != GET or
            "HTTP_HOST" not in os.environ or
            "PATH_INFO" not in os.environ):
        return False
    _, host_port, path, _, _ = urlparse.urlsplit(url)
    if host_port == os.environ['HTTP_HOST']:
        current_path = urllib2.unquote(os.environ['PATH_INFO'])
        desired_path = urllib2.unquote(path)
        if (current_path == desired_path or
                (current_path in ('', '/') and desired_path in ('', '/'))):
            return True
    return False
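A hypothetical illustration of the check, assuming a CGI-style environment where the runtime sets HTTP_HOST and PATH_INFO, and GET is the module-level constant from _VALID_METHODS:

import os

os.environ['HTTP_HOST'] = 'myapp.example.com'
os.environ['PATH_INFO'] = '/feed'
print _is_fetching_self('http://myapp.example.com/feed', GET)    # True
print _is_fetching_self('http://other.example.com/feed', GET)    # False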
def uriparse(uri):
    """Parse a URI (string or URI-like object) and return its unquoted path."""
    if not isinstance(uri, str):
        uri = uri.get_uri()
    return unquote(urlparse(uri).path)
def _showSiteVerificationInfo(site):
    import urllib2
    printKeyValueList([u'Site', site[u'site'][u'identifier']])
    Ind.Increment()
    printKeyValueList([u'ID', urllib2.unquote(site[u'id'])])
    printKeyValueList([u'Type', site[u'site'][u'type']])
    printKeyValueList([u'All Owners', None])
    if u'owners' in site:
        Ind.Increment()
        for owner in site[u'owners']:
            printKeyValueList([owner])
        Ind.Decrement()
    Ind.Decrement()
# gam update verify|verification <DomainName> cname|txt|text|file|site
def check_timestamp(self, bucket_url, bucket_type, timestamp):
    """Check timestamps of signed URLs."""
    timestamp_raw = timestamp
    offsets = []
    mark_request = False
    start = 0
    try:
        if bucket_type != 'Azure':
            now = int(time.time())
            diff = (int(timestamp) - now) / 3600
        else:
            timestamp = unquote(timestamp)
            timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S%fZ')
            diff = int((timestamp - datetime.now()).total_seconds()) / 3600
    except ValueError:
        return
    if diff > 24:
        start = self.helpers.indexOf(self.response,
                                     timestamp_raw, True, 0, self.response_len)
        if start < 0:
            start = self.helpers.indexOf(self.request,
                                         timestamp_raw, True, 0, self.request_len)
            mark_request = True
        self.offset[0] = start
        self.offset[1] = start + len(timestamp_raw)
        offsets.append(self.offset)
        if mark_request:
            markers = [self.callbacks.applyMarkers(self.request_response, offsets, None)]
        else:
            markers = [self.callbacks.applyMarkers(self.request_response, None, offsets)]
        issue_name = '%s Signed URL Excessive Expiration Time' % bucket_type
        issue_level = 'Information'
        issue_detail = '''The following %s signed URL was found to be valid for more than
            24 hours (expires in %sh):<br><li>%s</li>''' % (bucket_type, diff, bucket_url)
        self.scan_issues.append(
            ScanIssue(self.request_response.getHttpService(),
                      self.current_url, markers, issue_name, issue_level, issue_detail)
        )
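A standalone sketch of the expiry arithmetic above (not the Burp extension itself); it assumes the AWS-style value is Unix epoch seconds and the Azure SAS "se" value is a URL-encoded ISO timestamp, parsed here with a simplified format string:

import time
from datetime import datetime
from urllib import unquote  # urllib.parse.unquote on Python 3

aws_expires = str(int(time.time()) + 48 * 3600)             # epoch seconds, ~48h away
diff_aws = (int(aws_expires) - int(time.time())) / 3600      # ~48

azure_se = unquote('2030-01-01T00%3A00%3A00Z')               # '%3A' decodes to ':'
expires = datetime.strptime(azure_se, '%Y-%m-%dT%H:%M:%SZ')  # simplified format
diff_azure = int((expires - datetime.now()).total_seconds()) / 3600

print diff_aws > 24, diff_azure > 24                         # True True (until 2030)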
def url_to_lookup(url_in):
    name = url_in[len(strip_out):]
    find_name = urldecode(name)
    find_name = escape(find_name, quote=True)
    return fixup_find_name(find_name)
def index():
    keywords = request.cookies.get('keywords')
    if not keywords:
        keywords = DEFAULT_KEYWORDS
    else:
        keywords = unquote(keywords)
    target_date = get_date_str(request.cookies.get('datetoken'))
    column_list = []
    for kw in keywords.split(","):
        src = "twitter" if "tweets" in kw.lower() else "arxiv"
        num_page = 80 if src == "twitter" else NUMBER_EACH_PAGE
        posts = get_posts(src, keywords=kw, since=target_date, start=0, num=num_page)
        column_list.append((src, kw, posts))
    return render_template("index.html", columns=column_list)
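A hypothetical counterpart, not part of the original app: since the view unquotes the keywords cookie, whatever sets that cookie would quote it first.

from urllib import quote, unquote  # urllib.parse on Python 3

raw = "deep learning,tweets about GANs"
cookie_value = quote(raw)            # 'deep%20learning%2Ctweets%20about%20GANs'
assert unquote(cookie_value) == raw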
def downloadFile():
    if len(sys.argv) > 1:
        query = sys.argv[1]
    else:
        query = None
    url = query + '+'
    response = urllib2.urlopen(url)
    # Parse the URL for the file extension
    returnUrl = response.geturl()
    if 'filename=' in returnUrl:
        # Looks like there's a filename in the return URL!
        nS = returnUrl.find('filename=') + 9
        nE = returnUrl.find('&', nS)
        urlFileName = urllib2.unquote(returnUrl[nS:nE])
        eS = urlFileName.rfind('.') + 1
        extension = urlFileName[eS:]
        # Let's infer the type from the name
        type = ''
        # Check to see if it's a screencast
        if 'Capture' in urlFileName:
            type = 'screencast'
        elif 'Shot' not in urlFileName:
            type = 'file'
    else:
        # If we can't get the file name, assume it's a PNG
        extension = 'png'
        type = ''
    fileName = getNextFileName(extension, type)
    with open(fileName, 'wb') as file:  # binary mode for downloaded content
        file.write(response.read())
# Run the file download method!
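A standalone sketch of the filename extraction above, using a made-up return URL:

import urllib2

returnUrl = 'http://files.example.com/get?filename=My%20Shot%202024.png&token=abc'
nS = returnUrl.find('filename=') + 9
nE = returnUrl.find('&', nS)
print urllib2.unquote(returnUrl[nS:nE])   # My Shot 2024.png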
def put(self, sim):
    data = self.request.body
    #data = urllib2.unquote(request.data.replace("+", " "))
    json_dict = json.loads(data)
    typemap = json_dict["sim_typemap"]
    wire_format = json_dict["wire_format"] if "wire_format" in json_dict else "json"
    app_id = json_dict["app_id"]
    FrameServer.Store.register_app(sim, typemap, wire_format=wire_format)
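For reference, a payload shape this handler would accept, inferred from the keys it reads; the typemap contents are application-specific and hypothetical here:

import json

payload = {
    "sim_typemap": {"Car": ["x", "y"]},   # hypothetical simulation types
    "wire_format": "json",                # optional; defaults to "json"
    "app_id": "demo_app"
}
body = json.dumps(payload)                # what the PUT request body would carry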
def put(self, sim):
    data = urllib2.unquote(request.data.replace("+", " "))
    json_dict = json.loads(data)
    typemap = json_dict["sim_typemap"]
    wire_format = json_dict["wire_format"] if "wire_format" in json_dict else "json"
    app_id = json_dict["app_id"]
    FrameServer.Store.register_app(sim, typemap, wire_format=wire_format)
def select_url(url, html, fruitline_spider_variable):
    if len(html) < 10:  # skip pages with (almost) no content
        return []
    try:
        html_element = document_fromstring(urllib2.unquote(html))
        html_element.make_links_absolute(url)
        links = [i[2] for i in html_element.iterlinks()]
    except Exception, e:
        spider_logger.error("Function: select_url, Info: %s" % str(e))
        return []
    links_unrepeat = set(links)  # deduplicate extracted links
    final_links = []
    for i in list(links_unrepeat):
        full_url = repair_url(i, fruitline_spider_variable)
        if fruitline_spider_variable.filter_rule != "":
            pattern = re.compile(fruitline_spider_variable.filter_rule)
            if re.match(pattern, full_url):
                if full_url not in fruitline_spider_variable.crawled_url_queue:
                    d = dict()
                    d['method'] = "get"
                    d['url'] = full_url
                    final_links.append(d)
        else:
            if full_url not in fruitline_spider_variable.crawled_url_queue:
                d = dict()
                d['method'] = "get"
                d['url'] = full_url
                final_links.append(d)
    return final_links
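A minimal sketch of the lxml link extraction used above, on a hypothetical page:

from lxml.html import document_fromstring

html = '<html><body><a href="/about">About</a> <a href="http://other.example/">x</a></body></html>'
doc = document_fromstring(html)
doc.make_links_absolute('http://example.com/index.html')
print [link for (element, attr, link, pos) in doc.iterlinks()]
# ['http://example.com/about', 'http://other.example/']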
def extractSummary(self, response):
    scripts = response.findAll('script')
    for script in scripts:
        if 'bookDesc_iframe' in script.text:
            group = re.search('bookDescEncodedData = "(.*)"', script.text)
            if group:
                encoded_summary = urllib2.unquote(group.group(1))
                summary_text = BeautifulSoup(encoded_summary, "html.parser")
                return summary_text.text
    return ""
def SearchGoogle(num, target, option):
    leak_target = ""
    start_page = 0
    nlink = ""
    url_google = []
    user_agent = {'User-agent': 'Mozilla/5.0'}
    if option == 1:
        print "\nLooking for leaked information inside the target", target
        for start in range(start_page, (start_page + num)):
            SearchGoogle = "https://www.google.com/search?q=(ext:pdf OR ext:doc OR ext:docx OR ext:xls OR ext:ppt)+site:"+target
    else:  # option == 2
        extension = target.split(".")[1]
        leak_target = target.replace(extension, '')
        print "\nLooking for leaked information outside the target", target
        for start in range(start_page, (start_page + num)):
            SearchGoogle = "https://www.google.com/search?q=site.*es+intext:"+leak_target+"+intitle:"+leak_target+"(ext:pdf OR ext:doc OR ext:docx OR ext:xls OR ext:ppt)+-site:"+target+"+-site:*."+target
    try:
        response = requests.get(SearchGoogle, headers=user_agent)
    except requests.exceptions.RequestException as e:
        print "\nError connecting to the server!"  # + response.url,
        pass
    except requests.exceptions.ConnectTimeout as e:
        print "\nError: timeout", target
        pass
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    if response.text.find("Our systems have detected unusual traffic") != -1:
        print "CAPTCHA detected - Plata or captcha!!! Maybe try from another IP..."
        url_google.append("CAPTCHA detected - Plata or captcha!!! Maybe try from another IP...")
        return url_google
    # Extract result URLs with a regular expression
    raw_links = soup.find_all("a", href=re.compile("(?<=/url\?q=)(htt.*://.*)"))
    #print raw_links
    for link in raw_links:
        # Skip Google cache links
        if link["href"].find("webcache.googleusercontent.com") == -1:
            nlink = link["href"].replace("/url?q=", "")
            # Clean up the link
            nlink = re.sub(r'&sa=.*', "", nlink)
            nlink = urllib2.unquote(nlink).decode('utf8')
            url_google.append(nlink)
    #print url_google
    if len(raw_links) < 2:
        # Verify if Google's CAPTCHA has caught us!
        print "No more results..."
        url_google.append("No more results")
        #captcha = True
        return url_google
    return url_google
########################################
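A standalone sketch of the result-link cleanup used in SearchGoogle, with a hypothetical href taken from a results page:

import re
import urllib2

href = '/url?q=http://example.com/annual%2520report.pdf&sa=U&ved=abc'
nlink = href.replace("/url?q=", "")
nlink = re.sub(r'&sa=.*', "", nlink)
print urllib2.unquote(nlink).decode('utf8')
# http://example.com/annual%20report.pdf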
def SearchGoogle(num, target, option):
    leak_target = ""
    start_page = 0
    nlink = ""
    user_agent = {'User-agent': 'Mozilla/5.0'}
    if option == 1:
        print "\nLooking for leaked information inside the target", target
        for start in range(start_page, (start_page + num)):
            SearchGoogle = "https://www.google.com/search?q=(ext:pdf OR ext:doc OR ext:docx OR ext:xls OR ext:ppt)+site:"+target
    else:  # option == 2
        leak_target = target.rstrip(".es")
        print "\nLooking for leaked information outside the target", target
        for start in range(start_page, (start_page + num)):
            SearchGoogle = "https://www.google.com/search?q=site.*es+intext:"+leak_target+"+intitle:"+leak_target+"(ext:pdf OR ext:doc OR ext:docx OR ext:xls OR ext:ppt)+-site:"+target+"+-site:*."+target
    try:
        response = requests.get(SearchGoogle, headers=user_agent)
    except requests.exceptions.RequestException as e:
        print "\nError connecting to the server!"  # + response.url,
        pass
    except requests.exceptions.ConnectTimeout as e:
        print "\nError: timeout", target
        pass
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    # Extract result URLs with a regular expression
    raw_links = soup.find_all("a", href=re.compile("(?<=/url\?q=)(htt.*://.*)"))
    #print raw_links
    for link in raw_links:
        # Skip Google cache links
        if link["href"].find("webcache.googleusercontent.com") == -1:
            nlink = link["href"].replace("/url?q=", "")
            # Clean up the link
            nlink = re.sub(r'&sa=.*', "", nlink)
            nlink = urllib2.unquote(nlink).decode('utf8')
            url_google.append(nlink)
    if len(raw_links) < 1:
        # Verify if Google's CAPTCHA has caught us!
        print "CAPTCHA detected!!! Maybe try from another IP..."
        #captcha = True
        return True
    else:
        return False
########################################
####### FUNCTION CREATE A DORK ######
#********************************************************#
# Define and design the dork
def SearchGoogle(num, target, option):
    leak_target = ""
    start_page = 0
    nlink = ""
    url_google = []
    user_agent = {'User-agent': 'Mozilla/5.0'}
    if option == 1:
        print "\nLooking for leaked information inside the target", target
        for start in range(start_page, (start_page + num)):
            SearchGoogle = "https://www.google.com/search?q=(ext:pdf OR ext:doc OR ext:docx OR ext:xls OR ext:ppt)+site:"+target
    else:  # option == 2
        extension = target.split(".")[1]
        leak_target = target.replace(extension, '')
        #leak_target = target.rstrip(".es")  # change this
        print "\nLooking for leaked information outside the target", target
        for start in range(start_page, (start_page + num)):
            SearchGoogle = "https://www.google.com/search?q=site.*es+intext:"+leak_target+"+intitle:"+leak_target+"(ext:pdf OR ext:doc OR ext:docx OR ext:xls OR ext:ppt)+-site:"+target+"+-site:*."+target
    try:
        response = requests.get(SearchGoogle, headers=user_agent)
    except requests.exceptions.RequestException as e:
        print "\nError connecting to the server!"  # + response.url,
        pass
    except requests.exceptions.ConnectTimeout as e:
        print "\nError: timeout", target
        pass
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    if response.text.find("Our systems have detected unusual traffic") != -1:
        print "CAPTCHA detected - Plata or captcha!!! Maybe try from another IP..."
        url_google.append("CAPTCHA detected - Plata or captcha!!! Maybe try from another IP...")
        return url_google
    # Extract result URLs with a regular expression
    raw_links = soup.find_all("a", href=re.compile("(?<=/url\?q=)(htt.*://.*)"))
    #print raw_links
    for link in raw_links:
        # Skip Google cache links
        if link["href"].find("webcache.googleusercontent.com") == -1:
            nlink = link["href"].replace("/url?q=", "")
            # Clean up the link
            nlink = re.sub(r'&sa=.*', "", nlink)
            nlink = urllib2.unquote(nlink).decode('utf8')
            url_google.append(nlink)
    #print url_google
    if len(raw_links) < 2:
        # Verify if Google's CAPTCHA has caught us!
        print "No more results..."
        url_google.append("No more results")
        #captcha = True
        return url_google
    return url_google
########################################
def SearchGoogle(num, target, language):
    start_page = 0
    nlink = ""
    user_agent = {'User-agent': 'Mozilla/5.0'}
    nlink_clean = ""
    response = ""
    soup = ""
    raw_links = ""
    # Split the target into domain and extension
    domain = target.replace(".es", '')
    extension = target.split(".")[1]
    print "\nLooking for domains and subdomains of the target", target
    for start in range(start_page, (start_page + num)):
        SearchGoogle = "https://www.google.com/search?q=(site:*."+target+"+OR+site:*"+target+"+OR+site:"+domain+"*."+extension+")+-site:www."+target+"&lr=lang_"+language+"&filter=&num=100"
    try:
        response = requests.get(SearchGoogle, headers=user_agent)
    except requests.exceptions.RequestException as e:
        print "\nError connecting to the server!"
        pass
    except requests.exceptions.ConnectTimeout as e:
        print "\nError: timeout", target
        pass
    try:
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")
        if response.text.find("Our systems have detected unusual traffic") != -1:
            print "CAPTCHA detected - Plata or captcha!!! Maybe try from another IP..."
            return True
        # Extract result URLs with a regular expression
        raw_links = soup.find_all("a", href=re.compile("(?<=/url\?q=)(htt.*://.*)"))
        #print raw_links
        for link in raw_links:
            # Skip Google cache links
            if link["href"].find("webcache.googleusercontent.com") == -1:
                nlink = link["href"].replace("/url?q=", "")
                # Clean up the link
                nlink = re.sub(r'&sa=.*', "", nlink)
                nlink = urllib2.unquote(nlink).decode('utf8')
                nlink_clean = nlink.split("//")[-1].split("/")[0]
                url_google.append(nlink_clean)
    except Exception as e:
        print e
    if len(raw_links) < 2:
        # Verify whether the search returned any results
        print "No more results!!!"
        #captcha = True
        return True
    else:
        return False
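A quick sketch of the host extraction (nlink_clean) used above:

nlink = 'https://mail.example.com/login?next=/inbox'
print nlink.split("//")[-1].split("/")[0]   # mail.example.com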
def SearchGoogle(num, target, language):
    start_page = 0
    nlink = ""
    user_agent = {'User-agent': 'Mozilla/5.0'}
    nlink_clean = ""
    response = ""
    soup = ""
    raw_links = ""
    url_google_final = []
    # Split the target into domain and extension
    domain = target.replace(".es", '')
    extension = target.split(".")[1]
    print "\nLooking for domains and subdomains of the target", target
    for start in range(start_page, (start_page + num)):
        SearchGoogle = "https://www.google.com/search?q=(site:*."+target+"+OR+site:*"+target+"+OR+site:"+domain+"*."+extension+")+-site:www."+target+"+-site:"+target+"&lr=lang_"+language+"&filter=&num=100"
        #https://www.google.es/search?q=(site:*.vodafone.com+OR+site:*vodafone.com+OR+site:vodafone*.com)+-site:www.vodafone.com+-site:vodafone.com&lr=lang_en
        #inurl:"http?://*vodafone*.es" -site:www.vodafone.es -site:vodafone.es
        #(site:*.vodafone.es OR site:*vodafone.es OR site:vodafone*.es) -site:vodafone.es
    try:
        response = requests.get(SearchGoogle, headers=user_agent)
    except requests.exceptions.RequestException as e:
        print "\nError connecting to the server!"  # + response.url,
        pass
    except requests.exceptions.ConnectTimeout as e:
        print "\nError: timeout", target
        pass
    try:
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")
        if response.text.find("Our systems have detected unusual traffic") != -1:
            print "CAPTCHA detected - Plata or captcha!!! Maybe try from another IP..."
            return True
        # Extract result URLs with a regular expression
        raw_links = soup.find_all("a", href=re.compile("(?<=/url\?q=)(htt.*://.*)"))
        #print raw_links
        for link in raw_links:
            # Skip Google cache links
            if link["href"].find("webcache.googleusercontent.com") == -1:
                nlink = link["href"].replace("/url?q=", "")
                # Clean up the link
                nlink = re.sub(r'&sa=.*', "", nlink)
                nlink = urllib2.unquote(nlink).decode('utf8')
                nlink_clean = nlink.split("//")[-1].split("/")[0]
                url_google.append(nlink_clean)
        url_google_final = DeleteDuplicate(url_google)
        return url_google_final
    except Exception as e:
        print e
    if len(raw_links) < 2:
        # Verify if Google's CAPTCHA has caught us!
        print "No more results!!!"
        #captcha = True
        return True
    else:
        return False