def get_url(domain, port, timeout):
    """Fetch a site's front page and collect the resource URLs it references.

    The page at ``https://<domain>`` (when ``port`` is 443) or
    ``http://<domain>`` (any other port) is downloaded, and the ``src`` /
    ``href`` attribute values of its ``<img>``, ``<link>`` and ``<script>``
    tags are extracted with a regular expression.

    Args:
        domain: Host name to request. Absolute URLs found in the page are
            kept only when their host equals this value (case-insensitive).
        port: Only selects the scheme (443 -> https, anything else -> http);
            it is not appended to the request URL.
        timeout: Timeout passed straight through to ``urllib2.urlopen``.

    Returns:
        A de-duplicated list, in no particular order, containing:
        * absolute URLs (scheme and host present) whose host is ``domain``;
        * relative URLs (neither scheme nor host), resolved against the URL
          the response was actually served from.
        Anything else (e.g. protocol-relative ``//host/path`` or ``data:``
        URLs) is skipped.

    Raises:
        Errors from ``urllib2.urlopen`` / ``read`` are not caught here and
        propagate to the caller.
    """
    scheme_prefix = 'https://' if port == 443 else 'http://'
    res = urllib2.urlopen(scheme_prefix + domain, timeout=timeout)
    try:
        html = res.read()
        # Use the URL reported by the response (not the one requested) as the
        # base for relative links, so a redirect is taken into account.
        root_url = res.geturl()
    finally:
        # Always release the connection, even if reading fails.
        res.close()

    # Each match is a (quote_char, attribute_value) tuple; the back-reference
    # \1 makes the value end at the same quote character that opened it.
    attr_matches = re.findall(
        r"""<(?:img|link|script)[^>]*?(?:src|href)=('|")(.*?)\1""",
        html, re.I)

    unescape = HTMLParser.HTMLParser().unescape  # one parser for all values
    # urlparse reports ``hostname`` in lower case, so normalise the expected
    # host once to keep the comparison case-insensitive.
    wanted_host = domain.lower()

    found = set()
    for _quote, raw_value in attr_matches:
        # Attribute values are HTML-escaped (e.g. ``&amp;``); decode them
        # before parsing so classification and joining see the real URL.
        url = unescape(raw_value)
        parts = urlparse.urlparse(url)
        if parts.netloc and parts.scheme:
            # Fully qualified URL: keep it only if it points at our host.
            if parts.hostname == wanted_host:
                found.add(url)
        elif not parts.netloc and not parts.scheme:
            # Purely relative reference: resolve it against the page URL.
            found.add(urlparse.urljoin(root_url, url))
    return list(found)
评论列表
文章目录