def linksExtractor(url, fileFormat='png'):
    """Download ``url`` and collect link targets that end with ``fileFormat``.

    For image formats (png, jpg, jpeg, tiff, bmp, svg, gif) the ``src``
    attribute of ``<img>`` tags is inspected; for every other format the
    ``href`` attribute of ``<a>`` tags is inspected instead.

    Args:
        url: Address of the page to download.
        fileFormat: Suffix an attribute value must end with to be kept.

    Returns:
        A list of the matching attribute values, or one of the status
        strings ``"Server refused to connect!"`` (HTTP error),
        ``'Invalid URL!'`` (URL error) or ``'EMPTY'`` (nothing matched).
    """
    if fileFormat in ('png', 'jpg', 'jpeg', 'tiff', 'bmp', 'svg', 'gif'):
        tag, attr = 'img', 'src'
    else:
        tag, attr = 'a', 'href'

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    try:
        req = urllib2.Request(url, None, headers)
        response = urllib2.urlopen(req)
        try:
            htmlDoc = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except urllib2.HTTPError as err:
        # HTTPError is handled before URLError because it is the more
        # specific of the two. ``code`` is an attribute, not a method.
        print("Server Response : " + str(err.code))
        return "Server refused to connect!"
    except urllib2.URLError:
        return 'Invalid URL!'

    page = BeautifulSoup(htmlDoc, 'html.parser')
    suffixLen = len(fileFormat)
    res = []
    for link in page.find_all(tag):
        value = link.get(attr)
        if value is None:
            # Tag has no such attribute; str(None) would otherwise be
            # treated as the link text 'None'.
            continue
        value = str(value)
        # Slice comparison rather than endswith(): with an empty fileFormat
        # the slice is the whole value, so only empty values match.
        if value[-suffixLen:] == fileFormat:
            res.append(value)

    return res if res else 'EMPTY'
评论列表
文章目录