def get_file_urls(mainUrl, extension):
    """Return the unique links on a web page that end with *extension*.

    Args:
        mainUrl: Address of the page to scan. ``http://`` is prepended when
            the value does not already start with ``http://`` or
            ``https://`` (checked case-insensitively).
        extension: File suffix to look for. A leading ``*`` is dropped and a
            leading ``.`` is added when missing, so ``'pdf'``, ``'.pdf'`` and
            ``'*.pdf'`` are all treated as ``'.pdf'``. Matching is
            case-sensitive.

    Returns:
        A list of absolute URLs (each ``href`` resolved against ``mainUrl``),
        without duplicates, in the order they first appear on the page.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    if not mainUrl.lower().startswith(('http://', 'https://')):
        mainUrl = 'http://%s' % mainUrl
    print('Downloading from %s...' % mainUrl)

    if extension.startswith('*'):
        extension = extension[1:]
    if not extension.startswith('.'):
        extension = '.' + extension

    req = urllib.request.Request(
        mainUrl,
        data=None,
        headers={
            # Browser-like User-Agent sent with the request.
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as response:
        rawContent = response.read()
        charset = response.headers.get_content_charset() or 'utf-8'

    # Honor the charset declared by the server; fall back to a lossy UTF-8
    # decode so one undecodable byte does not abort the whole scan.
    try:
        urlContent = rawContent.decode(charset)
    except (LookupError, UnicodeDecodeError):
        urlContent = rawContent.decode('utf-8', errors='replace')

    # An empty body contains no links, and the HTML parser rejects
    # empty documents, so return early.
    if not urlContent.strip():
        return []

    html = lxml.html.fromstring(urlContent)

    uniFileUrls = []
    seen = set()  # O(1) membership test; the list keeps first-seen order
    for href in html.xpath('//a/@href'):
        if not href.endswith(extension):
            continue
        absoluteUrl = urljoin(mainUrl, href)
        if absoluteUrl not in seen:
            seen.add(absoluteUrl)
            uniFileUrls.append(absoluteUrl)
    return uniFileUrls
评论列表
文章目录