linksExtractor.py 文件源码-python代码片段

linksExtractor.py 文件源码

python

阅读 38 收藏 0 点赞 0 评论 0

项目：Crawl-And-Download 作者: AsciiKay 项目源码文件源码

def linksExtractor(url, fileFormat='png'):
    tag = 'a'
    attr = 'href'
    if (fileFormat in ['png', 'jpg', 'jpeg', 'tiff', 'bmp', 'svg', 'gif']):
        tag = 'img'
        attr = 'src'

    try:
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
        req=urllib2.Request(url, None, headers)
        htmlDoc=urllib2.urlopen(req).read()
    except urllib2.HTTPError as err:
        print("Server Response : " + str(err.code()))
        return "Server refused to connect!"
    except urllib2.URLError:
        return 'Invalid URL!'

    page = BeautifulSoup(htmlDoc, 'html.parser')
    page.prettify()
    res = []


    for link in page.find_all(tag):
        pre = link.get(attr)
        pre = str(pre)
        if (pre[-len(fileFormat):] == fileFormat):
            res.append(pre)
        else:
            pass

    if (len(res) < 1):
        return 'EMPTY'
    return res