phishtank.py 文件源码-python代码片段

phishtank.py 文件源码

python

阅读 21 收藏 0 点赞 0 评论 0

项目：attacks-pages-collector 作者: ifreddyrondon 项目源码文件源码

def gather():
    """Fetch the PhishTank "online-valid" feed and pretty-print one record per entry.

    The bz2-compressed CSV at ``base_url`` is downloaded with ``get_url``,
    decompressed, and every row after the first (header) line is turned into
    a dict with the keys ``IP``, ``SourceInfo``, ``Type``, ``Country``,
    ``Domain``, ``URL`` and ``WhoIsInfo``. Each dict is passed to ``pprint``.

    Rows that cannot be parsed as CSV, that have no second column, or whose
    URL yields no host are skipped instead of aborting the whole run.

    Returns:
        None. The records are only printed.
    """
    # Function-scope import so the block does not depend on the file's
    # import section already providing ``csv``.
    import csv

    # The scheme is anchored and spelled out. The previous greedy
    # ``http.*://`` consumed everything up to the LAST "://", so a URL such
    # as "http://a.com/r?u=http://b.com/" was reported with host "b.com".
    # Compiled once here rather than on every row.
    url_regex = re.compile(r'^(?:https?://)?(?P<host>[^:/ ?#]+)', re.IGNORECASE)
    base_url = "http://data.phishtank.com/data/online-valid.csv.bz2"
    attack_type = "undefined"
    res = get_url(base_url)

    results = bz2.decompress(res.content)
    # On Python 3 decompress() returns bytes, which cannot be split with a
    # str separator; on Python 2 the result is already str and is left as is.
    if not isinstance(results, str):
        results = results.decode("utf-8", "replace")

    # [1:] drops the CSV header line.
    for raw_line in results.split("\n")[1:]:
        if not raw_line.strip():
            continue

        # A real CSV parser honours quoting, so a URL that contains a comma
        # is no longer cut short the way a plain split(",") would cut it.
        try:
            row = next(csv.reader([raw_line]))
        except (csv.Error, StopIteration):
            continue
        if len(row) < 2:
            continue

        # Second column is the URL (assumed from the feed layout the
        # original code relied on via line[1]).
        site_url = row[1].strip()
        m = url_regex.match(site_url)
        if m is None:
            # Empty or host-less URL: nothing to resolve.
            continue

        host = m.group('host')
        ip_address = get_ip(host)
        # get_ip signals an unresolved host with the string "undefined";
        # skip the whois lookup in that case.
        if ip_address == "undefined":
            who_is, country = "undefined", "undefined"
        else:
            who_is, country = get_who_is_and_country(ip_address)

        doc = {
            'IP': ip_address,
            'SourceInfo': base_url,
            'Type': attack_type,
            'Country': country,
            'Domain': host,
            # Previously this duplicated the host; the full URL is what the
            # key name promises.
            'URL': site_url,
            'WhoIsInfo': who_is,
        }

        pprint(doc)