mian3.py source code


Project: ProxyIPCrawler    Author: uilliu
# Imports assumed by this method (they are not shown in the original listing);
# the code targets Python 2 (urllib2, print statement).
import urllib2
from bs4 import BeautifulSoup

def ProxyIPSpider(self):
    # Crawl the xicidaili.com free-proxy listing and save usable proxies to proxy.txt.
    f = open('proxy.txt', 'w')
    for page in range(1, 50):
        url = 'http://www.xicidaili.com/nn/%s' % page
        user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
        # Send the request with a browser User-Agent so the site does not reject it.
        request = urllib2.Request(url)
        request.add_header("User-Agent", user_agent)
        content = urllib2.urlopen(request)
        soup = BeautifulSoup(content)
        # Each proxy is one row of the table with id="ip_list"; skip the header row.
        trs = soup.find('table', {"id": "ip_list"}).findAll('tr')
        for tr in trs[1:]:
            tds = tr.findAll('td')
            ip = tds[2].text.strip()
            port = tds[3].text.strip()
            protocol = tds[6].text.strip()
            # Keep only HTTP/HTTPS entries: write "PROTOCOL=ip:port" to the file
            # and echo a URL-style form to stdout.
            if protocol == 'HTTP' or protocol == 'HTTPS':
                f.write('%s=%s:%s\n' % (protocol, ip, port))
                print '%s://%s:%s' % (protocol, ip, port)
    f.close()
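
For context, one possible way to consume the proxy.txt file written above: each line has the "PROTOCOL=ip:port" format shown in the crawler. The sketch below is not part of the project's listing; it assumes the same Python 2 / urllib2 environment, and the load_proxies helper and the httpbin.org test URL are illustrative choices only.

# Sketch: read proxy.txt produced by ProxyIPSpider and try each proxy with urllib2.
import urllib2

def load_proxies(path='proxy.txt'):
    # Parse the "PROTOCOL=ip:port" lines written by the crawler.
    proxies = []
    with open(path) as f:
        for line in f:
            protocol, _, address = line.strip().partition('=')
            if protocol and address:
                proxies.append((protocol.lower(), address))
    return proxies

for protocol, address in load_proxies():
    # Route a test request through the proxy to see whether it still works.
    handler = urllib2.ProxyHandler({protocol: '%s://%s' % (protocol, address)})
    opener = urllib2.build_opener(handler)
    try:
        opener.open('http://httpbin.org/ip', timeout=5)
        print 'usable: %s://%s' % (protocol, address)
    except Exception:
        print 'dead:   %s://%s' % (protocol, address)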