getlinks_4.py 文件源码-python代码片段

getlinks_4.py 文件源码

python

阅读 28 收藏 0 点赞 0 评论 0

def _report_and_collect(pageUrl):
    """Fetch one article, print its summary, and return its /wiki/ anchors.

    Prints the page's <h1> text, the first <p> inside the element with id
    "mw-content-text", and the href of the edit link under id "ca-edit".
    If any of those pieces is missing, a placeholder line is printed instead
    and the crawl carries on.
    """
    # Close the HTTP response once parsing is done so a long crawl does not
    # accumulate open sockets.
    with urlopen("http://en.wikipedia.org"+pageUrl) as html:
        bs = BeautifulSoup(html, "html.parser")
    try:
        print(bs.h1.get_text())
        print(bs.find(id="mw-content-text").findAll("p")[0])
        print(bs.find(id="ca-edit").find("span").find("a").attrs['href'])
    except (AttributeError, IndexError, KeyError):
        # AttributeError: an expected element is absent (find() returned None).
        # IndexError: the content block holds no <p> at all.
        # KeyError: the edit anchor carries no href attribute.
        print("????????")
    return bs.findAll("a", href=re.compile("^(/wiki/)"))


def getLinks(pageUrl):
    """Crawl Wikipedia depth-first starting from *pageUrl*.

    *pageUrl* is a site-relative path that is appended to
    "http://en.wikipedia.org". Every not-yet-seen "/wiki/..." href found on a
    page is printed, recorded in the module-level ``pages`` collection, and
    then crawled itself before the remaining links of the current page.

    ``pages`` must already exist at module level and support ``in`` and
    ``.add()`` (presumably a set -- confirm at the definition site).

    Errors raised by ``urlopen`` are not handled here and propagate to the
    caller.
    """
    # An explicit stack of link iterators stands in for recursion: the visit
    # order is unchanged, but the crawl depth is no longer capped by the
    # interpreter's recursion limit.
    pending = [iter(_report_and_collect(pageUrl))]
    while pending:
        for link in pending[-1]:
            newpage = link.attrs["href"]
            if newpage in pages:
                continue
            print("---------\n"+newpage)
            pages.add(newpage)
            # Descend into the new page now; the current page's iterator
            # stays on the stack and resumes once the descent is finished.
            pending.append(iter(_report_and_collect(newpage)))
            break
        else:
            # Current page has no links left to examine.
            pending.pop()