Python BeautifulSoup() class usage examples (source code)

Source file: get_all_songs.py (project: encore.ai, author: dyelax)
def download_songs(url):
  time.sleep(random.random() * 0.5)
  try:
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')

    # Get the artist name
    artist_name = soup.findAll('h1')[0].get_text()[:-7].lower().replace(' ', '_')

    # Store all songs for a given artist
    with open('artist_data/'+artist_name+'.txt', 'wb') as w:
      for song in soup.findAll('a', {'target': '_blank'}):
        if 'lyrics/' in song['href']:
          song_url = song['href'][1:].strip()
          w.write(song_url + '\n')
  except urllib2.HTTPError:
    print '404 not found'
Source file: downloader.py (project: earthy, author: alvations)
def packages(self):
        """
        Parse XML file to locate packages.
        """
        xml = requests.get(self._xml_url).content
        soup = BeautifulSoup(xml, "html.parser")
        nltk_packages, third_party = defaultdict(dict), defaultdict(dict)
        for pack in soup.find_all('package'):
            package_attributes = pack.attrs
            name = package_attributes['id']
            # Keeps track of nltk_data packages vs third_party packages.
            if package_attributes['url'].startswith(self._nltk_data_url):
                nltk_packages[name] = package_attributes
            else:
                third_party[name] = package_attributes
        return nltk_packages, third_party
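For reference, here is a minimal self-contained sketch of the same find_all('package') / .attrs pattern against an inline XML string; the package id and url below are invented for illustration and are not taken from the real NLTK index:

from collections import defaultdict
from bs4 import BeautifulSoup

# Hypothetical package index; the id and url attributes are made up.
xml = '<packages><package id="punkt" url="https://example.org/nltk_data/punkt.zip"/></packages>'
soup = BeautifulSoup(xml, "html.parser")
packages = defaultdict(dict)
for pack in soup.find_all('package'):
    packages[pack.attrs['id']] = pack.attrs
print(packages['punkt']['url'])  # https://example.org/nltk_data/punkt.zip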
Source file: tibia.py (project: Jumper-Cogs, author: Redjumpman)
def _online_tibia(self):
        """Get total players playing"""
        url = "http://www.tibia.com/community/?subtopic=worlds"
        try:
            async with aiohttp.get(url) as response:
                soup = BeautifulSoup(await response.text(), "html.parser")
                div1 = soup.find('div', attrs={'id': 'RightArtwork'})
                div2 = div1.find('div', attrs={'id': 'PlayersOnline'})
                test = div2.get_text()
                test1 = test.replace("Players Online", "")
                new = "Players currently playing Tibia: " + test1
                # div2 = div1.find('div', attrs={'class': 'Border_2'})
                # div3 = div2.find('div', attrs={'class': 'Border_3'})
                # table = div3.find_all('table', attrs={'class': 'Table1'})
                # tr = table.find_all('tr')
                # tbody = div4.find('div', attrs={'class': 'CaptionInnerContainer'})
                await self.bot.say(str(new))
        except:
            await self.bot.say("Could not retrive data. The webserver may be offline.")
Source file: tibia.py (project: Jumper-Cogs, author: Redjumpman)
def _server_tibia(self, servername):
        """Get Server Info"""
        servername = servername.title()
        url = "https://secure.tibia.com/community/?subtopic=worlds&world=" + str(servername)
        try:
            async with aiohttp.get(url) as response:
                soup = BeautifulSoup(await response.text(), "html5lib")
                b = soup.find_all("table", attrs={'class': 'Table1'})
                new = []
                rows = b[1].tbody.div.find_all('td')
                for row in rows:
                    new.append(row.get_text())
                k = new[::2]
                l = new[1::2]
                zipped = list(zip(k, l))
                t = tabulate(zipped, headers=["Category", "Info"])
                await self.bot.say("```Python" + "\n" + str(t) + "```")
        except:
            await self.bot.say("Unable to retrive server data. The webserver may be offline.")
Source file: views.py (project: CourseGrab, author: nnsun)
def get_course_status(course_num):
    client = Client()
    subject = client.get_course_subject(course_num)
    if subject is None:
        return None
    semester = get_semester()
    subject_url = "http://classes.cornell.edu/browse/roster/" + semester + "/subject/" + subject
    subject_page = requests.get(subject_url)
    subject_page.raise_for_status()
    subject_bs4 = bs4.BeautifulSoup(subject_page.text, "html.parser")
    course_code_tags = subject_bs4.find_all("strong", class_="tooltip-iws")
    for tag in course_code_tags:
        course_code = int(tag.getText().strip())
        if course_num == course_code:
            section = tag.parent.parent.parent
            status = section.find_all('li', class_ = "open-status")[0].i["class"][-1]
            if "open-status-open" in status:
                return "open"
            if "open-status-closed" in status:
                return "closed"
            if "open-status-warning" in status:
                return "waitlist"
            if "open-status-archive" in status:
                return "archive"
Source file: communication.py (project: WPS-4th, author: Fastcampus-WPS)
def get_soup_from_url(url, params=None):
    '''
    Send a GET request to the given URL with the given parameters, and
    return the response body (HTML text) parsed into a BeautifulSoup object.
    :param url: URL string to send the GET request to
    :param params: dict of GET query parameters
    :return: BeautifulSoup object
    '''
    # Store the response object returned by requests.get in r
    r = requests.get(url, params=params)
    # Keep the response body (HTML text) in html_doc
    html_doc = r.text

    # Build the BeautifulSoup object from the HTML text, parsed with lxml
    soup = BeautifulSoup(html_doc, 'lxml')
    return soup
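A minimal usage sketch for the helper above; the URL and query parameter are illustrative assumptions rather than values from the project:

# Hypothetical call; requires requests and lxml plus the function defined above.
soup = get_soup_from_url('https://example.com/articles', params={'page': 1})
for a in soup.find_all('a'):
    print(a.get('href'))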
Source file: test_tree.py (project: Gank-Alfred-Workflow, author: hujiaweibujidao)
def test_tag_inherits_self_closing_rules_from_builder(self):
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tags are empty-element tags, simply
            # because they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())
Source file: Scrape.py (project: TorScrapper, author: ConanKapoor)
def Scrape(url):
    timeout = 10
    socket.setdefaulttimeout(timeout)

    #Collecting html content.
    headers = {'User-Agent': 'TorScrapper - Onion scrapper | github.com/ConanKapoor/TorScrapper.git' }
    req = urllib.request.Request(url,None,headers)
    response = urllib.request.urlopen(req)

    #Using BeautifulSoup to parse html object response.
    page = BeautifulSoup(response.read(),'html.parser')

    #Saving output
    token = re.sub(r'[^\w]', '', url)
    name = os.path.abspath("") + '/Output/Scraped-' + token +'.html'
    with open(name, 'w') as file:
        file.write(str(page))

# Taking input.
Source file: sciencedirect_collect.py (project: scientific-paper-summarisation, author: EdCo95)
def getJournalURL(jname):
# get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    #print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
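A self-contained illustration of the SoupStrainer filtering used above, with an explicit parser and an inline HTML snippet; the journal names and hrefs are invented:

from bs4 import BeautifulSoup, SoupStrainer

html = '<a href="/journal/jex">Journal of Examples</a><a href="/journal/acta">Acta Examplia</a>'
for link in BeautifulSoup(html, "html.parser", parse_only=SoupStrainer("a")):
    if link.has_attr("href") and link.text.lower() == "journal of examples":
        print(link["href"])  # /journal/jex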
Source file: recipe-578681.py (project: code, author: ActiveState)
def get_url(self, query):
        site1 = urllib.urlopen('http://www.youtube.com/results?search_query=%s'%query)
        html = site1.read()
        soup = BS(html)

        links = soup.findAll('a')
        vidlinks = [link.get('href') for link in links if link.get('href') is not None]
        vlink = [ i for i in vidlinks if '/watch?v=' in i][0]

        img_link = soup.findAll('img',{'alt':'Thumbnail', 'width':'185'})[0].get('src')
        img_url =  'http:%s' %img_link

        imagethread = threading.Thread(target=lambda:urllib.urlretrieve(img_url, 'Files\image.jpg'))
        imagethread.start()

        return vlink
Source file: recipe-578193.py (project: code, author: ActiveState)
def run(self):
        ind=self.qu.get()
        url=self.url+str(ind)
        soup =bs.BeautifulSoup(''.join( ul.urlopen(url).readlines() ))
        bu = up.urlsplit(self.url)
        print 'started with the ' ,str(url).split('/')[-1],
        for i in  soup.find_all(attrs = { "class" : "recipe-title"}):
            sp = up.urlsplit(i.a.get('href'))
            path = sp.path
            print path
            if re.search(pat, path):
                path = bu.scheme+'://'+bu.netloc+path
                filename = str(path).split('/')[-2]
                filename = op.join(op.abspath(op.curdir),filename+'.py') # recipe will be stored in given location
#                filename = op.join(op.abspath(op.curdir),filename+'.html')
#uncomment the above line if downloading the web page for the recipe
                print path
                self.q.put((path,filename))
        self.fetch_data()
        time.sleep(1)
        self.qu.task_done()
        self.q.join()
        print 'done with the ' ,str(url).split('/')[-1],
Source file: sks.py (project: meg-server, author: Argonne-National-Laboratory)
def get_all_key_signatures(cfg, keyid):
    """
    Get all signatures for a specific key. We exclude self-signed signatures
    because they are not helpful for us.
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup", {"op": "vindex", "search": "0x{}".format(keyid)}, None
    )
    if status_code != 200:
        return status_code, content
    elem = BeautifulSoup(content, HTML_PARSER).span
    ids = []
    while (elem.findNext().name != "strong" and elem.findNext()):
        elem = elem.findNext()
        if "op=get" in elem["href"] and elem.text != keyid:
            ids.append(elem.text)
    return ids
Source file: sks.py (project: meg-server, author: Argonne-National-Laboratory)
def search_key(cfg, search_str):
    """
    Search for a key by a given string
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup", {"op": "index", "search": search_str}, None
    )
    if status_code != 200:
        return content, status_code
    bs = BeautifulSoup(content, HTML_PARSER)
    regex = re.compile(r"^pub *\d{3,4}\w\/([\w\d]{8})")
    ids = []
    for pre in bs.findAll("pre"):
        match = regex.search(pre.text.strip("\r\n"))
        if match and not "KEY REVOKED" in pre.text:
            ids.append(match.groups()[0])
    return {"ids": ids}, status_code
Source file: money163_spider.py (project: NewsScrapy, author: yinzishao)
def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        soup = BeautifulSoup(response.body.decode('gbk'))
        pic = soup.find('p' , class_ = 'f_center').find('img').get('src') if  soup.find('p' , class_ = 'f_center') and soup.find('p' , class_ = 'f_center').find('img') else None
        referer_web = soup.find('a',id = 'ne_article_source').text if soup.find('a',id = 'ne_article_source') else None
        referer_url = soup.find('a',id = 'ne_article_source').get('href') if soup.find('a',id = 'ne_article_source') else None
        author = soup.find('span',class_ = 'ep-editor').text if soup.find('span',class_ = 'ep-editor') else None
        if u"?" in author:
            author = author.split(u"?")[-1]
        crawl_date = NOW
        read_num = soup.find('div',class_ = 'post_comment_joincount').find('a').text if soup.find('div',class_ = 'post_comment_joincount') else 0
        comment_num = soup.find('div',class_ = 'post_comment_tiecount').find('a').text if soup.find('div',class_ = 'post_comment_tiecount') else 0
        content = soup.find('div',class_ = 'post_text').get_text(strip=True) if soup.find('div',class_ = 'post_text') else None
        item['referer_web'] = referer_web
        item['content'] = content
        item['referer_url'] = referer_url
        item['author'] = author
        item['crawl_date'] = crawl_date
        item['pic'] = pic
        item['comment_num'] = int(comment_num)
        item['read_num'] = int(read_num)
        yield item
Source file: tech_ifeng_spider.py (project: NewsScrapy, author: yinzishao)
def parse_news(self, response):
        item = response.meta.get("item", NewsItem())
        soup = BeautifulSoup(response.body.decode("utf-8").encode("utf-8"),"lxml")
        pic = soup.find("p",class_ = "detailPic").find("img").get("src") if soup.find("p",class_ = "detailPic") else None
        referer_web = soup.find("span",class_ = "ss03").text if soup.find("span",class_ = "ss03") else None
        author = soup.find("span",itemprop="author").find("span").text if soup.find("span",itemprop="author") else None
        temp = soup.find("div" ,id = "main_content")
        if temp:
            ps = temp.find_all("p") if temp.find_all("p") else None
            content = "\n\n".join([ p.text.strip() for p in ps])
        else:
            content = None
        item['pic'] = pic
        item['referer_web'] = referer_web
        item['author'] = author
        item['content'] = content
        item['crawl_date'] = NOW
        yield item
Source file: luxe_spider.py (project: NewsScrapy, author: yinzishao)
def parse_news(self,response):
        item = response.meta.get("item",None)
        # # Skip news published before the crawl cut-off date (commented out below)
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
        #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        #
        #     delta = self.end_now-struct_date
        #     if delta.days == self.end_day:
        #         # pass
        #         raise CloseSpider('today scrapy end')
        soup = BeautifulSoup(response.body)
        news_content_group = soup.find("div",class_="entry-content group")
        # Remove the related-posts block from the article body
        news_content_group.find("div",class_="related_posts").replace_with("")
        content = news_content_group.text.strip()
        item["content"] = content
        item["catalogue"] = u"????"
        yield item
Source file: leiphone_spider.py (project: NewsScrapy, author: yinzishao)
def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",1)
        soup = BeautifulSoup(response.body, 'lxml')
        origin_date = soup.find("td", class_="time").text.strip()
        struct_date= datetime.datetime.strptime(origin_date,"%Y-%m-%d %H:%M")
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        content = soup.find("div", class_= "lph-article-comView").text.strip() if soup.find("div", class_= "lph-article-comView").text.strip() else None
        item["news_date"]= news_date
        item["crawl_date"]= NOW
        item["content"] = content
        item["catalogue"] = u"????"
        item = judge_news_crawl(item)
        if item:
            yield item
        else:
            self.flag = int(pageindex)
Source file: cnta_spider.py (project: NewsScrapy, author: yinzishao)
def parse(self,response):
        origin_url = response.url
        if "index" not in origin_url:
            soup = BeautifulSoup(response.body,"lxml")
            catalogue =  soup.find("a",class_ = "blue CurrChnlCls").get("title").strip()
            news_list = soup.find("div", class_ = "lie_main_m").find_all("li")
            for news in news_list:
                title = news.find("a").text.strip()
                news_url = "http://www.cnta.gov.cn/xxfb" + news.find("a").get("href")[2:]
                news_no = news_url.rsplit("/",1)[-1].split(".")[0]
                item = NewsItem(
                        news_url =news_url,
                        title = title,
                        news_no = news_no,
                        catalogue = catalogue,
                    )
                yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={'item':item})
        else:
            topic_url = origin_url.rsplit(".",1)[0]
            self.flag.setdefault(topic_url,0)
            yield scrapy.Request(origin_url,callback=self.parse_topic)
Source file: transport163.py (project: NewsScrapy, author: yinzishao)
def parse(self, response):
        origin_url = response.url
        #http://money.163.com/special/002526O5/transport_02.html
        search_result = re.search(r"_(\d)*?\.",origin_url)
        # Page index
        pageindex = search_result.group(1) if search_result else 1
        soup = BeautifulSoup(response.body,"lxml")
        news_list = soup("div",class_="list_item clearfix")
        for news in news_list:
            news_date = news.find("span",class_="time").text if news.find("span",class_="time")else None
            title = news.find("h2").text if news.find("h2") else None
            news_url = news.find("h2").a.get("href",None) if news.find("h2") else None
            abstract = news.find("p").contents[0] if news.find("p") else None
            item = NewsItem(title=title,news_url=news_url,abstract=abstract,news_date=news_date)
            item = judge_news_crawl(item)   # Check whether the news date is still within the crawl range
            if item:
                request = scrapy.Request(news_url,callback=self.parse_news,meta={"item":item})
                yield request
            else:
                self.flag = int(pageindex)
        if not self.flag:
            next_url = self.next_url % (int(pageindex) + 1)
            yield scrapy.Request(next_url)
Source file: transport163.py (project: NewsScrapy, author: yinzishao)
def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        soup = BeautifulSoup(response.body)
        referer_web = soup.find("a",id="ne_article_source").text if soup.find("a",id="ne_article_source") else None
        referer_url = soup.find("a",id="ne_article_source").get("href",None) if soup.find("a",id="ne_article_source") else None
        comment_num = soup.find("a",class_="post_cnum_tie").text if soup.find("a",id="ne_article_source") else None
        content = soup.find("div",class_="post_text").text.strip() if soup.find("div",class_="post_text") else None
        # Note: the 'left' span holds a combined source/editor string; the author still needs to be parsed out
        author_source = soup.find("span",class_="left").text if soup.find("span",class_="left") else None
        # TODO: extract the author name from author_source
        # import pdb;pdb.set_trace()
        # author = re.search(u"??(.*)",author_source).group(1)[1:] if author_source else None
        # item["author"]=author
        item["referer_web"]=referer_web
        item["referer_url"]=referer_url
        item["comment_num"]=comment_num
        item["content"]=content
        item["crawl_date"]=NOW
        yield item

