Example source code for Python's BeautifulSoup() class
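The snippets below are collected from a range of open-source projects and mix the legacy BeautifulSoup 3 API (BeautifulSoup.BeautifulSoup(html), findAll) with the newer bs4 package. For orientation, here is a minimal sketch of the equivalent modern bs4 pattern; the sample markup, class name, and parser choice are illustrative assumptions, not taken from any of the projects listed below.

from bs4 import BeautifulSoup

sample_html = '<div class="page"><a class="ulink" href="/movie/1.html">Example</a></div>'
# Passing the parser explicitly avoids the "No parser was explicitly specified" warning.
soup = BeautifulSoup(sample_html, 'html.parser')
for link in soup.find_all('a', attrs={'class': 'ulink'}):  # find_all is bs4's spelling of BS3's findAll
    print(link.get('href'), link.string)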

DY2018.py — project: Python, author: Guzi219
def ParseHtml(self, html):
        soup = BeautifulSoup(html)
        links = soup.findAll('a', attrs={'class': 'ulink'})
        #print len(links)
        if len(links) == 0: #the js return
            # tmp_js = soup.find(name='script', attrs={'language': 'javascript'})
            js_str = soup.script.string #two ways to get the <script></script>
            new_url = js_str[16:-1] #get the new url
            new_url = eval(new_url) #eval() strips the surrounding quotes from the embedded url string
            self.ParseHtml(self.LoadPage(new_url))
        else:
            # print type(links)
            for link in links:
                # print type(link)
                # print type(link.string)
                # print unicode(link.string)
                # assumes movie titles on the page are wrapped in 《…》
                titles = re.findall(u'《(.+?)》', unicode(link.string))
                if len(titles) != 0:
                    print titles[0]
                # print 'url is %s, title is %s.' %(link['href'], titles[0])
qiubaiadult.py — project: Python, author: Guzi219
def GetTotalPage(self, html):
        # create the BeautifulSoup
        some_soup = BeautifulSoup(html)
        #get the page div
        ele_a = some_soup.find('div', attrs={'class': 'page'})
        #get the last <a> inside the page div (the link to the last page)
        last_a = ele_a.findAll('a')[-1]
        #strip the trailing '.html' from the href to get the page number
        pagenum = last_a.get('href')[:-5]
        print 'pagenum :', pagenum
        # print type(last_a)

        self.SaveTotalPageToFile(pagenum)

    # store the max page number to totalpage.ini
    #new_page_num: new max page num
MrJattParser.py — project: song-cli, author: ankitmathur3193
def list_of_all_href(self,html):
        '''
        It will return all hyper links found in the mr-jatt page for download
        ''' 
        soup=BeautifulSoup(html)
        links=[]
        a_list=soup.findAll('a','touch')
        for x in xrange(len(a_list)-1):
            link = a_list[x].get('href')
            name = a_list[x]
            name = str(name)
            name=re.sub(r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>','',name)
            name=re.sub(r'^[0-9]+\.','',name)
            links.append([link,name])

        #quit()
        return links
crawler.py — project: FreeFoodCalendar, author: Yuliang-Zou
def crawler(urls, max_urls):
    crawled = Set()
    queued = Set(urls)
    pairs = []
    while urls and len(crawled) < max_urls:
        page=urls.pop(0)
        if is_html(page):
            if page not in crawled:
                try:
                    print(page)
                    links=BeautifulSoup(urllib2.urlopen(page,timeout=5).read(), parseOnlyThese=SoupStrainer('a'))
                    for link in links:
                        url = domain + link['href']
                        if verify(url) and url not in queued:
                            # print(url)
                            urls.append('http://' +url)
                            # print(urls)
                            queued.add('http://' +url)
                    # print(page)
                    crawled.add(page)
                    # print(crawled)
                except:
                    continue
    return crawled,pairs
search.py — project: doork, author: AeonDave
def _extract_description(self, result):
        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
        if not desc_div:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None

        desc_strs = []
        def looper(tag):
            if not tag: return
            for t in tag:
                try:
                    if t.name == 'br': break
                except AttributeError:
                    pass

                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_div)
        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>

        desc = ''.join(s for s in desc_strs if s)
        return self._html_unescape(desc)
search.py — project: doork, author: AeonDave
def _extract_description(self, result):
        desc_td = result.findNext('td')
        if not desc_td:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None

        desc_strs = []
        def looper(tag):
            if not tag: return
            for t in tag:
                try:
                    if t.name == 'br': break
                except AttributeError:
                    pass

                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_td)
        looper(desc_td.find('wbr')) # BeautifulSoup does not self-close <wbr>

        desc = ''.join(s for s in desc_strs if s)
        return self._html_unescape(desc)
googlesets.py — project: doork, author: AeonDave
def _get_results_page(self, set_type):
        if set_type == LARGE_SET:
            url = GoogleSets.URL_LARGE
        else:
            url = GoogleSets.URL_SMALL

        safe_items = [urllib.quote_plus(i) for i in self.items]
        blank_items = 5 - len(safe_items)
        if blank_items > 0:
            safe_items += ['']*blank_items

        safe_url = url % tuple(safe_items)

        try:
            page = self.browser.get_page(safe_url)
        except BrowserError, e:
            raise GSError, "Failed getting %s: %s" % (e.url, e.error)

        return BeautifulSoup(page)
sponsoredlinks.py — project: doork, author: AeonDave
def _get_results_page(self):
        if self._page == 0:
            if self._results_per_page == 10:
                url = SponsoredLinks.SEARCH_URL_0
            else:
                url = SponsoredLinks.SEARCH_URL_1
        else:
            if self._results_per_page == 10:
                url = SponsoredLinks.NEXT_PAGE_0
            else:
                url = SponsoredLinks.NEXT_PAGE_1

        safe_url = url % { 'query': urllib.quote_plus(self.query),
                           'start': self._page * self._results_per_page,
                           'num': self._results_per_page }

        try:
            page = self.browser.get_page(safe_url)
        except BrowserError, e:
            raise SLError, "Failed getting %s: %s" % (e.url, e.error)

        return BeautifulSoup(page)
html_parser.py — project: minerva, author: linzhi
def get_content(cls, url=None, session=None):
        """
        @brief: fetch the page at the given url and collect all hyperlinks found in it
        """

        hyperlinks = set()
        soup_context = None

        # download the page content and parse it
        html_context = cls.parse_page(url, session)
        if html_context:
            soup_context = BeautifulSoup.BeautifulSoup(html_context)
            if soup_context:
                for each_link in soup_context.findAll('a'):
                    hyperlink = urlparse.urljoin(url, (each_link or {}).get('href'))
                    hyperlinks.add(hyperlink)

        return hyperlinks, soup_context
scholar.py — project: google_scholar_paper_finder, author: maikelronnau
def make_soup(markup, parser=None):
        """Factory method returning a BeautifulSoup instance. The created
        instance will use a parser of the given name, if supported by
        the underlying BeautifulSoup instance.
        """
        if 'bs4' in sys.modules:
            # We support parser specification. If the caller didn't
            # specify one, leave it to BeautifulSoup to pick the most
            # suitable one, but suppress the user warning that asks to
            # select the most suitable parser ... which BS then
            # selects anyway.
            if parser is None:
                warnings.filterwarnings('ignore', 'No parser was explicitly specified')
            return BeautifulSoup(markup, parser)

        return BeautifulSoup(markup)
scholar.py — project: citations, author: frederick0329
def make_soup(markup, parser=None):
        """Factory method returning a BeautifulSoup instance. The created
        instance will use a parser of the given name, if supported by
        the underlying BeautifulSoup instance.
        """
        if 'bs4' in sys.modules:
            # We support parser specification. If the caller didn't
            # specify one, leave it to BeautifulSoup to pick the most
            # suitable one, but suppress the user warning that asks to
            # select the most suitable parser ... which BS then
            # selects anyway.
            if parser is None:
                warnings.filterwarnings('ignore', 'No parser was explicitly specified')
            return BeautifulSoup(markup, parser)

        return BeautifulSoup(markup)
apache2_mod_proxy.py — project: DevOps, author: YoLoveLife
def get_member_attributes(self):
        """ Returns a dictionary of a balancer member's attributes."""

        balancer_member_page = fetch_url(self.module, self.management_url)

        try:
            assert balancer_member_page[1]['status'] == 200
        except AssertionError:
            self.module.fail_json(msg="Could not get balancer_member_page, check for connectivity! " + balancer_member_page[1])
        else:
            try:
                soup = BeautifulSoup(balancer_member_page[0])
            except TypeError:
                self.module.fail_json(msg="Cannot parse balancer_member_page HTML! " + str(soup))
            else:
                subsoup = soup.findAll('table')[1].findAll('tr')
                keys = subsoup[0].findAll('th')
                for valuesset in subsoup[1::1]:
                    if re.search(pattern=self.host, string=str(valuesset)):
                        values = valuesset.findAll('td')
                        return dict((keys[x].string, values[x].string) for x in range(0, len(keys)))
scraper_vod.py — project: plugin.video.brplay, author: olavopeixoto
def get_categories():

    url = "http://sexyhotplay.com.br/categorias/"
    html = client.request(url, headers={'Cookie': 'disclaimer-sexyhotplay=1;'})

    soup = bs(html)
    div = soup.find('div', attrs={'class': 'colunas-3-15'})

    links = div.findAll('a', attrs={'class': 'link'}, recursive=True)

    results = []
    for link in links:
        label = link.find('strong').string
        url = 'http://sexyhotplay.com.br' + link['href']
        results.append({
            'name': label,
            # 'clearlogo': os.path.join(artPath, 'logo_sexyhot.png'),
            'url': url
        })

    return results
twitter.py — project: MalwrAgent, author: michaelschratt
def f_grab_cmd_from_twitter_profile(profile_name):
        """grab 0xXXXXXXXX tag from profile, tag must match [a-zA-Z0-9_]
        :rtype: string
        :param profile_name: twitter profile name without leading @
        :return: string embedded in the profile description
        """
        url = 'https://twitter.com/%(profile)s'
        payload = {
            'profile': profile_name
        }
        html = requests.get(url % payload)
        soup = soupy(html.text)
        profile_description = soup.find('meta', {'name': 'description'})['content']
        match = re.search('0x(\w+)', profile_description)
        output = match.group(1)  # group 1 consists of match between ( )

        return str(output)
HoerbuchIn.py — project: download-manager, author: thispc
def decrypt(self, pyfile):
        self.pyfile = pyfile

        if self.article.match(pyfile.url):
            html = self.load(pyfile.url)
            soup = BeautifulSoup.BeautifulSoup(
                html, convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)

            links = []
            for a in soup.findAll("a", attrs={'href': self.hoster_links}):
                for decrypted_link in self.decrypt_folder(a.get('href')):
                    links.append(decrypted_link)

            self.packages.append((pyfile.name, links, pyfile.name))
        else:
            self.links = self.decrypt_folder(pyfile.url)
USTC_Today3.0.py — project: USTC-Today, author: HengRuiZ
def search(key_word):
    global x
    search_url='http://news.sogou.com/news?ie=utf8&p=40230447&interV=kKIOkrELjboMmLkEkLoTkKIMkLELjb8TkKIMkrELjboImLkEk74TkKILmrELjbgRmLkEkLY=_485898072&query=%E4%B8%AD%E7%A7%91%E5%A4%A7&'
    req=urllib2.urlopen(search_url.replace('key_word',key_word))
    real_visited=0
    html=req.read()
    soup=BeautifulSoup(html)
    #print soup
    content  = soup.findAll(name="a",attrs={"href":True,"data-click":True,"target":True}) #resultset object
    num = len(content)
    #print num
    for i in range(9):
        #take the title and the url of each search result
        p_str= content[2*i] #if no result then nontype object
        tit[i]=p_str.renderContents()
        tit[i]=tit[i].decode('utf-8', 'ignore')#need it
        tit[i]= re.sub("<[^>]+>","",tit[i])
        print(tit[i])
        url[i]=str(p_str.get("href"))
        print(url[i])
        #fetch the thumbnail image for each result url
        img[i]=getimg(url[i])
        w, h = img[i].size
        img[i]=resize(w,h, w_box, h_box,img[i])
mian3.py — project: ProxyIPCrawler, author: uilliu
def ProxyIPSpider(self):
    # get the proxy
    f = open('proxy.txt', 'w')
    for page in range(1,50):
        url = 'http://www.xicidaili.com/nn/%s' %page
        user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
        request = urllib2.Request(url)
        request.add_header("User-Agent", user_agent)
        content = urllib2.urlopen(request)
        soup = BeautifulSoup(content)
        trs = soup.find('table', {"id":"ip_list"}).findAll('tr')
        for tr in trs[1:]:
            tds = tr.findAll('td')
            ip = tds[2].text.strip()
            port = tds[3].text.strip()
            protocol = tds[6].text.strip()
            if protocol == 'HTTP' or protocol == 'HTTPS':
                f.write('%s=%s:%s\n' % (protocol, ip, port))
                print '%s://%s:%s' % (protocol, ip, port)
mian3.py — project: ProxyIPCrawler, author: uilliu
def caiji2(self):   # crawl the haodailiip.com proxy IP list
    # fetch the domestic proxy pages one by one and write http=ip:port entries to proxy.txt
    of = open('proxy.txt', 'w')
    url = 'http://www.haodailiip.com/guonei/'
    for i in range(1,20):
        Url = 'http://www.haodailiip.com/guonei/' + str(i)
        print u"????"+Url
        html = requests.get(Url).text
        bs = BeautifulSoup(html)
        table = bs.find('table',{"class":"proxy_table"})
        tr = table.findAll('tr')
        for i in range(1,31):
            td = tr[i].findAll('td')
            proxy_ip = td[0].text.strip()
            proxy_port = td[1].text.strip()
            of.write('http=%s:%s\n' %(proxy_ip,proxy_port))
            print 'http=%s:%s\n' %(proxy_ip,proxy_port)
        time.sleep(2)
    of.close()
mian4.py — project: ProxyIPCrawler, author: uilliu
def ProxyIPSpider(self):
    # get the proxy
    f = open('proxy.txt', 'w')
    for page in range(1,50):
        url = 'http://www.xicidaili.com/nn/%s' %page
        user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
        request = urllib2.Request(url)
        request.add_header("User-Agent", user_agent)
        content = urllib2.urlopen(request)
        soup = BeautifulSoup(content)
        trs = soup.find('table', {"id":"ip_list"}).findAll('tr')
        for tr in trs[1:]:
            tds = tr.findAll('td')
            ip = tds[2].text.strip()
            port = tds[3].text.strip()
            protocol = tds[6].text.strip()
            if protocol == 'HTTP' or protocol == 'HTTPS':
                f.write('%s=%s:%s\n' % (protocol, ip, port))
                print '%s://%s:%s' % (protocol, ip, port)
mian4.py — project: ProxyIPCrawler, author: uilliu
def caiji2(self):   # crawl the haodailiip.com proxy IP list
    # fetch the domestic proxy pages one by one and write http=ip:port entries to proxy.txt
    of = open('proxy.txt', 'w')
    url = 'http://www.haodailiip.com/guonei/'
    for i in range(1,20):
        Url = 'http://www.haodailiip.com/guonei/' + str(i)
        print u"????"+Url
        html = requests.get(Url).text
        bs = BeautifulSoup(html)
        table = bs.find('table',{"class":"proxy_table"})
        tr = table.findAll('tr')
        for i in range(1,31):
            td = tr[i].findAll('td')
            proxy_ip = td[0].text.strip()
            proxy_port = td[1].text.strip()
            of.write('http=%s:%s\n' %(proxy_ip,proxy_port))
            print 'http=%s:%s\n' %(proxy_ip,proxy_port)
        time.sleep(2)
    of.close()
stallmanbot.py — project: homemadescripts, author: helioloureiro
def DuckDuckGo(cmd):
    debug(cmd.text)
    q = cmd.text.split()
    if len(q) == 1:
        return
    question = "+".join(q[1:])
    debug("Question=%s" % question)
    req = requests.get("https://duckduckgo.com/html/?q=%s" % question)
    answer = None
    html = bp.BeautifulSoup(req.text)
    responses = html.findAll("div", id="zero_click_abstract")
    try:
        answer = responses[0].text
    except Exception as e:
        print e # get internal
        pass
    if not answer:
        bot.reply_to(cmd, "Não tenho a menor idéia.  Tem de perguntar no google.")
        return
    try:
        bot.reply_to(cmd, answer)
    except Exception as e:
        bot.reply_to(cmd, "Deu merda: %s" % e)
findspec.py — project: pixiebot, author: umago
def _update_cache(release):
    LOG.debug('Updating cache for the release "%s"', release)
    url = BASE_URL % release
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page)
    specs = {}
    for link in soup.findAll('a', attrs={'href': re.compile('.html$')}):
        href = link.get('href')
        title = ' '.join(href.replace('.html', '').split('-'))
        link = url + href
        specs[title] = link

    _CACHE[release] = {}
    _CACHE[release]['specs'] = specs
    _CACHE[release]['updated_at'] = datetime.datetime.utcnow()
    LOG.info('Cache updated for the release "%s"', release)
bot.py — project: yogame, author: tivisse
def update_planet_fleet(self, planet):
        resp = self.br.open(self._get_url('fleet', planet))
        soup = BeautifulSoup(resp)
        ships = {}
        for k, v in self.SHIPS.iteritems():
            available = 0
            try:
                s = soup.find(id='button' + v)
                available = int(s.find('span', 'textlabel').nextSibling.replace('.', ''))
            except:
                available = 0
            ships[k] = available

        #self.logger.info('Updating %s fleet' % planet)
        #self.logger.info('%s' % fleet)
        planet.ships = ships
bot.py — project: yogame, author: tivisse
def update_planet_research(self, planet):
        resp = self.br.open(self._get_url('research', planet))
        soup = BeautifulSoup(resp)
        try:
            ButtonList = soup.find(id='buttonz')
            AllResearchList = ButtonList.findAll('li')
            for research in AllResearchList:
                if research.get('class') == 'on':
                    fb = research.find('a', 'fastBuild')
                    if fb:
                        build_url = fb.get('onclick') if fb else ''
                        build_url = self._parse_research_url(build_url)
                        self.logger.info('Research launched on %s:%s'% (planet, fb.get('title')))
                        self.br.open(build_url)
                        break
        except:
            self.logger.exception('Exception while retrieving researches')
bot.py — project: yogame, author: tivisse
def update_planet_facilities(self, planet):
        resp = self.br.open(self._get_url('station', planet))
        soup = BeautifulSoup(resp)
        try:
            ButtonList = soup.find(id='stationbuilding')
            AllResearchList = ButtonList.findAll('li')
            for research in AllResearchList:
                if research.get('class') == 'on':
                    fb = research.find('a', 'fastBuild')
                    if fb:
                        build_url = fb.get('onclick') if fb else ''
                        build_url = self._parse_research_url(build_url)
                        self.logger.info('Facility upgraded on %s:%s'% (planet, fb.get('title')))
                        self.br.open(build_url)
                        break
        except:
            self.logger.exception('Exception while retrieving facilities statuses')


        return True
getPostsWithAnswers.py — project: BiLSTM-CCM, author: codedecde
def getFirstPostData(forum_text):
    soup = BeautifulSoup(forum_text)
    title = ""
    date = ""
    body = ""
    try:
        date = soup.find("div", attrs={"class": "postDate"}).text
    except AttributeError:
        print "Date not found"
    try:
        title = soup.find("div", attrs={"class": "postTitle"}).text
    except AttributeError:
        print "Title not found"
    try:
        body = soup.find("div", attrs={"class": "postBody"}).text
    except AttributeError:
        print "Body not found, now this is weird"
    return [title,date,body]
scholar.py — project: snowballing, author: JoaoFelipe
def parse(self, html):
        """
        This method initiates parsing of HTML content, cleans resulting
        content as needed, and notifies the parser instance of
        resulting instances via the handle_article callback.
        """
        self.soup = BeautifulSoup(html, "html.parser")

        # This parses any global, non-itemized attributes from the page.
        self._parse_globals()

        # Now parse out listed articles:
        for div in self.soup.findAll(ScholarArticleParser._tag_results_checker):
            self._parse_article(div)
            self._clean_article()
            if self.article['title']:
                self.handle_article(self.article)
google.py — project: script.module.metadatautils, author: marcelveldt
def get_data(self, search_query):
        '''helper method to get data from google images by scraping and parsing'''
        params = {"site": "imghp", "tbm": "isch", "tbs": "isz:l", "q": search_query}
        headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; \
            IEMobile/7.0; LG; GW910)'}
        html = ''
        try:
            html = requests.get('https://www.google.com/search', headers=headers, params=params, timeout=5).text
        except Exception as exc:
            log_exception(__name__, exc)
        soup = BeautifulSoup.BeautifulSoup(html)
        results = []
        for div in soup.findAll('div'):
            if div.get("id") == "images":
                for a_link in div.findAll("a"):
                    page = a_link.get("href")
                    try:
                        img = page.split("imgurl=")[-1]
                        img = img.split("&imgrefurl=")[0]
                        results.append(img)
                    except Exception:
                        pass
        return results
imdb.py — project: script.module.metadatautils, author: marcelveldt
def get_top250_db(self):
        '''
            get the top250 listing for both movies and tvshows as dict with imdbid as key
            uses 7 day cache to prevent overloading the server
        '''
        results = {}
        for listing in [("top", "chttp_tt_"), ("toptv", "chttvtp_tt_")]:
            html = requests.get(
                "http://www.imdb.com/chart/%s" %
                listing[0], headers={
                    'User-agent': 'Mozilla/5.0'}, timeout=20)
            soup = BeautifulSoup.BeautifulSoup(html.text)
            for table in soup.findAll('table'):
                if table.get("class") == "chart full-width":
                    for td_def in table.findAll('td'):
                        if td_def.get("class") == "titleColumn":
                            a_link = td_def.find("a")
                            if a_link:
                                url = a_link["href"]
                                imdb_id = url.split("/")[2]
                                imdb_rank = url.split(listing[1])[1]
                                results[imdb_id] = try_parse_int(imdb_rank)
        self.write_kodidb(results)
        return results
PurgeAttributes.py — project: Anki-Addons, author: searene
def purgeAttributes(self, mime, _old):
    html = mime.html()
    soup = BeautifulSoup(html)
    newMime = QMimeData()
    for tag in soup.recursiveChildGenerator():
        # remove attributes in the list
        index = -1
        try:
            for key, value in tag.attrs:
                index += 1
                if key != 'style':
                    continue
                new = value.split(';')
                new = ';'.join([s for s in new
                    if s.split(':')[0].strip() not in REMOVE_ATTRIBUTES])
                tag.attrs[index] = (u'style', new)
        except AttributeError: 
            # 'NavigableString' object has no attribute 'attrs'
            pass

    # assign the modified html to new Mime
    newMime.setHtml(str(soup).decode('utf8'))

    # default _processHtml method
    return _old(self, newMime)

