Example source code using Python's SoupStrainer() class
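SoupStrainer tells BeautifulSoup to parse only the parts of a document that match a tag name, attribute filter, or custom function, which cuts memory use and parse time on large pages. A minimal sketch of the pattern shared by the examples below (the sample HTML and variable names are illustrative only):

from bs4 import BeautifulSoup, SoupStrainer

html = '<html><body><a href="https://example.com">a link</a><p>some text</p></body></html>'

# Parse only <a> tags; everything else is skipped at parse time.
only_a_tags = SoupStrainer("a")
soup = BeautifulSoup(html, "html.parser", parse_only=only_a_tags)

for link in soup.find_all("a"):
    print(link.get("href"))  # https://example.com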

sciencedirect_collect.py (project: scientific-paper-summarisation, author: EdCo95)
def getJournalURL(jname):
    # get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    #print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
serializer.py (project: maoyan, author: Eastwu5788)
def __init__(self, data, encoding=None):
        """
         Initialize serializer class
         :param data: ori data
         :param encoding: encoding type of your ori data
         """
        self.data = data

        if not self.data:
            raise ValueError("You must input origin data to this class")

        # if no encoding is given, use UnicodeDammit (chardet) to detect it
        self.encoding = encoding if encoding else UnicodeDammit(self.data).original_encoding
        self.encoding = None if self.encoding == "utf-8" else self.encoding

        # initialize beautiful soup
        # only_content_div = SoupStrainer("body")
        self.obj = BeautifulSoup(data, features="lxml", from_encoding=self.encoding)
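The constructor above falls back to bs4's UnicodeDammit when no encoding is supplied. A standalone sketch of that detection step, with illustrative sample bytes:

from bs4 import UnicodeDammit

raw = "<p>café</p>".encode("latin-1")  # bytes whose encoding we pretend not to know
dammit = UnicodeDammit(raw)
print(dammit.original_encoding)  # detector's best guess, e.g. a latin-1/windows-1252 family encoding
print(dammit.unicode_markup)     # the decoded markup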
schema.py (project: taemin, author: ningirsu)
def get_title(html):
        """
            Get the title element from a HTML document

            :param str html: The html to parse

            :Example:

            >>> Link.get_title("xxxx<title>Title</title>xxxx")
            'Title'

            >>> print(Link.get_title("xxxx<>Title</title>xxxx"))
            None
        """
        bs = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('title'))

        title = bs.find("title")
        if not title:
            return None

        if not title.string:
            return None

        return title.string.strip().replace('\n', ' ')
collect.py (project: web_page_classification, author: yuhui-lin)
def get_child_urls(main_page, max_child=20):
    """retrieve urls from giving html page.
    args:
        main_page(str): html file.
        max_child(int): max number of return urls.
    return:
        list of url string.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page,
                              "html.parser",
                              parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
vrtplayer.py (project: plugin.video.vrt.nu, author: pietje666)
def __get_menu_items(self, url, soupstrainer_parser_selector, routing_action, video_dictionary_action=None):
        response = requests.get(url)
        tiles = SoupStrainer('a', soupstrainer_parser_selector)
        soup = BeautifulSoup(response.content, "html.parser", parse_only=tiles)
        listing = []
        for tile in soup.find_all(class_="tile"):
            link_to_video = tile["href"]
            thumbnail, title = self.__get_thumbnail_and_title(tile)
            video_dictionary = None
            if video_dictionary_action is not None:
                video_dictionary = video_dictionary_action(tile)

            item = helperobjects.TitleItem(title, {'action': routing_action, 'video': link_to_video},
                                           False, thumbnail, video_dictionary)
            listing.append(item)
        return listing
dhtml.py (project: reahl, author: reahl)
def read(self):
        with io.open(self.filename, 'rb') as dhtml_file:
            def strain(name, attrs):
                if name == 'title':
                    return True
                if name == 'div' and dict(attrs).get('id', None) in self.ids:
                    return True
                return False
            soup = BeautifulSoup(dhtml_file, "lxml", parse_only=SoupStrainer(strain))
            parser = html_parser.HTMLParser()
            self.title = parser.unescape(soup.title.decode_contents()) if soup.title else _('Untitled')
            for an_id in self.ids:
                found_elements = soup.find_all(id=an_id)
                if found_elements:
                    [element] = found_elements
                    self.elements[an_id] = element.decode_contents()
                else:
                    self.elements[an_id] = ''
            self.original_encoding = soup.original_encoding
lyrics.py (project: DropMuse, author: DropMuse)
def get_lyrics_with_urls(urls):
    # TODO

    ret = []
    for url in urls:
        time.sleep(3)
        print(url)

        response = urlopen(url, timeout=5)
        content = response.read()
        for lyrics in bs(content, "html.parser", parse_only=SoupStrainer('p')):
            if(lyrics.has_attr('style')):
                lyrics = re.sub('</?br/?>', '\n', str(lyrics))
                lyrics = re.sub('<.*?>', '', str(lyrics))
                lyrics = re.sub('\n', ' \n', str(lyrics))
                ret.append(lyrics)
                print(lyrics)
                print(str(get_sentiment(lyrics)))
    return ret
lyrics.py (project: DropMuse, author: DropMuse)
def get_lyrics(artist, song):
    artist = format_artist(artist)
    song = format_song(song)

    time.sleep(1)
    url = LYRICS_URL.format(artist, song)
    content = None
    try:
        response = urlopen(url)
        content = response.read()
    except Exception as e:
        print(url)
        print(e)
        print("failed\n")
        return None

    soup = bs(content, "html.parser", parse_only=SoupStrainer('div'))
    for l in soup:
        for lyrics in soup.find_all(string=lambda t: isinstance(t, Comment)):
            if "start of lyrics" in lyrics or "Usage" in lyrics:
                lyrics = re.sub('</?br/?>', '', str(lyrics.parent))
                lyrics = re.sub('<.*?>', '', str(lyrics))

                return str(lyrics)
ThreadedCategoryScraper.py (project: Wakapedia, author: ACMProjectsTeam3)
def scrape_category_page(url):
    global ALL_TEXT, non_bmp_map, threads, count
    soup = BeautifulSoup(urllib.request.urlopen(url), 'lxml', parse_only=SoupStrainer('div'))

      ### accounts for categories with over 200 pages
    link = soup.find('a', href=True, text='next page')
    if (link != None):
        try:
            t = catThread('https://en.wikipedia.org' + link['href'])
            t.daemon = True
            t.start()
            threads.append(t)
        except:
            print ("Error: Unable to thread.")

      ### sends links of wikipedia articles in the category to be scraped
    pages_in_category = soup.find('div', {'id':'mw-pages'}).find('div',{'class':'mw-category'})
    for obj in pages_in_category.findAll('a'):
        tempbun = scrape(Bundle('https://en.wikipedia.org' + obj['href'], False))
        with lock:
            ALL_TEXT += tempbun.text.translate(non_bmp_map)
            print (count)
            count += 1
html_pbp.py (project: Hockey-Scraper, author: HarryShomer)
def get_soup(game_html):
    """
    Uses BeautifulSoup to parse the html document.
    Some parsers work for some pages but don't work for others... I'm not sure why, so I just try them all here in order

    :param game_html: html doc

    :return: "soupified" html and player_shifts portion of html (it's a bunch of td tags)
    """
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})
    soup = BeautifulSoup(game_html.text, "lxml", parse_only=strainer)
    soup = soup.select('td.+.bborder')

    if len(soup) == 0:
        soup = BeautifulSoup(game_html.text, "html.parser", parse_only=strainer)
        soup = soup.select('td.+.bborder')

        if len(soup) == 0:
            soup = BeautifulSoup(game_html.text, "html5lib")
            soup = soup.select('td.+.bborder')

    return soup
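A generic version of this try-each-parser fallback, as a sketch rather than the project's code (it assumes lxml and html5lib are installed; html5lib's tree builder does not support parse_only, which may be why the original drops the strainer on the last attempt):

import re
from bs4 import BeautifulSoup, SoupStrainer

def parse_cells_with_fallback(html, parsers=("lxml", "html.parser", "html5lib")):
    # Try each parser in turn and return the first non-empty list of matching cells.
    strainer = SoupStrainer("td", attrs={"class": re.compile(r"bborder")})
    for parser in parsers:
        # html5lib ignores parse_only, so only pass the strainer where it is supported.
        kwargs = {} if parser == "html5lib" else {"parse_only": strainer}
        cells = BeautifulSoup(html, parser, **kwargs).select("td.bborder")
        if cells:
            return cells
    return []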
get_child_urls.py (project: misc, author: yuhui-lin)
def get_child_urls(main_page, max_child=20):
    """retrieve urls from giving html page.
    args:
        main_page(str): html file.
        max_child(int): max number of return urls.
    return:
        list of url string.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page, "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
sciencedirect_collect.py (project: scientific-paper-summarisation, author: EdCo95)
def collectArticles(urlstr):
    # get article PIIs
    retl = []
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                #print(link["href"])
                retl.append(link["href"])
            linkcnt += 1
    return retl
gd_scrape.py (project: pitchfx-data-scraper, author: whazell)
def get_links ( url ):
    '''
        Get all the links off of the page:
        gd2.mlb.com/components/game/mlb/year/month/day/

        And finds the links for the games that have the following 
        format:

        gid_year_mm_dd_team1mlb_team2mlb   
    '''
    f = get_page (url)
    if f==False: return False

    # Compile the regex to match links outside of the loop for 
    # performance
    links = []
    regex = re.compile("\"gid_(.*?)\"", re.IGNORECASE)

    # Find all links on page and if they are links to games then add to list
    for link in BeautifulSoup(f, "lxml",parse_only=SoupStrainer('a', href=True) ):
        match = regex.findall(str(link))
        if match:
           links.extend(match)

    return links
wget.py (project: isar, author: ilbers)
def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
        """
        Return the latest version of a package inside a given directory path
        If error or no version, return ""
        """
        valid = 0
        version = ['', '', '']

        bb.debug(3, "VersionURL: %s" % (url))
        soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            bb.debug(3, "*** %s NO SOUP" % (url))
            return ""

        for line in soup.find_all('a', href=True):
            bb.debug(3, "line['href'] = '%s'" % (line['href']))
            bb.debug(3, "line = '%s'" % (str(line)))

            newver = self._parse_path(package_regex, line['href'])
            if not newver:
                newver = self._parse_path(package_regex, str(line))

            if newver:
                bb.debug(3, "Upstream version found: %s" % newver[1])
                if valid == 0:
                    version = newver
                    valid = 1
                elif self._vercmp(version, newver) < 0:
                    version = newver

        pupver = re.sub('_', '.', version[1])

        bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
                (package, pupver or "N/A", current_version[1]))

        if valid:
            return pupver

        return ""
scraper.py (project: play-scraper, author: danieliu)
def _parse_multiple_apps(self, list_response):
        """Extracts app ids from a list's Response object, sends GET requests to
        each app, parses detailed info and returns all apps in a list.

        :param list_response: the Response object from a list request
        :return: a list of app dictionaries
        """
        list_strainer = SoupStrainer('span', {'class': 'preview-overlay-container'})
        soup = BeautifulSoup(list_response.content, 'lxml', parse_only=list_strainer)

        app_ids = [x.attrs['data-docid'] for x in soup.select('span.preview-overlay-container')]
        responses = multi_app_request(app_ids)

        app_strainer = SoupStrainer('div', {'class': 'main-content'})
        apps = []
        errors = []
        for i, r in enumerate(responses):
            if r is not None and r.status_code == requests.codes.ok:
                soup = BeautifulSoup(r.content, 'lxml', parse_only=app_strainer)
                apps.append(self._parse_app_details(soup))
            else:
                errors.append(app_ids[i])

        if errors:
            self._log.error("There was an error parsing the following apps: {errors}.".format(
                errors=", ".join(errors)))

        return apps
utils.py (project: play-scraper, author: danieliu)
def get_categories():
    """
    Sends a GET request to the front page (base url of the app store),
    parses and returns a list of all available categories.

    Note: May contain some promotions, e.g. "Popular Characters"
    """
    categories = {}
    strainer = SoupStrainer('a', {'class': 'child-submenu-link'})

    response = send_request('GET', s.BASE_URL)
    soup = BeautifulSoup(response.content, 'lxml', parse_only=strainer)
    category_links = soup.select('a.child-submenu-link')

    age = '?age='

    for cat in category_links:
        url = urljoin(s.BASE_URL, cat.attrs['href'])
        category_id = url.split('/')[-1]
        name = cat.string

        if age in category_id:
            category_id = 'FAMILY'
            url = url.split('?')[0]
            name = 'Family'

        if category_id not in categories:
            categories[category_id] = {
                'name': name,
                'url': url,
                'category_id': category_id}

    return categories
solr_index_mapper.py (project: -PunkScan, author: swordli)
def resolve_title(url):

    #grab the first title if there's more than one
    try:
        pnk_log(mod, "Requesting %s" % url)
        r = pnk_request(url)
        response_text = r.text

        for title in BeautifulSoup(response_text, 'html.parser', parse_only=SoupStrainer('title')):
            return title.text.strip()
    except:
        return None
getSingle.py (project: LazyBook, author: cfifty)
def getSingle(s):

    # load in your friends dictionary
    structDir = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'Structs'))
    with open(structDir + '/friendsDict.pkl','rb') as input:
        friendsDict = pickle.load(input)

    # -------------- Now, let's compile a list of friends who are single ------------
    Single = []
    iteration = 1
    relatStrainer = SoupStrainer(text=re.compile("Single</div>"))
    relatExt = "/about?section=relationship&pnref=about"
    relatExtBeta = "&sk=about&section=relationship"
    fbook = "https://facebook.com"


    for friend in friendsDict: 
        if (friendsDict[friend].find("php") != -1):
            relatURL = fbook + friendsDict[friend] + relatExtBeta
        else:
            relatURL = fbook + friendsDict[friend] + relatExt

        relatInfo = s.get(relatURL)
        soup = BeautifulSoup(relatInfo.text,"lxml",parse_only=relatStrainer)
        comment = soup.find(text=re.compile("Single</div>"))
        if (comment != None):
            # since some names have special characters, we need to strip these
            temp = friend.encode('utf-8').strip()
            Single.append(temp + "\n")
        print friend + " is single = " + str(comment != None)
        # print iteration
        iteration += 1

    # print Single

    singleStr = ''.join(Single)

    with open(structDir + "/single.txt","wb") as f: 
        f.write(singleStr)
getFriends.py (project: LazyBook, author: cfifty)
def getFriendsList(friends, part,s):
    ID = vanity
    if(part == 1):
        index = 0;
    elif(part == 2): 
        index = 24;
    elif(part == 3):
        index = 24+36
    else:
        index = 24+36+36

    # scrape their total number of friends
    temp = s.get('https://www.facebook.com/' + ID + '/friends')
    soup = BeautifulSoup(temp.text,"lxml")
    strainer = SoupStrainer('a',href=re.compile("fref=fr_tab"))

    # iterate over the entire friends list and pull out the relevant information from
    # the html docs that display 24 or 36 friends each
    while (index < (numFriends)): 
        if index == 0:
            temp = s.get('https://m.facebook.com/' + ID + '/friends')
            soup = BeautifulSoup(temp.text,"lxml",parse_only=strainer)
            tempLst = soup.findAll('a')
            for item in tempLst:
                friends.append(item)
            index = 24 + 36*3
        else: 
            temp = (s.get('https://m.facebook.com/' + ID + '/friends?startindex='
                + str(index)))
            soup = BeautifulSoup(temp.text,"lxml",parse_only=strainer)
            tempLst = soup.findAll('a')
            for item in tempLst:
                friends.append(item)
            index = index + 36*4
    return
html.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, *args, **kwargs):
        super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
                                                                **kwargs)
        from bs4 import SoupStrainer
        self._strainer = SoupStrainer('table')
get_training_data.py (project: BachMaker, author: anbrjohn)
def scrape(webpage, extension=".mid"):
    # Get all the files of a given extension from a webpage
    http = httplib2.Http()
    status, response = http.request(webpage)
    files = []
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        if link.has_attr('href'):
            linkname = link['href']
            if linkname[-len(extension):] == extension:
                files += [linkname]
    return files
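A note on the call above: parseOnlyThese is the legacy Beautiful Soup 3 keyword, which bs4 still accepts but flags as deprecated in favour of parse_only. The same filtering with the current keyword name, as a self-contained sketch with illustrative markup:

from bs4 import BeautifulSoup, SoupStrainer

html = '<a href="song.mid">song</a><a href="readme.txt">readme</a>'  # illustrative markup

# Same filtering as above, written with the current bs4 keyword name.
for link in BeautifulSoup(html, "html.parser", parse_only=SoupStrainer("a")):
    if link.has_attr("href") and link["href"].endswith(".mid"):
        print(link["href"])  # song.mid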
subhd.py (project: AlfredWorkflow-DYHub, author: Jeff2Ma)
def get_film_info_subhd():
    items = []
    target_url = 'http://subhd.com'
    content = urllib2.urlopen(target_url).read().decode('utf-8')
    only_hotl_tags = SoupStrainer(class_='hotl')
    soup = BeautifulSoup(content, "html.parser", parse_only=only_hotl_tags)
    i = 0
    for link in soup.find_all('a', limit=7):
        link_url = target_url + link.get('href')
        link_img = target_url + link.findChildren('img')[0].get('src')
        cover_img = 'http://img3.doubanio.com/view/movie_poster_cover/spst/public/' + link_img.split('/sub/poster/l/')[
            1]
        link_title = link.findChildren('img')[0].get('title')

        save_path = os.path.abspath("./icons/icon-s")
        imgData = urllib2.urlopen(cover_img).read()
        fileName = save_path + str(i) + '.jpg'
        output = open(fileName, 'wb+')
        output.write(imgData)
        output.close()

        json_item = dict(title=link_title, subtitle='', arg=link_url, icon='icons/icon-s' + str(i) + '.jpg')
        items.append(json_item)
        i = i + 1

    return generate_xml(items)
solr_index_mapper.py (project: punkspider, author: aiwennba)
def resolve_title(url):

    #grab the first title if there's more than one
    try:
        pnk_log(mod, "Requesting %s" % url)
        r = pnk_request(url)
        response_text = r.text

        for title in BeautifulSoup(response_text, 'html.parser', parse_only=SoupStrainer('title')):
            return title.text.strip()
    except:
        return None
Crack.py (project: VulScript, author: y1ng1996)
def __get_version(self):
        '''
        get jenkins version
        :return:
        '''
        try:
            html = urllib2.urlopen(self.url + '/login?from=%2F').read()
            links = SoupStrainer('a' ,href = re.compile(VERSION_TAG))
            version_text = BeautifulSoup(html, "html.parser", parse_only= links)
            if version_text.text != "":
                color_output("[+]....jenkins version is %s" % version_text.text)
                version_re = re.findall(u"ver.\s(.*)" ,version_text.text)
                if len(version_re) != 0:
                    if version_re[0][0:4] >= self.check_version:
                        self.user_link = ASYNCH_PEOPEL_PERFIX
                    else:
                        self.user_link = PEOPLE_PERFIX
            else:
                color_output("[-]....can't get jenkins version!")
                sys.exit()
        except urllib2.URLError,e:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
        except Exception,e:
            color_output("[-]....get version error:%s" % str(e))
            sys.exit()
GenerateCategoryMC.py (project: Wakapedia, author: ACMProjectsTeam3)
def scrape(url):
      ### opens url so it's like a file
    try:
        link = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        return ''

    soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=SoupStrainer('p'))

    alltxt = ''
      ### iterate thru the <p> tags
    for para in soup.find_all('p'):
        alltxt = alltxt + para.get_text() + ' '

    return alltxt
HTMLscraper.py (project: Wakapedia, author: ACMProjectsTeam3)
def scrape(bun):
    ### opens url so it's like a file
  link = urllib.request.urlopen(bun.URL)

  soup = None
    ### flag for retrieving categories (or not)
  if bun.categories:
    soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml')
  else:
    p_tags = SoupStrainer('p')
    soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=p_tags)

    ### dictionary of paragraphs
  doc = {}
    ### add token and count to replace paragraphs in HTML
  token = 'Waka'
  count = 0

    ### all the paragraph texts in one string
  alltxt = ''
    ### iterate thru the <p> tags
  for para in soup.find_all('p'):
      ### put raw text in dictionary
    doc[token+str(count)] = para.get_text()
    alltxt = alltxt + para.get_text() + ' '
      ### replace <p> contents with a token
    para.string = token + str(count)
    count+=1

    ### get the list of categories
  cats = []
  if bun.categories:
    for cat in soup.find('div', {'id': 'catlinks'}).find('ul').findAll('li'):
      cats.append('https://en.wikipedia.org' + cat.find('a')['href'])

  for css in soup.find_all('link', rel='stylesheet'):
    css['href'] = '//en.wikipedia.org/' + css['href']

  for js in soup.find_all('script', src=re.compile('.*')):
    js['src'] = '//en.wikipedia.org/' + js['src']

    ### update stuff in Bundle
  bun.paragraphs = doc
  bun.text = alltxt
  bun.html = str(soup.encode('ascii', 'xmlcharrefreplace').decode('utf-8'))
  bun.categories = cats

  return bun
jenkins.py (project: hacker-scripts, author: restran)
def __get_version(self):
        '''
        get jenkins version
        :return:
        '''
        try:
            html = urllib2.urlopen(self.url + '/login?from=%2F').read()
            links = SoupStrainer('a' ,href = re.compile(VERSION_TAG))
            version_text = BeautifulSoup(html, "html.parser", parse_only= links)
            if version_text.text != "":
                color_output("[+]....jenkins version is %s" % version_text.text)
                version_re = re.findall(u"ver.\s(.*)" ,version_text.text)
                if len(version_re) != 0:
                    if version_re[0][0:4] >= self.check_version:
                        self.user_link = ASYNCH_PEOPEL_PERFIX
                    else:
                        self.user_link = PEOPLE_PERFIX
            else:
                color_output("[-]....can't get jenkins version!")
                sys.exit()
        except urllib2.URLError,e:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
        except Exception,e:
            color_output("[-]....get version error:%s" % str(e))
            sys.exit()
parser.py (project: quality-content-synthesizer, author: pratulyab)
def __init__(self, text_blob, *args, **kwargs):
        TextParser.text_strainer = SoupStrainer(TextParser.strain_through)
        self.soup = BeautifulSoup(text_blob, 'html.parser', parse_only=TextParser.text_strainer)
        self.text = self._extract_text()
scrape_web.py (project: kenya-news-scrapper, author: alfiepoleon)
def get_tuko():
    tuko_url = 'https://www.tuko.co.ke'
    if check_connection(tuko_url):
        tuko = requests.get(tuko_url)
        soup = BeautifulSoup(tuko.text, 'lxml', parse_only=SoupStrainer('a'))
        tuko = []
        for link in soup.select('a.news__link', limit=6):
            news_title = '{}({})'.format(link.get_text(), link.get('href'))
            tuko_link = requests.get(link.get('href'))
            soup_link = BeautifulSoup(tuko_link.text, 'lxml', parse_only=SoupStrainer(['p', 'meta', 'img']))
            try:
                article_date = soup_link.find("meta", itemprop="datePublished")['content']
            except (TypeError, ValueError):
                print('Tuko: No article date meta')
                continue
            image = ''
            try:
                image = soup_link.find("meta", property="og:image")['content']
            except (TypeError, ValueError):
                try:
                    image = soup_link.find('img', class_='article-image__picture')['src']
                except (TypeError, ValueError):
                    print('Tuko: No image found')
            news_dict = {
                'category': 'news',
                'source': 'tuko',
                'title': link.get_text(),
                'link': link.get('href'),
                'image': image,
                'content': [link_inner.get_text().strip(' ,.-') for link_inner in
                            soup_link.select('p.align-left > strong', limit=3) if not
                            link_inner.get_text().startswith('READ ALSO')],
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            collection.update({'link': link.get('href')}, news_dict, upsert=True)
            tuko.append(news_dict)
        return tuko
scrape_web.py (project: kenya-news-scrapper, author: alfiepoleon)
def get_capital():
    capital_url = 'http://www.capitalfm.co.ke/news/{}/{:02}'.format(today.year, today.month)
    if check_connection(capital_url):
        capital = requests.get(capital_url)
        soup = BeautifulSoup(capital.text, 'lxml', parse_only=SoupStrainer('div'))
        capital = []
        for article in soup.select('div.entry-information'):
            article_link = article.a
            link = article_link['href']
            title = article_link.get_text()
            capital_link = requests.get(link)
            soup_link = BeautifulSoup(capital_link.text, 'lxml', parse_only=SoupStrainer(['meta', 'img', 'div']))
            article_date = soup_link.find("meta", property="article:published_time")['content']
            image = ''
            try:
                image = soup_link.find("meta", property="og:image")['content']
            except (TypeError, ValueError):
                try:
                    image = soup_link.find('img', class_='size-full')['src']
                except (TypeError, ValueError):
                    print('Capital: No image found')

            try:
                content = get_content(soup_link, 'entry-content').split('\u2013')[1].strip()
            except IndexError:
                content = get_content(soup_link, 'entry-content').strip()
            news_dict = {
                'category': 'news',
                'source': 'capital',
                'title': title,
                'link': link,
                'image': image,
                'content': content,
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            collection.update({'link': link}, news_dict, upsert=True)
            capital.append(news_dict)
        return capital

