Example source code for Python's parse()

scraping.py (project: ovcurriculum, author: etandel)
def get_sections(curriculum_code):
    r = requests.get(BASE_URL.format(curriculum_code))
    r.raise_for_status()
    tree = parse_html(BytesIO(r.content))
    return list(map(build_section,
                    tree.xpath(TABLES_XPATH)[RELEVANT_SECTIONS]))
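The excerpt leans on module-level names that are not shown. A minimal sketch of what they might look like; every definition below (BASE_URL, TABLES_XPATH, RELEVANT_SECTIONS, parse_html, build_section) is a hypothetical stand-in inferred from the call sites, not the project's actual code.

from io import BytesIO

import requests
from lxml.html import parse as parse_html

# Hypothetical stand-ins inferred from how get_sections() uses them.
BASE_URL = 'https://example.edu/curriculum/{}'  # assumed URL template
TABLES_XPATH = '//table'                        # assumed selector for the data tables
RELEVANT_SECTIONS = slice(1, -1)                # assumed slice picking the useful tables

def build_section(table):
    """Hypothetical: turn one <table> element into a section record."""
    return {'title': table.xpath('string(.//caption)')}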
jsonschema_role.py (project: deb-python-jsonschema, author: openstack)
def fetch_or_load(spec_path):
    """
    Fetch a new specification or use the cache if it's current.

    :argument spec_path: the path to a cached specification

    """

    headers = {}

    try:
        modified = datetime.utcfromtimestamp(os.path.getmtime(spec_path))
        # HTTP dates use a 24-hour clock and the fixed token "GMT" (RFC 7231).
        date = modified.strftime("%a, %d %b %Y %H:%M:%S GMT")
        headers["If-Modified-Since"] = date
    except OSError as error:
        if error.errno != errno.ENOENT:
            raise

    # ``urllib`` is assumed to be imported under a compatibility alias
    # (``urllib2`` on Python 2, ``urllib.request`` on Python 3).
    request = urllib.Request(VALIDATION_SPEC, headers=headers)
    response = urllib.urlopen(request)

    if response.code == 200:
        with open(spec_path, "w+b") as spec:
            spec.writelines(response)
            spec.seek(0)
            return html.parse(spec)

    with open(spec_path) as spec:
        return html.parse(spec)
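A hedged usage sketch: on HTTP 200 the spec is re-downloaded and written to spec_path; otherwise the cached copy on disk is parsed. VALIDATION_SPEC is assumed to already name the spec's URL, and the path below is illustrative.

# Hypothetical call; returns an lxml ElementTree either way.
spec = fetch_or_load('/tmp/jsonschema-spec.html')
print(spec.getroot().tag)  # 'html'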
get_cook_data.py (project: partytime, author: sunlightlabs)
def get_latest_url(list_url):
    doc = parse(list_url).getroot()
    # The first link following the page's <h1> points at the latest report.
    link = doc.cssselect('h1')[0].getnext().cssselect('a')[0]
    return 'http://www.cookpolitical.com%s' % link.get('href')
get_cook_data.py (project: partytime, author: sunlightlabs)
def get_senate_ratings():
    url = get_latest_url('http://www.cookpolitical.com/node/4060')
    doc = parse(url).getroot()

    good_tds = []

    for td in doc.cssselect('td'):
        d = dict(td.items())
        if d.get('width') != '92':
            continue
        data = [x for x in list(td.itertext()) if x.strip()]
        if len(data) == 1:
            continue

        rating = re.sub(r' \(.*$', '', data[0]) \
                .lower() \
                .replace(' ', '_') \
                .replace('toss_up', 'tossup')

        data = data[1:]

        for race in data:
            state = re.search(r'[A-Z]{2}', race).group()
            district = ''
            body = 'S'

            cr, created = CookRating.objects.get_or_create(body=body,
                                           state=state,
                                           district=district,
                                           rating=rating)
            cr.save()
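A quick worked example of the rating-normalization chain above; the label text is illustrative.

import re

label = 'Toss Up (5 seats)'
rating = re.sub(r' \(.*$', '', label) \
        .lower() \
        .replace(' ', '_') \
        .replace('toss_up', 'tossup')
print(rating)  # 'tossup'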
get_cook_data.py (project: partytime, author: sunlightlabs)
def get_house_ratings():
    url = get_latest_url('http://www.cookpolitical.com/node/4056')
    doc = parse(url).getroot()

    tables = doc.cssselect('table.nestedTable')

    data = {}

    (data['likely_dem'],
     data['lean_dem'],
     data['dem_tossup'],
     data['gop_tossup'],
     data['lean_gop'],
     data['likely_gop']) = tables

    candidate_data = {}

    for key, table in data.items():
        rows = table.cssselect('tr')[1:]
        for row in rows:
            district, incumbent, score = list(row.itertext())[::2]
            rating = key
            state, district = district.split('-')
            body = 'H'

            cr, created = CookRating.objects.get_or_create(body=body,
                                           state=state,
                                           district=district,
                                           rating=rating)
            cr.save()
elpais_spider.py (project: el_pais_editoriales, author: rinze)
def process_editorial_list(url):
    """
    Process a page that contains a list of editorials.
    Returns:
        - A list of URLs to individual editorial articles.
        - The URL to the next editorial list.
    """
    content = urllib2.urlopen(url)
    tree = html.parse(content)
    content.close()
    next_edlist = get_next_edlist(tree)
    artlist = get_edarticles(tree)

    return (artlist, next_edlist)
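A hedged sketch of the driver loop this function pairs with, assuming get_next_edlist() returns None on the last listing page; START_URL is a placeholder, not the spider's real start address.

START_URL = 'http://elpais.com/agr/editoriales/a/'  # assumed start page

url = START_URL
all_articles = []
while url:
    articles, url = process_editorial_list(url)
    all_articles.extend(articles)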
xpath.py (project: qal, author: OptimalBPM)
def _structure_init(self, _dataset):
        """Initializes the XML structure that data is to be applied to."""
        print("XpathDataset._structure_init")
        super(XpathDataset, self)._structure_init(_dataset)

        # Parse structural information out of the rows XPath.
        _root_node_name, self._structure_row_node_name, _parent_xpath = self._structure_parse_root_path(self.rows_xpath)

        # Is the structure already loaded?
        if self._structure_row_node_parent is None:

            # If not, try to load the file, or create it.
            import os

            if os.path.exists(make_path_absolute(self.filename, self._base_path)):

                try:
                    self.load(_add_node_ref=True)
                except Exception as e:
                    raise Exception("XpathDataset.save - error parsing " + self.xpath_data_format + " file : " + str(e))
            else:
                # Create a tree whose root node is named from the first level of the rows XPath.

                if _root_node_name != "":
                    # noinspection PyUnusedLocal
                    if self.encoding:
                        _encoding = self.encoding
                    else:
                        _encoding = "UTF-8"
                    # TODO: Check why this is done, _tree isn't used
                    # noinspection PyUnusedLocal
                    _tree = etree.parse(io.StringIO("<?xml version='1.0' ?>\n<" + _root_node_name + "/>"))
                else:
                    raise Exception("XpathDataset.save - rows_xpath(" + str(
                        self.rows_xpath) + ") must be absolute and have at least the name of the root node. " +
                                           "Example: \"/root_node\" ")

        # Is the structure there yet? It could be an XML file with only a top node.
        if self._structure_row_node_parent is None:
            # If not existing, create a node structure up to the parent or the row nodes
            # from the information in the xpath.
            self._structure_top_node = self._structure_create_xpath_nodes(self._structure_top_node, self.rows_xpath)
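For context, the one-node-tree trick used above: etree.parse() accepts any file-like object, so wrapping a root-only document in StringIO yields a tree containing just that root.

import io
from lxml import etree

# Parse a document consisting only of a declaration and an empty root node.
tree = etree.parse(io.StringIO("<?xml version='1.0' ?>\n<root_node/>"))
print(etree.tostring(tree.getroot()))  # b'<root_node/>'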
html.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        parser = HTMLParser(recover=False, encoding=self.encoding)

        try:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)

            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError):
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                # a URL, but possibly with a scheme lxml cannot handle
                scheme = parse_url(self.io).scheme
                if scheme not in _valid_schemes:
                    # lxml can't parse it
                    msg = ('%r is not a valid url scheme, valid schemes are '
                           '%s') % (scheme, _valid_schemes)
                    raise ValueError(msg)
                else:
                    # something else happened: maybe a faulty connection
                    raise
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
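For context, the two lxml.html entry points the fallback logic switches between: parse() takes a filename, URL, or file-like object and returns an ElementTree, while fromstring() takes a string of markup and returns an Element directly. A small self-contained demo:

from lxml.html import fromstring

root = fromstring('<html><body><p>hi</p></body></html>')
print(root.xpath('//p/text()'))  # ['hi']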

