def get_sections(curriculum_code):
    r = requests.get(BASE_URL.format(curriculum_code))
    r.raise_for_status()
    tree = parse_html(BytesIO(r.content))
    return list(map(build_section,
                    tree.xpath(TABLES_XPATH)[RELEVANT_SECTIONS]))
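The snippet above leans on module-level names that are not shown (BASE_URL, TABLES_XPATH, RELEVANT_SECTIONS, parse_html, build_section). A minimal sketch of how those pieces could be defined with requests and lxml, purely to make the example self-contained; every value below is an assumption, not the original project's:

import requests
from io import BytesIO
from lxml import html

# Illustrative stand-ins only -- the real project defines its own values.
BASE_URL = "https://example.edu/curricula/{}"   # hypothetical endpoint
TABLES_XPATH = "//table"                        # every table on the page
RELEVANT_SECTIONS = slice(1, None)              # e.g. skip a leading header table

def parse_html(fileobj):
    # lxml.html.parse accepts a file-like object and returns an ElementTree.
    return html.parse(fileobj)

def build_section(table):
    # Flatten one <table> element into rows of stripped cell text.
    return [[cell.text_content().strip() for cell in row.xpath("./td")]
            for row in table.xpath(".//tr")]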
def fetch_or_load(spec_path):
    """
    Fetch a new specification or use the cache if it's current.

    :argument spec_path: the path to a cached specification
    """
    headers = {}

    try:
        modified = datetime.utcfromtimestamp(os.path.getmtime(spec_path))
        # If-Modified-Since must be an RFC 1123 date: 24-hour clock, GMT.
        date = modified.strftime("%a, %d %b %Y %H:%M:%S GMT")
        headers["If-Modified-Since"] = date
    except OSError as error:
        if error.errno != errno.ENOENT:
            raise

    request = urllib.Request(VALIDATION_SPEC, headers=headers)
    response = urllib.urlopen(request)

    if response.code == 200:
        with open(spec_path, "w+b") as spec:
            spec.writelines(response)
            spec.seek(0)
            return html.parse(spec)

    with open(spec_path) as spec:
        return html.parse(spec)
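The excerpt does not show its imports or the VALIDATION_SPEC constant. A reconstructed context, under the assumption that urllib here is an alias for Python 3's urllib.request, followed by a typical call site:

# Reconstructed context -- an assumption, not part of the original excerpt.
import errno
import os
from datetime import datetime

from lxml import html
from urllib import request as urllib     # makes urllib.Request / urllib.urlopen resolve

VALIDATION_SPEC = "https://html.spec.whatwg.org/"   # placeholder URL

# The caller just asks for the parsed tree and queries it with XPath.
spec = fetch_or_load("/tmp/validation-spec.html")
headings = spec.xpath("//h2/text()")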
def get_latest_url(list_url):
    doc = parse(list_url).getroot()
    return 'http://www.cookpolitical.com%s' % (
        doc.cssselect('h1')[0].getnext().cssselect('a')[0].values()[0])
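The .values()[0] call returns whatever attribute happens to be listed first on that <a> element. If the attribute being targeted is href (which the surrounding code suggests, though the excerpt does not confirm it), the intent reads more clearly as:

# Equivalent only when href is the attribute the scraper actually wants.
link = doc.cssselect('h1')[0].getnext().cssselect('a')[0]
latest_url = 'http://www.cookpolitical.com%s' % link.get('href')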
def get_senate_ratings():
    url = get_latest_url('http://www.cookpolitical.com/node/4060')
    doc = parse(url).getroot()
    good_tds = []
    for td in doc.cssselect('td'):
        d = dict(td.items())
        if 'width' not in d or d['width'] != '92':
            continue
        data = [x for x in list(td.itertext()) if x.strip()]
        if len(data) == 1:
            continue
        rating = re.sub(r' \(.*$', '', data[0]) \
            .lower() \
            .replace(' ', '_') \
            .replace('toss_up', 'tossup')
        data = data[1:]
        for race in data:
            state = re.search(r'[A-Z]{2}', race).group()
            district = ''
            body = 'S'
            cr, created = CookRating.objects.get_or_create(body=body,
                                                           state=state,
                                                           district=district,
                                                           rating=rating)
            cr.save()
def get_house_ratings():
    url = get_latest_url('http://www.cookpolitical.com/node/4056')
    doc = parse(url).getroot()
    tables = doc.cssselect('table.nestedTable')
    data = {}
    (data['likely_dem'],
     data['lean_dem'],
     data['dem_tossup'],
     data['gop_tossup'],
     data['lean_gop'],
     data['likely_gop']) = tables
    candidate_data = {}
    for key in data.keys():
        rows = data[key].cssselect('tr')[1:]
        for row in rows:
            district, incumbent, score = list(row.itertext())[::2]
            rating = key
            state, district = district.split('-')
            body = 'H'
            cr, created = CookRating.objects.get_or_create(body=body,
                                                           state=state,
                                                           district=district,
                                                           rating=rating)
            cr.save()
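Both Cook ratings scrapers lean on Element.itertext() to pull the visible strings out of table cells. A standalone sketch of that pattern on a made-up fragment (the real pages may be shaped differently):

from lxml import html

# Made-up fragment mirroring the shape of a Cook ratings table row.
doc = html.fromstring(
    "<table><tr><td>AZ-01</td><td> </td>"
    "<td>Kirkpatrick</td><td> </td><td>D+1</td></tr></table>"
)
row = doc.cssselect('tr')[0]
# itertext() yields every text node under the row in document order; the
# [::2] slice keeps the populated cells and skips the whitespace-only spacers.
district, incumbent, score = list(row.itertext())[::2]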
def process_editorial_list(url):
    """
    Process a page that contains a list of editorials.

    Returns:
        - A list of URLs to individual editorial articles.
        - The URL of the next editorial list.
    """
    content = urllib2.urlopen(url)
    tree = html.parse(content)
    content.close()
    next_edlist = get_next_edlist(tree)
    artlist = get_edarticles(tree)
    return (artlist, next_edlist)
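get_next_edlist and get_edarticles are not shown in this excerpt. A rough sketch of what they might look like; the XPath selectors below are guesses, since the real ones depend on the site's markup:

# Hypothetical helpers -- the selectors are assumptions, not the original code.
def get_edarticles(tree):
    # Collect the href of every editorial headline link on the list page.
    return [a.get('href') for a in tree.xpath("//div[@class='editorial']//a[@href]")]

def get_next_edlist(tree):
    # Return the href of the "next page" link, or None when on the last page.
    hrefs = tree.xpath("//a[@rel='next']/@href")
    return hrefs[0] if hrefs else None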
def _structure_init(self, _dataset):
    """Initializes the XML structure that data is to be applied to."""
    print("XpathDataset._structure_init")
    super(XpathDataset, self)._structure_init(_dataset)

    # Parse the structural information out of the row XPath.
    _root_node_name, self._structure_row_node_name, _parent_xpath = \
        self._structure_parse_root_path(self.rows_xpath)

    # Is the structure already loaded?
    if self._structure_row_node_parent is None:
        # If not, try to load the file, or create it.
        import os
        if os.path.exists(make_path_absolute(self.filename, self._base_path)):
            try:
                self.load(_add_node_ref=True)
            except Exception as e:
                raise Exception("XpathDataset._structure_init - error parsing " +
                                self.xpath_data_format + " file: " + str(e))
        else:
            # Create a tree whose root node is the first node named in the XPath.
            if _root_node_name != "":
                # noinspection PyUnusedLocal
                if self.encoding:
                    _encoding = self.encoding
                else:
                    _encoding = "UTF-8"
                # TODO: Check why this is done, _tree isn't used
                # noinspection PyUnusedLocal
                _tree = etree.parse(io.StringIO("<?xml version='1.0' ?>\n<" + _root_node_name + "/>"))
            else:
                raise Exception("XpathDataset._structure_init - rows_xpath(" +
                                str(self.rows_xpath) +
                                ") must be absolute and have at least the name of the root node. " +
                                "Example: \"/root_node\"")

    # Is the structure there yet? It could be an XML file with only a top node.
    if self._structure_row_node_parent is None:
        # If not, create a node structure up to the parent of the row nodes
        # from the information in the XPath.
        self._structure_top_node = self._structure_create_xpath_nodes(self._structure_top_node, self.rows_xpath)
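The TODO above flags that _tree is parsed and then never used. If all that is needed is an empty document with a given root element, lxml can build one without round-tripping through a string. A small sketch, with "root_node" standing in for whatever _root_node_name holds:

import io
from lxml import etree

# Two equivalent ways to obtain an ElementTree whose root is <root_node/>:
_tree = etree.parse(io.StringIO("<?xml version='1.0' ?>\n<root_node/>"))
_tree = etree.ElementTree(etree.Element("root_node"))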
Source: html.py, from the project PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia).
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError

    parser = HTMLParser(recover=False, encoding=self.encoding)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # the input is a URL; reject schemes that lxml cannot fetch
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = ('%r is not a valid url scheme, valid schemes are '
                       '%s') % (scheme, _valid_schemes)
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
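Outside of pandas, the same parse-then-fall-back-to-fromstring idea can be used with lxml directly. A standalone sketch under the assumption that a strict (non-recovering) parser is wanted; the input string is made up:

from lxml.html import parse, fromstring, HTMLParser

strict = HTMLParser(recover=False)   # fail loudly instead of silently repairing

def load_html(source):
    # Treat the input as a filename/URL first; if it cannot be read,
    # fall back to treating it as a literal blob of markup.
    try:
        return parse(source, parser=strict).getroot()
    except (OSError, ValueError):
        return fromstring(source, parser=strict)

root = load_html("<html><body><p>hello</p></body></html>")
print(root.text_content())   # -> hello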