def xml(self, url, method='get', params=None, data=None):
    """
    Request ``url`` and hand the response body to ``self.to_xml``.

    :type url: str
    :param url: API address to request
    :type method: str
    :param method: HTTP method
    :type params: dict
    :param params: query parameters
    :type data: dict
    :param data: request body
    :rtype: html.HtmlElement
    :return: whatever ``self.to_xml`` builds from the response bytes, with
        the final response URL passed along as ``base_url``
    """
    response = self.req(url, method, params, data)
    # Pass the raw bytes (``content``) to the parser: per the original
    # author this is required to avoid encoding errors caused by utf8-mb4
    # characters.
    raw_body = response.content
    return self.to_xml(raw_body, base_url=response.url)
# Example source code for the Python class HtmlElement()
def _fragments_from_string(html_string):
    """
    Parse ``html_string`` into a list of top-level lxml fragments.

    A leading text node (text before the first tag) is handled specially:
    if it is not blank it is wrapped in a ``<p>`` element, otherwise it is
    discarded. Processing instructions are then stripped from the tree of
    the first fragment.

    :param html_string: HTML markup to parse
    :return: list of fragments; empty when nothing remains to return
    """
    parsed = html.fragments_fromstring(html_string)
    if len(parsed) == 0:
        return []

    leading = parsed[0]
    if not isinstance(leading, html.HtmlElement):
        # The first fragment is bare text that precedes the first tag.
        if leading.strip():
            if len(parsed) == 1:
                # Only text was supplied: re-parse it wrapped in a paragraph.
                return html.fragments_fromstring('<p>%s</p>' % leading)
            wrapper = _create_element('p')
            wrapper.text = leading
            # Put the new paragraph in front of the first real element, both
            # in the tree and in the returned list.
            parsed[1].addprevious(wrapper)
            parsed.insert(1, wrapper)
        # The text node itself is never returned.
        del parsed[0]
        if len(parsed) == 0:
            return []

    # remove xml instructions (if cleaning is disabled)
    for pi_node in parsed[0].xpath('//processing-instruction()'):
        pi_node.drop_tag()
    return parsed
def _sanitize_html_frags(html_value, valid_tags, valid_attributes):
    """
    Yield sanitized string pieces for each top-level fragment of ``html_value``.

    Text fragments are yielded unchanged. Element fragments are first passed
    to ``_sanitize_html_rec``; an element whose tag is in ``valid_tags`` has
    its attributes cleaned and is serialized whole, while any other element
    is replaced by its text, its serialized children and its tail (followed
    by a newline for ``p`` and ``br``).

    :param html_value: HTML markup to sanitize
    :param valid_tags: container of tag names that may be kept
    :param valid_attributes: passed through to the attribute-cleaning helpers
    :return: generator of strings
    """
    for node in html.fragments_fromstring(html_value):
        if not isinstance(node, html.HtmlElement):
            # Bare text between elements is passed through untouched.
            yield node
            continue

        _sanitize_html_rec(node, valid_tags, valid_attributes)

        if node.tag in valid_tags:
            _clean_attributes(node, valid_attributes)
            yield html.tostring(node, encoding="unicode")
            continue

        # Disallowed top-level tag: emit its content without the tag itself.
        if node.text:
            yield node.text
        for child in node:
            yield html.tostring(child, encoding="unicode")
        if node.tail:
            yield node.tail
        # Keep a line break where a paragraph or <br> used to be.
        if node.tag in ('p', 'br'):
            yield '\n'
def get_role(self, intervention):
    """
    Extract the speaker role from an intervention and remove its markup.

    Searches ``intervention`` for ``<span class="italic">`` descendants that
    have a text node consisting of one of ``self.loc['roles']`` (optionally
    followed by two capital letters in parentheses) surrounded only by
    whitespace, soft hyphens, dashes, question marks or dots. Every matching
    span is dropped from the tree, so ``intervention`` is modified in place.

    :param intervention: lxml element to search
    :return: ``(role, language)``. ``role`` is the cleaned, space-joined text
        of the matching spans, or ``None`` when no span matched. ``language``
        is the entry of ``self.langs`` found in the last matching span, or
        ``None`` when that span names none.
    """
    # Backslashes are doubled so the regex escapes reach the XPath engine
    # without relying on Python passing unknown escape sequences through
    # (deprecated). "\xad" is still the literal soft-hyphen character, so the
    # resulting string is identical to the previous one.
    role_xpath = (
        './/span[@class="italic"][text()[re:test(.,'
        '"^[\\s\xad\\-–?—\\.]*(?:{})[\\s\xad\\-–?\\.]*'
        '(?:\\([A-Z][A-Z]\\))?[\\s\xad\\-–?—\\.]*$", "m")]]'
    ).format('|'.join(self.loc['roles']))
    roles = intervention.xpath(role_xpath, namespaces=self.ns)
    if not roles:
        return None, None

    lang_pattern = r'.*({}).*'.format('|'.join(self.langs))
    texts = []
    i_lang = None
    for role in roles:
        is_text_result = isinstance(role, str)
        text = role if is_text_result else role.text
        if text is None:
            # The matching text node is not the span's leading text (e.g. it
            # follows a child element); there is nothing to collect here.
            lang = None
        else:
            texts.append(text)
            lang = re.match(lang_pattern, text)
        # The last matching span decides the language.
        i_lang = lang.group(1) if lang is not None else None
        if not is_text_result:
            # Remove the role marker (and its children) from the intervention.
            role.drop_tree()

    output = " ".join(texts)
    output = re.sub(r'\n', r' ', output)
    output = re.sub(r' +', r' ', output)
    # NOTE(review): \p{Lu} / \p{Ll} are rejected by the standard ``re``
    # module; presumably ``re`` is the third-party ``regex`` package imported
    # under that name - confirm against the file's imports.
    output = re.sub(r'\([\p{Lu}\&/\-–]+\)', r'', output)
    output = re.sub(r'(\p{Ll})[\s\.\xad–\-?—,\)]+\Z', r'\1', output)
    # Trim leftover punctuation from both ends.
    output = re.sub(r'\A[\xad\s\.—–\-?,\)\(]+', r'', output)
    output = re.sub(r'[\xad\s\.—–\-?,\)]+\Z', r'', output)
    return output, i_lang
def _create_element(element, text=None):
    """
    Build a standalone lxml HTML element (no document tree, body or parent).

    :param element: tag name for the new element
    :param text: optional text content; ignored when empty or ``None``
    :return: the new ``html.HtmlElement``
    """
    node = html.HtmlElement()
    node.tag = element
    if not text:
        return node
    node.text = text
    return node
def preprocess_media_tags(element):
    """
    Normalise list whitespace and rewrite media embeds on ``element`` in place.

    Anything that is not an ``html.HtmlElement`` is ignored. Otherwise:

    * ``ol`` / ``ul``: the text before the first child is cleared.
    * ``li``: the tail text is cleared.
    * ``iframe``: a ``src`` matching ``youtube_re`` or ``vimeo_re`` is
      rewritten to a ``/embed/youtube?url=...`` or ``/embed/vimeo?url=...``
      address and the iframe is wrapped with ``_wrap_figure`` unless it
      already has a ``figure`` ancestor. Any other iframe, including one
      without a ``src`` attribute, has its tag dropped.
    * ``blockquote`` with class ``twitter-tweet``: a link matching
      ``twitter_re`` gets a ``/embed/twitter?url=...`` iframe inserted before
      the blockquote and wrapped with ``_wrap_figure``, and the blockquote is
      then dropped.

    :param element: node to process
    :return: None
    """
    if not isinstance(element, html.HtmlElement):
        return

    tag = element.tag
    if tag in ['ol', 'ul']:
        # ignore any spaces between <ul> and <li>
        element.text = ''
    elif tag == 'li':
        # ignore spaces after </li>
        element.tail = ''
    elif tag == 'iframe':
        iframe_src = element.get('src')
        if iframe_src is None:
            # re.match() raises TypeError on None; an iframe without a src
            # cannot be a supported embed.
            youtube = vimeo = None
        else:
            youtube = re.match(youtube_re, iframe_src)
            vimeo = re.match(vimeo_re, iframe_src)

        if youtube or vimeo:
            element.text = ''  # ignore any legacy text
            if youtube:
                yt_id = urlparse(iframe_src).path.replace('/embed/', '')
                element.set('src', '/embed/youtube?url=' + quote_plus('https://www.youtube.com/watch?v=' + yt_id))
            elif vimeo:
                # presumably group 2 of vimeo_re captures the video id - verify
                element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2)))
            if not len(element.xpath('./ancestor::figure')):
                _wrap_figure(element)
        else:
            element.drop_tag()
    elif tag == 'blockquote' and element.get('class') == 'twitter-tweet':
        for tw_link in element.xpath('.//a[@href]'):
            tweet_url = tw_link.get('href')
            if twitter_re.match(tweet_url):
                twitter_frame = html.HtmlElement()
                twitter_frame.tag = 'iframe'
                twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tweet_url))
                element.addprevious(twitter_frame)
                _wrap_figure(twitter_frame)
                # NOTE(review): the blockquote is dropped only once a tweet
                # link has been embedded; nesting reconstructed - confirm.
                element.drop_tree()
def _get_html_tree(self) -> html.HtmlElement:
    """
    Parse the markup returned by ``self._get_html()`` into an element tree.

    The tree can be used to extract data with XPath expressions.

    :return: the element produced by ``html.fromstring`` for the page markup.
    """
    page_markup = self._get_html()
    return html.fromstring(page_markup)
def _clean_html(html_value, cleaner):
    """
    Yield the top-level fragments of ``html_value`` after cleaning.

    Element fragments are passed to ``cleaner`` (called for its side effects
    on the element) and then serialized to a string; text fragments are
    yielded unchanged.

    :param html_value: HTML markup to process
    :param cleaner: callable taking a single ``html.HtmlElement``
    :return: generator of strings
    """
    for piece in html.fragments_fromstring(html_value):
        if not isinstance(piece, html.HtmlElement):
            # Bare text needs no cleaning.
            yield piece
            continue
        cleaner(piece)
        yield html.tostring(piece, encoding="unicode")