def remove(self, item):
    """
    Delete item from self, like list.remove but working with id.

    The slot to delete is whatever self.index(item) reports.

    Example:

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for root, ind in dom.sail_with_root():
        if ind.name == 'b':
            root.remove(ind)

    print dom

    It should print.

    <a ></a>
    """

    del self[self.index(item)]
# Example source code for Python's feed() method
def take(self, *args):
    """
    Return the first object produced by self.match(*args), that is the
    first object whose attributes match (key0, value0),
    (key1, value1), ... .

    Returns None when self.match(*args) yields nothing.

    Example:

    data = '<a><b id="foo" size="1"></b></a>'
    html = Html()
    dom = html.feed(data)

    print dom.take(('id', 'foo'))
    print dom.take(('id', 'foo'), ('size', '2'))
    """

    # The builtin next() with a default runs on Python 2.6+ and on
    # Python 3; the generator method .next() exists on Python 2 only.
    return next(self.match(*args), None)
def walk_with_root(self):
    """
    Like walk but carries root.

    For every (root, ind) pair produced by self.sail_with_root() it
    yields a pair of triples:

    ((root, root.name, root.attr), (ind, ind.name, ind.attr))

    Example:

    html = Html()
    data = '<body><em>alpha</em></body>'
    dom = html.feed(data)

    for (root, rname, rattr), (ind, iname, iattr) in dom.walk_with_root():
        print root, rname, ind, iname
    """

    for parent, node in self.sail_with_root():
        parent_info = (parent, parent.name, parent.attr)
        node_info = (node, node.name, node.attr)
        yield (parent_info, node_info)
def __init__(self, data):
    """
    Create an object that holds characters.

    The instance is set up through Root.__init__ with DATA and the
    given characters are kept in self.data.

    Example:

    html = Html()
    data = '<body><em>alpha</em></body>'
    dom = html.feed(data)
    x = dom.fst('em')
    x.append(Data('\nbeta'))

    It outputs.

    <body ><em >alpha
    beta</em></body>
    """

    # Base initialisation comes first, then the payload is attached.
    Root.__init__(self, DATA)
    self.data = data
def feed(self, data):
    """
    Call self.reset() and then parse data with HTMLParser.feed.

    Nothing is returned.
    """

    # reset() always runs before the data is handed to the base parser.
    self.reset()
    HTMLParser.feed(self, data)
def feed(self, data):
    """
    Reset this parser, then pass data on to HTMLParser.feed.

    The method has no return value.
    """

    self.reset()  # runs on every call, before any parsing
    HTMLParser.feed(self, data)
def feed(self, chars):
    """
    Pass chars on to HTMLParser.feed and return its result.

    If self.phase is already self.TERMINATED or self.FOUND,
    self._terminate() is called before the data is handed on.
    """
    # [8]
    finished_phases = (self.TERMINATED, self.FOUND)
    if self.phase in finished_phases:
        self._terminate()

    return HTMLParser.feed(self, chars)
def findHTMLMeta(stream):
    """Look for a meta http-equiv tag with the YADIS header name.

    The stream is read in pieces of CHUNK_SIZE and each piece is fed
    to a YadisHTMLParser until the parser signals the outcome by
    raising ParseDone, fails with HTMLParseError, or the stream ends.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    parser = YadisHTMLParser()
    chunks = []

    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            # End of file
            break

        chunks.append(chunk)
        try:
            parser.feed(chunk)
        except HTMLParseError:
            # HTML parse error, so bail
            chunks.append(stream.read())
            break
        except ParseDone as why:
            # "except ... as" and .args work on Python 2.6+ and 3;
            # "except X, why" and why[0] are Python 2 only.
            uri = why.args[0]
            if uri is not None:
                return uri

            # Parse finished, but we may need the rest of the file
            chunks.append(stream.read())
            break

    content = ''.join(chunks)
    raise MetaNotFound(content)
def feed(self, in_html):
    """
    Parse in_html and return self.output.

    self.output is set to an empty string, HTMLParser.feed parses
    in_html, and whatever self.output holds afterwards is returned.
    """
    # Start each call from an empty output buffer.
    self.output = ""
    HTMLParser.feed(self, in_html)

    result = self.output
    return result
def html_to_md(h):
    """
    Feed h to a new MyHTMLParser and return the result of its feed().
    """
    return MyHTMLParser().feed(h)
def sail(self):
    """
    Return an iterator over the objects of the xml/html document.

    Every xml/html object is represented by a python class
    instance that inherits from Root.  For each item of self, the
    objects yielded by that item's own sail() come first and the
    item itself comes after them.

    Example:

    data = '<a> <b> </b> </a>'

    html = Html()
    dom = html.feed(data)

    for ind in dom.sail():
        print type(ind),',', ind.name

    It would output.

    <class 'ehp.Root'> , a
    <class 'ehp.Root'> , b
    """

    # self[:] is evaluated once, before the loop starts
    # (presumably a snapshot, so self may change while sailing).
    for child in self[:]:
        for descendant in child.sail():
            yield descendant
        yield child
def index(self, item):
    """
    Return the position of item in self, comparing with id
    (the "is" operator) instead of equality.

    Raises ValueError when no element of self is item.

    Example:

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for root, ind in dom.sail_with_root():
        print root.name, ind.name, root.index(ind)

    It would print.

    a b 0
    a b 1
     a 0

    The line where it appears ' a 0' corresponds to the
    outmost object. The outmost object is an instance of Root
    that contains all the other objects.
    """

    for position, candidate in enumerate(self):
        if candidate is item:
            return position

    raise ValueError
def find(self, name, *args):
    """
    Yield every object from self.sail() whose name equals name and
    whose attr agrees with each (key, value) pair given in args.

    Example 1:

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.find('b'):
        print ind

    It should print.

    <b ></b>
    <b ></b>

    Example 2.

    data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.find('p', ('style', 'color:green')):
        print ind

    Output.

    <p style="color:green" > beta.</p>
    """

    for node in self.sail():
        # An object is kept only when no requested pair disagrees.
        if node.name == name and not any(
                node.attr[key] != value for key, value in args):
            yield node
def find_with_root(self, name, *args):
    """
    Like Root.find but yields (root, ind) pairs, so each match comes
    together with its parent tag.

    The pairs are taken from self.sail_with_root(); a pair is kept
    when ind.name equals name and ind.attr agrees with each
    (key, value) pair given in args.

    Example:

    from ehp import *
    html = Html()
    dom = html.feed('''<body> <p> alpha </p> <p> beta </p> </body>''')

    for root, ind in dom.find_with_root('p'):
        root.remove(ind)

    print dom

    It would output.

    <body >   </body>
    """

    for parent, node in self.sail_with_root():
        if node.name == name and not any(
                node.attr[key] != value for key, value in args):
            yield (parent, node)
def match(self, *args):
    """
    Yield the objects from self.sail() whose attributes match
    (key0, value0), (key1, value1), ... .

    With no pairs given every object from self.sail() is yielded.

    Example:

    data = '<a size="1"><b size="1"></b></a>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.match(('size', '1')):
        print ind

    It would print.

    <b size="1" ></b>
    <a size="1" ><b size="1" ></b></a>
    """

    for node in self.sail():
        # Skip the object as soon as one pair disagrees.
        if not any(node.attr[key] != value for key, value in args):
            yield node
def match_with_root(self, *args):
    """
    Like Root.match but yields (root, ind) pairs, so each match comes
    with its parent tag.

    The pairs are taken from self.sail_with_root(); a pair is kept
    when ind.attr agrees with each (key, value) pair given in args.

    Example:

    from ehp import *

    html = Html()
    dom = html.feed('''<body> <p style="color:black"> xxx </p>
    <p style = "color:black"> mmm </p></body>''')

    for root, ind in dom.match_with_root(('style', 'color:black')):
        del ind.attr['style']

    item = dom.fst('body')
    item.attr['style'] = 'color:black'

    print dom

    Output.

    <body style="color:black" > <p > xxx </p>
    <p > mmm </p></body>
    """

    for parent, node in self.sail_with_root():
        if not any(node.attr[key] != value for key, value in args):
            yield (parent, node)
def text(self):
    """
    Return self.join('', DATA): the objects whose name matches DATA,
    joined with an empty delimiter.

    It basically gives a string corresponding to all the characters
    that are inside a xml/html tag.

    Example:

    html = Html()
    data = '<body><em>This is all the text.</em></body>'
    dom = html.feed(data)

    print dom.fst('em').text()

    It outputs.

    This is all the text.

    Notice that if you call text() on an item with
    children then it returns all the *printable* characters
    for that node.
    """

    joined = self.join('', DATA)
    return joined
def walk(self):
    """
    Like sail but carries name and attr.

    Each object from self.sail() is yielded as the triple
    (ind, ind.name, ind.attr).

    Example:

    html = Html()
    data = '<body> <em> This is all the text.</em></body>'
    dom = html.feed(data)

    for ind, name, attr in dom.walk():
        print 'TAG:', ind
        print 'NAME:', name
        print 'ATTR:', attr

    It should print.

    TAG:
    NAME: 1
    ATTR:
    TAG:  This is all the text.
    NAME: 1
    ATTR:
    TAG: <em > This is all the text.</em>
    NAME: em
    ATTR:
    TAG: <body > <em > This is all the text.</em></body>
    NAME: body
    ATTR:
    """

    for node in self.sail():
        triple = (node, node.name, node.attr)
        yield triple
def fromfile(self, filename):
    """
    It builds a structure from a file.

    The file named filename is opened in mode 'r', read completely,
    and its contents are handed to self.feed.

    Returns whatever self.feed(data) returns.
    """

    # The original opened the undefined name "fname"; the parameter
    # is "filename".  The with block also closes the file if read()
    # raises.
    with open(filename, 'r') as fd:
        data = fd.read()

    return self.feed(data)
def feed(self, chars):
    """
    Hand chars to HTMLParser.feed and return what it returns.

    self._terminate() is called first whenever self.phase is one of
    self.TERMINATED or self.FOUND.
    """
    # [8]
    stop_phases = (self.TERMINATED, self.FOUND)
    if self.phase in stop_phases:
        self._terminate()

    return HTMLParser.feed(self, chars)
def findHTMLMeta(stream):
    """Look for a meta http-equiv tag with the YADIS header name.

    The stream is read in pieces of CHUNK_SIZE and each piece is fed
    to a YadisHTMLParser until the parser signals the outcome by
    raising ParseDone, fails with HTMLParseError, or the stream ends.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    parser = YadisHTMLParser()
    chunks = []

    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            # End of file
            break

        chunks.append(chunk)
        try:
            parser.feed(chunk)
        except HTMLParseError:
            # HTML parse error, so bail
            chunks.append(stream.read())
            break
        except ParseDone as why:
            # "except ... as" and .args work on Python 2.6+ and 3;
            # "except X, why" and why[0] are Python 2 only.
            uri = why.args[0]
            if uri is not None:
                return uri

            # Parse finished, but we may need the rest of the file
            chunks.append(stream.read())
            break

    content = ''.join(chunks)
    raise MetaNotFound(content)
def feed(self, chars):
    """
    Forward chars to HTMLParser.feed, returning its result.

    Before forwarding, self._terminate() is called if self.phase is
    self.TERMINATED or self.FOUND.
    """
    # [8]
    closing_phases = (self.TERMINATED, self.FOUND)
    if self.phase in closing_phases:
        self._terminate()

    return HTMLParser.feed(self, chars)
def findHTMLMeta(stream):
    """Look for a meta http-equiv tag with the YADIS header name.

    The stream is read in pieces of CHUNK_SIZE and each piece is fed
    to a YadisHTMLParser until the parser signals the outcome by
    raising ParseDone, fails with HTMLParseError, or the stream ends.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    parser = YadisHTMLParser()
    chunks = []

    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            # End of file
            break

        chunks.append(chunk)
        try:
            parser.feed(chunk)
        except HTMLParseError:
            # HTML parse error, so bail
            chunks.append(stream.read())
            break
        except ParseDone as why:
            # "except ... as" and .args work on Python 2.6+ and 3;
            # "except X, why" and why[0] are Python 2 only.
            uri = why.args[0]
            if uri is not None:
                return uri

            # Parse finished, but we may need the rest of the file
            chunks.append(stream.read())
            break

    content = ''.join(chunks)
    raise MetaNotFound(content)
def join(self, delim, *args):
    """
    Join into one string all the objects from self.sail() whose name
    appears in args.

    delim is written in front of every joined object, the first one
    included, so with a non-empty delim the result starts with delim.
    An empty string is returned when nothing matches.

    Example 1:

    html = Html()
    data = '<a><b> This is cool. </b><b> That is. </b></a>'
    dom = html.feed(data)

    print dom.join('', 'b')
    print type(dom.join('b'))

    It would print.

    <b > This is cool. </b><b > That is. </b>
    <type 'str'>

    Example 2:

    html = Html()
    data = '<a><b> alpha</b><c>beta</c> <b>gamma</a>'
    dom = html.feed(data)

    print dom.join('', 'b', 'c')

    It would print.

    <b > alpha</b><c >beta</c><b >gamma</b>

    Example 3:

    html = Html()
    data = '<a><b>alpha</b><c>beta</c><b>gamma</a>'
    dom = html.feed(data)

    print dom.join('\n', DATA)

    It would print.

    alpha
    beta
    gamma
    """

    joined = ''

    for node in self.sail():
        if node.name not in args:
            continue
        joined = '%s%s%s' % (joined, delim, node)

    return joined
def fst(self, name, *args):
    """
    Return the first object produced by self.find(name, *args), that
    is the first object whose name matches.

    Returns None when self.find(name, *args) yields nothing.

    Example 1:

    html = Html()
    data = '<body> <em> Cool. </em></body>'
    dom = html.feed(data)

    print dom.fst('em')

    It outputs.

    <em > Cool. </em>

    Example 2:

    data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.find('p', ('style', 'color:green')):
        print ind

    print dom.fst('p', ('style', 'color:green'))
    print dom.fst_with_root('p', ('style', 'color:green'))

    Output:

    <p style="color:green" > beta.</p>
    <p style="color:green" > beta.</p>
    (<ehp.Tag object at 0xb7216c0c>, <ehp.Tag object at 0xb7216d24>)
    """

    # The builtin next() with a default runs on Python 2.6+ and on
    # Python 3; the generator method .next() exists on Python 2 only.
    return next(self.find(name, *args), None)