def parse(self):
"""Generator that parses the HTML source, yielding markup events.
:return: a markup event stream
:raises ParseError: if the HTML text is not well formed
"""
def _generate():
if self.encoding:
reader = codecs.getreader(self.encoding)
source = reader(self.source)
else:
source = self.source
try:
bufsize = 4 * 1024 # 4K
done = False
while 1:
while not done and len(self._queue) == 0:
data = source.read(bufsize)
if not data: # end of data
self.close()
done = True
else:
if not isinstance(data, unicode):
raise UnicodeError("source returned bytes, but no encoding specified")
self.feed(data)
for kind, data, pos in self._queue:
yield kind, data, pos
self._queue = []
if done:
open_tags = self._open_tags
open_tags.reverse()
for tag in open_tags:
yield END, QName(tag), pos
break
except html.HTMLParseError, e:
msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
raise ParseError(msg, self.filename, e.lineno, e.offset)
return Stream(_generate()).filter(_coalesce)
评论列表
文章目录