def feed(self, data):
    """Normalise ``data`` and push it through the SGML parser in one shot.

    Before parsing, the markup is cleaned up:

    * a ``<!`` that does not start a DOCTYPE, a comment (``<!--``) or a
      marked section (``<![``) is escaped to ``&lt;!``;
    * XML-style short tags (``<tag/>``) are rewritten by
      ``self._shorttag_replace``;
    * the numeric references ``&#39;`` and ``&#34;`` become literal quotes.

    The parser is fed the result and then closed.
    """
    # Escape stray "<!" so that it is not parsed as a declaration.
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    # Tag names exclude '<', '>' and whitespace; the earlier pattern
    # r'<(\S+?)\s*?/>' was a bad regexp for _shorttag_replace
    # (bug [ 1399464 ]).
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    try:
        # A missing ``bytes`` builtin, or ``bytes`` aliasing ``str``, sends
        # us to the NameError branch below; otherwise the encoding name is
        # tagged with a suffix and the data is left unencoded.
        bytes
        if bytes is str:
            raise NameError
        self.encoding = self.encoding + '_INVALID_PYTHON_3'
    except NameError:
        # Text (unicode) input is encoded to the declared encoding first.
        if self.encoding and isinstance(data, type(u'')):
            data = data.encode(self.encoding)
    sgmllib.SGMLParser.feed(self, data)
    sgmllib.SGMLParser.close(self)
# Example source code using the Python class SGMLParser()
def __init__(self, formatter, verbose=0):
    """Create an HTMLParser instance bound to ``formatter``.

    formatter -- the formatter instance associated with the parser
    verbose   -- passed straight through to sgmllib.SGMLParser.__init__
    """
    parser_base = sgmllib.SGMLParser
    parser_base.__init__(self, verbose)
    # Stored only after the base initialiser has run, as before.
    self.formatter = formatter
def reset(self):
    """Reset the base SGML parser, then clear this parser's own fields."""
    sgmllib.SGMLParser.reset(self)
    # Fields that start out as "nothing seen yet".
    self.savedata = self.title = self.base = self.anchor = None
    # Integer fields start at zero.
    self.isindex = self.nofill = 0
    # Each list attribute gets its own fresh list object.
    self.anchorlist = []
    self.list_stack = []
# ------ Methods used internally; some may be overridden
# --- Formatter interface, taking care of 'savedata' mode;
# shouldn't need to be overridden
def __init__(self):
    """Initialise the base SGML parser and start with empty results."""
    sgmllib.SGMLParser.__init__(self)
    # Two independent, initially empty lists.
    self.entries, self.dates = [], []
    # Both markers start at 0.
    self.inHtml = self.inDate = 0
    # Text buffer starts empty.
    self.data = ""
def __init__(self, url, verbose=VERBOSE, checker=None):
    """Record the page URL and checker, then initialise the SGML parser.

    url     -- stored as ``self.url``
    verbose -- stored as ``self.myverbose`` (now unused)
    checker -- stored as ``self.checker``
    """
    self.url = url
    self.checker = checker
    self.myverbose = verbose  # now unused
    # Nothing collected yet.
    self.base = None
    self.links = {}
    self.names = []
    # The base initialiser deliberately runs last, once every attribute
    # above exists.
    sgmllib.SGMLParser.__init__(self)
def __init__(self, verbose=0):
    """Initialise an object, passing 'verbose' to the superclass.

    Afterwards ``self.hyperlinks`` is a new empty list.
    """
    parser_base = sgmllib.SGMLParser
    parser_base.__init__(self, verbose)
    self.hyperlinks = list()
def __init__(self, formatter, verbose=0):
    """Build an HTMLParser that is tied to the given formatter.

    formatter -- the formatter instance associated with the parser
    verbose   -- handed unchanged to sgmllib.SGMLParser.__init__
    """
    base_init = sgmllib.SGMLParser.__init__
    base_init(self, verbose)
    # Assigned after base initialisation, preserving the original order.
    self.formatter = formatter
def reset(self):
    """Run the base-class reset, then reinitialise the HTML-level fields."""
    sgmllib.SGMLParser.reset(self)
    # Integer fields go back to zero.
    self.nofill = self.isindex = 0
    # Fields with no value yet.
    self.anchor = self.base = self.title = self.savedata = None
    # Separate empty lists (never shared between the two attributes).
    self.list_stack = []
    self.anchorlist = []
# ------ Methods used internally; some may be overridden
# --- Formatter interface, taking care of 'savedata' mode;
# shouldn't need to be overridden
def __init__(self, encoding, _type):
    """Store ``encoding`` and ``_type``, then initialise the SGML parser."""
    # Both attributes are set before the base initialiser runs.
    self.encoding, self._type = encoding, _type
    sgmllib.SGMLParser.__init__(self)
def reset(self):
    """Start with an empty ``pieces`` list and reset the SGML parser."""
    # The list is replaced before the base-class reset is invoked.
    self.pieces = list()
    sgmllib.SGMLParser.reset(self)
def feed(self, data):
    """Normalise ``data`` and push it through the SGML parser in one shot.

    Before parsing, the markup is cleaned up:

    * a ``<!`` that does not start a DOCTYPE, a comment (``<!--``) or a
      marked section (``<![``) is escaped to ``&lt;!``;
    * XML-style short tags (``<tag/>``) are rewritten by
      ``self._shorttag_replace``;
    * the numeric references ``&#39;`` and ``&#34;`` become literal quotes.

    The parser is fed the result and then closed.
    """
    # Escape stray "<!" so that it is not parsed as a declaration.
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    try:
        # A missing ``bytes`` builtin, or ``bytes`` aliasing ``str``, sends
        # us to the NameError branch below; otherwise the encoding name is
        # tagged with a suffix and the data is left unencoded.
        bytes
        if bytes is str:
            raise NameError
        self.encoding = self.encoding + '_INVALID_PYTHON_3'
    except NameError:
        # In this branch ``str`` is the byte-string type, so the text type
        # has to be spelled type(u'') for unicode input to be encoded.
        if self.encoding and isinstance(data, type(u'')):
            data = data.encode(self.encoding)
    sgmllib.SGMLParser.feed(self, data)
    sgmllib.SGMLParser.close(self)
def parse_declaration(self, i):
    """Parse the declaration at index ``i``, tolerating malformed input.

    Returns the result of sgmllib.SGMLParser.parse_declaration. If that
    raises sgmllib.SGMLParseError, an escaped ``<`` (``&lt;``) is passed
    to ``self.handle_data`` and ``i + 1`` is returned instead.
    """
    try:
        return sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # Escape the doctype declaration and continue parsing: the entity
        # form is emitted so the output does not contain a raw '<'.
        self.handle_data('&lt;')
        return i + 1
def __init__(self, baseuri, baselang, encoding, entities):
    """Initialise every base in turn, then remember ``entities``.

    The bases are initialised in a fixed order: sgmllib.SGMLParser,
    _FeedParserMixin (base URI, base language, encoding) and
    _BaseHTMLProcessor (encoding plus the XHTML content type).
    """
    content_type = 'application/xhtml+xml'
    sgmllib.SGMLParser.__init__(self)
    _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
    _BaseHTMLProcessor.__init__(self, encoding, content_type)
    self.entities = entities
def __init__(self, encoding, _type):
    """Remember the encoding and type, then run the SGML parser set-up."""
    # Attributes first, base initialiser second (original ordering).
    self._type = _type
    self.encoding = encoding
    parser_base = sgmllib.SGMLParser
    parser_base.__init__(self)
def reset(self):
    """Discard accumulated pieces, then reset the underlying SGML parser."""
    fresh_pieces = []
    self.pieces = fresh_pieces
    sgmllib.SGMLParser.reset(self)
def feed(self, data):
    """Normalise ``data`` and push it through the SGML parser in one shot.

    Before parsing, the markup is cleaned up:

    * a ``<!`` that does not start a DOCTYPE, a comment (``<!--``) or a
      marked section (``<![``) is escaped to ``&lt;!``;
    * XML-style short tags (``<tag/>``) are rewritten by
      ``self._shorttag_replace``;
    * the numeric references ``&#39;`` and ``&#34;`` become literal quotes.

    The parser is fed the result and then closed.
    """
    # Escape stray "<!" so that it is not parsed as a declaration.
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    try:
        # A missing ``bytes`` builtin, or ``bytes`` aliasing ``str``, sends
        # us to the NameError branch below; otherwise the encoding name is
        # tagged with a suffix and the data is left unencoded.
        bytes
        if bytes is str:
            raise NameError
        self.encoding = self.encoding + u'_INVALID_PYTHON_3'
    except NameError:
        # Unicode text is encoded to the declared encoding before parsing.
        if self.encoding and isinstance(data, unicode):
            data = data.encode(self.encoding)
    sgmllib.SGMLParser.feed(self, data)
    sgmllib.SGMLParser.close(self)
def parse_declaration(self, i):
    """Parse the declaration at index ``i``, tolerating malformed input.

    Returns the result of sgmllib.SGMLParser.parse_declaration. If that
    raises sgmllib.SGMLParseError, an escaped ``<`` (``&lt;``) is passed
    to ``self.handle_data`` and ``i + 1`` is returned instead.
    """
    try:
        return sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # Escape the doctype declaration and continue parsing: the entity
        # form is emitted so the output does not contain a raw '<'.
        self.handle_data('&lt;')
        return i + 1
def __init__(self, baseuri, baselang, encoding, entities):
    """Run each base initialiser in order and store ``entities``.

    sgmllib.SGMLParser is initialised first, then _FeedParserMixin with
    the base URI, base language and encoding, then _BaseHTMLProcessor
    with the encoding and the XHTML content type.
    """
    xhtml_type = 'application/xhtml+xml'
    sgmllib.SGMLParser.__init__(self)
    _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
    _BaseHTMLProcessor.__init__(self, encoding, xhtml_type)
    self.entities = entities
def __init__(self):
    """Initialise the parser by delegating to sgmllib.SGMLParser."""
    parser_base = sgmllib.SGMLParser
    parser_base.__init__(self)
def __init__(self, encoding, _type):
    """Store ``encoding`` and ``_type``, then initialise the SGML parser."""
    # Both attributes are set before the base initialiser runs.
    self.encoding, self._type = encoding, _type
    sgmllib.SGMLParser.__init__(self)
def reset(self):
    """Start with an empty ``pieces`` list and reset the SGML parser."""
    # The list is replaced before the base-class reset is invoked.
    self.pieces = list()
    sgmllib.SGMLParser.reset(self)
def feed(self, data):
    """Normalise ``data`` and push it through the SGML parser in one shot.

    Before parsing, the markup is cleaned up:

    * a ``<!`` that does not start a DOCTYPE, a comment (``<!--``) or a
      marked section (``<![``) is escaped to ``&lt;!``;
    * XML-style short tags (``<tag/>``) are rewritten by
      ``self._shorttag_replace``;
    * the numeric references ``&#39;`` and ``&#34;`` become literal quotes.

    The parser is fed the result and then closed.
    """
    # Escape stray "<!" so that it is not parsed as a declaration.
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    try:
        # A missing ``bytes`` builtin, or ``bytes`` aliasing ``str``, sends
        # us to the NameError branch below; otherwise the encoding name is
        # tagged with a suffix and the data is left unencoded.
        bytes
        if bytes is str:
            raise NameError
        self.encoding = self.encoding + u'_INVALID_PYTHON_3'
    except NameError:
        # Unicode text is encoded to the declared encoding before parsing.
        if self.encoding and isinstance(data, unicode):
            data = data.encode(self.encoding)
    sgmllib.SGMLParser.feed(self, data)
    sgmllib.SGMLParser.close(self)
def parse_declaration(self, i):
    """Parse the declaration at index ``i``, tolerating malformed input.

    Returns the result of sgmllib.SGMLParser.parse_declaration. If that
    raises sgmllib.SGMLParseError, an escaped ``<`` (``&lt;``) is passed
    to ``self.handle_data`` and ``i + 1`` is returned instead.
    """
    try:
        return sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # Escape the doctype declaration and continue parsing: the entity
        # form is emitted so the output does not contain a raw '<'.
        self.handle_data('&lt;')
        return i + 1
def __init__(self, baseuri, baselang, encoding, entities):
    """Initialise every base in turn, then remember ``entities``.

    The bases are initialised in a fixed order: sgmllib.SGMLParser,
    _FeedParserMixin (base URI, base language, encoding) and
    _BaseHTMLProcessor (encoding plus the XHTML content type).
    """
    content_type = 'application/xhtml+xml'
    sgmllib.SGMLParser.__init__(self)
    _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
    _BaseHTMLProcessor.__init__(self, encoding, content_type)
    self.entities = entities
def __init__(self, encoding, _type):
    """Remember the encoding and type, then run the SGML parser set-up."""
    # Attributes first, base initialiser second (original ordering).
    self._type = _type
    self.encoding = encoding
    parser_base = sgmllib.SGMLParser
    parser_base.__init__(self)
def reset(self):
    """Discard accumulated pieces, then reset the underlying SGML parser."""
    fresh_pieces = []
    self.pieces = fresh_pieces
    sgmllib.SGMLParser.reset(self)
def feed(self, data):
    """Normalise ``data`` and push it through the SGML parser in one shot.

    Before parsing, the markup is cleaned up:

    * a ``<!`` that does not start a DOCTYPE, a comment (``<!--``) or a
      marked section (``<![``) is escaped to ``&lt;!``;
    * XML-style short tags (``<tag/>``) are rewritten by
      ``self._shorttag_replace``;
    * the numeric references ``&#39;`` and ``&#34;`` become literal quotes.

    The parser is fed the result and then closed.
    """
    # Escape stray "<!" so that it is not parsed as a declaration.
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    try:
        # A missing ``bytes`` builtin, or ``bytes`` aliasing ``str``, sends
        # us to the NameError branch below; otherwise the encoding name is
        # tagged with a suffix and the data is left unencoded.
        bytes
        if bytes is str:
            raise NameError
        self.encoding = self.encoding + u'_INVALID_PYTHON_3'
    except NameError:
        # Unicode text is encoded to the declared encoding before parsing.
        if self.encoding and isinstance(data, unicode):
            data = data.encode(self.encoding)
    sgmllib.SGMLParser.feed(self, data)
    sgmllib.SGMLParser.close(self)
def parse_declaration(self, i):
    """Parse the declaration at index ``i``, tolerating malformed input.

    Returns the result of sgmllib.SGMLParser.parse_declaration. If that
    raises sgmllib.SGMLParseError, an escaped ``<`` (``&lt;``) is passed
    to ``self.handle_data`` and ``i + 1`` is returned instead.
    """
    try:
        return sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # Escape the doctype declaration and continue parsing: the entity
        # form is emitted so the output does not contain a raw '<'.
        self.handle_data('&lt;')
        return i + 1
def __init__(self, baseuri, baselang, encoding, entities):
    """Run each base initialiser in order and store ``entities``.

    sgmllib.SGMLParser is initialised first, then _FeedParserMixin with
    the base URI, base language and encoding, then _BaseHTMLProcessor
    with the encoding and the XHTML content type.
    """
    xhtml_type = 'application/xhtml+xml'
    sgmllib.SGMLParser.__init__(self)
    _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
    _BaseHTMLProcessor.__init__(self, encoding, xhtml_type)
    self.entities = entities
def __init__(self, encoding, _type):
    """Store ``encoding`` and ``_type``, then initialise the SGML parser."""
    # Both attributes are set before the base initialiser runs.
    self.encoding, self._type = encoding, _type
    sgmllib.SGMLParser.__init__(self)