def originalTextFor(expr, asString=True):
"""
Helper to return the original, untokenized text for a given expression. Useful to
restore the parsed fields of an HTML start tag into the raw tag text itself, or to
revert separate tokens with intervening whitespace back to the original matching
input text. By default, returns astring containing the original parsed text.
If the optional C{asString} argument is passed as C{False}, then the return value is a
C{L{ParseResults}} containing any results names that were originally matched, and a
single token containing the original matched text from the input string. So if
the expression passed to C{L{originalTextFor}} contains expressions with defined
results names, you must set C{asString} to C{False} if you want to preserve those
results name values.
Example::
src = "this is test <b> bold <i>text</i> </b> normal text "
for tag in ("b","i"):
opener,closer = makeHTMLTags(tag)
patt = originalTextFor(opener + SkipTo(closer) + closer)
print(patt.searchString(src)[0])
prints::
['<b> bold <i>text</i> </b>']
['<i>text</i>']
"""
locMarker = Empty().setParseAction(lambda s,loc,t: loc)
endlocMarker = locMarker.copy()
endlocMarker.callPreparse = False
matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
if asString:
extractText = lambda s,l,t: s[t._original_start:t._original_end]
else:
def extractText(s,l,t):
t[:] = [s[t.pop('_original_start'):t.pop('_original_end')]]
matchExpr.setParseAction(extractText)
matchExpr.ignoreExprs = expr.ignoreExprs
return matchExpr
python类whitespace()的实例源码
def setDefaultWhitespaceChars( chars ):
r"""
Overrides the default whitespace chars
Example::
# default whitespace chars are space, <TAB> and newline
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def', 'ghi', 'jkl']
# change to just treat newline as significant
ParserElement.setDefaultWhitespaceChars(" \t")
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def']
"""
ParserElement.DEFAULT_WHITE_CHARS = chars
def leaveWhitespace( self ):
"""
Disables the skipping of whitespace before matching the characters in the
C{ParserElement}'s defined pattern. This is normally only used internally by
the pyparsing module, but may be needed in some whitespace-sensitive grammars.
"""
self.skipWhitespace = False
return self
def setWhitespaceChars( self, chars ):
"""
Overrides the default whitespace chars
"""
self.skipWhitespace = True
self.whiteChars = chars
self.copyDefaultWhiteChars = False
return self
def parseImpl( self, instring, loc, doActions=True ):
if loc != 0:
# see if entire string up to here is just whitespace and ignoreables
if loc != self.preParse( instring, 0 ):
raise ParseException(instring, loc, self.errmsg, self)
return loc, []
def __init__( self, expr, joinString="", adjacent=True ):
super(Combine,self).__init__( expr )
# suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself
if adjacent:
self.leaveWhitespace()
self.adjacent = adjacent
self.skipWhitespace = True
self.joinString = joinString
self.callPreparse = True
def originalTextFor(expr, asString=True):
"""
Helper to return the original, untokenized text for a given expression. Useful to
restore the parsed fields of an HTML start tag into the raw tag text itself, or to
revert separate tokens with intervening whitespace back to the original matching
input text. By default, returns astring containing the original parsed text.
If the optional C{asString} argument is passed as C{False}, then the return value is a
C{L{ParseResults}} containing any results names that were originally matched, and a
single token containing the original matched text from the input string. So if
the expression passed to C{L{originalTextFor}} contains expressions with defined
results names, you must set C{asString} to C{False} if you want to preserve those
results name values.
Example::
src = "this is test <b> bold <i>text</i> </b> normal text "
for tag in ("b","i"):
opener,closer = makeHTMLTags(tag)
patt = originalTextFor(opener + SkipTo(closer) + closer)
print(patt.searchString(src)[0])
prints::
['<b> bold <i>text</i> </b>']
['<i>text</i>']
"""
locMarker = Empty().setParseAction(lambda s,loc,t: loc)
endlocMarker = locMarker.copy()
endlocMarker.callPreparse = False
matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
if asString:
extractText = lambda s,l,t: s[t._original_start:t._original_end]
else:
def extractText(s,l,t):
t[:] = [s[t.pop('_original_start'):t.pop('_original_end')]]
matchExpr.setParseAction(extractText)
matchExpr.ignoreExprs = expr.ignoreExprs
return matchExpr
def setDefaultWhitespaceChars( chars ):
r"""
Overrides the default whitespace chars
Example::
# default whitespace chars are space, <TAB> and newline
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def', 'ghi', 'jkl']
# change to just treat newline as significant
ParserElement.setDefaultWhitespaceChars(" \t")
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def']
"""
ParserElement.DEFAULT_WHITE_CHARS = chars
def leaveWhitespace( self ):
"""
Disables the skipping of whitespace before matching the characters in the
C{ParserElement}'s defined pattern. This is normally only used internally by
the pyparsing module, but may be needed in some whitespace-sensitive grammars.
"""
self.skipWhitespace = False
return self
def setWhitespaceChars( self, chars ):
"""
Overrides the default whitespace chars
"""
self.skipWhitespace = True
self.whiteChars = chars
self.copyDefaultWhiteChars = False
return self
def parseImpl( self, instring, loc, doActions=True ):
result = instring[loc] == self.firstQuoteChar and self.re.match(instring,loc) or None
if not result:
raise ParseException(instring, loc, self.errmsg, self)
loc = result.end()
ret = result.group()
if self.unquoteResults:
# strip off quotes
ret = ret[self.quoteCharLen:-self.endQuoteCharLen]
if isinstance(ret,basestring):
# replace escaped whitespace
if '\\' in ret and self.convertWhitespaceEscapes:
ws_map = {
r'\t' : '\t',
r'\n' : '\n',
r'\f' : '\f',
r'\r' : '\r',
}
for wslit,wschar in ws_map.items():
ret = ret.replace(wslit, wschar)
# replace escaped characters
if self.escChar:
ret = re.sub(self.escCharReplacePattern,"\g<1>",ret)
# replace escaped quotes
if self.escQuote:
ret = ret.replace(self.escQuote, self.endQuoteChar)
return loc, ret
def parseImpl( self, instring, loc, doActions=True ):
if loc != 0:
# see if entire string up to here is just whitespace and ignoreables
if loc != self.preParse( instring, 0 ):
raise ParseException(instring, loc, self.errmsg, self)
return loc, []
def originalTextFor(expr, asString=True):
"""
Helper to return the original, untokenized text for a given expression. Useful to
restore the parsed fields of an HTML start tag into the raw tag text itself, or to
revert separate tokens with intervening whitespace back to the original matching
input text. By default, returns astring containing the original parsed text.
If the optional C{asString} argument is passed as C{False}, then the return value is a
C{L{ParseResults}} containing any results names that were originally matched, and a
single token containing the original matched text from the input string. So if
the expression passed to C{L{originalTextFor}} contains expressions with defined
results names, you must set C{asString} to C{False} if you want to preserve those
results name values.
Example::
src = "this is test <b> bold <i>text</i> </b> normal text "
for tag in ("b","i"):
opener,closer = makeHTMLTags(tag)
patt = originalTextFor(opener + SkipTo(closer) + closer)
print(patt.searchString(src)[0])
prints::
['<b> bold <i>text</i> </b>']
['<i>text</i>']
"""
locMarker = Empty().setParseAction(lambda s,loc,t: loc)
endlocMarker = locMarker.copy()
endlocMarker.callPreparse = False
matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
if asString:
extractText = lambda s,l,t: s[t._original_start:t._original_end]
else:
def extractText(s,l,t):
t[:] = [s[t.pop('_original_start'):t.pop('_original_end')]]
matchExpr.setParseAction(extractText)
matchExpr.ignoreExprs = expr.ignoreExprs
return matchExpr
def setDefaultWhitespaceChars( chars ):
r"""
Overrides the default whitespace chars
Example::
# default whitespace chars are space, <TAB> and newline
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def', 'ghi', 'jkl']
# change to just treat newline as significant
ParserElement.setDefaultWhitespaceChars(" \t")
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def']
"""
ParserElement.DEFAULT_WHITE_CHARS = chars
def leaveWhitespace( self ):
"""
Disables the skipping of whitespace before matching the characters in the
C{ParserElement}'s defined pattern. This is normally only used internally by
the pyparsing module, but may be needed in some whitespace-sensitive grammars.
"""
self.skipWhitespace = False
return self
def setWhitespaceChars( self, chars ):
"""
Overrides the default whitespace chars
"""
self.skipWhitespace = True
self.whiteChars = chars
self.copyDefaultWhiteChars = False
return self
def parseImpl( self, instring, loc, doActions=True ):
if loc != 0:
# see if entire string up to here is just whitespace and ignoreables
if loc != self.preParse( instring, 0 ):
raise ParseException(instring, loc, self.errmsg, self)
return loc, []
def __init__( self, expr, joinString="", adjacent=True ):
super(Combine,self).__init__( expr )
# suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself
if adjacent:
self.leaveWhitespace()
self.adjacent = adjacent
self.skipWhitespace = True
self.joinString = joinString
self.callPreparse = True
def originalTextFor(expr, asString=True):
"""
Helper to return the original, untokenized text for a given expression. Useful to
restore the parsed fields of an HTML start tag into the raw tag text itself, or to
revert separate tokens with intervening whitespace back to the original matching
input text. By default, returns astring containing the original parsed text.
If the optional C{asString} argument is passed as C{False}, then the return value is a
C{L{ParseResults}} containing any results names that were originally matched, and a
single token containing the original matched text from the input string. So if
the expression passed to C{L{originalTextFor}} contains expressions with defined
results names, you must set C{asString} to C{False} if you want to preserve those
results name values.
Example::
src = "this is test <b> bold <i>text</i> </b> normal text "
for tag in ("b","i"):
opener,closer = makeHTMLTags(tag)
patt = originalTextFor(opener + SkipTo(closer) + closer)
print(patt.searchString(src)[0])
prints::
['<b> bold <i>text</i> </b>']
['<i>text</i>']
"""
locMarker = Empty().setParseAction(lambda s,loc,t: loc)
endlocMarker = locMarker.copy()
endlocMarker.callPreparse = False
matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
if asString:
extractText = lambda s,l,t: s[t._original_start:t._original_end]
else:
def extractText(s,l,t):
t[:] = [s[t.pop('_original_start'):t.pop('_original_end')]]
matchExpr.setParseAction(extractText)
matchExpr.ignoreExprs = expr.ignoreExprs
return matchExpr
def _munge_whitespace(self, text):
"""_munge_whitespace(text : string) -> string
Munge whitespace in text: expand tabs and convert all other
whitespace characters to spaces. Eg. " foo\tbar\n\nbaz"
becomes " foo bar baz".
"""
if self.expand_tabs:
text = text.expandtabs()
if self.replace_whitespace:
if isinstance(text, str):
text = text.translate(self.whitespace_trans)
elif isinstance(text, unicode):
text = text.translate(self.unicode_whitespace_trans)
return text