def parseStatus(rtext):
    texts = [t for t in rtext.contents if isinstance(t, NavigableString)]
    for text in texts:
        upperText = text.strip().upper()
        originalText = upperText
        for char in CHARS_TO_IGNORE:
            upperText = upperText.replace(char, "")
        upperWords = upperText.split()
        if (("CLEAR" in upperWords or "CLR" in upperWords) and not originalText.endswith("?")):
            return states.CLEAR
        elif ("STAT" in upperWords or "STATUS" in upperWords):
            return states.REQUEST
        elif ("?" in originalText):
            return states.REQUEST
        elif (originalText in ("BLUE", "BLUES ONLY", "ONLY BLUE", "STILL BLUE", "ALL BLUES")):
            return states.CLEAR
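# --- Hedged usage sketch (not from the original source). parseStatus above relies
# on module-level CHARS_TO_IGNORE, a states object, and NavigableString; the
# stand-ins below are assumptions for illustration only.
from enum import Enum
from bs4 import BeautifulSoup
from bs4.element import NavigableString

CHARS_TO_IGNORE = ("*", "?", ",", "!")  # assumed contents; the real set is defined elsewhere

class states(Enum):  # assumed stand-in for the real states object
    CLEAR = 1
    REQUEST = 2

rtext = BeautifulSoup("<rtext>clear!</rtext>", "html.parser").rtext
print(parseStatus(rtext))  # states.CLEAR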
def parseShips(rtext):
    def formatShipName(text, word):
        newText = u"""<span style="color:#d95911;font-weight:bold"> {0}</span>"""
        text = text.replace(word, newText.format(word))
        return text

    texts = [t for t in rtext.contents if isinstance(t, NavigableString)]
    for text in texts:
        upperText = text.upper()
        for shipName in evegate.SHIPNAMES:
            if shipName in upperText:
                hit = True
                start = upperText.find(shipName)
                end = start + len(shipName)
                if ((start > 0 and upperText[start - 1] not in (" ", "X")) or
                        (end < len(upperText) - 1 and upperText[end] not in ("S", " "))):
                    hit = False
                if hit:
                    shipInText = text[start:end]
                    formatted = formatShipName(text, shipInText)
                    textReplace(text, formatted)
                    return True
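# --- Hedged standalone illustration (not from the original source) of the
# word-boundary check in parseShips: a match only counts when the character
# before it is a space or "X" and the character after it is a space or a
# plural "S". SHIPNAMES here is a made-up stand-in for evegate.SHIPNAMES.
SHIPNAMES = ("RIFTER",)

def has_ship(text):
    upper = text.upper()
    for name in SHIPNAMES:
        start = upper.find(name)
        if start < 0:
            continue
        end = start + len(name)
        if (start > 0 and upper[start - 1] not in (" ", "X")) or \
                (end < len(upper) - 1 and upper[end] not in ("S", " ")):
            continue
        return True
    return False

print(has_ship("two rifters inbound"))  # True
print(has_ship("drifter on gate"))      # False (embedded in a longer word)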
def _censorNaviStrCandidateWithTemplate(self, candi_str, template_str, template_var_cache):
    if not type(candi_str) == element.NavigableString or not type(template_str) == element.NavigableString:
        return False
    matchObj = self.RegPattern.search(template_str)
    if matchObj is not None:
        varName = matchObj.group(1)
        varValue = None
        subed_tmpl_str = self.RegPattern.sub('(.+)', template_str)
        reg2 = re.compile(subed_tmpl_str)
        self.logger.debug('subed tmpl reg2 = %s', reg2)
        mo2 = reg2.match(candi_str)
        if mo2 is not None:
            varValue = mo2.group(1)
            self._procTemplateVariable(varName, varValue, template_var_cache)
        else:
            return False
    elif not candi_str == template_str:
        return False
    return True
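# --- Hedged standalone illustration (not from the original source) of the
# template-variable idea above: a placeholder in the template string becomes a
# capturing group, then the candidate string is matched against it. The
# {{name}} placeholder syntax for RegPattern is an assumption.
import re

RegPattern = re.compile(r'\{\{(\w+)\}\}')  # assumed placeholder syntax
template = u'Hello, {{name}}!'
candidate = u'Hello, Alice!'
m = RegPattern.search(template)
if m is not None:
    reg2 = re.compile(RegPattern.sub('(.+)', template))
    mo2 = reg2.match(candidate)
    if mo2 is not None:
        print(m.group(1), '->', mo2.group(1))  # name -> Alice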
def is_text(self):
    ''' Check if this element is a text node.
    Comments and processing instructions are also
    instances of NavigableString, so additional
    checks are needed.
    '''
    if not isinstance(self.context, NavigableString):
        return False
    if (
        self.is_comment() or
        self.is_doctype() or
        self.is_processing_instruction()
    ):
        return False
    return True
def text(self, target=None, ignore_pureascii_words=False):
    """
    Get all text in the HTML, skipping scripts, styles and comments.
    :param target: the BeautifulSoup object; defaults to self.b
    :param ignore_pureascii_words: if True, only return words that contain Chinese characters (may be useful for English-language sites)
    :return: list of str
    """
    if target is None:
        target = self.b
    from bs4 import Comment
    from bs4.element import NavigableString, Doctype
    result = []
    for descendant in target.descendants:
        if not isinstance(descendant, NavigableString) \
                or isinstance(descendant, Doctype) \
                or descendant.parent.name in ["script", "style"] \
                or isinstance(descendant, Comment) \
                or "none" in descendant.parent.get("style", "") \
                or "font-size:0px" in descendant.parent.get("style", ""):
            continue
        data = descendant.strip()
        if len(data) > 0:
            if not ignore_pureascii_words or any(ord(i) > 127 for i in data):
                if PY2:
                    result.append(data.encode())
                else:
                    result.append(data)
    return result
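# --- Hedged, runnable stand-in (not from the original source) showing the core
# of the filtering above on a bare soup; the Comment/Doctype/style checks and
# the PY2 branch are omitted for brevity:
from bs4 import BeautifulSoup
from bs4.element import NavigableString

soup = BeautifulSoup(u"<p>你好 <script>x()</script>world</p>", "html.parser")
visible = [d.strip() for d in soup.descendants
           if isinstance(d, NavigableString)
           and d.parent.name not in ("script", "style")
           and d.strip()]
print(visible)  # ['你好', 'world']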
def parseUrls(rtext):
    def findUrls(s):
        # yes, this is faster than regex and less complex to read
        urls = []
        prefixes = ("http://", "https://")
        for prefix in prefixes:
            start = 0
            while start >= 0:
                start = s.find(prefix, start)
                if start >= 0:
                    stop = s.find(" ", start)
                    if stop < 0:
                        stop = len(s)
                    urls.append(s[start:stop])
                    start += 1
        return urls

    def formatUrl(text, url):
        newText = u"""<a style="color:#28a5ed;font-weight:bold" href="link/{0}">{0}</a>"""
        text = text.replace(url, newText.format(url))
        return text

    texts = [t for t in rtext.contents if isinstance(t, NavigableString)]
    for text in texts:
        urls = findUrls(text)
        for url in urls:
            textReplace(text, formatUrl(text, url))
            return True
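# --- Hedged illustration (not from the original source): the same prefix-scan
# used by findUrls above, runnable on a plain string without textReplace:
def find_urls(s):
    urls = []
    for prefix in ("http://", "https://"):
        start = 0
        while start >= 0:
            start = s.find(prefix, start)
            if start >= 0:
                stop = s.find(" ", start)
                if stop < 0:
                    stop = len(s)
                urls.append(s[start:stop])
                start += 1
    return urls

print(find_urls("see http://a.example and https://b.example now"))
# ['http://a.example', 'https://b.example']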
def getSoupStringConcat(soupTag):
    '''
    Beautiful Soup tags return their content text in the .string attribute only when
    there is a single string child. Some unfortunate cases on SCOTUSblog have more
    than one child string, and this helper simply concatenates them.
    :param soupTag: a bs4 tag that contains one or more strings
    :return: a string containing all string children of soupTag, concatenated.
    '''
    if isinstance(soupTag, NavigableString):
        return soupTag.string
    result = ""
    for t in soupTag.descendants:
        # only include NavigableStrings (works around the .string default search behavior)
        if t.string is not None and isinstance(t, NavigableString):
            if t.parent.name != "script":  # prevent reading js
                result = result + t.string
    return result
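# --- Hedged usage sketch (not from the original source):
from bs4 import BeautifulSoup
from bs4.element import NavigableString

tag = BeautifulSoup("<p>Opinion <em>below</em>.</p>", "html.parser").p
print(getSoupStringConcat(tag))  # "Opinion below."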
def is_whitespace_string(elem):
    return isinstance(elem, NavigableString) and elem.strip() == ""
def lex(source):
    """Convert source into a stream of (css_classes, token_string)."""
    soup = BeautifulSoup(coqdoc(source))
    root = soup.find(class_='code')
    strip_soup(root, is_whitespace_string)
    for elem in root.children:
        if isinstance(elem, NavigableString):
            yield [], elem
        elif elem.name == "span":
            cls = "coqdoc-{}".format(elem['type'])
            yield [cls], elem.string
        elif elem.name == 'br':
            pass
        else:
            raise ValueError(elem)
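# --- Hedged usage sketch (not from the original source); coqdoc and strip_soup
# come from the surrounding module and are assumed to emit a
# <div class="code"> of coqdoc <span>s and <br>s:
#
#   for classes, token in lex("Definition x := 1."):
#       print(classes, repr(token))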
def _parseTagRecursive(self, candi_tag, template_tag, template_var_cache):
    for idx, tmpChild in enumerate(template_tag.contents):
        if tmpChild.name == 'lisp_pass':
            # a <lisp_pass> placeholder: anything inside this tag
            # is expected to be ignored.
            continue
        if len(candi_tag.contents) <= idx:
            return False
        candiChild = candi_tag.contents[idx]
        typeCandi = type(candiChild)
        typeTmp = type(tmpChild)
        valid = False
        if typeCandi == typeTmp == element.Tag:
            if self._censorTagCandidateWithTemplate(candiChild, tmpChild, template_var_cache):
                valid = self._parseTagRecursive(candiChild, tmpChild, template_var_cache)
        elif typeCandi == typeTmp == element.NavigableString:
            valid = self._censorNaviStrCandidateWithTemplate(
                candiChild, tmpChild, template_var_cache)
        if valid is False and len(template_var_cache) > 0:
            self.logger.warning(template_tag)
            self.logger.warning(candi_tag)
            self.logger.warning('censor not passed; cache will be cleared')
            template_var_cache.clear()
            return False
    return True
def text(self):
    ''' Return the text contained in this element (if any)
    Convert the text characters to html entities
    '''
    if not isinstance(self.context, NavigableString):
        return u''
    if self.is_comment():
        return unicode(self.context)
    return self.escaper.substitute_html(self.context.string)
def parse_article(self, url):
    raw = self.session.get(url, verify=False)
    soup = BeautifulSoup(raw.text, "lxml")
    try:
        article = {}
        article["Author"] = soup.select(".article-meta-value")[0].contents[0].split(" ")[0]
        article["Board"] = soup.select(".article-meta-value")[1].contents[0]
        article["Title"] = soup.select(".article-meta-value")[2].contents[0]
        article["Date"] = soup.select(".article-meta-value")[3].contents[0]
        content = ""
        for tag in soup.select("#main-content")[0]:
            if type(tag) is NavigableString and tag != '\n':
                content += tag
                break
        article["Content"] = content
        findIPtag = u'※ 發信站:'
        # handle the different ways the poster's IP can appear
        try:
            ip_temp = soup.find(string=re.compile(findIPtag))
            ip_temp = re.search(r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*", ip_temp).group()
        except Exception:
            try:
                ip_temp = 'NA'
                f2_content = soup.select('.f2')
                for content in f2_content:
                    if findIPtag in content.contents[0]:
                        ip_temp = content.next_sibling.split()[-1]
                        break
            except Exception:
                ip_temp = 'NA'
        article["IP"] = ip_temp
        upvote = 0
        downvote = 0
        novote = 0
        response_list = []
        for response_struct in soup.select(".push"):
            if "warning-box" not in response_struct['class']:
                response_dic = {}
                response_dic["Content"] = response_struct.select(".push-content")[0].contents[0][1:]
                response_dic["Vote"] = response_struct.select(".push-tag")[0].contents[0][0]
                response_dic["User"] = response_struct.select(".push-userid")[0].contents[0]
                response_list.append(response_dic)
                if response_dic["Vote"] == u"推":
                    upvote += 1
                elif response_dic["Vote"] == u"噓":
                    downvote += 1
                else:
                    novote += 1
        article["Responses"] = response_list
        article["UpVote"] = upvote
        article["DownVote"] = downvote
        article["NoVote"] = novote
    except Exception as e:
        print(e)
        print(u"error in: %s " % url)
    return article
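# --- Hedged usage sketch (not from the original source). parse_article is a
# method of a crawler class (the selectors suggest PTT) that holds a
# requests.Session as self.session; the class name and URL below are hypothetical:
#
#   crawler = PttCrawler()
#   article = crawler.parse_article("https://www.ptt.cc/bbs/SomeBoard/M.123.A.html")
#   print(article["Title"], article["UpVote"], article["DownVote"])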
def check_html(runner, html, key=None, app=None, check_html=True, check_classes=True):
    caller = stack()[1]
    filepos = '{}:{:d}'.format(caller.filename.rpartition('/')[2], caller.lineno)
    app = app or filepos.partition('_')[2].partition('.')[0]
    if key:
        filepos += '-{}'.format(key)
    store = []
    soup = BeautifulSoup(html, 'html.parser')
    for desc in soup.descendants:
        if isinstance(desc, Tag):
            name = desc.name
            attrs = desc.attrs
            store.append(name)
            for attr in sorted(attrs):
                tag = str(attrs.get('name'))
                if name == 'input' and tag == 'csrfmiddlewaretoken' and attr == 'value':
                    continue
                store.append(attr)
                val = attrs[attr]
                if check_classes and attr == 'class':
                    for cls in val:
                        if cls:
                            runner.assertIn(cls, CLASS_ARRAY[app], msg=filepos)
                if isinstance(val, list):
                    store.extend(sorted(val))
                elif (isinstance(val, str)
                        and not (val.startswith(STATIC_URL) or ('date' in tag and attr == 'value'))):
                    if '?' in val:
                        part = val.rpartition('?')
                        store.append(part[0])
                        for arg in sorted(part[2].split('&')):
                            store.append(arg)
                    else:
                        store.append(val)
        elif isinstance(desc, NavigableString):
            store.append(str(desc))
    string = ' '.join(' '.join(store).split())
    hsh = md5(string.encode()).hexdigest()[:HASH_LEN]
    if check_html:
        if WRITE_CHECKFILE:
            print(filepos, hsh, file=CHECKFILE)
        elif CHECK_HTML:
            runner.assertIn(filepos, CHECK_ARRAY, msg=filepos)
            runner.assertEqual(CHECK_ARRAY[filepos][:HASH_LEN], hsh, msg=filepos)
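# --- Hedged sketch (not from the original source) of the canonical-hash idea in
# check_html: flatten tag names, attributes and text in document order, then
# fingerprint the normalized string with md5:
from hashlib import md5
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

def html_fingerprint(html, length=16):
    store = []
    for desc in BeautifulSoup(html, "html.parser").descendants:
        if isinstance(desc, Tag):
            store.append(desc.name)
            store.extend(sorted(desc.attrs))
        elif isinstance(desc, NavigableString):
            store.append(str(desc))
    canonical = ' '.join(' '.join(store).split())
    return md5(canonical.encode()).hexdigest()[:length]

print(html_fingerprint('<p class="x">hi</p>'))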