def process_post_bodies(bodies: List[Tag]) -> Iterator[Tuple[str, list]]:
for body in bodies:
cites = list()
cited = body.findAll('div', {'class': 'cite'})
if cited:
cites = [c['name'] for c in cited]
collect_text = []
for tag in body:
# TODO: This is a suboptimal (and partially wrong) way to parse cites in the post body (a lot to improve here)
if tag.name not in ('div', 'p'):
if hasattr(tag, 'text'):
collect_text.append(tag.text)
elif isinstance(tag, NavigableString):
collect_text.append(str(tag))
else:
collect_text.append('\n')
else:
yield ''.join(collect_text), cites
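A minimal usage sketch for the generator above, assuming the typing and bs4 imports the function needs are already in place; the post markup and the name attribute on the cite block are invented for illustration.
from bs4 import BeautifulSoup

sample = '<div class="post"><div class="cite" name="42"></div>Thanks for the tip!<p></p></div>'
post_soup = BeautifulSoup(sample, 'html.parser')
for text, cites in process_post_bodies(post_soup.find_all('div', class_='post')):
    print(repr(text), cites)  # the cite names travel with every yielded chunk of text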
Python NavigableString() usage examples
def checklistInENMLtoSoup(soup):
'''
Transforms Evernote checklist elements to github `* [ ]` task list style
'''
transform_tags = ['p','div']
# soup.select can't be used with dashes: https://bugs.launchpad.net/beautifulsoup/+bug/1276211
for todo in soup.find_all('en-todo'):
parent = todo.parent
transform = parent.find() == todo and parent.name in transform_tags
checked = todo.attrs.get('checked',None) == "true"
todo.replace_with("[x] " if checked else "[ ] ")
# EN checklists can appear anywhere, but if one appears at the beginning
# of a block element, transform it so it resembles GitHub markdown syntax
if transform:
content = ''.join(unicode(child) for child in parent.children
if isinstance(child, NavigableString)
).strip()
new_tag = soup.new_tag("li")
new_tag.string = content
parent.replace_with(new_tag)
def process_tag(tag, valid_tags=()):
if isinstance(tag, NavigableString):
return tag
if tag.name in valid_tags:
for subtag in tag.contents:
subtag.replaceWith(process_tag(subtag, valid_tags))
return tag
else:
result = ""
for subtag in tag.contents:
result += str(process_tag(subtag, valid_tags))
return result
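A usage sketch for process_tag with invented markup: tags outside the whitelist are flattened to their text, whitelisted tags are kept intact.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><b>bold</b> and <span>plain</span></div>', 'html.parser')
print(process_tag(soup.div, valid_tags=('b', 'i')))
# <div> is not whitelisted, so this prints the string '<b>bold</b> and plain'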
def get_students(self):
group = self.group
group_name = group[:group.find('(')].strip()
group_code = group[group.find('(')+1:group.find(')')]
students = []
for row in self.table.children:
if type(row) == NavigableString:
continue
active = True
link = row.find(class_='fio_3').parent
if link.has_attr('style') and link['style'] == 'color:gray;':  # gray styling marks an inactive student
active = False
student_id = parse_qs(urlparse(link['href']).query)['sid'][0]
name = row.find(class_='fio_3').string.strip()
record_book_id = row.find(class_='hc3').string.strip()
name = " ".join(name.split())
record_book_id = " ".join(record_book_id.split())
students.append({'name': name, 'id': student_id, 'record_book': record_book_id, 'active': int(active)})
return {'group': group_name, 'code': group_code, 'students': students, 'id': self.group_id}
def parse(movie):
url = PAGE_URL % movie.id
r = requests.get(url)
soup = BeautifulSoup(r.text.encode('utf-8'), 'lxml')
movie.score = soup.find('strong', 'rating_num').text
info = soup.find('div', {'id': 'info'})
for linebreak in info.find_all('br'):
linebreak.extract()
for span in info.contents:
if isinstance(span, NavigableString): continue
if span.contents[0]:
if span.contents[0].string == u'导演':  # "director" label
if isinstance(span.contents[1], NavigableString):
movie.director = span.contents[2].text
elif span.contents[0].string == u'主演':  # "starring" label
if isinstance(span.contents[1], NavigableString):
movie.actor = span.contents[2].text
print movie
def parse_character_results(soup):
"""
Parse a page of character results.
:param soup: the BeautifulSoup object for the search results page
:return: a list of dictionaries, each containing a name, a gender, and a list of
name/id dictionaries for the games the character appeared in.
"""
soup = list(soup.find_all('table', class_='stripe')[0].children)[1:]
characters = []
for item in soup:
temp_c = {'gender': None, 'name': None, 'games': {}}
temp_c['gender'] = item.abbr.get('title')
temp_c['name'] = list(item.children)[1].a.string
temp_c['games'] = []
for game in list(list(list(item.children)[1].children)[1].children):
if isinstance(game, NavigableString):
continue
temp_c['games'].append({'name': game.string, 'id': game.get('href').split('/')[1]})
characters.append(temp_c)
del temp_c
return characters
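A hypothetical driver for parse_character_results, assuming the search results page was saved to character_results.html beforehand (the file name is made up).
from bs4 import BeautifulSoup

with open('character_results.html', encoding='utf-8') as fh:
    page = BeautifulSoup(fh, 'html.parser')
for character in parse_character_results(page):
    print(character['name'], character['gender'], [g['name'] for g in character['games']])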
def print_content(contents):
for content in contents:
name = content.name
#if not isinstance(content, Tag):
if isinstance(content, NavigableString):
s = str(content)
s = s.replace("\n","")
print s.strip()
else:
if name == "img":
'''
img = content.find("img")
if img:
print img.get("src")
'''
print "[??]"
elif name == "br":
print ""
elif name == "noscript":
continue
elif name == "li":
print "•",
print_content(content.contents)
def get_detail(self, host_soup, vul_summary):
''' host report -> section 2.2: vulnerability detail, return dict '''
name_detail_lst = host_soup.find('div', id='vul_detail').table.contents
same_vuls = []
for i in name_detail_lst:
if type(i) is NavigableString:
continue
if i.span:
name = i.span.string
for name_port in vul_summary:
if name in name_port:
same_vuls.append(name_port)
elif same_vuls:
# in case of repeat vulnerability but differ port
lst_solu = self.get_solution(i)
for name_port in same_vuls:
lst = vul_summary.get(name_port)
if lst and (len(lst) == 5):
vul_summary[name_port].extend(lst_solu)
same_vuls = []
return vul_summary
def get_solution(self, tag):
'''['????', '????', 'CVE??'] '''
value = []
tr_lst = tag.table.contents
for i in tr_lst:
if type(i) is NavigableString:
continue
if i.th.string in (u'????', u'????'):
val = [s.strip() for s in i.td.strings]
val = '\n'.join(val).replace('\n*', '*')
value.append(val)
elif i.th.string == u'CVE??':
value.append(i.td.string)
if len(value) == 2:
value.append(None)
return value
def __get_navigable_strings(self,soup):
if isinstance(soup, NavigableString):
if type(soup) not in (Comment, Declaration) and soup.strip():
yield soup
elif soup.name not in ('script', 'style'):
for c in soup.contents:
for g in self.__get_navigable_strings(c):
yield g
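__get_navigable_strings is a private helper, but the same traversal can be sketched as a standalone generator to show what it keeps and what it skips; the sample markup below is invented.
from bs4 import BeautifulSoup
from bs4.element import Comment, Declaration, NavigableString

def iter_visible_strings(node):
    # same idea: keep real text nodes, skip comments/declarations and script/style subtrees
    if isinstance(node, NavigableString):
        if type(node) not in (Comment, Declaration) and node.strip():
            yield node
    elif node.name not in ('script', 'style'):
        for child in node.contents:
            yield from iter_visible_strings(child)

doc = BeautifulSoup('<p>Hello <script>var x;</script><b>world</b><!-- hidden --></p>', 'html.parser')
print(list(iter_visible_strings(doc)))  # ['Hello ', 'world']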
def parse_aiml_text(text):
text = '<p>' + text + '</p>'
soup = BeautifulSoup(text, 'lxml')
tokens = []
try:
for c in soup.p.children:
if isinstance(c, NavigableString):
token = c.string.strip()
if token:
tokens.append(token)
except Exception as ex:
logger.warn(ex)
return text
return ' '.join(tokens)
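A usage sketch with an invented AIML fragment (the function itself needs bs4, lxml and a logger in scope); only bare strings directly under the wrapping <p> survive, so tag contents are dropped.
print(parse_aiml_text('Hello <set name="topic">greetings</set> there'))  # -> 'Hello there'
print(parse_aiml_text('no markup at all'))                               # -> 'no markup at all'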
def get_first_text(soup, strip = False, types = (NavigableString, CData)):
data = None
for s in soup._all_strings(strip, types = types):
data = s
break
return data
def get_texts(soup, strip = False, types = (NavigableString, CData)):
texts = []
for s in soup._all_strings(strip, types = types):
texts.append(s)
return texts
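Both helpers lean on BeautifulSoup's private _all_strings iterator; a usage sketch with invented markup (NavigableString and CData come from bs4, as the default types argument requires):
from bs4 import BeautifulSoup, CData, NavigableString

soup = BeautifulSoup('<div><b>first</b> second</div>', 'html.parser')
print(get_first_text(soup, strip=True))  # 'first'
print(get_texts(soup, strip=True))       # ['first', 'second']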
def html_to_text(html):
"Creates a formatted text email message from a rendered html template (page)"
soup = BeautifulSoup(html, 'html.parser')
# Ignore anything in head
body, text = soup.body, []
if body is None:
return ""
else:
for element in body.descendants:
# We use type and not isinstance since comments, cdata, etc are subclasses that we don't want
if type(element) == NavigableString:
# We use the assumption that other tags can't be inside a script or style
if element.parent.name in ('script', 'style'):
continue
elif element.parent.name == 'a':
# replace link text with the link
#text += [element.parent['href']]
continue
# remove any multiple and leading/trailing whitespace
string = ' '.join(element.string.split())
if string:
if element.parent.name == 'p':
# Add extra paragraph formatting newline
string = '\n' + string
text += [string]
doc = '\n'.join(text) #.encode('utf-8')
return doc
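A usage sketch for html_to_text with an invented HTML snippet; script contents and link text are dropped, and strings that open a paragraph get an extra newline.
sample = """
<html><body>
<p>Hello <strong>world</strong></p>
<p>Visit <a href="https://example.com">our site</a> soon.</p>
<script>var skipped = true;</script>
</body></html>
"""
print(html_to_text(sample))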
def parse_notes(self):
notes = []
# a div-comment counts as a note only when the element after it is not a div-control block
for tag in self.page.find_all(class_='div-comment'):
sibling = tag.next_sibling.next_sibling if type(tag.next_sibling) is NavigableString else tag.next_sibling
if sibling and not (sibling.has_attr('class') and 'div-control' in sibling['class']):
note = tag.get_text()
if note.startswith('Автор(ы):'):
notes.append({'name':'authors', 'value':note[10:].strip()})
elif note.startswith('Примечание:'):
notes.append({'name':'comment', 'value':note[12:].strip()})
else:
raise NotImplementedError('Неизвестный текст в примечании: {}'.format(note))
return notes
def parse_description(self, tag_id):
description_span = self.page.find(id=tag_id)
description = []
for discipline_property in description_span.find_all(class_='div-comment'):
property_name = discipline_property.string.strip()
sibling = discipline_property.next_sibling.next_sibling if type(discipline_property.next_sibling) \
is NavigableString else discipline_property.next_sibling
property_value = sibling.string
property_value = property_value.strip() if property_value else ''
if property_value:
description.append({'name':property_name, 'value':property_value})
return description
def strip_tags(html, invalid_tags):
soup = BeautifulSoup(html,"html.parser")
coref_id_set=set()
set2text={}
for tag in soup.findAll(True):
if tag.name in invalid_tags:
s = ""
for c in tag.contents:
if not isinstance(c, NavigableString):
c = strip_tags(unicode(c), invalid_tags)
s += unicode(c)
tag.replaceWith(s)
for t in soup.find_all("coref"):
if t['set-id'] in coref_id_set :
pronoun_regex = re.compile('|'.join(pronouns))
# print t.get_text(),
if len(pronouns.intersection(nltk.word_tokenize(t.get_text().lower()))) > 0:
# print t.get_text(),
t.replaceWith(set2text[t['set-id']])
# print "REPLACED WITH :" , set2text[t['set-id']]
else:
coref_id_set.add(t['set-id'])
set2text[t['set-id']]=t.get_text()
# print soup
soup = re.sub("(\\t|\\r?\\n)+", " ",str(soup))
soup = re.sub("</s><s>","\n",soup)
soup = re.sub('<[^>]*>', '', soup)
return soup
def get_parrafos(soup):
prfs= soup.find_all(['li','table'])
ps = soup.find_all('p')
for p in ps:
if not p.span:
prfs.append(p)
continue
flag=False
for c in p.contents:
if ((isinstance(c, bs4.NavigableString) or isinstance(c, unicode)) and not is_vacio(c)):
flag=True
break
if flag:
prfs.append(p)
return prfs
def eqsibling(n):
r=[]
tag=n.name
s=n.next_sibling
while s:
if (isinstance(s, bs4.NavigableString) or isinstance(s, unicode)):
if not is_vacio(s):
break
elif s.name!=tag or not eqclass(s,n):
break
r.append(s)
s=s.next_sibling
return r
def search_esv(message, verse):
"""
Search for a bible passage from the English Standard Version.
Example::
bible Romans 12:16
"""
r = await http.get("http://www.esvapi.org/v2/rest/passageQuery", params={
"key": "IP",
"passage": verse,
"output-format": "crossway-xml-1.0",
"include-simple-entities": "true",
})
doc = BeautifulSoup(r.text(), features="lxml")
if not doc.passage:
raise CommandError("Verse not found.")
lines = []
for verse_unit in doc.passage.content.find_all('verse-unit'):
num = int(verse_unit.find('verse-num').text)
woc = verse_unit.find('woc')
if woc:
text = woc.text
else:
text = "".join([str(node) for node in verse_unit.children
if isinstance(node, NavigableString) and not isinstance(node, Comment)])
lines.append("**{}** {}".format(num, text.strip()))
return "\n".join(lines)
Scraper_for_UN_indicator_descriptions.py (project: Vasco_de_data, author: KeynesYouDigIt)
def UNHDR_scrape_description():
# the final result is a dictionary keyed by indicator name, with the description as the value
may_contain_indicators=[]
clean_listed_indicators={}
urls = ['http://hdr.undp.org/en/composite/HDI',
'http://hdr.undp.org/en/composite/IHDI',
'http://hdr.undp.org/en/composite/trends',
'http://hdr.undp.org/en/composite/GDI',
'http://hdr.undp.org/en/composite/GII',
'http://hdr.undp.org/en/composite/MPI',]
for url in urls:
url_response_raw = rq.get(url)
BS = BeautifulSoup(url_response_raw.text, "lxml")
p_elements = BS.find_all('p')
p_contents = []
for e in p_elements:
p_contents.append(e)
for paragraph in p_contents:
if not isinstance(paragraph,NavigableString):
if 'Definitions' in paragraph.text:
may_contain_indicators.append(paragraph)
for paragraf in may_contain_indicators:
if ':' in paragraf.text:
with_colons_added = paragraf.get_text('::')
dub_colon_as_list = []
for i in enumerate(with_colons_added.split('::')):
dub_colon_as_list.append(i)
for i,string in dub_colon_as_list:
if ': ' in string:
indicator_name_full=str(unicodedata.normalize('NFKD',dub_colon_as_list[i-1][1]).encode('ascii', 'ignore')).strip('\n')
indicator_name_abridged=indicator_name_full[:indicator_name_full.find(':')]
description=str(unicodedata.normalize('NFKD',dub_colon_as_list[i][1]).encode('ascii', 'ignore')).strip('\n')
if i+1<len(dub_colon_as_list) and 'http' in dub_colon_as_list[i+1][1]:
details_link=dub_colon_as_list[i+1][1]
else:
details_link ='no further link provided for this indicator'
print 'adding %s %s %s' % (indicator_name_abridged, description, details_link)
clean_listed_indicators[indicator_name_abridged]=[description,details_link]
return clean_listed_indicators
def get_summary(self, host_soup):
''' host report -> section 1: host summary, return list '''
result = []
condition = (u'IP??', u'????')
p = host_soup.find('tr', class_='even').parent
for i in p.contents:
if type(i) is NavigableString:
continue
elif i.th.string in condition:
result.append(i.td.string)
if len(result) < 2:
result.append(None)
return result
def checklistInSoupToENML(soup):
'''
Transforms github style checklists `* [ ]` in the BeautifulSoup tree to
enml.
'''
checktodo_re = re.compile(r'\[(.)\]')
# To be more GitHub compatible, if every element in a list begins with `[ ]`,
# transform the list into standalone <en-todo> Evernote elements
for ul in soup.find_all('ul'):
tasks = []; istodo = True
for li in ul.find_all('li'):
task = soup.new_tag('div')
todo_tag = soup.new_tag('en-todo')
reg = checktodo_re.match(li.get_text())
istodo = istodo and reg
character = reg.group(1) if reg else None
if character == "x": todo_tag['checked']="true"
task.append(todo_tag)
if reg: task.append(NavigableString(li.get_text()[3:].strip()))
tasks.append(task)
if istodo:
for task in tasks: ul.insert_after(task)
ul.extract()
# For the rest of elements just replace `[ ]` with the appropriate element
for todo in soup.find_all(text=checktodo_re):
str_re = re.match(r'(.*)\[(.)\](.*)',todo)
pre = str_re.group(1)
post = str_re.group(3)
todo_tag = soup.new_tag('en-todo')
if str_re.group(2) == "x": todo_tag['checked']="true"
todo.replace_with(todo_tag)
todo_tag.insert_before(pre)
todo_tag.insert_after(post)
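A usage sketch for the ENML direction with an invented task list; the function mutates the soup in place, and it needs re and NavigableString in scope.
import re
from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup('<ul><li>[x] write tests</li><li>[ ] ship it</li></ul>', 'html.parser')
checklistInSoupToENML(soup)
print(soup)  # the <ul> is replaced by <div><en-todo .../>...</div> blocks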
def check_all_pages(target=None):
"""Reads all pages for a target and checks them for style."""
target = dactyl_build.get_target(target)
pages = dactyl_build.get_pages(target)
pp_env = dactyl_build.setup_pp_env()
print("Style Checker - checking all pages in target %s" % target["name"])
style_issues = []
for page in pages:
if "md" not in page:
# Not a doc page, move on
continue
logger.info("Checking page %s" % page["name"])
page_issues = []
html = dactyl_build.parse_markdown(page, pages=pages, target=target)
soup = BeautifulSoup(html, "html.parser")
overrides = get_overrides(soup)
content_elements = ["p","li","a","em","strong","th","td",
"h1","h2","h3","h4","h5","h6"]
for el in soup.descendants:
if (type(el) == NavigableString and
el.parent.name in content_elements and
str(el).strip()):
passage = str(el).strip()
passage_issues = check_passage(passage, overrides)
if passage_issues:
page_issues += passage_issues
#print("'%s' (%s)" % (el, el.parent.name))
# for el in soup.find_all(content_elements):
# for passage in el.stripped_strings:
# passage_issues = check_passage(passage, overrides)
# if passage_issues:
# page_issues += passage_issues
if page_issues:
style_issues.append( (page["name"], page_issues) )
return style_issues
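A hypothetical driver for the style checker; the target name is invented and a dactyl configuration is assumed to be available on disk.
issues = check_all_pages(target="default")
for page_name, page_issues in issues:
    print("%s: %d style issue(s)" % (page_name, len(page_issues)))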