def parser_day_bangumi(soup):
    """Extract bangumi entries from one day's list of <li> elements.

    Fixes over the original: removed the dead no-op statement
    ``soup.find('li', )`` inside the loop, and the loop variable no
    longer shadows the ``soup`` parameter.

    :param soup: container tag holding one <li> per bangumi
    :type soup: bs4.Tag
    :return: one dict per bangumi with name, keyword (id) and cover URL
    :rtype: list[dict]
    """
    result = []
    for item in soup.find_all('li'):
        anchor = item.select_one('a')
        span = item.find('span')
        if anchor:
            name = anchor['title']
            url = anchor['href']
            assert isinstance(url, str)
            # the bangumi id is the last path segment of the detail URL
            bangumi_id = url.split('/')[-1]
            result.append({'name': name, 'keyword': bangumi_id, 'cover': span['data-src']})
    return result
# Collected example snippets using the bs4 Tag() class
def process_post_bodies(bodies: List[Tag]) -> (str, list):
    """Generator: for each run of inline content in a post body, yield (text, cite_names).

    Flattens every non-<div>/non-<p> child into plain text; a <div> or <p>
    terminates the current run and triggers a yield.  ``cite_names`` is the
    list of ``name`` attributes of all ``div.cite`` elements in the body.
    NOTE(review): the annotation reads ``(str, list)`` but this is in fact a
    generator of such tuples.
    """
    for body in bodies:
        cites = list()
        # cites are marked up as <div class="cite" name="...">
        cited = body.findAll('div', {'class': 'cite'})
        if cited:
            cites = [c['name'] for c in cited]
        collect_text = []
        for tag in body:
            # TODO: This is a suboptimal(and partially wrong) solution to parse cites in post body (a lot to improve here)
            if tag.name not in ('div', 'p'):
                if hasattr(tag, 'text'):
                    # element node: take its rendered text
                    collect_text.append(tag.text)
                elif isinstance(tag, NavigableString):
                    collect_text.append(str(tag))
                else:
                    collect_text.append('\n')
            else:
                # a <div>/<p> ends the current run of inline text
                yield ''.join(collect_text), cites
def append_to(parent, tag, **kwargs):
    """
    Create a new element and append it to the supplied parent.

    :param parent: Parent to append to.
    :param tag: Name of the tag to create.
    :param kwargs: Attributes for the new tag.
    :return: New element.
    """
    # Prefer a soup reference cached on the parent (we set one on every
    # element created here); otherwise walk up to the document root.
    if hasattr(parent, "soup"):
        soup = parent.soup
    else:
        soup = parent.find_parent("html")
    # Create Tag explicitly instead of using new_tag, otherwise attribute "name" leads to clash with tag-name in bs4
    new_tag = bs4.Tag(builder=soup.builder, name=tag, attrs=kwargs)
    new_tag.soup = soup
    parent.append(new_tag)
    return new_tag
def read_component(thing):
    """Recursively render a bs4 node as Markdown-style text.

    <em>/<strong>/<u> map to */**/__ wrappers, a line-through style maps
    to ~~, footnote references are dropped, and anything else recurses
    into the next element.  Non-Tag input is returned unchanged.
    """
    if not isinstance(thing, Tag):
        return thing
    markers = {'em': '*', 'strong': '**', 'u': '__'}
    wrap = markers.get(thing.name)
    if wrap is not None:
        return wrap + read_component(thing.next_element) + wrap
    if thing.attrs.get("style") == "text-decoration: line-through;":
        return "~~" + read_component(thing.next_element) + "~~"
    if thing.attrs.get("id") is not None and "footnoteref" in thing.attrs["id"]:
        # footnote reference anchors contribute no text
        return ""
    return read_component(thing.next_element)
def show_weather(cityinfo):
    """Print the 7-day forecast for the city described by *cityinfo*."""
    print(u'?????? #%s,%s# ???...' % (cityinfo.get(u'parent_name_ch'), cityinfo.get(u'city_name_ch')))
    html = api.getWeather(cityinfo.get(u'id'))
    page = BeautifulSoup(html, u'html.parser')
    # the first table.sevendays holds one row per forecast day
    forecast = page.find_all(u'table', class_=u'sevendays')[0]
    for row in forecast.children:
        # skip whitespace text nodes between rows
        if not isinstance(row, Tag):
            continue
        for css_class in (u'date', u'temp', u'desc'):
            cell_text = row.find(u'td', class_=css_class).get_text()
            # collapse all internal whitespace before printing
            print(''.join(cell_text.split()))
        print(u'=================')
def _showWeather(self, city):
    """Render the 7-day forecast for *city* into the tkinter text widget.

    :param city: dict with at least 'id', 'city_name_ch', 'parent_name_ch' keys
    """
    self.info.insert(tk.INSERT, u'?????? #%s, %s# ???...\n\n\n' % (
        city.get(u'city_name_ch'), city.get(u'parent_name_ch')))
    weather_content = self.api.getWeather(city.get(u'id'))
    soup = BeautifulSoup(weather_content, u'html.parser')
    # the first table.sevendays holds one row per forecast day
    table_tag = soup.find_all(u'table', class_=u'sevendays')[0]
    for child in table_tag.children:
        # skip whitespace/NavigableString nodes between rows
        if not isinstance(child, Tag):
            continue
        date = child.find(u'td', class_=u'date').get_text()
        temp = child.find(u'td', class_=u'temp').get_text()
        desc = child.find(u'td', class_=u'desc').get_text()
        # ''.join(x.split()) collapses all internal whitespace
        self.info.insert(tk.INSERT, ''.join(date.split()) + '\n')
        self.info.insert(tk.INSERT, ''.join(temp.split()) + '\n')
        self.info.insert(tk.INSERT, ''.join(desc.split()) + '\n')
        self.info.insert(tk.INSERT, u'=================' + '\n')
def get_movie_list(kw_movie, pageIndex=0):
    """Fetch and parse one page of movie search results.

    :param kw_movie: search keyword
    :param pageIndex: zero-based page index
    :return: Resp wrapping a PageMovie on success, Resp with errorMsg on failure
    """
    url = api_movies.format(movie=kw_movie, page_index=pageIndex)
    html = fetch_text(url)
    dom = BeautifulSoup(html, 'html.parser')
    try:
        # 1. movie items
        div_items = dom.find_all('div', 'item prel clearfix')  # type: list
        movies = [process_movie_item(div) for div in div_items]
        # 2. pagination: current index + whether a next page exists
        div_page = dom.find('div', 'pagination l clearfix')
        index, haveNext = process_page_next(div_page)
        page = PageMovie(movies, index, haveNext)
        return Resp(page)
    except Exception as e:
        # report the parse failure to the caller instead of raising
        return Resp(errorMsg=repr(e))
def process_movie_item(div_item: Tag) -> Movie:
    """Build a Movie from one search-result <div> block."""
    movie = Movie()
    # left column: poster image wrapped in the detail-page link
    pic_div = div_item.find('div', 'litpic hidden-xs hidden-sm')
    link = pic_div.findChild()
    movie.detail_url = base_url + link['href']
    poster = link.findChild()
    movie.avatar_url = poster['data-original']
    # title column: the name lives in <p><a><b>
    title_div = div_item.find('div', 'title')  # type: Tag
    movie.name = title_div.select("p a b")[0].text
    return movie
def get_MovieList(keyword: str) -> List[Movie]:
    """Search the site and return all movies matching *keyword*.

    :param keyword: search term
    :return: list of Movie objects (possibly empty)
    """
    r = requests.get(base_url + '/search?ad=1&q={0}'.format(keyword))
    dom = BeautifulSoup(r.text, 'html.parser')
    list_movie = []
    div_blocks = dom.find_all('div', class_='item prel clearfix')
    for div_block in div_blocks:  # type: Tag
        # the try lives INSIDE the loop (it used to wrap the whole loop,
        # so one malformed block silently dropped every following result);
        # Exception replaces the overly broad BaseException
        try:
            movie = get_Movie(div_block)
        except Exception:
            continue
        if movie:
            list_movie.append(movie)
    return list_movie
def get_Movie(item: Tag) -> Movie:
    """Parse one search-result block into a Movie.

    :param item: the result <div>
    :return: Movie; fields stay unset if the expected markup is missing
    """
    # construct outside the try: the original built it inside, so a failing
    # Movie() left `movie` unbound and `return movie` raised UnboundLocalError
    movie = Movie()
    try:
        a = item.select_one('div.title p a')  # type: Tag
        movie.detail_url = a['href']
        movie.name = a.findChild().text
    except Exception:
        # markup missing or changed: return the (possibly empty) Movie as-is
        pass
    return movie
def get_ZimusByMovie(url: str) -> List[Zimu]:
    """Fetch the subtitle list from a movie detail page.

    :param url: movie detail path, joined onto base_url
    :return: list of Zimu, one per table row that parses cleanly
    """
    r = requests.get(base_url + "/" + url)
    dom = BeautifulSoup(r.text, 'html.parser')
    list_zimu = []
    father = dom.select_one('body tbody')  # type: Tag
    trs = father.select('tr')  # type: List[Tag]
    for tr in trs:
        # Exception instead of BaseException: the original also swallowed
        # KeyboardInterrupt / SystemExit
        try:
            a = tr.select_one('td a')
            zimu = Zimu()
            zimu.detail_url = a['href']
            zimu.name = a['title']
        except Exception:
            # rows without a linked subtitle are skipped
            continue
        list_zimu.append(zimu)
    return list_zimu
def naver_complete_login(request, app, token):
    """Complete a Naver OAuth login: fetch the user profile and build a sociallogin.

    :param request: the current HTTP request
    :param app: social app configuration (unused here; kept for the provider API)
    :param token: OAuth token whose ``.token`` is sent as a Bearer credential
    :return: SocialLogin built from the profile response
    :raises requests.HTTPError: if the profile endpoint returns an error status
    """
    provider = providers.registry.by_id(NaverProvider.id)
    headers = {'authorization': 'Bearer {}'.format(token.token)}
    resp = requests.get(API_URL + '/nid/getUserProfile.xml', headers=headers)
    resp.raise_for_status()
    # profile comes back as XML with <result> and <response> sections
    soup = BeautifulSoup(resp.text, 'xml')
    parsed = {}
    for sub in ('result', 'response'):
        props = {}
        # flatten each child element into a name -> text mapping
        for tag in soup.find(sub):
            if isinstance(tag, Tag):
                props[tag.name] = tag.text
        parsed[sub] = props
    extra_data = parsed['response']
    login = provider.sociallogin_from_response(request, extra_data)
    return login
def parse_translation_table(self, table):
    """ Overrides GeneralParser's method.
    :param table: a Tag object. Not necessary a table; can be a div.
    :return: generator of (translation, language_name, language_code)
    """
    # each <li> is one language line: "Language: word1, word2; word3"
    for entry in table.find_all('li'):
        if not isinstance(entry, Tag):
            continue
        pieces = entry.get_text().split(':')
        if len(pieces) < 2:
            continue
        # text before ":" is the language name
        lang_name = pieces[0]
        # the language code usually sits in a superscript span, e.g. (en)
        sup = entry.find(class_="trad-sup-code")
        lang_code = sup.text.strip()[1:-1] if sup else ""
        # There are two functions that removes parentheses. Not sure which one to use.
        cleaned = remove_parenthesis(pieces[1])
        # each candidate looks like: translation <sup>(lang_code)</sup> (transliteration)
        # lang_code and transliteration may not exist
        for candidate in re.split(COMMA_OR_SEMICOLON, cleaned):
            yield (candidate.split('(')[0].strip(), lang_name.strip(), lang_code)
def parse_translation_table(self, table):
    """
    Parse the table to get translations and the languages.
    Hopefully this function will work for most editions. Override this method if needed.
    :param table: a Tag object. Not necessary a table; can be a div.
    :return: generator of (translation, language_name, language_code)
    """
    for li in table.find_all('li'):
        # skip NavigableString/other non-element children
        if not isinstance(li, Tag):
            continue
        text = li.get_text().split(':')
        # TBD: the table is not a translation table
        # OR the table is a translation table but there are some <li> without colon
        if len(text) < 2:
            continue
        # language name is before ":"
        lang_name = text[0].strip()
        # language code is in super script
        lang_code = li.find("sup")
        if lang_code:
            lang_code = remove_all_punctuation(lang_code.text).strip()
        else:
            lang_code = ""
        t = remove_parenthesis(text[1])
        trans_list = re.split(COMMA_OR_SEMICOLON, t)
        # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
        # lang_code and transliteration may not exist
        for trans in trans_list:
            # translation = trans.split('(')[0].strip()
            # split on '(' or the full-width bracket variants; keep only the headword
            translation = re.split(r'[(??]', trans)[0].strip()
            # Throw out tuples if they have '[[' (unresolved wiki markup)
            if "[[" in translation:
                continue
            yield (translation, lang_name, lang_code)
def parse_unordered_list_polish(self, ulist):
    """Yield (translation, language_name, '') tuples from a Polish-edition <ul>."""
    for item in ulist.find_all('li'):
        if not isinstance(item, Tag):
            continue
        raw = item.get_text()
        if raw == '':
            continue
        parts = raw.split(':')
        # language name precedes the colon; Polish tables carry no lang code
        language = parts[0]
        if len(parts) > 1:
            for chunk in re.split(COMMA_OR_SEMICOLON, parts[1]):
                yield (remove_parenthesis(chunk).strip(), language, '')
def parse_translation_table_russian(self, table):
    """Parse a Russian-edition translation table.

    :param table: a Tag; each <li> looks like "Language<sub>code</sub>: word, word"
    :return: generator of (translation, language_name, language_code)
    """
    for li in table.find_all('li'):
        if not isinstance(li, Tag):
            continue
        text = li.get_text().split(':')
        # language name is before ":"
        lang_name = text[0]
        lang_code = ''
        sub = li.find("sub")  # find once (the original called find twice)
        if sub:
            lang_code = sub.get_text()
            # remove the lang code from the end of the lang name.
            # Guard against an empty <sub>: lang_name[:-0] == lang_name[:0]
            # would have wiped the whole language name.
            if lang_code:
                lang_name = lang_name[:-len(lang_code)]
        if len(text) > 1:
            t = remove_parenthesis(text[1])
        else:
            t = remove_parenthesis(text[0])
        trans_list = re.split(COMMA_OR_SEMICOLON, t)
        for trans in trans_list:
            translation = trans.split('(')[0].strip()
            if not translation == '':
                yield (translation, lang_name, lang_code)
def bs_tag_to_string(bstag: Tag) -> str:
    """Serialize a tag's direct children back into a single markup string."""
    return ''.join(map(str, bstag.contents))
def search_by_keyword(self, keyword, count=None):
    """
    Return a list of dicts with at least 4 keys: download, name, title, episode.

    Example::

        [
            {
                'name': "?????????",
                'download': 'magnet:?xt=urn:btih:what ever',
                'title': "[????] ????????? ?12? MP4 720p ?",
                'episode': 12
            },
        ]

    :param keyword: search key word
    :type keyword: str
    :param count: how many pages to fetch from the website (currently unused)
    :type count: int
    :return: list of episode search results
    :rtype: list[dict]
    """
    result = []
    r = network.get(server_root + "Home/Search", params={'searchstr': keyword}).text
    s = BeautifulSoup(r, 'lxml')
    # one <tr class="js-search-results-row"> per torrent result
    td_list = s.find_all('tr', attrs={'class': 'js-search-results-row'})  # type:list[bs4.Tag]
    for tr in td_list:
        title = tr.find('a', class_='magnet-link-wrap').text
        # the third <td> holds the publish time as "YYYY/MM/DD HH:MM"
        time_string = tr.find_all('td')[2].string
        result.append({
            'download': tr.find('a', class_='magnet-link').attrs.get('data-clipboard-text', ''),
            'name': keyword,
            'title': title,
            'episode': self.parse_episode(title),
            'time': int(time.mktime(time.strptime(time_string, "%Y/%m/%d %H:%M")))
        })
    return result
# (stray Markdown fence left over from the snippet source; kept as a comment)
def author(self):
    """The author of this work."""
    # The author is kept in the byline:
    #
    #   <h3 class="byline heading">
    #     <a href="/users/[author_name]" rel="author">[author_name]</a>
    #   </h3>
    #
    byline = self._soup.find('h3', attrs={'class': 'byline'})
    anchors = [child for child in byline.contents if isinstance(child, Tag)]
    assert len(anchors) == 1
    return anchors[0].contents[0].strip()
def isTagClass(obj):
    # True when obj is a bs4 Tag (an element node), False otherwise
    # (e.g. NavigableString, plain str).
    return isinstance(obj, Tag)
def getelementlistwithlabel(tagObj, label, options=None):
    """Find all descendants of *tagObj* matching *label* filtered by *options*.

    :param tagObj: a bs4 Tag to search within
    :param label: tag name to look for
    :param options: attrs filter dict passed to find_all (default: no filter)
    :return: list of matching elements, or None if tagObj is not a Tag
    """
    # was `options={}`: a mutable default argument; None sentinel instead
    if options is None:
        options = {}
    if isinstance(tagObj, Tag):
        return list(tagObj.find_all(label, attrs=options))
    # was a Python 2 print statement; str() avoids TypeError when
    # concatenating a non-string tagObj into the message
    print('??????,??Tag?? ????:' + str(tagObj))
    return None
def gettextlistwithlabel(tagObj):
    """Return the UTF-8 encoded text content of *tagObj*.

    :param tagObj: a bs4 Tag
    :return: UTF-8 bytes of the tag's text, or None if tagObj is not a Tag
    """
    if isinstance(tagObj, Tag):
        return tagObj.get_text().encode('utf-8')
    # was a Python 2 print statement; str() avoids TypeError when
    # concatenating a non-string tagObj into the message
    print('??????,??Tag?? ????:' + str(tagObj))
    return None
def _parse_sample_tag(self, tag):
    """Extract one sample's text and title from a <pre> tag.

    Expected markup (from the surrounding checks): an <h5> section title,
    then a <div class="paragraph"> containing <h6> + the <pre> body.

    :param tag: a <pre> bs4.Tag
    :return: (textfile, "h5-title h6-title") on a match, otherwise None (implicit)
    """
    assert isinstance(tag, bs4.Tag)
    assert tag.name == 'pre'
    # heading directly before the <pre>, and the one before its parent <div>
    prv = utils.previous_sibling_tag(tag)
    pprv = tag.parent and utils.previous_sibling_tag(tag.parent)
    # NOTE(review): prv/pprv may be None here; prv.name would then raise
    # AttributeError — confirm upstream guarantees a preceding sibling.
    if prv.name == 'h6' and tag.parent.name == 'div' and tag.parent['class'] == ['paragraph'] and pprv.name == 'h5':
        log.debug('h6: %s', str(prv))
        log.debug('name.encode(): %s', prv.string.encode())
        s = tag.string or ''  # tag.string for the tag "<pre></pre>" returns None
        return utils.textfile(s.lstrip()), pprv.string + ' ' + prv.string
def previous_sibling_tag(tag):
    """Step backwards over non-Tag siblings; return the first Tag found.

    Returns whatever falsy node (None or an empty string node) stops the
    walk when no Tag sibling exists.
    """
    node = tag.previous_sibling
    while node and not isinstance(node, bs4.Tag):
        node = node.previous_sibling
    return node
def next_sibling_tag(tag):
    """Step forwards over non-Tag siblings; return the first Tag found.

    Returns whatever falsy node (None or an empty string node) stops the
    walk when no Tag sibling exists.
    """
    node = tag.next_sibling
    while node and not isinstance(node, bs4.Tag):
        node = node.next_sibling
    return node
def __init__(self, form, url):
    """Wrap an HTML <form>: collect its default submission payload.

    :param form: the bs4 <form> Tag
    :param url: URL the form submits to
    """
    assert isinstance(form, bs4.Tag)
    assert form.name == 'form'
    self.form = form
    self.url = url
    self.payload = {}
    self.files = {}
    # was `for input in ...`: shadowed the builtin input()
    for field in self.form.find_all('input'):
        log.debug('input: %s', str(field))
        # unchecked checkboxes/radios must not be submitted by default
        if field.attrs.get('type') in [ 'checkbox', 'radio' ]:
            continue
        if 'name' in field.attrs and 'value' in field.attrs:
            self.payload[field['name']] = field['value']
def _parse_sample_tag(self, tag):
    """Extract a sample from an <h2> header ("Sample input" / "Sample output").

    :param tag: an <h2> bs4.Tag
    :return: (sample text, section name) when the header matches, otherwise None (implicit)
    """
    assert isinstance(tag, bs4.Tag)
    assert tag.name == 'h2'
    name = tag.contents[0]
    # keep only the part before a colon, e.g. "Sample input: 1" -> "Sample input"
    if ':' in name:
        name = name[: name.find(':') ]
    if name in [ 'Sample input', 'Sample output' ]:
        # skip whitespace-only siblings to reach the <pre> holding the body
        nxt = tag.next_sibling
        while nxt and nxt.string.strip() == '':
            nxt = nxt.next_sibling
        # NOTE(review): nxt.string is None for tags with nested children —
        # .strip() would raise AttributeError; confirm the markup is flat.
        if nxt.name == 'pre':
            s = utils.textfile(utils.dos2unix(nxt.string.lstrip()))
        else:
            s = ''
        return s, name
def soupify(self, body):
    """Return *body* as soup: Tags pass through, strings are parsed.

    https://www.crummy.com/software/BeautifulSoup/
    docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    bs4 codebase: http://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/files
    """
    if isinstance(body, Tag):
        return body
    return BeautifulSoup(body, "html.parser")
def parse_tasks(tasks, year):
    """Build a list of task records from the rows of a results page.

    :param tasks: iterable of row nodes (non-Tag children are skipped)
    :param year: year label stored into each record
    :return: list of dicts with category/name/max/students/year keys
    """
    db = []
    for task in tasks:
        if not isinstance(task, bs4.Tag):
            continue
        task = task.td
        # bold text at the top of the cell: task name (or category name)
        base_name = task.strong.text.strip()
        # the element two siblings after <strong> distinguishes the layouts
        next_tag = task.strong.next_sibling.next_sibling.name
        if next_tag == 'span':
            # simple layout: one task, max score in <span>, results table follows
            name = base_name
            maximum = task.span.text.strip()
            results = parse_results(task.table, year)
            db.append({'category': 'common', 'name': name,
                       'max': int(maximum), 'students': results,
                       'year': year})
        else:
            # grouped layout: base_name is the category, each <font> a sub-task
            for st in task.findAll('font'):
                # only <font> nodes directly preceded by a <div> are sub-task titles
                if st.previous.name != 'div':
                    continue
                name = st.text.strip()
                category = base_name
                maximum = st.findNext('span').text.strip()
                results = parse_results(st.findNext('table'), year)
                db.append({'category': category, 'name': name,
                           'max': int(maximum), 'students': results,
                           'year': year})
    return db
def is_leaf_table(table_soup):
    """True when the node is not a Tag or contains no nested <table>."""
    if not isinstance(table_soup, Tag):
        return True
    return len(table_soup.find_all('table')) == 0