def make_nsfw_safe(text):
    """Make NSFW content safer by adding a click-to-show class to images."""
    soup = BeautifulSoup(text, "lxml")
    images = soup.find_all("img")
    for image in images:
        if image.get("class"):
            image["class"] = "%s nsfw" % " ".join(image.get("class"))
        else:
            image["class"] = "nsfw"
    result = str(soup)
    # We don't want html/body, which BeautifulSoup kindly wraps our new HTML in
    if result.startswith("<html><body>") and result.endswith("</body></html>"):
        result = result[len("<html><body>"):-len("</body></html>")]
    return result
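# Usage sketch for make_nsfw_safe above (an assumption, not part of the original
# source). It only needs BeautifulSoup, which the function already relies on; the
# call should return the same markup with "nsfw" appended to each <img> tag's
# class list and without the html/body wrapper.
def _demo_make_nsfw_safe():
    html = '<p>hello</p><img class="wide" src="cat.jpg"/>'
    print(make_nsfw_safe(html))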
def mathjax(s):
    with open("temp.log", "w") as f:
        f.write(s)
    p = Popen([app.config['mjpage'],
               '--dollars',
               '--output', "CommonHTML",
               '--fontURL',
               ("https://cdnjs.cloudflare.com/ajax/libs/"
                "mathjax/2.7.0/fonts/HTML-CSS")],
              stdout=PIPE, stdin=PIPE, stderr=PIPE)
    #filename = hashlib.sha256(s.encode('utf-8')).hexdigest()
    #with open(filename, 'w') as f:
    #    print(s, file=f)
    res = p.communicate(input=s.encode('utf-8'))
    out = res[0].decode('utf-8')
    err = res[1].decode('utf-8')
    soup = BeautifulSoup(out, 'html.parser')
    style = str(soup.style)
    body = "".join(str(s) for s in soup.body.children)
    return style, body
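# Usage sketch for mathjax() above (an assumption, not part of the original
# source). It presumes app.config['mjpage'] points at the `mjpage` CLI from the
# mathjax-node-page npm package and that Popen/PIPE are imported from subprocess.
def _demo_mathjax():
    style, body = mathjax("Euler's identity: $e^{i\\pi} + 1 = 0$")
    return "<html><head>{}</head><body>{}</body></html>".format(style, body)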
def get_best(url):
    url = 'http://www.infoarena.ro' + url
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    name = soup.find('span', {'class': 'username'}).find('a')['href'][35:]
    tests = soup.find_all('td', {'class': 'number'})
    max_ms = -1
    for test in tests:
        test = test.string
        if test.endswith('ms'):
            time = int(test.strip('ms'))
            max_ms = max(max_ms, time)
    if name not in d or max_ms < d[name][0]:
        d[name] = (max_ms, url)
        print(max_ms, name, url)
def decrypt(hash, tipo):
    global word
    try:
        if tipo == 0:
            url = BeautifulSoup(urllib.urlopen("https://md5.gromweb.com/?md5=" + hash), "html.parser")
        else:
            url = BeautifulSoup(urllib.urlopen("https://sha1.gromweb.com/?hash=" + hash), "html.parser")
        password = url.find("em", {"class": "long-content string"})
        password = re.sub(re.compile("<.*?>"), "", str(password)).strip()
        if str(password) == "None":
            print word + "\t\t\t\t[-] Password not found! :-("
        else:
            print word + "\t\t\t\t[+] Password found: " + password
    except IOError:
        decryptwl(hash, tipo)
def add_afsc_links(full_afsc_dict, reddit):
    """
    Add links from the /r/AirForce wiki index into the AFSC dictionary.

    :param full_afsc_dict: dict holding the enlisted and officer sub-dicts
    :param reddit: PRAW reddit object
    """
    # get dict of AFSC -> link on the /r/AirForce wiki
    wiki_page = reddit.subreddit("AirForce").wiki["index"]
    wiki_soup = BeautifulSoup(wiki_page.content_html, "html.parser")
    links = wiki_soup.find_all("a")
    # currently all wiki AFSCs are enlisted
    for link in links:
        # not all links have /r/AirForce/wiki/jobs, so this is more generalized,
        # using only /r/AirForce/ wiki links
        if "www.reddit.com/r/AirForce/wiki/" in link["href"]:
            AFSC_code = link["href"].split("/")[-1].upper()
            base_afsc = AFSC_code[:5]  # shaves off any prefixes
            if base_afsc in full_afsc_dict["enlisted"]:
                full_afsc_dict["enlisted"][base_afsc]["link"] = link["href"]
def process_POST_request(request):
    dict_ = urlparse.parse_qs(request.text)

    def htmlify(thing):
        try:
            html = dict_[thing][0]
        except KeyError:
            html = ''
        return '<html>' + html + '</html>'

    uri = dict_['uri'][0]
    head = htmlify('head')
    body = htmlify('body')
    try:
        text = dict_['data'][0]
    except KeyError:
        text = ''
    headsoup = BeautifulSoup(head, 'lxml')
    bodysoup = BeautifulSoup(body, 'lxml')
    target_uri = getUri(uri, headsoup, bodysoup)
    doi = getDoi(headsoup, bodysoup)
    return target_uri, doi, head, body, text
def getRosiItem():
    start = time.time()
    index = 1
    while True:
        url = "http://www.mmxyz.net/category/rosi/page/{}/".format(index)
        res = requests.get(url, timeout=10)
        if res.status_code == 404:
            print("+ Time: {:.2f} S +".format(time.time() - start))
            print("+ Total Pages: {} +".format(index - 1))
            print("+ Total Numbers: {} +".format(len(RosiItems)))
            print("+-------------------------+\r\n\r\n")
            return
        soup = BeautifulSoup(res.content, "html.parser")
        rosiList = soup.find_all("a", class_="inimg")
        for rosi in rosiList:
            RosiItems.append(rosi['href'])
        index += 1
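# Driver sketch for getRosiItem() above (an assumption, not part of the original
# source). It assumes requests, time and BeautifulSoup are already imported and
# defines the module-level RosiItems list that the crawler appends links to.
RosiItems = []

if __name__ == "__main__":
    getRosiItem()
    print("collected {} gallery links".format(len(RosiItems)))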
def hltb(bot, trigger):
    if not trigger.group(2):
        return bot.say("Enter a game name to search.")
    game = trigger.group(2)
    url = "http://howlongtobeat.com/search_main.php?page=1"
    payload = {"queryString": game, "t": "games", "sorthead": "popular",
               "sortd": "Normal Order", "length_type": "main", "detail": "0"}
    test = {'Content-type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36',
            'origin': 'https://howlongtobeat.com',
            'referer': 'https://howlongtobeat.com'}
    session = requests.Session()
    r = session.post(url, headers=test, data=payload)
    if len(r.content) < 250:
        return bot.say("No results.")
    bs = BeautifulSoup(r.content, "html.parser")
    first = bs.findAll("div", {"class": "search_list_details"})[0]
    name = first.a.text
    time = first.findAll('div')[3].text
    bot.say('{} - {}'.format(name, time))
def craw_last_index(ptt_class_name):
    # ptt_class_name = 'Soft_Job'
    index_url = 'https://www.ptt.cc/bbs/' + ptt_class_name + '/index.html'
    res = requests.get(index_url, verify=True)
    soup3 = BeautifulSoup(res.text, "lxml")
    # paging button, assumed to be the "previous page" (上頁) link
    x = soup3('', {'class': "btn wide"}, text=re.compile(u'上頁'))
    last_index = x[0]['href']
    last_index = last_index.replace('/bbs/' + ptt_class_name + '/index', '')
    last_index = int(last_index.replace('.html', '')) + 1
    return last_index
# ---------------------------------------------------------------------------------
# Meant to be scheduled with `crontab -e` on Ubuntu so the crawler runs
# periodically and fetches the latest PTT data, continuing from the last
# crawled index so data that has already been collected is not fetched again.
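# Usage sketch for craw_last_index() above (an assumption, not part of the
# original source): fetch what should be the newest index number of the PTT
# Soft_Job board and print the corresponding index-page URL.
def _demo_craw_last_index():
    last_index = craw_last_index('Soft_Job')
    print('https://www.ptt.cc/bbs/Soft_Job/index{}.html'.format(last_index))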
def addToCart(self):
    print '\nADD TO CART -----------------'
    session_get = self.user_session.get(self.URL_product_url, headers=self.get_headers)
    #print session_get.content
    soup = BeautifulSoup(session_get.content, 'lxml')
    results = soup.find_all('select', class_='size-select')
    #print results
    for item in results[0].select('option'):
        re_result = re.sub(self.sub_pattern, '', item.string)
        #print re_result
        matchObj = re.search(r"^%s+$" % self.user_size, re_result)
        if matchObj:
            self.post_data_addToCart['pid'] = item['value']
            self.post_data_addToCart['masterPID'] = item['value'].partition("_")[0]
            print self.post_data_addToCart
            break
    session_post = self.user_session.post(url=self.URL_cart_post_url, headers=self.post_headers, data=self.post_data_addToCart)
    print 'Add To Cart Status: ' + str(session_post.status_code)
def finalBoss(self):
    print '\nEntering Payment Info -----------------------------'
    self.get_headers['Referer'] = self.URL_checkout_url
    self.post_headers['Referer'] = self.URL_pay_url
    #print json.dumps(self.get_headers, indent=1)
    session_get = self.user_session.get(self.URL_pay_url, headers=self.get_headers)
    savePage(session_get, 'finalCheckout.html')
    soup = BeautifulSoup(session_get.content, 'lxml')
    pay_secure_key = soup.find('input', {'name': 'dwfrm_payment_securekey'})
    print pay_secure_key
    # NOTE: Visa, Mastercard, etc. correspond to different types. Find out how they get set.
    # NOTE: Visa = 001, Mastercard = 002, AE = 003, Discover = 004
    post_data_payInfo = {'dwfrm_payment_creditCard_type': '002',
                         'dwfrm_payment_creditCard_owner': 'Bob McFlymo',
                         'dwfrm_payment_creditCard_number': '5105105105105100',
                         'dwfrm_payment_creditCard_month': '01',
                         'dwfrm_payment_creditCard_year': '2018',
                         'dwfrm_payment_creditCard_cvn': '002',
                         'dwfrm_payment_securekey': pay_secure_key,
                         'dwfrm_payment_signcreditcardfields': 'sign'}
    #savePage(session_get, 'finalCheckout.html')
def checkItemDirect(self):
    # NOTE: this will most likely hamper performance, but in some cases may improve it;
    # leave it up to the user whether to run this before checkout.
    # Basic steps:
    #   Use BeautifulSoup to parse for <ul class="size options">
    #   Sizes are marked as follows: <li class="8 available" data-option-title="8">
    #   Therefore, match data-option-title against user_size, then check the class for the "available" keyword.
    session_get = self.user_session.get(self.URL_product)
    print 'Status of requests.get: ' + str(session_get.status_code)
    soup = BeautifulSoup(session_get.content, "lxml")
    # Check that the lxml parser works for HTML
    # Look into using SoupStrainer to improve parsing efficiency
    for li in soup.select('li[data-option-title]'):
        #print li['class']
        #print type(li['class'])
        if (self.user_size in li['class']) and ('available' in li['class']):
            print 'Size ' + self.user_size + ' Available'
def getMoreInfo(self, nzb):
    """
    Get details about a torrent.

    .. seealso:: MovieSearcher.correctRelease
    """
    data = self.getHTMLData(nzb['detail_url'])
    soup = BeautifulSoup(data, 'html.parser')
    description = soup.find(id='description')
    if description:
        nzb['description'] = description.prettify()
    line = soup.find(text='Date de publication').parent.parent
    pub = line.find_all('td')[1]
    added = datetime.strptime(pub.getText().split('(')[0].strip(),
                              '%d/%m/%Y %H:%M')
    nzb['age'] = (datetime.now() - added).days
    self.log.debug(nzb['age'])
def test_parse_html2(self):
    parser = QqParser(allowed_tags={'chapter', 'section',
                                    'subsection', 'subsubsection',
                                    'eq', 'eqref', 'ref',
                                    'equation', 'label', 'idx'})
    doc = r"""\chapter \label h1:label
Hello
This is a \ref{h1:label}.
"""
    tree = parser.parse(doc)
    html = QqHTMLFormatter(tree)
    s = html.do_format()
    soup = BeautifulSoup(s, 'html.parser')
    self.assertEqual(soup.h1['id'], 'label_h1_label')
    self.assertEqual(soup.span['class'], ['section__number'])
    self.assertEqual(soup.span.string, "1")
    self.assertEqual(soup("a")[1].attrs, {'class': ['a-ref'], 'title': '', 'href': '#label_h1_label'})
    self.assertEqual(soup("a")[1].string, "1")
def test_parse_html3(self):
    parser = QqParser(allowed_tags={'h1', 'h2', 'h3', 'h4', 'eq', 'eqref', 'ref', 'equation', 'label', 'idx'})
    doc = r"""\equation \label eq:x2y2
x^2 + y^2 = z^2
See \ref{eq:x2y2}.
"""
    tree = parser.parse(doc)
    html = QqHTMLFormatter(tree)
    html.counters['equation'].showparents = False
    s = html.do_format()
    soup = BeautifulSoup(s, 'html.parser')
    self.assertEqual(soup.div.attrs, {'id': "label_eq_x2y2", 'class': ["latex_equation"]})
    self.assertEqual(soup.span['class'], ['ref'])
    self.assertEqual(soup.a['class'], ['a-ref'])
    self.assertEqual(soup.a['href'], '#mjx-eqn-1')
    self.assertEqual(soup.a.string, "(1)")
def test_refs_with_separator(self):
    doc = r"""\chapter Hello \label sec:first
\chapter World \label sec:other
See
\ref[section][sec:first] and \ref[section][sec:other] for details.
"""
    parser = QqParser()
    formatter = QqHTMLFormatter()
    parser.allowed_tags.update(formatter.uses_tags())
    tree = parser.parse(doc)
    formatter.root = tree
    print(tree.as_list())
    html = formatter.do_format()
    soup = BeautifulSoup(html, "html.parser")
    self.assertEqual(soup("a")[2].contents[0], "section 1")
def test_missing_label(self):
    doc = r"""\chapter Hello \label sec:first
\chapter World \label sec:other
See
\ref[section][sec:third] and \ref[zection][sec:another] for details.
"""
    parser = QqParser()
    formatter = QqHTMLFormatter()
    parser.allowed_tags.update(formatter.uses_tags())
    tree = parser.parse(doc)
    formatter.root = tree
    print(tree.as_list())
    html = formatter.do_format()
    soup = BeautifulSoup(html, "html.parser")
    self.assertEqual(soup("a")[2].contents[0], "section ???")
    self.assertEqual(soup("a")[3].contents[0], "zection ???")
def getpixivfollow():
    """Collect followed users from the pixiv bookmark pages."""
    users = ['1789300']
    page = 1
    userlist = {}
    bookmark_url = u'https://www.pixiv.net/bookmark.php'
    while len(users) > 0:
        page_params = (
            ('type', 'user'),
            ('rest', 'show'),
            ('p', str(page)))
        bookmark_page = PIXIV_SESSION.get(
            bookmark_url, params=page_params, proxies=PROXY).text
        bookmark_content = BeautifulSoup(bookmark_page, 'lxml')
        print(u'Get Pixiv bookmark page {0} ...'.format(page))
        users = bookmark_content.select("div[class=usericon]")
        if len(users) == 0:
            break
        for user in users:
            user_info = user.find('a', attrs={'class': 'ui-profile-popup'})
            user_name = user_info.attrs['data-user_name']
            user_id = user_info.attrs['data-user_id']
            userlist[user_id] = user_name
        page += 1
    return userlist
def pixiv2pawoo(pixivid):
    """Pixiv -> Pawoo."""
    pawoourl = u'https://pawoo.net/oauth_authentications/{0}?provider=pixiv'
    pawoolink = pawoourl.format(pixivid)
    pawoopage = PAWOO_SESSION.get(pawoolink, proxies=PROXY)
    if pawoopage.status_code == 200:
        pawooname = pawoopage.headers.get('link').split(';')[0]
        pawooname = pawooname.replace(
            '<https://pawoo.net/.well-known/webfinger?resource=acct%3A', '')
        pawooname = pawooname.replace('%40pawoo.net>', '')
        csrf_token = BeautifulSoup(pawoopage.text, 'lxml')
        csrf_token = csrf_token.select(
            "meta[name=csrf-token]")[0].attrs.get('content')
        with open('pawoolist.txt', 'a', encoding='utf-8-sig') as pawoofile:
            pawoofile.write(
                '{1},https://pawoo.net/@{0}\n'.format(pawooname, pixivid))
        followpawoo(pawooname, csrf_token)
        return 1
    else:
        return 0
def get_book(url):
    """Download the course's PDF books."""
    # collect the book-shelf pages first
    print('Fetching the list of books ...')
    nav_page = CONNECTION.get(url).text
    shelves = set(re.findall(r'/courses/.+/pdfbook/\d/', nav_page))
    for shelf_count, shelf in enumerate(shelves, 1):
        res = CONNECTION.get(BASE_URL + shelf).text
        soup = BeautifulSoup(res, 'lxml')
        save_dir = os.path.join(BASE_DIR, 'Books', str(shelf_count))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        for book_count, book in enumerate(soup.select('#booknav a'), 1):
            print('------>', book.string)
            file_name = REG_FILE.sub(' ', book.string) + '.pdf'
            pdf = CONNECTION.get(BASE_URL + book['rel'][0]).content
            with open(os.path.join(save_dir, file_name), 'wb') as pdf_file:
                pdf_file.write(pdf)
def read_captcha():
    header = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Host': 'login.weibo.cn'
    }
    url_login = 'http://login.weibo.cn/login/'
    html = requests.get(url_login, headers=header).content  # fetch the login page source
    soup = BeautifulSoup(html, 'lxml')
    code_img = str(soup.find('img'))[24:-3]  # extract the captcha image URL
    print(code_img)
    urlretrieve(code_img, r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif')
    show_img(r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif')
    remove_line(r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif',
                r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha/')
    pic_cut('captcha_removeline.gif', 'E:/????/??????/1 ???/captcha_master1/captcha_master/main_captcha/',
            'E:/????/??????/1 ???/captcha_master1/captcha_master/word/')
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
    for indexi, content in enumerate(response.xpath('//td[@class="t_f"]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        if soup.find('div', class_='attach_nopermission') is not None:
            soup.find('div', class_='attach_nopermission').clear()
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_unicode(soup.get_text())
        comments_data.append({'content': c, 'reply_time': self.format_rep_date(rep_time_list[indexi])})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        if indexi >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[indexi])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = re.findall(u'\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}', response.body)
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        if indexi >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[indexi])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def article_detail(aitem, response):
    for a_content in response.xpath('//script').extract():
        if a_content.find("detailArticle|post") == -1:
            continue
        a_content = a_content.split("props=")[1]
        a_content = a_content.split(",location")[0]
        a_content = json.loads(a_content).get("detailArticle|post")
        aitem.content = BeautifulSoup(a_content.get("content"), 'lxml').get_text()
        aitem.time = a_content.get('published_at')
        aitem.last_reply_time = aitem.time
        aitem.views = a_content.get('counters').get('view_count')
        aitem.replies = a_content.get('counters').get('comment')
        aitem.author = a_content.get('user').get('name')
        aitem.title = a_content.get('title')
        category_tags = json.loads(a_content.get('extraction_tags'))
        category = ''
        for category_tag in category_tags:
            category += category_tag[0] + ' '
        aitem.category = category
    return aitem
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//span[@class="time"]').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]/table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        if soup.find('div', class_='attach_nopermission') is not None:
            soup.find('div', class_='attach_nopermission').clear()
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_unicode(soup.get_text())
        comments_data.append({'content': c, 'reply_time': self.format_rep_date(rep_time_list[indexi])})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response, is_first=False):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//span[@class="date"]/text()').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="replycontent"]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        time_index = indexi
        if is_first:
            time_index += 1
        if time_index >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[time_index])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        if indexi >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[indexi])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response, is_first=False):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]/em').extract()
    if len(rep_time_list) == 0:
        return comment
    for indexi, content in enumerate(response.xpath('//div[@class="pct"]//table[1]').extract()):
        if is_first and indexi == 0:
            continue
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        time_index = indexi
        if time_index >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[time_index])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def download_lyrics(artist, url):
    print url
    time.sleep(random() + 2)
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')
    # Get the song title
    song_title = soup.find('title').get_text().split(' - ')[1].lower().replace('/', ' ').replace(' ', '_')
    # Get the lyrics div
    lyrics = soup.findAll('div', {'class': ''})
    for i in lyrics:
        lyrics = i.get_text().strip()
        if len(lyrics) > 10:
            with open('artists/' + artist + '/' + song_title + '.txt', 'wb') as w:
                cleaned_lyrics = lyrics.replace('\r\n', ' *BREAK* ').replace('\n', ' *BREAK* ').replace('  ', ' ')
                w.write(cleaned_lyrics.encode('utf-8'))