def ParseHtml(self, html):
soup = BeautifulSoup(html)
links = soup.findAll('a', attrs={'class': 'ulink'})
#print len(links)
if len(links) == 0: #the js return
# tmp_js = soup.find(name='script', attrs={'language': 'javascript'})
js_str = soup.script.string #two ways to get the <script></script>
new_url = js_str[16:-1] #get the new url
        new_url = eval(new_url) #eval turns the quoted JS string literal into a plain url string
self.ParseHtml(self.LoadPage(new_url))
else:
# print type(links)
for link in links:
# print type(link)
# print type(link.string)
# print unicode(link.string)
            titles = re.findall(ur'《(.+?)》', unicode(link.string)) # 《》 delimiters assumed; the originals were garbled in the source
            if len(titles) != 0:
print titles[0]
# print 'url is %s, title is %s.' %(link['href'], titles[0])
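# A minimal, hedged sketch of the slice-and-eval step used in ParseHtml above, assuming the
# page's redirect script has the form window.location="...";. The [16:-1] slice drops
# 'window.location=' and the trailing ';', and eval() strips the remaining quotes. The url
# below is purely illustrative.
def _demo_js_redirect():
    js_str = 'window.location="/html/gndy/dyzz/index.html";'
    quoted = js_str[16:-1]   # '"/html/gndy/dyzz/index.html"'
    new_url = eval(quoted)   # eval of a quoted literal yields the bare string
    print(new_url)           # -> /html/gndy/dyzz/index.html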
def GetTotalPage(self, html):
# create the BeautifulSoup
some_soup = BeautifulSoup(html)
#get the page div
ele_a = some_soup.find('div', attrs={'class': 'page'})
    #get the last <a> inside the div, i.e. the last-page link
last_a = ele_a.findAll('a')[-1]
    #strip the trailing '.html' to get the page number
pagenum = last_a.get('href')[:-5]
print 'pagenum :', pagenum
# print type(last_a)
self.SaveTotalPageToFile(pagenum)
# store the max page number to totalpage.ini
#new_page_num: new max page num
def list_of_all_href(self,html):
    '''
    Return all download hyperlinks found on the mr-jatt page, as [link, name] pairs.
    '''
soup=BeautifulSoup(html)
links=[]
a_list=soup.findAll('a','touch')
for x in xrange(len(a_list)-1):
link = a_list[x].get('href')
name = a_list[x]
name = str(name)
name=re.sub(r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>','',name)
name=re.sub(r'^[0-9]+\.','',name)
links.append([link,name])
#quit()
return links
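# A self-contained, hedged illustration of the tag-stripping regexes used in list_of_all_href
# above. The sample anchor markup is invented purely for demonstration.
def _demo_strip_name():
    import re
    name = '<a href="/song-123.html"><span class="c">01.</span>Some Song Title</a>'
    name = re.sub(r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>', '', name)
    name = re.sub(r'^[0-9]+\.', '', name)
    print(name)  # -> Some Song Title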
def crawler(urls, max_urls):
crawled = Set()
queued = Set(urls)
pairs = []
while urls and len(crawled) < max_urls:
page=urls.pop(0)
if is_html(page):
if page not in crawled:
try:
print(page)
links=BeautifulSoup(urllib2.urlopen(page,timeout=5).read(), parseOnlyThese=SoupStrainer('a'))
for link in links:
url = domain + link['href']
if verify(url) and url not in queued:
# print(url)
urls.append('http://' +url)
# print(urls)
queued.add('http://' +url)
# print(page)
crawled.add(page)
# print(crawled)
except:
continue
return crawled,pairs
def _extract_description(self, result):
desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
if not desc_div:
self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
return None
desc_strs = []
def looper(tag):
if not tag: return
for t in tag:
try:
if t.name == 'br': break
except AttributeError:
pass
try:
desc_strs.append(t.string)
except AttributeError:
desc_strs.append(t)
looper(desc_div)
looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>
desc = ''.join(s for s in desc_strs if s)
return self._html_unescape(desc)
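# A standalone, hedged illustration of the looper() pattern above: walk a tag's children and
# collect their text until the first <br>, which is where the snippet text ends. It assumes
# the same BeautifulSoup class already imported by this module; the markup is invented.
def _demo_collect_until_br():
    result = BeautifulSoup('<div class="s">first part<b>bold bit</b><br/>ignored tail</div>')
    parts = []
    for t in result.div:
        if getattr(t, 'name', None) == 'br':
            break
        parts.append(t.string if hasattr(t, 'string') else t)
    print(''.join(s for s in parts if s))  # -> first partbold bit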
def _extract_description(self, result):
desc_td = result.findNext('td')
if not desc_td:
self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
return None
desc_strs = []
def looper(tag):
if not tag: return
for t in tag:
try:
if t.name == 'br': break
except AttributeError:
pass
try:
desc_strs.append(t.string)
except AttributeError:
desc_strs.append(t)
looper(desc_td)
looper(desc_td.find('wbr')) # BeautifulSoup does not self-close <wbr>
desc = ''.join(s for s in desc_strs if s)
return self._html_unescape(desc)
def _get_results_page(self, set_type):
if set_type == LARGE_SET:
url = GoogleSets.URL_LARGE
else:
url = GoogleSets.URL_SMALL
safe_items = [urllib.quote_plus(i) for i in self.items]
blank_items = 5 - len(safe_items)
if blank_items > 0:
safe_items += ['']*blank_items
safe_url = url % tuple(safe_items)
try:
page = self.browser.get_page(safe_url)
except BrowserError, e:
raise GSError, "Failed getting %s: %s" % (e.url, e.error)
return BeautifulSoup(page)
def _get_results_page(self):
if self._page == 0:
if self._results_per_page == 10:
url = SponsoredLinks.SEARCH_URL_0
else:
url = SponsoredLinks.SEARCH_URL_1
else:
if self._results_per_page == 10:
url = SponsoredLinks.NEXT_PAGE_0
else:
url = SponsoredLinks.NEXT_PAGE_1
safe_url = url % { 'query': urllib.quote_plus(self.query),
'start': self._page * self._results_per_page,
'num': self._results_per_page }
try:
page = self.browser.get_page(safe_url)
except BrowserError, e:
raise SLError, "Failed getting %s: %s" % (e.url, e.error)
return BeautifulSoup(page)
def get_content(cls, url=None, session=None):
"""
    @brief: fetch the page at the given url and return the set of hyperlinks on it, together with the parsed soup
"""
hyperlinks = set()
soup_context = None
    # fetch the raw html of the page first
html_context = cls.parse_page(url, session)
if html_context:
soup_context = BeautifulSoup.BeautifulSoup(html_context)
if soup_context:
for each_link in soup_context.findAll('a'):
hyperlink = urlparse.urljoin(url, (each_link or {}).get('href'))
hyperlinks.add(hyperlink)
return hyperlinks, soup_context
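# A hedged, self-contained sketch of the link-aggregation step inside get_content: relative
# hrefs are resolved against the page url with urlparse.urljoin and gathered into a set. It
# assumes the same module-style BeautifulSoup import used above; the markup is invented.
def _demo_collect_links():
    import urlparse
    url = 'http://example.com/a/index.html'
    soup_context = BeautifulSoup.BeautifulSoup('<a href="page1.html"></a><a href="/top.html"></a>')
    hyperlinks = set()
    for each_link in soup_context.findAll('a'):
        hyperlinks.add(urlparse.urljoin(url, each_link.get('href')))
    print(hyperlinks)  # both hrefs come back as absolute urls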
def make_soup(markup, parser=None):
"""Factory method returning a BeautifulSoup instance. The created
instance will use a parser of the given name, if supported by
the underlying BeautifulSoup instance.
"""
if 'bs4' in sys.modules:
# We support parser specification. If the caller didn't
# specify one, leave it to BeautifulSoup to pick the most
# suitable one, but suppress the user warning that asks to
# select the most suitable parser ... which BS then
# selects anyway.
if parser is None:
warnings.filterwarnings('ignore', 'No parser was explicitly specified')
return BeautifulSoup(markup, parser)
return BeautifulSoup(markup)
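# Hedged usage sketch for make_soup above: with bs4 installed a named parser can be requested,
# otherwise the argument is simply not used. 'html.parser' is the stdlib parser bundled with bs4.
def _demo_make_soup():
    soup_default = make_soup('<p>hello</p>')                # let BeautifulSoup pick a parser
    soup_named = make_soup('<p>hello</p>', 'html.parser')   # ask for a specific parser (bs4 only)
    print(soup_default.p.string)  # -> hello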
def get_member_attributes(self):
""" Returns a dictionary of a balancer member's attributes."""
balancer_member_page = fetch_url(self.module, self.management_url)
try:
assert balancer_member_page[1]['status'] == 200
except AssertionError:
        self.module.fail_json(msg="Could not get balancer_member_page, check for connectivity! " + str(balancer_member_page[1]))
else:
try:
soup = BeautifulSoup(balancer_member_page[0])
except TypeError:
            self.module.fail_json(msg="Cannot parse balancer_member_page HTML! " + str(balancer_member_page[0]))
else:
subsoup = soup.findAll('table')[1].findAll('tr')
keys = subsoup[0].findAll('th')
for valuesset in subsoup[1::1]:
if re.search(pattern=self.host, string=str(valuesset)):
values = valuesset.findAll('td')
return dict((keys[x].string, values[x].string) for x in range(0, len(keys)))
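# A self-contained, hedged illustration of the header/value zipping at the end of
# get_member_attributes: the <th> texts of the first row become keys and a member row's <td>
# texts become values. It assumes the same BeautifulSoup class used above; the table is invented.
def _demo_zip_balancer_row():
    soup = BeautifulSoup('<table><tr><th>Worker URL</th><th>Status</th></tr>'
                         '<tr><td>http://10.0.0.2:80</td><td>Ok</td></tr></table>')
    rows = soup.findAll('tr')
    keys = rows[0].findAll('th')
    values = rows[1].findAll('td')
    print(dict((keys[x].string, values[x].string) for x in range(0, len(keys))))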
def get_categories():
url = "http://sexyhotplay.com.br/categorias/"
html = client.request(url, headers={'Cookie': 'disclaimer-sexyhotplay=1;'})
soup = bs(html)
div = soup.find('div', attrs={'class': 'colunas-3-15'})
links = div.findAll('a', attrs={'class': 'link'}, recursive=True)
results = []
for link in links:
label = link.find('strong').string
url = 'http://sexyhotplay.com.br' + link['href']
results.append({
'name': label,
# 'clearlogo': os.path.join(artPath, 'logo_sexyhot.png'),
'url': url
})
return results
def f_grab_cmd_from_twitter_profile(profile_name):
"""grab 0xXXXXXXXX tag from profile, tag must match [a-zA-Z0-9_]
:rtype: string
:param profile_name: twitter profile name without leading @
:return: string embedded in the profile description
"""
url = 'https://twitter.com/%(profile)s'
payload = {
'profile': profile_name
}
html = requests.get(url % payload)
soup = soupy(html.text)
profile_description = soup.find('meta', {'name': 'description'})['content']
    match = re.search(r'0x(\w+)', profile_description)
output = match.group(1) # group 1 consists of match between ( )
return str(output)
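# A minimal, self-contained illustration of the 0x tag extraction above: only the
# [a-zA-Z0-9_] token following '0x' is captured. The description text is invented.
def _demo_grab_cmd():
    import re
    profile_description = 'Just another account. 0xdeadbeef_cafe1 | opinions are my own'
    match = re.search(r'0x(\w+)', profile_description)
    print(match.group(1))  # -> deadbeef_cafe1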
def decrypt(self, pyfile):
self.pyfile = pyfile
if self.article.match(pyfile.url):
html = self.load(pyfile.url)
soup = BeautifulSoup.BeautifulSoup(
html, convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)
links = []
for a in soup.findAll("a", attrs={'href': self.hoster_links}):
for decrypted_link in self.decrypt_folder(a.get('href')):
links.append(decrypted_link)
self.packages.append((pyfile.name, links, pyfile.name))
else:
self.links = self.decrypt_folder(pyfile.url)
def search(key_word):
global x
    search_url='http://news.sogou.com/news?ie=utf8&p=40230447&interV=kKIOkrELjboMmLkEkLoTkKIMkLELjb8TkKIMkrELjboImLkEk74TkKILmrELjbgRmLkEkLY=_485898072&query=key_word&' # 'key_word' is the placeholder substituted below
    req=urllib2.urlopen(search_url.replace('key_word',key_word))
real_visited=0
html=req.read()
soup=BeautifulSoup(html)
#print soup
content = soup.findAll(name="a",attrs={"href":True,"data-click":True,"target":True}) #resultset object
num = len(content)
#print num
for i in range(9):
        # pull the title and the url out of each search result
p_str= content[2*i] #if no result then nontype object
tit[i]=p_str.renderContents()
tit[i]=tit[i].decode('utf-8', 'ignore')#need it
tit[i]= re.sub("<[^>]+>","",tit[i])
print(tit[i])
url[i]=str(p_str.get("href"))
print(url[i])
        # fetch and resize the image linked from this result url
img[i]=getimg(url[i])
w, h = img[i].size
img[i]=resize(w,h, w_box, h_box,img[i])
def ProxyIPSpider(self):
# get the proxy
f = open('proxy.txt', 'w')
for page in range(1,50):
url = 'http://www.xicidaili.com/nn/%s' %page
user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
request = urllib2.Request(url)
request.add_header("User-Agent", user_agent)
content = urllib2.urlopen(request)
soup = BeautifulSoup(content)
trs = soup.find('table', {"id":"ip_list"}).findAll('tr')
for tr in trs[1:]:
tds = tr.findAll('td')
ip = tds[2].text.strip()
port = tds[3].text.strip()
protocol = tds[6].text.strip()
if protocol == 'HTTP' or protocol == 'HTTPS':
f.write('%s=%s:%s\n' % (protocol, ip, port))
print '%s://%s:%s' % (protocol, ip, port)
def caiji2(self): # collect domestic proxy IPs
    # crawl the proxy list pages on haodailiip.com and write each proxy to proxy.txt
of = open('proxy.txt', 'w')
url = 'http://www.haodailiip.com/guonei/'
for i in range(1,20):
Url = 'http://www.haodailiip.com/guonei/' + str(i)
        print u"Fetching " + Url
html = requests.get(Url).text
bs = BeautifulSoup(html)
table = bs.find('table',{"class":"proxy_table"})
tr = table.findAll('tr')
for i in range(1,31):
td = tr[i].findAll('td')
proxy_ip = td[0].text.strip()
proxy_port = td[1].text.strip()
of.write('http=%s:%s\n' %(proxy_ip,proxy_port))
print 'http=%s:%s\n' %(proxy_ip,proxy_port)
time.sleep(2)
    of.close()
def DuckDuckGo(cmd):
debug(cmd.text)
q = cmd.text.split()
if len(q) == 1:
return
question = "+".join(q[1:])
debug("Question=%s" % question)
req = requests.get("https://duckduckgo.com/html/?q=%s" % question)
answer = None
html = bp.BeautifulSoup(req.text)
responses = html.findAll("div", id="zero_click_abstract")
try:
answer = responses[0].text
except Exception as e:
print e # get internal
pass
if not answer:
bot.reply_to(cmd, "Não tenho a menor idéia. Tem de perguntar no google.")
return
try:
bot.reply_to(cmd, answer)
except Exception as e:
bot.reply_to(cmd, "Deu merda: %s" % e)
def _update_cache(release):
LOG.debug('Updating cache for the release "%s"', release)
url = BASE_URL % release
html_page = urllib.request.urlopen(url)
soup = BeautifulSoup(html_page)
specs = {}
for link in soup.findAll('a', attrs={'href': re.compile('.html$')}):
href = link.get('href')
title = ' '.join(href.replace('.html', '').split('-'))
link = url + href
specs[title] = link
_CACHE[release] = {}
_CACHE[release]['specs'] = specs
_CACHE[release]['updated_at'] = datetime.datetime.utcnow()
LOG.info('Cache updated for the release "%s"', release)
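# A tiny, self-contained illustration of the title derivation in _update_cache: a spec's html
# filename is turned into a space-separated title. The filename is a made-up example.
def _demo_spec_title():
    href = 'add-volume-type-description.html'
    title = ' '.join(href.replace('.html', '').split('-'))
    print(title)  # -> add volume type description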
def update_planet_fleet(self, planet):
resp = self.br.open(self._get_url('fleet', planet))
soup = BeautifulSoup(resp)
ships = {}
for k, v in self.SHIPS.iteritems():
available = 0
try:
s = soup.find(id='button' + v)
available = int(s.find('span', 'textlabel').nextSibling.replace('.', ''))
except:
available = 0
ships[k] = available
#self.logger.info('Updating %s fleet' % planet)
#self.logger.info('%s' % fleet)
planet.ships = ships
def update_planet_research(self, planet):
resp = self.br.open(self._get_url('research', planet))
soup = BeautifulSoup(resp)
try:
ButtonList = soup.find(id='buttonz')
AllResearchList = ButtonList.findAll('li')
for research in AllResearchList:
if research.get('class') == 'on':
fb = research.find('a', 'fastBuild')
if fb:
build_url = fb.get('onclick') if fb else ''
build_url = self._parse_research_url(build_url)
self.logger.info('Research launched on %s:%s'% (planet, fb.get('title')))
self.br.open(build_url)
break
except:
self.logger.exception('Exception while retrieving researches')
def update_planet_facilities(self, planet):
resp = self.br.open(self._get_url('station', planet))
soup = BeautifulSoup(resp)
try:
ButtonList = soup.find(id='stationbuilding')
AllResearchList = ButtonList.findAll('li')
for research in AllResearchList:
if research.get('class') == 'on':
fb = research.find('a', 'fastBuild')
if fb:
build_url = fb.get('onclick') if fb else ''
build_url = self._parse_research_url(build_url)
self.logger.info('Facility upgraded on %s:%s'% (planet, fb.get('title')))
self.br.open(build_url)
break
except:
self.logger.exception('Exception while retrieving facilities statuses')
return True
def getFirstPostData(forum_text):
soup = BeautifulSoup(forum_text)
title = ""
date = ""
body = ""
try:
date = soup.find("div", attrs={"class": "postDate"}).text
except AttributeError:
print "Date not found"
try:
title = soup.find("div", attrs={"class": "postTitle"}).text
except AttributeError:
print "Title not found"
try:
body = soup.find("div", attrs={"class": "postBody"}).text
except AttributeError:
print "Body not found, now this is weird"
return [title,date,body]
def parse(self, html):
"""
This method initiates parsing of HTML content, cleans resulting
content as needed, and notifies the parser instance of
resulting instances via the handle_article callback.
"""
self.soup = BeautifulSoup(html, "html.parser")
# This parses any global, non-itemized attributes from the page.
self._parse_globals()
# Now parse out listed articles:
for div in self.soup.findAll(ScholarArticleParser._tag_results_checker):
self._parse_article(div)
self._clean_article()
if self.article['title']:
self.handle_article(self.article)
def get_data(self, search_query):
'''helper method to get data from google images by scraping and parsing'''
params = {"site": "imghp", "tbm": "isch", "tbs": "isz:l", "q": search_query}
headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; \
IEMobile/7.0; LG; GW910)'}
html = ''
try:
html = requests.get('https://www.google.com/search', headers=headers, params=params, timeout=5).text
except Exception as exc:
log_exception(__name__, exc)
soup = BeautifulSoup.BeautifulSoup(html)
results = []
for div in soup.findAll('div'):
if div.get("id") == "images":
for a_link in div.findAll("a"):
page = a_link.get("href")
try:
img = page.split("imgurl=")[-1]
img = img.split("&imgrefurl=")[0]
results.append(img)
except Exception:
pass
return results
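# A hedged, standalone illustration of the href splitting used in get_data above: the image
# url sits between 'imgurl=' and '&imgrefurl=' in Google's redirect link. The href below is a
# made-up example of that shape.
def _demo_extract_imgurl():
    page = '/imgres?imgurl=http://example.com/pic.jpg&imgrefurl=http://example.com/page.html&h=1080'
    img = page.split("imgurl=")[-1]
    img = img.split("&imgrefurl=")[0]
    print(img)  # -> http://example.com/pic.jpg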
def get_top250_db(self):
'''
get the top250 listing for both movies and tvshows as dict with imdbid as key
uses 7 day cache to prevent overloading the server
'''
results = {}
for listing in [("top", "chttp_tt_"), ("toptv", "chttvtp_tt_")]:
html = requests.get(
"http://www.imdb.com/chart/%s" %
listing[0], headers={
'User-agent': 'Mozilla/5.0'}, timeout=20)
soup = BeautifulSoup.BeautifulSoup(html.text)
for table in soup.findAll('table'):
if table.get("class") == "chart full-width":
for td_def in table.findAll('td'):
if td_def.get("class") == "titleColumn":
a_link = td_def.find("a")
if a_link:
url = a_link["href"]
imdb_id = url.split("/")[2]
imdb_rank = url.split(listing[1])[1]
results[imdb_id] = try_parse_int(imdb_rank)
self.write_kodidb(results)
return results
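# A standalone, hedged sketch of the url parsing inside get_top250_db: the imdb id is pulled
# from the href path and the chart rank follows the listing's 'chttp_tt_' prefix. The href
# below only mimics the shape of an IMDb chart link.
def _demo_parse_chart_href():
    url = '/title/tt0111161/?pf_rd_m=A1B2C3&ref_=chttp_tt_1'
    imdb_id = url.split("/")[2]
    imdb_rank = url.split("chttp_tt_")[1]
    print(imdb_id, imdb_rank)  # -> ('tt0111161', '1') under Python 2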
def purgeAttributes(self, mime, _old):
html = mime.html()
soup = BeautifulSoup(html)
newMime = QMimeData()
for tag in soup.recursiveChildGenerator():
# remove attributes in the list
index = -1
try:
for key, value in tag.attrs:
index += 1
if key != 'style':
continue
new = value.split(';')
new = ';'.join([s for s in new
if s.split(':')[0].strip() not in REMOVE_ATTRIBUTES])
tag.attrs[index] = (u'style', new)
except AttributeError:
# 'NavigableString' object has no attribute 'attrs'
pass
# assign the modified html to new Mime
newMime.setHtml(str(soup).decode('utf8'))
# default _processHtml method
return _old(self, newMime)
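# A self-contained, hedged illustration of the style-filtering step in purgeAttributes: every
# 'name: value' declaration whose name appears in REMOVE_ATTRIBUTES is dropped and the rest
# are re-joined. The attribute list and style string here are assumptions for demonstration.
def _demo_purge_style():
    REMOVE_ATTRIBUTES = ['font-family', 'font-size']
    value = 'font-family: Arial; color: red; font-size: 12px'
    new = value.split(';')
    new = ';'.join([s for s in new
                    if s.split(':')[0].strip() not in REMOVE_ATTRIBUTES])
    print(new)  # -> ' color: red'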