def download_songs(url):
    time.sleep(random.random() * 0.5)
    try:
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        # Get the artist name
        artist_name = soup.findAll('h1')[0].get_text()[:-7].lower().replace(' ', '_')
        # Store all songs for a given artist
        with open('artist_data/' + artist_name + '.txt', 'wb') as w:
            for song in soup.findAll('a', {'target': '_blank'}):
                if 'lyrics/' in song['href']:
                    song_url = song['href'][1:].strip()
                    w.write(song_url + '\n')
    except urllib2.HTTPError:
        print '404 not found'
def packages(self):
    """
    Parse XML file to locate packages.
    """
    xml = requests.get(self._xml_url).content
    soup = BeautifulSoup(xml, "html.parser")
    nltk_packages, third_party = defaultdict(dict), defaultdict(dict)
    for pack in soup.find_all('package'):
        package_attributes = pack.attrs
        name = package_attributes['id']
        # Keeps track of nltk_data packages vs third_party packages.
        if package_attributes['url'].startswith(self._nltk_data_url):
            nltk_packages[name] = package_attributes
        else:
            third_party[name] = package_attributes
    return nltk_packages, third_party
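# A minimal, self-contained sketch of the index parsing done in packages() above.
# The XML structure and URLs here are illustrative assumptions, not the real
# nltk_data index; the method itself fetches self._xml_url instead.
from collections import defaultdict
from bs4 import BeautifulSoup

sample_index = """
<packages>
  <package id="punkt" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip"/>
  <package id="some_model" url="https://example.org/third_party/some_model.zip"/>
</packages>
"""
nltk_data_url = "https://raw.githubusercontent.com/nltk/nltk_data/"
nltk_packages, third_party = defaultdict(dict), defaultdict(dict)
for pack in BeautifulSoup(sample_index, "html.parser").find_all("package"):
    name = pack.attrs["id"]
    # Split packages by whether their URL points into nltk_data or elsewhere.
    if pack.attrs["url"].startswith(nltk_data_url):
        nltk_packages[name] = pack.attrs
    else:
        third_party[name] = pack.attrs
print(sorted(nltk_packages), sorted(third_party))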
async def _online_tibia(self):
    """Get total players playing"""
    url = "http://www.tibia.com/community/?subtopic=worlds"
    try:
        async with aiohttp.get(url) as response:
            soup = BeautifulSoup(await response.text(), "html.parser")
            div1 = soup.find('div', attrs={'id': 'RightArtwork'})
            div2 = div1.find('div', attrs={'id': 'PlayersOnline'})
            test = div2.get_text()
            test1 = test.replace("Players Online", "")
            new = "Players currently playing Tibia: " + test1
            # div2 = div1.find('div', attrs={'class': 'Border_2'})
            # div3 = div2.find('div', attrs={'class': 'Border_3'})
            # table = div3.find_all('table', attrs={'class': 'Table1'})
            # tr = table.find_all('tr')
            # tbody = div4.find('div', attrs={'class': 'CaptionInnerContainer'})
            await self.bot.say(str(new))
    except:
        await self.bot.say("Could not retrieve data. The web server may be offline.")
async def _server_tibia(self, servername):
    """Get Server Info"""
    servername = servername.title()
    url = "https://secure.tibia.com/community/?subtopic=worlds&world=" + str(servername)
    try:
        async with aiohttp.get(url) as response:
            soup = BeautifulSoup(await response.text(), "html5lib")
            b = soup.find_all("table", attrs={'class': 'Table1'})
            new = []
            rows = b[1].tbody.div.find_all('td')
            for row in rows:
                new.append(row.get_text())
            # The cells alternate between a category label and its value.
            k = new[::2]
            l = new[1::2]
            zipped = list(zip(k, l))
            t = tabulate(zipped, headers=["Category", "Info"])
            await self.bot.say("```Python" + "\n" + str(t) + "```")
    except:
        await self.bot.say("Unable to retrieve server data. The web server may be offline.")
def get_course_status(course_num):
    client = Client()
    subject = client.get_course_subject(course_num)
    if subject is None:
        return None
    semester = get_semester()
    subject_url = "http://classes.cornell.edu/browse/roster/" + semester + "/subject/" + subject
    subject_page = requests.get(subject_url)
    subject_page.raise_for_status()
    subject_bs4 = bs4.BeautifulSoup(subject_page.text, "html.parser")
    course_code_tags = subject_bs4.find_all("strong", class_="tooltip-iws")
    for tag in course_code_tags:
        course_code = int(tag.getText().strip())
        if course_num == course_code:
            section = tag.parent.parent.parent
            status = section.find_all('li', class_="open-status")[0].i["class"][-1]
            if "open-status-open" in status:
                return "open"
            if "open-status-closed" in status:
                return "closed"
            if "open-status-warning" in status:
                return "waitlist"
            if "open-status-archive" in status:
                return "archive"
def get_soup_from_url(url, params=None):
    '''
    Send a GET request to the given URL with the given parameters and return
    the response body (HTML text) wrapped in a BeautifulSoup object.
    :param url: URL string for the GET request
    :param params: dict of GET query parameters
    :return: BeautifulSoup object
    '''
    # Send the request with requests.get and keep the response object in r.
    r = requests.get(url, params=params)
    # Take the text attribute of the response (the HTML document).
    html_doc = r.text
    # Build a BeautifulSoup object from the HTML text.
    soup = BeautifulSoup(html_doc, 'lxml')
    return soup
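# A short usage sketch for get_soup_from_url; the URL and query parameters
# below are placeholders, not values taken from the original project.
soup = get_soup_from_url("https://example.com/search", params={"q": "beautifulsoup"})
print(soup.title.get_text() if soup.title else "no <title> found")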
def test_tag_inherits_self_closing_rules_from_builder(self):
    if XML_BUILDER_PRESENT:
        xml_soup = BeautifulSoup("", "xml")
        xml_br = xml_soup.new_tag("br")
        xml_p = xml_soup.new_tag("p")
        # Both the <br> and <p> tag are empty-element, just because
        # they have no contents.
        self.assertEqual(b"<br/>", xml_br.encode())
        self.assertEqual(b"<p/>", xml_p.encode())
    html_soup = BeautifulSoup("", "html")
    html_br = html_soup.new_tag("br")
    html_p = html_soup.new_tag("p")
    # The HTML builder uses HTML's rules about which tags are
    # empty-element tags, and the new tags reflect these rules.
    self.assertEqual(b"<br/>", html_br.encode())
    self.assertEqual(b"<p></p>", html_p.encode())
def Scrape(url):
    timeout = 10
    socket.setdefaulttimeout(timeout)
    # Collecting html content.
    headers = {'User-Agent': 'TorScrapper - Onion scrapper | github.com/ConanKapoor/TorScrapper.git'}
    req = urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(req)
    # Using BeautifulSoup to parse html object response.
    page = BeautifulSoup(response.read(), 'html.parser')
    # Saving output
    token = re.sub(r'[^\w]', '', url)
    name = os.path.abspath("") + '/Output/Scraped-' + token + '.html'
    file = open(name, 'w')
    file.write(str(page))
    file.close()
# Taking input.
# From sciencedirect_collect.py (project: scientific-paper-summarisation, author: EdCo95)
def getJournalURL(jname):
    # get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    # print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
def get_url(self, query):
    site1 = urllib.urlopen('http://www.youtube.com/results?search_query=%s' % query)
    html = site1.read()
    soup = BS(html)
    links = soup.findAll('a')
    vidlinks = [link.get('href') for link in links if link.get('href') is not None]
    vlink = [i for i in vidlinks if '/watch?v=' in i][0]
    img_link = soup.findAll('img', {'alt': 'Thumbnail', 'width': '185'})[0].get('src')
    img_url = 'http:%s' % img_link
    imagethread = threading.Thread(target=lambda: urllib.urlretrieve(img_url, 'Files\image.jpg'))
    imagethread.start()
    return vlink
def run(self):
    ind = self.qu.get()
    url = self.url + str(ind)
    soup = bs.BeautifulSoup(''.join(ul.urlopen(url).readlines()))
    bu = up.urlsplit(self.url)
    print 'started with the ', str(url).split('/')[-1],
    for i in soup.find_all(attrs={"class": "recipe-title"}):
        sp = up.urlsplit(i.a.get('href'))
        path = sp.path
        print path
        if re.search(pat, path):
            path = bu.scheme + '://' + bu.netloc + path
            filename = str(path).split('/')[-2]
            filename = op.join(op.abspath(op.curdir), filename + '.py')  # recipe will be stored in given location
            # filename = op.join(op.abspath(op.curdir), filename + '.html')
            # Uncomment the line above if downloading the web page for the recipe.
            print path
            self.q.put((path, filename))
            self.fetch_data()
            time.sleep(1)
    self.qu.task_done()
    self.q.join()
    print 'done with the ', str(url).split('/')[-1],
def get_all_key_signatures(cfg, keyid):
    """
    Get all signatures for a specific key. Self-signed signatures are excluded
    because they are not helpful for us.
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup", {"op": "vindex", "search": "0x{}".format(keyid)}, None
    )
    if status_code != 200:
        return status_code, content
    elem = BeautifulSoup(content, HTML_PARSER).span
    ids = []
    # Walk forward through the document until the next <strong> tag, collecting
    # key IDs from "op=get" links that do not refer to the key itself.
    while elem.findNext() and elem.findNext().name != "strong":
        elem = elem.findNext()
        if "op=get" in elem["href"] and elem.text != keyid:
            ids.append(elem.text)
    return ids
def search_key(cfg, search_str):
    """
    Search for a key by a given string.
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup", {"op": "index", "search": search_str}, None
    )
    if status_code != 200:
        return content, status_code
    bs = BeautifulSoup(content, HTML_PARSER)
    regex = re.compile(r"^pub *\d{3,4}\w\/([\w\d]{8})")
    ids = []
    for pre in bs.findAll("pre"):
        match = regex.search(pre.text.strip("\r\n"))
        if match and "KEY REVOKED" not in pre.text:
            ids.append(match.groups()[0])
    return {"ids": ids}, status_code
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode('gbk'))
    pic = soup.find('p', class_='f_center').find('img').get('src') if soup.find('p', class_='f_center') and soup.find('p', class_='f_center').find('img') else None
    referer_web = soup.find('a', id='ne_article_source').text if soup.find('a', id='ne_article_source') else None
    referer_url = soup.find('a', id='ne_article_source').get('href') if soup.find('a', id='ne_article_source') else None
    author = soup.find('span', class_='ep-editor').text if soup.find('span', class_='ep-editor') else None
    if author and u"?" in author:
        author = author.split(u"?")[-1]
    crawl_date = NOW
    read_num = soup.find('div', class_='post_comment_joincount').find('a').text if soup.find('div', class_='post_comment_joincount') else 0
    comment_num = soup.find('div', class_='post_comment_tiecount').find('a').text if soup.find('div', class_='post_comment_tiecount') else 0
    content = soup.find('div', class_='post_text').get_text(strip=True) if soup.find('div', class_='post_text') else None
    item['referer_web'] = referer_web
    item['content'] = content
    item['referer_url'] = referer_url
    item['author'] = author
    item['crawl_date'] = crawl_date
    item['pic'] = pic
    item['comment_num'] = int(comment_num)
    item['read_num'] = int(read_num)
    yield item
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode("utf-8").encode("utf-8"), "lxml")
    pic = soup.find("p", class_="detailPic").find("img").get("src") if soup.find("p", class_="detailPic") else None
    referer_web = soup.find("span", class_="ss03").text if soup.find("span", class_="ss03") else None
    author = soup.find("span", itemprop="author").find("span").text if soup.find("span", itemprop="author") else None
    temp = soup.find("div", id="main_content")
    if temp:
        ps = temp.find_all("p")
        content = "\n\n".join(p.text.strip() for p in ps) if ps else None
    else:
        content = None
    item['pic'] = pic
    item['referer_web'] = referer_web
    item['author'] = author
    item['content'] = content
    item['crawl_date'] = NOW
    yield item
def parse_news(self, response):
    item = response.meta.get("item", None)
    # # Stop the spider once the news date falls outside the configured crawl window.
    # news_date = item.get("news_date", None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
    #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    #
    #     delta = self.end_now - struct_date
    #     if delta.days == self.end_day:
    #         # pass
    #         raise CloseSpider('today scrapy end')
    soup = BeautifulSoup(response.body)
    news_content_group = soup.find("div", class_="entry-content group")
    # Strip the related-posts block from the article body.
    news_content_group.find("div", class_="related_posts").replace_with("")
    content = news_content_group.text.strip()
    item["content"] = content
    item["catalogue"] = u"????"
    yield item
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body, 'lxml')
    origin_date = soup.find("td", class_="time").text.strip()
    struct_date = datetime.datetime.strptime(origin_date, "%Y-%m-%d %H:%M")
    news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    content = soup.find("div", class_="lph-article-comView").text.strip() if soup.find("div", class_="lph-article-comView") else None
    item["news_date"] = news_date
    item["crawl_date"] = NOW
    item["content"] = content
    item["catalogue"] = u"????"
    item = judge_news_crawl(item)
    if item:
        yield item
    else:
        self.flag = int(pageindex)
def parse(self, response):
    origin_url = response.url
    if "index" not in origin_url:
        soup = BeautifulSoup(response.body, "lxml")
        catalogue = soup.find("a", class_="blue CurrChnlCls").get("title").strip()
        news_list = soup.find("div", class_="lie_main_m").find_all("li")
        for news in news_list:
            title = news.find("a").text.strip()
            news_url = "http://www.cnta.gov.cn/xxfb" + news.find("a").get("href")[2:]
            news_no = news_url.rsplit("/", 1)[-1].split(".")[0]
            item = NewsItem(
                news_url=news_url,
                title=title,
                news_no=news_no,
                catalogue=catalogue,
            )
            yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
    else:
        topic_url = origin_url.rsplit(".", 1)[0]
        self.flag.setdefault(topic_url, 0)
        yield scrapy.Request(origin_url, callback=self.parse_topic)
def parse(self, response):
    origin_url = response.url
    # e.g. http://money.163.com/special/002526O5/transport_02.html
    search_result = re.search(r"_(\d+)\.", origin_url)
    # Page index parsed from the URL, defaulting to the first page.
    pageindex = search_result.group(1) if search_result else 1
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup("div", class_="list_item clearfix")
    for news in news_list:
        news_date = news.find("span", class_="time").text if news.find("span", class_="time") else None
        title = news.find("h2").text if news.find("h2") else None
        news_url = news.find("h2").a.get("href", None) if news.find("h2") else None
        abstract = news.find("p").contents[0] if news.find("p") else None
        item = NewsItem(title=title, news_url=news_url, abstract=abstract, news_date=news_date)
        item = judge_news_crawl(item)  # drop items that fall outside the crawl window
        if item:
            request = scrapy.Request(news_url, callback=self.parse_news, meta={"item": item})
            yield request
        else:
            self.flag = int(pageindex)
    if not self.flag:
        next_url = self.next_url % (int(pageindex) + 1)
        yield scrapy.Request(next_url)
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body)
    referer_web = soup.find("a", id="ne_article_source").text if soup.find("a", id="ne_article_source") else None
    referer_url = soup.find("a", id="ne_article_source").get("href", None) if soup.find("a", id="ne_article_source") else None
    comment_num = soup.find("a", class_="post_cnum_tie").text if soup.find("a", class_="post_cnum_tie") else None
    content = soup.find("div", class_="post_text").text.strip() if soup.find("div", class_="post_text") else None
    # The "left" span holds the combined author/source line from the page footer.
    author_source = soup.find("span", class_="left").text if soup.find("span", class_="left") else None
    # TODO: parse the author name out of author_source.
    # import pdb;pdb.set_trace()
    # author = re.search(u"??(.*)", author_source).group(1)[1:] if author_source else None
    # item["author"] = author
    item["referer_web"] = referer_web
    item["referer_url"] = referer_url
    item["comment_num"] = comment_num
    item["content"] = content
    item["crawl_date"] = NOW
    yield item