def getJournalURL(jname):
    # get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    #print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
Example source code for Python's SoupStrainer() class
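All of the examples collected below follow the same basic pattern: build a SoupStrainer (from a tag name, a set of attributes, or a filter function) and pass it to BeautifulSoup through the parse_only argument, so that only the matching parts of the document are parsed into the tree. The following minimal sketch illustrates that pattern on its own; the sample HTML and variable names are illustrative and do not come from any of the projects below.

from bs4 import BeautifulSoup, SoupStrainer

SAMPLE_HTML = """
<html><head><title>Demo</title></head>
<body><a href="https://example.com">example</a><p>ignored</p></body></html>
"""

# Strain by tag name: only <a> tags are parsed into the tree.
only_links = SoupStrainer("a")
soup = BeautifulSoup(SAMPLE_HTML, "html.parser", parse_only=only_links)
print([a["href"] for a in soup.find_all("a", href=True)])  # ['https://example.com']

# Strain with a filter function: during parsing BeautifulSoup calls it with each
# tag's name and attributes and keeps the tag only when it returns True.
def keep_title(name, attrs):
    return name == "title"

soup = BeautifulSoup(SAMPLE_HTML, "html.parser", parse_only=SoupStrainer(keep_title))
print(soup.title.string)  # Demo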
Source file: sciencedirect_collect.py (project: scientific-paper-summarisation, author: EdCo95)
def __init__(self, data, encoding=None):
    """
    Initialize serializer class
    :param data: original data
    :param encoding: encoding type of the original data
    """
    self.data = data
    if not self.data:
        raise ValueError("You must input origin data to this class")
    # if no encoding is supplied, use UnicodeDammit (which relies on chardet) to detect it
    self.encoding = encoding if encoding else UnicodeDammit(self.data).original_encoding
    self.encoding = None if self.encoding == "utf-8" else self.encoding
    # initialize beautiful soup
    # only_content_div = SoupStrainer("body")
    self.obj = BeautifulSoup(data, features="lxml", from_encoding=self.encoding)
def get_title(html):
    """
    Get the title element from an HTML document
    :param str html: The html to parse
    :Example:
    >>> Link.get_title("xxxx<title>Title</title>xxxx")
    'Title'
    >>> print(Link.get_title("xxxx<>Title</title>xxxx"))
    None
    """
    bs = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('title'))
    title = bs.find("title")
    if not title:
        return None
    if not title.string:
        return None
    return title.string.strip().replace('\n', ' ')
def get_child_urls(main_page, max_child=20):
    """Retrieve urls from a given html page.
    args:
        main_page(str): html file.
        max_child(int): max number of returned urls.
    return:
        list of url strings.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page,
                              "html.parser",
                              parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
def __get_menu_items(self, url, soupstrainer_parser_selector, routing_action, video_dictionary_action=None):
    response = requests.get(url)
    tiles = SoupStrainer('a', soupstrainer_parser_selector)
    soup = BeautifulSoup(response.content, "html.parser", parse_only=tiles)
    listing = []
    for tile in soup.find_all(class_="tile"):
        link_to_video = tile["href"]
        thumbnail, title = self.__get_thumbnail_and_title(tile)
        video_dictionary = None
        if video_dictionary_action is not None:
            video_dictionary = video_dictionary_action(tile)
        item = helperobjects.TitleItem(title, {'action': routing_action, 'video': link_to_video},
                                       False, thumbnail, video_dictionary)
        listing.append(item)
    return listing
def read(self):
    with io.open(self.filename, 'rb') as dhtml_file:
        def strain(name, attrs):
            if name == 'title':
                return True
            if name == 'div' and dict(attrs).get('id', None) in self.ids:
                return True
            return False
        soup = BeautifulSoup(dhtml_file, "lxml", parse_only=SoupStrainer(strain))
    parser = html_parser.HTMLParser()
    self.title = parser.unescape(soup.title.decode_contents()) if soup.title else _('Untitled')
    for an_id in self.ids:
        found_elements = soup.find_all(id=an_id)
        if found_elements:
            [element] = found_elements
            self.elements[an_id] = element.decode_contents()
        else:
            self.elements[an_id] = ''
    self.original_encoding = soup.original_encoding
def get_lyrics_with_urls(urls):
    # TODO
    ret = []
    for url in urls:
        time.sleep(3)
        print(url)
        response = urlopen(url, timeout=5)
        content = response.read()
        for lyrics in bs(content, "html.parser", parse_only=SoupStrainer('p')):
            if lyrics.has_attr('style'):
                lyrics = re.sub('</?br/?>', '\n', str(lyrics))
                lyrics = re.sub('<.*?>', '', str(lyrics))
                lyrics = re.sub('\n', ' \n', str(lyrics))
                ret.append(lyrics)
                print(lyrics)
                print(str(get_sentiment(lyrics)))
    return ret
def get_lyrics(artist, song):
    artist = format_artist(artist)
    song = format_song(song)
    time.sleep(1)
    url = LYRICS_URL.format(artist, song)
    content = None
    try:
        response = urlopen(url)
        content = response.read()
    except Exception as e:
        print(url)
        print(e)
        print("failed\n")
        return None
    soup = bs(content, "html.parser", parse_only=SoupStrainer('div'))
    for l in soup:
        for lyrics in soup.find_all(string=lambda t: isinstance(t, Comment)):
            if "start of lyrics" in lyrics or "Usage" in lyrics:
                lyrics = re.sub('</?br/?>', '', str(lyrics.parent))
                lyrics = re.sub('<.*?>', '', str(lyrics))
                return str(lyrics)
def scrape_category_page(url):
    global ALL_TEXT, non_bmp_map, threads, count
    soup = BeautifulSoup(urllib.request.urlopen(url), 'lxml', parse_only=SoupStrainer('div'))
    ### accounts for categories with over 200 pages
    link = soup.find('a', href=True, text='next page')
    if link is not None:
        try:
            t = catThread('https://en.wikipedia.org' + link['href'])
            t.daemon = True
            t.start()
            threads.append(t)
        except:
            print("Error: Unable to thread.")
    ### sends links of wikipedia articles in the category to be scraped
    pages_in_category = soup.find('div', {'id': 'mw-pages'}).find('div', {'class': 'mw-category'})
    for obj in pages_in_category.findAll('a'):
        tempbun = scrape(Bundle('https://en.wikipedia.org' + obj['href'], False))
        with lock:
            ALL_TEXT += tempbun.text.translate(non_bmp_map)
            print(count)
            count += 1
def get_soup(game_html):
    """
    Uses Beautiful Soup to parse the html document.
    Some parsers work for some pages but not for others, so several are tried here in order.
    :param game_html: html doc
    :return: "soupified" html and player_shifts portion of html (it's a bunch of td tags)
    """
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})
    soup = BeautifulSoup(game_html.text, "lxml", parse_only=strainer)
    soup = soup.select('td.+.bborder')
    if len(soup) == 0:
        soup = BeautifulSoup(game_html.text, "html.parser", parse_only=strainer)
        soup = soup.select('td.+.bborder')
        if len(soup) == 0:
            soup = BeautifulSoup(game_html.text, "html5lib")
            soup = soup.select('td.+.bborder')
    return soup
Source file: sciencedirect_collect.py (project: scientific-paper-summarisation, author: EdCo95)
def collectArticles(urlstr):
    # get article PIIs
    retl = []
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                #print(link["href"])
                retl.append(link["href"])
            linkcnt += 1
    return retl
def get_links(url):
    '''
    Get all the links off of the page:
        gd2.mlb.com/components/game/mlb/year/month/day/
    and find the links for the games that have the following format:
        gid_year_mm_dd_team1mlb_team2mlb
    '''
    f = get_page(url)
    if f == False:
        return False
    # Compile the regex to match links outside of the loop for performance
    links = []
    regex = re.compile("\"gid_(.*?)\"", re.IGNORECASE)
    # Find all links on the page and, if they are links to games, add them to the list
    for link in BeautifulSoup(f, "lxml", parse_only=SoupStrainer('a', href=True)):
        match = regex.findall(str(link))
        if match:
            links.extend(match)
    return links
def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
    """
    Return the latest version of a package inside a given directory path
    If error or no version, return ""
    """
    valid = 0
    version = ['', '', '']
    bb.debug(3, "VersionURL: %s" % (url))
    soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
    if not soup:
        bb.debug(3, "*** %s NO SOUP" % (url))
        return ""
    for line in soup.find_all('a', href=True):
        bb.debug(3, "line['href'] = '%s'" % (line['href']))
        bb.debug(3, "line = '%s'" % (str(line)))
        newver = self._parse_path(package_regex, line['href'])
        if not newver:
            newver = self._parse_path(package_regex, str(line))
        if newver:
            bb.debug(3, "Upstream version found: %s" % newver[1])
            if valid == 0:
                version = newver
                valid = 1
            elif self._vercmp(version, newver) < 0:
                version = newver
    pupver = re.sub('_', '.', version[1])
    bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
             (package, pupver or "N/A", current_version[1]))
    if valid:
        return pupver
    return ""
def _parse_multiple_apps(self, list_response):
    """Extracts app ids from a list's Response object, sends GET requests to
    each app, parses detailed info and returns all apps in a list.
    :param list_response: the Response object from a list request
    :return: a list of app dictionaries
    """
    list_strainer = SoupStrainer('span', {'class': 'preview-overlay-container'})
    soup = BeautifulSoup(list_response.content, 'lxml', parse_only=list_strainer)
    app_ids = [x.attrs['data-docid'] for x in soup.select('span.preview-overlay-container')]
    responses = multi_app_request(app_ids)
    app_strainer = SoupStrainer('div', {'class': 'main-content'})
    apps = []
    errors = []
    for i, r in enumerate(responses):
        if r is not None and r.status_code == requests.codes.ok:
            soup = BeautifulSoup(r.content, 'lxml', parse_only=app_strainer)
            apps.append(self._parse_app_details(soup))
        else:
            errors.append(app_ids[i])
    if errors:
        self._log.error("There was an error parsing the following apps: {errors}.".format(
            errors=", ".join(errors)))
    return apps
def get_categories():
    """
    Sends a GET request to the front page (base url of the app store),
    parses and returns a list of all available categories.
    Note: May contain some promotions, e.g. "Popular Characters"
    """
    categories = {}
    strainer = SoupStrainer('a', {'class': 'child-submenu-link'})
    response = send_request('GET', s.BASE_URL)
    soup = BeautifulSoup(response.content, 'lxml', parse_only=strainer)
    category_links = soup.select('a.child-submenu-link')
    age = '?age='
    for cat in category_links:
        url = urljoin(s.BASE_URL, cat.attrs['href'])
        category_id = url.split('/')[-1]
        name = cat.string
        if age in category_id:
            category_id = 'FAMILY'
            url = url.split('?')[0]
            name = 'Family'
        if category_id not in categories:
            categories[category_id] = {
                'name': name,
                'url': url,
                'category_id': category_id}
    return categories
def resolve_title(url):
    # grab the first title if there's more than one
    try:
        pnk_log(mod, "Requesting %s" % url)
        r = pnk_request(url)
        response_text = r.text
        for title in BeautifulSoup(response_text, 'html.parser', parse_only=SoupStrainer('title')):
            return title.text.strip()
    except:
        return None
def getSingle(s):
    # load in your friends dictionary
    structDir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'Structs'))
    with open(structDir + '/friendsDict.pkl', 'rb') as input:
        friendsDict = pickle.load(input)
    # -------------- Now, let's compile a list of friends who are single ------------
    Single = []
    iteration = 1
    relatStrainer = SoupStrainer(text=re.compile("Single</div>"))
    relatExt = "/about?section=relationship&pnref=about"
    relatExtBeta = "&sk=about&section=relationship"
    fbook = "https://facebook.com"
    for friend in friendsDict:
        if (friendsDict[friend].find("php") != -1):
            relatURL = fbook + friendsDict[friend] + relatExtBeta
        else:
            relatURL = fbook + friendsDict[friend] + relatExt
        relatInfo = s.get(relatURL)
        soup = BeautifulSoup(relatInfo.text, "lxml", parse_only=relatStrainer)
        comment = soup.find(text=re.compile("Single</div>"))
        if (comment != None):
            # since some names have special characters, we need to strip these
            temp = friend.encode('utf-8').strip()
            Single.append(temp + "\n")
        print friend + " is single = " + str(comment != None)
        # print iteration
        iteration += 1
    # print Single
    singleStr = ''.join(Single)
    with open(structDir + "/single.txt", "wb") as f:
        f.write(singleStr)
def getFriendsList(friends, part, s):
    ID = vanity
    if (part == 1):
        index = 0
    elif (part == 2):
        index = 24
    elif (part == 3):
        index = 24 + 36
    else:
        index = 24 + 36 + 36
    # find and scrape their total number of friends
    temp = s.get('https://www.facebook.com/' + ID + '/friends')
    soup = BeautifulSoup(temp.text, "lxml")
    strainer = SoupStrainer('a', href=re.compile("fref=fr_tab"))
    # iterate over the entire friends list and pull out the relevant information from
    # the html docs that display 24 or 36 friends each
    while (index < (numFriends)):
        if index == 0:
            temp = s.get('https://m.facebook.com/' + ID + '/friends')
            soup = BeautifulSoup(temp.text, "lxml", parse_only=strainer)
            tempLst = soup.findAll('a')
            for item in tempLst:
                friends.append(item)
            index = 24 + 36 * 3
        else:
            temp = (s.get('https://m.facebook.com/' + ID + '/friends?startindex='
                          + str(index)))
            soup = BeautifulSoup(temp.text, "lxml", parse_only=strainer)
            tempLst = soup.findAll('a')
            for item in tempLst:
                friends.append(item)
            index = index + 36 * 4
    return
Source file: html.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
                                                            **kwargs)
    from bs4 import SoupStrainer
    self._strainer = SoupStrainer('table')
def scrape(webpage, extension=".mid"):
    # Get all the files of a given extension from a webpage
    http = httplib2.Http()
    status, response = http.request(webpage)
    files = []
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        if link.has_attr('href'):
            linkname = link['href']
            if linkname[-len(extension):] == extension:
                files += [linkname]
    return files
def get_film_info_subhd():
    items = []
    target_url = 'http://subhd.com'
    content = urllib2.urlopen(target_url).read().decode('utf-8')
    only_hotl_tags = SoupStrainer(class_='hotl')
    soup = BeautifulSoup(content, "html.parser", parse_only=only_hotl_tags)
    i = 0
    for link in soup.find_all('a', limit=7):
        link_url = target_url + link.get('href')
        link_img = target_url + link.findChildren('img')[0].get('src')
        cover_img = 'http://img3.doubanio.com/view/movie_poster_cover/spst/public/' + link_img.split('/sub/poster/l/')[1]
        link_title = link.findChildren('img')[0].get('title')
        save_path = os.path.abspath("./icons/icon-s")
        imgData = urllib2.urlopen(cover_img).read()
        fileName = save_path + str(i) + '.jpg'
        output = open(fileName, 'wb+')
        output.write(imgData)
        output.close()
        json_item = dict(title=link_title, subtitle='', arg=link_url, icon='icons/icon-s' + str(i) + '.jpg')
        items.append(json_item)
        i = i + 1
    return generate_xml(items)
def __get_version(self):
    '''
    get jenkins version
    :return:
    '''
    try:
        html = urllib2.urlopen(self.url + '/login?from=%2F').read()
        links = SoupStrainer('a', href=re.compile(VERSION_TAG))
        version_text = BeautifulSoup(html, "html.parser", parse_only=links)
        if version_text.text != "":
            color_output("[+]....jenkins version is %s" % version_text.text)
            version_re = re.findall(u"ver.\s(.*)", version_text.text)
            if len(version_re) != 0:
                if version_re[0][0:4] >= self.check_version:
                    self.user_link = ASYNCH_PEOPEL_PERFIX
                else:
                    self.user_link = PEOPLE_PERFIX
        else:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
    except urllib2.URLError, e:
        color_output("[-]....can't get jenkins version!")
        sys.exit()
    except Exception, e:
        color_output("[-]....get version error:%s" % str(e))
        sys.exit()
def scrape(url):
    ### opens url so it's like a file
    try:
        link = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        return ''
    soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=SoupStrainer('p'))
    alltxt = ''
    ### iterate thru the <p> tags
    for para in soup.find_all('p'):
        alltxt = alltxt + para.get_text() + ' '
    return alltxt
def scrape(bun):
    ### opens url so it's like a file
    link = urllib.request.urlopen(bun.URL)
    soup = None
    ### flag for retrieving categories (or not)
    if bun.categories:
        soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml')
    else:
        p_tags = SoupStrainer('p')
        soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=p_tags)
    ### dictionary of paragraphs
    doc = {}
    ### add token and count to replace paragraphs in HTML
    token = 'Waka'
    count = 0
    ### all the paragraph texts in one string
    alltxt = ''
    ### iterate thru the <p> tags
    for para in soup.find_all('p'):
        ### put raw text in dictionary
        doc[token + str(count)] = para.get_text()
        alltxt = alltxt + para.get_text() + ' '
        ### replace <p> contents with a token
        para.string = token + str(count)
        count += 1
    ### get the list of categories
    cats = []
    if bun.categories:
        for cat in soup.find('div', {'id': 'catlinks'}).find('ul').findAll('li'):
            cats.append('https://en.wikipedia.org' + cat.find('a')['href'])
        for css in soup.find_all('link', rel='stylesheet'):
            css['href'] = '//en.wikipedia.org/' + css['href']
        for js in soup.find_all('script', src=re.compile('.*')):
            js['src'] = '//en.wikipedia.org/' + js['src']
    ### update stuff in Bundle
    bun.paragraphs = doc
    bun.text = alltxt
    bun.html = str(soup.encode('ascii', 'xmlcharrefreplace').decode('utf-8'))
    bun.categories = cats
    return bun
def __init__(self, text_blob, *args, **kwargs):
    TextParser.text_strainer = SoupStrainer(TextParser.strain_through)
    self.soup = BeautifulSoup(text_blob, 'html.parser', parse_only=TextParser.text_strainer)
    self.text = self._extract_text()
def get_tuko():
    tuko_url = 'https://www.tuko.co.ke'
    if check_connection(tuko_url):
        tuko = requests.get(tuko_url)
        soup = BeautifulSoup(tuko.text, 'lxml', parse_only=SoupStrainer('a'))
        tuko = []
        for link in soup.select('a.news__link', limit=6):
            news_title = '{}({})'.format(link.get_text(), link.get('href'))
            tuko_link = requests.get(link.get('href'))
            soup_link = BeautifulSoup(tuko_link.text, 'lxml', parse_only=SoupStrainer(['p', 'meta', 'img']))
            try:
                article_date = soup_link.find("meta", itemprop="datePublished")['content']
            except (TypeError, ValueError):
                print('Tuko: No article date meta')
                continue
            image = ''
            try:
                image = soup_link.find("meta", property="og:image")['content']
            except (TypeError, ValueError):
                try:
                    image = soup_link.find('img', class_='article-image__picture')['src']
                except (TypeError, ValueError):
                    print('Tuko: No image found')
            news_dict = {
                'category': 'news',
                'source': 'tuko',
                'title': link.get_text(),
                'link': link.get('href'),
                'image': image,
                'content': [link_inner.get_text().strip(' ,.-') for link_inner in
                            soup_link.select('p.align-left > strong', limit=3) if not
                            link_inner.get_text().startswith('READ ALSO')],
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            collection.update({'link': link.get('href')}, news_dict, upsert=True)
            tuko.append(news_dict)
        return tuko
def get_capital():
    capital_url = 'http://www.capitalfm.co.ke/news/{}/{:02}'.format(today.year, today.month)
    if check_connection(capital_url):
        capital = requests.get(capital_url)
        soup = BeautifulSoup(capital.text, 'lxml', parse_only=SoupStrainer('div'))
        capital = []
        for article in soup.select('div.entry-information'):
            article_link = article.a
            link = article_link['href']
            title = article_link.get_text()
            capital_link = requests.get(link)
            soup_link = BeautifulSoup(capital_link.text, 'lxml', parse_only=SoupStrainer(['meta', 'img', 'div']))
            article_date = soup_link.find("meta", property="article:published_time")['content']
            image = ''
            try:
                image = soup_link.find("meta", property="og:image")['content']
            except (TypeError, ValueError):
                try:
                    image = soup_link.find('img', class_='size-full')['src']
                except (TypeError, ValueError):
                    print('Capital: No image found')
            try:
                content = get_content(soup_link, 'entry-content').split('\u2013')[1].strip()
            except IndexError:
                content = get_content(soup_link, 'entry-content').strip()
            news_dict = {
                'category': 'news',
                'source': 'capital',
                'title': title,
                'link': link,
                'image': image,
                'content': content,
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            collection.update({'link': link}, news_dict, upsert=True)
            capital.append(news_dict)
        return capital