def get_blog(cls, file_name):
    if cls.is_exist(file_name):
        with open(cls._real_file_name(file_name), 'r', encoding='utf-8') as f:
            txt = f.read()
        mtime = os.path.getmtime(cls._real_file_name(file_name))
        from bs4 import BeautifulSoup, Comment
        import yaml
        # Blog metadata is stored as a YAML block inside the first HTML comment.
        comment = BeautifulSoup(txt, "html.parser").find(text=lambda text: isinstance(text, Comment))
        if comment is not None:
            # safe_load avoids the missing-Loader error on current PyYAML releases.
            blog_info = yaml.safe_load(str(comment))
            if 'use_toc' not in blog_info:
                blog_info['use_toc'] = False
            html = markdown(txt)
            return blog_info, txt, html, mtime
        else:
            return
    else:
        return
Python Comment() class: example source code
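All of the snippets on this page share one core pattern: parse the markup with BeautifulSoup, pick out the text nodes that are instances of `Comment`, and usually `extract()` them. A minimal, self-contained sketch of that pattern (the sample HTML is invented for illustration):

from bs4 import BeautifulSoup, Comment

html = "<div><!-- hidden note -->visible text</div>"  # illustrative input
soup = BeautifulSoup(html, "html.parser")

# Collect every comment node in the document.
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
print([str(c) for c in comments])  # [' hidden note ']

# Remove them in place, leaving only the visible markup.
for c in comments:
    c.extract()
print(soup)  # <div>visible text</div>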
def get_lyrics(artist, song):
    artist = format_artist(artist)
    song = format_song(song)
    time.sleep(1)
    url = LYRICS_URL.format(artist, song)
    content = None
    try:
        response = urlopen(url)
        content = response.read()
    except Exception as e:
        print(url)
        print(e)
        print("failed\n")
        return None
    soup = bs(content, "html.parser", parse_only=SoupStrainer('div'))
    # The lyrics block is marked by a "start of lyrics" / "Usage" comment;
    # return the comment's parent element with the markup stripped out.
    for lyrics in soup.find_all(string=lambda t: isinstance(t, Comment)):
        if "start of lyrics" in lyrics or "Usage" in lyrics:
            lyrics = re.sub('</?br/?>', '', str(lyrics.parent))
            lyrics = re.sub('<.*?>', '', str(lyrics))
            return str(lyrics)
def get_overrides(soup):
    overrides = []
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        m = re.match(OVERRIDE_COMMENT_REGEX, comment)
        if m:
            new_overrides = m.group(1).split(",")
            new_overrides = [o.strip() for o in new_overrides]
            logger.info("Overrides found: %s" % new_overrides)
            overrides += new_overrides
    return overrides
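`OVERRIDE_COMMENT_REGEX` and `logger` are defined elsewhere in that project. A usage sketch with a hypothetical regex (the pattern and sample HTML below are assumptions, not the project's actual values):

import logging
import re
from bs4 import BeautifulSoup, Comment

logger = logging.getLogger(__name__)
# Hypothetical pattern matching comments such as <!-- override: foo, bar -->
OVERRIDE_COMMENT_REGEX = r"\s*override:\s*(.+)"

soup = BeautifulSoup("<p><!-- override: skip_nav, dark_theme --></p>", "html.parser")
print(get_overrides(soup))  # ['skip_nav', 'dark_theme']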
def codeAnalyse(html, clas, name=""):
    soup = BeautifulSoup(html, "html.parser")
    source = soup.find('code', id="__cnt_0_4")
    soup = BeautifulSoup(str(source), "html.parser")
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    soup = BeautifulSoup(comments[0], "html.parser")
    source = soup.find('a', {"class": clas}, string=name)
    pos = 0
    for son in source.parent.find_next_sibling().find_next_siblings():
        pos = pos + 1
        print(source.string, ":", son.a.string, pos, son.a.attrs['href'])
def __get_navigable_strings(self, soup):
    if isinstance(soup, NavigableString):
        if type(soup) not in (Comment, Declaration) and soup.strip():
            yield soup
    elif soup.name not in ('script', 'style'):
        for c in soup.contents:
            for g in self.__get_navigable_strings(c):
                yield g
def text(self, target=None, ignore_pureascii_words=False):
    """
    Get all text in the HTML, skipping scripts and comments.
    :param target: the BeautifulSoup object, defaults to self.b
    :param ignore_pureascii_words: if True, only return strings containing non-ASCII (e.g. Chinese) characters; may be useful for English-language sites
    :return: list of str
    """
    if target is None:
        target = self.b
    from bs4 import Comment
    from bs4.element import NavigableString, Doctype
    result = []
    for descendant in target.descendants:
        if not isinstance(descendant, NavigableString) \
                or isinstance(descendant, Doctype) \
                or descendant.parent.name in ["script", "style"] \
                or isinstance(descendant, Comment) \
                or "none" in descendant.parent.get("style", "") \
                or "font-size:0px" in descendant.parent.get("style", ""):
            continue
        data = descendant.strip()
        if len(data) > 0:
            if not ignore_pureascii_words or any([ord(i) > 127 for i in data]):
                if PY2:
                    result.append(data.encode())
                else:
                    result.append(data)
    return result
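The same filtering rules can be exercised standalone; a short sketch with made-up HTML showing which nodes survive (only the visible non-ASCII paragraph is kept):

from bs4 import BeautifulSoup, Comment
from bs4.element import NavigableString, Doctype

html = ("<!DOCTYPE html><html><head><style>p {}</style></head>"
        "<body><p style='display:none'>hidden</p><!-- note --><p>kept 文字</p></body></html>")
soup = BeautifulSoup(html, "html.parser")

kept = []
for node in soup.descendants:
    # Skip tags, doctypes, comments, script/style text, and hidden elements.
    if (not isinstance(node, NavigableString)
            or isinstance(node, (Doctype, Comment))
            or node.parent.name in ("script", "style")
            or "none" in node.parent.get("style", "")):
        continue
    if node.strip():
        kept.append(node.strip())

print(kept)  # ['kept 文字']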
def get_lyric(self, singer, song):
    # Replace spaces with _
    singer = singer.replace(' ', '_')
    song = song.replace(' ', '_')
    url = 'http://lyrics.wikia.com/{0}:{1}'.format(singer, song)
    req = requests.get(url)
    s = BeautifulSoup(req.text, "lxml")
    # Get the main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is None:
        return None
    # Remove scripts
    [s.extract() for s in lyrics('script')]
    # Remove comments
    comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Remove unnecessary tags
    for tag in ['div', 'i', 'b', 'a']:
        for match in lyrics.findAll(tag):
            match.replaceWithChildren()
    # TODO: check if the encode/decode dance is needed; if so, wrap it in try/except
    # output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode('utf-8').replace('\n', '').replace('<br/>', '\n')
    # Serialize, drop raw newlines, turn <br/> into newlines, and slice off the wrapping div markup.
    output = str(lyrics).replace('\n', '').replace('<br/>', '\n')[22:-6:]
    try:
        return output
    except:
        return output.encode('utf-8')
def lyricswikia(artist, song):
    # original code found @
    # https://github.com/geekpradd/PyLyrics/blob/master/PyLyrics/functions.py
    song = song.split(' - ', 1)[0]
    artist = artist.replace(' ', '_')
    song = song.replace(' ', '_')
    url = 'http://lyrics.wikia.com/{0}:{1}'.format(artist, song)
    print('Trying:', url)
    r = requests.get(url)
    s = BeautifulSoup(r.text, 'html.parser')
    # Get the main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is not None:
        # Remove scripts
        [e.extract() for e in lyrics('script')]
        # Remove comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        # Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()
        # Get output as a string, drop non-unicode characters, and replace
        # <br> with newlines
        lyrics = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode(
            "utf-8").replace('\n', '').replace('<br/>', '\n')
        try:
            return lyrics
        except:
            return lyrics.encode('utf-8')
def _get_commented_CDN_tags(self):
    def get_comment(s):
        return s if isinstance(s, Comment) and '//' in s and s.strip()[:4] in ['<lin', '<scr'] else ''
    comments = bs(self._get_template()).find_all(string=get_comment)
    tags = self._unitags(bs(str(comments)).select('link[href*="//"], script[src*="//"]'))
    if tags:
        for tag in tags:
            for comment in comments:
                if tag['open'] in comment and tag['ref'] in comment:
                    tag['comment'] = comment
    return tags
def strip_html_comments(html):
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.find_all(text=lambda text: isinstance(text, html_comment)):
        element.extract()
    return str(soup)
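A quick usage sketch, assuming `html_comment` is that module's alias for `bs4.Comment` (which is how the snippet reads):

from bs4 import BeautifulSoup, Comment as html_comment

print(strip_html_comments("<ul><!-- todo: trim --><li>item</li></ul>"))
# <ul><li>item</li></ul>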
def findSalaries(self, soupped):
    total_salaries = []
    all_all_salaries = soupped.find("div", {"id": "all_all_salaries"})
    # The salary table is delivered inside an HTML comment, so it has to be re-parsed.
    comments = all_all_salaries.find_all(string=lambda text: isinstance(text, Comment))
    raw_salary_rows = BeautifulSoup(comments[0], "lxml").find("tbody").find_all("tr")
    for each_raw_salary in raw_salary_rows:
        year = each_raw_salary.find("th").text.replace("-", "_").encode("utf8")
        salary = self.salaryTextToFloat(each_raw_salary.find_all("td")[2].text)
        total_salaries.append((year, salary))
    return total_salaries
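The `id="all_all_salaries"` wrapper is characteristic of sports-reference pages, which (at least at the time) shipped some tables inside HTML comments; that is why the snippet re-parses `comments[0]` as a fresh document. A minimal sketch of the same two-pass parse with invented markup (the column layout and values are made up):

from bs4 import BeautifulSoup, Comment

html = '''<div id="all_all_salaries"><!--
<table><tbody><tr><th>2016-17</th><td>Team</td><td>$1,500,000</td></tr></tbody></table>
--></div>'''

outer = BeautifulSoup(html, "html.parser").find("div", {"id": "all_all_salaries"})
hidden = outer.find(string=lambda t: isinstance(t, Comment))
rows = BeautifulSoup(hidden, "html.parser").find("tbody").find_all("tr")
for row in rows:
    print(row.find("th").text, row.find_all("td")[1].text)  # 2016-17 $1,500,000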
def clean_tag(doc):
    for tag in doc.find_all(["style", "script", "form", "textarea", "input", "iframe", "select", "frame", "link"]):
        tag.extract()
    comments = doc.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
async def search_esv(message, verse):
    """
    Search for a bible passage from the English Standard Version.
    Example::
        bible Romans 12:16
    """
    r = await http.get("http://www.esvapi.org/v2/rest/passageQuery", params={
        "key": "IP",
        "passage": verse,
        "output-format": "crossway-xml-1.0",
        "include-simple-entities": "true",
    })
    doc = BeautifulSoup(r.text(), features="lxml")
    if not doc.passage:
        raise CommandError("Verse not found.")
    lines = []
    for verse_unit in doc.passage.content.find_all('verse-unit'):
        num = int(verse_unit.find('verse-num').text)
        woc = verse_unit.find('woc')
        if woc:
            text = woc.text
        else:
            # Keep plain text nodes only, skipping comments embedded in the XML.
            text = "".join([str(node) for node in verse_unit.children
                            if isinstance(node, NavigableString) and not isinstance(node, Comment)])
        lines.append("**{}** {}".format(num, text.strip()))
    return "\n".join(lines)
def unwrapUseless(soup):
    # Unwrap purely presentational tags, keeping their text in place.
    for name in ('a', 'b', 'font', 'span'):
        for tag in soup.select(name):
            tag.unwrap()
    # Drop HTML comments entirely.
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
def scrape_mythic_card_page(url):
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    # The page labels each field with an HTML comment (CARD NAME, MANA COST, ...);
    # the value is the text node that immediately follows the comment.
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    card = {}
    for comment in comments:
        if comment == 'CARD NAME':
            card['name'] = comment.next_element.strip().replace('"', '')
        elif comment == 'MANA COST':
            try:
                card['manaCost'] = comment.next_element.strip().replace('"', '')
            except:
                pass
        elif comment == 'TYPE':
            card['type'] = comment.next_element.strip().replace('"', '')
        elif comment == 'CARD TEXT':
            # Card text spans several nodes; collect them until the FLAVOR TEXT marker.
            buildText = ''
            for element in comment.next_elements:
                try:
                    if not element.strip() in ['CARD TEXT', 'FLAVOR TEXT', '']:
                        if buildText != '':
                            buildText += '\n'
                        buildText += element.strip()
                    if element.strip() == 'FLAVOR TEXT':
                        card['text'] = buildText
                        break
                except:
                    pass
        elif comment == 'Set Number':
            try:
                card['number'] = comment.next_element.strip()
            except:
                pass
        elif comment == 'P/T':
            try:
                if comment.next_element.strip().split('/')[0] != '':
                    card['power'] = comment.next_element.strip().split('/')[0]
                    card['toughness'] = comment.next_element.strip().split('/')[1]
            except:
                pass
    return card
SignAndSearch.py (project: relational-social-media-search-engine, author: indervirbanipal)
def loadSearch(self, url, firstName='results'):
    """
    Loads the search page using the url provided and returns raw search results
    """
    print(" inside loadSearch ..")
    # Proxies used previously:
    #   97.77.104.22:80
    #   174.129.204.124:80
    proxy = {
        "http": "209.222.25.83:3128",
    }
    headers = {'Accept-Encoding': 'identity'}
    html2 = requests.get(url, proxies=proxy, headers=headers)
    print("HTML 2")
    # print html2.content
    # html = html2.content
    html = self.loadPage(url)
    print("SPAGE")
    # print sPage[:200]
    spContent = BeautifulSoup(html)
    # title = spContent.find('title')
    # if title is not None:
    #     if title.string is not lSrchTitle:
    #         sys.exit('There is some problem with url provided, it does not correspond to Linkedin Search')
    comment = None
    comments = spContent.findAll(text=lambda text: isinstance(text, Comment))
    print("COMMENTS")
    # print comments
    # print " >> BEAUTIFULSOUP FINDALL"
    cLen = len(comments)
    print("Length of comments: " + str(cLen))
    # The result block is usually the 12th comment; otherwise fall back to searching by name.
    if cLen > 11:
        comment = comments[11]
    if comment is None:
        for cmnt in comments:
            if firstName in cmnt:
                comment = cmnt
    print("output COMMENTS:")
    # print comment
    return comment
def dealLocalFile():
    rootDir = os.getcwd()
    list_dirs = os.walk(rootDir)
    for root, dirs, files in list_dirs:
        # for d in dirs:
        #     print os.path.join(root, d)
        for f in files:
            if f.endswith('html'):
                path = os.path.join(root, f)
                soup = BeautifulSoup(open(path), 'html.parser')
                soup = soup.body
                # Remove HTML comments
                comments = soup.findAll(text=lambda text: isinstance(text, Comment))
                [comment.extract() for comment in comments]
                # Unwrap span tags
                spans = soup.select("span")
                [span.unwrap() for span in spans]
                # Unwrap font tags
                fonts = soup.select("font")
                [font.unwrap() for font in fonts]
                # Strip inline styles from p tags
                pps = soup.select("p")
                for pp in pps:
                    del pp['style']
                    # text = pp.get_text()
                    # text = text.strip()
                    # if text is '' or len(text) < 1:  # drop empty p tags
                    #     pp.extract()
                # imgs = soup.select("img")
                # for img in imgs:
                #     src = img['src']
                #     index = src.find('/')
                #     if index != -1:
                #         newSrc = 'imgs' + src[index:]
                #         img['src'] = newSrc
                #         # print newSrc
                # Use the first non-empty paragraph as the title and output file name.
                ps = soup.select('p')
                title = ''
                for p in ps:
                    if p.get_text() != '' and len(p.get_text()) > 0:
                        title = p.get_text()
                        p.extract()
                        break
                # Write the cleaned HTML to a file named after the title.
                fo = open(title + ".html", "w")
                fo.write(str(soup))
                fo.close()
                # print soup.prettify()
def _parse_tags(cls, html):
    excluded_tags = ['script', 'style', 'noscript', 'html', 'head', 'meta',
                     'link', 'body', 'input', 'form', 'a']
    minimum_text_node_length = 8
    y_data = []
    text_data = []
    tag_signatures = []
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.findAll():
        path = '.'.join(reversed([p.name for p in tag.parentGenerator() if p]))
        tag_signature = '.'.join([path, tag.name])
        if (tag.name not in excluded_tags) and ('table' not in path):
            tag_text = []
            for text in tag.contents:
                # Skip comment nodes; keep only direct text children with no nested markup.
                if isinstance(text, Comment):
                    continue
                try:
                    text = text.strip()
                    aux = BeautifulSoup(text, 'html.parser')
                    if aux.find() is None:
                        tag_text.append(text)
                except Exception:
                    pass
            tag_text = "\n".join(tag_text)
            if tag_text and len(tag_text) > minimum_text_node_length:
                if tag_text not in text_data:
                    # Remove line returns and tabs
                    tag_text = cls._remove_chars(tag_text)
                    if tag_text:
                        y_data.append(len(tag_text))
                        text_data.append(tag_text)
                        tag_signatures.append(path)
    x = np.array(y_data)
    return x, text_data, tag_signatures