async def bash(cmd, message, args):
    if len(cache) == 0:
        async with aiohttp.ClientSession() as session:
            async with session.get('http://bash.org/?random1') as page:
                page = await page.text()
        quotes = html.fromstring(page).cssselect('body center table tr td[valign="top"]')[0]
        # elements come in pairs: a header (quote id and score) followed by the quote text
        for index in range(1, len(quotes), 2):
            qid = quotes[index - 1][0][0].text
            score = quotes[index - 1][2].text
            quote = quotes[index].text_content()
            quote = {
                'id': qid[1:],
                'score': score,
                'quote': quote
            }
            cache.append(quote)
    quote = cache.pop()
    # skip quotes that do not fit into the message character limit
    while len(quote['quote']) > 2037:
        quote = cache.pop()
    text = quote['quote']
    highlight = 'xml' if text.strip()[0] == '<' else 'yaml'
    embed = Embed(type='rich', color=0xf7d7c4, description=f'```{highlight}\n{text}\n```')
    embed.set_author(name=f"?? #{quote['id']} | Score: {quote['score']}", url=f"http://bash.org/?{quote['id']}")
    await message.channel.send(None, embed=embed)
async def joke(cmd, message, args):
    randomizer = secrets.randbelow(6644)
    joke_url = f'http://jokes.cc.com/feeds/random/{randomizer}'
    async with aiohttp.ClientSession() as session:
        async with session.get(joke_url) as data:
            joke_json = await data.read()
    joke_json = json.loads(joke_json)
    joke_page_url = joke_json['0']['url']
    async with aiohttp.ClientSession() as session:
        async with session.get(joke_page_url) as data:
            page_data = await data.text()
    root = html.fromstring(page_data)
    content = root.cssselect('.content_wrap')[0]
    joke_text = ''
    for element in content.cssselect('p'):
        if element.text != '' and element.text != '\n':
            joke_text += f'\n{element.text}'
    # collapse double spaces into single ones
    while '  ' in joke_text:
        joke_text = joke_text.replace('  ', ' ')
    joke_text = ftfy.fix_text(joke_text)
    embed = discord.Embed(color=0xFFDC5D)
    embed.add_field(name='?? Have A Random Joke', value=joke_text)
    await message.channel.send(None, embed=embed)
async def cyanideandhappiness(cmd, message, args):
    comic_img_url = None
    comic_url = None
    while not comic_img_url:
        comic_number = secrets.randbelow(4665) + 1
        comic_url = f'http://explosm.net/comics/{comic_number}/'
        async with aiohttp.ClientSession() as session:
            async with session.get(comic_url) as data:
                page = await data.text()
        root = html.fromstring(page)
        comic_element = root.cssselect('#main-comic')
        comic_img_url = comic_element[0].attrib['src']
        # the src attribute may be protocol-relative ('//...'); prefix a scheme
        if comic_img_url.startswith('//'):
            comic_img_url = 'https:' + comic_img_url
    embed = discord.Embed(color=0xFF6600)
    embed.set_image(url=comic_img_url)
    cnh_image = 'https://i.imgur.com/jJl7FoT.jpg'
    embed.set_author(name='Cyanide and Happiness', icon_url=cnh_image, url=comic_url)
    await message.channel.send(None, embed=embed)
async def grab_post_list(tags):
    links = []
    for x in range(0, 20):
        resource = f'http://safebooru.org/index.php?page=dapi&s=post&q=index&tags={tags}&pid={x}'
        async with aiohttp.ClientSession() as session:
            async with session.get(resource) as data:
                data = await data.read()
        posts = html.fromstring(data)
        for post in posts:
            if 'file_url' in post.attrib:
                file_url = post.attrib['file_url']
                extention = file_url.split('.')[-1]
                if extention in ['png', 'jpg', 'jpeg', 'gif']:
                    height = int(post.attrib['height'])
                    width = int(post.attrib['width'])
                    if width < 2000 and height < 2000:
                        links.append(post)
    return links
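# A minimal, self-contained sketch (not part of the original source) of the same
# filtering logic, run against a hand-written sample of the booru-style XML API
# response; attribute names mirror those used in grab_post_list() above.
from lxml import html

sample_api_response = '''
<posts>
  <post file_url="//example.invalid/a.jpg" height="900" width="1200"/>
  <post file_url="//example.invalid/b.webm" height="720" width="1280"/>
  <post file_url="//example.invalid/c.png" height="3000" width="4000"/>
</posts>
'''

posts = html.fromstring(sample_api_response)
keep = [
    post for post in posts
    if 'file_url' in post.attrib
    and post.attrib['file_url'].split('.')[-1] in ['png', 'jpg', 'jpeg', 'gif']
    and int(post.attrib['width']) < 2000 and int(post.attrib['height']) < 2000
]
# keep -> only the first <post>: allowed extension and both dimensions under 2000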
def replace_arch_section(self, cr, uid, view_id, section_xpath, replacement, context=None):
    # the root of the arch section shouldn't actually be replaced as it's
    # not really editable itself, only the content truly is editable.
    [view] = self.browse(cr, uid, [view_id], context=context)
    arch = etree.fromstring(view.arch.encode('utf-8'))
    # => get the replacement root
    if not section_xpath:
        root = arch
    else:
        # ensure there's only one match
        [root] = arch.xpath(section_xpath)
    root.text = replacement.text
    root.tail = replacement.tail
    # replace all children
    del root[:]
    for child in replacement:
        root.append(copy.deepcopy(child))
    return arch
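# A minimal sketch (not from the original module) of the replacement pattern used
# above: keep the matched element itself, but copy text, tail and children over
# from the replacement node. The element names here are made up for illustration.
import copy
from lxml import etree

arch = etree.fromstring('<form><group><field name="old"/></group></form>')
replacement = etree.fromstring('<group>intro<field name="new"/></group>')

[root] = arch.xpath('//group')   # ensure there's only one match
root.text = replacement.text
root.tail = replacement.tail
del root[:]                      # drop the old children
for child in replacement:
    root.append(copy.deepcopy(child))

print(etree.tostring(arch))
# b'<form><group>intro<field name="new"/></group></form>'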
def scrape_subraces_old(href):
    url = settings.betfair_url + href + settings.betfair_url2_end
    r = requests.get(url)
    data = []
    if r.status_code == 200:
        datajson = r.json()
        domtree = html.fromstring(datajson['children'])
        ul = domtree.xpath('//ul[@class="children"]')[0]
        lis = ul.xpath('li')
        for li in lis:
            item = {}
            item['title'] = li.xpath('a/@market-name')[0]
            try:
                item['identifier'] = li.xpath('a/@market-id')[0]
                t = time.localtime(int(li.xpath('a/@market-time')[0]) / 1000)
                item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', t)
                data.append(item)
            except:
                # no market attributes on this node: recurse into its children instead
                data = data + scrape_subraces(li.xpath('a/@href')[0])
    return data
def extract(url):
    global img_no
    try:
        img_no += 1
        r = requests.get(url)
        tree = html.fromstring(r.text)
        div = tree.xpath('//table[@class="masterresultstable"]\
            //div[@class="meshtext-wrapper-left"]')
    except:
        div = []
    if div != []:
        div = div[0]
    else:
        return
    typ = div.xpath('.//strong/text()')[0]
    items = div.xpath('.//li/text()')
    img = tree.xpath('//img[@id="theImage"]/@src')[0]
    final_data[img_no] = {}
    final_data[img_no]['type'] = typ
    final_data[img_no]['items'] = items
    final_data[img_no]['img'] = domain + img
    try:
        urllib.urlretrieve(domain + img, path + str(img_no) + ".png")
        with open('data_new.json', 'w') as f:
            json.dump(final_data, f)
        output = "Downloading Images : {}".format(img_no)
        sys.stdout.write("\r\x1b[K" + output)
        sys.stdout.flush()
    except:
        return
def get_links(query):
    urlencoded_query = urllib.parse.quote_plus(query)
    r = requests.get("https://duckduckgo.com/html/?q=" + urlencoded_query,
                     headers={'User-Agent': USER_AGENT})
    tree = html.fromstring(r.content)
    return tree.xpath('//h2[@class="result__title"]/a[@class="result__a"]/@href')
def scrape_web(website):
    r = requests.get(website, timeout=5)
    tree = html.fromstring(r.content)
    rss_links = tree.xpath('//link[@rel="alternate" and @type="application/atom+xml"]/@href')
    if len(rss_links) == 0:
        raise NoLinkError(website)
    else:
        return urllib.parse.urljoin(website, rss_links[0])
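# A short usage sketch (hypothetical URLs, not from the original project) showing
# why urljoin is needed: feed hrefs in <link rel="alternate"> are often relative.
import urllib.parse

print(urllib.parse.urljoin('https://example.com/blog/post', '/feed.atom'))
# https://example.com/feed.atom
print(urllib.parse.urljoin('https://example.com/blog/post', 'feed.atom'))
# https://example.com/blog/feed.atom
print(urllib.parse.urljoin('https://example.com/blog/post', 'https://feeds.example.org/x.atom'))
# https://feeds.example.org/x.atom  (absolute hrefs come back unchanged)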
def getHearthpwnIdAndUrl(name, set, type, isToken, session):
    log.debug("getHearthpwnIdAndUrl() getting for %s", name)
    # hearthpwn is also weird
    hpname_hacked = name.replace('-', ' ').replace('!', '')
    premium = 0 if isToken else 1
    # filter-name={}&filter-premium={}&filter-type={}&filter-set={}
    r = session.get(setUrlTempl.format(hpname_hacked, premium, hsTypeId[type], setNameIds[set]))
    r.raise_for_status()
    html = fromstring(r.text)
    images = html.xpath('//td[@class="visual-image-cell"]/a/img')
    descs = html.xpath('//td[@class="visual-details-cell"]/h3/a')
    for i in range(len(images)):
        title = descs[i].text
        if title.lower() == name.lower():
            image = images[i].get('src')
            if not image:
                image = 'http://media-hearth.cursecdn.com/avatars/148/738/687.png'
            # /cards/31128-annoy-o-tron-fanclub
            hpid = hpIdRegex.match(images[i].get('data-href')).group(1)
            return int(hpid), image.replace('http://', 'https://').lower()
    log.debug("getHearthpwnIdAndUrl() card not found at hearthpwn '%s' '%s'", set, name)
    raise Exception("getHearthpwnIdAndUrl() card " + name + " not found at hearthpwn")
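# hpIdRegex is not shown in this excerpt; judging by the '/cards/31128-annoy-o-tron-fanclub'
# example above, a pattern along these lines would pull out the numeric card id
# (an assumption for illustration, not the project's actual regex):
import re

hpIdRegex = re.compile(r'.*/cards/(\d+)-')
assert hpIdRegex.match('/cards/31128-annoy-o-tron-fanclub').group(1) == '31128'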
def loadTokens(tokens={}, wantedTokens={}):
    resultCards = {}
    with requests.Session() as session:
        for name, ids in wantedTokens.items():
            card = None
            if 'id' in ids:
                card = tokens[ids['id']]
                if name != card['name']:
                    log.warning('loadTokens() names do not match: %s - %s', name, tokens[ids['id']]['name'])
            if 'id' not in ids:
                for token in tokens.values():
                    if name == token['name']:
                        if card:
                            log.warning('loadTokens() found token again: %s', name)
                        card = token
            if not card:
                log.warning('loadTokens() could not find: %s', name)
                exit()
            r = session.get('http://www.hearthpwn.com/cards/{}'.format(ids['hpwn']))
            r.raise_for_status()
            image = fromstring(r.text).xpath('//img[@class="hscard-static"]')[0].get('src')
            if not image:
                image = 'https://media-hearth.cursecdn.com/avatars/148/738/687.png'
            card['cdn'] = image.replace('http://', 'https://').lower()
            card['hpwn'] = ids['hpwn']
            card['head'] = getHearthHeadId(card['name'], "ignored", "ignored")
            # since jade golem: overwrite scraped stats with prepared ones
            card['atk'] = ids.get('atk', card['atk'])
            card['cost'] = ids.get('cost', card['cost'])
            card['hp'] = ids.get('hp', card['hp'])
            resultCards[card['name']] = card
            print('.', end='')
    return resultCards
def lxml(self):
    """Get an lxml etree if possible."""
    if 'html' not in self.mimetype and 'xml' not in self.mimetype:
        raise AttributeError('Not an HTML/XML response')
    from lxml import etree
    try:
        from lxml.html import fromstring
    except ImportError:
        fromstring = etree.HTML
    if self.mimetype == 'text/html':
        return fromstring(self.data)
    return etree.XML(self.data)
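# A small sketch (separate from the response class above) of why the property
# routes text/html through lxml.html.fromstring and everything else through
# etree.XML: the HTML parser is forgiving, the XML parser is strict.
from lxml import etree, html

html.fromstring('<p>one<br>two')   # parses fine; unclosed tags are repaired
try:
    etree.XML('<p>one<br>two')
except etree.XMLSyntaxError as e:
    print('strict XML parser rejects it:', e)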
def analy_following_profile(self, html_text):
    tree = html.fromstring(html_text)
    url_list = tree.xpath("//h2[@class='ContentItem-title']//span[@class='UserLink UserItem-name']//a[@class='UserLink-link']/@href")
    for target_url in url_list:
        target_url = "https://www.zhihu.com" + target_url
        target_url = target_url.replace("https", "http")
        # SADD returns 1 only for URLs not already in the visited set, so this de-duplicates before queueing
        if red.sadd('red_had_spider', target_url):
            red.lpush('red_to_spider', target_url)
def list_mtgs_gallery(url=''):
    if url == '':
        return ''
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardstree = tree.xpath('//*[contains(@class, "log-card")]')
    for child in cardstree:
        cards.append(child.text)
    return cards
def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation", setinfo={"code": "HOU"}, showRarityColors=False, showFrameColors=False, manual_cards=[], delete_cards=[]):
    if 'name' in setinfo:
        url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['name'].lower().replace('of', '').replace(
            '  ', ' ').replace(' ', '-')
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardtree = tree.xpath('//*[@id="content-detail-page-of-an-article"]')
    for child in cardtree:
        cardElements = child.xpath('//*/p/img')
        cardcount = 0
        for cardElement in cardElements:
            card = {
                "name": cardElement.attrib['alt'].replace(u"\u2019", '\'').split(' /// ')[0],
                "img": cardElement.attrib['src']
            }
            card["url"] = card["img"]
            #card["cmc"] = 0
            #card["manaCost"] = ""
            #card["type"] = ""
            #card["types"] = []
            #card["text"] = ""
            #card["colorIdentity"] = [""]
            # if card['name'] in split_cards:
            #     card["names"] = [card['name'], split_cards[card['name']]]
            #     card["layout"] = "split"
            #notSplit = True
            # for backsplit in split_cards:
            #     if card['name'] == split_cards[backsplit]:
            #         notSplit = False
            # if not card['name'] in delete_cards:
            cards.append(card)
            cardcount += 1
    fullspoil = {"cards": cards}
    print "Spoil Gallery has " + str(cardcount) + " cards."
    download_images(fullspoil['cards'], setinfo['code'])
    fullspoil = get_rarities_by_symbol(fullspoil, setinfo['code'])
    fullspoil = get_mana_symbols(fullspoil, setinfo['code'])
    #fullspoil = get_colors_by_frame(fullspoil, setinfo['code'])
    return fullspoil
def get_html_tree():
    """Gets and converts the management interface page into a parsable tree."""
    try:
        with requests.Session() as s:
            s.get(_config['base_url'] + _config['welcome_page'],
                  data=_config['welcome_credentials'])
            s.post(_config['base_url'] + _config['login_page'],
                   data=_config['login_credentials'])
            r = s.get(_config['base_url'] + _config['management_page'])
    except Exception as e:
        logging.error(str(e))
        raise e
    return html.fromstring(r.content)
def main():
    """Command line entry point."""
    import argparse
    import sys
    parser = argparse.ArgumentParser(
        description=sys.modules[__name__].__doc__)
    parser.add_argument(
        'article_file', metavar='ARTICLE', type=argparse.FileType(),
        help='path to Wiktionary article file')
    parser.add_argument(
        '-z', '--zim-file', action='store_true',
        help='treat the article file as a ZIM archive, instead of HTML '
             'source')
    parser.add_argument(
        '-d', '--debug', action='store_true',
        help='enable debugging output')
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO)
    if args.zim_file:
        article_tuples = ZimFile(args.article_file).article_tuples()
    else:
        article_tuples = [(None, None, args.article_file.read())]
    for article_tuple in article_tuples:
        context = {'edition': article_tuple[0], 'pagename': article_tuple[1]}
        doc = html.fromstring(article_tuple[2])
        for translation in parse_document(doc):
            translation.update(context)
            print json.dumps(translation)
def get_tree(page):
    page = page.replace("&nbsp;", " ")  # otherwise starts-with for lxml doesn't work
    try:
        tree = html.fromstring(page)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        print u"not parsing, because etree error in get_tree: {}".format(e)
        tree = None
    return tree
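# A sketch of what the &nbsp; replacement above guards against (assumed intent,
# based on the inline comment): a non-breaking space (U+00A0) is not XPath
# whitespace, so normalize-space() keeps it and starts-with() comparisons fail.
from lxml import html

with_nbsp = html.fromstring(u'<p>&nbsp;Open access</p>')
cleaned = html.fromstring(u'<p> Open access</p>')

print(with_nbsp.xpath(u'//p[starts-with(normalize-space(text()), "Open")]'))  # []
print(cleaned.xpath(u'//p[starts-with(normalize-space(text()), "Open")]'))    # [<Element p>]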