Example source code for the Python fromstring() function
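All of the examples below follow the same basic pattern: fromstring() (from lxml.html or lxml.etree) parses markup held in a string into an element tree, which is then queried with xpath() or cssselect(). A minimal, self-contained sketch of that pattern is shown first; the markup and selectors are made up for illustration and are not taken from any of the projects below (cssselect() also requires the separate cssselect package).

from lxml import html

# hypothetical markup, used only to illustrate the parse-then-query pattern
page = '<div><p class="quote">Hello</p><p class="quote">World</p></div>'
tree = html.fromstring(page)

# the parsed tree can be queried with XPath or CSS selectors
by_xpath = tree.xpath('//p[@class="quote"]/text()')     # ['Hello', 'World']
by_css = [p.text for p in tree.cssselect('p.quote')]    # ['Hello', 'World']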

bash.py (project: apex-sigma-core, author: lu-ci)
def bash(cmd, message, args):
    if len(cache) == 0:
        async with aiohttp.ClientSession() as session:
            async with session.get('http://bash.org/?random1') as page:
                page = await page.text()
                quotes = html.fromstring(page).cssselect('body center table tr td[valign="top"]')[0]
        for index in range(1, len(quotes), 2):
            qid = quotes[index - 1][0][0].text
            score = quotes[index - 1][2].text
            quote = quotes[index].text_content()
            quote = {
                'id': qid[1:],
                'score': score,
                'quote': quote
            }
            cache.append(quote)
    quote = cache.pop()
    # skip quotes that do not fit within the message character limit
    while len(quote['quote']) > 2037:
        quote = cache.pop()
    text = quote['quote']
    highlight = 'xml' if text.strip()[0] == '<' else 'yaml'
    embed = Embed(type='rich', color=0xf7d7c4, description=f'```{highlight}\n{text}\n```')
    embed.set_author(name=f"?? #{quote['id']} | Score: {quote['score']}", url=f"http://bash.org/?{quote['id']}")
    await message.channel.send(None, embed=embed)
joke.py (project: apex-sigma-core, author: lu-ci)
def joke(cmd, message, args):
    randomizer = secrets.randbelow(6644)
    joke_url = f'http://jokes.cc.com/feeds/random/{randomizer}'
    async with aiohttp.ClientSession() as session:
        async with session.get(joke_url) as data:
            joke_json = await data.read()
            joke_json = json.loads(joke_json)
            joke_page_url = joke_json['0']['url']
    async with aiohttp.ClientSession() as session:
        async with session.get(joke_page_url) as data:
            page_data = await data.text()
    root = html.fromstring(page_data)
    content = root.cssselect('.content_wrap')[0]
    joke_text = ''
    for element in content.cssselect('p'):
        if element.text and element.text != '\n':
            joke_text += f'\n{element.text}'
    while '  ' in joke_text:
        joke_text = joke_text.replace('  ', ' ')
    joke_text = ftfy.fix_text(joke_text)
    embed = discord.Embed(color=0xFFDC5D)
    embed.add_field(name='?? Have A Random Joke', value=joke_text)
    await message.channel.send(None, embed=embed)
cyanideandhappiness.py (project: apex-sigma-core, author: lu-ci)
def cyanideandhappiness(cmd, message, args):
    comic_img_url = None
    comic_url = None
    while not comic_img_url:
        comic_number = secrets.randbelow(4665) + 1
        comic_url = f'http://explosm.net/comics/{comic_number}/'
        async with aiohttp.ClientSession() as session:
            async with session.get(comic_url) as data:
                page = await data.text()
        root = html.fromstring(page)
        comic_element = root.cssselect('#main-comic')
        # retry with a different comic number if the page has no #main-comic element
        if not comic_element:
            continue
        comic_img_url = comic_element[0].attrib['src']
        if comic_img_url.startswith('//'):
            comic_img_url = 'https:' + comic_img_url
    embed = discord.Embed(color=0xFF6600)
    embed.set_image(url=comic_img_url)
    cnh_image = 'https://i.imgur.com/jJl7FoT.jpg'
    embed.set_author(name='Cyanide and Happiness', icon_url=cnh_image, url=comic_url)
    await message.channel.send(None, embed=embed)
safe_core.py (project: apex-sigma-core, author: lu-ci)
def grab_post_list(tags):
    links = []
    for x in range(0, 20):
        resource = f'http://safebooru.org/index.php?page=dapi&s=post&q=index&tags={tags}&pid={x}'
        async with aiohttp.ClientSession() as session:
            async with session.get(resource) as data:
                data = await data.read()
        posts = html.fromstring(data)
        for post in posts:
            if 'file_url' in post.attrib:
                file_url = post.attrib['file_url']
                extension = file_url.split('.')[-1]
                if extension in ['png', 'jpg', 'jpeg', 'gif']:
                    height = int(post.attrib['height'])
                    width = int(post.attrib['width'])
                    if width < 2000 and height < 2000:
                        links.append(post)
    return links
ir_ui_view.py (project: gooderp_org, author: osbzr)
def replace_arch_section(self, cr, uid, view_id, section_xpath, replacement, context=None):
        # the root of the arch section shouldn't actually be replaced as it's
        # not really editable itself, only the content truly is editable.

        [view] = self.browse(cr, uid, [view_id], context=context)
        arch = etree.fromstring(view.arch.encode('utf-8'))
        # => get the replacement root
        if not section_xpath:
            root = arch
        else:
            # ensure there's only one match
            [root] = arch.xpath(section_xpath)

        root.text = replacement.text
        root.tail = replacement.tail
        # replace all children
        del root[:]
        for child in replacement:
            root.append(copy.deepcopy(child))

        return arch
betfair_com_scraper_utils.py (project: betfair.com, author: michalskop)
def scrape_subraces_old(href):
    url = settings.betfair_url + href + settings.betfair_url2_end
    r = requests.get(url)
    data = []
    if r.status_code == 200:
        datajson = r.json()
        domtree = html.fromstring(datajson['children'])
        ul = domtree.xpath('//ul[@class="children"]')[0]
        lis = ul.xpath('li')
        for li in lis:
            item = {}
            item['title'] = li.xpath('a/@market-name')[0]
            try:
                item['identifier'] = li.xpath('a/@market-id')[0]
                t = time.localtime(int(li.xpath('a/@market-time')[0]) / 1000)
                item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', t)
                data.append(item)
            except:
                data = data + scrape_subraces(li.xpath('a/@href')[0])
    return data
get_data.py (project: X-ray-classification, author: bendidi)
def extract(url):
    global img_no

    try:
        img_no += 1
        r = requests.get(url)
        tree = html.fromstring(r.text)

        div = tree.xpath('//table[@class="masterresultstable"]\
            //div[@class="meshtext-wrapper-left"]')
    except:
        div = []

    if div != []:
        div = div[0]
    else:
        return

    typ = div.xpath('.//strong/text()')[0]
    items = div.xpath('.//li/text()')
    img = tree.xpath('//img[@id="theImage"]/@src')[0]


    final_data[img_no] = {}
    final_data[img_no]['type'] = typ
    final_data[img_no]['items'] = items
    final_data[img_no]['img'] = domain + img
    try:
        urllib.urlretrieve(domain+img, path+str(img_no)+".png")
        with open('data_new.json', 'w') as f:
            json.dump(final_data, f)

        output = "Downloading Images : {}".format(img_no)
        sys.stdout.write("\r\x1b[K" + output)
        sys.stdout.flush()
    except:
        return
ddg_parser.py (project: duck-feed, author: h0m3stuck)
def get_links(query):
    urlencoded_query = urllib.parse.quote_plus(query)
    r = requests.get("https://duckduckgo.com/html/?q=" + urlencoded_query,
                     headers={'User-Agent': USER_AGENT})

    tree = html.fromstring(r.content)

    return tree.xpath('//h2[@class="result__title"]/a[@class="result__a"]/@href')
web_scrapper.py (project: duck-feed, author: h0m3stuck)
def scrape_web(website):
    r = requests.get(website, timeout=5)

    tree = html.fromstring(r.content)
    rss_links = tree.xpath('//link[@rel="alternate" and @type="application/atom+xml"]/@href')

    if len(rss_links) == 0:
        raise NoLinkError(website)
    else:
        return urllib.parse.urljoin(website, rss_links[0])
scrape.py (project: hearthscan-bot, author: d-schmidt)
def getHearthpwnIdAndUrl(name, set, type, isToken, session):
    log.debug("getHearthpwnIdAndUrl() getting for %s", name)
    # hearthpwn is also weird
    hpname_hacked = name.replace('-', ' ').replace('!', '')
    premium = 0 if isToken else 1

    # filter-name={}&filter-premium={}&filter-type={}&filter-set={}
    r = session.get(setUrlTempl.format(hpname_hacked, premium, hsTypeId[type], setNameIds[set]))
    r.raise_for_status()
    html = fromstring(r.text)

    images = html.xpath('//td[@class="visual-image-cell"]/a/img')
    descs = html.xpath('//td[@class="visual-details-cell"]/h3/a')

    for i in range(len(images)):
        title = descs[i].text

        if title.lower() == name.lower():
            image = images[i].get('src')
            if not image:
                image = 'http://media-hearth.cursecdn.com/avatars/148/738/687.png'
            # /cards/31128-annoy-o-tron-fanclub
            hpid = hpIdRegex.match(images[i].get('data-href')).group(1)
            return int(hpid), image.replace('http://', 'https://').lower()

    log.debug("getHearthpwnIdAndUrl() card not found at hearthpwn '%s' '%s'", set, name)
    raise Exception("getHearthpwnIdAndUrl() card " + name + " not found at hearthpwn")
scrape.py (project: hearthscan-bot, author: d-schmidt)
def loadTokens(tokens = {}, wantedTokens = {}):
    resultCards = {}
    with requests.Session() as session:
        for name, ids in wantedTokens.items():
            card = None

            if 'id' in ids:
                card = tokens[ids['id']]
                if name != card['name']:
                    log.warning('loadTokens() names do not match: %s - %s', name, tokens[ids['id']]['name'])

            if 'id' not in ids:
                for token in tokens.values():
                    if name == token['name']:
                        if card:
                            log.warning('loadTokens() found token again: %s', name)
                        card = token

            if not card:
                log.warning('loadTokens() could not find: %s', name)
                exit()

            r = session.get('http://www.hearthpwn.com/cards/{}'.format(ids['hpwn']))
            r.raise_for_status()
            image = fromstring(r.text).xpath('//img[@class="hscard-static"]')[0].get('src')
            if not image:
                image = 'https://media-hearth.cursecdn.com/avatars/148/738/687.png'

            card['cdn'] = image.replace('http://', 'https://').lower()
            card['hpwn'] = ids['hpwn']
            card['head'] = getHearthHeadId(card['name'], "ignored", "ignored")

            # since jade golem: overwrite scraped stats with prepared ones
            card['atk'] = ids.get('atk', card['atk'])
            card['cost'] = ids.get('cost', card['cost'])
            card['hp'] = ids.get('hp', card['hp'])

            resultCards[card['name']] = card
            print('.', end='')

    return resultCards
testtools.py (project: Flask_Blog, author: sugarguo)
def lxml(self):
        """Get an lxml etree if possible."""
        if ('html' not in self.mimetype and 'xml' not in self.mimetype):
            raise AttributeError('Not an HTML/XML response')
        from lxml import etree
        try:
            from lxml.html import fromstring
        except ImportError:
            fromstring = etree.HTML
        if self.mimetype == 'text/html':
            return fromstring(self.data)
        return etree.XML(self.data)
spider.py (project: my_zhihu_spider, author: MicroCountry)
def analy_following_profile(self,html_text):
        tree = html.fromstring(html_text)
        url_list = tree.xpath("//h2[@class='ContentItem-title']//span[@class='UserLink UserItem-name']//a[@class='UserLink-link']/@href")
        for target_url in url_list:
            target_url = "https://www.zhihu.com" + target_url
            target_url = target_url.replace("https", "http")
            if red.sadd('red_had_spider', target_url):
                red.lpush('red_to_spider', target_url)
mtgs_scraper.py (project: Magic-Spoiler, author: Cockatrice)
def list_mtgs_gallery(url=''):
    if url == '':
        return ''
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardstree = tree.xpath('//*[contains(@class, "log-card")]')
    for child in cardstree:
        cards.append(child.text)
    return cards
wizards_scraper.py (project: Magic-Spoiler, author: Cockatrice)
def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation", setinfo={"code": "HOU"}, showRarityColors=False, showFrameColors=False, manual_cards=[], delete_cards=[]):
    if 'name' in setinfo:
        url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['name'].lower().replace('of', '').replace(
            '  ', ' ').replace(' ', '-')
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardtree = tree.xpath('//*[@id="content-detail-page-of-an-article"]')
    for child in cardtree:
        cardElements = child.xpath('//*/p/img')
        cardcount = 0
        for cardElement in cardElements:
            card = {
                "name": cardElement.attrib['alt'].replace(u"\u2019", '\'').split(' /// ')[0],
                "img": cardElement.attrib['src']
            }
            card["url"] = card["img"]
            #card["cmc"] = 0
            #card["manaCost"] = ""
            #card["type"] = ""
            #card["types"] = []
            #card["text"] = ""
            #card["colorIdentity"] = [""]

            # if card['name'] in split_cards:
            #    card["names"] = [card['name'], split_cards[card['name']]]
            #    card["layout"] = "split"
            #notSplit = True
            # for backsplit in split_cards:
            #    if card['name'] == split_cards[backsplit]:
            #        notSplit = False
            # if not card['name'] in delete_cards:
            cards.append(card)
            cardcount += 1
    fullspoil = {"cards": cards}
    print "Spoil Gallery has " + str(cardcount) + " cards."
    download_images(fullspoil['cards'], setinfo['code'])
    fullspoil = get_rarities_by_symbol(fullspoil, setinfo['code'])
    fullspoil = get_mana_symbols(fullspoil, setinfo['code'])
    #fullspoil = get_colors_by_frame(fullspoil, setinfo['code'])
    return fullspoil
testtools.py (project: swjtu-pyscraper, author: Desgard)
def lxml(self):
        """Get an lxml etree if possible."""
        if ('html' not in self.mimetype and 'xml' not in self.mimetype):
            raise AttributeError('Not an HTML/XML response')
        from lxml import etree
        try:
            from lxml.html import fromstring
        except ImportError:
            fromstring = etree.HTML
        if self.mimetype == 'text/html':
            return fromstring(self.data)
        return etree.XML(self.data)
caixabreak.py (project: caixabreak, author: kintoandar)
def get_html_tree():
    """Gets and converts the management interface page into a parsable tree."""
    try:
        with requests.Session() as s:
            s.get(_config['base_url'] + _config['welcome_page'],
                  data=_config['welcome_credentials'])
            s.post(_config['base_url'] + _config['login_page'],
                   data=_config['login_credentials'])
            r = s.get(_config['base_url'] + _config['management_page'])
    except Exception as e:
        logging.error(str(e))
        raise e
    return html.fromstring(r.content)
parser.py (project: wiktionary-translations-parser, author: elfxiong)
def main():
    """Command line entry point."""
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description=sys.modules[__name__].__doc__)
    parser.add_argument(
        'article_file', metavar='ARTICLE', type=argparse.FileType(),
        help='path to Wiktionary article file')
    parser.add_argument(
        '-z', '--zim-file', action='store_true',
        help='treat the article file as a ZIM archive, instead of HTML '
             'source')
    parser.add_argument(
        '-d', '--debug', action='store_true',
        help='enable debugging output')
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO)

    if args.zim_file:
        article_tuples = ZimFile(args.article_file).article_tuples()
    else:
        article_tuples = [(None, None, args.article_file.read())]

    for article_tuple in article_tuples:
        context = {'edition': article_tuple[0], 'pagename': article_tuple[1]}
        doc = html.fromstring(article_tuple[2])
        for translation in parse_document(doc):
            translation.update(context)
            print json.dumps(translation)
util.py (project: oadoi, author: Impactstory)
def get_tree(page):
    page = page.replace("&nbsp;", " ")  # otherwise starts-with for lxml doesn't work
    try:
        tree = html.fromstring(page)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        print u"not parsing, beause etree error in get_tree: {}".format(e)
        tree = None
    return tree
testtools.py (project: zanph, author: zanph)
def lxml(self):
        """Get an lxml etree if possible."""
        if ('html' not in self.mimetype and 'xml' not in self.mimetype):
            raise AttributeError('Not an HTML/XML response')
        from lxml import etree
        try:
            from lxml.html import fromstring
        except ImportError:
            fromstring = etree.HTML
        if self.mimetype == 'text/html':
            return fromstring(self.data)
        return etree.XML(self.data)

