Python read_html() usage examples
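
Before the project snippets, a minimal, self-contained sketch of what pandas.read_html() does: it parses every <table> element in an HTML document and returns a list of DataFrames. The HTML below is made up for illustration; the keyword arguments seen throughout the snippets (match, attrs, header, index_col, skiprows) only filter or reshape this same result.

from io import StringIO
import pandas as pd

html = '''
<table>
  <tr><th>code</th><th>price</th></tr>
  <tr><td>000001</td><td>10.5</td></tr>
</table>
'''
tables = pd.read_html(StringIO(html))  # always returns a list of DataFrames
df = tables[0]
print(df)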

CA_AB.py (project: electricitymap, author: tmrowco)
def fetch_exchange(country_code1='CA-AB', country_code2='CA-BC', session=None):
    """Requests the last known power exchange (in MW) between two countries

    Arguments:
    country_code1, country_code2 (optional) -- country codes of the two sides of the exchange
    session (optional)      -- request session passed in order to re-use an existing session

    Return:
    A dictionary in the form:
    {
      'sortedCountryCodes': 'DK->NO',
      'datetime': '2017-01-01T00:00:00Z',
      'netFlow': 0.0,
      'source': 'mysource.com'
    }
    """

    r = session or requests.session()
    url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/CSDReportServlet'
    response = r.get(url)
    df_exchanges = pd.read_html(response.text, match='INTERCHANGE', skiprows=0, index_col=0)

    flows = {
        'CA-AB->CA-BC': df_exchanges[1][1]['British Columbia'],
        'CA-AB->CA-SK': df_exchanges[1][1]['Saskatchewan'],
        'CA-AB->US': df_exchanges[1][1]['Montana']
    }
    sortedCountryCodes = '->'.join(sorted([country_code1, country_code2]))
    if sortedCountryCodes not in flows:
        raise NotImplementedError('This exchange pair is not implemented')

    return {
        'datetime': arrow.now(tz=ab_timezone).datetime,
        'sortedCountryCodes': sortedCountryCodes,
        'netFlow': float(flows[sortedCountryCodes]),
        'source': 'ets.aeso.ca'
    }
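
A hypothetical usage sketch for fetch_exchange() above; it assumes the module-level imports (requests, pandas, arrow, ab_timezone) are in place and needs live network access to ets.aeso.ca:

exchange = fetch_exchange('CA-AB', 'CA-BC')
# The keys follow the dictionary format documented in the docstring
print(exchange['sortedCountryCodes'], exchange['netFlow'], 'MW')
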
CR.py (project: electricitymap, author: tmrowco)
def fetch_production(country_code='CR', session=None):
    # Do not reuse an existing session here, as some caching takes place
    r = requests.session()
    url = 'https://appcenter.grupoice.com/CenceWeb/CencePosdespachoNacional.jsf'
    response = r.get(url)
    df_yesterday = pd.read_html(response.text, skiprows=1, index_col=0, header=0)[0]

    soup = BeautifulSoup(response.text, 'html.parser')
    yesterday_date = soup.select('#formPosdespacho:pickFechaInputDate')[0]['value']
    jsf_view_state = soup.select('#javax.faces.ViewState')[0]['value']

    yesterday = arrow.get(yesterday_date, 'DD/MM/YYYY', tzinfo=TIMEZONE)
    today = yesterday.shift(days=+1)

    data = [
        ('formPosdespacho', 'formPosdespacho'),
        ('formPosdespacho:pickFechaInputDate', today.format(DATE_FORMAT)),
        ('formPosdespacho:pickFechaInputCurrentDate', today.format(MONTH_FORMAT)),
        ('formPosdespacho:j_id35.x', ''),
        ('formPosdespacho:j_id35.y', ''),
        ('javax.faces.ViewState', jsf_view_state),
    ]
    response = r.post(url, cookies={}, data=data)
    df_today = pd.read_html(response.text, skiprows=1, index_col=0)[0]

    ydata = df_to_data(country_code, yesterday, df_yesterday)
    tdata = df_to_data(country_code, today, df_today)
    production = ydata + tdata
    unknown_plants()

    return production
get.py (project: wikischolar, author: evoapps)
def convert_wiki_to_table(wiki_text, n_table=0):
    html_text = pypandoc.convert(wiki_text, 'html', 'mediawiki')
    tables = pandas.read_html(html_text)
    return tables[n_table]
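
A quick usage sketch for convert_wiki_to_table(), assuming pypandoc and a pandoc binary are installed; the MediaWiki markup is made up:

wiki_text = '''
{| class="wikitable"
! year !! edits
|-
| 2016 || 120
|-
| 2017 || 340
|}
'''
print(convert_wiki_to_table(wiki_text))
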
Fetch_Data_Stock_US_Short.py (project: StockRecommendSystem, author: doncat99)
def getSignleStockShortInfo(stock):
    df = pd.DataFrame()
    url = "http://shortsqueeze.com/?symbol=" + stock + "&submit=Short+Quote%E2%84%A2"
    repeat_times = 3
    downloadFailed = True

    for _ in range(repeat_times): 
        try:
            response = requests.get(url, timeout=15)
            downloadFailed = False
            break
        except Exception as e:
            print ("exception in get stock:" + stock, str(e))
            continue

    if downloadFailed:
        return "", df

    try:    
        tables = pd.read_html(response.text, attrs={'cellpadding': '3', 'width': '100%'})
    except Exception as e:
        print ("exception in parse stock:" + stock, str(e))
        return "", df

    for table in tables:
        if df.empty:
            df = table
        else:
            df = pd.concat([df, table])
    df = df.reset_index(drop=True)  # assigning the result of inplace=True would set df to None
    #print(df)

    soup = BeautifulSoup(response.text, 'lxml')
    dateString = soup.find('span', {"style" : "color:#999999;font-family: verdana, arial, helvetica;font-size:10px"}).get_text()
    date = datetime.datetime.strptime(dateString, '%A %B %d, %Y')
    return date, df.T
Commune.py (project: frenchy, author: miroli)
def __init__(self, code, year):
        self._geo = get_geo(code, year)
        self.url = url_resolver(code, year, self._geo['region_code'],
                                self._geo['department_code'])

        tables = pd.read_html(self.url, header=0, encoding='utf8', decimal=',',
                              thousands=' ')
        self._parse(tables)
pricehistory.py (project: pyiem, author: rheineke)
def read_quote_frames(mkt_conf):
    url = _market_quote_url(mkt_conf)
    response = requests.get(url=url)
    dfs = pd.read_html(response.text, index_col=0, header=0, na_values=['---'])

    # Data outside of the HTML tables
    table_headers = _table_headers(response.text)
    market_names = [_market_name(s) for s in table_headers]
    timestamps = [_timestamp(s) for s in table_headers]

    # Modify data frames
    mod_dfs = [_modify_frame(df, ts) for df, ts in zip(dfs, timestamps)]

    return OrderedDict((nm, df) for nm, df in zip(market_names, mod_dfs))
session_test.py (project: pyiem, author: rheineke)
def main():
    date_cols = [iem.ORDER_DATE, iem.EXPIRATION]
    kwargs = dict(index_col=iem.ORDER_DATE, parse_dates=date_cols)
    dfs = pd.read_html(table_text, **kwargs)
    df = dfs[0]

    cxl_o = iem.CANCEL_ORDER
    # Placeholder frame; it must at least contain the cancel-order column
    # for combine_first() below to find it.
    oid_df = pd.DataFrame(columns=[cxl_o])
    df[cxl_o] = df[cxl_o].combine_first(oid_df[cxl_o])
session.py (project: pyiem, author: rheineke)
def _frame(response, **kwargs):
    print(response.text)
    dfs = pd.read_html(response.text, **kwargs)

    # Expect a singleton list
    assert len(dfs) == 1

    return dfs[0]
lambda_function.py (project: open-house-crawler, author: data-skeptic)
def parse_detail_page(b):
    prop = {'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, "size_units": 'I', 'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F', 'features': []}
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    # TODO: use the extended fields, add them to the list of properties
    tables = b.findAll('table', {'class': 'cell'})
    if len(tables) > 0:
        prop['listing_timestamp'] = datetime.datetime.now()
        addr_rows = b.findAll('td', {'class': 'addr'})
        addr = ' '.join(map(lambda x: x.getText(), addr_rows))
        t = tables[0]
        df = pd.read_html(str(t))[0]
        data = dict(zip(df[0], df[1]))
        prop['raw_address'] = addr
        prop['bedrooms'] = int(data['Bedrooms'])
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        if 'Interior Sq Ft' in data:  # dict.has_key() is Python 2-only
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if 'Parking' in data:
            try:
                prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
            except ValueError:
                prop['car_spaces'] = -1
        return [prop]
    else:
        return None

# Takes a string of the raw version of the page and extracts any links we might want to crawl
example.py (project: open-house-crawler, author: data-skeptic)
def parse_detail_page(content):
    prop = {'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, "size_units": 'I', 'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F', 'features': []}
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    # TODO: use the extended fields
    b = soup.BeautifulSoup(content, 'html.parser')
    tables = b.findAll('table', {'class': 'cell'})
    if len(tables) > 0:
        prop['listing_timestamp'] = datetime.datetime.now()
        addr_rows = b.findAll('td', {'class': 'addr'})
        addr = ' '.join(map(lambda x: x.getText(), addr_rows))
        t = tables[0]
        df = pd.read_html(str(t))[0]
        data = dict(zip(df[0], df[1]))
        prop['raw_address'] = addr
        prop['bedrooms'] = int(data['Bedrooms'])
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        if 'Interior Sq Ft' in data:  # dict.has_key() is Python 2-only
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if 'Parking' in data:
            try:
                prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
            except ValueError:
                prop['car_spaces'] = -1
        #for of in other_fields:
        #    if data.has_key(of):
        #        prop['features'].append({of: data[of]})
        return [prop]
    else:
        return None

# Takes a string of the raw version of the page and extracts any links we might want to crawl
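
Both open-house-crawler snippets above rely on the same step: read_html() on a two-column label/value table, then zipping the columns into a dict. A condensed, self-contained sketch of that step (the HTML is made up):

from io import StringIO
import pandas as pd

html = ('<table class="cell">'
        '<tr><td>Bedrooms</td><td>3</td></tr>'
        '<tr><td>Asking Price</td><td>$250,000</td></tr>'
        '</table>')
df = pd.read_html(StringIO(html))[0]
data = dict(zip(df[0], df[1]))  # {'Bedrooms': '3', 'Asking Price': '$250,000'}
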
stock.py (project: ShiPanE-Python-SDK, author: sinall)
def new_stocks():
        url = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
        request = requests.get(url)
        doc = lxml.html.soupparser.fromstring(request.content, features='html.parser')
        table = doc.cssselect('table#NewStockTable')[0]
        table.remove(table.cssselect('thead')[0])
        table_html = lxml.html.etree.tostring(table).decode('utf-8')
        df = pd.read_html(table_html, skiprows=[0, 1])[0]
        df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
        df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
        df['code'] = df['code'].map(lambda x: str(x).zfill(6))
        df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
        return df
reference.py (project: stock, author: datablood)
def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
                     ct.PAGES['163dp'], year, pageNo))
            res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            pages = []
            if pageNo == 0:
                page = html.xpath('//div[@class=\"mod_pages\"]/a')
                if len(page)>1:
                    asr = page[len(page)-2]
                    pages = asr.xpath('text()')
        except Exception as e:
            print(e)
        else:
            if pageNo == 0:
                return df, pages[0] if len(pages)>0 else 0
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
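
Several snippets below repeat one pattern: select table rows with an lxml XPath, re-serialize them, wrap them in a <table> element, and hand the string to read_html(). A condensed sketch of that pattern (the URL and XPath are placeholders):

from io import StringIO

import lxml.etree as etree
import lxml.html
import pandas as pd

def rows_to_frame(url, xpath='//table[@id="dataTable"]/tr'):
    # Keep only the target rows and rebuild a minimal table for read_html()
    html = lxml.html.parse(url)
    rows = html.xpath(xpath)
    sarr = ''.join(etree.tostring(node).decode('utf-8') for node in rows)
    return pd.read_html(StringIO('<table>%s</table>' % sarr))[0]
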
reference.py (project: stock, author: datablood)
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+',nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
            print(e)
reference.py (project: stock, author: datablood)
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
                         ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')  # link text meaning "next page"
            hasNext = True if tag in res else False
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data
trading.py (project: stock, author: datablood)
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows = [0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
billboard.py (project: stock, author: datablood)
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[0],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_GGTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _cap_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
billboard.py (project: stock, author: datablood)
def _broker_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[1],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_YYTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _broker_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
billboard.py (project: stock, author: datablood)
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[2],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2,3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
billboard.py (project: stock, author: datablood)
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[3],
                                               ct.PAGES['fd'], '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
fundamental.py (project: stock, author: datablood)
def _get_profit_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                              ct.PAGES['fd'], year,
                                              quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns=ct.PROFIT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)  # sibling functions in this file report errors the same way

