Example source code for Python's read_html()

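Before the project snippets, a minimal sketch of the pandas.read_html call pattern they all share; the URL and match string here are illustrative only:

import pandas as pd

# read_html returns a *list* of DataFrames, one per <table> it could parse.
# Keyword arguments that recur throughout the snippets below:
#   match     - regex: keep only tables whose text matches it
#   header    - row number(s) to promote to column labels
#   index_col - column(s) to use as the row index
#   skiprows  - rows to skip at the top of each parsed table
tables = pd.read_html(
    "https://example.com/page-with-tables.html",  # hypothetical URL
    match="Price", header=0, index_col=0, skiprows=[1],
)
df = tables[0]  # first matching table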
arbitrage_tools.py (project: bitrader, author: jr-minnaar)
def get_forex_buy_quote(currency_code: str = 'EUR', source: str = 'FNB', order_type: str = 'buy'):
    """Get latest forex from FNB website

    """
    if source == 'FNB':
        tables = pd.read_html(
            'https://www.fnb.co.za/Controller?nav=rates.forex.list.ForexRatesList',
            index_col=1, header=0, match=currency_code)

        df = tables[0]

        types = {
            'buy': 'Bank Selling Rate',
            'sell': 'Bank Buying Rate',
        }

        exchange_rate = df.loc[currency_code, types[order_type]]

        return Decimal("%.4f" % float(exchange_rate))
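A hypothetical call, assuming the FNB rates page is reachable and lists the requested currency:

rate = get_forex_buy_quote(currency_code='EUR', order_type='buy')
print(rate)  # a Decimal such as Decimal('19.1234'); the value tracks the live page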
reference.py (project: TuShare, author: andyzsf)
def _sz_hz(date='', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            request = Request(rv.MAR_SZ_HZ_URL%(ct.P_TYPE['http'], ct.DOMAINS['szse'],
                                    ct.PAGES['szsefc'], date))
            lines = urlopen(request, timeout = 10).read()
            if len(lines) <= 200:
                return pd.DataFrame()
            df = pd.read_html(lines, skiprows=[0])[0]
            df.columns = rv.MAR_SZ_HZ_COLS
            df['opDate'] = date
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
update_spark_params.py (project: spylon, author: maxpoint)
def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
    doc_urls = [
        "{base_url}/{version}/configuration.html",
        "{base_url}/{version}/sql-programming-guide.html",
        "{base_url}/{version}/monitoring.html",
        "{base_url}/{version}/spark-standalone.html",
        "{base_url}/{version}/running-on-mesos.html",
        "{base_url}/{version}/running-on-yarn.html",
    ]

    for url in doc_urls:
        doc_url = url.format(version=version, base_url=base_url)
        # print(url)
        print("Loading spark properties from %s", doc_url)
        dfs = pd.read_html(doc_url, header=0)
        desired_cols = ["Property Name", "Default", "Meaning"]
        for df in dfs:
            if ("Property Name" in df) and ('Default' in df):
                for pn, default, desc in df[desired_cols].itertuples(index=False):
                    if type(default) == numpy.bool_:
                        default = bool(default)
                    yield pn, default, desc
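_fetch_documentation is a generator, so a caller would drain it; a hedged usage sketch (the version string is illustrative):

for name, default, meaning in _fetch_documentation(version='2.4.0'):
    print(name, '=', default)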
webreader.py (project: PyTrader, author: didw)
def get_financial_statements(code):
    url = "http://companyinfo.stock.naver.com/v1/company/ajax/cF1001.aspx?cmp_cd=%s&fin_typ=0&freq_typ=Y" % (code)
    html = requests.get(url).text

    html = html.replace('<th class="bg r01c02 endLine line-bottom"colspan="8">??</th>', "")
    html = html.replace("<span class='span-sub'>(IFRS??)</span>", "")
    html = html.replace("<span class='span-sub'>(IFRS??)</span>", "")
    html = html.replace('\t', '')
    html = html.replace('\n', '')
    html = html.replace('\r', '')

    html = html.replace('2011/12', '2011')
    html = html.replace('2012/03', '2011')
    html = html.replace('2012/12', '2012')
    html = html.replace('2013/03', '2012')
    html = html.replace('2013/12', '2013')
    html = html.replace('2014/03', '2013')
    html = html.replace('2014/12', '2014')
    html = html.replace('2015/03', '2014')
    html = html.replace('2015/12', '2015')

    df_list = pd.read_html(html, index_col='??????')
    df = df_list[0]
    return df
model.py (project: NBA-prediction, author: christopherjenness)
def get_stats(self, url):
        """
        Extracts statistics from URL

        Args:
            url (str): basketball-reference.com box score

        Returns:
            stats (pd.DataFrame): DataFrame of statistics from game
        """
        response = urllib2.urlopen(url)
        html = response.read()
        stat_html = html.replace('<!--', "")
        stat_html = stat_html.replace('-->', "")
        stats = pd.read_html(stat_html)
        return stats[-5]
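basketball-reference.com hides most box-score tables inside HTML comments, which is why the snippet strips the comment markers first. The same trick in isolation, as a hedged sketch (the URL would be hypothetical; requests swapped in for urllib2):

import pandas as pd
import requests
from io import StringIO

def read_commented_tables(url):
    """Expose tables a site wraps in <!-- ... --> comments, then parse them."""
    html = requests.get(url).text
    # With the comment markers gone, the hidden <table> elements become
    # visible to read_html's parser.
    html = html.replace('<!--', '').replace('-->', '')
    return pd.read_html(StringIO(html))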
pricehistory.py (project: pyiem, author: rheineke)
def price_history_frame(mkt_id, year, month):
    """Returns price history as a DataFrame"""
    url = _build_url('pricehistory/PriceHistory_GetData.cfm')
    data = dict(Market_ID=mkt_id, Month='{:02d}'.format(month), Year=year)
    response = requests.post(url=url, data=data)
    index_cols = [iem.DATE, iem.CONTRACT]
    kwargs = dict(header=0, parse_dates=[iem.DATE], index_col=index_cols)
    try:
        dfs = pd.read_html(response.text, **kwargs)
    except ValueError:
        dfs = [pd.DataFrame()]

    # Expect a singleton list
    assert len(dfs) == 1

    # Remove duplicates, if any
    df = dfs[0]
    if len(df.index.unique()) != len(df.index):
        df = df.groupby(level=df.index.names).first()

    return df
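A hypothetical invocation, assuming the IEM endpoint is live:

df = price_history_frame(mkt_id=362, year=2016, month=11)  # hypothetical market ID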
pricehistory.py (project: pyiem, author: rheineke)
def history_dates(mkt_id):
    url = _build_url('pricehistory/pricehistory_selectcontract.cfm')
    response = requests.get(url=url, params={'Market_ID': mkt_id})
    dfs = pd.read_html(response.text, index_col=0)

    # Expect a singleton list
    assert len(dfs) == 1

    df = dfs[0]

    mon_str = df.loc['Month:'][1]
    months = [dt.datetime.strptime(s[:3], '%b').month for s in mon_str.split()]
    year_str = df.loc['Year'][1]
    years = [int(s) for s in year_str.split()]

    return itertools.product(years, months)
mass-populate.py (project: open-house-crawler, author: data-skeptic)
def run(query):
    r = requests.get('https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population')
    soup = BeautifulSoup(r.content, 'html.parser')
    #
    tbl = soup.find_all('table')[3]
    df = pd.read_html(str(tbl))[0]
    #
    df.columns = df.iloc[0]
    #
    cities = df['City'].tolist()
    #
    for city in cities:
        i = city.find('[')
        if i != -1:
            city = city[0:i]
        city = city + ' ' + query
        print(city)
        populate.query_and_post(city)
        time.sleep(1)
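The BeautifulSoup detour is avoidable here: read_html can fetch the page itself and pick the table by a text match. A hedged alternative, assuming the target table still contains a 'City' column:

import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population'
# match= keeps only tables containing the text 'City'; header=0 promotes the
# first row to column labels, replacing the manual df.columns = df.iloc[0].
tables = pd.read_html(url, match='City', header=0)
cities = tables[0]['City'].tolist()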
client.py (project: ShiPanE-Python-SDK, author: sinall)
def __query_new_stocks(self):
        DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
        html = lxml.html.parse(DATA_URL)
        res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
        if six.PY2:
            sarr = [etree.tostring(node) for node in res]
        else:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('<font color="red">*</font>', '')
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
        df = df[[0, 1, 2, 3, 7]]  # keep the code, xcode, name, ipo_date and price columns
        df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
        df['code'] = df['code'].map(lambda x: str(x).zfill(6))
        df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
        return df
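The snippet above, and most of the TuShare snippets that follow, share one pattern: select a table (or its rows) by XPath, serialize the nodes, re-wrap them in table tags, and hand the fragment to read_html. A condensed, hedged sketch of that pattern (the example URL and XPath are assumptions):

import lxml.html
import pandas as pd
from io import StringIO
from lxml import etree

def table_by_xpath(url, xpath):
    """Serialize the nodes matched by `xpath` and parse them with read_html."""
    doc = lxml.html.parse(url)
    rows = doc.xpath(xpath)
    fragment = ''.join(etree.tostring(node, encoding='unicode') for node in rows)
    # Re-wrap the rows so the fragment is a well-formed table again.
    return pd.read_html(StringIO('<table>%s</table>' % fragment))[0]

# df = table_by_xpath('http://example.com/stocks.html', '//table[@id="NewStockTable"]/tr')  # hypothetical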
reference.py (project: stock, author: datablood)
def _sz_hz(date='', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            request = Request(rv.MAR_SZ_HZ_URL%(ct.P_TYPE['http'], ct.DOMAINS['szse'],
                                    ct.PAGES['szsefc'], date))
            lines = urlopen(request, timeout = 10).read()
            if len(lines) <= 200:
                return pd.DataFrame()
            df = pd.read_html(lines, skiprows=[0])[0]
            df.columns = rv.MAR_SZ_HZ_COLS
            df['opDate'] = date
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
app.py (project: cfb-comparison, author: pythonforsports)
def generate_sched_table(team, year, max_rows=20):
    df = pd.DataFrame(teams)
    filter_team = df.loc[df["TeamAlt"] == team]
    filter_team['ESPNID'] = "http://www.espn.com/college-football/team/fpi/_/id/" \
                            + filter_team.ESPNID.map(str) + "/year/" + str(year)
    link = filter_team.tail(1)['ESPNID'].values[0]
    sched_dataframe = pd.read_html(link, header=1)[4]
    sched_dataframe.columns = ['Date', 'Opponent', 'Result/Proj', 'Opp FPI', 'Game Rating']

    return html.Table(
        # Header1
        [html.Tr([
            html.Th(html.H6([team + ' ' + str(year) + ' ' + 'Schedule']), colSpan=5, style=dict(textAlign="center")),
        ])] +

        # Header2
        [html.Tr([html.Td(col) for col in sched_dataframe.columns], style=dict(fontWeight="bold"))] +

        # Body
        [html.Tr([
            html.Td(sched_dataframe.iloc[i][col]) for col in sched_dataframe.columns
        ]) for i in range(min(len(sched_dataframe), max_rows))]
    )
reference.py (project: TuShare, author: andyzsf)
def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
                     ct.PAGES['163dp'], year, pageNo))  
            res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            pages = []
            if pageNo == 0:
                page = html.xpath('//div[@class=\"mod_pages\"]/a')
                if len(page)>1:
                    asr = page[len(page)-2]
                    pages = asr.xpath('text()')
        except Exception as e:
            print(e)
        else:
            if pageNo == 0:
                return df, pages[0] if len(pages)>0 else 0
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
reference.py (project: TuShare, author: andyzsf)
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], 
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+',nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
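Several TuShare fetchers above and below use the same recursive pagination idiom: parse one page, look for an onclick handler on the pager's last link, and recurse with the extracted page number. A hedged standalone sketch (fetch_page is an assumed callable, not part of the original code):

import re
import pandas as pd

def fetch_all_pages(fetch_page, page_no=1, frames=None):
    """Accumulate per-page DataFrames until no next-page link is advertised.

    fetch_page(page_no) is assumed to return (df, onclick_text_or_None).
    """
    frames = [] if frames is None else frames
    df, next_onclick = fetch_page(page_no)
    frames.append(df)
    if next_onclick:  # e.g. "pagination('3')" -> recurse onto page 3
        next_no = int(re.findall(r'\d+', next_onclick)[0])
        return fetch_all_pages(fetch_page, next_no, frames)
    return pd.concat(frames, ignore_index=True)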
reference.py (project: TuShare, author: andyzsf)
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
                         ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')  # '下一页' is the "next page" pager link text
            hasNext = True if tag in res else False
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data
trading.py (project: TuShare, author: andyzsf)
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows = [0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
billboard.py (project: TuShare, author: andyzsf)
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[0],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_GGTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _cap_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
billboard.py (project: TuShare, author: andyzsf)
def _broker_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[1],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_YYTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _broker_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
billboard.py (project: TuShare, author: andyzsf)
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[2],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2,3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
billboard.py (project: TuShare, author: andyzsf)
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[3],
                                               ct.PAGES['fd'], '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
fundamental.py (project: TuShare, author: andyzsf)
def _get_profit_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                              ct.PAGES['fd'], year,
                                              quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns=ct.PROFIT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
fundamental.py (project: TuShare, author: andyzsf)
def _get_operation_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.OPERATION_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 ct.PAGES['fd'], year,
                                                 quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns=ct.OPERATION_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_operation_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
fundamental.py (project: TuShare, author: andyzsf)
def _get_growth_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.GROWTH_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                              ct.PAGES['fd'], year,
                                              quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns=ct.GROWTH_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_growth_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
fundamental.py (project: TuShare, author: andyzsf)
def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.DEBTPAYING_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                  ct.PAGES['fd'], year,
                                                  quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.DEBTPAYING_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
fundamental.py (project: TuShare, author: andyzsf)
def _get_cashflow_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                ct.PAGES['fd'], year,
                                                quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.CASHFLOW_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_cashflow_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
func.py (project: InplusTrader_Linux, author: zhengwsh)
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')

            sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            if sarr == '':
                return None
            df = pd.read_html(sarr, skiprows = [0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except ValueError as e:
            # read_html raised ValueError: no table in the page, so no data for this period
            return None
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
daily.py (project: SSieve, author: davidimprovz)
def checkStockNameChanges():
        """
        Handles the renaming of stocks in the DB that have been renamed. It will switch 
        both the name and the symbol for every table in the database. 

        Uses PANDAS to capture html tables from the NASDAQ listings containing symbol changes. 

        Returns the changes in a PANDAS frame, if any. If problem scraping NASDAQ, returns error msg.
        If code error, returns a tuple of False, error.
        """
        try:

            path = 'http://www.nasdaq.com/markets/stocks/symbol-change-history.aspx?page='
            ticker_changes = pd.DataFrame()
            for i in np.arange(100): # set the number high enough to catch all pages
                page = str(i+1)
                full_path = ''.join([path, page])
                symbol_changes = pd.read_html(full_path, header=0)[3] # note: index could change in future if html is restructured
                # concat all of the changes together
                if 'No records found.' not in symbol_changes.iloc[0][0]:
                    ticker_changes = pd.concat([ticker_changes, symbol_changes], ignore_index=True)
                else: break # drop out of loop if there's nothing left to capture

            ticker_changes.rename(columns={'Old Symbol': 'Old', 'New Symbol': 'New', 'Effective Date': 'Date'}, inplace=True)

            # check returned value
            assert isinstance(ticker_changes, pd.DataFrame), "Expected stock name changes to return pandas DataFrame. Got %r instead" % type(ticker_changes)

            return ticker_changes

        except Exception as e:
            return False, e

    # renameStocks()
    # ************** #
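A hedged usage sketch for the scraper above, assuming the NASDAQ symbol-change pages keep their current table layout:

result = checkStockNameChanges()
if isinstance(result, tuple):       # (False, error) is returned on failure
    print('scrape failed:', result[1])
else:
    print(result.head())            # Old / New / Date columns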
scraper.py (project: coinbin.org, author: kennethreitz)
def get_coins():
    coins_db = OrderedDict()

    print(crayons.yellow('Scraping CoinMaketCap...'))

    r = session.get(url)
    html = pq(pq(r.content)('table')[0]).html()
    df = pandas.read_html("<table>{}</table>".format(html))
    df = pandas.concat(df)

    btc_value = float(df.to_dict()['Price'][0][1:].replace(',', ''))

    for row in df.itertuples():

        rank = int(row[1])
        name = ' '.join(row[2].split()[1:])
        ticker = row[3].lower()
        try:
            usd = float(row[5][1:].replace(',', ''))
        except ValueError:
            usd = 0
        finally:
            pass

        btc = convert_to_decimal(usd / btc_value)

        coins_db.update({ticker: {'rank': rank, 'name': name, 'ticker': ticker, 'usd': usd, 'btc': btc}})

    return coins_db
session.py (project: esdata, author: gjacopo)
def read_html_table(self, url, **kwargs): 
        try:
            self.get_status(url)
        except:
            return None
        # set some default values (some are already default values for read_table)
        kwargs.update({'encoding': kwargs.get('encoding') or None})
        # run pandas...
        df = pd.read_html(url, **kwargs)
        return df

    #/************************************************************************/
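Hypothetical usage of the wrapper, given some instance `s` of the session class; any extra read_html keyword passes straight through:

dfs = s.read_html_table('https://example.com/stats.html', header=0)  # hypothetical URL
if dfs is not None:
    print(dfs[0].head())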
vhr_spider.py (project: Verum, author: Data4Democracy)
def parse_rep_vote_history(self, response):
        # Some reps did not vote during a session; test for the "Vote data is
        # unavailable" message. We capture the rep's base info for later matching.
        if "Vote data is unavailable" in response.css("#mainBody::text").extract()[3]:
            cur_url = response.url
            session_id, chamber, rep_id = self.get_session_chamber_rep_id(cur_url)
            url = cur_url
            self.rep_info.append([rep_id, session_id, chamber])

        #Otherwise, we process the body of text.
        else:
            title = response.xpath("""//*[@id="title"]/text()""").extract_first()

            rep_title, rep_short_name, rep_district = self.get_name_district(title)
            #Fetch the main table - they use nested tables, so have to use a direct reference.
            table_rows = response.css('#mainBody > table').extract()[0]

            #Parse the html table and select relevant info for the vote.
            pd_table = pd.read_html(table_rows, header=0, match="Doc.", attrs={'cellspacing':0})[0][['RCS\xa0#', 'Doc.','Vote','Result']]

            #Get session and chamber id from URL and assign to each row
            cur_url = response.url
            session_id, chamber, rep_id = self.get_session_chamber_rep_id(cur_url)
            pd_table['session_id'] = session_id
            pd_table['chamber'] = chamber
            pd_table['rep_id'] = rep_id
            pd_table['rep_title'] = rep_title
            pd_table['rep_short_name'] = rep_short_name
            pd_table['district'] = rep_district

            #Reorder columns
            pd_table = pd_table.reindex(columns=['session_id', 'chamber', 'rep_id', 'rep_short_name', 'rep_title', 'district', 'RCS\xa0#', 'Doc.', 'Vote', 'Result'])

            return pd_table.to_dict(orient='records')
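read_html can also filter candidate tables by their HTML attributes, which is what attrs= does above. A self-contained hedged sketch with inline sample markup:

import pandas as pd
from io import StringIO

html = """
<table cellspacing="0">
  <tr><th>Doc.</th><th>Vote</th></tr>
  <tr><td>HB1</td><td>Y</td></tr>
</table>
"""
# attrs= restricts parsing to tables carrying these attributes; match=
# additionally requires the text 'Doc.' to appear somewhere in the table.
df = pd.read_html(StringIO(html), header=0, match='Doc.', attrs={'cellspacing': '0'})[0]
print(df)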
CA_AB.py (project: electricitymap, author: tmrowco)
def fetch_price(country_code='CA-AB', session=None):
    """Requests the last known power price of a given country

    Arguments:
    country_code (optional) -- used in case a parser is able to fetch multiple countries
    session (optional)      -- request session passed in order to re-use an existing session

    Return:
    A dictionary in the form:
    {
      'countryCode': 'FR',
      'currency': 'EUR',
      'datetime': '2017-01-01T00:00:00Z',
      'price': 0.0,
      'source': 'mysource.com'
    }
    """

    r = session or requests.session()
    url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/SMPriceReportServlet?contentType=html/'
    response = r.get(url)
    df_prices = pd.read_html(response.text, match='Price', index_col=0, header=0)
    prices = df_prices[1]

    data = {}

    for rowIndex, row in prices.iterrows():
        price = row['Price ($)']
        if (isfloat(price)):
            hours = int(rowIndex.split(' ')[1]) - 1
            data[rowIndex] = {
                'datetime': arrow.get(rowIndex, 'MM/DD/YYYY').replace(tzinfo=ab_timezone).shift(hours=hours).datetime,
                'countryCode': country_code,
                'currency': 'CAD',
                'source': 'ets.aeso.ca',
                'price': float(price),
            }

    return [data[k] for k in sorted(data.keys())]
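A hypothetical call (live AESO data; the returned list follows the docstring's dictionary shape):

prices = fetch_price('CA-AB')
print(prices[0]['datetime'], prices[0]['price'])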

