def get_forex_buy_quote(currency_code: str = 'EUR', source: str = 'FNB', order_type: str = 'buy'):
"""Get latest forex from FNB website
"""
if source == 'FNB':
tables = pd.read_html(
'https://www.fnb.co.za/Controller?nav=rates.forex.list.ForexRatesList',
index_col=1, header=0, match=currency_code)
df = tables[0]
types = {
'buy': 'Bank Selling Rate',
'sell': 'Bank Buying Rate',
}
exchange_rate = df.loc[currency_code, types[order_type]]
return Decimal("%.4f" % float(exchange_rate))
Python pd.read_html() example source code
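All of the snippets below share one pattern: pd.read_html() parses every <table> in a page (or HTML string) and returns a list of DataFrames, so callers index into that list and then clean up columns and dtypes. A minimal sketch of that pattern, on made-up inline HTML (the table content and column names are illustrative only):

from io import StringIO

import pandas as pd

html = """
<table>
  <tr><th>code</th><th>price</th></tr>
  <tr><td>1</td><td>10.5</td></tr>
  <tr><td>600000</td><td>8.2</td></tr>
</table>
"""

# read_html returns a *list* of DataFrames, one per <table> found;
# wrapping the markup in StringIO also satisfies newer pandas versions
tables = pd.read_html(StringIO(html), header=0)
df = tables[0]  # pick the table you want

# typical cleanup seen throughout the examples below: re-pad a numeric
# code column back to zero-filled strings
df['code'] = df['code'].map(lambda x: str(x).zfill(6))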
def _sz_hz(date='', retry_count=3, pause=0.001):
for _ in range(retry_count):
time.sleep(pause)
ct._write_console()
try:
request = Request(rv.MAR_SZ_HZ_URL%(ct.P_TYPE['http'], ct.DOMAINS['szse'],
ct.PAGES['szsefc'], date))
lines = urlopen(request, timeout = 10).read()
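# a response under ~200 bytes carries no table data for this date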
if len(lines) <= 200:
return pd.DataFrame()
df = pd.read_html(lines, skiprows=[0])[0]
df.columns = rv.MAR_SZ_HZ_COLS
df['opDate'] = date
except Exception as e:
print(e)
else:
return df
raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
doc_urls = [
"{base_url}/{version}/configuration.html",
"{base_url}/{version}/sql-programming-guide.html",
"{base_url}/{version}/monitoring.html",
"{base_url}/{version}/spark-standalone.html",
"{base_url}/{version}/running-on-mesos.html",
"{base_url}/{version}/running-on-yarn.html",
]
for url in doc_urls:
doc_url = url.format(version=version, base_url=base_url)
# print(url)
print("Loading spark properties from %s", doc_url)
dfs = pd.read_html(doc_url, header=0)
desired_cols = ["Property Name", "Default", "Meaning"]
for df in dfs:
if ("Property Name" in df) and ('Default' in df):
for pn, default, desc in df[desired_cols].itertuples(index=False):
if isinstance(default, numpy.bool_):
default = bool(default)
yield pn, default, desc
def get_financial_statements(code):
url = "http://companyinfo.stock.naver.com/v1/company/ajax/cF1001.aspx?cmp_cd=%s&fin_typ=0&freq_typ=Y" % (code)
html = requests.get(url).text
html = html.replace('<th class="bg r01c02 endLine line-bottom"colspan="8">??</th>', "")
html = html.replace("<span class='span-sub'>(IFRS??)</span>", "")
html = html.replace("<span class='span-sub'>(IFRS??)</span>", "")
html = html.replace('\t', '')
html = html.replace('\n', '')
html = html.replace('\r', '')
html = html.replace('2011/12', '2011')
html = html.replace('2012/03', '2011')
html = html.replace('2012/12', '2012')
html = html.replace('2013/03', '2012')
html = html.replace('2013/12', '2013')
html = html.replace('2014/03', '2013')
html = html.replace('2014/12', '2014')
html = html.replace('2015/03', '2014')
html = html.replace('2015/12', '2015')
df_list = pd.read_html(html, index_col='주요재무정보')
df = df_list[0]
return df
def get_stats(self, url):
"""
Extracts statistics from URL
Args:
url (str): basketball-reference.com box score
Returns:
stats (pd.DataFrame): DataFrame of statistics from game
"""
response = urllib2.urlopen(url)
html = response.read()
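# basketball-reference ships many stat tables inside HTML comments;
# stripping the comment markers exposes them to read_html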
stat_html = html.replace('<!--', "")
stat_html = stat_html.replace('-->', "")
stats = pd.read_html(stat_html)
return stats[-5]
def price_history_frame(mkt_id, year, month):
"""Returns price history as a DataFrame"""
url = _build_url('pricehistory/PriceHistory_GetData.cfm')
data = dict(Market_ID=mkt_id, Month='{:02d}'.format(month), Year=year)
response = requests.post(url=url, data=data)
index_cols = [iem.DATE, iem.CONTRACT]
kwargs = dict(header=0, parse_dates=[iem.DATE], index_col=index_cols)
try:
dfs = pd.read_html(response.text, **kwargs)
except ValueError:
dfs = [pd.DataFrame()]
# Expect a singleton list
assert len(dfs) == 1
# Remove duplicates, if any
df = dfs[0]
if len(df.index.unique()) != len(df.index):
df = df.groupby(level=df.index.names).first()
return df
def history_dates(mkt_id):
url = _build_url('pricehistory/pricehistory_selectcontract.cfm')
response = requests.get(url=url, params={'Market_ID': mkt_id})
dfs = pd.read_html(response.text, index_col=0)
# Expect a singleton list
assert len(dfs) == 1
df = dfs[0]
mon_str = df.loc['Month:'][1]
months = [dt.datetime.strptime(s[:3], '%b').month for s in mon_str.split()]
year_str = df.loc['Year'][1]
years = [int(s) for s in year_str.split()]
return itertools.product(years, months)
def run(query):
r = requests.get('https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population')
soup = BeautifulSoup(r.content, 'html.parser')
# the population table is the fourth <table> on the page
tbl = soup.find_all('table')[3]
df = pd.read_html(str(tbl))[0]
# promote the first row to column headers
df.columns = df.iloc[0]
# collect the city names
cities = df['City'].tolist()
# strip Wikipedia footnote markers such as "[11]", then query each city
for city in cities:
i = city.find('[')
if i != -1:
city = city[0:i]
city = city + ' ' + query
print(city)
populate.query_and_post(city)
time.sleep(1)
def __query_new_stocks(self):
DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
html = lxml.html.parse(DATA_URL)
res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
if six.PY2:
sarr = [etree.tostring(node) for node in res]
else:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
sarr = ''.join(sarr)
sarr = sarr.replace('<font color="red">*</font>', '')
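# re-wrap the extracted <tr> rows in a <table> element so read_html can parse them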
sarr = '<table>%s</table>' % sarr
df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
df = df[[0, 1, 2, 3, 7]]
df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
df['code'] = df['code'].map(lambda x: str(x).zfill(6))
df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
return df
def generate_sched_table(team, year, max_rows=20):
df = pd.DataFrame(teams)
filter_team = df.loc[df["TeamAlt"] == team].copy()
filter_team['ESPNID'] = "http://www.espn.com/college-football/team/fpi/_/id/" \
+ filter_team.ESPNID.map(str) + "/year/" + str(year)
link = filter_team.tail(1)['ESPNID'].values[0]
sched_dataframe = pd.read_html(link, header=1)[4]
sched_dataframe.columns = ['Date', 'Opponent', 'Result/Proj', 'Opp FPI', 'Game Rating']
return html.Table(
# Header1
[html.Tr([
html.Th(html.H6([team + ' ' + str(year) + ' ' + 'Schedule']), colSpan=5, style=dict(textAlign="center")),
])] +
# Header2
[html.Tr([html.Td(col) for col in sched_dataframe.columns], style=dict(fontWeight="bold"))] +
# Body
[html.Tr([
html.Td(sched_dataframe.iloc[i][col]) for col in sched_dataframe.columns
]) for i in range(min(len(sched_dataframe), max_rows))]
)
def _dist_cotent(year, pageNo, retry_count, pause):
for _ in range(retry_count):
time.sleep(pause)
try:
if pageNo > 0:
ct._write_console()
html = lxml.html.parse(rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
ct.PAGES['163dp'], year, pageNo))
res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
df = pd.read_html(sarr, skiprows=[0])[0]
df = df.drop(df.columns[0], axis=1)
df.columns = rv.DP_163_COLS
df['divi'] = df['plan'].map(_fun_divi)
df['shares'] = df['plan'].map(_fun_into)
df = df.drop('plan', axis=1)
df['code'] = df['code'].astype(object)
df['code'] = df['code'].map(lambda x : str(x).zfill(6))
pages = []
if pageNo == 0:
page = html.xpath('//div[@class=\"mod_pages\"]/a')
if len(page)>1:
asr = page[len(page)-2]
pages = asr.xpath('text()')
except Exception as e:
print(e)
else:
if pageNo == 0:
return df, pages[0] if len(pages)>0 else 0
else:
return df
raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _get_forecast_data(year, quarter, pageNo, dataArr):
ct._write_console()
try:
html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
ct.PAGES['fd'], year, quarter, pageNo,
ct.PAGE_NUM[1]))
res = html.xpath("//table[@class=\"list_table\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
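# the page uses '--' for missing values; substitute 0 so numeric columns parse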
sarr = sarr.replace('--', '0')
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df = df.drop([4, 5, 8], axis=1)
df.columns = ct.FORECAST_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
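# if the pager's last link carries another page number in its onclick handler, recurse into the next page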
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+',nextPage[0])[0]
return _get_forecast_data(year, quarter, pageNo, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _newstocks(data, pageNo, retry_count, pause):
for _ in range(retry_count):
time.sleep(pause)
ct._write_console()
try:
html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
ct.PAGES['newstock'], pageNo))
res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = sarr.replace('<font color="red">*</font>', '')
sarr = '<table>%s</table>'%sarr
df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
df.columns = rv.NEW_STOCKS_COLS
df['code'] = df['code'].map(lambda x : str(x).zfill(6))
res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')  # '下一页' = "next page"
hasNext = True if tag in res else False
data = pd.concat([data, df], ignore_index=True)
pageNo += 1
if hasNext:
data = _newstocks(data, pageNo, retry_count, pause)
except Exception as ex:
print(ex)
else:
return data
def _parse_fq_data(url, index, retry_count, pause):
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(url)
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
df = pd.read_html(sarr, skiprows = [0, 1])[0]
if len(df) == 0:
return pd.DataFrame()
if index:
df.columns = ct.HIST_FQ_COLS[0:7]
else:
df.columns = ct.HIST_FQ_COLS
if df['date'].dtypes == object:
df['date'] = pd.to_datetime(df['date'])
df = df.drop_duplicates('date')
except Exception as e:
print(e)
else:
return df
raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
ct._write_console()
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[0],
ct.PAGES['fd'], last, pageNo))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@id=\"dataTable\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns = rv.LHB_GGTJ_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _cap_tops(last, pageNo, retry_count, pause, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _broker_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
ct._write_console()
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[1],
ct.PAGES['fd'], last, pageNo))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@id=\"dataTable\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns = rv.LHB_YYTJ_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _broker_tops(last, pageNo, retry_count, pause, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
ct._write_console()
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[2],
ct.PAGES['fd'], last, pageNo))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@id=\"dataTable\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df = df.drop([2,3], axis=1)
df.columns = rv.LHB_JGZZ_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _inst_tops(last, pageNo, retry_count, pause, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
ct._write_console()
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[3],
ct.PAGES['fd'], '', pageNo))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@id=\"dataTable\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns = rv.LHB_JGMX_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _inst_detail(pageNo, retry_count, pause, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _get_profit_data(year, quarter, pageNo, dataArr):
ct._write_console()
try:
request = Request(ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
ct.PAGES['fd'], year,
quarter, pageNo, ct.PAGE_NUM[1]))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
text = text.replace('--', '')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@class=\"list_table\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns=ct.PROFIT_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _get_profit_data(year, quarter, pageNo, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _get_operation_data(year, quarter, pageNo, dataArr):
ct._write_console()
try:
request = Request(ct.OPERATION_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
ct.PAGES['fd'], year,
quarter, pageNo, ct.PAGE_NUM[1]))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
text = text.replace('--', '')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@class=\"list_table\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns=ct.OPERATION_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _get_operation_data(year, quarter, pageNo, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _get_growth_data(year, quarter, pageNo, dataArr):
ct._write_console()
try:
request = Request(ct.GROWTH_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
ct.PAGES['fd'], year,
quarter, pageNo, ct.PAGE_NUM[1]))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
text = text.replace('--', '')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@class=\"list_table\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns=ct.GROWTH_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _get_growth_data(year, quarter, pageNo, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _get_debtpaying_data(year, quarter, pageNo, dataArr):
ct._write_console()
try:
request = Request(ct.DEBTPAYING_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
ct.PAGES['fd'], year,
quarter, pageNo, ct.PAGE_NUM[1]))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@class=\"list_table\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns = ct.DEBTPAYING_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _get_debtpaying_data(year, quarter, pageNo, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _get_cashflow_data(year, quarter, pageNo, dataArr):
ct._write_console()
try:
request = Request(ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
ct.PAGES['fd'], year,
quarter, pageNo, ct.PAGE_NUM[1]))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
text = text.replace('--', '')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@class=\"list_table\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns = ct.CASHFLOW_COLS
dataArr = pd.concat([dataArr, df], ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _get_cashflow_data(year, quarter, pageNo, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _parse_fq_data(url, index, retry_count, pause):
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(url)
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
if sarr == '':
return None
df = pd.read_html(sarr, skiprows = [0, 1])[0]
if len(df) == 0:
return pd.DataFrame()
if index:
df.columns = ct.HIST_FQ_COLS[0:7]
else:
df.columns = ct.HIST_FQ_COLS
if df['date'].dtypes == object:
df['date'] = pd.to_datetime(df['date'])
df = df.drop_duplicates('date')
except ValueError as e:
# the date is too early; no data is available any more
return None
except Exception as e:
print(e)
else:
return df
raise IOError(ct.NETWORK_URL_ERROR_MSG)
def checkStockNameChanges():
"""
Handles the renaming of stocks in the DB that have been renamed. It will switch
both the name and the symbol for every table in the database.
Uses pandas to capture HTML tables from the NASDAQ listings of symbol changes.
Returns the changes in a pandas DataFrame, if any. On any scraping or code error,
returns a tuple of (False, error).
"""
try:
path = 'http://www.nasdaq.com/markets/stocks/symbol-change-history.aspx?page='
ticker_changes = pd.DataFrame()
for i in np.arange(100): # set the number high enough to catch all pages
page = str(i+1)
full_path = ''.join([path, page])
symbol_changes = pd.read_html(full_path, header=0)[3] # note: index could change in future if html is restructured
# concat all of the changes together
if 'No records found.' not in symbol_changes.iloc[0][0]:
ticker_changes = pd.concat([ticker_changes, symbol_changes], ignore_index=True)
else: break # drop out of loop if there's nothing left to capture
ticker_changes.rename(columns={'Old Symbol': 'Old', 'New Symbol': 'New', 'Effective Date': 'Date'}, inplace=True)
# check returned value
assert isinstance(ticker_changes, pd.DataFrame), "Expected stock name changes to return pandas DataFrame. Got %r instead" % type(ticker_changes)
return ticker_changes
except Exception as e:
return False, e
def get_coins():
coins_db = OrderedDict()
print(crayons.yellow('Scraping CoinMarketCap...'))
r = session.get(url)
html = pq(pq(r.content)('table')[0]).html()
df = pandas.read_html("<table>{}</table>".format(html))
df = pandas.concat(df)
btc_value = float(df.to_dict()['Price'][0][1:].replace(',', ''))
for row in df.itertuples():
rank = int(row[1])
name = ' '.join(row[2].split()[1:])
ticker = row[3].lower()
try:
usd = float(row[5][1:].replace(',', ''))
except ValueError:
usd = 0
btc = convert_to_decimal(usd / btc_value)
coins_db.update({ticker: {'rank': rank, 'name': name, 'ticker': ticker, 'usd': usd, 'btc': btc}})
return coins_db
def read_html_table(self, url, **kwargs):
try:
self.get_status(url)
except Exception:
return None
# set some default values (some are already default values for read_table)
kwargs.update({'encoding': kwargs.get('encoding') or None})
# run pandas...
df = pd.read_html(url, **kwargs)
return df
#/************************************************************************/
def parse_rep_vote_history(self, response):
# Some reps did not vote during a session. Test for the "Vote data is unavailable.
# We capture the base information about the rep for later matching
if "Vote data is unavailable" in response.css("#mainBody::text").extract()[3]:
cur_url = response.url
session_id, chamber, rep_id = self.get_session_chamber_rep_id(cur_url)
url = cur_url
self.rep_info.append([rep_id, session_id, chamber])
#Otherwise, we process the body of text.
else:
title = response.xpath("""//*[@id="title"]/text()""").extract_first()
rep_title, rep_short_name, rep_district = self.get_name_district(title)
#Fetch the main table - they use nested tables, so have to use a direct reference.
table_rows = response.css('#mainBody > table').extract()[0]
#Parse the html table and select relevant info for the vote.
pd_table = pd.read_html(table_rows, header=0, match="Doc.", attrs={'cellspacing':0})[0][['RCS\xa0#', 'Doc.','Vote','Result']]
#Get session and chamber id from URL and assign to each row
cur_url = response.url
session_id, chamber, rep_id = self.get_session_chamber_rep_id(cur_url)
pd_table['session_id'] = session_id
pd_table['chamber'] = chamber
pd_table['rep_id'] = rep_id
pd_table['rep_title'] = rep_title
pd_table['rep_short_name'] = rep_short_name
pd_table['district'] = rep_district
#Reorder columns
pd_table = pd_table.reindex(columns=['session_id', 'chamber', 'rep_id', 'rep_short_name', 'rep_title', 'district', 'RCS\xa0#', 'Doc.', 'Vote', 'Result'])
return pd_table.to_dict(orient='records')
def fetch_price(country_code='CA-AB', session=None):
"""Requests the last known power price of a given country
Arguments:
country_code (optional) -- used in case a parser is able to fetch multiple countries
session (optional) -- request session passed in order to re-use an existing session
Return:
A dictionary in the form:
{
'countryCode': 'FR',
'currency': 'EUR',
'datetime': '2017-01-01T00:00:00Z',
'price': 0.0,
'source': 'mysource.com'
}
"""
r = session or requests.session()
url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/SMPriceReportServlet?contentType=html/'
response = r.get(url)
df_prices = pd.read_html(response.text, match='Price', index_col=0, header=0)
prices = df_prices[1]
data = {}
for rowIndex, row in prices.iterrows():
price = row['Price ($)']
if (isfloat(price)):
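# row labels appear to be 'MM/DD/YYYY HH' with a 1-based hour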
hours = int(rowIndex.split(' ')[1]) - 1
data[rowIndex] = {
'datetime': arrow.get(rowIndex, 'MM/DD/YYYY').replace(tzinfo=ab_timezone).shift(hours=hours).datetime,
'countryCode': country_code,
'currency': 'CAD',
'source': 'ets.aeso.ca',
'price': float(price),
}
return [data[k] for k in sorted(data.keys())]