def fetch_exchange(country_code1='CA-AB', country_code2='CA-BC', session=None):
"""Requests the last known power exchange (in MW) between two countries
Arguments:
country_code (optional) -- used in case a parser is able to fetch multiple countries
session (optional) -- request session passed in order to re-use an existing session
Return:
A dictionary in the form:
{
'sortedCountryCodes': 'DK->NO',
'datetime': '2017-01-01T00:00:00Z',
'netFlow': 0.0,
'source': 'mysource.com'
}
"""
r = session or requests.session()
url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/CSDReportServlet'
response = r.get(url)
df_exchanges = pd.read_html(response.text, match='INTERCHANGE', skiprows=0, index_col=0)
flows = {
'CA-AB->CA-BC': df_exchanges[1][1]['British Columbia'],
'CA-AB->CA-SK': df_exchanges[1][1]['Saskatchewan'],
'CA-AB->US': df_exchanges[1][1]['Montana']
}
sortedCountryCodes = '->'.join(sorted([country_code1, country_code2]))
if sortedCountryCodes not in flows:
raise NotImplementedError('This exchange pair is not implemented')
return {
'datetime': arrow.now(tz=ab_timezone).datetime,
'sortedCountryCodes': sortedCountryCodes,
'netFlow': float(flows[sortedCountryCodes]),
'source': 'ets.aeso.ca'
}
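# A minimal usage sketch for the parser above (an illustration, not part of the
# original module): it assumes requests, pandas and arrow are importable and
# that ab_timezone names Alberta's zone, which the real module defines.
import arrow
import pandas as pd
import requests

ab_timezone = 'Canada/Mountain'  # assumed value; the original module supplies this constant

exchange = fetch_exchange('CA-AB', 'CA-BC')
print(exchange['sortedCountryCodes'], exchange['netFlow'], exchange['source'])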
def fetch_production(country_code='CR', session=None):
    # Do not reuse an existing session, as some response caching takes place
r = requests.session()
url = 'https://appcenter.grupoice.com/CenceWeb/CencePosdespachoNacional.jsf'
response = r.get(url)
df_yesterday = pd.read_html(response.text, skiprows=1, index_col=0, header=0)[0]
soup = BeautifulSoup(response.text, 'html.parser')
yesterday_date = soup.select('#formPosdespacho:pickFechaInputDate')[0]['value']
jsf_view_state = soup.select('#javax.faces.ViewState')[0]['value']
yesterday = arrow.get(yesterday_date, 'DD/MM/YYYY', tzinfo=TIMEZONE)
today = yesterday.shift(days=+1)
    # Replay the JSF form post with today's date to request the current day's
    # dispatch table; the ViewState token ties the post to the session above.
    data = [
        ('formPosdespacho', 'formPosdespacho'),
        ('formPosdespacho:pickFechaInputDate', today.format(DATE_FORMAT)),
        ('formPosdespacho:pickFechaInputCurrentDate', today.format(MONTH_FORMAT)),
        ('formPosdespacho:j_id35.x', ''),
        ('formPosdespacho:j_id35.y', ''),
        ('javax.faces.ViewState', jsf_view_state),
    ]
    response = r.post(url, cookies={}, data=data)
df_today = pd.read_html(response.text, skiprows=1, index_col=0)[0]
    # df_to_data and unknown_plants are helpers defined elsewhere in the
    # original parser module (not shown here).
    ydata = df_to_data(country_code, yesterday, df_yesterday)
    tdata = df_to_data(country_code, today, df_today)
    production = ydata + tdata
    unknown_plants()
    return production
def convert_wiki_to_table(wiki_text, n_table=0):
html_text = pypandoc.convert(wiki_text, 'html', 'mediawiki')
tables = pandas.read_html(html_text)
return tables[n_table]
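# Usage sketch for convert_wiki_to_table (illustrative only): it assumes an
# older pypandoc that still exposes convert(), plus a pandoc binary on PATH.
import pandas
import pypandoc

wiki_table = """
{| class="wikitable"
! a !! b
|-
| 1 || 2
|}
"""
print(convert_wiki_to_table(wiki_table))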
# Fetch_Data_Stock_US_Short.py (project: StockRecommendSystem, author: doncat99)
def getSignleStockShortInfo(stock):
df = pd.DataFrame()
url = "http://shortsqueeze.com/?symbol=" + stock + "&submit=Short+Quote%E2%84%A2"
repeat_times = 3
downloadFailed = True
for _ in range(repeat_times):
try:
response = requests.get(url, timeout=15)
downloadFailed = False
break
except Exception as e:
print ("exception in get stock:" + stock, str(e))
continue
if downloadFailed:
return "", df
try:
tables = pd.read_html(response.text, attrs={'cellpadding': '3', 'width': '100%'})
except Exception as e:
print ("exception in parse stock:" + stock, str(e))
return "", df
for table in tables:
if df.empty:
df = table
else:
df = pd.concat([df, table])
    # reset_index(..., inplace=True) returns None, so keep only the assignment form
    df = df.reset_index(drop=True)
#print(df)
soup = BeautifulSoup(response.text, 'lxml')
dateString = soup.find('span', {"style" : "color:#999999;font-family: verdana, arial, helvetica;font-size:10px"}).get_text()
date = datetime.datetime.strptime(dateString, '%A %B %d, %Y')
return date, df.T
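# Usage sketch (illustrative): getSignleStockShortInfo returns a
# (date, transposed DataFrame) pair on success and ("", empty frame) when the
# download or parse fails, so check the frame first. 'AAPL' is just an example
# ticker; the sketch assumes shortsqueeze.com still serves the same layout and
# that lxml is installed for the BeautifulSoup parser used above.
import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

short_date, short_df = getSignleStockShortInfo('AAPL')
if not short_df.empty:
    print(short_date, short_df.shape)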
def __init__(self, code, year):
self._geo = get_geo(code, year)
self.url = url_resolver(code, year, self._geo['region_code'],
self._geo['department_code'])
tables = pd.read_html(self.url, header=0, encoding='utf8', decimal=',',
thousands=' ')
self._parse(tables)
def read_quote_frames(mkt_conf):
url = _market_quote_url(mkt_conf)
response = requests.get(url=url)
dfs = pd.read_html(response.text, index_col=0, header=0, na_values=['---'])
# Data outside of the HTML tables
table_headers = _table_headers(response.text)
market_names = [_market_name(s) for s in table_headers]
timestamps = [_timestamp(s) for s in table_headers]
# Modify data frames
mod_dfs = [_modify_frame(df, ts) for df, ts in zip(dfs, timestamps)]
return OrderedDict((nm, df) for nm, df in zip(market_names, mod_dfs))
def main():
date_cols = [iem.ORDER_DATE, iem.EXPIRATION]
kwargs = dict(index_col=iem.ORDER_DATE, parse_dates=date_cols)
    # table_text is not defined in this excerpt; in the full module it
    # presumably holds the HTML of the IEM order table.
    dfs = pd.read_html(table_text, **kwargs)
    df = dfs[0]
    oid_df = pd.DataFrame()
    cxl_o = iem.CANCEL_ORDER
    # oid_df is presumably populated with a CANCEL_ORDER column before this
    # point in the full module; as excerpted, the empty frame would not have it.
    df[cxl_o] = df[cxl_o].combine_first(oid_df[cxl_o])
def _frame(response, **kwargs):
print(response.text)
dfs = pd.read_html(response.text, **kwargs)
# Expect a singleton list
assert len(dfs) == 1
return dfs[0]
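# Self-contained sketch of _frame: it only needs an object with a .text
# attribute holding HTML with exactly one table, so a namedtuple stands in for
# a real requests.Response here (the names below are illustrative).
from collections import namedtuple

import pandas as pd

FakeResponse = namedtuple('FakeResponse', ['text'])
single_table = '<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>'
print(_frame(FakeResponse(text=single_table), header=0))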
def parse_detail_page(b):
    prop = {
        'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, 'size_units': 'I',
        'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F',
        'features': [],
    }
other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
# TODO: use the extended fields, add them to the list of properties
tables = b.findAll('table', {'class': 'cell'})
if len(tables) > 0:
prop['listing_timestamp'] = datetime.datetime.now()
addr_rows = b.findAll('td', {'class': 'addr'})
addr = ' '.join(map(lambda x: x.getText(), addr_rows))
t = tables[0]
df = pd.read_html(str(t))[0]
data = dict(zip(df[0], df[1]))
prop['raw_address'] = addr
prop['bedrooms'] = int(data['Bedrooms'])
prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        # dict.has_key() exists only in Python 2; `in` works in both 2 and 3
        if 'Interior Sq Ft' in data:
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if 'Parking' in data:
try:
prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
except ValueError:
prop['car_spaces'] = -1
return [prop]
else:
return None
def parse_detail_page(content):
    prop = {
        'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, 'size_units': 'I',
        'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F',
        'features': [],
    }
other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
# TODO: use the extended fields
b = soup.BeautifulSoup(content)
tables = b.findAll('table', {'class': 'cell'})
if len(tables) > 0:
prop['listing_timestamp'] = datetime.datetime.now()
addr_rows = b.findAll('td', {'class': 'addr'})
addr = ' '.join(map(lambda x: x.getText(), addr_rows))
t = tables[0]
df = pd.read_html(str(t))[0]
data = dict(zip(df[0], df[1]))
prop['raw_address'] = addr
prop['bedrooms'] = int(data['Bedrooms'])
prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        # dict.has_key() exists only in Python 2; `in` works in both 2 and 3
        if 'Interior Sq Ft' in data:
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if 'Parking' in data:
try:
prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
except ValueError:
prop['car_spaces'] = -1
#for of in other_fields:
# if data.has_key(of):
# prop['features'].append({of: data[of]})
return [prop]
else:
return None
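# Self-contained sketch of parse_detail_page on a synthetic listing, assuming
# `soup` is the bs4 module (e.g. `import bs4 as soup`), which matches the
# attribute-style BeautifulSoup call in the function body above.
import datetime

import bs4 as soup
import pandas as pd

sample_page = """
<html><body>
<table><tr><td class="addr">123 Example St</td><td class="addr">Springfield</td></tr></table>
<table class="cell">
  <tr><td>Bedrooms</td><td>3</td></tr>
  <tr><td>Full Baths</td><td>2</td></tr>
  <tr><td>Partial Baths</td><td>1</td></tr>
  <tr><td>Interior Sq Ft</td><td>1500</td></tr>
  <tr><td>Asking Price</td><td>$350,000</td></tr>
  <tr><td>Parking</td><td>2 Cars</td></tr>
</table>
</body></html>
"""
print(parse_detail_page(sample_page))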
def new_stocks():
url = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
request = requests.get(url)
doc = lxml.html.soupparser.fromstring(request.content, features='html.parser')
table = doc.cssselect('table#NewStockTable')[0]
table.remove(table.cssselect('thead')[0])
table_html = lxml.html.etree.tostring(table).decode('utf-8')
df = pd.read_html(table_html, skiprows=[0, 1])[0]
    # DataFrame.select() was removed in pandas 1.0; plain column selection is equivalent
    df = df[[0, 1, 2, 3, 7]]
df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
df['code'] = df['code'].map(lambda x: str(x).zfill(6))
df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
return df
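# Usage sketch (illustrative): new_stocks scrapes a live Sina Finance page, so
# this assumes the page is reachable and that lxml, cssselect, BeautifulSoup,
# requests and pandas are all installed.
import lxml.html.soupparser
import pandas as pd
import requests

ipos = new_stocks()
print(ipos[['code', 'name', 'ipo_date', 'price']].head())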
def _dist_cotent(year, pageNo, retry_count, pause):
for _ in range(retry_count):
time.sleep(pause)
try:
if pageNo > 0:
ct._write_console()
html = lxml.html.parse(rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
ct.PAGES['163dp'], year, pageNo))
res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
df = pd.read_html(sarr, skiprows=[0])[0]
df = df.drop(df.columns[0], axis=1)
df.columns = rv.DP_163_COLS
df['divi'] = df['plan'].map(_fun_divi)
df['shares'] = df['plan'].map(_fun_into)
df = df.drop('plan', axis=1)
df['code'] = df['code'].astype(object)
df['code'] = df['code'].map(lambda x : str(x).zfill(6))
pages = []
if pageNo == 0:
page = html.xpath('//div[@class=\"mod_pages\"]/a')
if len(page)>1:
asr = page[len(page)-2]
pages = asr.xpath('text()')
except Exception as e:
print(e)
else:
if pageNo == 0:
return df, pages[0] if len(pages)>0 else 0
else:
return df
raise IOError(ct.NETWORK_URL_ERROR_MSG)
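# The helper above and the similar ones below all follow the same pattern:
# select the table rows with XPath, serialize the nodes back to markup, wrap
# them in a <table> and let pd.read_html build the DataFrame. Below is a
# standalone sketch of that pattern on a literal document, so it can be run
# without the ct/rv constants or any network access.
from io import StringIO

import lxml.html
import pandas as pd
from lxml import etree

page = ('<html><body><table id="dataTable">'
        '<tr><td>600000</td><td>10.5</td></tr>'
        '<tr><td>600001</td><td>8.2</td></tr>'
        '</table></body></html>')
doc = lxml.html.parse(StringIO(page))
rows = doc.xpath('//table[@id="dataTable"]/tr')
sarr = ''.join(etree.tostring(node).decode('utf-8') for node in rows)
frame = pd.read_html(StringIO('<table>%s</table>' % sarr))[0]
print(frame)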
def _get_forecast_data(year, quarter, pageNo, dataArr):
ct._write_console()
try:
html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
ct.PAGES['fd'], year, quarter, pageNo,
ct.PAGE_NUM[1]))
res = html.xpath("//table[@class=\"list_table\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = sarr.replace('--', '0')
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df = df.drop([4, 5, 8], axis=1)
df.columns = ct.FORECAST_COLS
dataArr = dataArr.append(df, ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+',nextPage[0])[0]
return _get_forecast_data(year, quarter, pageNo, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _newstocks(data, pageNo, retry_count, pause):
for _ in range(retry_count):
time.sleep(pause)
ct._write_console()
try:
html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
ct.PAGES['newstock'], pageNo))
res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = sarr.replace('<font color="red">*</font>', '')
sarr = '<table>%s</table>'%sarr
df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
df.columns = rv.NEW_STOCKS_COLS
df['code'] = df['code'].map(lambda x : str(x).zfill(6))
res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')
            hasNext = True if tag in res else False
data = data.append(df, ignore_index=True)
pageNo += 1
if hasNext:
data = _newstocks(data, pageNo, retry_count, pause)
except Exception as ex:
print(ex)
else:
return data
def _parse_fq_data(url, index, retry_count, pause):
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(url)
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
df = pd.read_html(sarr, skiprows = [0, 1])[0]
if len(df) == 0:
return pd.DataFrame()
if index:
df.columns = ct.HIST_FQ_COLS[0:7]
else:
df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == object:  # the np.object alias was removed in NumPy 1.24
df['date'] = df['date'].astype(np.datetime64)
df = df.drop_duplicates('date')
except Exception as e:
print(e)
else:
return df
raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
ct._write_console()
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[0],
ct.PAGES['fd'], last, pageNo))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@id=\"dataTable\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns = rv.LHB_GGTJ_COLS
dataArr = dataArr.append(df, ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _cap_tops(last, pageNo, retry_count, pause, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _broker_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
ct._write_console()
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[1],
ct.PAGES['fd'], last, pageNo))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@id=\"dataTable\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns = rv.LHB_YYTJ_COLS
dataArr = dataArr.append(df, ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _broker_tops(last, pageNo, retry_count, pause, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
ct._write_console()
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[2],
ct.PAGES['fd'], last, pageNo))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@id=\"dataTable\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df = df.drop([2,3], axis=1)
df.columns = rv.LHB_JGZZ_COLS
dataArr = dataArr.append(df, ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _inst_tops(last, pageNo, retry_count, pause, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
ct._write_console()
for _ in range(retry_count):
time.sleep(pause)
try:
request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[3],
ct.PAGES['fd'], '', pageNo))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@id=\"dataTable\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns = rv.LHB_JGMX_COLS
dataArr = dataArr.append(df, ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _inst_detail(pageNo, retry_count, pause, dataArr)
else:
return dataArr
except Exception as e:
print(e)
def _get_profit_data(year, quarter, pageNo, dataArr):
ct._write_console()
try:
request = Request(ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
ct.PAGES['fd'], year,
quarter, pageNo, ct.PAGE_NUM[1]))
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
text = text.replace('--', '')
html = lxml.html.parse(StringIO(text))
res = html.xpath("//table[@class=\"list_table\"]/tr")
if ct.PY3:
sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)
sarr = '<table>%s</table>'%sarr
df = pd.read_html(sarr)[0]
df.columns=ct.PROFIT_COLS
dataArr = dataArr.append(df, ignore_index=True)
nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
if len(nextPage)>0:
pageNo = re.findall(r'\d+', nextPage[0])[0]
return _get_profit_data(year, quarter, pageNo, dataArr)
else:
return dataArr
    except Exception as e:
        print(e)