def __init__(self, code, sql=False):
    # Shenzhen codes start with 0 or 3, Shanghai codes start with 6
    if int(code[0]) in (0, 3):
        name = "sz" + code
    elif int(code[0]) == 6:
        name = "sh" + code
    else:
        raise ValueError("unrecognised stock code: " + code)
    if sql:
        self.sharedf = ShareClass().GetDayData(code)
    else:
        conn = create_engine(
            'mysql://' + config.user + ':' + config.password + '@' + config.ip + '/daydata?charset=utf8')
        query = 'select * from ' + name + '_tencent;'  # one table per stock
        self.sharedf = pandas.read_sql(query, con=conn)
    self.name = name
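This snippet, like several below, splices values into SQL by string concatenation. A minimal sketch of the parameterized alternative, assuming a SQLAlchemy engine (connection details and stock code are made up; table names cannot be bound as parameters, so the sz/sh prefix check above still matters):

import pandas
from sqlalchemy import create_engine, text

engine = create_engine('mysql://user:password@localhost/daydata?charset=utf8')
name = 'sz000001'  # hypothetical, pre-validated table prefix
df = pandas.read_sql(
    text('select * from ' + name + '_tencent where datetime >= :start'),
    con=engine, params={'start': '2016-01-01'})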
def loadData(self):
# Load the toilet collection data to pandas
collects = pd.read_sql('SELECT * FROM premodeling.toiletcollection', self.conn, coerce_float=True, params=None)
pprint.pprint(collects.keys())
collects = collects[['ToiletID','ToiletExID','Collection_Date','Area','Feces_kg_day','year','month']]
pprint.pprint(collects.keys())
# Load the density data to pandas
density = pd.read_sql('SELECT * FROM premodeling.toiletdensity', self.conn, coerce_float=True, params=None)
pprint.pprint(density.keys())
# Return the data
self.collects = collects
self.density = density
return(collects, density)
def as_dataframe(self, model_group_ids):
"""Return model-group-id subset of table as dataframe
Args:
model_group_ids (list) the desired model group ids
Returns: (pandas.DataFrame) The data from the table corresponding
to those model group ids
"""
return pd.read_sql(
'select * from {} where model_group_id in ({})'.format(
self.distance_table,
str_in_sql(model_group_ids)
),
self.db_engine
)
def simple_moving_average(self, period=50, column='adj_close'):
    # TODO: read a cached SMA from the 'sma_test' table instead of recomputing
    # (see the parameterized sketch below)
    sma_ts = pd.Series(
        self.ohlcv[column].rolling(center=False, window=period,
                                   min_periods=period - 1).mean()).dropna()
    return sma_ts
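A sketch of what the cached lookup in the TODO could look like, assuming a SQLAlchemy connection and an sma_test table with asset_id and date columns (all names hypothetical):

import pandas as pd
from sqlalchemy import text

def load_cached_sma(conn, asset_id):
    # a bound parameter replaces string interpolation; parse_dates covers the TODO
    stmt = text('SELECT * FROM sma_test WHERE asset_id = :asset_id')
    return pd.read_sql(stmt, con=conn, params={'asset_id': asset_id},
                       parse_dates=['date'])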
def standardize_variable_names(table, RULES):
    """
    Standardize the variable names in a table.
    PARAM DataFrame table: A table returned from pd.read_sql
    PARAM list[tuples] RULES: A list of (pattern, replacement) tuples applied with re.sub
    RET table
    """
    variableNames = list(table.columns.values)
    standardizedNames = {}  # Pandas renames columns with a dictionary object
    for v in variableNames:
        f = v
        for r in RULES:
            f = re.sub(r[0], r[1], f)
        print('%s to %s' % (v, f))
        standardizedNames[v] = f
    table = table.rename(columns=standardizedNames)
    return table
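Usage sketch with a made-up rule set and table; each (pattern, replacement) rule is applied in order to every column name:

import re
import pandas as pd

RULES = [(r'\s+', '_'), (r'^Toilet', 'toilet_')]
table = pd.DataFrame(columns=['ToiletID', 'Collection Date'])
table = standardize_variable_names(table, RULES)
print(list(table.columns))  # ['toilet_ID', 'Collection_Date']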
def export_data_unresolved():
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_work_view = db.get_work_view()
connection = db_work_view._db_connection
df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv', sep='\t', error_bad_lines=False)
df_clickstream['prev']=df_clickstream['prev'].str.replace('_', ' ')
df_clickstream['curr']=df_clickstream['curr'].str.replace('_', ' ')
df_clickstream['curr_unresolved']=df_clickstream['curr_unresolved'].str.replace('_', ' ')
df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)
    sample_unresolved = pn.merge(df_redirects_candidates, df_clickstream, how='left', left_on=['source_article_name', 'target_article_name'], right_on=['prev', 'curr_unresolved'])
    sample_unresolved['n'].fillna(0, inplace=True)
    sample_unresolved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv', sep='\t', encoding="utf-8")
def pickle_correlations_zeros_january():
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
conn = db._create_connection()
    print('read')
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print('loaded links')
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501 where link_type_derived= "internal-link";', conn)
    print('loaded counts')
    result = pd.merge(df, df2, how='left', left_on=['source_article_id', 'target_article_id'], right_on=['prev_id', 'curr_id'])
    print('merged counts')
    print(result)
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print(article_counts)
    print('write to file')
    article_counts[["target_article_id", "counts"]].to_csv(TMP + 'january_article_counts.tsv', sep='\t', index=False)
def _limit_and_df(self, query, limit, as_df=False):
    """Adds a limit (limit==None means no limit) to any query and allows
    returning the result as a pandas.DataFrame
    :param bool as_df: if set to True, results are returned as a pandas.DataFrame
    :param `sqlalchemy.orm.query.Query` query: SQLAlchemy query
    :param int or tuple[int] limit: maximum number of results, or a
        (page, page_size) tuple for pagination
    :return: query result of pyuniprot.manager.models.XY objects
    """
    if limit:
        if isinstance(limit, int):
            query = query.limit(limit)
        elif (isinstance(limit, Iterable) and len(limit) == 2
                and all(isinstance(x, int) for x in limit)):
            page, page_size = limit
            query = query.limit(page_size)
            query = query.offset(page * page_size)
    if as_df:
        results = read_sql(query.statement, self.engine)
    else:
        results = query.all()
    return results
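Usage sketch (manager and query stand in for a pyuniprot manager instance and one of its queries): an int caps the number of results, while a (page, page_size) tuple paginates.

entries = manager._limit_and_df(query, limit=10)              # first 10 ORM objects
df = manager._limit_and_df(query, limit=(2, 50), as_df=True)  # third page of 50, as a DataFrame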
def get_metrics(self):
"""Pull requested evaluation metrics for each model meeting the criteria
specified in __init__.
:returns: evaluation metrics for all models in the batches
:rtype: pandas.DataFrame
"""
metrics_query = """
SELECT DISTINCT
metrics.metric || '_' || metrics.parameter as p_metric,
metrics.value,
metrics.unique_timestamp
FROM
output.metrics as metrics
WHERE
metric || '_' || parameter in {0} AND
unique_timestamp in {1};
""".format(tuple(self.most_important_metrics),
tuple(self.models.index.astype(str).values))
return(pd.read_sql(metrics_query, self.engine))
def get_features(self):
""" Pull features from the database.
:returns: features
:rtype: pandas DataFrame
"""
query = """
SELECT
*
FROM
{0}
WHERE
{1} is not null;
""".format(self.model['test_table_name'][0],
self.model['labelling'].astype(str)[0])
return(pd.read_sql(query, self.engine))
def get_mh_appt_dates(self, people):
""" Get the dates of mental health appointments for people labeled
correctly by the model.
:param people: list of ids to search for
:type people: tuple
:returns: mental health appointment data
:rtype: pandas DataFrame
"""
mh_services_query = """
select
dedupe_id,
svc_date
from
clean.jocomentalhealthservices
where
svc_date < '{0}' and
dedupe_id in {1}
""".format(self.features['labeling_end_date'].unique()[0], people)
return(pd.read_sql(mh_services_query, self.engine).set_index('dedupe_id'))
def get_y_values(self):
    """ Return a dataframe containing the true classes and predicted
    probabilities for each case, sorted by descending probability.
    :returns: dataframe of true classes and predicted probabilities
    :rtype: pandas DataFrame
    """
    y_query = """
        SELECT
            label as y_true,
            prediction_prob as scores
        FROM
            output.predictions
        WHERE
            unique_timestamp = '{0}';
    """.format(self.model_id)
    y_values = pd.read_sql(y_query, self.engine).sort_values(by='scores',
                                                             ascending=False)
    return y_values
def tables_exist(fake_today,prediction_window,feature_timestamp):
table_list = pd.read_sql('''
SELECT
table_name
FROM
information_schema.tables
WHERE
table_schema = 'feature_tables' AND
table_name LIKE 'features_train_{}_{}_at_{}'
ORDER BY
table_name desc
LIMIT
1;
'''.format(fake_today, prediction_window, feature_timestamp), con)
    return len(table_list) > 0
def load_table(self, train_or_test, feature_timestamp):
# get feature table name
if feature_timestamp == '%':
feature_timestamp = pd.read_sql('''
SELECT
split_part(table_name, '_at_', 2)
FROM
information_schema.tables
WHERE
table_schema = 'feature_tables'
ORDER BY
1 desc
LIMIT
1;
''', con).iat[0,0]
feature_table_name = ('{}."features_{}_{}_{}_at_{}"').format(config_db['feature_schema'],
train_or_test, self.fake_today, self.prediction_window,
feature_timestamp)
# load table
    print('loading {}'.format(feature_table_name))
query = ('SELECT * FROM {}').format(feature_table_name)
full_feature_table = pd.read_sql(query, con = con)
return full_feature_table, feature_table_name
def by_scenario(self, name):
"""
"""
ormclass = self._mapped[name]
query = self.session.query(ormclass).filter(
ormclass.scn_name == self.scn_name)
if self.version:
query = query.filter(ormclass.version == self.version)
# TODO: Better handled in db
if name == 'Transformer':
name = 'Trafo'
df = pd.read_sql(query.statement,
self.session.bind,
index_col=name.lower() + '_id')
if 'source' in df:
df.source = df.source.map(self.id_to_source())
return df
def is_part_of_speech(self, pos):
if hasattr(self.resource, QUERY_ITEM_POS):
current_token = tokens.COCAToken(pos, replace=False)
rc_feature = getattr(self.resource, QUERY_ITEM_POS)
_, table, _ = self.resource.split_resource_feature(rc_feature)
S = "SELECT {} FROM {} WHERE {} {} '{}' LIMIT 1".format(
getattr(self.resource, "{}_id".format(table)),
getattr(self.resource, "{}_table".format(table)),
getattr(self.resource, rc_feature),
self.resource.get_operator(current_token),
pos)
engine = self.resource.get_engine()
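            # literal % must be doubled so the MySQL driver does not read it
            # as a format placeholder (e.g. in LIKE patterns)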
df = pd.read_sql(S.replace("%", "%%"), engine)
engine.dispose()
return len(df.index) > 0
else:
return False
def get_unique(self):
if not self.db_name:
return
sql = sqlhelper.sql_url(options.cfg.current_server, self.db_name)
if self._uniques:
S = "SELECT DISTINCT {} FROM {}".format(self.column, self.table)
self.df = pd.read_sql(S, sqlalchemy.create_engine(sql))
self.df = self.df.sort_values(self.column, ascending=True)
else:
S = "SELECT {} FROM {}".format(self.column, self.table)
self.df = pd.read_sql(S, sqlalchemy.create_engine(sql))
items = (self.df[self.column].apply(utf8)
.apply(QtWidgets.QTableWidgetItem))
self.ui.tableWidget.setRowCount(len(items))
self.ui.tableWidget.setColumnCount(1)
for row, item in enumerate(items):
self.ui.tableWidget.setItem(row, 0, item)
def read_sql(self,sql,chunksize=None,return_generator=True):
    '''
    Read a SQL query into a DataFrame.
    :param sql: SQL query string
    :param chunksize: int, number of rows per chunk; if None, read everything at once
    :param return_generator: bool, if False and chunksize is an int, the chunks
        are concatenated into a single DataFrame before returning
    :return: a DataFrame, or a generator of DataFrames when chunksize is set
        and return_generator is True
    '''
if chunksize is not None and chunksize<=0:
chunksize=None
result=pd.read_sql(sql,self.engine,chunksize=chunksize)
if return_generator:
return result
else:
if chunksize is None:
return result
else:
result=list(result)
if len(result)==0:
return pd.DataFrame()
else:
result=pd.concat(result,axis=0)
return result
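Usage sketch of this wrapper (db stands in for an instance of the surrounding class; the table name is illustrative):

# stream the table in 50,000-row chunks instead of loading it whole
for chunk in db.read_sql('select * from trades', chunksize=50000):
    print(len(chunk))

# the same query collected into a single DataFrame
df = db.read_sql('select * from trades', chunksize=50000, return_generator=False)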
def getData(con):
query = 'SELECT {} FROM {}'.format(
','.join(['"{}"'.format(x) for x in DOWNLOAD_COLS]),
DOWNLOAD_TABLE)
if WHERE:
query += ' WHERE {}'.format(WHERE)
if LIMIT:
query += ' LIMIT {}'.format(LIMIT)
print(query)
    df = pd.read_sql(sql=query, con=con)
return df
def getData(con, lo):
query = DOWNLOAD_QUERY.format(
','.join(['"{}"'.format(x) for x in DOWNLOAD_COLS]),
DOWNLOAD_TABLE,
LELIMIT,
lo)
print(query)
    df = pd.read_sql(sql=query, con=con)
df.columns = ['canonical_smiles']
# fingerDf = computeDescriptors(df)
fingerDf = computeMACCS(df)
mergedDf = pd.concat([df, fingerDf], axis=1)
return mergedDf
def retrieve_data(self, date, world_or_region=WORLD, table_name='trending_places'):
"""
Fetched records saved in a Database
Parameters
----------
date
world_or_region
table_name
Returns
-------
pandas data frame
"""
with self.con:
return pd.read_sql(FETCH_QUERY % (table_name, date, world_or_region),
self.con, parse_dates=['last_day'])
def _update(stock, conn):
    try:
        print("update ----- :", stock)
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        df = df.set_index('date')
        print("sql saved:", df.tail(1), df.iloc[-1], df.iloc[-1].name)
        # roll weekend dates back to the most recent weekday
        if dt.now().weekday() == 5:
            today = str(pd.Timestamp(dt.now()) - pd.Timedelta(days=1))[:10]
        elif dt.now().weekday() == 6:
            today = str(pd.Timestamp(dt.now()) - pd.Timedelta(days=2))[:10]
        else:
            today = str(pd.Timestamp(dt.now()))[:10]
        print("today:", today)
        if today != df.iloc[-1].name[:10]:
            df = ts.get_h_data(stock, start=df.iloc[-1].name[:10], retry_count=5, pause=1)
            print("read from tu:", df.head(1))
            df[['open', 'high', 'close', 'low', 'volume']].to_sql(stock, conn, if_exists='append')
            import time
            time.sleep(10)
    except Exception as arg:
        print("exceptionu:", stock, arg)
        errorlist.append(stock)
def _clean(stock, conn):
    try:
        print("clean ------ :", stock)
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        print("before", df.tail(5))
        # drop duplicate rows, keeping the latest rowid per date
        cur = conn.cursor()
        query = "delete from '%s' where rowid not in (select max(rowid) from '%s' group by date)" % (stock, stock)
        cur.execute(query)
        conn.commit()
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        print("after", df.tail(5))
    except Exception as arg:
        print("exceptionc:", stock, arg)
        raise SystemExit(-1)
def RNNPredict(mask, trigger_date=None, source='History.db'):
    class RNNPredict(CustomFactor):
        inputs = []
        window_length = 1

        def compute(self, today, assets, out, *inputs):
            # the factor is evaluated on every backtest day; only do real work
            # on the trigger date and return immediately otherwise
            if trigger_date is not None and today != pd.Timestamp(trigger_date, tz='UTC'):
                return
            if os.path.splitext(source)[1] == '.db':
                # check_same_thread=False lets the connection be used from other threads
                conn = sqlite3.connect(source, check_same_thread=False)
                query = "select * from predict where date >= '%s' order by date limit 1 " % str(today)[:19]
                df = pd.read_sql(query, conn)
                df = df.set_index('date')
                conn.close()
            elif os.path.splitext(source)[1] == '.csv':
                df = pd.read_csv("predict.csv", index_col=0, parse_dates=True)
                df = df[df.index >= pd.Timestamp(str(today))]
                print(today, df)
            else:
                raise ValueError
            new_index = [sid(asset).symbol + "_return" for asset in assets]
            df = df.reindex(columns=new_index)
            out[:] = df.iloc[0].values
            print("RNNpredict:", today, out)
    return RNNPredict(mask=mask)
def mysql_into_df(self):
# first Extract all the table names
# connect = self.db_connection
df_tables = pd.read_sql('show tables;', self.db_connection)
# store the tables in a dictionary
d = {}
col_name = df_tables.columns[0]
# load individual table into the dictionary
for table in df_tables[col_name]:
key = table
value = pd.read_sql('SELECT * FROM ' + table +
';', self.db_connection)
d[key] = value
self.dictTables = d
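Once mysql_into_df has run, every table is reachable by name (loader stands in for an instance of the surrounding class):

loader.mysql_into_df()
for table_name, frame in loader.dictTables.items():
    print(table_name, frame.shape)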
    # Load the qualification table separately to infer the timestamp
def get_lowest(self, code, date, current_date):
    '''
    Get the lowest price of a stock between the start of the given year
    and current_date.
    :param code: stock code
    :param date: year to start from, e.g. '2016'
    :return: (datetime, lowest price), or (None, None) on failure
    '''
    date = date + '-01-01'
    cmd = 'select * from `{}` where datetime > \'{}\' and datetime <\'{}\''.format(code, date, current_date)
    try:
        df = pd.read_sql(cmd, history_engine, index_col='index')
    except Exception as e:
        print(e)
        return None, None
    # no rows found in this period
    if len(df) < 1:
        return None, None
    df['low'] = df['low'].astype('float64')
    idx = df['low'].idxmin()
    min_date = df.loc[idx]
    return min_date['datetime'], min_date['low']
def standard_sessions(start_date, end_date):
"""Return the datetimes corresponding to the trading sessions in a specified
time period during which the stock market was officially open.
Parameters
----------
start_date: Datetime object.
The datetime indicating the beginning of the trading time period.
    end_date: Datetime object.
The datetime indicating the ending of the trading time period.
"""
qry = """
SELECT p.datetime FROM prices AS p JOIN symbols as s ON p.symbol_id = s.id
WHERE s.symbol='^GSPC' AND p.datetime >= '{}' AND p.datetime <= '{}'
""".format(start_date, end_date)
return pd.read_sql(qry, conn)
make_dataset.py, from the UK_Imbalance_Price_Forecasting project by ADGEfficiency:
def query_sql(db_path, table_name):
"""
Uses pandas to pull data from our sqlite database
args
db_path (str) : location of the database
table_name (str) : name of the table we want
returns
data (pd.DataFrame) :
"""
    print('Pulling data for table {} from {}'.format(table_name, db_path))
# connect to our database
conn = sqlite3.connect(db_path)
# pull data by selecting the entire table
data = pd.read_sql(sql='SELECT * from '+str(table_name), con=conn)
data.set_index('index', drop=True, inplace=True)
# close the connection
conn.close()
return data
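For example (the path and table name are illustrative):

df = query_sql(db_path='data/database.sqlite', table_name='imbalance_price')
print(df.head())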
def matches():
singles = pd.read_sql('select * from game where deleted = 0', con=engine)
doubles = pd.read_sql('select * from doubles_game where deleted = 0', con=engine)
tz = pytz.timezone('America/New_York')
for frame in [singles, doubles]:
        frame['timestamp'] = frame['timestamp'].apply(datetime.utcfromtimestamp)
        # convert the naive UTC datetimes to Eastern time rather than just relabeling them
        frame['timestamp'] = frame['timestamp'].apply(
            lambda ts: pytz.utc.localize(ts).astimezone(tz))
frame['timestamp'] = frame['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S %Z')
singles = singles.to_dict('records')
doubles = doubles.to_dict('records')
return render_template('gamelog.html', singles_games=singles, doubles_games=doubles)
def read(self, *fields, **conditionals):
"""
Pull raw data into a dataframe. If no conditions are passed, pull the whole table.
        Otherwise, filter based on the conditions specified (currently only equality).
"""
# Parse the list of fields to return
field_query = self._parse_fields(*fields)
# Parse the conditions
conditionals_query = self._parse_conditionals(**conditionals)
query = "SELECT {} FROM {} {}".format(field_query, self.table, conditionals_query)
if "DISTINCT" not in query:
query += " ORDER BY {}".format(self.timestamp_field)
return pd.read_sql(query, self.db)
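Usage sketch (store stands in for an instance of the surrounding class; field and condition names are illustrative): positional arguments select columns, keyword arguments become equality filters.

# SELECT price, volume FROM <table> WHERE symbol = 'AAPL' ORDER BY <timestamp field>
df = store.read('price', 'volume', symbol='AAPL')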