def df_coach_bsites(self):
df_cols = ['site_num', 'c_score', 'cluster_size', 'algorithm',
'pdb_template_id', 'pdb_template_chain', 'pdb_ligand',
'binding_location_coords', 'c_score_method', 'binding_residues',
'ligand_cluster_counts']
bsites_inf_df = pd.DataFrame.from_records(self.coach_bsites, columns=df_cols).drop_duplicates().reset_index(drop=True)
if bsites_inf_df.empty:
log.warning('Empty dataframe')
return bsites_inf_df
else:
bsites_inf_df['c_score'] = pd.to_numeric(bsites_inf_df.c_score, errors='coerce')
bsites_inf_df['cluster_size'] = pd.to_numeric(bsites_inf_df.cluster_size, errors='coerce')
return ssbio.utils.clean_df(bsites_inf_df)
def df_coach_go(self):
cols = ['go_id', 'go_term', 'c_score']
go_all_df = pd.DataFrame()
for go_list in [self.coach_go_mf, self.coach_go_cc, self.coach_go_bp]:
go_df = pd.DataFrame.from_records(go_list, columns=cols).drop_duplicates().reset_index(drop=True)
go_df['c_score'] = pd.to_numeric(go_df.c_score, errors='coerce')
if go_all_df.empty:
go_all_df = go_df
else:
go_all_df = go_all_df.append(go_df)
return go_all_df
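# DataFrame.append() was removed in pandas 2.0, so a pd.concat-based variant of the
# accumulation above may be preferable on newer versions. Minimal sketch only;
# 'go_lists' stands in for [self.coach_go_mf, self.coach_go_cc, self.coach_go_bp].
import pandas as pd

def combine_go_predictions(go_lists, cols=('go_id', 'go_term', 'c_score')):
    frames = []
    for go_list in go_lists:
        go_df = pd.DataFrame.from_records(go_list, columns=list(cols)).drop_duplicates()
        go_df['c_score'] = pd.to_numeric(go_df.c_score, errors='coerce')
        frames.append(go_df)
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=list(cols))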
def parse_coach_ec_df(infile):
"""Parse the EC.dat output file of COACH and return a dataframe of results
EC.dat contains the predicted EC number and active residues.
The columns are: PDB_ID, TM-score, RMSD, Sequence identity,
Coverage, Confidence score, EC number, and Active site residues
Args:
infile (str): Path to EC.dat
Returns:
DataFrame: Pandas DataFrame summarizing EC number predictions
"""
ec_df = pd.read_table(infile, delim_whitespace=True,
names=['pdb_template', 'tm_score', 'rmsd', 'seq_ident', 'seq_coverage',
'c_score', 'ec_number', 'binding_residues'])
ec_df['pdb_template_id'] = ec_df['pdb_template'].apply(lambda x: x[:4])
ec_df['pdb_template_chain'] = ec_df['pdb_template'].apply(lambda x: x[4])
ec_df = ec_df[['pdb_template_id', 'pdb_template_chain', 'tm_score', 'rmsd',
'seq_ident', 'seq_coverage', 'c_score', 'ec_number', 'binding_residues']]
ec_df['c_score'] = pd.to_numeric(ec_df.c_score, errors='coerce')
return ec_df
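# Hedged usage sketch for parse_coach_ec_df(): build a one-row EC.dat-style table in
# memory to show the expected whitespace-delimited layout. The values are illustrative,
# not taken from a real EC.dat file; read_table also accepts file-like objects.
import io

sample = io.StringIO("3nrlA 0.972 0.85 0.64 0.98 0.441 3.5.2.6 118,120,234\n")
ec_predictions = parse_coach_ec_df(sample)
print(ec_predictions[['pdb_template_id', 'pdb_template_chain', 'ec_number']])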
def _get_peilmetingen_df(self):
""""""
doc_df = pd.DataFrame(list(self.get_peilmetingen()),
columns=["grondwaterlocatie",
"filternummer",
"datum",
"diepte",
"methode",
"betrouwbaarheid"])
doc_df["datum"] = pd.to_datetime(doc_df["datum"])
doc_df["diepte"] = pd.to_numeric(doc_df["diepte"])
doc_df = doc_df.set_index("datum")
return doc_df
def _get_observaties_df(self):
""""""
doc_df = pd.DataFrame(list(self.get_observaties()),
columns=["grondwaterlocatie",
"filternummer",
"monsternummer",
"datum",
"parameter",
"waarde",
"eenheid",
"betrouwbaarheid"])
doc_df["datum"] = pd.to_datetime(doc_df["datum"])
doc_df["waarde"] = pd.to_numeric(doc_df["waarde"])
return doc_df
def filter_fastq_length_meanqual(df, min_len, max_len,
min_mqual, max_mqual):
querystring = "length >= {0} and meanQual >= {1}".format(min_len, min_mqual)
if max_len is not None:
querystring += " and length <= {}".format(max_len)
if max_mqual is not None:
querystring += " and meanQual <= {}".format(max_mqual)
print("Keeping reads that satisfy: {}".format(querystring), file=stderr)
filtdf = df.query(querystring)
#filtdf["length"] = pd.to_numeric(filtdf["length"], errors='coerce')
#filtdf["meanQual"] = pd.to_numeric(filtdf["meanQual"], errors='coerce')
return filtdf
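# Minimal usage sketch, assuming a reads DataFrame with the 'length' and 'meanQual'
# columns that the query string above relies on. Toy data only.
import pandas as pd

reads = pd.DataFrame({'length': [350, 900, 15000], 'meanQual': [7.5, 11.2, 9.8]})
kept = filter_fastq_length_meanqual(reads, min_len=500, max_len=None,
                                    min_mqual=9, max_mqual=None)   # keeps the last two reads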
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
metadata: qiime2.Metadata) -> None:
# convert metadata to numeric values where applicable, drop the non-numeric
# values, and then drop samples that contain NaNs
df = metadata.to_dataframe()
df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
# filter categorical columns
pre_filtered_cols = set(df.columns)
df = df.select_dtypes([numpy.number]).dropna()
filtered_categorical_cols = pre_filtered_cols - set(df.columns)
# filter 0 variance numerical columns
pre_filtered_cols = set(df.columns)
df = df.loc[:, df.var() != 0]
filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)
# filter the distance matrix to exclude samples that were dropped from
# the metadata, and keep track of how many samples survived the filtering
# so that information can be presented to the user.
initial_dm_length = distance_matrix.shape[0]
distance_matrix = distance_matrix.filter(df.index, strict=False)
filtered_dm_length = distance_matrix.shape[0]
result = skbio.stats.distance.bioenv(distance_matrix, df)
result = q2templates.df_to_html(result)
index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
q2templates.render(index, output_dir, context={
'initial_dm_length': initial_dm_length,
'filtered_dm_length': filtered_dm_length,
'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
'result': result})
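# A standalone sketch (toy data, not part of the QIIME 2 plugin itself) of the
# metadata-filtering steps above: keep columns that parse as numeric, drop samples
# with missing values, then drop zero-variance columns.
import numpy
import pandas as pd

md = pd.DataFrame({'ph': ['6.8', '7.1', '6.5'],
                   'site': ['A', 'B', 'C'],
                   'depth': ['10', '10', '10']})
md = md.apply(lambda x: pd.to_numeric(x, errors='ignore'))
md = md.select_dtypes([numpy.number]).dropna()   # 'site' stays non-numeric and is dropped
md = md.loc[:, md.var() != 0]                    # 'depth' has zero variance and is dropped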
def from_csv(cls, filepath_or_buffer):
# Import pandas lazily since it can take a moment to import
try:
import pandas as pd
except ImportError:
raise ImportError("pandas must be installed to use ZiplineBacktestResult")
zipline_result = cls()
results = pd.read_csv(
filepath_or_buffer,
parse_dates=["date"],
index_col=["dataframe", "index", "date", "column"])["value"]
# Extract returns
returns = results.loc["returns"].unstack()
returns.index = returns.index.droplevel(0).tz_localize("UTC")
zipline_result.returns = returns["returns"].astype(float)
# Extract positions
positions = results.loc["positions"].unstack()
positions.index = positions.index.droplevel(0).tz_localize("UTC")
zipline_result.positions = positions.astype(float)
# Extract transactions
transactions = results.loc["transactions"].unstack()
transactions.index = transactions.index.droplevel(0).tz_localize("UTC")
zipline_result.transactions = transactions.apply(pd.to_numeric, errors='ignore')
# Extract benchmark returns
benchmark_returns = results.loc["benchmark"].unstack()
benchmark_returns.index = benchmark_returns.index.droplevel(0).tz_localize("UTC")
zipline_result.benchmark_returns = benchmark_returns["benchmark"].astype(float)
# Extract performance dataframe
perf = results.loc["perf"].unstack()
perf.index = perf.index.droplevel(0).tz_localize("UTC")
zipline_result.perf = perf.apply(pd.to_numeric, errors='ignore')
return zipline_result
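# Hedged sketch of the long-format CSV layout from_csv() appears to expect: one
# 'value' per (dataframe, index, date, column) key, later unstacked per dataframe.
# The sample rows and the path below are illustrative, not output from a real backtest.
#
#     dataframe,index,date,column,value
#     returns,0,2020-01-02,returns,0.0012
#     benchmark,0,2020-01-02,benchmark,0.0008
#
# result = ZiplineBacktestResult.from_csv('backtest_results.csv')
# result.returns.head()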
# Source: DataClean_GS_Analysis5.py, from the Python-Scripts-Repo-on-Data-Science project by qalhata
def check_null_or_valid(row_data):
"""Function that takes a row of data,
drops all missing values,
and checks if all remaining values are greater than or equal to 0
"""
no_na = row_data.dropna()[1:-1]
numeric = pd.to_numeric(no_na)
ge0 = numeric >= 0
return ge0
# Check whether the first column is 'Life expectancy'
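# Hedged usage sketch: apply the validator row-wise and assert that every surviving
# value is non-negative. 'g1800s' is a hypothetical Gapminder-style DataFrame whose
# first column holds the header 'Life expectancy'.
assert g1800s.columns[0] == 'Life expectancy'
assert g1800s.iloc[:, 1:].apply(check_null_or_valid, axis=1).all().all()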
def eliminate_invalids(df, cols):
"""Eliminate invalid data in ``cols`` of ``df``."""
numdf = df.drop(cols, axis=1).join(df[cols].apply(pd.to_numeric,
errors='coerce'))
numdf = numdf[~numdf[cols].isnull().apply(np.any, axis=1)]
return numdf
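# Minimal usage sketch for eliminate_invalids() on toy data: coerce two columns to
# numeric and drop any row where the coercion produced a NaN.
import numpy as np
import pandas as pd

raw = pd.DataFrame({'id': ['a', 'b', 'c'],
                    'price': ['1.5', 'n/a', '2.0'],
                    'qty': ['3', '4', 'five']})
clean = eliminate_invalids(raw, ['price', 'qty'])   # only row 'a' survives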
def partial_convert_only_numerics(df):
"""Convert ``df`` numeric cols and try to coerce any errors encountered."""
col_dict = df_cols_by_type(df)
partial_convert = partial(pd.to_numeric, errors='coerce')
df[col_dict['numeric']] = df[col_dict['numeric']].apply(partial_convert)
return df
# Useful one-liners.
# df.select_dtypes(include=['bool'])
# list(df.select_dtypes(include=['bool']).columns)
def condition_df(self):
"""
Do any initial data conditioning that may be required.
"""
logging.info('Ensure that columns that are supposed to be numeric are numeric')
self.df[SET_GHI] = pd.to_numeric(self.df[SET_GHI], errors='coerce')
self.df[SET_WINDVEL] = pd.to_numeric(self.df[SET_WINDVEL], errors='coerce')
self.df[SET_NIGHT_LIGHTS] = pd.to_numeric(self.df[SET_NIGHT_LIGHTS], errors='coerce')
self.df[SET_ELEVATION] = pd.to_numeric(self.df[SET_ELEVATION], errors='coerce')
self.df[SET_SLOPE] = pd.to_numeric(self.df[SET_SLOPE], errors='coerce')
self.df[SET_LAND_COVER] = pd.to_numeric(self.df[SET_LAND_COVER], errors='coerce')
self.df[SET_GRID_DIST_CURRENT] = pd.to_numeric(self.df[SET_GRID_DIST_CURRENT], errors='coerce')
self.df[SET_GRID_DIST_PLANNED] = pd.to_numeric(self.df[SET_GRID_DIST_PLANNED], errors='coerce')
self.df[SET_SUBSTATION_DIST] = pd.to_numeric(self.df[SET_SUBSTATION_DIST], errors='coerce')
self.df[SET_ROAD_DIST] = pd.to_numeric(self.df[SET_ROAD_DIST], errors='coerce')
self.df[SET_HYDRO_DIST] = pd.to_numeric(self.df[SET_HYDRO_DIST], errors='coerce')
self.df[SET_HYDRO] = pd.to_numeric(self.df[SET_HYDRO], errors='coerce')
self.df[SET_SOLAR_RESTRICTION] = pd.to_numeric(self.df[SET_SOLAR_RESTRICTION], errors='coerce')
logging.info('Replace null values with zero')
self.df.fillna(0, inplace=True)
logging.info('Sort by country, Y and X')
self.df.sort_values(by=[SET_COUNTRY, SET_Y, SET_X], inplace=True)
logging.info('Add columns with location in degrees')
project = Proj('+proj=merc +lon_0=0 +k=1 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs')
def get_x(row):
x, y = project(row[SET_X] * 1000, row[SET_Y] * 1000, inverse=True)
return x
def get_y(row):
x, y = project(row[SET_X] * 1000, row[SET_Y] * 1000, inverse=True)
return y
self.df[SET_X_DEG] = self.df.apply(get_x, axis=1)
self.df[SET_Y_DEG] = self.df.apply(get_y, axis=1)
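# Behaviour-equivalent sketch of the coercion block in condition_df(), assuming the
# SET_* constants are plain column names; one apply() call replaces the thirteen
# individual pd.to_numeric() lines above.
def coerce_numeric_columns(df, columns):
    df[list(columns)] = df[list(columns)].apply(pd.to_numeric, errors='coerce')
    return df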
def saveSlice_CSV(self, outputFilename=outputFilename, xSlice=[], ySlice=[], zSlice=[]):
""" Take a slice and save it to csv """
outputFilename += '_slice.csv'
# # This defines how 'narrow' slice we want. Why am I writing this if ParaView will do it fark
# tol = 1e-2
#
# # Pre allocate empty DF here?
# slicedData = pd.DataFrame()
#
# if not xSlice:
# # We have some slices along x to make
# for point in xSlice:
# # we want to slice at all of these points
# > xSlice[point] - tol
# self.flowData.transpose().loc[(self.flowData.transpose()["x"] > 0.599 & self.flowData.transpose()["x"] < 0.601 & self.flowData.transpose()["z"] == 0), "cf"]
# elif not ySlice:
# # Slices along y to take
# elif not zSlice:
# # And slices aong z
flowData = self.flowData.apply(pd.to_numeric, errors='ignore')
slicedData_indices = (flowData["z"] > -0.01) & (flowData["z"] < 0.01)
slicedData = flowData.loc[slicedData_indices]
slicedData.to_csv(outputFilename, sep=',', index=False)
print("Slices saved in", outputFilename)
def saveSlice_CSV(self, outputFilename=outputFilename, xSlice=[], ySlice=[], zSlice=[]):
""" Take a slice and save it to csv """
outputFilename += '_slice.csv'
# # This defines how 'narrow' slice we want. Why am I writing this if ParaView will do it fark
# tol = 1e-2
#
# # Pre allocate empty DF here?
# slicedData = pd.DataFrame()
#
# if not xSlice:
# # We have some slices along x to make
# for point in xSlice:
# # we want to slice at all of these points
# > xSlice[point] - tol
# self.flowData.transpose().loc[(self.flowData.transpose()["x"] > 0.599 & self.flowData.transpose()["x"] < 0.601 & self.flowData.transpose()["z"] == 0), "cf"]
# elif not ySlice:
# # Slices along y to take
# elif not zSlice:
# # And slices aong z
flowData = self.flowData.apply(pd.to_numeric, errors='ignore')
slicedData_indices = (flowData["y"] > 0.598) & (flowData["y"] < 0.602) & (flowData["z"] == 0)
slicedData = flowData.loc[slicedData_indices]
slicedData.to_csv(outputFilename, sep=',', index=False)
print("Slices saved in", outputFilename)
def maybe_to_numeric(series):
try:
return pd.to_numeric(series)
except ValueError:
return series
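# Typical column-wise usage of maybe_to_numeric(): convert whichever columns parse
# cleanly and leave the rest untouched (unlike errors='coerce', nothing becomes NaN).
import pandas as pd

mixed = pd.DataFrame({'a': ['1', '2'], 'b': ['x', 'y']})
mixed = mixed.apply(maybe_to_numeric)   # 'a' becomes int64, 'b' stays object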
def createPriceHistoryReport(self, stock):
"""
Call get10YrPriceHistory() to fetch a ten-year price history, then clean and return the data.
The stock argument must be a (symbol, exchange) tuple, e.g. ('MMM', 'NYSE'), because
get10YrPriceHistory() needs the exchange as well as the ticker.
After the data is loaded, the function adds a Symbol column for tracking in the database,
reorders the columns, renames Date to Reference, converts the dates to ISO yyyy-mm-dd strings,
and coerces Volume to a numeric type (unparsable values become NaN).
Returns the report as a pandas DataFrame if successful, otherwise a tuple (False, error message).
Example Usage: createPriceHistoryReport(('MMM', 'NYSE'))
"""
try:
# get the raw data from morningstar
price_history = self.get10YrPriceHistory(stock)
if isinstance(price_history, pd.DataFrame): # otherwise get10YrPriceHistory() returned an error rather than data
price_history['Symbol'] = stock[0]
# reorganize header order
price_history = price_history.reindex(columns=['Symbol','Date','Open','High','Low','Close','Volume'])
# rename the Date column for easier processing through SQLite's Date functionality
price_history.rename(columns={'Date':'Reference'}, inplace=True)
# convert all dates to ISO formatted yyyy-mm-dd strings
price_history['Reference'] = price_history['Reference'].apply(lambda x: time.strftime("%Y-%m-%d", time.strptime(x, "%m/%d/%Y")))
# convert volumes to numeric; thousands separators are stripped and unparsable values become NaN
price_history['Volume'] = pd.to_numeric(price_history['Volume'].str.replace(',',''), errors='coerce')
# set index b/f db commit so no duplicate numeric index columns
price_history.set_index(['Symbol'], inplace=True)
return price_history
except Exception as e:
return (False, e)
# get10YrPriceHistory
# ******************* #
def load_groundtruth(self):
gt_labels = pd.read_csv(self.data_path)
if self.subset is not None:
mask = [x in self.subset for x in gt_labels['id'].values]
gt_labels = gt_labels[mask]
assert np.any(np.array(mask))
gt_labels['length'] = pd.to_numeric(gt_labels['length'])
gt_labels['actions'].fillna('', inplace=True)
self.gt_labels = gt_labels
def read_madx_tracking(file):
"""Read a MAD-X Tracking onetable=true file to a dataframe."""
column_names = ['ID', 'TURN', 'X', 'PX', 'Y', 'PY', 'T', 'PT', 'S', 'E']
data = pd.read_csv(file, skiprows=MADX_TRACKING_SKIP_ROWS, delim_whitespace=True, names=column_names)
return data.apply(pd.to_numeric, errors="ignore").dropna()
def to_dataframe(self, cast_numeric=False):
df = self._dataframe.copy()
if cast_numeric:
df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
return df
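# Note: recent pandas releases deprecate errors='ignore' in pd.to_numeric. A hedged
# per-column alternative with the same "convert where possible" behaviour is sketched below.
def cast_numeric_where_possible(df):
    out = df.copy()
    for col in out.columns:
        try:
            out[col] = pd.to_numeric(out[col])
        except (ValueError, TypeError):
            pass   # leave the column unchanged if it cannot be parsed
    return out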