def checkFSXvalsAgainstADNIMERGE(tadpoleDF, mriADNI1FileFSX, otherSSvisCodeStr, ssNameTag,
                                 ignoreMissingCols=False):
    nrRows, nrCols = tadpoleDF.shape
    ssDF = pd.read_csv(mriADNI1FileFSX)  # assumed: the original snippet used ssDF without defining it
    colListOtherSS = list(ssDF.columns.values)
    colListTadpoleDF = list(tadpoleDF.columns.values)
    hippoCols = ['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]
    tadpoleDF[hippoCols] = tadpoleDF[hippoCols].apply(pd.to_numeric, errors='coerce')
    tadpoleDF['HIPPOSUM'] = tadpoleDF['ST29SV%s' % ssNameTag] + tadpoleDF['ST88SV%s' % ssNameTag]
    for r in range(nrRows):
        valsNan = np.isnan(tadpoleDF['Hippocampus'][r]) or (np.isnan(tadpoleDF['ST29SV%s' % ssNameTag][r]) and
                                                            np.isnan(tadpoleDF['ST88SV%s' % ssNameTag][r]))
        if valsNan:
            continue
        valsNotEq = tadpoleDF['Hippocampus'][r] != (tadpoleDF['ST29SV%s' % ssNameTag][r] + tadpoleDF['ST88SV%s' % ssNameTag][r])
        if valsNotEq:
            print("entries don't match\n ", tadpoleDF[['RID', 'VISCODE', 'Hippocampus', 'ST29SV%s' % ssNameTag,
                                                       'ST88SV%s' % ssNameTag, 'HIPPOSUM']].iloc[r])
    # Conclusion: the entries above don't match because UCSFFSX has duplicate
    # entries for the same subject and viscode.
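# A minimal sketch (not part of the original source) of how those duplicate
# rows could be collapsed before comparing; it assumes the same RID/VISCODE
# column names and that pandas is imported as pd, as elsewhere in these
# snippets. Keeping the first scan per visit is an arbitrary choice.
def dropDuplicateScans(df):
    # One row per (RID, VISCODE) pair; which duplicate to keep is a choice.
    return df.drop_duplicates(subset=['RID', 'VISCODE'], keep='first')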
def load_submission(self, submission_file):
    loc_submission = pd.read_csv(submission_file, header=None)
    build_proc_sub = loc_submission[0].str.split(' ').values.tolist()
    assert len(build_proc_sub[0]) == self.n_classes + len(self.submission_columns)
    proc_sub = pd.DataFrame.from_records(build_proc_sub,
                                         columns=self.submission_columns + list(range(self.n_classes)))
    if self.subset is not None:
        if type(proc_sub['frame_id'].values[0]) is np.ndarray:
            mask = [x[0] in self.subset for x in proc_sub['frame_id'].values]
        else:
            # old pandas version
            mask = [x in self.subset for x in proc_sub['frame_id'].values]
        proc_sub = proc_sub[mask]
        assert np.any(np.array(mask))
    num_proc_sub = proc_sub.apply(pd.to_numeric, errors='ignore')
    grouped_by_vid = num_proc_sub
    self.submission = grouped_by_vid
def build_dataframe(self):
    if not self.values.exists():
        return pd.DataFrame()
    # Am I really a programmer or just a lego assembler?
    # Pandas makes my life at least 20 times easier.
    df = pd.DataFrame.from_records(self.values, index=self.index_column)
    # make the columns and labels prettier
    if self.rename_columns:
        df = df.rename(columns=self.column_mapping)
    df.index.name = TIME_COLUMN_NAME
    try:
        df.index = df.index.tz_convert(self.user.pytz_timezone)
    except AttributeError:
        # an AttributeError means the index is a plain Index, i.e. only
        # dates (and not times) were passed
        df.index = pd.DatetimeIndex(df.index, tz=self.user.pytz_timezone)
    # cast to numerics where possible; string columns are left untouched
    df = df.apply(pd.to_numeric, errors='ignore')
    return df
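# For context (illustration, not from the original source): with
# errors='ignore', pd.to_numeric returns its input unchanged whenever any
# element fails to parse, so a column-wise apply converts clean columns and
# silently leaves string columns alone.
example = pd.DataFrame({'a': ['1', '2'], 'b': ['x', 'y']})
example = example.apply(pd.to_numeric, errors='ignore')
# example.dtypes -> a: int64, b: object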
def _cond_ind_effects_wrapper(self):
    """
    A wrapper for the conditional indirect effects.
    :return: pd.DataFrame
        A DataFrame of effects, se, llci, and ulci, for the conditional indirect effects.
    """
    symb_to_var = self._symb_to_var
    results = self.estimation_results
    rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
    cols_stats = ["Effect", "Boot SE", "BootLLCI", "BootULCI"]
    mod_values = self._moderators_values
    med_values = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]
    values = med_values + mod_values
    rows_levels = np.array([i for i in product(*values)])
    cols_levels = ["Mediator"] + [symb_to_var.get(x, x) for x in self._moderators_symb]
    rows = np.concatenate([rows_levels, rows_stats], axis=1)
    cols = cols_levels + cols_stats
    df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
    return df.apply(pd.to_numeric, args=["ignore"])
def _simple_ind_effects_wrapper(self):
    """
    A wrapper for the indirect effects (and for total/contrast effects if specified).
    :return: pd.DataFrame
        A DataFrame of effects, se, llci, and ulci, for the simple/total/contrasts of indirect effects.
    """
    symb_to_var = self._symb_to_var
    results = self.estimation_results
    rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
    med_names = [symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]
    rows_levels = []
    if self._options["total"]:
        rows_levels += ["TOTAL"]
    rows_levels += med_names
    if self._options["contrast"]:
        contrasts = ["Contrast: {} vs. {}".format(a, b) for a, b in combinations(med_names, 2)]
        rows_levels += contrasts
    rows_levels = np.array(rows_levels).reshape(-1, 1)
    rows = np.concatenate([rows_levels, rows_stats], axis=1)
    cols = ["", "Effect", "Boot SE", "BootLLCI", "BootULCI"]
    df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
    return df.apply(pd.to_numeric, args=["ignore"])
def _PMM_index_wrapper(self):
    """
    A wrapper for the Partial Moderated Mediation index.
    :return: pd.DataFrame
        A DataFrame of effects, se, llci, and ulci, for the PMM index.
    """
    symb_to_var = self._symb_to_var
    results = self._PMM_index()
    rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
    cols_stats = ["Index", "Boot SE", "LLCI", "ULCI"]
    mod_names = [[symb_to_var.get(i, i) for i in self._moderators_symb]]
    med_names = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]
    values = mod_names + med_names
    rows_levels = np.array([i for i in product(*values)])
    cols_levels = ["Moderator", "Mediator"]
    rows = np.concatenate([rows_levels, rows_stats], axis=1)
    cols = cols_levels + cols_stats
    df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
    return df.apply(pd.to_numeric, args=["ignore"])
def _MMM_index_wrapper(self):
    """
    A wrapper for the Moderated Moderated Mediation index.
    :return: pd.DataFrame
        A DataFrame of effects, se, llci, and ulci, for the MMM index.
    """
    symb_to_var = self._symb_to_var
    results = self._MMM_index()
    rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
    cols_stats = ["Index", "Boot SE", "BootLLCI", "BootULCI"]
    med_names = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]
    rows_levels = np.array([i for i in product(*med_names)])
    cols_levels = ["Mediator"]
    rows = np.concatenate([rows_levels, rows_stats], axis=1)
    cols = cols_levels + cols_stats
    df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
    return df.apply(pd.to_numeric, args=["ignore"])
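# Note on the wrappers above (illustration, not from the original source):
# args=["ignore"] fills pd.to_numeric's second positional parameter, errors,
# so in the pandas versions these snippets target the call is equivalent to
# the keyword form.
demo = pd.DataFrame({'Effect': ['0.12', '0.30'], 'Mediator': ['m1', 'm2']})
assert demo.apply(pd.to_numeric, args=["ignore"]).equals(
    demo.apply(pd.to_numeric, errors="ignore"))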
def clean_data(DT_df, attributes):
    """Data preprocessing."""
    # DT_df = DT_df.drop(drop_cols, axis=1)
    DT_df["fs_scan_amt_pre"] = DT_df["fs_scan_amt_pre"].astype(float)
    DT_df["fs_scan_amt_pos"] = DT_df["fs_scan_amt_pos"].astype(float)
    DT_df["fs_scan_amt_pos_PF"] = DT_df["fs_scan_amt_pos_PF"].astype(float)
    DT_df["dyn_margin_amt_pre"] = DT_df["dyn_margin_amt_pre"].astype(float)
    DT_df["dyn_margin_amt_pos"] = DT_df["dyn_margin_amt_pos"].astype(float)
    DT_df["dyn_margin_amt_pos_PF"] = DT_df["dyn_margin_amt_pos_PF"].astype(float)
    DT_df["ctl_grp_ind"] = DT_df["ctl_grp_ind"].astype(int)
    DT_df["mailer_version_id"] = DT_df["mailer_version_id"].astype(int)
    DT_df["tcm_redeem_md"] = pd.to_numeric(DT_df["tcm_redeem_md"])
    for attr in attributes:
        DT_df[attr] = DT_df[attr].astype(int)
    fields = attributes + ["fs_scan_amt_pre", "fs_scan_amt_pos", "fs_scan_amt_pos_PF",
                           "dyn_margin_amt_pre", "dyn_margin_amt_pos", "dyn_margin_amt_pos_PF",
                           "ctl_grp_ind", "mailer_version_id", "tcm_redeem_md", "xtra_card_nbr"]
    DT_df = DT_df[fields]
    return DT_df
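# The astype(float) calls above raise on unparseable values, while
# pd.to_numeric can degrade gracefully; a small illustration (not from the
# original source):
s = pd.Series(['1.5', 'oops'])
# s.astype(float)                    # raises ValueError
pd.to_numeric(s, errors='coerce')    # [1.5, NaN]: bad cells become NaN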
def _get_table(self, column, is_size=True):
    cols = list(range(5))
    cols.append(self.header.index(column))
    header = [self.header[c] for c in cols]
    rows = [
        [row[c] for c in cols]
        for row in self.rows
    ]
    if is_size:
        for row in rows:
            row[5] = parse_size(row[5])
    table = pd.DataFrame.from_records(rows, columns=header)
    table = table.rename(columns={
        'prog': 'Program',
        'prog2': 'Program2',
        'threads': 'Threads',
        'dataset': 'Dataset',
        'qcut': 'Quality',
    })
    table['Threads'] = pd.to_numeric(table['Threads'])
    table['Dataset'] = pd.Categorical(table['Dataset'])
    table['Program'] = pd.Categorical(table['Program'])
    table['Program2'] = pd.Categorical(table['Program2'])
    return table
def __init__(self, filename=TABLE_FILENAME):
    MS = SpectralTypeRelations.MainSequence()
    # Read in the table.
    colspecs = [[0, 7], [7, 14], [14, 21], [21, 28], [28, 34], [34, 40], [40, 47], [47, 55],
                [55, 63], [63, 70], [70, 78], [78, 86], [86, 94], [94, 103], [103, 110],
                [110, 116], [116, 122], [122, 130], [130, 137], [137, 144], [144, 151],
                [151, 158]]
    mam_df = pd.read_fwf(filename, header=20, colspecs=colspecs, na_values=['...'])[:92]
    # Strip the * from the logAge column. Probably shouldn't, but...
    mam_df['logAge'] = mam_df['logAge'].map(lambda s: s.strip('*') if isinstance(s, basestring) else s)
    # Convert everything to floats
    for col in mam_df.columns:
        mam_df[col] = pd.to_numeric(mam_df[col], errors='ignore')
    # Add the spectral type number for interpolation
    mam_df['SpTNum'] = mam_df['SpT'].map(MS.SpT_To_Number)
    self.mam_df = mam_df
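# The strip-then-convert pattern above, reduced to a standalone illustration
# (Python 3 spelling; `basestring` in the snippet is Python 2):
ages = pd.Series(['9.85*', '9.72', None])
ages = ages.map(lambda v: v.strip('*') if isinstance(v, str) else v)
ages = pd.to_numeric(ages, errors='ignore')   # float64 once the '*' flags are gone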
# consensus.py, from the Comparative-Annotation-Toolkit project (author: ComparativeGenomicsToolkit)
def load_metrics_from_db(db_path, tx_mode, aln_mode):
    """
    Loads the alignment metrics for the mRNA/CDS alignments of transMap/AugustusTM/TMR.
    """
    session = tools.sqlInterface.start_session(db_path)
    metrics_table = tools.sqlInterface.tables[aln_mode][tx_mode]['metrics']
    metrics_df = tools.sqlInterface.load_metrics(metrics_table, session)
    # unstack flattens the long-form data structure
    metrics_df = metrics_df.set_index(['AlignmentId', 'classifier']).unstack('classifier')
    metrics_df.columns = [col[1] for col in metrics_df.columns]
    metrics_df = metrics_df.reset_index()
    cols = ['AlnCoverage', 'AlnGoodness', 'AlnIdentity', 'PercentUnknownBases']
    metrics_df[cols] = metrics_df[cols].apply(pd.to_numeric)
    metrics_df['OriginalIntrons'] = metrics_df['OriginalIntrons'].fillna('')
    metrics_df['OriginalIntrons'] = [list(map(int, x)) if len(x[0]) > 0 else [] for x in
                                     metrics_df['OriginalIntrons'].str.split(',').tolist()]
    metrics_df['OriginalIntronsPercent'] = metrics_df['OriginalIntrons'].apply(calculate_vector_support, resolve_nan=1)
    session.close()
    return metrics_df
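# The set_index/unstack reshape above, as a self-contained sketch (column and
# id values are made up for illustration):
long_df = pd.DataFrame({'AlignmentId': ['a1', 'a1', 'a2', 'a2'],
                        'classifier': ['AlnCoverage', 'AlnIdentity'] * 2,
                        'value': ['99.1', '98.0', '88.5', '90.2']})
wide = long_df.set_index(['AlignmentId', 'classifier']).unstack('classifier')
wide.columns = [col[1] for col in wide.columns]   # drop the 'value' level
wide = wide.reset_index()
wide[['AlnCoverage', 'AlnIdentity']] = wide[['AlnCoverage', 'AlnIdentity']].apply(pd.to_numeric)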
def create_routing_table(bgp=None, ixp_prefixes=None, ixp_asns=None, bgp_compression='infer'):
    log.info('Creating IP2AS tool.')
    # guard added: bgp may be a DataFrame rather than a URL/path string
    if bgp_compression == 'infer' and isinstance(bgp, str) and bgp.startswith('http'):
        bgp_compression = infer_compression(bgp, 'infer')
    if not isinstance(ixp_prefixes, pd.DataFrame):
        ixp_prefixes = set(pd.read_csv(ixp_prefixes, comment='#', index_col=0).index.unique()) if ixp_prefixes is not None else set()
    if not isinstance(ixp_asns, pd.DataFrame):
        ixp_asns = set(pd.read_csv(ixp_asns, comment='#', index_col=0).index.unique()) if ixp_asns is not None else set()
    if not isinstance(bgp, pd.DataFrame):
        bgp_original = pd.read_table(bgp, comment='#', names=['Address', 'Prefixlen', 'ASN'], compression=bgp_compression)
        bgp = bgp_original[~bgp_original.ASN.str.contains(',|_')].copy()
        bgp['ASN'] = pd.to_numeric(bgp.ASN)
    rt = RoutingTable()
    for address, prefixlen, asn in bgp[~bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_prefix(asn.item(), address, prefixlen)
    for address, prefixlen, asn in bgp[bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_ixp(address, prefixlen)
    for prefix in ixp_prefixes:
        rt.add_ixp(prefix)
    rt.add_private()
    rt.add_multicast()
    rt.add_default()
    return rt
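# The ASN cleaning step above, in isolation (illustrative values): rows whose
# origin contains an AS-set (',') or AS-path ('_') are dropped before the
# numeric cast.
asns = pd.Series(['3356', '174_3356', '7018,2914', '2914'])
pd.to_numeric(asns[~asns.str.contains(',|_')])   # int64: [3356, 2914]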
def assemble_row_metadata(full_df, num_col_metadata, num_data_rows, num_row_metadata):
    # Extract values
    row_metadata_row_inds = range(num_col_metadata + 1, num_col_metadata + num_data_rows + 1)
    row_metadata_col_inds = range(1, num_row_metadata + 1)
    row_metadata = full_df.iloc[row_metadata_row_inds, row_metadata_col_inds]
    # Create index from the first column of full_df (after the filler block)
    row_metadata.index = full_df.iloc[row_metadata_row_inds, 0]
    # Create columns from the top row of full_df (before cids start)
    row_metadata.columns = full_df.iloc[0, row_metadata_col_inds]
    # Rename the index name and columns name
    row_metadata.index.name = row_index_name
    row_metadata.columns.name = row_header_name
    # Convert metadata to numeric if possible
    row_metadata = row_metadata.apply(lambda x: pd.to_numeric(x, errors="ignore"))
    return row_metadata
def assemble_col_metadata(full_df, num_col_metadata, num_row_metadata, num_data_cols):
    # Extract values
    col_metadata_row_inds = range(1, num_col_metadata + 1)
    col_metadata_col_inds = range(num_row_metadata + 1, num_row_metadata + num_data_cols + 1)
    col_metadata = full_df.iloc[col_metadata_row_inds, col_metadata_col_inds]
    # Transpose so that samples are the rows and headers are the columns
    col_metadata = col_metadata.T
    # Create index from the top row of full_df (after the filler block)
    col_metadata.index = full_df.iloc[0, col_metadata_col_inds]
    # Create columns from the first column of full_df (before rids start)
    col_metadata.columns = full_df.iloc[col_metadata_row_inds, 0]
    # Rename the index name and columns name
    col_metadata.index.name = column_index_name
    col_metadata.columns.name = column_header_name
    # Convert metadata to numeric if possible
    col_metadata = col_metadata.apply(lambda x: pd.to_numeric(x, errors="ignore"))
    return col_metadata
def _orderbook_tag_frame(text):
    # This function can be removed if this pandas feature request is implemented:
    # https://github.com/pandas-dev/pandas/issues/14608
    table_str = _table_text(text)
    root = etree.fromstring(table_str)
    table_body = root.find('tbody')
    index = []
    data = defaultdict(list)
    # Iterator of tr objects
    qty_path = "td[@class='change-cell quantity']"
    tr_iter = table_body.iter(tag='tr')
    for tr in tr_iter:
        index.append(tr.find(path='td').text.strip())
        # Quantity Held
        pos = pd.to_numeric(tr.find(path=qty_path).attrib['value'])
        data[iem.QUANTITY_HELD].append(pos)
        # Your Bids
        data[iem.YOUR_BIDS].append(_num_open_orders(tr, 'yourBidsCell'))
        # Your Asks
        data[iem.YOUR_ASKS].append(_num_open_orders(tr, 'yourAsksCell'))
    return pd.DataFrame(data=data, index=index)
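# pd.to_numeric also accepts scalars, which is how it is used on the
# 'value' attribute above; a one-line illustration:
pd.to_numeric('42')   # -> 42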
def apply_ht_scores(dataframe):
    # Load the ht score dataframe
    ht_scores = pandas.read_csv('{0}ht_scores.csv'.format(config['result_data']), index_col=0)
    dataframe['phone'] = dataframe['phone'].map(lambda x: re.sub('[^0-9]', '', str(x)))
    # Make the column a numeric column for merging
    dataframe['phone'] = pandas.to_numeric(dataframe['phone'])
    final = dataframe.merge(ht_scores, how='left', left_on='phone', right_index=True)
    # Drop the content column
    final.drop('content', axis=1, inplace=True)
    if os.path.isfile('{0}ad_chars_final.csv'.format(config['result_data'])):
        lock.acquire()
        print 'lock has been set for file {0}'.format(file)
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), mode='a', header=False, encoding='utf-8')
        lock.release()
        print 'lock has been released for file {0}'.format(file)
    else:
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), header=True, encoding='utf-8')
def apply_ht_scores(dataframe):
    # Load the ht score dataframe
    ht_scores = pandas.read_csv('{0}ht_scores.csv'.format(config['result_data']), index_col=0)
    dataframe['phone'] = dataframe['phone'].map(lambda x: re.sub('[^0-9]', '', str(x)))
    # Make the column a numeric column for merging
    # dataframe['phone'] = pandas.to_numeric(dataframe['phone'])
    final = dataframe.merge(ht_scores, how='left', left_on='phone', right_index=True)
    # Drop the content column and drop the index column
    final.drop('content', axis=1, inplace=True)
    if os.path.isfile('{0}ad_chars_final.csv'.format(config['result_data'])):
        lock.acquire()
        print 'lock has been set for file {0}'.format(file)
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), mode='a', header=False, encoding='utf-8', index=False)
        lock.release()
    else:
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), header=True, encoding='utf-8', index=False)
def makeDataFrame(phases):
    """Return a pandas DataFrame, with CIF files as index and ellipsoid parameters as columns (hierarchical by centre atom)."""
    import pandas as pd
    from pieface.readcoords import Crystal
    if isinstance(phases, dict):
        if isinstance(phases[phases.keys()[0]], Crystal):
            # We are reading a dict of Crystals: convert to a nested dict first
            alldata = makenesteddict(phases)
        elif isinstance(phases[phases.keys()[0]], dict):
            # Looking at a dict of dicts: assume it is already correct for pandas
            alldata = phases
        d = dict([(i, pd.DataFrame(alldata[i]).set_index('files')) for i in alldata.keys()])  # Make a dict of DataFrames
        frame = pd.concat(d, axis=1)
        if len(frame.index) == 1:
            # We're looking at a single CIF file: unstack the DataFrame with atoms as index
            return frame.ix[frame.index[0]].unstack().apply(pd.to_numeric, errors='ignore')  # Convert back to float/int when unstacking
        else:
            return frame
    else:
        raise TypeError("Unknown data format for conversion to DataFrame (expected dict)")
def _return_appropiate_type(self, selected):
    if isinstance(selected, pd.Series):
        frame = pd.DataFrame(selected).T
        if self._required_cols <= set(frame.columns):
            selected = frame.apply(pd.to_numeric, errors='ignore')
        else:
            return selected
    if (isinstance(selected, pd.DataFrame)
            and self._required_cols <= set(selected.columns)):
        molecule = self.__class__(selected)
        molecule.metadata = self.metadata.copy()
        molecule._metadata = copy.deepcopy(self._metadata)
        return molecule
    else:
        return selected
def _augment_lmfit_modelresult(result):
    """Tidy data values and fitted model from `lmfit.model.ModelResult`."""
    columns = ['x', 'data', 'best_fit', 'residual']
    d = pd.DataFrame(index=range(result.ndata), columns=columns)
    for col in columns[1:]:
        d.loc[:, col] = getattr(result, col)
    independent_vars = result.model.independent_vars
    if len(independent_vars) == 1:
        independent_var = independent_vars[0]
    else:
        msg = ('Only 1 independent variable is currently supported.\n'
               'Found independent variables: %s' % str(independent_vars))
        raise NotImplementedError(msg)
    x_array = result.userkws[independent_var]
    d.loc[:, 'x'] = x_array
    if len(result.components) > 1:
        comp_names = [c.name for c in result.components]
        for cname, comp in zip(comp_names, result.components):
            d.loc[:, cname] = comp.eval(x=d.x, **result.values)
    return d.apply(pd.to_numeric, errors='ignore')
def __init__(self, symbol, *args):
    super().__init__()
    self.data = pd.read_csv(open(r"Stock_Data/{}.csv".format(symbol)))
    self.data = self.data.apply(pd.to_numeric, errors="ignore")
    self.data.index = self.data["Quarter end"]
    self.name = symbol
    if self.data["Price"].dtype in (int, float) and self.data["Cumulative dividends per share"].dtype in (int, float):
        self.data["Value"] = self.data["Price"] + self.data["Cumulative dividends per share"]
        # Calculation of the estimated return
        self.data["Estimated Return"] = self.data["Value"].pct_change()
        # Calculation of the standard deviation
        self.data["Standard Deviation"] = self.data["Value"].std()
    else:
        self.complete_pricelist = False
def __call__(self, fields, geo_for, geo_in=None, cache=NopCache()):
    """Special method to make the API object invocable.

    Arguments:
      * fields: list of variables to return.
      * geo_* fields must be given as dictionaries, e.g.
        `{'county': '*'}`
      * cache: cache in which to store results. Not cached by default.
    """
    params = {
        'get': ','.join(fields),
        'key': self.key,
        'for': self._geo2str(geo_for),
    }
    if geo_in:
        params['in'] = self._geo2str(geo_in)
    j = fetchjson(self.endpoint, cache, self.session, params=params)
    ret = pd.DataFrame(data=j[1:], columns=j[0])
    for field in fields:
        if self.variables[field].get('predicateType') == 'int':
            ret[field] = pd.to_numeric(ret[field])
    return ret
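# The Census API returns every cell as a string, header row first, which is
# why integer-typed variables need the explicit cast; a reduced illustration
# with made-up values:
resp = [['NAME', 'P001001', 'state'], ['Alabama', '4779736', '01']]
frame = pd.DataFrame(data=resp[1:], columns=resp[0])
frame['P001001'] = pd.to_numeric(frame['P001001'])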
# LoadAndMatchDates.py, from the Test-stock-prediction-algorithms project (author: timestocome)
def read_data(file_name):
    stock = pd.read_csv(file_name, parse_dates=True, index_col=0)
    n_samples = len(stock)
    # ditch samples with NaN values
    stock = stock.dropna(axis=0)
    # flip order from newest-to-oldest to oldest-to-newest
    # stock = stock.iloc[::-1]
    # trim data
    stock = stock[['Open']]
    # convert object to floats
    stock['Open'] = pd.to_numeric(stock['Open'], errors='coerce')
    # all stock is needed to walk back dates for testing hold-out data
    return stock
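# errors='coerce' maps anything unparseable to NaN; a quick illustration (not
# from the original source) of what the cast above does to dirty cells:
prices = pd.Series(['101.3', 'null', '99.8'])
pd.to_numeric(prices, errors='coerce')   # [101.3, NaN, 99.8]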
#############################################################################################
# load and combine stock indexes, matching the dates
def get_qstat_as_df():
    """Get the current user's qstat output as a DataFrame."""
    user = os.environ.get("USER")
    try:
        ret = subprocess.Popen(
            ["qstat", "-u", str(user)],
            stdout=subprocess.PIPE,
        )
        df = pd.read_csv(ret.stdout, delimiter=r"\s+")
        # drop the first line since it is just one long separator line
        df = df.drop(df.index[0]).copy()
        # convert objects to numeric, otherwise the numbers are strings
        df["JOBID"] = pd.to_numeric(df["job-ID"], errors='coerce')
        # df.set_index("JOBID")
        df = df.drop('job-ID', 1)
    except ValueError:
        logger.exception("No jobs in queues for user {}".format(user))
        df = pd.DataFrame()
    return df
def get_data_from_google(ticker_sym, start, end):
    """Return a DataFrame of data for a given stock between two dates."""
    url = "https://www.google.com/finance/historical?q=%s&startdate=%s&enddate=%s&output=csv" % (ticker_sym, start, end)
    s = requests.get(url).content
    df = pd.read_csv(io.StringIO(s.decode('utf-8')))
    df['Date'] = pd.to_datetime(df['Date'])
    df['epoch'] = (df['Date'] - datetime(1970, 1, 1)).dt.total_seconds() * 1000
    df = df.set_index('Date')  # set_index returns a copy; the original call discarded the result
    df['Adj_Close'] = df['Close']  # Google's API doesn't provide it, so just assume it's the same
    cols = ['High', 'Low', 'Volume', 'Open', 'Close', 'Adj_Close']
    for c in cols:  # cast columns to numeric
        df[c] = pd.to_numeric(df[c])
    return df.iloc[::-1]  # reverse the DataFrame so index 0 is the earliest date

# @memoize
# def get_data_for_sym(ticker_sym, start, end):
#     return list(reversed(get_data_for_sym_from_yahoo(ticker_sym, start, end)))
#     # res = StockFeature.select().where(Relationship.from_user == self))
def calc_AB(vcf):
    '''Calculate the allele balance for all samples in a given
    pdVCF. Also converts DP & GQ to a numeric type.

    Args:
        vcf: pdVCF with genotype information extracted

    Notes:
        ONLY WORKS FOR BIALLELIC VARIANTS
    '''
    sam = vcf.columns.levels[0][0]
    # bcftools places '.' in empty fields; regex=False so only literal dots are
    # replaced (the bare pattern would otherwise be treated as a regex)
    vcf[sam, 'DP'] = pd.to_numeric(vcf[sam, 'DP'].str.replace('.', '0', regex=False))
    vcf[sam, 'GQ'] = pd.to_numeric(vcf[sam, 'GQ'].str.replace('.', '0', regex=False))
    AD = vcf.xs('AD', level=1, axis=1).unstack().str.split(",", n=2)
    DP = vcf.xs('DP', level=1, axis=1).unstack()
    AB = round(pd.to_numeric(AD.str[1]) / pd.to_numeric(DP), 2)
    vcf[sam, 'AB'] = AB.tolist()
    return vcf
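# The allele-balance arithmetic above, flattened into a standalone sketch
# (made-up read counts; 'ref,alt' strings as bcftools emits them):
AD = pd.Series(['12,8', '30,0'])
DP = pd.Series(['20', '30'])
alt = pd.to_numeric(AD.str.split(',', n=2).str[1])
round(alt / pd.to_numeric(DP), 2)   # [0.40, 0.00]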
def update_distances(self):
    """
    Calculate the distances between the observed series and the stresses.

    Returns
    -------
    distances: pandas.DataFrame
        pandas DataFrame with the distances between the oseries (index)
        and the stresses (columns).
    """
    # Make sure these are values, even when actually objects.
    xo = pd.to_numeric(self.oseries.x)
    xt = pd.to_numeric(self.stresses.x)
    yo = pd.to_numeric(self.oseries.y)
    yt = pd.to_numeric(self.stresses.y)
    xh, xi = np.meshgrid(xt, xo)
    yh, yi = np.meshgrid(yt, yo)
    self.distances = pd.DataFrame(np.sqrt((xh - xi) ** 2 + (yh - yi) ** 2),
                                  index=self.oseries.index,
                                  columns=self.stresses.index)
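# The coercion matters because x/y coordinates read from a mixed table can
# arrive with object dtype; a 1-D version of the same meshgrid pattern, with
# made-up points:
xo = pd.to_numeric(pd.Series([0, 3], index=['obs1', 'obs2'], dtype=object))
xt = pd.to_numeric(pd.Series([4, 0], index=['s1', 's2'], dtype=object))
xh, xi = np.meshgrid(xt, xo)
pd.DataFrame(np.abs(xh - xi), index=xo.index, columns=xt.index)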
def parse(self, entry):
    data = pd.read_csv(str(entry),
                       engine="c",
                       sep="\t",
                       parse_dates=False,
                       index_col=[0, 1])
    data.index.names = ["date", "srcid"]
    # Check for AMT bug that adds row of ('nvsplDate', 'Total_All') with all 0s, drop if exists
    if data.index[-1][0] == 'nvsplDate':
        data = data.iloc[:-1, :]
    ## Pandas cannot seem to handle a MultiIndex with dates;
    ## slicing syntax becomes even crazier, and often doesn't even work.
    ## So date conversion is disabled for now.
    # # Convert dates
    # datetimes = data.index.get_level_values('date').to_datetime()
    # data.index.set_levels(datetimes, level='date', inplace=True)
    # Ensure MultiIndex sortedness
    data.sortlevel(inplace=True)
    return data.apply(pd.to_numeric, raw=True, errors="coerce")
def to_numeric(self, columns):
    '''
    Args:
        columns (string or list):
            column names that need to be converted
    Returns:
        -
    '''
    if isinstance(columns, str):
        self.data_df[columns] = pd.to_numeric(self.data_df[columns], errors='coerce')
    elif isinstance(columns, list):
        for column in columns:
            self.data_df[column] = pd.to_numeric(self.data_df[column], errors='coerce')

# rename certain columns