def read_data(fname):
""" Read football-data.co.uk csv """
data = (
pd.read_csv(fname)
.rename(columns={
'HomeTeam': 'home_team',
'AwayTeam': 'away_team',
'FTHG': 'home_goals',
'FTAG': 'away_goals'
})
.loc[lambda df: ~pd.isnull(df['home_goals'])] # Remove future games
)
team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
data['home_team_id'] = data['home_team'].replace(team_map)
data['away_team_id'] = data['away_team'].replace(team_map)
for col in ('home_goals', 'away_goals'):
    data[col] = data[col].astype(int)
return data, team_map
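# A minimal usage sketch. `stan_map` is defined elsewhere in the project; a
# plausible implementation (an assumption, not the original) maps each team
# name to a 1-based integer id, since Stan indexes from 1:
def stan_map(series):
    """Map each unique value in `series` to a 1-based integer id."""
    return {team: i + 1 for i, team in enumerate(sorted(series.unique()))}

# Hypothetical filename; football-data.co.uk season files are typically named like 'E0.csv'.
# data, team_map = read_data('E0.csv')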
def get_resolution(pdb_id):
    """Quick way to get the resolution of a PDB ID using the table of results from the REST service.

    Returns infinity if the resolution is not available.

    Returns:
        float: resolution of a PDB ID in Angstroms

    TODO:
        - Unit test
    """
    pdb_id = pdb_id.upper()
    if pdb_id not in _property_table().index:
        raise ValueError('PDB ID not in property table')
    else:
        resolution = _property_table().loc[pdb_id, 'resolution']
        if pd.isnull(resolution):
            log.debug('{}: no resolution available, probably not an X-ray crystal structure'.format(pdb_id))
            resolution = float('inf')
    return resolution
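# Usage sketch (the PDB ID below is illustrative; actual values depend on the
# cached _property_table()):
# resolution = get_resolution('1mbn')
# if resolution == float('inf'):
#     print('no crystallographic resolution recorded for this entry')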
def get_release_date(pdb_id):
    """Quick way to get the release date of a PDB ID using the table of results from the REST service.

    Returns None if the release date is not available.

    Returns:
        str: release date of a PDB ID
    """
    pdb_id = pdb_id.upper()
    if pdb_id not in _property_table().index:
        raise ValueError('PDB ID not in property table')
    else:
        release_date = _property_table().loc[pdb_id, 'releaseDate']
        if pd.isnull(release_date):
            log.debug('{}: no release date available'.format(pdb_id))
            release_date = None
    return release_date
def do_pharm_prod(drug_qid, brand_rxnorm, emea, url, brand_name):
# write info on the pharmaceutical product page
ref = create_ref_statement(emea, url)
# has active substance
s = [wdi_core.WDItemID(drug_qid, 'P3781', references=[ref])]
# instance of
s.append(wdi_core.WDItemID('Q28885102', 'P31', references=[ref])) # pharmaceutical product
s.append(wdi_core.WDItemID('Q169336', 'P31', references=[ref])) # chemical mixture
# emea
s.append(wdi_core.WDExternalID(emea, 'P3637', references=[ref]))
if not pd.isnull(brand_rxnorm):
s.append(wdi_core.WDExternalID(str(int(brand_rxnorm)), "P3345"))
item = wdi_core.WDItemEngine(item_name=brand_name, data=s, domain="drugs", append_value=['P3781'])
item.set_label(brand_name)
if item.get_description() == '':
item.set_description("pharmaceutical product")
wdi_helpers.try_write(item, emea, 'P3637', login, edit_summary="add 'active ingredient'")
return item.wd_item_id
def get_wikidata_do_mesh():
# get mesh xrefs, and including mapping relation type
# {'DOID:0050856': {'skos:broadMatch_D019958'}}
query = """
select ?item ?doid ?mesh ?mesh_rt where {
?item wdt:P699 ?doid .
?item p:P486 ?mesh_s .
?mesh_s ps:P486 ?mesh .
optional { ?mesh_s pq:P4390 ?mesh_rt }
}"""
results = WDItemEngine.execute_sparql_query(query)['results']['bindings']
results = [{k: v['value'].replace("http://www.wikidata.org/entity/", "") for k, v in item.items()} for item in
results]
df = pd.DataFrame(results)
df['mesh_rt'] = df.apply(lambda row: QID_MAP_REL_TYPE_CURIE[row.mesh_rt] + "_MESH:" + row.mesh, axis=1)
df['_item'] = df['item']
r = df.groupby("_item").aggregate(lambda x: set(y for y in x if not pd.isnull(y))).to_dict("records")
wd = {list(x['doid'])[0]: x for x in r}
wd = {k: v['mesh_rt'] for k, v in wd.items()}
wd = {k: v for k, v in wd.items() if v}
return wd
def correct_p1c1(rinex_dump, replace_p1_with_c1=True):
"""
"""
if rinex_dump.recv_p1c1 not in [1, 2, 3]:
raise ValueError('unknown receiver type {} (must be 1, 2, or 3)'.format(rinex_dump.recv_p1c1))
for sat in sorted(set(rinex_dump.sat)):
b = rinex_dump.p1c1_table[sat]
if rinex_dump.recv_p1c1 == 1:
rinex_dump.loc[rinex_dump.sat == sat, 'C1'] += b
rinex_dump.loc[rinex_dump.sat == sat, 'P2'] += b
elif rinex_dump.recv_p1c1 == 2:
rinex_dump.loc[rinex_dump.sat == sat, 'C1'] += b
if replace_p1_with_c1:
I = PD.isnull(rinex_dump['P1'])
rinex_dump.loc[I, 'P1'] = rinex_dump.loc[I, 'C1']
return rinex_dump
def to_ns(x):
"""Convert input timestamps to nanoseconds (integers)
:param x: value to be converted
:returns: converted value
:rtype: int
"""
if pd.isnull(x):
return 0
try:
return pd.to_datetime(x).value
except Exception:
if hasattr(x, '__str__'):
return pd.to_datetime(str(x)).value
return 0
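# Quick sanity check for to_ns (assumes `import pandas as pd`):
# to_ns('2000-01-01')     -> 946684800000000000  (nanoseconds since the Unix epoch)
# to_ns(None)             -> 0
# to_ns(float('nan'))     -> 0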
def check_nan(val):
"""Check input value for not a number
:param val: value to be checked for nan
:returns: true if nan
:rtype: bool
"""
if pd.isnull(val):
return True
if isinstance(val, str):
val = val.strip()
if not val or val.lower() == 'none' or val.lower() == 'nan':
return True
#from numpy import datetime64
# if isinstance(val, datetime64):
# return val == datetime64('NaT')
return False
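# Behaviour sketch for check_nan:
# check_nan(float('nan')) -> True
# check_nan(' None ')     -> True   (stripped, case-insensitive match on 'none')
# check_nan('')           -> True   (empty after strip)
# check_nan('abc')        -> False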
def to_str(val, **kwargs):
"""Convert input to string
:param val: value to be converted
:returns: converted value
:rtype: str
"""
try:
if pd.isnull(val):
return kwargs['nan']
except BaseException:
pass
if isinstance(val, str):
return val
if kwargs.get('convert_inconsistent_dtypes', True):
if hasattr(val, '__str__'):
return str(val)
return kwargs['nan']
def to_int(val, **kwargs):
"""Convert input to int
:param val: value to be evaluated
:returns: evaluated value
:rtype: np.int64
"""
try:
if pd.isnull(val):
return kwargs['nan']
except BaseException:
pass
if isinstance(val, np.int64) or isinstance(val, int):
return np.int64(val)
if kwargs.get('convert_inconsistent_dtypes', True):
try:
return np.int64(val)
except BaseException:
pass
return kwargs['nan']
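# These converters expect the fallback value to be passed as the 'nan' keyword
# argument; illustrative calls (assumes `import numpy as np`):
# to_int('5', nan=-1)      -> np.int64(5)   (coerced because convert_inconsistent_dtypes defaults to True)
# to_int(None, nan=-1)     -> -1
# to_str(np.nan, nan='')   -> ''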
def bool_to_str(val, **kwargs):
"""Convert input boolean to str
:param val: value to be evaluated
:returns: evaluated value
:rtype: str
"""
try:
if pd.isnull(val):
return kwargs['nan']
except BaseException:
pass
if isinstance(val, np.bool_) or isinstance(val, bool):
return str(val)
if kwargs.get('convert_inconsistent_dtypes', True):
if hasattr(val, '__str__'):
return str(val)
return kwargs['nan']
def bool_to_int(val, **kwargs):
"""Convert input boolean to int
:param val: value to be evaluated
:returns: evaluated value
:rtype: np.int64
"""
try:
if pd.isnull(val):
return kwargs['nan']
except BaseException:
pass
if isinstance(val, np.bool_) or isinstance(val, bool):
return np.int64(val)
if kwargs.get('convert_inconsistent_dtypes', False):
try:
return np.int64(val)
except BaseException:
pass
return kwargs['nan']
def helper_impute_result_check(self, data, result):
"""
check if the imputed reuslt valid
now, check for:
1. contains no nan anymore
2. orignal non-nan value should remain the same
"""
# check 1
self.assertEqual(pd.isnull(result).sum().sum(), 0)
# check 2
# the original non-missing values must keep unchanged
# to check, cannot use pd equals, since the imputer may convert:
# 1 -> 1.0
# have to do loop checking
missing_value_mask = pd.isnull(data)
for col_name in data:
data_non_missing = data[~missing_value_mask[col_name]][col_name]
result_non_missing = result[~missing_value_mask[col_name]][col_name]
for i in data_non_missing.index:
self.assertEqual(data_non_missing[i]==result_non_missing[i], True,
msg="not equals in column: {}".format(col_name))
def limits(self):
if self.is_empty():
return (0, 1)
# Fall back to the range if the limits
# are not set or if any is None or NaN
if self._limits is not None and self.range.range is not None:
limits = []
if len(self._limits) == len(self.range.range):
for l, r in zip(self._limits, self.range.range):
value = r if pd.isnull(l) else l
limits.append(value)
else:
limits = self._limits
return tuple(limits)
return self.range.range
def map(self, x, limits=None):
"""
Return an array-like of x mapped to values
from the scales palette
"""
if limits is None:
limits = self.limits
n = sum(~pd.isnull(list(limits)))
pal = self.palette(n)
if isinstance(pal, dict):
# manual palette with specific assignments
pal_match = [pal[val] for val in x]
else:
pal = np.asarray(pal)
pal_match = pal[match(x, limits)]
pal_match[pd.isnull(pal_match)] = self.na_value
return pal_match
def _mode(x, def_fill=ImputerMixin._def_fill):
"""Get the most common value in a 1d
H2OFrame. Ties will be handled in a non-specified
manner.
Parameters
----------
x : ``H2OFrame``, shape=(n_samples, 1)
The 1d frame from which to derive the mode
"""
idx = x.as_data_frame(use_pandas=True)[x.columns[0]].value_counts().index
# if the most common is null, then return the next most common.
# if there is no next common (i.e., 100% null) then we return the def_fill
return idx[0] if not pd.isnull(idx[0]) else idx[1] if idx.shape[0] > 1 else def_fill
def get_loctype(location, date_index):
"""Returns a pandas Series of the location type for each day.
Locations with a changetime have type *city* before that day, and *conflict*
after it.
"""
n_days = len(date_index)
changetime = location.time
if pd.isnull(changetime):
loctype = location.location_type
else:
# days [0, changetime): loctype is "city"
loctype = ['city'] * int(changetime)
# days [changetime, n_days): loctype is "conflict"
loctype += ['conflict'] * int(n_days - changetime)
return pd.Series(loctype, index=date_index)
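# A minimal sketch of get_loctype (the `location` object only needs `.time` and
# `.location_type` attributes; the namedtuple and dates here are hypothetical):
from collections import namedtuple

Location = namedtuple('Location', ['time', 'location_type'])
dates = pd.date_range('2016-01-01', periods=5, freq='D')
get_loctype(Location(time=2, location_type='city'), dates)
# -> Series(['city', 'city', 'conflict', 'conflict', 'conflict'], index=dates)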
def compare_except(s1, s2, exceptions=[]):
conc = pd.concat([s1, s2], axis=1, ignore_index=True)
def except_apply(x):
try:
str1 = x[0]
str2 = x[1]
for ex in exceptions:
str1 = str1.replace(ex, "")
return jellyfish.jaro_distance(str1, str2)
except Exception as err:
if pd.isnull(x[0]) or pd.isnull(x[1]):
return np.nan
else:
raise err
return conc.apply(except_apply, axis=1)
def find_null_columns(df, features):
"""Locates columns in a pandas dataframe that have no values.
Args:
df: A pandas dataframe containing data.
features: A list of string names of columns storing the actual data.
Returns: A list of string names of the null columns.
"""
df_len = len(df)
bad_feats = []
for feat in features:
null_len = len(df[df[feat].isnull()])
if df_len == null_len:
bad_feats.append(feat)
return bad_feats
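# Example (assumes `import numpy as np` and `import pandas as pd`):
df = pd.DataFrame({'a': [1, 2], 'b': [np.nan, np.nan]})
find_null_columns(df, ['a', 'b'])  # -> ['b']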
def _merge_query_params(self, params, date=None):
ret = ''
for key, value in params.iteritems():
if key == 'tenor' and pd.isnull(value):
ret += 'tradeDate=' + date + ';'
elif not pd.isnull(value):
if key == Header.TENOR:
py_assert(date is not None, ValueError, 'date must be given if tenor is not None')
# unit = ''.join(re.findall('[0-9]+', params[Header.TENOR]))
# freq = FreqType(params[Header.TENOR][len(unit):])
ret += 'startDate=' + WIND_DATA_PROVIDER.forward_date(date, value,
self.date_format) + ';endDate=' + date + ';'
elif key == Header.FREQ and value[:3] == 'min':
ret += ('BarSize=' + value[3:] + ';')
else:
ret += (key + '=' + str(value) + ';')
ret = ret[:-1] + FactorLoader._check_industry_params(params.name)
return ret
def _complement_bases(self, genotype):
if pd.isnull(genotype):
return np.nan
complement = ''
for base in list(genotype):
if base == 'A':
complement += 'T'
elif base == 'G':
complement += 'C'
elif base == 'C':
complement += 'G'
elif base == 'T':
complement += 'A'
return complement
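# Examples: _complement_bases('AG') -> 'TC'; _complement_bases(np.nan) -> nan.
# Note that characters other than A, C, G and T are silently dropped by this implementation.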
def cleanNullColumns(sheet):
"""
Helper function to discard columns in sheets where each value in column is null.
Accepts a DataFrame as the sheet argument.
Returns the cleaned dataframe or an error Tuple of (False, error)
"""
try:# check for and remove columns with all NaNs
for column in sheet.columns:
if pd.isnull(sheet[column]).all():
sheet.drop(column, axis=1, inplace=True)
return sheet
except Exception as e:
return False, e
def get_isd_data(self, station, year):
filename_format = '/pub/data/noaa/{year}/{station}-{year}.gz'
lines = self._retreive_file_lines(filename_format, station, year)
dates = pd.date_range("{}-01-01 00:00".format(year),
"{}-12-31 23:00".format(int(year) + 1),
freq='H', tz=pytz.UTC)
series = pd.Series(None, index=dates, dtype=float)
for line in lines:
if line[87:92].decode('utf-8') == "+9999":
temp_C = float("nan")
else:
temp_C = float(line[87:92]) / 10.
date_str = line[15:27].decode('utf-8')
# there can be multiple readings per hour, so set all to minute 0
dt = pytz.UTC.localize(datetime.strptime(date_str, "%Y%m%d%H%M")).replace(minute=0)
# only set the temp if it's the first encountered in the hour.
if pd.isnull(series.loc[dt]):
series[dt] = temp_C
return series
def get_input_data_mask(self, input_data):
''' Boolean list of missing/not missing values:
True => missing
False => not missing
'''
trace_data, temp_data = input_data
dts = []
mask = []
if trace_data.empty or temp_data.empty:
return pd.Series(mask)
for (start, energy), (p, group) in zip(
trace_data.iteritems(),
temp_data.groupby(level="period")):
temps = group.copy()
temps.index = temps.index.droplevel()
daily_temps = temps.resample('D').apply(np.mean)[0]
for i, tempF in daily_temps.iteritems():
dts.append(i)
mask.append(pd.isnull(energy) or pd.isnull(tempF))
return pd.Series(mask, index=dts)
def test_multiple_records_with_gap(serializer):
records = [
{
"start": datetime(2000, 1, 1, tzinfo=pytz.UTC),
"end": datetime(2000, 1, 2, tzinfo=pytz.UTC),
"value": 1,
},
{
"start": datetime(2000, 1, 3, tzinfo=pytz.UTC),
"end": datetime(2000, 1, 4, tzinfo=pytz.UTC),
"value": 2,
},
]
df = serializer.to_dataframe(records)
assert df.value[datetime(2000, 1, 1, tzinfo=pytz.UTC)] == 1
assert not df.estimated[datetime(2000, 1, 1, tzinfo=pytz.UTC)]
assert pd.isnull(df.value[datetime(2000, 1, 2, tzinfo=pytz.UTC)])
assert not df.estimated[datetime(2000, 1, 2, tzinfo=pytz.UTC)]
assert df.value[datetime(2000, 1, 3, tzinfo=pytz.UTC)] == 2
assert not df.estimated[datetime(2000, 1, 3, tzinfo=pytz.UTC)]
assert pd.isnull(df.value[datetime(2000, 1, 4, tzinfo=pytz.UTC)])
assert not df.estimated[datetime(2000, 1, 4, tzinfo=pytz.UTC)]
def test_multiple_records(serializer):
records = [
{
"start": datetime(2000, 1, 1, tzinfo=pytz.UTC),
"value": 1,
},
{
"start": datetime(2000, 1, 2, tzinfo=pytz.UTC),
"value": 2,
},
]
df = serializer.to_dataframe(records)
assert df.value[datetime(2000, 1, 1, tzinfo=pytz.UTC)] == 1
assert not df.estimated[datetime(2000, 1, 1, tzinfo=pytz.UTC)]
assert pd.isnull(df.value[datetime(2000, 1, 2, tzinfo=pytz.UTC)])
assert not df.estimated[datetime(2000, 1, 2, tzinfo=pytz.UTC)]
def test_multiple_records(serializer):
records = [
{
"end": datetime(2000, 1, 1, tzinfo=pytz.UTC),
"value": 1,
},
{
"end": datetime(2000, 1, 2, tzinfo=pytz.UTC),
"value": 2,
},
]
df = serializer.to_dataframe(records)
assert df.value[datetime(2000, 1, 1, tzinfo=pytz.UTC)] == 2
assert not df.estimated[datetime(2000, 1, 1, tzinfo=pytz.UTC)]
assert pd.isnull(df.value[datetime(2000, 1, 2, tzinfo=pytz.UTC)])
assert not df.estimated[datetime(2000, 1, 2, tzinfo=pytz.UTC)]