def yield_records(self, sorted_records):
    n = len(sorted_records)
    for i, record in enumerate(sorted_records):
        self.validate_record(record)
        start = record["start"]
        value = record["value"]
        estimated = record.get("estimated", False)
        if i < n - 1:  # all except last record
            yield (start, value, estimated)
        else:  # last record
            end = record.get("end", None)
            if end is None:
                # can't use the value of this record, no end date
                yield (start, np.nan, False)
            else:
                self._validate_record_start_end(record, start, end)
                # provide an end date cap
                if pd.notnull(value):
                    yield (start, value, estimated)
                    yield (end, np.nan, False)
                else:
                    yield (start, np.nan, False)
Python notnull() example source code
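The snippets on this page all revolve around pandas.notnull(). As a quick orientation before the project excerpts, here is a minimal self-contained sketch of its basic behaviour; the data is invented for illustration, and the commented results assume a typical recent pandas release, where replacing missing values with None leaves object-dtype columns holding real None.

import numpy as np
import pandas as pd

# pd.notnull is the element-wise complement of pd.isnull: True for real
# values, False for missing ones (np.nan, None, NaT).
s = pd.Series(["a", np.nan, "c", None])
print(pd.notnull(s).tolist())    # [True, False, True, False]
print(pd.notnull(np.nan))        # False (also works on scalars)

# The idiom used repeatedly below: keep real values, swap missing ones for None.
df = pd.DataFrame({"x": ["1", np.nan], "y": [None, "z"]})
print(df.where(pd.notnull(df), None).values.tolist())
# [['1', None], [None, 'z']]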
def serialize_input(self, input_data):
    ''' Serialize input data
    '''
    return OrderedDict([
        (start.isoformat(), OrderedDict([
            ("energy", row.energy if pd.notnull(row.energy) else None),
            ("tempF", row.tempF if pd.notnull(row.tempF) else None),
        ]))
        for start, row in input_data.iterrows()
    ])
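serialize_input expects a DataFrame indexed by timestamps with energy and tempF columns (only the column names come from the snippet; the rest of this call is an invented sketch). Because the method never touches self, it can be exercised standalone:

from collections import OrderedDict

import numpy as np
import pandas as pd

index = pd.date_range("2017-01-01", periods=2, freq="D")
input_data = pd.DataFrame({"energy": [12.5, np.nan], "tempF": [41.0, 39.5]}, index=index)

# Passing a placeholder for self, since the method is excerpted from a class.
print(serialize_input(None, input_data))
# Keys are ISO timestamps; the NaN energy value on the second day comes through as None.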
def read_and_clean_csv_to_dataframe(filename_or_stream, encoding='utf-8'):
    """
    Reads a UTF-8 encoded CSV directly into a pandas dataframe as string values
    and scrubs np.NaN values to Python None.

    :param str filename_or_stream: path to the CSV file (or a readable stream)
    :param str encoding: text encoding of the file, defaults to utf-8
    :return: pd.DataFrame
    """
    # pulls data in as utf-8, all as strings, and without leading whitespace padding
    try:
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            dtype=str,
            skipinitialspace=True
        )
    except AttributeError:
        # this is an empty dataframe and pandas crashed because it can't coerce the columns to strings
        # issue and PR to fix is open on pandas core at https://github.com/pydata/pandas/issues/12048
        # slated for 1.8 release
        # so for now just try loading the dataframe without specifying dtype
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            skipinitialspace=True
        )
    logging.info('File read via the pandas read_csv methodology.')

    # coerces pandas nulls (of np.NaN type) into python None
    data = data.where((pd.notnull(data)), None)
    # coerces string representations of Python None to a real Python None
    data[data == 'None'] = None
    data[data == ''] = None

    logging.info("Dataframe of shape %s has been retrieved." % str(data.shape))
    return data
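A quick way to sanity-check the None scrubbing above is to round-trip a small in-memory CSV; the sample data below is invented for illustration.

import io
import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)

sample_csv = io.StringIO(
    "name,age,city\n"
    "alice, 30,None\n"
    "bob,,portland\n"
)

df = read_and_clean_csv_to_dataframe(sample_csv)
print(df.values.tolist())
# All values arrive as strings; blank cells and literal 'None' become real Python None:
# [['alice', '30', None], ['bob', None, 'portland']]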
def __init__(self,
             estimates,
             name_map):
    validate_column_specs(
        estimates,
        name_map
    )

    self.estimates = estimates[
        estimates[EVENT_DATE_FIELD_NAME].notnull() &
        estimates[FISCAL_QUARTER_FIELD_NAME].notnull() &
        estimates[FISCAL_YEAR_FIELD_NAME].notnull()
    ]
    self.estimates[NORMALIZED_QUARTERS] = normalize_quarters(
        self.estimates[FISCAL_YEAR_FIELD_NAME],
        self.estimates[FISCAL_QUARTER_FIELD_NAME],
    )

    self.array_overwrites_dict = {
        datetime64ns_dtype: Datetime641DArrayOverwrite,
        float64_dtype: Float641DArrayOverwrite,
    }
    self.scalar_overwrites_dict = {
        datetime64ns_dtype: Datetime64Overwrite,
        float64_dtype: Float64Overwrite,
    }

    self.name_map = name_map
    self._columns = set(name_map.keys())
def update_dataframe_to_be_none_instead_of_nan_for_api_responses(df):
    df = df.where((pd.notnull(df)), None)
    return df
def get_sorted_response(series):
    if series.dropna().empty:
        return NO_DATA_RESPONSE

    # Return an odd sorted-tuple response because sorting in JavaScript is an oddly difficult problem
    # sorted_response = [item for item in series.iteritems()]
    sorted_response = []
    for index, value in series.iteritems():
        if not pd.notnull(value):
            value = None
        data_point = (index, value)
        sorted_response.append(data_point)
    return Response(sorted_response)
def test_api_categorization_sort(app, sort_by):
    n_categories = 2
    dsid, lsi_id, _, ds_input = get_features_lsi_cached(app, n_categories=n_categories)
    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)

    training_set = ds_input['training_set']

    pars = {
        'parent_id': lsi_id,
        'data': training_set,
        'method': 'NearestNeighbor'}

    method = V01 + "/categorization/"
    data = app.post_check(method, json=pars)
    mid = data['id']

    method = V01 + "/categorization/{}/predict".format(mid)
    data = app.get_check(method, json={'batch_id': -1, "sort_by": sort_by})

    res = []
    for row in data['data']:
        res_el = {'document_id': row['document_id']}
        for scores in row['scores']:
            res_el[scores['category']] = scores['score']
        res.append(res_el)
    df = pd.DataFrame(res)
    df = df.set_index('document_id')

    if sort_by in df.columns:
        mask = pd.notnull(df[sort_by])
        assert_array_equal(df[mask].index.values,
                           df[mask].sort_values(sort_by, ascending=False).index.values)
def _prepare_data(self):
    """
    Subset the dataframe to the columns needed for estimation purposes, and add a constant.

    :return: pd.DataFrame
    """
    # Subset the data to the columns used in the model
    data = self.data[self.varlist].copy()
    data = data[pd.notnull(data)].reset_index(drop=True)

    # Map each variable name to a unique variable code, and rename the columns in the data.
    data.rename(columns=self._var_to_symb, inplace=True)

    # Add a constant to the data.
    data["Cons"] = 1

    if self.options["logit"]:
        endog = data["y"]
        uniques = np.unique(endog)
        if len(uniques) != 2:
            raise ValueError(
                "The dependent variable does not have exactly two distinct outcomes. "
                "Please provide another dataset or change the 'logit' option to 0")
        else:
            endog_logit = [0 if i == uniques[0] else 1 for i in endog]
            data["y"] = endog_logit
    return data
def remove_random_nan(pd_obj):
    return pd_obj.where((pd.notnull(pd_obj)), None)
def split_by_component(df):
    df['prim_comp'] = df.Comp.map(lambda s: s[0])
    df['sec_comp'] = df.Comp.map(lambda s: s[-1])
    comps = pd.concat((df[['prim_comp', 'Sp1']], df[['sec_comp', 'Sp2']]))
    prim = comps.loc[comps.prim_comp.notnull()].rename(columns={'Sp1': 'SpT', 'prim_comp': 'comp'})
    sec = comps.loc[comps.sec_comp.notnull()].rename(columns={'Sp2': 'SpT', 'sec_comp': 'comp'})
    return pd.concat((prim, sec))[['comp', 'SpT']].drop_duplicates(subset='comp')
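split_by_component assumes a catalogue with a Comp column naming binary components (e.g. 'AB') plus Sp1/Sp2 spectral-type columns; a toy input, invented here, shows the notnull-based row filtering:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "Comp": ["AB", "AC"],
    "Sp1": ["G2V", "G2V"],
    "Sp2": ["M4V", np.nan],
})
print(split_by_component(df))
# One row per distinct component: A -> G2V, B -> M4V, C -> NaN (duplicates dropped).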
def add_committee():
    df = pandas.DataFrame.from_csv('data/mp-en.csv', header=0, index_col=False)
    df = df.where((pandas.notnull(df)), None)
    MPs = df.to_dict(orient='records')

    for mp in MPs:
        if mp['committee_memberships']:
            committees = [committee.strip() for committee in mp['committee_memberships'].split(',')]
            person_id = utils.hluttaw_to_popitid(mp['identifier__hluttaw'],
                                                 base_url)
            on_behalf_of_id = utils.org_name_to_popitid(mp['group'], base_url)
            for org in committees:
                payload = {}
                payload['person_id'] = person_id
                payload['organization_id'] = utils.org_name_to_popitid(org, base_url)
                payload['on_behalf_of_id'] = on_behalf_of_id
                payload['role'] = 'Committee Member'
                payload['start_date'] = mp['start_date']

                url = base_url + '/en/memberships'
                r = requests.post(url, headers=headers, json=payload)
                print(r.content)
def update_my():
    lang = 'my'
    df = pandas.DataFrame.from_csv('data/mp-my.csv', header=1, index_col=False)
    df = df.where((pandas.notnull(df)), None)
    MPs = df.to_dict(orient='records')

    for mp in MPs:
        hluttaw_id = mp['identifier__hluttaw']
        popit_id = utils.hluttaw_to_popitid(hluttaw_id, base_url)
        print(hluttaw_id)
        print(popit_id)
        if popit_id:
            url = base_url + "/" + lang + "/persons/" + popit_id

            honorific_prefix = mp['honorific_prefix']
            name = mp['name']
            gender = mp['gender']
            national_identity = mp['national_identity']

            payload = {
                'honorific_prefix': honorific_prefix,
                'name': name,
                'gender': gender,
                'national_identity': national_identity,
            }

            r = requests.put(url, headers=headers, json=payload)
            print(r.content)
def not_null(x):
    return notnull(x) and str(x).lower() not in NULL_VALUES
def nan_coerce(x):
    v = str(x)
    if pd.notnull(v) is False or v in NAN_LIST:
        return np.nan
    return x
def remove_line_breaks(x):
    x = (str(x) if pd.notnull(x) else '')
    for b in LINE_BREAKS_LIST_RX:
        x = b.sub(" ", x)
    return string_blank_na(x.lstrip().rstrip())
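The three cleaning helpers above depend on module-level constants (NULL_VALUES, NAN_LIST, LINE_BREAKS_LIST_RX) and a string_blank_na helper that are not shown in this excerpt. The definitions below are only a plausible guess at what that module provides, so the helpers can be run in isolation:

import re

import numpy as np
import pandas as pd
from pandas import notnull  # not_null above calls the bare pandas function

# Hypothetical stand-ins for the constants used by the helpers above.
NULL_VALUES = {"", "null", "none", "nan", "n/a", "na"}
NAN_LIST = ["", "null", "None", "NaN", "N/A"]
LINE_BREAKS_LIST_RX = [re.compile(r"\r\n"), re.compile(r"\n"), re.compile(r"\r")]

def string_blank_na(x):
    # Treat all-whitespace strings as missing.
    return np.nan if isinstance(x, str) and x.strip() == "" else x

print(not_null("N/A"))               # False
print(nan_coerce("null"))            # nan
print(remove_line_breaks("a\r\nb"))  # a b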
test_ols.py source file
project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
author: SignalMedia
def test_longpanel_series_combo(self):
    wp = tm.makePanel()
    lp = wp.to_frame()

    y = lp.pop('ItemA')
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        model = ols(y=y, x=lp, entity_effects=True, window=20)
    self.assertTrue(notnull(model.beta.values).all())
    tm.assertIsInstance(model, PanelOLS)
    model.summary
test_panel.py source file
project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
author: SignalMedia
def test_count(self):
    f = lambda s: notnull(s).sum()
    self._check_stat_op('count', f, obj=self.panel, has_skipna=False)
test_panel.py source file
project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
author: SignalMedia
def test_transpose_copy(self):
    panel = self.panel.copy()
    result = panel.transpose(2, 0, 1, copy=True)
    expected = panel.swapaxes('items', 'minor')
    expected = expected.swapaxes('major', 'minor')
    assert_panel_equal(result, expected)

    panel.values[0, 1, 1] = np.nan
    self.assertTrue(notnull(result.values[1, 0, 1]))
test_panel4d.py source file
project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
author: SignalMedia
def test_count(self):
    f = lambda s: notnull(s).sum()
    self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False)
test_indexing.py source file
project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
author: SignalMedia
def test_setitem_always_copy(self):
    s = self.frame['A'].copy()
    self.frame['E'] = s

    self.frame['E'][5:10] = nan
    self.assertTrue(notnull(s[5:10]).all())