def test_valid_but_messy_file(self):
index = pd.Index(
['SEQUENCE1', 'seq2'], name='Feature ID', dtype=object)
exp = pd.DataFrame([['k__Bar; p__Baz', 'foo'],
['some; taxonomy; for; ya', 'bar baz']],
index=index, columns=['Taxon', 'Extra Column'],
dtype=object)
# has_header=None (default)
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'valid-but-messy.tsv')))
assert_frame_equal(obs, exp)
# has_header=True
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'valid-but-messy.tsv')),
has_header=True)
assert_frame_equal(obs, exp)
# Example source code using Python's pandas Index() class
def test_headerless(self):
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
columns = ['Taxon', 'Unnamed Column 1', 'Unnamed Column 2']
exp = pd.DataFrame([['k__Foo; p__Bar', 'some', 'another'],
['k__Foo; p__Baz', 'column', 'column!']],
index=index, columns=columns, dtype=object)
# has_header=None (default)
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'headerless.tsv')))
assert_frame_equal(obs, exp)
# has_header=False
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'headerless.tsv')),
has_header=False)
assert_frame_equal(obs, exp)
# In-depth testing of the `_dataframe_to_tsv_taxonomy_format` helper function,
# which does the heavy lifting for the transformers.
def find_missing_products():
    # Paths point at the original author's local copy of the Bimbo Inventory Demand data.
    train = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/train.csv')
    train_ids = train['Producto_ID'].unique()
    test = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/test.csv')
    test_ids = test['Producto_ID'].unique()

    # Product IDs that occur in the test set but never in the training set.
    missing_ids = pd.Index(test_ids).difference(pd.Index(train_ids))
    print("missing ID count", len(missing_ids))

    missing_ids_df = pd.DataFrame(missing_ids, columns=["Producto_ID"])
    missing_ids_df.to_csv('missing_ids.csv', index=False)

    entries_with_missing = pd.merge(test, missing_ids_df, on='Producto_ID')
    print("missing entries =", entries_with_missing.shape[0],
          "percentage =", entries_with_missing.shape[0] * 100 / test.shape[0])
    print("full entries count", test.shape[0])
# Source: generic.py, project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia
def at_time(self, time, asof=False):
"""
Select values at particular time of day (e.g. 9:30AM).
Parameters
----------
time : datetime.time or string
Returns
-------
values_at_time : type of caller
"""
try:
indexer = self.index.indexer_at_time(time, asof=asof)
return self.take(indexer, convert=False)
except AttributeError:
raise TypeError('Index must be DatetimeIndex')
# Source: generic.py (same project)
def between_time(self, start_time, end_time, include_start=True,
include_end=True):
"""
Select values between particular times of the day (e.g., 9:00-9:30 AM).
Parameters
----------
start_time : datetime.time or string
end_time : datetime.time or string
include_start : boolean, default True
include_end : boolean, default True
Returns
-------
values_between_time : type of caller
"""
try:
indexer = self.index.indexer_between_time(
start_time, end_time, include_start=include_start,
include_end=include_end)
return self.take(indexer, convert=False)
except AttributeError:
raise TypeError('Index must be DatetimeIndex')
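# Both at_time and between_time require a DatetimeIndex. A rough usage sketch
# (times and values invented), exercising the same keyword arguments:
import pandas as pd

# One morning at minute frequency; values are just a counter.
demo_idx = pd.date_range('2014-01-06 09:00', periods=120, freq='min')
demo_ts = pd.Series(range(120), index=demo_idx)

demo_ts.at_time('09:30')                                    # the single 09:30 observation
demo_ts.between_time('09:00', '09:30')                      # 09:00 through 09:30 inclusive
demo_ts.between_time('09:00', '09:30', include_end=False)   # drop the 09:30 endpoint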
# Source: common.py (same project)
def _isnull_old(obj):
"""Detect missing values. Treat None, NaN, INF, -INF as null.
Parameters
----------
obj : ndarray or object value
Returns
-------
boolean ndarray or boolean
"""
if lib.isscalar(obj):
return lib.checknull_old(obj)
# hack (for now) because MI registers as ndarray
elif isinstance(obj, pd.MultiIndex):
raise NotImplementedError("isnull is not defined for MultiIndex")
elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
return _isnull_ndarraylike_old(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isnull(func=_isnull_old))
elif isinstance(obj, list) or hasattr(obj, '__array__'):
return _isnull_ndarraylike_old(np.asarray(obj))
else:
return obj is None
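# Unlike the default check, this "_old" variant also counts infinities as
# missing; that is the behaviour pandas of this era exposed via the
# mode.use_inf_as_null option. A hedged sketch of the user-visible difference,
# without touching the private helper:
import numpy as np
import pandas as pd

demo_arr = np.array([1.0, np.nan, np.inf, -np.inf, 5.0])

# Default behaviour: only NaN counts as missing.
print(pd.isnull(demo_arr))                      # [False  True False False False]

# The inf-aware behaviour that _isnull_old implements internally.
print(np.isnan(demo_arr) | np.isinf(demo_arr))  # [False  True  True  True False]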
# Source: test_timeseries.py (same project)
def test_period_resample_with_local_timezone_pytz(self):
# GH5430
tm._skip_if_no_pytz()
import pytz
local_timezone = pytz.timezone('America/Los_Angeles')
start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
tzinfo=pytz.utc)
# 1 day later
end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
tzinfo=pytz.utc)
index = pd.date_range(start, end, freq='H')
series = pd.Series(1, index=index)
series = series.tz_convert(local_timezone)
result = series.resample('D', kind='period').mean()
# Create the expected series
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
expected = pd.Series(1, index=expected_index)
assert_series_equal(result, expected)
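# The "- 1" on expected_index is the subtle part: the hourly UTC stamps run from
# 2013-11-01 00:00 to 2013-11-02 00:00 UTC, but converted to US Pacific they fall
# on Oct 31 and Nov 1, one calendar day earlier than period_range(start, end)
# alone would give. A small check of that shift, using the same dates:
import pandas as pd
import pytz
from datetime import datetime

demo_start = datetime(2013, 11, 1, tzinfo=pytz.utc)
demo_local = pd.Timestamp(demo_start).tz_convert('America/Los_Angeles')
print(demo_local)                  # 2013-10-31 17:00:00-07:00 (previous calendar day)
print(demo_local.to_period('D'))   # 2013-10-31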
# Source: test_timeseries.py (same project)
def test_period_resample_with_local_timezone_dateutil(self):
# GH5430
tm._skip_if_no_dateutil()
import dateutil
local_timezone = 'dateutil/America/Los_Angeles'
start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
tzinfo=dateutil.tz.tzutc())
# 1 day later
end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
tzinfo=dateutil.tz.tzutc())
index = pd.date_range(start, end, freq='H')
series = pd.Series(1, index=index)
series = series.tz_convert(local_timezone)
result = series.resample('D', kind='period').mean()
# Create the expected series
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
expected = pd.Series(1, index=expected_index)
assert_series_equal(result, expected)
# Source: test_timeseries.py (same project)
def test_dayfirst(self):
# GH 5917
arr = ['10/02/2014', '11/02/2014', '12/02/2014']
expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11),
datetime(2014, 2, 12)])
idx1 = DatetimeIndex(arr, dayfirst=True)
idx2 = DatetimeIndex(np.array(arr), dayfirst=True)
idx3 = to_datetime(arr, dayfirst=True)
idx4 = to_datetime(np.array(arr), dayfirst=True)
idx5 = DatetimeIndex(Index(arr), dayfirst=True)
idx6 = DatetimeIndex(Series(arr), dayfirst=True)
self.assertTrue(expected.equals(idx1))
self.assertTrue(expected.equals(idx2))
self.assertTrue(expected.equals(idx3))
self.assertTrue(expected.equals(idx4))
self.assertTrue(expected.equals(idx5))
self.assertTrue(expected.equals(idx6))
# Source: test_timeseries.py (same project)
def test_to_datetime_format(self):
values = ['1/1/2000', '1/2/2000', '1/3/2000']
results1 = [Timestamp('20000101'), Timestamp('20000201'),
Timestamp('20000301')]
results2 = [Timestamp('20000101'), Timestamp('20000102'),
Timestamp('20000103')]
for vals, expecteds in [(values, (Index(results1), Index(results2))),
(Series(values),
(Series(results1), Series(results2))),
(values[0], (results1[0], results2[0])),
(values[1], (results1[1], results2[1])),
(values[2], (results1[2], results2[2]))]:
for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']):
result = to_datetime(vals, format=fmt)
expected = expecteds[i]
if isinstance(expected, Series):
assert_series_equal(result, Series(expected))
elif isinstance(expected, Timestamp):
self.assertEqual(result, expected)
else:
self.assertTrue(result.equals(expected))
# Source: test_base.py (same project)
def test_asobject_tolist(self):
idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx')
expected_list = [Timedelta('1 days'), Timedelta('2 days'),
Timedelta('3 days'), Timedelta('4 days')]
expected = pd.Index(expected_list, dtype=object, name='idx')
result = idx.asobject
self.assertTrue(isinstance(result, Index))
self.assertEqual(result.dtype, object)
self.assertTrue(result.equals(expected))
self.assertEqual(result.name, expected.name)
self.assertEqual(idx.tolist(), expected_list)
idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT,
timedelta(days=4)], name='idx')
expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT,
Timedelta('4 days')]
expected = pd.Index(expected_list, dtype=object, name='idx')
result = idx.asobject
self.assertTrue(isinstance(result, Index))
self.assertEqual(result.dtype, object)
self.assertTrue(result.equals(expected))
self.assertEqual(result.name, expected.name)
self.assertEqual(idx.tolist(), expected_list)
def prior_values_index(self):
index_values = list(
product(
(freq.freq_str for freq in self.unique_frequencies),
# Only store prior values for forward-fillable fields.
self.ffillable_fields,
)
)
if index_values:
return pd.MultiIndex.from_tuples(index_values)
else:
# MultiIndex doesn't gracefully support empty input, so we return
# an empty regular Index when there are no values.
return pd.Index(index_values)
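# The fallback exists because, as the comment notes, MultiIndex.from_tuples does
# not accept an empty list in the pandas this was written against. Both branches
# sketched with invented frequency/field names:
import pandas as pd
from itertools import product

demo_freqs = ['1d', '1m']            # hypothetical freq_str values
demo_fields = ['price', 'volume']    # hypothetical forward-fillable fields

demo_values = list(product(demo_freqs, demo_fields))
print(pd.MultiIndex.from_tuples(demo_values).tolist())
# [('1d', 'price'), ('1d', 'volume'), ('1m', 'price'), ('1m', 'volume')]

# With nothing to forward-fill the product is empty, so fall back to a flat Index.
print(pd.Index([]))   # Index([], dtype='object')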
def add_sids(self, to_add):
"""
Add new sids to the container.
"""
self.sids = pd.Index(
sorted(self.sids.union(_ensure_index(to_add))),
)
self._realign_sids()
def drop_sids(self, to_drop):
"""
Remove sids from the container.
"""
self.sids = pd.Index(
sorted(self.sids.difference(_ensure_index(to_drop))),
)
self._realign_sids()
def _ensure_index(x):
if not isinstance(x, pd.Index):
x = pd.Index(sorted(x))
return x
def test_df_of_assets_as_input(self):
algo = TestRegisterTransformAlgorithm(
sim_params=self.sim_params,
env=TradingEnvironment(), # new env without assets
)
df = self.df.copy()
df.columns = pd.Index(map(Equity, df.columns))
algo.run(df)
assert isinstance(algo.sources[0], DataFrameSource)
def index(self):
"""Return dask Index instance"""
name = self._name + '-index'
dsk = {(name, i): (getattr, key, 'index')
for i, key in enumerate(self._keys())}
return Index(merge(dsk, self.dask), name,
self._meta.index, self.divisions)
def _daskify(obj, npartitions=None, chunksize=None):
"""Convert input to a dask-gdf object.
"""
npartitions = npartitions or 1
if isinstance(obj, _Frame):
return obj
elif isinstance(obj, (pd.DataFrame, pd.Series, pd.Index)):
return _daskify(dd.from_pandas(obj, npartitions=npartitions))
elif isinstance(obj, (gd.DataFrame, gd.Series, gd.index.Index)):
return from_pygdf(obj, npartitions=npartitions)
elif isinstance(obj, (dd.DataFrame, dd.Series, dd.Index)):
return from_dask_dataframe(obj)
else:
raise TypeError("type {} is not supported".format(type(obj)))
def concat(objs):
"""Concantenate dask gdf objects
Parameters
----------
objs : sequence of DataFrame, Series, Index
A sequence of objects to be concatenated.
"""
objs = [_daskify(x) for x in objs]
meta = gd.concat(_extract_meta(objs))
name = "concat-" + uuid4().hex
dsk = {}
divisions = [0]
base = 0
lastdiv = 0
for obj in objs:
for k, i in obj._keys():
dsk[name, base + i] = k, i
base += obj.npartitions
divisions.extend([d + lastdiv for d in obj.divisions[1:]])
lastdiv = obj.divisions[-1]
dasks = [o.dask for o in objs]
dsk = merge(dsk, *dasks)
return new_dd_object(dsk, name, meta, divisions)
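# The non-obvious part of concat() is the bookkeeping that offsets each input's
# partition keys and stitches the divisions together; the same loop traced on
# two hypothetical 2-partition inputs with integer row labels:
demo_a = [0, 4, 8]
demo_b = [0, 3, 6]

demo_divisions, demo_last = [0], 0
for divs in (demo_a, demo_b):
    demo_divisions.extend(d + demo_last for d in divs[1:])
    demo_last = divs[-1]

print(demo_divisions)   # [0, 4, 8, 11, 14]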
def _get_return_type(meta):
if isinstance(meta, gd.Series):
return Series
elif isinstance(meta, gd.DataFrame):
return DataFrame
elif isinstance(meta, gd.index.Index):
return Index
return Scalar