def get_levels(self, name):
"""
Return a set containing all distinct values in the column 'name'.
The values are returned in alphabetical order.
Parameters
----------
name : string
The column name for which the unique values are requested
Returns
-------
levels : list
A unique list of all values that are contained in the specified
data column.
"""
return pd.unique(self._table[name].values.ravel())
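A quick illustration of the behavior the docstring leans on: pd.unique returns values in order of first appearance, not sorted. The frame below is made up for demonstration.

import pandas as pd

df = pd.DataFrame({'species': ['dog', 'cat', 'dog', 'ant']})
print(pd.unique(df['species'].values.ravel()))
# ['dog' 'cat' 'ant'] -- first-appearance order, not alphabetical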
Python unique() usage examples
def wrapper_nms(proposal_df, overlap=0.65):
"""Apply non-max-suppresion to a video batch.
"""
vds_unique = pd.unique(proposal_df['video-name'])
new_proposal_df = []
    for v in vds_unique:
idx = proposal_df['video-name'] == v
p = proposal_df.loc[idx, ['video-name', 'f-init', 'f-end',
'score', 'video-frames']]
loc = np.stack((p['f-init'], p['f-end']), axis=-1)
loc, score = nms_detections(loc, np.array(p['score']), overlap)
n_proposals = score.shape[0]
n_frames = np.repeat(p['video-frames'].mean(), n_proposals).astype(int)
this_df = pd.DataFrame({'video-name': np.repeat(v, n_proposals),
'f-init': loc[:, 0], 'f-end': loc[:, 1],
'score': score,
'video-frames': n_frames})
new_proposal_df.append(this_df)
return pd.concat(new_proposal_df, axis=0)
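A hedged usage sketch of the per-video split performed above, with a toy proposal_df; nms_detections is project-specific, so the sketch stops short of calling it.

import pandas as pd

proposal_df = pd.DataFrame({'video-name': ['v1', 'v1', 'v2'],
                            'f-init': [0, 10, 5],
                            'f-end': [30, 40, 25],
                            'score': [0.9, 0.8, 0.7],
                            'video-frames': [100, 100, 80]})
for v in pd.unique(proposal_df['video-name']):
    per_video = proposal_df.loc[proposal_df['video-name'] == v]
    print(v, len(per_video))  # v1 2, then v2 1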
def get_detected_objects(df, tol=1.0, debug=False):
"""
Takes a summary dataframe with RV information. Finds the median rv for each star,
and removes objects that are more than 'tol' km/s from the median value
    :param df: A summary dataframe, such as created by get_ccf_summary or find_best_pars
    :param tol: The tolerance, in km/s, to accept an observation as detected
    :param debug: If True, print the median RV of each secondary star
    :return: a dataframe containing only detected companions
"""
secondary_names = pd.unique(df.Secondary)
secondary_to_rv = defaultdict(float)
for secondary in secondary_names:
rv = df.loc[df.Secondary == secondary]['rv'].median()
secondary_to_rv[secondary] = rv
if debug:
for secondary in sorted(secondary_to_rv.keys()):
print ('RV for {}: {:.2f} km/s'.format(secondary, secondary_to_rv[secondary]))
keys = df.Secondary.values
good = df.loc[abs(df.rv.values - np.array(itemgetter(*keys)(secondary_to_rv))) < tol]
return good
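The itemgetter(*keys)(mapping) expression above is a compact vectorized dictionary lookup; a self-contained illustration with invented star names:

from operator import itemgetter

secondary_to_rv = {'HR 1234': 10.0, 'HR 5678': -5.0}
keys = ['HR 1234', 'HR 5678', 'HR 1234']
print(itemgetter(*keys)(secondary_to_rv))  # (10.0, -5.0, 10.0)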
def list_stars(self, print2screen=False):
"""
List all of the stars in all of the CCF interfaces
Parameters:
===========
- print2screen: bool
Should we print the stars and dates to screen?
Returns:
=========
- star_list: list
A list of every star in the file, sorted by name.
"""
stars = []
for inst in self._interfaces.keys():
if print2screen:
print('Stars observed with {}: \n============================\n\n'.format(inst))
stars.extend(self._interfaces[inst].list_stars(print2screen=print2screen))
return list(pd.unique(stars))
# Source: test_algos.py, project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia
def test_datetime64_dtype_array_returned(self):
# GH 9431
expected = np.array(['2015-01-03T00:00:00.000000000+0000',
'2015-01-01T00:00:00.000000000+0000'],
dtype='M8[ns]')
dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000+0000',
'2015-01-01T00:00:00.000000000+0000',
'2015-01-01T00:00:00.000000000+0000'])
result = algos.unique(dt_index)
tm.assert_numpy_array_equal(result, expected)
self.assertEqual(result.dtype, expected.dtype)
s = pd.Series(dt_index)
result = algos.unique(s)
tm.assert_numpy_array_equal(result, expected)
self.assertEqual(result.dtype, expected.dtype)
arr = s.values
result = algos.unique(arr)
tm.assert_numpy_array_equal(result, expected)
self.assertEqual(result.dtype, expected.dtype)
def rename_brands(phone_models):
""" recast all phone brands and model as string integers brand_i and model_j """
brands_table = {}
i = 0
for brand in pd.unique(phone_models['phone_brand']):
brands_table[brand] = 'brand_%s' %i
i += 1
models_table = {}
i = 0
for model in pd.unique(phone_models['device_model']):
models_table[model] = 'model_%s' %i
i += 1
converted = []
for item in zip(phone_models['phone_brand'],phone_models['device_model']):
converted.append((brands_table[item[0]],models_table[item[1]]))
phone_models['phone_brand'] = [x[0] for x in converted]
phone_models['device_model'] = [x[1] for x in converted]
return phone_models
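A usage sketch with a toy phone_models frame; the anonymized labels follow first-appearance order because pd.unique does not sort:

import pandas as pd

phone_models = pd.DataFrame({'phone_brand': ['samsung', 'apple', 'samsung'],
                             'device_model': ['s6', 'iphone6', 's7']})
print(rename_brands(phone_models))
#   phone_brand device_model
# 0     brand_0      model_0
# 1     brand_1      model_1
# 2     brand_0      model_2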
def __init__(self, linksfile, ic=None):
df = pd.read_csv(linksfile)
df['link'] = df.i.map(str) + '_' + df.j.map(str) + '_' + df.k.map(str)
df.set_index('link', inplace=True)
self.df = df
# self.T = len(self.df)
SR_stats = pd.read_csv('calvin/data/SR_stats.csv', index_col=0).to_dict()
self.min_storage = SR_stats['min']
self.max_storage = SR_stats['max']
if ic:
self.apply_ic(ic)
# a few network fixes to make things work
self.add_ag_region_sinks()
self.fix_hydropower_lbs()
self.nodes = pd.unique(df[['i','j']].values.ravel()).tolist()
self.links = list(zip(df.i,df.j,df.k))
self.networkcheck() # make sure things aren't broken
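The node list built above flattens the i and j columns before deduplicating; a minimal standalone sketch of that idiom (link names invented):

import pandas as pd

links = pd.DataFrame({'i': ['SR_1', 'SR_1', 'D5'],
                      'j': ['D5', 'C22', 'SNK']})
print(pd.unique(links[['i', 'j']].values.ravel()).tolist())
# ['SR_1', 'D5', 'C22', 'SNK']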
def make_unique_value_each_column(self, df, node_id):
    """
    Count the distinct values of every categorical (non-numeric) column in the
    given DataFrame and return the counts as JSON.
    Args:
        df: DataFrame to inspect
        node_id: nnid of the node being processed
    Returns:
        dict (parsed JSON) mapping each categorical column name to its
        unique-value count, under the key 'unique_cell_feature'
    Raises:
        Exception: re-raised after logging if the inspection fails
    """
try:
data_conf = dict()
column_cate_unique = dict()
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        for i, v in df.dtypes.items():
            if str(v) not in numerics:  # may also need to treat float columns as categorical
                column_cate_unique[i] = df[i].unique().size
data_conf['unique_cell_feature'] = column_cate_unique
data_conf_json_str = json.dumps(data_conf)
data_conf_json = json.loads(data_conf_json_str)
return data_conf_json
    except Exception as e:
        logging.error("make_unique_value_each_column error : {0}".format(e))
        raise e
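On recent pandas the same per-column counts can be collected with select_dtypes; a roughly equivalent sketch of the dtype loop above:

import pandas as pd

df = pd.DataFrame({'city': ['NY', 'LA', 'NY'], 'count': [1, 2, 3]})
print({col: df[col].unique().size for col in df.select_dtypes(exclude='number')})
# {'city': 2}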
def makeTable(df, rowsCol, colsCol, dataCol):
    uniqRowVals = pd.unique(df[rowsCol])
    uniqColVals = pd.unique(df[colsCol])
    out = pd.DataFrame(index=uniqRowVals, columns=uniqColVals)
    for rowVal in uniqRowVals:
        for colVal in uniqColVals:
            rowsMatch = df[rowsCol] == rowVal
            colsMatch = df[colsCol] == colVal
            # Take the first row matching this (row, column) pair
            thisIdx = np.where(rowsMatch & colsMatch)[0][0]
            # .loc avoids the removed .ix indexer; .iloc matches np.where's positional index
            out.loc[rowVal, colVal] = df[dataCol].iloc[thisIdx]
    return out
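For reference, the double loop above is essentially a pivot; on any recent pandas the same table usually comes from one call. Note that DataFrame.pivot raises on duplicate (row, column) pairs, where the loop silently keeps the first match.

import pandas as pd

df = pd.DataFrame({'algo': ['a', 'a', 'b', 'b'],
                   'dataset': ['d1', 'd2', 'd1', 'd2'],
                   'score': [0.1, 0.2, 0.3, 0.4]})
print(df.pivot(index='algo', columns='dataset', values='score'))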
def label_metadata(label_matrix, label_col):
    # Check whether the column is given as an index (number) or a name (string)
    try:
        label_col = int(label_col)
        # If given as a number, translate it into the column's name
        label_col = label_matrix.columns[label_col]
    except ValueError:
        pass
    # Get the unique classes in the given column
    unique_classes = pd.unique(label_matrix[label_col].ravel())
    # Map the n unique classes to the numbers 0..n-1
    label_map = pd.DataFrame({label_col: unique_classes,
                              label_col + '_id': range(len(unique_classes))})
    # Replace the given column's values with the mapped equivalent
    # (iloc avoids the deprecated positional df[[0]] lookup)
    mapped_labels = label_matrix.replace(label_map.iloc[:, [0]].values.tolist(),
                                         label_map.iloc[:, [1]].values.tolist())
    # Return the mapped labels as a 1-D numpy array, plus the label map
    # (the unique classes and their count can be recovered from the map)
    return np.reshape(mapped_labels[[label_col]].values,
                      (mapped_labels.shape[0],)), np.asarray(label_map)
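On modern pandas, pd.factorize produces the same integer encoding in one step; a hedged equivalent of the map-and-replace above:

import pandas as pd

labels = pd.Series(['cat', 'dog', 'cat', 'bird'])
codes, uniques = pd.factorize(labels)
print(codes)    # [0 1 0 2]
print(uniques)  # Index(['cat', 'dog', 'bird'], dtype='object')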
def label_metadata(label_matrix, label_col):
    # Check whether the column is given as an index (number) or a name (string)
    try:
        label_col = int(label_col)
        # If given as a number, translate it into the column's name
        label_col = label_matrix.columns[label_col]
    except ValueError:
        pass
    # Get the unique classes in the given column
    unique_classes = pd.unique(label_matrix[label_col].ravel())
    # Map the n unique classes to the numbers 0..n-1
    label_map = pd.DataFrame({label_col: unique_classes,
                              label_col + '_id': range(len(unique_classes))})
    # Replace the given column's values with the mapped equivalent
    mapped_labels = label_matrix.replace(label_map.iloc[:, [0]].values.tolist(),
                                         label_map.iloc[:, [1]].values.tolist())
    # Return the mapped labels as a DataFrame, plus the label map; unlike the
    # variant above, this keeps the column structure intact
    return mapped_labels[[label_col]], np.asarray(label_map)
def create_subset(src, dest, n=250):
"Given a csv file `src`, create a subset `dest` with `n` unique entities"
df = pd.read_csv(src)
lics = pd.unique(df["License #"])
sublics = lics[random.sample(range(0,len(lics)), n)]
subset = df[df["License #"].isin(sublics)]
# Make the column names a little more readable
subset.columns = map(clean_column_name, subset.columns)
subset.to_csv(dest, index=False)
def convert_categorical(df):
onecol = df.columns[1]
onecol_name = df.columns.values.tolist()[1]
df[onecol] = df[onecol].str.lower()
categories = pd.unique(df[onecol])
categories = [x for x in categories if x is not None]
    try:
        categories.remove(' ')
    except ValueError:
        pass
    categories = [str(x) for x in categories]
    categories = list(set([str.lower(x).strip() for x in categories]))
    # Replace spaces in the middle of words with underscores
    categories = list(set([x.replace(" ", '_') for x in categories]))
featnames = []
for i in range(len(categories)):
if type(categories[i]) is str:
newfeatstr = onecol_name+'_is_' + categories[i]
featnames.append(newfeatstr)
df[newfeatstr] = (df[onecol] == categories[i])
onecol_null = onecol_name + "_is_null"
df[onecol_null] = pd.isnull(df[onecol])
df[onecol_null] = df[onecol_null].astype(float)
df = df.drop(onecol, axis=1)
df[featnames] = df[featnames].astype(float)
df = df.groupby(config_db['id_column'], sort = False, as_index=False)[featnames].max()
return df, featnames
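Most of this hand-rolled one-hot encoding is what pd.get_dummies provides out of the box; a sketch of the library route, with the null indicator added by hand (the groupby aggregation at the end is specific to the project and omitted):

import pandas as pd

s = pd.Series(['red', 'blue', None, 'Red '])
dummies = pd.get_dummies(s.str.lower().str.strip(), prefix='color', prefix_sep='_is_')
dummies['color_is_null'] = s.isnull().astype(float)
print(dummies.astype(float))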
def _validate_layout(func):
def func_wrapper(self):
if self._col_wrap:
if self._col_wrap > 16:
raise VisualizationInvalidLayout
else:
return func(self)
if self._col_factor and len(pd.unique(self._table[self._col_factor].values.ravel())) > 16:
raise VisualizationInvalidLayout
if self._row_factor and len(pd.unique(self._table[self._row_factor].values.ravel())) > 16:
raise VisualizationInvalidLayout
return func(self)
return func_wrapper
def vectorize(f):
from functools import wraps
try:
from pandas import Series, unique
@wraps(f)
        def vectorized_f(x):
# If we're given a scalar value, then simply return it.
if not hasattr(x,'__len__'):
return f(x)
# Get unique values
inputs = unique(x)
outputs = map(f,inputs)
table = dict(zip(inputs,outputs))
result = Series(x).map(table)
return result.values
except ImportError:
def cached_f(x, cache={}):
if x not in cache:
cache[x] = f(x)
return cache[x]
@wraps(f)
        def vectorized_f(x):
# If we're given a scalar value, then simply return it.
if not hasattr(x,'__len__'):
return cached_f(x)
            return list(map(cached_f, x))
return vectorized_f
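Usage sketch: decorating a scalar function so that, when pandas is available, each unique input is computed only once and the results are broadcast back through Series.map.

import numpy as np

@vectorize
def square(x):
    return x * x

print(square(3))                       # scalar path: 9
print(square(np.array([1, 2, 2, 3])))  # [1 4 4 9], computed once per unique value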
def saveLabel(self):
if not len(self.labelFile):
self.labelFile = QtGui.QFileDialog.getSaveFileName(self, 'Save Label File', os.path.expanduser('~'), 'Txt (*.txt)')
if len(self.labelFile):
self.updateLabelsBuf()
if self.labelsBuf is not None:
if self.labels is None:
self.labels = self.labelsBuf
self.labels = self.labels[~self.labels.image.isin(pd.unique(self.labelsBuf.image.ravel()))]
self.labelsBuf = self.labelsBuf[self.labelsBuf.cateid.notnull()]
self.labels = self.labels.append(self.labelsBuf, ignore_index=True)
self.labels.to_csv(self.labelFile, index=False)
self.labelsBuf = self.labelsBuf[self.labelsBuf.image == os.path.basename(self.imgsList[self.ith])]
def add_actual_temperature(df, method='excel', filename='SecondaryStar_Temperatures.xls'):
"""
Add the actual temperature to a given summary dataframe
:param df: The dataframe to which we will add the actual secondary star temperature
:keyword method: How to get the actual temperature. Options are:
- 'spt': Use main-sequence relationships to go from spectral type --> temperature
- 'excel': Use tabulated data, available in the file 'SecondaryStar_Temperatures.xls'
:keyword filename: The filename of the excel spreadsheet containing the literature temperatures.
Needs to have the right format! Ignored if method='spt'
:return: copy of the original dataframe, with an extra column for the secondary star temperature
"""
# First, get a list of the secondary stars in the data
secondary_names = pd.unique(df.Secondary)
secondary_to_temperature = defaultdict(float)
secondary_to_error = defaultdict(float)
if method.lower() == 'spt':
MS = SpectralTypeRelations.MainSequence()
for secondary in secondary_names:
star_data = StarData.GetData(secondary)
            spt = star_data.spectype[0] + re.search(r'[0-9]\.*[0-9]*', star_data.spectype).group()
T_sec = MS.Interpolate(MS.Temperature, spt)
secondary_to_temperature[secondary] = T_sec
elif method.lower() == 'excel':
table = pd.read_excel(filename, 0)
for secondary in secondary_names:
T_sec = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())]['Literature_Temp'].item()
T_error = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())][
'Literature_error'].item()
secondary_to_temperature[secondary] = T_sec
secondary_to_error[secondary] = T_error
df['Tactual'] = df['Secondary'].map(lambda s: secondary_to_temperature[s])
df['Tact_err'] = df['Secondary'].map(lambda s: secondary_to_error[s])
    return df
def fit_sigma(df, i):
"""
Find the largest allowable standard deviation, given the possible values Tactual can take.
"""
Tmeasured, Tactual, _, _ = get_values(df)
Tm = Tmeasured[i]
# Get the possible values, and bin those with this measured value
possible_values = sorted(pd.unique(df.Tactual))
edges = [(possible_values[i] + possible_values[i+1])/2 for i in range(len(possible_values)-1)]
bins = [0] + edges + [9e9]
good = df.loc[df.Temperature == Tm]
    values, _ = np.histogram(good.Tactual.values, bins=bins)
mean = np.mean(good.Tactual.values)
std = np.std(good.Tactual.values, ddof=1)
if std > 0:
return std
    sigma_test = np.arange(500, 10, -10)  # Just test a bunch of values
    idx = np.argmin(abs(np.array(bins) - mean))
x1 = bins[idx-2] if idx > 2 else -1
x2 = bins[idx-1]
x3 = bins[idx]
x4 = bins[idx+1] if idx < len(bins)-2 else np.inf
N = len(good)
probs = [get_probability(x1, x2, x3, x4, N, mean, s) for s in sigma_test]
for s, p in zip(sigma_test, probs):
if p > 0.5:
return s
# If we get here, just return a guess value
return 200.0
#raise ValueError('No probability > 0!')
def read_hdf5(hdf5_file):
"""
Reads the hdf5 file into a dataframe. Assumes a very specific format!
Parameters:
===========
- hdf5_file: string
The full path to the hdf5 file.
    Returns:
    ========
A pandas DataFrame containing summary information
"""
logging.info('Reading HDF5 file {}'.format(hdf5_file))
hdf5_int = HDF5_Interface(hdf5_file)
df = hdf5_int.to_df()
# Get the contrast. Split by group and then merge to limit the amount of calculation needed
logging.info('Estimating the V-band contrast ratio for each trial')
test_vsini = df.vsini.unique()[0]
temp = df.loc[(df.rv == 0) & (df.vsini == test_vsini)].drop_duplicates(subset=['star', 'temperature'])
temp['contrast'] = temp.apply(lambda r: get_contrast(r, band='V'), axis=1)
logging.info('Estimating the luminosity ratio for each trial')
temp['lum_ratio'] = temp.apply(get_luminosity_ratio, axis=1)
logging.info('Re-merging dataframe')
df = pd.merge(df, temp[['star', 'temperature', 'contrast', 'lum_ratio']], on=['star', 'temperature'], how='left')
df['logL'] = np.log10(df.lum_ratio)
return df
def parse_input(inp, sort_output=True, ensure_unique=True):
"""
Parse the user input to get a list of integers.
Parameters:
===========
- inp: string
Can be in the form 'a-b', 'a,b,c', 'a-b,c-d', etc.
'-' means an inclusive list of every number between a and b
',' means the numbers a and b
- sort_output: boolean
Sort the output integers?
- ensure_unique: boolean
Make sure the final list has no repeats?
:return: A list of integers
"""
sublists = inp.split(',')
final_list = []
for l in sublists:
if '-' in l:
first, last = l.split('-')
for i in range(int(first), int(last) + 1):
final_list.append(i)
else:
final_list.append(int(l))
    if ensure_unique:
        final_list = pd.unique(final_list).tolist()
if sort_output:
final_list = sorted(final_list)
return final_list
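Usage example with the default sorting and de-duplication:

print(parse_input('1-3,7,2'))  # [1, 2, 3, 7]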
def get_ccf(self, params, df=None):
"""
Get the ccf with the given parameters.
Parameters:
===========
- params: dictionary:
All the parameters necessary to define a single ccf. This should be
a python dictionary with the keys:
- 'starname': The name of the star. Try self.list_stars() for the options.
- 'date': The UT date of the observations. Try self.list_dates() for the options.
- 'T': temperature of the model
- 'logg': the log(g) of the model
- 'vsini': the vsini by which the model was broadened before correlation
- '[Fe/H]': the metallicity of the model
- 'addmode': The way the order CCFs were added to make a total one. Can be:
- 'simple'
- 'ml'
- 'weighted'
- 'dc'
- df: a pandas DataFrame such as outputted by _compile_data
Returns:
========
    - ccf: pandas DataFrame
Holds columns of velocity and CCF power
"""
if df is None:
try:
df = self._compile_data(params['starname'], params['date'])
except KeyError:
raise KeyError('Must give get_ccf params with starname and date keywords, if df is not given!')
Tvals = df['T'].unique()
T = Tvals[np.argmin(abs(Tvals - params['T']))]
good = df.loc[(df['T'] == T) & (df.logg == params['logg']) & (df.vsini == params['vsini']) \
& (df['[Fe/H]'] == params['[Fe/H]']) & (df.addmode == params['addmode'])]
return pd.DataFrame(data={'velocity': self.velocities, 'CCF': good['ccf'].item()})
# Source: test_algos.py, project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia
def test_ints(self):
arr = np.random.randint(0, 100, size=50)
result = algos.unique(arr)
tm.assertIsInstance(result, np.ndarray)
# Source: test_algos.py, project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia
def test_objects(self):
arr = np.random.randint(0, 100, size=50).astype('O')
result = algos.unique(arr)
tm.assertIsInstance(result, np.ndarray)
# Source: test_algos.py, project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia
def test_object_refcount_bug(self):
lst = ['A', 'B', 'C', 'D', 'E']
for i in range(1000):
len(algos.unique(lst))
# Source: test_algos.py, project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia
def test_on_index_object(self):
mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(
np.arange(5), 5)])
expected = mindex.values
expected.sort()
mindex = mindex.repeat(2)
result = pd.unique(mindex)
result.sort()
tm.assert_almost_equal(result, expected)
# Source: test_algos.py, project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia
def test_unique_label_indices():
from pandas.hashtable import unique_label_indices
a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')
left = unique_label_indices(a)
right = np.unique(a, return_index=True)[1]
tm.assert_numpy_array_equal(left, right)
a[np.random.choice(len(a), 10)] = -1
left = unique_label_indices(a)
right = np.unique(a, return_index=True)[1][1:]
tm.assert_numpy_array_equal(left, right)
def __init__(self, data=None, groups=None, **kwargs):
super().__init__(data=data, **kwargs)
if groups is not None:
self.plydata_groups = list(pd.unique(groups))
def _n_distinct(arr):
"""
Number of unique values in array
"""
return len(pd.unique(arr))
# Source: test_chamber_of_deputies_dataset.py, project: serenata-toolbox, author: datasciencebr
def test_clean_2017_reimbursements(self):
copy(os.path.join(self.fixtures_path, 'reimbursements-2017.xz'), self.path)
file_path = os.path.join(self.path, 'reimbursements.xz')
self.subject.clean()
assert(os.path.exists(file_path))
dataset = pd.read_csv(file_path, compression='xz')
all_subquotas = [subquota[1] for subquota in self.subject.subquotas]
present_subquotas = pd.unique(dataset['subquota_description'])
for subquota in present_subquotas:
with self.subTest():
assert(subquota in all_subquotas)