def transform(self, x):
"""
Parameters:
        x (Sequence): - values to be discretized into self.bins
    Returns:
        np.array: - each value's bin label, returned as a numpy array of strings
"""
s = pd.cut(x, bins=self.bins)
d = pd.get_dummies(s)
z = d.T.to_dict()
re = []
for i, v in z.items():
for j, u in v.items():
if u == 1:
re.append(str(j))
return np.array(re)
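# Usage sketch (added for illustration, not from the original source): the same
# pd.cut + pd.get_dummies round trip as transform() above, with hypothetical bin
# edges, showing that the dict walk just recovers each value's interval label.
import numpy as np
import pandas as pd

values = [1, 7, 12, 25]
bins = [0, 5, 10, 20, 30]                 # assumed bin edges (self.bins in the class)
dummies = pd.get_dummies(pd.cut(values, bins=bins))
labels = np.array([str(dummies.columns[row.argmax()]) for row in dummies.values])
print(labels)                             # ['(0, 5]' '(5, 10]' '(10, 20]' '(20, 30]']
# Equivalent shortcut: np.array(pd.cut(values, bins=bins).astype(str))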
def data_preprocess(train,test):
outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
train.drop(train.index[outlier_idx],inplace=True)
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
test.loc[:,'MSSubClass':'SaleCondition']))
to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
all_data = all_data.drop(to_delete,axis=1)
train["SalePrice"] = np.log1p(train["SalePrice"])
#log transform skewed numeric features
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
return X_train,X_test,y
def data_preprocess(train, test):
outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
train.drop(train.index[outlier_idx], inplace=True)
all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
test.loc[:, 'MSSubClass':'SaleCondition']))
to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
all_data = all_data.drop(to_delete, axis=1)
train["SalePrice"] = np.log1p(train["SalePrice"])
# log transform skewed numeric features
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(method='ffill')
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
return X_train, X_test, y
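# Usage sketch (added; assumes the Kaggle "House Prices" train.csv / test.csv files
# and the imports the preprocessors above rely on: numpy, pandas, scipy.stats.skew).
import numpy as np
import pandas as pd
from scipy.stats import skew

train = pd.read_csv('train.csv')          # hypothetical paths
test = pd.read_csv('test.csv')
X_train, X_test, y = data_preprocess(train, test)
print(X_train.shape, X_test.shape, y.shape)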
def generator_input(input_file, chunk_size):
"""Generator function to produce features and labels
needed by keras fit_generator.
"""
input_reader = pd.read_csv(tf.gfile.Open(input_file[0]),
names=CSV_COLUMNS,
chunksize=chunk_size,
na_values=" ?")
for input_data in input_reader:
input_data = input_data.dropna()
label = pd.get_dummies(input_data.pop(LABEL_COLUMN))
input_data = to_numeric_features(input_data)
n_rows = input_data.shape[0]
return ( (input_data.iloc[[index % n_rows]], label.iloc[[index % n_rows]]) for index in itertools.count() )
def next_batch(df, i=None):
"""
:param df: pandas dataframe
:param i: batch index
:return: (numpy array x, numpy array y)
"""
if i is None:
start = 0
end = df.shape[0]
else:
start = BATCH_SIZE * i
end = BATCH_SIZE * (i + 1)
result = df[start:end]
if "Survived" in result:
        batch_ys = pd.get_dummies(result.pop('Survived').values).to_numpy()
        batch_xs = result.to_numpy()
return batch_xs, batch_ys
else:
        return result.to_numpy()
def transform(self, X, y=None):
"""Dummy encode the categorical columns in X
Parameters
----------
X : pd.DataFrame or dd.DataFrame
y : ignored
Returns
-------
transformed : pd.DataFrame or dd.DataFrame
Same type as the input
"""
if not X.columns.equals(self.columns_):
raise ValueError("Columns of 'X' do not match the training "
"columns. Got {!r}, expected {!r}".format(
X.columns, self.columns
))
if isinstance(X, pd.DataFrame):
return pd.get_dummies(X, drop_first=self.drop_first)
elif isinstance(X, dd.DataFrame):
return dd.get_dummies(X, drop_first=self.drop_first)
else:
raise TypeError("Unexpected type {}".format(type(X)))
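# Illustration (added, not part of the encoder above): what the drop_first flag
# changes in the pd.get_dummies call that transform() dispatches to.
import pandas as pd

df = pd.DataFrame({'color': ['red', 'green', 'blue', 'green']})
print(pd.get_dummies(df).columns.tolist())
# ['color_blue', 'color_green', 'color_red']
print(pd.get_dummies(df, drop_first=True).columns.tolist())
# ['color_green', 'color_red']  -- first level dropped to avoid collinearity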
def input_fn(df):
"""Format the downloaded data."""
# Creates a dictionary mapping from each continuous feature column name (k)
# to the values of that column stored in a constant Tensor.
continuous_cols = [df[k].values for k in CONTINUOUS_COLUMNS]
X_con = np.stack(continuous_cols).astype(np.float32).T
# Standardise
X_con -= X_con.mean(axis=0)
X_con /= X_con.std(axis=0)
# Creates a dictionary mapping from each categorical feature column name
categ_cols = [np.where(pd.get_dummies(df[k]).values)[1][:, np.newaxis]
for k in CATEGORICAL_COLUMNS]
n_values = [np.amax(c) + 1 for c in categ_cols]
X_cat = np.concatenate(categ_cols, axis=1).astype(np.int32)
# Converts the label column into a constant Tensor.
label = df[LABEL_COLUMN].values[:, np.newaxis]
# Returns the feature columns and the label.
return X_con, X_cat, n_values, label
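# Sketch of the trick used on the categ_cols line above (added for illustration):
# np.where on a one-hot matrix recovers integer category codes, one per row.
# Toy column only, not the real census data.
import numpy as np
import pandas as pd

col = pd.Series(['a', 'b', 'a', 'c'])
codes = np.where(pd.get_dummies(col).values)[1]
print(codes)        # [0 1 0 2] -- same as col.astype('category').cat.codes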
def replay(self):
"""Memory Management and training of the agent
"""
if len(self.memory) < self.batch_size:
return
state, action, reward, next_state, done = self._get_batches()
reward += (self.gamma
* np.logical_not(done)
* np.amax(self.model.predict(next_state), axis=1))
q_target = self.target_model.predict(state)
    action_series = pd.Series(action)
    one_hot = pd.get_dummies(action_series).to_numpy()
action_batch = np.where(one_hot == 1)
q_target[action_batch] = reward
return self.model.fit(state, q_target,
batch_size=self.batch_size,
epochs=1,
verbose=False)
def make_date_columns_categorical_binary(book_attributes):
"""Turn all date columns in book_attributes into binary categorical columns."""
# bucket publish dates & insert categorical data columns into data frame
orig_pub_year_cat = transform_pub_dates(book_attributes['original_pub_year'])
book_attributes.insert(loc=5, column='orig_pub_year_cat', value=orig_pub_year_cat)
pub_year_cat = transform_pub_dates(book_attributes['pub_year'])
book_attributes.insert(loc=5, column='pub_year_cat', value=pub_year_cat)
# turn date categories into binary dataframes; merge back into book_attributes
pub_year_dummies = pd.get_dummies(book_attributes['pub_year_cat'])
orig_year_dummies = pd.get_dummies(book_attributes['orig_pub_year_cat'])
book_full_attr = book_attributes.merge(pub_year_dummies,left_index=True, right_index=True)
book_full_attr = book_full_attr.merge(orig_year_dummies,left_index=True, right_index=True)
return book_full_attr
def load_user_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00'):
'''
    Per-user action counts in [start_date, end_date), broken down by action type.
'''
dump_path = './cache/user_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date=start_date, end_date=end_date, field=['user_id', 'time', 'type'])
prefix = 'Action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
type_dummies = pd.get_dummies(df['type'], prefix=prefix)
df = pd.concat([df, type_dummies], axis=1)
drop_cols = ['time', 'type']
df.drop(drop_cols, axis=1, inplace=True)
df = df.groupby(['user_id'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
return df
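# Sketch of the counting pattern shared by this loader and the pair loaders below
# (added for illustration; toy rows, not the project's real action log): one-hot the
# action type, then groupby + sum to get one count column per type.
import pandas as pd

log = pd.DataFrame({'user_id': [1, 1, 2, 2, 2],
                    'type':    [1, 2, 1, 1, 4]})
type_dummies = pd.get_dummies(log['type'], prefix='Action_cnt')
counts = (pd.concat([log[['user_id']], type_dummies], axis=1)
            .groupby('user_id', as_index=False).sum())
print(counts)       # one row per user_id, columns Action_cnt_1 / Action_cnt_2 / Action_cnt_4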
def load_base_user_feat(end_date='2016-04-16'):
'''
    Basic user features (user level and days since registration).
'''
dump_path = './cache/base_user_feat_{0}.pkl'.format(end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = pd.read_csv(USER_FILE, encoding='gbk')
# sex_dummies = pd.get_dummies(df.sex, prefix='sex')
df.user_reg_tm.fillna('2016-02-01', inplace=True)
df.user_reg_tm = pd.to_datetime(df.user_reg_tm).apply(lambda t: pd.to_datetime('2016-02-01') if t > pd.to_datetime('2016-04-15') else t)
df['reg_tm_dist'] = df.user_reg_tm.apply(lambda t: (pd.to_datetime(end_date) - t).days)
df = df[['user_id', 'user_lv_cd', 'reg_tm_dist']]
# df = pd.concat([df, sex_dummies], axis=1)
# age_dummies = pd.get_dummies(df.age, prefix='age')
# N = age_dummies.shape[1]
# age_dummies.columns = ['age_{0}'.format(i) for i in range(N)]
# df = pd.concat([df, age_dummies], axis=1)
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
return df
def load_UIPair_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', actions= [1,2,3,4,5,6]):
'''
    Action counts for each user-item (UI) pair, broken down by action type.
'''
dump_path = './cache/UIPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'sku_id', 'cate', 'type'])
prefix = 'UIPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
type_dummies = pd.get_dummies(df['type'], prefix=prefix)
df = pd.concat([df, type_dummies], axis=1)
df.drop(['type'], axis=1, inplace=True)
df = df.groupby(['user_id', 'sku_id', 'cate'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
actions.sort()
rt_cols = ['user_id', 'sku_id', 'cate']
rt_cols.extend(['UIPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
df = df[rt_cols]
return df
def load_UCPair_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', actions=[1,2,3,4,5,6]):
'''
    Action counts for each user-category (UC) pair, broken down by action type.
'''
dump_path = './cache/UCPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'type', 'cate'])
prefix = 'UCPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
type_dummies = pd.get_dummies(df['type'], prefix=prefix)
df = pd.concat([df, type_dummies], axis=1)
df = df.groupby(['user_id', 'cate'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
actions.sort()
rt_cols = ['user_id', 'cate']
rt_cols.extend(['UCPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
df = df[rt_cols]
return df
def load_base_item_feat(end_date = '2016/4/16'):
'''
    Basic item features from the latest comment snapshot before end_date.
'''
JComment = pd.read_csv(COMMENT_FILE, encoding='gbk')
end_date = pd.to_datetime(end_date)
JComment.dt = pd.to_datetime(JComment.dt)
dts = JComment.dt.drop_duplicates()
dts.sort_index(inplace=True, ascending=False)
for dt in dts.iteritems():
if dt[-1] < end_date:
break
JComment = JComment[JComment.dt == dt[-1]].drop(['dt'], axis=1)
Comment_num_dummies = pd.get_dummies(JComment.comment_num, prefix='Comment_num')
JComment = pd.concat([JComment, Comment_num_dummies], axis=1)
return JComment.drop(['comment_num'], axis=1)
def load_item_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', actions=[1,2,3,4,5,6]):
'''
    Action counts for each item (sku), broken down by action type.
'''
dump_path = './cache/item_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date = start_date, end_date = end_date, field=['sku_id', 'type'])
prefix = 'item_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
type_dummies = pd.get_dummies(df['type'], prefix=prefix)
df = pd.concat([df, type_dummies], axis=1)
df.drop(['type'], axis=1, inplace=True)
df = df.groupby(['sku_id'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
rt_cols = ['sku_id']
rt_cols.extend(['item_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
df = df[rt_cols]
return df
def load_UBPair_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-01 00:00:00', actions = [1,2,3,4,5,6]):
'''
    Action counts for each user-brand (UB) pair, broken down by action type.
'''
dump_path = './cache/UBPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'brand', 'type'])
prefix = 'UBPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
type_dummies = pd.get_dummies(df.type, prefix=prefix)
df = pd.concat([df, type_dummies], axis=1)
df.drop(['type'], axis=1, inplace=True)
df = df.groupby(['user_id', 'brand'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
rt_cols = ['user_id', 'brand']
rt_cols.extend(['UBPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
df = df[rt_cols]
return df
def load_BCPair_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', actions=[1,2,3,4,5,6]):
'''
    Action counts for each brand-category (BC) pair, broken down by action type.
'''
dump_path = './cache/BCPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date = start_date, end_date = end_date, field=['brand', 'cate', 'type'])
prefix = 'BCPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
type_dummies = pd.get_dummies(df.type, prefix=prefix)
df = pd.concat([df.drop(['type'], axis=1), type_dummies], axis=1)
df = df.groupby(['brand', 'cate'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
rt_cols = ['brand', 'cate']
rt_cols.extend(['BCPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
df = df[rt_cols]
return df
def load_user_act_cnt_with_timeZone(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00'):
'''
    Per-user action counts broken down by time zone.
'''
dump_path = './cache/user_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'time_zone'])
timeZone_dummies = pd.get_dummies(df.time_zone, prefix='time_zone_cnt')
df = pd.concat([df, timeZone_dummies], axis=1)
df.drop(['time_zone'], axis=1, inplace=True)
df = df.groupby(['user_id'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
return df
def load_UCPair_act_cnt_with_timeZone(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', cate=[8]):
'''
    Action counts for each user-category (UC) pair, broken down by time zone.
'''
dump_path = './cache/UCPair_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'time_zone', 'cate'])
timeZone_dummies = pd.get_dummies(df.time_zone, prefix='uc_time_zone_cnt')
df = pd.concat([df, timeZone_dummies], axis=1)
df.drop(['time_zone'], axis=1, inplace=True)
df = df.groupby(['user_id', 'cate'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
df = df[df.cate.isin(cate)]
return df
def load_UIPair_act_cnt_with_timeZone(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', cate=[8]):
'''
    Action counts for each user-item (UI) pair, broken down by time zone.
'''
dump_path = './cache/UIPair_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
if os.path.exists(dump_path):
with open(dump_path, 'rb') as f:
df = pickle.load(f)
else:
df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'time_zone', 'sku_id'])
timeZone_dummies = pd.get_dummies(df.time_zone, prefix='time_zone_cnt')
df = pd.concat([df, timeZone_dummies], axis=1)
df.drop(['time_zone'], axis=1, inplace=True)
df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()
with open(dump_path, 'wb') as f:
pickle.dump(df, f)
return df
def get_table(train_table):
x_cols = []
for col in train_table.columns:
# print(data[col].value_counts())
if col not in ['result', 'team_name', 'competition', 'season_x',
'surname']:
train_table[col] = train_table[col].astype(str)
x_cols.append(col)
# print(x_cols)
X = pd.get_dummies(train_table[x_cols])
y = train_table['result']
print(train_table.shape)
print(X.shape)
print(y.shape)
return X, y
def main():
df = pd.read_csv("dataset.csv")
df = df.dropna()
# print df
x1 = df.copy()
del x1['Customer']
del x1['Effective To Date']
x4 = pd.get_dummies(x1)
# print x4
n = 10
clf = k_means(x4, n_clusters = n)
centroids = clf[0]
# 10 clusters
labels = clf[1]
# print x4[1]
index_db_val = compute_DB_index(x4, labels, centroids, n)
print "The value of Davies Bouldin index for a K-Means cluser of size " + str(n) + " is: " + str(index_db_val)
def dummify(df):
'''
Given a dataframe, for all the columns which are not numericly typed already,
create dummies. This will NOT remove one of the dummies which is required for
linear regression.
returns DataFrame -- a dataframe with all non-numeric columns swapped into dummy columns
'''
obj_cols = []
for cname in df.columns:
if df[cname].dtype == object:
obj_cols.append(cname)
df = pd.get_dummies(df, columns=obj_cols)
# for cname in obj_cols:
# del df[cname]
return df
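# Usage sketch for dummify() (added; toy frame): numeric columns pass through,
# object columns are expanded into dummy columns.
import pandas as pd

raw = pd.DataFrame({'rooms': [2, 3, 2], 'city': ['NY', 'SF', 'NY']})
print(dummify(raw).columns.tolist())
# ['rooms', 'city_NY', 'city_SF']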
def apriori_alg(trans, support=0.01, minlen=2):
print('appr_1')
dna = trans.unstack().dropna()
print('appr_2')
ts = pandas.get_dummies(dna).groupby(level=1).sum()
print('appr_3')
collen, rowlen = ts.shape
pattern = []
for cnum in range(minlen, rowlen + 1):
for cols in combinations(ts, cnum):
print('cnum', cnum)
patsup = ts[list(cols)].all(axis=1).sum()
patsup = float(patsup) / collen
pattern.append([",".join(cols), patsup])
print('appr_4')
sdf = pandas.DataFrame(pattern, columns=["Pattern", "Support"])
print('appr_5')
results = sdf[sdf.Support >= support]
print('appr_6')
return results
# Association-rule mining with the Apriori algorithm
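# Usage sketch for apriori_alg() (added; toy basket table where each row is a
# transaction and each column an item slot, padded with None).
import pandas
basket = pandas.DataFrame([['milk', 'bread', None],
                           ['milk', 'bread', 'butter'],
                           ['bread', 'butter', None]])
print(apriori_alg(basket, support=0.5, minlen=2))
# e.g. bread,butter and bread,milk both reach support 2/3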
def doOneHot(X_train, X_test):
res = X_test[['instanceID']]
X_test.drop('instanceID', axis=1, inplace=True)
data = X_train.append(X_test, ignore_index=True)
del X_train, X_test
gc.collect()
features_trans = ['gender','appCategory_main','connectionType']
data = pd.get_dummies(data, columns=features_trans)
X_train = data.loc[data['label'] != -1, :]
X_test = data.loc[data['label'] == -1, :]
X_test.loc[:, 'instanceID'] = res.values
del data
gc.collect()
return X_train, X_test
def prepare_gss(onehot=True):
data = pd.read_csv('../data/GSShappiness.csv')
del data['year']
del data['id']
data = data.dropna()
target = "Happiness level"
X = data[list(set(data.columns) - set([target]))]
y = data[target]
if onehot:
X = pd.get_dummies(X)
return X, y
def thunder():
if os.path.exists('../dataset/thunder.pkl'):
return pd.read_pickle('../dataset/thunder.pkl')
    thunder_df = pd.read_csv('../input/thunder.csv',
                             names=[
                                 'datetime',  # observation time
                                 'lat',       # latitude (decimal degrees)
                                 'lon',       # longitude (decimal degrees)
                                 'type'       # lightning type; CG: cloud-to-ground, IC: intra-cloud
                             ])
    # Parse the datetime strings into pandas datetimes
thunder_df.datetime = pd.to_datetime(thunder_df.datetime)
# observation_point_df.to_pickle('../dataset/observation_point.pkl')
thunder_df = pd.concat([thunder_df, pd.get_dummies(thunder_df.type)], axis=1)
    thunder_df.to_pickle('../dataset/thunder.pkl')  # cache under the same path checked above
return thunder_df
def load_data():
data = pd.read_csv('data/train.csv')
# drop rows with empty features / gaps in columns
data = data.dropna()
# Categorical values into numerical (one hot encoding)
one_hot_embarked = pd.get_dummies(data['Embarked'], prefix='embarked')
data = data.join(one_hot_embarked)
one_hot_pclass = pd.get_dummies(data['Pclass'], prefix='pclass')
data = data.join(one_hot_pclass)
    # The Sex column has only two values (male/female), so a single 0/1 column is enough
    # instead of one-hot encoding it into two columns
data['sex'] = data.apply(lambda x: 1 if (x['Sex'] == 'female') else 0, axis=1)
# Drop features not used for training the model
data = data.drop(['Cabin', 'Name', 'PassengerId', 'Pclass', 'Sex', 'Ticket', 'Embarked'], axis=1)
return data.drop(['Survived'], axis=1), data[['Survived']]
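# Usage sketch (added; assumes the Kaggle Titanic data/train.csv used by load_data above).
X, y = load_data()
print(X.columns.tolist())   # remaining numeric columns plus embarked_*/pclass_* dummies and 'sex'
print(y['Survived'].value_counts())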
def parse_context_dmop(path):
df = read(path, "dmop")
# ATTT-A and ATTT-B are different
attt = df[df['subsystem'].str.startswith("ATTT")]
attt['subsystem'] = attt['subsystem'].str[:3] + attt['subsystem'].str[-1]
df = pd.concat([attt, df])
# take the first 4 chars
df['subsystem'] = df['subsystem'].str[:4]
# convert to 1 / 0
df = pd.get_dummies(df.subsystem)
df = df.resample("1h").sum().fillna(0.0)
df['sum_dmop'] = df.sum(axis=1)
return df