data.py 文件源码

python
阅读 17 收藏 0 点赞 0 评论 0

项目:kaggle-Kobe-Bryant-Shot-Selection 作者: shiba24 项目源码 文件源码
def preproc(self):
    """Add engineered feature columns to ``self.df`` in place.

    Columns read: ``minutes_remaining``, ``seconds_remaining``, ``period``,
    ``matchup``, ``loc_x``, ``loc_y``, ``game_date``, ``action_type``,
    ``combined_shot_type``, ``opponent``, ``game_id``, ``season`` and
    ``shot_distance``.

    Columns written:

    * clock features: ``time_remaining``, ``last_5_sec``, ``latter_half``,
      ``first_period``, ``latter_period``, ``last_period``,
      ``last_quarter``, ``last_moment``, ``secondsFromStart``,
      ``secondsFromGameStart``;
    * venue flags: ``away`` (boolean) and ``homeGame`` (0/1);
    * ``shotLocationCluster``: component index predicted by a Gaussian
      mixture fitted on ``(loc_x, loc_y)``;
    * ``game_year``, ``game_month``, ``game_day``: integers sliced from the
      ``game_date`` strings (assumes a ``YYYY-MM-DD``-like layout -- TODO
      confirm against the data source);
    * ``<column>_num`` integer codes for ``action_type``,
      ``combined_shot_type``, ``opponent``, ``game_id`` and ``season``.
      Codes follow the sorted order of the distinct values, so they are
      reproducible between runs.

    ``shot_distance`` is additionally capped at 45.

    Every assignment is index-aligned or positional on ``self.df`` itself,
    so the method does not require a default ``0..n-1`` index.

    Returns:
        None. ``self.df`` is modified in place.
    """
    df = self.df

    # --- game-clock features -------------------------------------------
    df["time_remaining"] = df["minutes_remaining"] * 60 + df["seconds_remaining"]
    time_remaining = df["time_remaining"]
    period = df["period"]

    df['last_5_sec'] = time_remaining < 5
    df['latter_half'] = time_remaining < 360
    df['first_period'] = period == 1
    df['latter_period'] = period > 2
    df['last_period'] = period == 4
    df['last_quarter'] = time_remaining < 180

    threshold = 3
    anomaly = 14
    # Vectorised form of the former row-wise ``apply``; same truth table,
    # and it also works on an empty frame.
    df['last_moment'] = (time_remaining < threshold) | (time_remaining == anomaly)

    df['away'] = df.matchup.str.contains('@')

    # Seconds elapsed within the period, measured against a 12-minute period.
    df['secondsFromStart'] = 60 * (11 - df['minutes_remaining']) + (60 - df['seconds_remaining'])

    # Periods 1-4 contribute 12 minutes each; later periods 5 minutes each.
    # The overtime branch uses 3 * 12 * 60 rather than 4 * 12 * 60 because
    # ``secondsFromStart`` already over-counts a 5-minute period by 7 minutes
    # (12 - 5), which makes up the difference.
    regulation_offset = (period <= 4).astype(int) * (period - 1) * 12 * 60
    overtime_offset = (period > 4).astype(int) * ((period - 4) * 5 * 60 + 3 * 12 * 60)
    df['secondsFromGameStart'] = regulation_offset + overtime_offset + df['secondsFromStart']

    # --- shot-location clustering --------------------------------------
    numGaussians = 13
    # NOTE(review): ``mixture.GMM`` looks like the legacy scikit-learn
    # estimator, which newer releases replace with ``GaussianMixture`` (a
    # different parameter set). The call is left unchanged; confirm the
    # installed version before upgrading.
    gaussianMixtureModel = mixture.GMM(n_components=numGaussians, covariance_type='full',
                                       params='wmc', init_params='wmc',
                                       random_state=1, n_init=3, verbose=0)
    # ``.loc`` replaces the removed ``.ix`` indexer (pure label selection here).
    shot_xy = df.loc[:, ['loc_x', 'loc_y']]
    gaussianMixtureModel.fit(shot_xy)
    df['shotLocationCluster'] = gaussianMixtureModel.predict(shot_xy)

    # 1 when the matchup string has no '@', else 0.
    df['homeGame'] = (df['matchup'].str.find('@') < 0).astype(int)

    # --- calendar features ---------------------------------------------
    # String slicing through ``.str`` keeps the frame's own index, unlike a
    # freshly built ``pd.Series`` which would align on 0..n-1.
    game_date = df["game_date"].str
    df["game_year"] = game_date[:4].astype(int)
    df["game_month"] = game_date[5:7].astype(int)
    df["game_day"] = game_date[-2:].astype(int)

    # --- integer encodings ---------------------------------------------
    # ``pd.factorize(..., sort=True)`` numbers the distinct values in sorted
    # order in a single pass, replacing a per-row ``list.index`` scan over an
    # arbitrarily ordered ``set``.
    for column in ("action_type", "combined_shot_type", "opponent", "game_id", "season"):
        codes, _ = pd.factorize(df[column], sort=True)
        df[column + "_num"] = codes

    # Cap long shots. ``clip`` writes through a normal column assignment
    # instead of chained indexing, which may silently modify a copy.
    df["shot_distance"] = df["shot_distance"].clip(upper=45)

    # Disabled in the original; kept for reference:
    # del self.df["team_id"], self.df["team_name"], self.df["game_event_id"], self.df["lat"], self.df["lon"]
    # return self.df
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号