def generate_user_actions_with_artist(sc):
    '''
    Run every user-action line through _add_artist_into_line and save the
    resulting lines to HDFS.

    A song lookup is built on the driver from the songs CSV, broadcast, and
    handed to _add_artist_into_line together with each raw user-action line.
    (Presumably the lookup maps song_id -> artist_id and the helper appends
    the artist_id to the line -- confirm against the helper definitions.)

    Args:
        sc: pyspark.SparkContext
    Returns:
        True, once the output has been written.
    '''
    data_dir = 'hdfs:/home/ProjectPOP/data_source'
    songs_csv = data_dir + '/mars_tianchi_songs.csv'
    actions_csv = data_dir + '/mars_tianchi_user_actions.csv'
    output_path = data_dir + '/mars_tianchi_songs_with_artist.csv'

    logger.info('Start generate song_artist_dict')
    # The mapped records are collected to the driver and fed to dict(), so
    # _generate_song_artist_dict must yield (key, value) pairs.
    song_artist_pairs = sc.textFile(songs_csv).map(_generate_song_artist_dict).collect()
    lookup_broadcast = sc.broadcast(dict(song_artist_pairs))

    logger.info('Start process user_actions')
    raw_actions = sc.textFile(actions_csv)
    actions_with_artist = raw_actions.map(
        lambda line: _add_artist_into_line(line, lookup_broadcast)
    )

    # Log a small sample before writing the full result.
    logger.info(actions_with_artist.take(5))
    actions_with_artist.saveAsTextFile(output_path)
    return True
generate_user_artist_date_feature.py 文件源码
python
阅读 19
收藏 0
点赞 0
评论 0
评论列表
文章目录