def read_data(filename):
    """Read the last.fm dataset and build the artist/user play-count matrix.

    The file is parsed with ``pandas.read_table`` (tab-delimited by default,
    no header row). Columns 0, 2 and 3 are loaded and named ``user``,
    ``artist`` and ``plays``.

    Args:
        filename: Path (or anything ``pandas.read_table`` accepts) of the
            dataset file.

    Returns:
        A tuple ``(data, plays)`` where ``data`` is the pandas DataFrame with
        ``user`` and ``artist`` converted to categorical columns, and
        ``plays`` is a ``scipy.sparse.coo_matrix`` of ``float32`` play counts
        whose rows are artist category codes and whose columns are user
        category codes.
    """
    start = time.time()
    logging.debug("reading data from %s", filename)

    # na_filter=False keeps names such as "NA", "null" or "nan" as literal
    # strings. With the default NA filtering they are parsed as missing
    # values, their category code becomes -1, and coo_matrix rejects the
    # negative index.
    # NOTE(review): with NA filtering off, an empty ``plays`` field is no
    # longer read as NaN and is expected to fail the float32 conversion below
    # instead of being stored silently -- confirm this is acceptable.
    data = pandas.read_table(filename,
                             usecols=[0, 2, 3],
                             names=['user', 'artist', 'plays'],
                             na_filter=False)

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix of all the users/plays:
    # row index = artist code, column index = user code
    plays = coo_matrix((data['plays'].astype(numpy.float32),
                        (data['artist'].cat.codes.copy(),
                         data['user'].cat.codes.copy())))

    logging.debug("read data file in %s", time.time() - start)
    return data, plays
# Comment list
# Article table of contents