def updateQProbs(lastStateID, lastAction):
    """Recompute the softmax action probabilities for one state row.

    Reads row ``lastStateID`` of the module-level ``QValues`` and ``QCounts``
    tables and overwrites the same row of the module-level ``QProbs`` table
    in place with ``exp(Q / tau) / sum(exp(Q / tau))``.

    The temperature ``tau`` is ``-mean(QValues row) / mean(QCounts row)``,
    falling back to ``1`` when either row sums to zero (which also avoids a
    zero temperature or a division by zero).

    Args:
        lastStateID: Row index of the state whose probabilities are refreshed.
        lastAction: Unused; kept so existing callers keep working.

    Returns:
        None. ``QProbs[lastStateID]`` is modified in place.
    """
    stateValues = QValues[lastStateID, ]
    stateCounts = QCounts[lastStateID, ]

    if np.sum(stateCounts) == 0 or np.sum(stateValues) == 0:
        tau = 1
    else:
        tau = (-(np.mean(stateValues))) / (np.mean(stateCounts))

    scaled = stateValues / tau
    # Shifting by the row maximum leaves the softmax unchanged mathematically
    # but keeps exp() from overflowing to inf (or every term underflowing to
    # 0), either of which would turn the whole row into NaN.
    numerator = np.exp(scaled - np.max(scaled))
    # Dividing by the scalar sum broadcasts over any number of actions, so the
    # row width is no longer tied to exactly eight entries.
    QProbs[lastStateID, ] = numerator / np.sum(numerator)
# Initialize dataframes that will store performance data across different days
# [web-page residue] Comment list
# [web-page residue] Table of contents