def process_options(args):
    """Parse the command line and load the word vectors to be clustered.

    The arguments are parsed with the parser returned by ``argparser()``.
    After validation, the clustering method and job count are downgraded
    (with a warning) when scikit-learn support is missing, the vectors
    are loaded from the first entry of ``options.vectors``, and a default
    cluster count is derived from the vocabulary size if none was given.
    Vectors are optionally normalized to unit length and/or whitened.

    Args:
        args: Argument list handed to ``parse_args``.

    Returns:
        A ``(words, vectors, options)`` tuple: the loaded words, their
        (possibly normalized and whitened) vectors, and the parsed
        options with ``method``, ``jobs`` and ``k`` adjusted as above.

    Raises:
        ValueError: If ``max_rank`` is given and below 1, or ``k`` is
            given and below 2.
    """
    opts = argparser().parse_args(args)

    # Reject out-of-range values before anything is loaded.
    rank_limit = opts.max_rank
    if rank_limit is not None and rank_limit < 1:
        raise ValueError('max-rank must be >= 1')
    requested_k = opts.k
    if requested_k is not None and requested_k < 2:
        raise ValueError('cluster number must be >= 2')

    # Minibatch k-means is only offered when scikit-learn is importable.
    if opts.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        opts.method = KMEANS

    # Any job count other than 1 needs the scikit-learn k-means backend.
    if opts.jobs != 1 and not (opts.method == KMEANS and with_sklearn):
        logging.warning('jobs > 1 only supported scikit-learn %s' % KMEANS)
        opts.jobs = 1

    model = wvlib.load(opts.vectors[0], max_rank=rank_limit)

    if opts.k is None:
        # Default cluster count: ceil(sqrt(n / 2)) for n loaded words.
        half_vocab = len(model.words()) / 2
        opts.k = int(math.ceil(half_vocab ** 0.5))
        logging.info('set k=%d (%d words)' % (opts.k, len(model.words())))

    if opts.normalize:
        logging.info('normalize vectors to unit length')
        model.normalize()

    # Extract words and vectors only after the optional normalization.
    words = model.words()
    vectors = model.vectors()

    if opts.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, opts
评论列表
文章目录