python类NearestNeighbors()的实例源码

GornitzAnnotationQueries.py 文件源码 项目:SecuML 作者: ANSSI-FR 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def computeNeighboursScores(self):
    """Score each instance whose generated label is 0 from its neighbours.

    Features are standardised, a nearest-neighbour index is fitted on them,
    and every instance with a generated label of 0 receives the score
    ``sum(neighbour_labels + 1) / (2 * num_neighbours)``.

    Returns:
        numpy.ndarray: one score per instance whose generated label is 0,
        in dataset order.
    """
    all_instances = self.iteration.datasets.instances
    # Connectivity model. ``n_neighbors`` is given by keyword because
    # scikit-learn constructors no longer accept it positionally.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', NearestNeighbors(n_neighbors=self.num_neighbours,
                                   n_jobs=-1))])
    pipeline.fit(all_instances.getFeatures())
    # NOTE(review): the score formula presumably expects generateLabel to
    # return -1 / 0 / +1 so that scores fall in [0, 1] -- confirm.
    labels = np.array([generateLabel(x) for x in all_instances.getLabels()])
    # Querying without X returns the neighbours of the indexed points
    # themselves, with each point excluded from its own neighbour list.
    model = pipeline.named_steps['model']
    all_neighbours = model.kneighbors(return_distance=False)
    denominator = 2.0 * self.num_neighbours
    scores = [sum(labels[all_neighbours[i]] + 1) / denominator
              for i, label in enumerate(labels)
              if label == 0]
    return np.array(scores)
methods_kharita.py 文件源码 项目:kharita 作者: vipyoung 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def getpossibleedges(datapointwts,seeds):
    """Count transitions between seed clusters along consecutive points.

    Each data point is assigned to one of its 5 nearest seeds (by x/y), the
    one with the smallest ``taxidist``. A directed edge ``(cd1, cd2)`` is then
    counted whenever two consecutive points fall in different clusters and the
    last field of the earlier point is smaller than, but within 11 of, the
    last field of the later point.

    Args:
        datapointwts: sequence of points; index 0 and 1 are used as
            coordinates and the last element as the ordering value
            (presumably a timestamp -- confirm with the caller).
        seeds: sequence of seed points indexed the same way.

    Returns:
        dict mapping ``(cd1, cd2)`` seed-index pairs to transition counts.
    """
    # NOTE(review): ``theta`` is not a parameter; it is read from the
    # enclosing module scope, as in the original code.
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(S)
    _, indices = nbrs.kneighbors(X)

    # Refine the Euclidean candidates with the project distance.
    p2cluster = []
    for ii, ll in enumerate(indices):
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1], theta) for xx in ll]
        p2cluster.append(ll[dd.index(min(dd))])

    gedges1 = {}
    # NOTE(review): the original starts at index 2 (``ii > 1``), so the pair
    # (0, 1) is never examined -- kept as is, confirm whether intended.
    for ii in range(2, len(datapointwts)):
        previous = datapointwts[ii - 1][-1]
        current = datapointwts[ii][-1]
        if previous < current and previous > current - 11:
            cd1 = p2cluster[ii - 1]
            cd2 = p2cluster[ii]
            # Only count an edge for the pair that passed the window check;
            # previously this test ran for every point and reused stale (or
            # still unbound) cd1/cd2 values.
            if cd1 != cd2:
                gedges1[(cd1, cd2)] = gedges1.get((cd1, cd2), 0) + 1
    return gedges1
methods_kharita.py 文件源码 项目:kharita 作者: vipyoung 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def point2cluster(datapointwts,seeds,theta):
    """Assign every data point to its nearest seed.

    Two 3-D embeddings are searched: one using the raw third coordinate and
    one using that coordinate modulo 360. For each point the seed from
    whichever search reports the strictly smaller distance is used; ties go
    to the modulo-360 search.

    Returns:
        tuple ``(cluster, p2cluster)`` where ``cluster`` maps each seed index
        to the list of points assigned to it and ``p2cluster`` lists the seed
        index chosen for every point, in input order.
    """
    def _embed(points, wrap):
        # Scale coordinates with the module-level lonconst / latconst factors.
        rows = []
        for pt in points:
            third = (pt[2] % 360) if wrap else pt[2]
            rows.append((lonconst * pt[0], latconst * pt[1], theta / 180 * third))
        return rows

    X = _embed(datapointwts, False)
    S = _embed(seeds, False)
    Xrot = _embed(datapointwts, True)
    Srot = _embed(seeds, True)

    cluster = {cd: [] for cd in range(len(seeds))}

    plain_search = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(S)
    distances, indices = plain_search.kneighbors(X)
    wrapped_search = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(Srot)
    distancesrot, indicesrot = wrapped_search.kneighbors(Xrot)

    p2cluster = []
    for ii in range(len(indices)):
        if distances[ii][0] < distancesrot[ii][0]:
            cd = indices[ii][0]
        else:
            cd = indicesrot[ii][0]
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    return (cluster, p2cluster)
methods_kharita.py 文件源码 项目:kharita 作者: vipyoung 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def splitclustersparallel(datapointwts,seeds):
    """Cluster points around seeds and derive per-cluster spread statistics.

    Each point is assigned to whichever of its 20 nearest seeds (by the first
    two coordinates) has the smallest ``taxidist``. For clusters with more
    than 10 points, a 90th-percentile ``angledist`` value and a "road width"
    figure are computed and printed.

    NOTE(review): the visible body ends without a ``return`` and several
    locals (gedges, gedges1, nedges, seeds1, seedweight, X-independent
    ``distances``) are never used here -- the snippet looks truncated.
    NOTE(review): ``theta`` is not a parameter; it is read from module scope.
    """
    X = [(xx[0], xx[1]) for xx in datapointwts];    S = [(xx[0], xx[1]) for xx in seeds];cluster = {};p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}; std = {}; seeds1 = []; seedweight = []; roadwidth = [];
    # Candidate seeds: the 20 closest by Euclidean distance on (x, y).
    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    # One (initially empty) cluster and one road-width slot per seed.
    for cd in range(len(seeds)):
        cluster[cd] = []; roadwidth.append(0);
    for ii, ll in enumerate(indices):
        # Re-rank the candidates with taxidist, ignoring the point's last field.
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1],theta) for xx in ll]
        cd = ll[dd.index(min(dd))];
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for cl in cluster:
        # The seed's last element is used as the reference value for angledist.
        mang = seeds[cl][-1];
        scl = seeds[cl]
        # Statistics are only computed for clusters with more than 10 points.
        if len(cluster[cl]) > 10:
            std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
            # 1 + 5 * std of geodist(seed, point) * sin(bearing - seed's last element).
            roadwidth[cl] = 1+5*np.std([geodist(scl,xx)*np.sin(anglebetweentwopoints(scl,xx)-scl[-1])  for xx in cluster[cl]])
            print(cl,scl,[(anglebetweentwopoints(scl,xx),scl[-1])  for xx in cluster[cl]])
__init__.py 文件源码 项目:lsanomaly 作者: lsanomaly 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def median_kneighbour_distance(X, k=5):
    """
    Calculate the median k-neighbour distance.

    A random subset of at most 2000 rows of ``X`` is indexed and queried
    against itself; the median distance to the last (k-th) returned
    neighbour is reported. This is a heuristic for setting the kernel
    length scale.

    Because the query points are the indexed points, each point is expected
    to appear among its own returned neighbours.

    :param X: 2-D array, one sample per row
    :param k: number of neighbours; clamped to the size of the sampled subset
    :return: median of the k-th neighbour distances
    """
    N_all = X.shape[0]
    N_subset = min(N_all, 2000)
    # Clamp against the subset that is actually indexed: asking for more
    # neighbours than indexed samples makes kneighbors raise.
    k = min(k, N_subset)
    sample_idx_train = np.random.permutation(N_all)[:N_subset]
    subset = X[sample_idx_train, :]
    # n_neighbors must be passed by keyword on current scikit-learn.
    nn = neighbors.NearestNeighbors(n_neighbors=k)
    nn.fit(subset)
    d, _ = nn.kneighbors(subset)
    return np.median(d[:, -1])
job_description_feature_extraction.py 文件源码 项目:job-salary-prediction 作者: soton-data-mining 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def cosine_knn(corpus_vector, queries_vector, k=10):
    """Find the k corpus documents closest to each query by cosine distance.

    :param corpus_vector: vectorized document text
    :param queries_vector: vectorized query text
    :param k: number of neighbours
    :return: (distances, indices) of knn
    """
    # References:
    # http://scikit-learn.org/stable/modules/neighbors.html
    # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

    # Cosine distance accounts for document length; it is used here with the
    # brute-force search algorithm.
    searcher = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=k)
    fitted = searcher.fit(corpus_vector)
    distances, indices = fitted.kneighbors(queries_vector)
    return distances, indices
token_container.py 文件源码 项目:basis 作者: vaitech 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def index(self, metric='cosine'):
    """Build the nearest-neighbour index used for similarity lookups and
    analogies.

    Arguments:
        metric: string, or sklearn compatible callable

    Returns:
        self

    Raises:
        TokenContainerException if no pretrained vectors have been loaded
    """
    # Guard first: nothing can be indexed without loaded vectors.
    if self.W is None:
        raise TokenContainerException(
            'cannot build similarity on vectorless structure'
        )

    from sklearn.neighbors import NearestNeighbors

    # The cosine metric is paired with brute-force search; anything else
    # lets scikit-learn choose the algorithm.
    if metric == 'cosine':
        alg = 'brute'
    else:
        alg = 'auto'
    self._nn = NearestNeighbors(metric=metric, algorithm=alg)
    self._nn.fit(self.W)
    return self
pixel_sampling.py 文件源码 项目:kaggle-yelp-restaurant-photo-classification 作者: u1234x1234 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def extract_lab_histogram(mode, clusters):
    """Write a 50-bin colour-cluster histogram for every listed image.

    Image paths are read, one per line, from the file ``mode + '_list'``.
    Each image is resized to 100x100, converted to Lab, and every pixel is
    mapped to its nearest entry in ``clusters``. The histogram of those
    indices is appended in binary form to ``mode + '_color'``, which is
    recreated on every call.

    Args:
        mode: filename prefix for the input list and the output file.
        clusters: array of reference colours to index.
    """
    nn = neighbors.NearestNeighbors(n_neighbors=1)
    nn.fit(clusters)
    out_filename = mode + '_color'
    # Start from a fresh output file; a missing file is not an error.
    try:
        os.remove(out_filename)
    except OSError:
        pass
    cnt = 0
    # Context managers guarantee both files are closed, even on failure.
    with open(out_filename, 'ab') as out, open(mode + '_list') as f:
        for line in f:
            # Strip only the line terminator so that a final line without a
            # trailing newline keeps its last character.
            path = line.rstrip('\r\n')
            image = cv2.imread(path)
            image = cv2.resize(image, (100, 100))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)
            points = image.reshape((-1, 3))
            cn = nn.kneighbors(points)
            # NOTE(review): neighbour indices start at 0 but the histogram
            # range starts at 1, so index 0 is never counted -- confirm
            # whether range=(0, 50) was intended before changing the output.
            hist = np.histogram(cn[1], bins=50, range=(1, 50))[0]
            hist.tofile(out)
            cnt = cnt + 1
            if cnt % 1000 == 0:
                print(cnt)
ikdb.py 文件源码 项目:ikdb 作者: krishauser 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def buildNNDataStructure(self):
    """Builds a nearest neighbor data structure.  User doesn't need to
    call this unless the self.problems attribute was changed manually.

    Does nothing when there are no problem features or no feature names.
    When scikit-learn cannot be imported, ``self.nn`` is set to None and
    ``self.nnBuildSize`` to 0; otherwise a BallTree is built over the
    (optionally metric-transformed) feature array.
    """
    if len(self.problemFeatures)==0 or len(self.featureNames)==0:
        return
    # Only the import is guarded, so an unrelated ImportError raised while
    # building the tree is no longer misreported as a missing scikit-learn.
    try:
        from sklearn.neighbors import BallTree
    except ImportError:
        # Parenthesised form is valid on both Python 2 and Python 3.
        print("IKDatabase: Warning, scikit-learn is not installed, queries will be much slower")
        with self.lock:
            self.nn = None
            self.nnBuildSize = 0
        return
    with self.lock:
        # Prefer an already-materialised array; otherwise build one from
        # the item list.
        try:
            farray = self.problemFeatures.array
        except AttributeError:
            farray = np.array(self.problemFeatures.items)
        if self.metricTransform is not None:
            farray = np.dot(farray,self.metricTransform)
        self.nn = BallTree(farray)
        self.nnBuildSize = len(self.problemFeatures)
    return
elsim.py 文件源码 项目:DroidWatcher 作者: suemi994 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def __init__(self, x, ys):
    """Index the entropy signature of ``x`` together with those of ``ys``.

    Row 0 of ``self.CI`` holds ``[signature_entropy, entropy]`` for ``x``;
    the following rows hold the same pair for each element of ``ys`` in
    order. A NearestNeighbors model (2 neighbours, radius 0.4) is fitted
    on that matrix and stored as ``self.neigh``.

    Args:
        x: reference element exposing ``checksum.get_signature_entropy()``
            and ``checksum.get_entropy()``.
        ys: iterable of candidate elements with the same interface.
    """
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    # Collect all rows first and convert once, instead of re-allocating the
    # whole matrix with vstack for every candidate. The result is always
    # 2-D, including when ``ys`` is empty.
    rows = [[x.checksum.get_signature_entropy(), x.checksum.get_entropy()]]
    for i in ys:
        rows.append([i.checksum.get_signature_entropy(), i.checksum.get_entropy()])
    CI = np.array(rows)

    # Keyword arguments: current scikit-learn rejects positional
    # constructor parameters.
    self.neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
    # Fit on a copy so the stored self.CI is independent of the model.
    self.neigh.fit(np.array(CI))

    self.CI = CI
    self.ys = ys
elsim.py 文件源码 项目:DroidWatcher 作者: suemi994 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def __init__(self, x, ys):
    """Index the entropy signature of ``x`` together with those of ``ys``.

    Row 0 of ``self.CI`` holds ``[signature_entropy, entropy]`` for ``x``;
    the following rows hold the same pair for each element of ``ys`` in
    order. A NearestNeighbors model (2 neighbours, radius 0.4) is fitted
    on that matrix and stored as ``self.neigh``.

    Args:
        x: reference element exposing ``checksum.get_signature_entropy()``
            and ``checksum.get_entropy()``.
        ys: iterable of candidate elements with the same interface.
    """
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    # Collect all rows first and convert once, instead of re-allocating the
    # whole matrix with vstack for every candidate. The result is always
    # 2-D, including when ``ys`` is empty.
    rows = [[x.checksum.get_signature_entropy(), x.checksum.get_entropy()]]
    for i in ys:
        rows.append([i.checksum.get_signature_entropy(), i.checksum.get_entropy()])
    CI = np.array(rows)

    # Keyword arguments: current scikit-learn rejects positional
    # constructor parameters.
    self.neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
    # Fit on a copy so the stored self.CI is independent of the model.
    self.neigh.fit(np.array(CI))

    self.CI = CI
    self.ys = ys
features.py 文件源码 项目:uhcsdb 作者: bdecost 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def build_search_tree(datadir, featurename='vgg16_block5_conv3-vlad-64.h5'):
    """Load features, reduce them with PCA and fit a neighbour index.

    Results are published through the module-level globals ``keys``,
    ``features`` (PCA-reduced to 64 components) and ``nneighs`` (the fitted
    NearestNeighbors model). Progress messages are printed along the way.

    Args:
        datadir: directory containing the feature file.
        featurename: name of the feature file inside ``datadir``.
    """
    global keys, features, nneighs

    features_file = os.path.join(datadir, featurename)
    print(features_file)

    keys, features = load_features(features_file)

    print('reducing features')
    reducer = PCA(n_components=64)
    features = reducer.fit_transform(features)
    print('ready')

    print('building search tree')
    nneighs = NearestNeighbors().fit(features)
    print('ready')
test_neighbors.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def test_unsupervised_kneighbors(n_samples=20, n_features=5,
                                 n_query_pts=2, n_neighbors=5):
    """Check that all search algorithms agree on unsupervised k-neighbours.

    For every value in ``P``, each algorithm in ``ALGORITHMS`` is queried
    with and without distances; consecutive algorithms must return matching
    distances and indices.
    """
    X = rng.rand(n_samples, n_features)
    test = rng.rand(n_query_pts, n_features)

    for p in P:
        indices_only = []
        with_distances = []

        for algorithm in ALGORITHMS:
            neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors,
                                               algorithm=algorithm,
                                               p=p)
            neigh.fit(X)
            indices_only.append(neigh.kneighbors(test, return_distance=False))
            with_distances.append(neigh.kneighbors(test, return_distance=True))

        # Pair each result with its successor; the last algorithm has none.
        pairs = zip(indices_only, with_distances, with_distances[1:])
        for nodist, current, following in pairs:
            assert_array_almost_equal(nodist, current[1])
            assert_array_almost_equal(current[0], following[0])
            assert_array_almost_equal(current[1], following[1])
test_neighbors.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def test_unsupervised_inputs():
    """Check the kinds of object NearestNeighbors.fit accepts.

    A model fitted from another fitted model, a BallTree or a KDTree must
    give the same neighbours as a model fitted from the raw array.
    """
    X = rng.random_sample((10, 3))

    reference = neighbors.NearestNeighbors(n_neighbors=1)
    reference.fit(X)
    expected_dist, expected_ind = reference.kneighbors(X)

    candidate = neighbors.NearestNeighbors(n_neighbors=1)
    fit_sources = (reference, neighbors.BallTree(X), neighbors.KDTree(X))

    for source in fit_sources:
        candidate.fit(source)
        got_dist, got_ind = candidate.kneighbors(X)

        assert_array_almost_equal(expected_dist, got_dist)
        assert_array_almost_equal(expected_ind, got_ind)
test_neighbors.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test_radius_neighbors_boundary_handling():
    """Check that a point exactly on the radius boundary is handled the
    same way by every algorithm.

    Also checks that a single query point still yields a 1-element object
    array rather than a 2d array.
    """
    radius = 3.0
    # 3.0 lies exactly on the boundary of the query at 0.0; 3.01 is outside.
    X = np.array([[1.5], [3.0], [3.01]])
    query = [[0.0]]

    for algorithm in ALGORITHMS:
        model = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm)
        model = model.fit(X)
        results = model.radius_neighbors(query, return_distance=False)
        assert_equal(results.shape, (1,))
        assert_equal(results.dtype, object)
        assert_array_equal(results[0], [0, 1])
test_neighbors.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def test_callable_metric():
    """Check that a callable metric gives the same neighbour distances with
    ``algorithm='auto'`` and ``algorithm='brute'``."""

    def custom_metric(x1, x2):
        return np.sqrt(np.sum(x1 ** 2 + x2 ** 2))

    X = np.random.RandomState(42).rand(20, 2)
    # n_neighbors is passed by keyword: current scikit-learn rejects
    # positional constructor parameters.
    nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                       metric=custom_metric)
    nbrs2 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
                                       metric=custom_metric)

    nbrs1.fit(X)
    nbrs2.fit(X)

    dist1, ind1 = nbrs1.kneighbors(X)
    dist2, ind2 = nbrs2.kneighbors(X)

    assert_array_almost_equal(dist1, dist2)
kNN1.py 文件源码 项目:website-fingerprinting 作者: AxelGoetz 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def __init__(self, is_multiclass=True, K_CLOSEST_NEIGHBORS=2):
    """Set up constants, an empty weight vector and the neighbour finder.

    Args:
        is_multiclass: accepted but not used in this constructor.
        K_CLOSEST_NEIGHBORS: number of neighbours the finder returns.
    """
    self.weights = None

    # Constants
    self.K_CLOSEST_NEIGHBORS = K_CLOSEST_NEIGHBORS
    self.K_RECO = 5.0  # Num of neighbors for weight learning

    # The distance is this object's own _calculate_dist method.
    finder_options = dict(
        n_neighbors=K_CLOSEST_NEIGHBORS,
        metric=self._calculate_dist,
        metric_params=None,  # Dict otherwise
        n_jobs=-1,
    )
    self.kNN_finder = NearestNeighbors(**finder_options)
data.py 文件源码 项目:geomdn 作者: afshinrahimi 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def assignClasses(self):
    """Derive class labels for the train, dev and test sets.

    Training locations are clustered with a KD-tree clusterer and each
    training sample's class is its cluster. The median latitude/longitude of
    every cluster is stored in ``self.cluster_median`` (ordered by cluster
    id). Dev and test samples get the index of the nearest cluster median
    under the ``haversine`` metric. When ``self.one_hot_labels`` is set, all
    three label vectors are replaced by float32 one-hot matrices.
    """
    clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
    train_locs = self.df_train[['lat', 'lon']].values
    clusterer.fit(train_locs)
    clusters = clusterer.get_clusters()

    # Group the training points by cluster id.
    cluster_points = dd(list)
    for i, cluster in enumerate(clusters):
        cluster_points[cluster].append(train_locs[i])
    logging.info('#labels: %d' %len(cluster_points))

    self.cluster_median = OrderedDict()
    for cluster in sorted(cluster_points):
        points = cluster_points[cluster]
        median_lat = np.median([p[0] for p in points])
        median_lon = np.median([p[1] for p in points])
        self.cluster_median[cluster] = (median_lat, median_lon)

    dev_locs = self.df_dev[['lat', 'lon']].values
    test_locs = self.df_test[['lat', 'lon']].values
    nnbr = NearestNeighbors(n_neighbors=1, algorithm='brute', leaf_size=1, metric=haversine, n_jobs=4)
    # Materialise the values: on Python 3 ``dict.values()`` is a view and
    # np.array() would wrap it in a 0-d object array instead of an (n, 2)
    # matrix.
    nnbr.fit(np.array(list(self.cluster_median.values())))
    # NOTE(review): these are positions in the sorted median list; they match
    # the training cluster ids only if those ids are 0..n-1 -- confirm.
    self.dev_classes = nnbr.kneighbors(dev_locs, n_neighbors=1, return_distance=False)[:, 0]
    self.test_classes = nnbr.kneighbors(test_locs, n_neighbors=1, return_distance=False)[:, 0]

    self.train_classes = clusters
    if self.one_hot_labels:
        num_labels = np.max(self.train_classes) + 1

        def _one_hot(classes):
            # One row per sample with a single 1 in the column of its class.
            encoded = np.zeros((len(classes), num_labels), dtype=np.float32)
            encoded[np.arange(len(classes)), classes] = 1
            return encoded

        self.train_classes = _one_hot(self.train_classes)
        self.dev_classes = _one_hot(self.dev_classes)
        self.test_classes = _one_hot(self.test_classes)
wednesday.py 文件源码 项目:singlecell-dash 作者: czbiohub 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def network_layout(matrix, k=30):
    """Lay out the cosine k-nearest-neighbour graph of ``matrix``.

    Args:
        matrix: 2-D data, one row per node.
        k: number of neighbours used to build the graph.

    Returns:
        tuple ``(coords, communities_labelprop)``: node coordinates from the
        graphviz "sfdp" layout and the label-propagation community of every
        row, both ordered by row index.
    """
    # n_neighbors is passed by keyword: current scikit-learn rejects
    # positional constructor parameters.
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute',
                            metric='cosine').fit(matrix)
    # NOTE(review): from_scipy_sparse_matrix was removed in networkx 3.0 --
    # confirm the pinned networkx version.
    G = networkx.from_scipy_sparse_matrix(nbrs.kneighbors_graph(matrix))

    node_labels = label_propagation(G, verbose=True)
    communities_labelprop = np.array([node_labels[i] for i in range(matrix.shape[0])])

    pos = graphviz_layout(G, prog="sfdp")
    coords = np.array([pos[i] for i in range(len(pos))])
    print(coords.shape)

    return coords, communities_labelprop
tissue_analysis.py 文件源码 项目:singlecell-dash 作者: czbiohub 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def network_layout(matrix, k=30):
    """Lay out the cosine k-nearest-neighbour graph of ``matrix``.

    Args:
        matrix: 2-D data, one row per node.
        k: number of neighbours used to build the graph.

    Returns:
        tuple ``(coords, communities_labelprop)``: node coordinates from the
        graphviz "sfdp" layout and the label-propagation community of every
        row, both ordered by row index.
    """
    # n_neighbors is passed by keyword: current scikit-learn rejects
    # positional constructor parameters.
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute',
                            metric='cosine').fit(matrix)
    # NOTE(review): from_scipy_sparse_matrix was removed in networkx 3.0 --
    # confirm the pinned networkx version.
    G = networkx.from_scipy_sparse_matrix(nbrs.kneighbors_graph(matrix))

    node_labels = label_propagation(G, verbose=True)
    communities_labelprop = np.array([node_labels[i] for i in range(matrix.shape[0])])

    pos = graphviz_layout(G, prog="sfdp")
    coords = np.array([pos[i] for i in range(len(pos))])
    print(coords.shape)

    return coords, communities_labelprop
CostFunctions.py 文件源码 项目:Deep-Learning-Plugin 作者: flowjo-lakes 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def __init__(self,
             MMDLayer,
             MMDTargetTrain,
             MMDTargetValidation_split=0.1,
             MMDTargetSampleSize=1000,
             n_neighbors = 25,
             scales = None,
             weights = None):
    """Prepare the kernel scales, weights and target data for the MMD cost.

    Args:
        MMDLayer: stored unchanged as ``self.MMDLayer``.
        MMDTargetTrain: 2-D target sample array; split into train and
            validation parts with a fixed random_state of 42.
        MMDTargetValidation_split: fraction held out for validation.
        MMDTargetSampleSize: number of rows drawn per scale-estimation
            round; also stored on the instance.
        n_neighbors: neighbours used when estimating scales.
        scales: kernel scales; estimated from neighbour distances when None.
        weights: scale weights; derived from the number of scales when None.
    """
    # ``is None`` rather than ``== None``: comparing an array argument with
    # ``==`` is elementwise and its truth value is ambiguous.
    if scales is None:
        print("setting scales using KNN")
        # NOTE(review): only slots 1..19 are filled, so the zero left in
        # slot 0 takes part in the final median -- kept as is, confirm.
        med = np.zeros(20)
        for ii in range(1,20):
            sample = MMDTargetTrain[np.random.randint(MMDTargetTrain.shape[0], size=MMDTargetSampleSize),:]
            nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(sample)
            distances,dummy = nbrs.kneighbors(sample)
            #nearest neighbor is the point so we need to exclude it
            med[ii]=np.median(distances[:,1:n_neighbors])
        med = np.median(med)
        scales = [med/2, med, med*2] # CyTOF
        print(scales)
    scales = K.variable(value=np.asarray(scales))
    if weights is None:
        print("setting all scale weights to 1")
        # NOTE(review): this evaluates to the number of scales (a scalar),
        # not a vector of ones as the message suggests -- confirm intent.
        weights = K.eval(K.shape(scales)[0])
    weights = K.variable(value=np.asarray(weights))
    self.MMDLayer =  MMDLayer
    MMDTargetTrain, MMDTargetValidation = train_test_split(MMDTargetTrain, test_size=MMDTargetValidation_split, random_state=42)
    self.MMDTargetTrain = K.variable(value=MMDTargetTrain)
    self.MMDTargetTrainSize = K.eval(K.shape(self.MMDTargetTrain)[0])
    self.MMDTargetValidation = K.variable(value=MMDTargetValidation)
    self.MMDTargetValidationSize = K.eval(K.shape(self.MMDTargetValidation)[0])
    self.MMDTargetSampleSize = MMDTargetSampleSize
    self.kernel = self.RaphyKernel
    self.scales = scales
    self.weights = weights


    #calculate the raphy kernel applied to all entries in a pairwise distance matrix
knn_retrieval.py 文件源码 项目:BioIR 作者: nlpaueb 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def get_chunk_nns(self, X, q_centroids, question_details, chunk):
    """Retrieve the nearest documents of one dataset chunk for every question.

    Args:
        X: document vectors of the chunk (assumed to expose ``.shape``,
            i.e. a dense or sparse matrix -- confirm with the caller).
        q_centroids: one query vector per question.
        question_details: per-question pairs; elements 0 and 1 are passed
            to ``Question``.
        chunk: sequence whose first element is the chunk's offset into
            ``self.idmap``.

    Returns:
        list of ``Question`` objects, one per query, each carrying the
        mapped ids of its neighbours and the matching cosine distances.
    """
    # Never request more neighbours than the chunk holds: kneighbors raises
    # when n_neighbors exceeds the number of fitted samples.
    n_neighbors = min(1000, X.shape[0])
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=n_neighbors).fit(X)
    dist, nns = nbrs.kneighbors(q_centroids, return_distance=True)
    q_array = []
    for q_point in range(nns.shape[0]):
        # Translate chunk-local row numbers into global document ids.
        doc_nns = [self.idmap[chunk[0] + local_idx] for local_idx in nns[q_point]]
        q = Question(question_details[q_point][0], question_details[q_point][1], doc_nns, list(dist[q_point, :]))
        q_array.append(q)
    return q_array

    # Dataset indices are split into N chunks. The nearest top-(N*k) neighbors are extracted from each chunk, and then
    # the final top-k neighbors are extracted from those.
methods_kharita.py 文件源码 项目:kharita 作者: vipyoung 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def getseeds(datapoint,radius,theta):
    """Greedily pick seed points that are at least ``radius`` apart.

    The first 500000 points are compared against every existing seed with
    ``taxidist``; a point becomes a seed when no seed is closer than
    ``radius``. Later points are processed in batches of 500000: at the
    start of each batch a ball tree is built over the current seeds and the
    whole batch is queried once, and a point becomes a seed when its
    precomputed nearest-seed distance exceeds ``radius``.

    Returns:
        list of the selected seed points (the count is also printed).
    """
    periodsampl = 500000

    def _embed(points):
        # Scale with the module-level lonconst / latconst factors.
        return [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2]+45)) for xx in points]

    chosen = list(datapoint)
    seeds = []
    for j, p in enumerate(chosen):
        if j < periodsampl:
            # Exact phase: stop at the first seed that is close enough.
            if not any(taxidist(p, q, theta) < radius for q in seeds):
                seeds.append(p)
            continue
        offset = j % periodsampl
        if offset == 0:
            # Batch phase: index the seeds known so far and query the whole
            # upcoming batch in one call.
            nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(_embed(seeds))
            distances, indices = nbrs.kneighbors(_embed(chosen[j:j + periodsampl]))
        if distances[offset][0] > radius:
            seeds.append(p)
    print('seeds: ', len(seeds))
    return (seeds)
doc2vec.py 文件源码 项目:vec4ir 作者: lgalke 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def __init__(self,
             analyzer=None, matching=None,
             name=None,
             verbose=0,
             n_epochs=10,
             alpha=0.25,
             min_alpha=0.05,
             n_jobs=4,
             **kwargs):
    """Configure the paragraph-vector retrieval model.

    ``matching`` may be True (default ``Matching()``), False/None (no
    matching step) or a mapping of keyword arguments for ``Matching``.
    Remaining keyword arguments are forwarded to ``NearestNeighbors``.
    """
    if name is None:
        name = "paragraph-vectors"
    self.name = name
    self.verbose = verbose
    self.alpha = alpha
    self.min_alpha = min_alpha

    if matching is None or matching is False:
        self._matching = None
    elif matching is True:
        self._matching = Matching()
    else:
        self._matching = Matching(**dict(matching))

    self.analyzer = analyzer
    # Both alpha and min_alpha of the Doc2Vec model receive ``alpha``.
    doc2vec_options = dict(
        alpha=alpha,
        min_alpha=alpha,
        size=500,
        window=8,
        min_count=1,
        sample=1e-5,
        workers=n_jobs,
        negative=20,
        dm=0, dbow_words=1,  # words only with dm!=0?
        dm_mean=0,  # unused when in concat mode
        dm_concat=1,
        dm_tag_count=1,
    )
    self.model = Doc2Vec(**doc2vec_options)
    self.n_epochs = n_epochs
    self._neighbors = NearestNeighbors(**kwargs)
doc2vec.py 文件源码 项目:vec4ir 作者: lgalke 项目源码 文件源码 阅读 45 收藏 0 点赞 0 评论 0
def query(self, query, k=None):
    """Return the tags of the documents nearest to ``query``.

    With a matching step configured, only matched documents are considered
    and the neighbour index is refitted on their document vectors; an empty
    list is returned when nothing can be retrieved. Without one, the
    already fitted index is queried for ``k`` neighbours. ``k`` defaults to
    the number of stored centroids.
    """
    model, matching = self.model, self._matching
    nn, analyze = self._neighbors, self.analyzer
    verbose = self.verbose

    limit = len(self._centroids) if k is None else k

    if not matching:
        # NearestNeighbors are already fit
        tags = self._y
        n_ret = limit
    else:
        matched = matching.predict(query)
        print("Matched:", matched)
        tags = self._y[matched]
        dvs = np.asarray([model.docvecs[tag] for tag in tags])
        n_ret = min(limit, len(matched))
        if n_ret == 0:
            return []
        nn.fit(dvs)

    if verbose > 0:
        print(len(tags), "documents matched.")

    # Infer a vector for the query and shape it as a single-row matrix.
    qv = model.infer_vector(analyze(query)).reshape(1, -1)
    neighbours = nn.kneighbors(qv, n_neighbors=n_ret, return_distance=False)
    return tags[neighbours[0]]
base.py 文件源码 项目:vec4ir 作者: lgalke 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def query(self, query, k=None, matched_indices=None):
    """Rank the documents that match ``query`` by cosine distance.

    The matching step selects candidate rows; the query is vectorised with
    the count vectoriser followed by tf-idf, and the candidates are ranked
    with a brute-force cosine nearest-neighbour search.

    Args:
        query: query text.
        k: maximum number of results; all matches when None.
        matched_indices: accepted but not used by this method.

    Returns:
        document ids of the nearest matches, or an empty list when nothing
        matched.
    """
    # matching step
    matching_ind = self._matching(query)
    Xm = self._X[matching_ind]
    matched_doc_ids = self._y[matching_ind]

    n_matches = len(matching_ind)
    if n_matches == 0:
        return []
    if self.verbose > 0:
        print("Found {} matches:".format(n_matches))

    # model dependent transformation of the query
    q = self.tfidf.transform(self._cv.transform([query]))

    # model dependent nearest neighbor search over the matched rows only
    searcher = NearestNeighbors(metric='cosine', algorithm='brute').fit(Xm)

    # limit to k neighbors when k is given and smaller than the match count
    n_ret = k if (k is not None and k < n_matches) else n_matches

    # q holds a single element, so only the first result row is needed
    neighbours = searcher.kneighbors(q, n_neighbors=n_ret, return_distance=False)
    ind = neighbours[0]
    # convert the row indices back to document ids of the matching set
    return matched_doc_ids[ind]
word2vec.py 文件源码 项目:vec4ir 作者: lgalke 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def __init__(self,
             embedding,
             analyzer,
             name="WCD",
             n_jobs=1,
             normalize=True,
             verbose=0,
             oov=None,
             matching=True,
             **kwargs):
    """Store the configuration and create the neighbour index.

    ``matching`` may be True (default ``Matching()``), False/None (no
    matching step) or a mapping of keyword arguments for ``Matching``.
    Remaining keyword arguments are forwarded to ``NearestNeighbors``.
    """
    self.name = name
    self.verbose = verbose
    self.n_jobs = n_jobs
    self._embedding = embedding
    self._normalize = normalize
    self._oov = oov
    self._neighbors = NearestNeighbors(**kwargs)
    self._analyzer = analyzer

    if matching is None or matching is False:
        self._matching = None
    elif matching is True:
        self._matching = Matching()
    else:
        self._matching = Matching(**dict(matching))
word2vec.py 文件源码 项目:vec4ir 作者: lgalke 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def __init__(self, embedding, analyzer='word', matching=None, name="FWCD",
             n_jobs=1, use_idf=True):
    """Create the matching step, the embedded vectorizer and a brute-force
    cosine neighbour index.

    ``matching`` is a mapping of keyword arguments for ``Matching``; any
    falsy value disables the matching step.
    """
    self.name = name
    if matching:
        self.matching = Matching(**dict(matching))
    else:
        self.matching = None
    self.vect = EmbeddedVectorizer(embedding, analyzer=analyzer,
                                   norm='l2', use_idf=use_idf)
    self.nn = NearestNeighbors(algorithm='brute', metric='cosine',
                               n_jobs=n_jobs)
test_neighbors.py 文件源码 项目:FreeDiscovery 作者: FreeDiscovery 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def test_nearest_centroid_ranker():
    """With one point per centroid, NearestCentroidRanker must reduce to a
    plain nearest-neighbour search."""
    from sklearn.neighbors import NearestNeighbors
    np.random.seed(0)

    n_samples, n_features = 100, 120
    X = np.random.rand(n_samples, n_features)
    normalize(X, copy=False)

    # Every sample is its own class, so labels and row indices coincide.
    index = np.arange(n_samples, dtype='int')
    y = np.arange(n_samples, dtype='int')
    index_train, index_test, y_train, y_test = train_test_split(index, y)
    X_train, X_test = X[index_train], X[index_test]

    # Reference: brute-force single nearest neighbour.
    reference = NearestNeighbors(n_neighbors=1, algorithm='brute')
    reference.fit(X_train)
    dist_ref, idx_ref = reference.kneighbors(X_test)

    ranker = NearestCentroidRanker()
    ranker.fit(X_train, y_train)
    dist_pred = ranker.decision_function(X_test)
    y_pred = ranker.predict(X_test)

    # Same number of unique output points, even though the absolute labels
    # are not preserved.
    n_unique_ref = np.unique(idx_ref[:, 0]).shape
    n_unique_pred = np.unique(y_pred).shape
    assert n_unique_ref == n_unique_pred

    assert_allclose(dist_pred, dist_ref[:, 0])
neighbors.py 文件源码 项目:FreeDiscovery 作者: FreeDiscovery 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def fit(self, X, y):
    """Fit one nearest-neighbour model per category of ``y``.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Training data, shape [n_samples, n_features]; validated with
        ``check_array`` (CSR sparse input accepted).
    y : array-like of int
        Category of each training sample.

    Raises
    ------
    ValueError
        If ``y`` contains no category at all.
    """
    X = check_array(X, accept_sparse='csr')
    y = np.asarray(y, dtype='int')
    categories = np.unique(y)
    positions = np.arange(len(y), dtype='int')

    if not len(categories):
        raise ValueError('The training set must have at least '
                         'one document category!')

    # One single-neighbour cosine search object per category.
    search_options = dict(n_neighbors=1,
                          leaf_size=self.leaf_size,
                          algorithm=self.algorithm,
                          n_jobs=self.n_jobs,
                          metric='cosine')
    self._mod = [NearestNeighbors(**search_options) for _ in categories]

    # Fit each model on its category's rows and remember which original
    # row positions those were.
    index_mapping = []
    for searcher, y_val in zip(self._mod, categories):
        mask = (y == y_val)
        index_mapping.append(positions[mask])
        searcher.fit(X[mask])

    self.index_mapping = index_mapping


问题


面经


文章

微信
公众号

扫码关注公众号