diff --git a/PKG-INFO b/PKG-INFO
index 4c39ca0b2c72b94144672d645d883368ebb714db..f9faed8526db8df904867c7a50a63d432dda3873 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.2
 Name: pynndescent
-Version: 0.5.6
+Version: 0.5.7
 Summary: Nearest Neighbor Descent
 Home-page: http://github.com/lmcinnes/pynndescent
 Author: Leland McInnes
diff --git a/pynndescent.egg-info/PKG-INFO b/pynndescent.egg-info/PKG-INFO
index 4c39ca0b2c72b94144672d645d883368ebb714db..f9faed8526db8df904867c7a50a63d432dda3873 100644
--- a/pynndescent.egg-info/PKG-INFO
+++ b/pynndescent.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.2
 Name: pynndescent
-Version: 0.5.6
+Version: 0.5.7
 Summary: Nearest Neighbor Descent
 Home-page: http://github.com/lmcinnes/pynndescent
 Author: Leland McInnes
diff --git a/pynndescent/distances.py b/pynndescent/distances.py
index 03675bbcce7833d0abd1d92a9218597ab9b90a80..10985d63201e3186ea79290612e1266dadffc719 100644
--- a/pynndescent/distances.py
+++ b/pynndescent/distances.py
@@ -142,7 +142,7 @@ def weighted_minkowski(x, y, w=_mock_ones, p=2):
     """
     result = 0.0
     for i in range(x.shape[0]):
-        result += (w[i] * np.abs(x[i] - y[i])) ** p
+        result += w[i] * np.abs(x[i] - y[i]) ** p

     return result ** (1.0 / p)

diff --git a/pynndescent/graph_utils.py b/pynndescent/graph_utils.py
index 9cddc76f0850a0ff5a1ae0874eff626cd193f38d..40e827043df99239ef732e147bd447e1fb14b2f2 100644
--- a/pynndescent/graph_utils.py
+++ b/pynndescent/graph_utils.py
@@ -145,10 +145,10 @@ def find_component_connection_edge(
     best_edge = (indices[0][0], indices[1][0])

     while changed[0] or changed[1]:
-        result = search_closure(
+        inds, dists, _ = search_closure(
             query_points, candidate_indices, search_size, epsilon, visited
         )
-        inds, dists = deheap_sort(result)
+        inds, dists = deheap_sort(inds, dists)
         for i in range(dists.shape[0]):
             for j in range(dists.shape[1]):
                 if dists[i, j] < best_dist:
diff --git a/pynndescent/optimal_transport.py b/pynndescent/optimal_transport.py
index 300178d3092abb874497c56504e795ef200592e5..48e1e3609a4605b5960de1547d3b19e95e06ef6a 100644
--- a/pynndescent/optimal_transport.py
+++ b/pynndescent/optimal_transport.py
@@ -935,7 +935,7 @@ def network_simplex_core(node_arc_data, spanning_tree, graph, max_iter):
     fastmath=True,
     parallel=True,
     locals={"diff": numba.float32, "result": numba.float32},
-    cache=True,
+    cache=False,
 )
 def right_marginal_error(u, K, v, y):
     uK = u @ K
@@ -950,7 +950,7 @@
     fastmath=True,
     parallel=True,
     locals={"diff": numba.float32, "result": numba.float32},
-    cache=True,
+    cache=False,
 )
 def right_marginal_error_batch(u, K, v, y):
     uK = K.T @ u
@@ -962,7 +962,7 @@
     return np.sqrt(result)


-@numba.njit(fastmath=True, parallel=True, cache=True)
+@numba.njit(fastmath=True, parallel=True, cache=False)
 def transport_plan(K, u, v):
     i_dim = K.shape[0]
     j_dim = K.shape[1]
@@ -974,7 +974,7 @@
     return result


-@numba.njit(fastmath=True, parallel=True, locals={"result": numba.float32}, cache=True)
+@numba.njit(fastmath=True, parallel=True, locals={"result": numba.float32}, cache=False)
 def relative_change_in_plan(old_u, old_v, new_u, new_v):
     i_dim = old_u.shape[0]
     j_dim = old_v.shape[0]
@@ -987,7 +987,7 @@
     return result / (i_dim * j_dim)


-@numba.njit(fastmath=True, parallel=True, cache=True)
+@numba.njit(fastmath=True, parallel=True, cache=False)
 def precompute_K_prime(K, x):
     i_dim = K.shape[0]
     j_dim = K.shape[1]
@@ -1003,7 +1003,7 @@ def precompute_K_prime(K, x):
     return result


-@numba.njit(fastmath=True, parallel=True, cache=True)
+@numba.njit(fastmath=True, parallel=True, cache=False)
 def K_from_cost(cost, regularization):
     i_dim = cost.shape[0]
     j_dim = cost.shape[1]
@@ -1131,7 +1131,7 @@ def sinkhorn_distance(x, y, cost=_dummy_cost, regularization=1.0):
     return result


-@numba.njit(fastmath=True, parallel=True, cache=True)
+@numba.njit(fastmath=True, parallel=True, cache=False)
 def sinkhorn_distance_batch(x, y, cost=_dummy_cost, regularization=1.0):
     dim_x = x.shape[0]
     dim_y = y.shape[0]
diff --git a/pynndescent/pynndescent_.py b/pynndescent/pynndescent_.py
index 48f3f93dbd8c8ad7716cf0871b630c2a0bac8799..d72d316952707ebbcb11a267a050fce658470c7d 100755
--- a/pynndescent/pynndescent_.py
+++ b/pynndescent/pynndescent_.py
@@ -63,7 +63,7 @@ def is_c_contiguous(array_like):
     return flags is not None and flags["C_CONTIGUOUS"]


-@numba.njit(parallel=True, cache=True)
+@numba.njit(parallel=True, cache=False)
 def generate_leaf_updates(leaf_block, dist_thresholds, data, dist):
     updates = [[(-1, -1, np.inf)] for i in range(leaf_block.shape[0])]

@@ -156,7 +156,7 @@ def init_from_neighbor_graph(heap, indices, distances):
     return


-@numba.njit(parallel=True, cache=True)
+@numba.njit(parallel=True, cache=False)
 def generate_graph_updates(
     new_candidate_block, old_candidate_block, dist_thresholds, data, dist
 ):
@@ -372,7 +372,7 @@ def nn_descent(
         verbose=verbose,
     )

-    return deheap_sort(current_graph)
+    return deheap_sort(current_graph[0], current_graph[1])


 @numba.njit(parallel=True)
@@ -946,22 +946,25 @@ class NNDescent:

         if not hasattr(self, "_search_forest"):
             if self._rp_forest is None:
-                # We don't have a forest, so make a small search forest
-                current_random_state = check_random_state(self.random_state)
-                rp_forest = make_forest(
-                    self._raw_data,
-                    self.n_neighbors,
-                    self.n_search_trees,
-                    self.leaf_size,
-                    self.rng_state,
-                    current_random_state,
-                    self.n_jobs,
-                    self._angular_trees,
-                )
-                self._search_forest = [
-                    convert_tree_format(tree, self._raw_data.shape[0])
-                    for tree in rp_forest
-                ]
+                if self.tree_init:
+                    # We don't have a forest, so make a small search forest
+                    current_random_state = check_random_state(self.random_state)
+                    rp_forest = make_forest(
+                        self._raw_data,
+                        self.n_neighbors,
+                        self.n_search_trees,
+                        self.leaf_size,
+                        self.rng_state,
+                        current_random_state,
+                        self.n_jobs,
+                        self._angular_trees,
+                    )
+                    self._search_forest = [
+                        convert_tree_format(tree, self._raw_data.shape[0])
+                        for tree in rp_forest
+                    ]
+                else:
+                    self._search_forest = []
             else:
                 # convert the best trees into a search forest
                 tree_scores = [
@@ -1118,22 +1121,26 @@

         # reorder according to the search tree leaf order
         if self.verbose:
             print(ts(), "Resorting data and graph based on tree order")
-        self._vertex_order = self._search_forest[0].indices
-        row_ordered_graph = self._search_graph[self._vertex_order, :].tocsc()
-        self._search_graph = row_ordered_graph[:, self._vertex_order]
-        self._search_graph = self._search_graph.tocsr()
-        self._search_graph.sort_indices()
+        if self.tree_init:
+            self._vertex_order = self._search_forest[0].indices
+            row_ordered_graph = self._search_graph[self._vertex_order, :].tocsc()
+            self._search_graph = row_ordered_graph[:, self._vertex_order]
+            self._search_graph = self._search_graph.tocsr()
+            self._search_graph.sort_indices()

-        if self._is_sparse:
-            self._raw_data = self._raw_data[self._vertex_order, :]
-        else:
-            self._raw_data = np.ascontiguousarray(self._raw_data[self._vertex_order, :])
-
-        tree_order = np.argsort(self._vertex_order)
-        self._search_forest = tuple(
-            resort_tree_indices(tree, tree_order)
-            for tree in self._search_forest[: self.n_search_trees]
-        )
+            if self._is_sparse:
+                self._raw_data = self._raw_data[self._vertex_order, :]
+            else:
+                self._raw_data = np.ascontiguousarray(self._raw_data[self._vertex_order, :])
+
+            tree_order = np.argsort(self._vertex_order)
+            self._search_forest = tuple(
+                resort_tree_indices(tree, tree_order)
+                for tree in self._search_forest[: self.n_search_trees]
+            )
+        else:
+            self._vertex_order = np.arange(self._raw_data.shape[0])

         if self.compressed:
             if self.verbose:
@@ -1149,34 +1156,42 @@
         if self.verbose:
             print(ts(), "Building and compiling search function")

-        tree_hyperplanes = self._search_forest[0].hyperplanes
-        tree_offsets = self._search_forest[0].offsets
-        tree_indices = self._search_forest[0].indices
-        tree_children = self._search_forest[0].children
+        if self.tree_init:
+            tree_hyperplanes = self._search_forest[0].hyperplanes
+            tree_offsets = self._search_forest[0].offsets
+            tree_indices = self._search_forest[0].indices
+            tree_children = self._search_forest[0].children
+
+            @numba.njit(
+                [
+                    numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
+                        numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+                        numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+                    )
+                ],
+                locals={"node": numba.types.uint32, "side": numba.types.boolean},
+            )
+            def tree_search_closure(point, rng_state):
+                node = 0
+                while tree_children[node, 0] > 0:
+                    side = select_side(
+                        tree_hyperplanes[node], tree_offsets[node], point, rng_state
+                    )
+                    if side == 0:
+                        node = tree_children[node, 0]
+                    else:
+                        node = tree_children[node, 1]

-        @numba.njit(
-            [
-                numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
-                    numba.types.Array(numba.types.float32, 1, "C", readonly=True),
-                    numba.types.Array(numba.types.int64, 1, "C", readonly=False),
-                )
-            ],
-            locals={"node": numba.types.uint32, "side": numba.types.boolean},
-        )
-        def tree_search_closure(point, rng_state):
-            node = 0
-            while tree_children[node, 0] > 0:
-                side = select_side(
-                    tree_hyperplanes[node], tree_offsets[node], point, rng_state
-                )
-                if side == 0:
-                    node = tree_children[node, 0]
-                else:
-                    node = tree_children[node, 1]
+                return -tree_children[node]

-            return -tree_children[node]
+            self._tree_search = tree_search_closure
+        else:
+            @numba.njit()
+            def tree_search_closure(point, rng_state):
+                return (0, 0)

-        self._tree_search = tree_search_closure
+            self._tree_search = tree_search_closure
+            tree_indices = np.zeros(1, dtype=np.int64)

         alternative_dot = pynnd_dist.alternative_dot
         alternative_cosine = pynnd_dist.alternative_cosine
@@ -1301,50 +1316,63 @@
             return result

         self._search_function = search_closure
+        self._deheap_function = numba.njit(parallel=self.parallel_batch_queries)(
+            deheap_sort.py_func
+        )
+

         # Force compilation of the search function (hardcoded k, epsilon)
         query_data = self._raw_data[:1]
-        _ = self._search_function(
+        inds, dists, _ = self._search_function(
             query_data, 5, 0.0, self._visited, self.search_rng_state
         )
+        _ = self._deheap_function(inds, dists)

     def _init_sparse_search_function(self):
         if self.verbose:
             print(ts(), "Building and compiling sparse search function")

-        tree_hyperplanes = self._search_forest[0].hyperplanes
-        tree_offsets = self._search_forest[0].offsets
-        tree_indices = self._search_forest[0].indices
-        tree_children = self._search_forest[0].children
+        if self.tree_init:
+            tree_hyperplanes = self._search_forest[0].hyperplanes
+            tree_offsets = self._search_forest[0].offsets
+            tree_indices = self._search_forest[0].indices
+            tree_children = self._search_forest[0].children
+
+            @numba.njit(
+                [
+                    numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
+                        numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+                        numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+                        numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+                    )
+                ],
+                locals={"node": numba.types.uint32, "side": numba.types.boolean},
+            )
+            def sparse_tree_search_closure(point_inds, point_data, rng_state):
+                node = 0
+                while tree_children[node, 0] > 0:
+                    side = sparse_select_side(
+                        tree_hyperplanes[node],
+                        tree_offsets[node],
+                        point_inds,
+                        point_data,
+                        rng_state,
+                    )
+                    if side == 0:
+                        node = tree_children[node, 0]
+                    else:
+                        node = tree_children[node, 1]

-        @numba.njit(
-            [
-                numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
-                    numba.types.Array(numba.types.int32, 1, "C", readonly=True),
-                    numba.types.Array(numba.types.float32, 1, "C", readonly=True),
-                    numba.types.Array(numba.types.int64, 1, "C", readonly=False),
-                )
-            ],
-            locals={"node": numba.types.uint32, "side": numba.types.boolean},
-        )
-        def sparse_tree_search_closure(point_inds, point_data, rng_state):
-            node = 0
-            while tree_children[node, 0] > 0:
-                side = sparse_select_side(
-                    tree_hyperplanes[node],
-                    tree_offsets[node],
-                    point_inds,
-                    point_data,
-                    rng_state,
-                )
-                if side == 0:
-                    node = tree_children[node, 0]
-                else:
-                    node = tree_children[node, 1]
+                return -tree_children[node]

-            return -tree_children[node]
+            self._tree_search = sparse_tree_search_closure
+        else:
+            @numba.njit()
+            def sparse_tree_search_closure(point_inds, point_data, rng_state):
+                return (0, 0)

-        self._tree_search = sparse_tree_search_closure
+            self._tree_search = sparse_tree_search_closure
+            tree_indices = np.zeros(1, dtype=np.int64)

         from pynndescent.distances import alternative_dot, alternative_cosine

@@ -1508,10 +1536,13 @@
             return result

         self._search_function = search_closure
+        self._deheap_function = numba.njit(parallel=self.parallel_batch_queries)(
+            deheap_sort.py_func
+        )

         # Force compilation of the search function (hardcoded k, epsilon)
         query_data = self._raw_data[:1]
-        _ = self._search_function(
+        inds, dists, _ = self._search_function(
             query_data.indices,
             query_data.indptr,
             query_data.data,
@@ -1520,6 +1551,7 @@
             self._visited,
             self.search_rng_state,
         )
+        _ = self._deheap_function(inds, dists)

     @property
     def neighbor_graph(self):
@@ -1600,7 +1632,7 @@
             self._init_search_function()

             query_data = np.asarray(query_data).astype(np.float32, order="C")
-            result = self._search_function(
+            indices, dists, _ = self._search_function(
                 query_data, k, epsilon, self._visited, self.search_rng_state
             )
         else:
@@ -1614,7 +1646,7 @@
             if not query_data.has_sorted_indices:
                 query_data.sort_indices()

-            result = self._search_function(
+            indices, dists, _ = self._search_function(
                 query_data.indices,
                 query_data.indptr,
                 query_data.data,
@@ -1624,7 +1656,7 @@
                 self.search_rng_state,
             )

-        indices, dists = deheap_sort(result)
+        indices, dists = self._deheap_function(indices, dists)

         # Sort to input graph_data order
         indices = self._vertex_order[indices]
@@ -1634,16 +1666,16 @@
         return indices, dists

     def update(self, X):
-        if not hasattr(self, "_search_graph"):
-            self._init_search_graph()
-
         current_random_state = check_random_state(self.random_state)
         rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
             np.int64
         )

         X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C")
-        original_order = np.argsort(self._vertex_order)
+        if hasattr(self, "_vertex_order"):
+            original_order = np.argsort(self._vertex_order)
+        else:
+            original_order = np.ones(self._raw_data.shape[0], dtype=np.bool_)

         if self._is_sparse:
             self._raw_data = sparse_vstack([self._raw_data, X])
@@ -1693,6 +1725,24 @@
                 verbose=self.verbose,
             )

+        # Remove search graph and search function
+        # and rerun prepare if it was run previously
+        if (
+            hasattr(self, "_search_graph") or
+            hasattr(self, "_search_function") or
+            hasattr(self, "_search_forest")
+        ):
+            if hasattr(self, "_search_graph"):
+                del self._search_graph
+
+            if hasattr(self, "_search_forest"):
+                del self._search_forest
+
+            if hasattr(self, "_search_function"):
+                del self._search_function
+
+            self.prepare()
+

 class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
     """PyNNDescentTransformer for fast approximate nearest neighbor transformer.
diff --git a/pynndescent/rp_trees.py b/pynndescent/rp_trees.py
index c35d53272dea674d310abee50651de13aee0232b..81bbeb7b04ebad784c0b04c560be4ef74bb138cb 100644
--- a/pynndescent/rp_trees.py
+++ b/pynndescent/rp_trees.py
@@ -1219,7 +1219,7 @@ def renumbaify_tree(tree):
         "result": numba.float32,
         "i": numba.uint32,
     },
-    cache=True,
+    cache=False,
 )
 def score_tree(tree, neighbor_indices, data, rng_state):
     result = 0.0
@@ -1237,11 +1237,11 @@
     return result / numba.float32(neighbor_indices.shape[0])


-@numba.njit(nogil=True, parallel=True, locals={"node": numba.int32}, cache=True)
+@numba.njit(nogil=True, locals={"node": numba.int32}, cache=False)
 def score_linked_tree(tree, neighbor_indices):
     result = 0.0
     n_nodes = len(tree.children)
-    for i in numba.prange(n_nodes):
+    for i in range(n_nodes):
         node = numba.int32(i)
         left_child = tree.children[node][0]
         right_child = tree.children[node][1]
diff --git a/pynndescent/sparse.py b/pynndescent/sparse.py
index 9451c40a6aaa50534aa21b263ddbb69e5ee8f43d..950b459b840a234965254ac63edcf05b365ee452 100644
--- a/pynndescent/sparse.py
+++ b/pynndescent/sparse.py
@@ -938,7 +938,7 @@ def sparse_symmetric_kl_divergence(ind1, data1, ind2, data2):
     return symmetric_kl_divergence(dense_data1, dense_data2)


-@numba.njit(parallel=True, cache=True)
+@numba.njit(parallel=True, cache=False)
 def diversify(
     indices,
     distances,
@@ -993,7 +993,7 @@ def diversify(
     return indices, distances


-@numba.njit(parallel=True, cache=True)
+@numba.njit(parallel=True, cache=False)
 def diversify_csr(
     graph_indptr,
     graph_indices,
diff --git a/pynndescent/sparse_nndescent.py b/pynndescent/sparse_nndescent.py
index 4e152d9193d23104f200b2dbd847128057e4af9a..0fccb5536ec9224d89493170078462a9976f26a8 100644
--- a/pynndescent/sparse_nndescent.py
+++ b/pynndescent/sparse_nndescent.py
@@ -24,7 +24,7 @@ locale.setlocale(locale.LC_NUMERIC, "C")
 EMPTY_GRAPH = make_heap(1, 1)


-@numba.njit(parallel=True, cache=True)
+@numba.njit(parallel=True, cache=False)
 def generate_leaf_updates(leaf_block, dist_thresholds, inds, indptr, data, dist):
     updates = [[(-1, -1, np.inf)] for i in range(leaf_block.shape[0])]

@@ -122,7 +122,7 @@ def init_random(n_neighbors, inds, indptr, data, heap, dist, rng_state):
     return


-@numba.njit(parallel=True, cache=True)
+@numba.njit(parallel=True, cache=False)
 def generate_graph_updates(
     new_candidate_block, old_candidate_block, dist_thresholds, inds, indptr, data, dist
 ):
@@ -343,4 +343,4 @@ def nn_descent(
         verbose=verbose,
     )

-    return deheap_sort(current_graph)
+    return deheap_sort(current_graph[0], current_graph[1])
diff --git a/pynndescent/tests/test_distances.py b/pynndescent/tests/test_distances.py
index 026de1eb360ef3cb5c730e706e8811846217ca1a..fef484aeff0a01cf790861e09d5b1e0e0142f781 100644
--- a/pynndescent/tests/test_distances.py
+++ b/pynndescent/tests/test_distances.py
@@ -5,6 +5,7 @@ import pynndescent.distances as dist
 import pynndescent.sparse as spdist
 from scipy import stats
 from scipy.sparse import csr_matrix
+from scipy.version import full_version as scipy_full_version
 from sklearn.metrics import pairwise_distances
 from sklearn.neighbors import BallTree
 from sklearn.preprocessing import normalize
@@ -241,9 +242,12 @@ def test_seuclidean(spatial_data):
     )


+@pytest.mark.skipif(
+    scipy_full_version < "1.8", reason="incorrect function in scipy<1.8"
+)
 def test_weighted_minkowski(spatial_data):
     v = np.abs(np.random.randn(spatial_data.shape[1]))
-    dist_matrix = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
+    dist_matrix = pairwise_distances(spatial_data, metric="minkowski", w=v, p=3)
     test_matrix = np.array(
         [
             [
diff --git a/pynndescent/tests/test_pynndescent_.py b/pynndescent/tests/test_pynndescent_.py
index c90cd44521592e884081e89daa27cea2a3e28b18..d980fb64ee2b85d02fe0c05b63e8f38d2c49edf3 100644
--- a/pynndescent/tests/test_pynndescent_.py
+++ b/pynndescent/tests/test_pynndescent_.py
@@ -435,3 +435,65 @@ def test_joblib_dump():

     np.testing.assert_equal(neighbors1, neighbors2)
     np.testing.assert_equal(distances1, distances2)
+
+@pytest.mark.parametrize("metric", ["euclidean", "cosine"])
+def test_update_no_prepare_query_accuracy(nn_data, metric):
+    nnd = NNDescent(nn_data[200:800], metric=metric, n_neighbors=10, random_state=None)
+    nnd.update(nn_data[800:])
+
+    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)
+
+    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
+    true_indices = true_nnd.kneighbors(nn_data[:200], 10, return_distance=False)
+
+    num_correct = 0.0
+    for i in range(true_indices.shape[0]):
+        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+
+    percent_correct = num_correct / (true_indices.shape[0] * 10)
+    assert percent_correct >= 0.95, (
+        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
+    )
+
+@pytest.mark.parametrize("metric", ["euclidean", "cosine"])
+def test_update_w_prepare_query_accuracy(nn_data, metric):
+    nnd = NNDescent(nn_data[200:800], metric=metric, n_neighbors=10, random_state=None, compressed=False)
+    nnd.prepare()
+
+    nnd.update(nn_data[800:])
+    nnd.prepare()
+
+    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)
+
+    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
+    true_indices = true_nnd.kneighbors(nn_data[:200], 10, return_distance=False)
+
+    num_correct = 0.0
+    for i in range(true_indices.shape[0]):
+        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+
+    percent_correct = num_correct / (true_indices.shape[0] * 10)
+    assert percent_correct >= 0.95, (
+        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
+    )
+
+@pytest.mark.parametrize("metric", ["euclidean", "cosine"])
+def test_tree_init_false(nn_data, metric):
+    nnd = NNDescent(
+        nn_data[200:], metric=metric, n_neighbors=10, random_state=None, tree_init=False
+    )
+    nnd.prepare()
+
+    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)
+
+    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
+    true_indices = true_nnd.kneighbors(nn_data[:200], 10, return_distance=False)
+
+    num_correct = 0.0
+    for i in range(true_indices.shape[0]):
+        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+
+    percent_correct = num_correct / (true_indices.shape[0] * 10)
+    assert percent_correct >= 0.95, (
+        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
+    )
diff --git a/pynndescent/utils.py b/pynndescent/utils.py
index 5d372470b2d348ae62f3c94475d73c86969ef673..f72f22a0fe94fdce3a0eac34cb4c8736d4df683c 100644
--- a/pynndescent/utils.py
+++ b/pynndescent/utils.py
@@ -223,48 +223,36 @@ def siftdown(heap1, heap2, elt):
         elt = swap


-@numba.njit(cache=True)
-def deheap_sort(heap):
-    """Given an array of heaps (of graph_indices and weights), unpack the heap
-    out to give and array of sorted lists of graph_indices and weights by increasing
-    weight. This is effectively just the second half of heap sort (the first
-    half not being required since we already have the graph_data in a heap).
+@numba.njit(parallel=True, cache=False)
+def deheap_sort(indices, distances):
+    """Given two arrays representing a heap (indices and distances), reorder the
+    arrays by increasing distance. This is effectively just the second half of
+    heap sort (the first half not being required since we already have the
+    graph_data in a heap).
+
+    Note that this is done in-place.

     Parameters
     ----------
-    heap : array of shape (3, n_samples, n_neighbors)
-        The heap to turn into sorted lists.
+    indices : array of shape (n_samples, n_neighbors)
+        The graph indices to sort by distance.
+    distances : array of shape (n_samples, n_neighbors)
+        The corresponding edge distance.

     Returns
     -------
-    graph_indices, weights: arrays of shape (n_samples, n_neighbors)
-        The graph_indices and weights sorted by increasing weight.
+    indices, distances: arrays of shape (n_samples, n_neighbors)
+        The indices and distances sorted by increasing distance.
     """
-    indices = heap[0]
-    weights = heap[1]
-
-    for i in range(indices.shape[0]):
+    for i in numba.prange(indices.shape[0]):
+        # starting from the end of the array and moving back
+        for j in range(indices.shape[1] - 1, 0, -1):
+            indices[i, 0], indices[i, j] = indices[i, j], indices[i, 0]
+            distances[i, 0], distances[i, j] = distances[i, j], distances[i, 0]

-        ind_heap = indices[i]
-        dist_heap = weights[i]
+            siftdown(distances[i, :j], indices[i, :j], 0)

-        for j in range(ind_heap.shape[0] - 1):
-            ind_heap[0], ind_heap[ind_heap.shape[0] - j - 1] = (
-                ind_heap[ind_heap.shape[0] - j - 1],
-                ind_heap[0],
-            )
-            dist_heap[0], dist_heap[dist_heap.shape[0] - j - 1] = (
-                dist_heap[dist_heap.shape[0] - j - 1],
-                dist_heap[0],
-            )
-
-            siftdown(
-                dist_heap[: dist_heap.shape[0] - j - 1],
-                ind_heap[: ind_heap.shape[0] - j - 1],
-                0,
-            )
-
-    return indices.astype(np.int64), weights
+    return indices, distances


 # @numba.njit()
@@ -306,7 +294,7 @@ def deheap_sort(heap):
 #         return -1


-@numba.njit(parallel=True, locals={"idx": numba.types.int64}, cache=True)
+@numba.njit(parallel=True, locals={"idx": numba.types.int64}, cache=False)
 def new_build_candidates(current_graph, max_candidates, rng_state, n_threads):
     """Build a heap of candidate neighbors for nearest neighbor descent.
     For each vertex the candidate neighbors are any current neighbors, and any
@@ -606,7 +594,7 @@ def checked_flagged_heap_push(priorities, indices, flags, p, n, f):
         "i": numba.uint32,
         "j": numba.uint32,
     },
-    cache=True,
+    cache=False,
 )
 def apply_graph_updates_low_memory(current_graph, updates, n_threads):

@@ -701,7 +689,7 @@ def initalize_heap_from_graph_indices(heap, graph_indices, data, metric):
     return heap


-@numba.njit(parallel=True, cache=True)
+@numba.njit(parallel=True, cache=False)
 def sparse_initalize_heap_from_graph_indices(
     heap, graph_indices, data_indptr, data_indices, data_vals, metric
 ):
diff --git a/setup.py b/setup.py
index 86bea5bf13cb0301efefb84cf5a5ac9005047a4b..46f1c73338ee099fa2f090e5716abf9c63305d98 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ def readme():

 configuration = {
     "name": "pynndescent",
-    "version": "0.5.6",
+    "version": "0.5.7",
     "description": "Nearest Neighbor Descent",
     "long_description": readme(),
     "classifiers": [
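A note on the weighted Minkowski fix in pynndescent/distances.py: the 0.5.6 code computed (sum_i (w_i * |x_i - y_i|)^p)^(1/p), which effectively weights by w_i^p, while the 0.5.7 code computes (sum_i w_i * |x_i - y_i|^p)^(1/p). The latter is the convention scipy uses for minkowski with a w argument from scipy 1.8 onward, which is also why the test above switches from the removed "wminkowski" metric to "minkowski" and gains a version guard. A minimal NumPy sketch of the two formulas, with made-up example values:

import numpy as np

x = np.array([1.0, 2.0, 3.0])
y = np.array([2.0, 4.0, 1.0])
w = np.array([0.5, 1.0, 2.0])  # arbitrary example weights
p = 3

# 0.5.6 behaviour: the weight is raised to the power p as well.
old_style = np.sum((w * np.abs(x - y)) ** p) ** (1.0 / p)

# 0.5.7 behaviour: the weight scales |x - y|**p directly, matching
# scipy.spatial.distance.minkowski(x, y, p=p, w=w) on scipy >= 1.8.
new_style = np.sum(w * np.abs(x - y) ** p) ** (1.0 / p)

print(old_style, new_style)  # the two agree only when every weight is 0 or 1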
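The utils.py hunk is the center of the signature change that threads through graph_utils.py, pynndescent_.py, and sparse_nndescent.py: deheap_sort no longer takes a packed (3, n_samples, n_neighbors) heap and returns fresh arrays; it takes the index and distance arrays directly and heap-sorts each row in place, with the row loop now a numba.prange. The sketch below mirrors that logic in plain NumPy. The siftdown is a simplified stand-in for pynndescent's own max-heap siftdown, and max_heapify is a hypothetical helper used only to set up valid per-row max-heaps for the demo (the library's neighbor heaps are already in that form):

import numpy as np

def siftdown(dist_heap, ind_heap, elt):
    # Restore the max-heap property for the subtree rooted at position elt.
    while elt * 2 + 1 < dist_heap.shape[0]:
        left, right = elt * 2 + 1, elt * 2 + 2
        swap = elt
        if dist_heap[swap] < dist_heap[left]:
            swap = left
        if right < dist_heap.shape[0] and dist_heap[swap] < dist_heap[right]:
            swap = right
        if swap == elt:
            break
        dist_heap[elt], dist_heap[swap] = dist_heap[swap], dist_heap[elt]
        ind_heap[elt], ind_heap[swap] = ind_heap[swap], ind_heap[elt]
        elt = swap

def max_heapify(distances, indices):
    # Hypothetical setup helper: arrange each row as a max-heap on distance.
    for i in range(distances.shape[0]):
        for elt in range(distances.shape[1] // 2 - 1, -1, -1):
            siftdown(distances[i], indices[i], elt)

def deheap_sort(indices, distances):
    # Second half of heap sort, in place: swap the row maximum (the heap
    # root) to the end of the active region, then sift the new root down.
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1] - 1, 0, -1):
            indices[i, 0], indices[i, j] = indices[i, j], indices[i, 0]
            distances[i, 0], distances[i, j] = distances[i, j], distances[i, 0]
            siftdown(distances[i, :j], indices[i, :j], 0)
    return indices, distances

dists = np.random.random((3, 7)).astype(np.float32)
inds = np.tile(np.arange(7), (3, 1))
max_heapify(dists, inds)
deheap_sort(inds, dists)
assert all(np.all(np.diff(row) >= 0) for row in dists)  # rows now ascending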
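Both _init_search_function variants now re-jit deheap_sort through its .py_func attribute, so the parallel flag can follow self.parallel_batch_queries at prepare() time instead of being fixed when the module is imported. A small self-contained sketch of that numba pattern; row_sums is an invented stand-in, not a library function:

import numba
import numpy as np

@numba.njit(cache=False)
def row_sums(x):
    # numba.prange degrades to a plain range when parallel=False (the default).
    out = np.zeros(x.shape[0])
    for i in numba.prange(x.shape[0]):
        out[i] = x[i].sum()
    return out

# A numba dispatcher keeps the original Python function on .py_func, so the
# same code can be compiled a second time with different options.
parallel_row_sums = numba.njit(parallel=True)(row_sums.py_func)

x = np.random.random((4, 8))
assert np.allclose(row_sums(x), parallel_row_sums(x))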
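Finally, the NNDescent changes enable the two workflows exercised by the new tests: preparing and querying an index built with tree_init=False (a stub tree search and an identity vertex order are substituted), and calling update() on an index that had already been prepared, which now discards the stale search graph, search forest, and search function and reruns prepare(). A minimal usage sketch along the lines of those tests, with arbitrary data sizes:

import numpy as np
from pynndescent import NNDescent

data = np.random.random((2000, 20)).astype(np.float32)

# Query an index that was built without random projection trees.
index = NNDescent(data[:1500], n_neighbors=10, tree_init=False)
index.prepare()
neighbors, distances = index.query(data[1500:1505], k=10)

# Update a previously prepared index; prepare() is rerun automatically.
index2 = NNDescent(data[:1500], n_neighbors=10)
index2.prepare()
index2.update(data[1500:])
neighbors2, distances2 = index2.query(data[:5], k=10)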