# Provenance: extracted from a two-part patch series against
# hdbscan/_hdbscan_reachability.pyx, subject "update: speed up func
# sparse_mutual_reachability 20+ times" (commits 8d2414cd6a39 and
# 8568381cfdb6, 28 Nov 2018).  The second patch only added the
# ``csr_matrix`` import required by ``sparse_mutual_reachability_new``.
import numpy as np


def _sparse_core_distances(lil_mat, min_points):
    """Return the core distance of every row of a LIL sparse matrix.

    Parameters
    ----------
    lil_mat : scipy.sparse.lil_matrix
        Sparse matrix whose ``data`` attribute holds one list of stored
        values per row.
    min_points : int
        Index into the ascending-sorted stored values of a row that is
        taken as that row's core distance.

    Returns
    -------
    numpy.ndarray
        1-D ``np.double`` array with one entry per row.  Rows that store
        ``min_points`` values or fewer get ``np.inf``.
    """
    # Start from "unreachable" and only overwrite rows with enough entries.
    core_distance = np.full(lil_mat.shape[0], np.inf, dtype=np.double)
    for i, row_values in enumerate(lil_mat.data):
        if min_points < len(row_values):
            core_distance[i] = sorted(row_values)[min_points]
    return core_distance


def sparse_mutual_reachability_old(lil_matrix, min_points=5):
    """Reference (slow) mutual reachability for a sparse distance matrix.

    Kept for performance comparison with
    ``sparse_mutual_reachability_new``; it visits every stored entry in a
    Python loop.

    Parameters
    ----------
    lil_matrix : scipy.sparse.lil_matrix
        Sparse matrix of pairwise distances.
    min_points : int, optional (default=5)
        Index of the sorted per-row value used as the core distance.

    Returns
    -------
    scipy.sparse.csr_matrix
        For every non-zero entry ``(i, j)`` of the input, the maximum of
        the two core distances and the stored distance.  Entries whose
        result is not finite are left out.
    """
    # Local import under an alias: the parameter is itself named lil_matrix.
    from scipy.sparse import lil_matrix as sparse_matrix

    result = sparse_matrix(lil_matrix.shape)
    core_distance = _sparse_core_distances(lil_matrix, min_points)

    nz_rows, nz_cols = lil_matrix.nonzero()
    for i, j in zip(nz_rows, nz_cols):
        mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j])
        # An infinite value means one endpoint has no core distance; skip it.
        if np.isfinite(mr_dist):
            result[i, j] = mr_dist

    return result.tocsr()


def sparse_mutual_reachability_new(lil_mat, min_points=5):
    """Vectorised mutual reachability for a sparse distance matrix.

    Computes the same matrix as ``sparse_mutual_reachability_old`` but
    evaluates all stored entries with NumPy array operations instead of
    a per-entry Python loop.

    Parameters
    ----------
    lil_mat : scipy.sparse.lil_matrix
        Sparse matrix of pairwise distances.
    min_points : int, optional (default=5)
        Index of the sorted per-row value used as the core distance.

    Returns
    -------
    scipy.sparse.csr_matrix
        For every non-zero entry ``(i, j)`` of the input, the maximum of
        the two core distances and the stored distance.  Entries whose
        result is not finite are left out, matching the reference
        implementation.
    """
    from scipy.sparse import csr_matrix

    core_distance = _sparse_core_distances(lil_mat, min_points)

    # Work on a converted copy so the caller's matrix is never modified.
    csr_mat = lil_mat.tocsr()
    csr_mat.eliminate_zeros()
    # Take row, column and value from a single COO view so the three
    # arrays are aligned by construction.
    coo = csr_mat.tocoo()
    row, col, data = coo.row, coo.col, coo.data

    mr_dist = np.maximum(
        np.maximum(core_distance[row], core_distance[col]), data
    )

    # Drop pairs involving a point without a core distance (infinite result).
    keep = np.isfinite(mr_dist)
    return csr_matrix(
        (mr_dist[keep], (row[keep], col[keep])), shape=lil_mat.shape
    )