-
-
Notifications
You must be signed in to change notification settings - Fork 342
Join multiple dataset #272
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Prathameshk2024
wants to merge
11
commits into
TheAlgorithms:master
from
Prathameshk2024:join_multiple_dataset
Closed
Changes from 10 commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
8b30acf
bidirectional_bfs
Prathameshk2024 7a50116
feat-graph_colouring
Prathameshk2024 c798337
feat_dinics_algo
Prathameshk2024 f375079
feat_wildcard_pattern
Prathameshk2024 8ae6d19
feat_optics_algo
Prathameshk2024 7c6b4db
feat_join_multiple_dataset
Prathameshk2024 678a09b
Merge branch 'master' into join_multiple_dataset
Prathameshk2024 eef9c9e
Update data_manipulation/join_multiple_datasets.r
Prathameshk2024 66d17c3
Update data_manipulation/join_multiple_datasets.r
Prathameshk2024 ff4d994
Update data_manipulation/join_multiple_datasets.r
Prathameshk2024 d9c501b
Fix: Remove accidentally committed git log file and improve bidirecti…
Prathameshk2024 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,334 @@ | ||
| # OPTICS (Ordering Points To Identify the Clustering Structure) | ||
| # | ||
| # OPTICS is a density-based clustering algorithm that extends DBSCAN by creating | ||
| # an augmented ordering of the database representing its density-based clustering | ||
| # structure. Unlike DBSCAN, it produces a reachability plot that can be used to | ||
| # extract clusters at different density thresholds. | ||
| # | ||
| # Time Complexity: O(n^2) for distance matrix, O(n log n) with spatial indexing | ||
| # Space Complexity: O(n^2) for distance matrix | ||
| # | ||
| # Input: data matrix/dataframe, eps (maximum radius), minPts (minimum neighbors) | ||
| # Output: ordered points with reachability distances and core distances | ||
|
|
||
| optics <- function(data, eps = 0.5, minPts = 5) { | ||
| # Convert to matrix | ||
| if (is.data.frame(data)) { | ||
| data <- as.matrix(data) | ||
| } | ||
|
|
||
| n <- nrow(data) | ||
|
|
||
| # Calculate pairwise distances | ||
| dist_matrix <- as.matrix(dist(data, method = "euclidean")) | ||
|
|
||
| # Initialize result vectors | ||
| reachability <- rep(Inf, n) | ||
| core_dist <- rep(Inf, n) | ||
| processed <- rep(FALSE, n) | ||
| ordered_points <- integer(0) | ||
| cluster_id <- rep(0, n) | ||
|
|
||
| # Calculate core distance for each point | ||
| for (i in 1:n) { | ||
| neighbors <- which(dist_matrix[i, ] <= eps) | ||
| neighbors <- neighbors[neighbors != i] | ||
|
|
||
| if (length(neighbors) >= minPts) { | ||
| neighbor_dists <- sort(dist_matrix[i, neighbors]) | ||
| core_dist[i] <- neighbor_dists[minPts] | ||
| } | ||
| } | ||
|
|
||
| # Priority queue implementation using simple sorting | ||
| update_seeds <- function(seeds, neighbors, center_idx) { | ||
| c_dist <- core_dist[center_idx] | ||
|
|
||
| for (neighbor in neighbors) { | ||
| if (processed[neighbor]) next | ||
|
|
||
| new_reach_dist <- max(c_dist, dist_matrix[center_idx, neighbor]) | ||
|
|
||
| if (reachability[neighbor] == Inf) { | ||
| reachability[neighbor] <<- new_reach_dist | ||
| seeds <- c(seeds, neighbor) | ||
| } else if (new_reach_dist < reachability[neighbor]) { | ||
| reachability[neighbor] <<- new_reach_dist | ||
| } | ||
| } | ||
|
|
||
| # Sort seeds by reachability distance | ||
| if (length(seeds) > 0) { | ||
| seeds <- seeds[order(reachability[seeds])] | ||
| } | ||
|
|
||
| return(seeds) | ||
| } | ||
|
|
||
| # Main OPTICS algorithm | ||
| for (i in 1:n) { | ||
| if (processed[i]) next | ||
|
|
||
| # Get neighbors within eps | ||
| neighbors <- which(dist_matrix[i, ] <= eps) | ||
| neighbors <- neighbors[neighbors != i] | ||
|
|
||
| processed[i] <- TRUE | ||
| ordered_points <- c(ordered_points, i) | ||
|
|
||
| if (core_dist[i] != Inf) { | ||
| seeds <- integer(0) | ||
| seeds <- update_seeds(seeds, neighbors, i) | ||
|
|
||
| while (length(seeds) > 0) { | ||
| current <- seeds[1] | ||
| seeds <- seeds[-1] | ||
|
|
||
| if (processed[current]) next | ||
|
|
||
| neighbors_current <- which(dist_matrix[current, ] <= eps) | ||
| neighbors_current <- neighbors_current[neighbors_current != current] | ||
|
|
||
| processed[current] <- TRUE | ||
| ordered_points <- c(ordered_points, current) | ||
|
|
||
| if (core_dist[current] != Inf) { | ||
| seeds <- update_seeds(seeds, neighbors_current, current) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return(list( | ||
| order = ordered_points, | ||
| reachability = reachability[ordered_points], | ||
| core_distance = core_dist[ordered_points], | ||
| eps = eps, | ||
| minPts = minPts | ||
| )) | ||
| } | ||
|
|
||
| # Extract DBSCAN-like clusters from OPTICS result | ||
| extract_dbscan_clusters <- function(optics_result, eps_cluster) { | ||
| n <- length(optics_result$order) | ||
| cluster_id <- rep(0, n) | ||
| current_cluster <- 0 | ||
|
|
||
| for (i in 1:n) { | ||
| if (optics_result$reachability[i] > eps_cluster) { | ||
| if (optics_result$core_distance[i] <= eps_cluster) { | ||
| current_cluster <- current_cluster + 1 | ||
| cluster_id[i] <- current_cluster | ||
| } else { | ||
| cluster_id[i] <- 0 | ||
| } | ||
| } else { | ||
| if (current_cluster == 0) { | ||
| current_cluster <- current_cluster + 1 | ||
| } | ||
| cluster_id[i] <- current_cluster | ||
| } | ||
| } | ||
|
|
||
| # Reorder to original indices | ||
| result <- rep(0, n) | ||
| result[optics_result$order] <- cluster_id | ||
|
|
||
| return(result) | ||
| } | ||
|
|
||
| # Extract clusters using xi method (steep areas in reachability plot) | ||
| extract_xi_clusters <- function(optics_result, xi = 0.05, min_cluster_size = 5) { | ||
| n <- length(optics_result$order) | ||
| reach <- optics_result$reachability | ||
|
|
||
| # Replace Inf with max finite value | ||
| max_reach <- max(reach[is.finite(reach)]) | ||
| reach[is.infinite(reach)] <- max_reach * 2 | ||
|
|
||
| # Find steep up and down areas | ||
| steep_up <- c() | ||
| steep_down <- c() | ||
|
|
||
| for (i in 2:n) { | ||
| if (reach[i] > reach[i-1] * (1 + xi)) { | ||
| steep_up <- c(steep_up, i) | ||
| } | ||
| if (reach[i] < reach[i-1] * (1 - xi)) { | ||
| steep_down <- c(steep_down, i) | ||
| } | ||
| } | ||
|
|
||
| # Match steep areas to form clusters | ||
| cluster_id <- rep(0, n) | ||
| current_cluster <- 0 | ||
|
|
||
| if (length(steep_down) > 0 && length(steep_up) > 0) { | ||
| for (sd in steep_down) { | ||
| matching_up <- steep_up[steep_up > sd] | ||
| if (length(matching_up) > 0) { | ||
| su <- matching_up[1] | ||
| if (su - sd >= min_cluster_size) { | ||
| current_cluster <- current_cluster + 1 | ||
| cluster_id[sd:su] <- current_cluster | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| # Reorder to original indices | ||
| result <- rep(0, n) | ||
| result[optics_result$order] <- cluster_id | ||
|
|
||
| return(result) | ||
| } | ||
|
|
||
| # Plot reachability plot | ||
| plot_reachability <- function(optics_result, main = "OPTICS Reachability Plot") { | ||
| reach <- optics_result$reachability | ||
| max_reach <- max(reach[is.finite(reach)]) | ||
| reach[is.infinite(reach)] <- max_reach * 1.2 | ||
|
|
||
| plot(1:length(reach), reach, type = "h", lwd = 2, | ||
| xlab = "Cluster Order", ylab = "Reachability Distance", | ||
| main = main, col = "steelblue") | ||
| grid() | ||
| } | ||
|
|
||
| # Calculate silhouette score for clusters | ||
| calculate_silhouette <- function(data, clusters) { | ||
| if (is.data.frame(data)) { | ||
| data <- as.matrix(data) | ||
| } | ||
|
|
||
| unique_clusters <- unique(clusters[clusters != 0]) | ||
| if (length(unique_clusters) < 2) { | ||
| return(NA) | ||
| } | ||
|
|
||
| dist_matrix <- as.matrix(dist(data)) | ||
| n <- nrow(data) | ||
| silhouette_scores <- rep(0, n) | ||
|
|
||
| for (i in 1:n) { | ||
| if (clusters[i] == 0) next | ||
|
|
||
| same_cluster <- which(clusters == clusters[i]) | ||
| same_cluster <- same_cluster[same_cluster != i] | ||
|
|
||
| if (length(same_cluster) == 0) { | ||
| silhouette_scores[i] <- 0 | ||
| next | ||
| } | ||
|
|
||
| a <- mean(dist_matrix[i, same_cluster]) | ||
|
|
||
| b <- Inf | ||
| for (other_cluster in unique_clusters) { | ||
| if (other_cluster == clusters[i]) next | ||
| other_points <- which(clusters == other_cluster) | ||
| if (length(other_points) > 0) { | ||
| b <- min(b, mean(dist_matrix[i, other_points])) | ||
| } | ||
| } | ||
|
|
||
| silhouette_scores[i] <- (b - a) / max(a, b) | ||
| } | ||
|
|
||
| return(mean(silhouette_scores[clusters != 0])) | ||
| } | ||
|
|
||
| # Example usage and tests | ||
| cat("=== OPTICS Clustering Algorithm ===\n\n") | ||
|
|
||
| # Example 1: Simple 2D dataset | ||
| cat("Example 1: Simple 2D Dataset\n") | ||
| set.seed(42) | ||
| cluster1 <- matrix(rnorm(100, mean = 0, sd = 0.5), ncol = 2) | ||
| cluster2 <- matrix(rnorm(100, mean = 3, sd = 0.5), ncol = 2) | ||
| cluster3 <- matrix(rnorm(100, mean = c(1.5, 3), sd = 0.5), ncol = 2) | ||
| data1 <- rbind(cluster1, cluster2, cluster3) | ||
|
|
||
| cat("Running OPTICS...\n") | ||
| result1 <- optics(data1, eps = 1.0, minPts = 5) | ||
| cat("Processed", length(result1$order), "points\n") | ||
| cat("First 10 reachability distances:", head(result1$reachability, 10), "\n") | ||
|
|
||
| # Extract clusters | ||
| clusters1 <- extract_dbscan_clusters(result1, eps_cluster = 0.8) | ||
| cat("Clusters found:", length(unique(clusters1[clusters1 != 0])), "\n") | ||
| cat("Noise points:", sum(clusters1 == 0), "\n") | ||
|
|
||
| if (length(unique(clusters1[clusters1 != 0])) >= 2) { | ||
| silhouette1 <- calculate_silhouette(data1, clusters1) | ||
| cat("Silhouette score:", round(silhouette1, 4), "\n") | ||
| } | ||
|
|
||
| # Example 2: Dataset with varying densities | ||
| cat("\n\nExample 2: Varying Density Dataset\n") | ||
| set.seed(123) | ||
| dense_cluster <- matrix(rnorm(200, mean = 0, sd = 0.3), ncol = 2) | ||
| sparse_cluster <- matrix(rnorm(100, mean = 4, sd = 1.0), ncol = 2) | ||
| data2 <- rbind(dense_cluster, sparse_cluster) | ||
|
|
||
| result2 <- optics(data2, eps = 2.0, minPts = 5) | ||
| cat("Processed", length(result2$order), "points\n") | ||
|
|
||
| clusters2_dbscan <- extract_dbscan_clusters(result2, eps_cluster = 1.0) | ||
| cat("DBSCAN-style extraction - Clusters:", length(unique(clusters2_dbscan[clusters2_dbscan != 0])), "\n") | ||
|
|
||
| clusters2_xi <- extract_xi_clusters(result2, xi = 0.05, min_cluster_size = 10) | ||
| cat("Xi-method extraction - Clusters:", length(unique(clusters2_xi[clusters2_xi != 0])), "\n") | ||
|
|
||
| # Example 3: Iris dataset | ||
| cat("\n\nExample 3: Iris Dataset\n") | ||
| data(iris) | ||
| iris_data <- iris[, 1:4] | ||
|
|
||
| result3 <- optics(iris_data, eps = 2.0, minPts = 5) | ||
| cat("Processed", length(result3$order), "points\n") | ||
|
|
||
| clusters3 <- extract_dbscan_clusters(result3, eps_cluster = 0.5) | ||
| cat("Clusters found:", length(unique(clusters3[clusters3 != 0])), "\n") | ||
| cat("Noise points:", sum(clusters3 == 0), "\n") | ||
|
|
||
| # Compare with actual species | ||
| if (length(unique(clusters3[clusters3 != 0])) >= 2) { | ||
| silhouette3 <- calculate_silhouette(iris_data, clusters3) | ||
| cat("Silhouette score:", round(silhouette3, 4), "\n") | ||
| } | ||
|
|
||
| # Example 4: Moons dataset (non-convex clusters) | ||
| cat("\n\nExample 4: Two Moons Dataset (Non-convex)\n") | ||
| set.seed(456) | ||
| n_points <- 100 | ||
| t1 <- seq(0, pi, length.out = n_points) | ||
| moon1 <- cbind(cos(t1), sin(t1)) + matrix(rnorm(n_points * 2, sd = 0.1), ncol = 2) | ||
| moon2 <- cbind(1 - cos(t1), 1 - sin(t1) - 0.5) + matrix(rnorm(n_points * 2, sd = 0.1), ncol = 2) | ||
| moons_data <- rbind(moon1, moon2) | ||
|
|
||
| result4 <- optics(moons_data, eps = 0.5, minPts = 5) | ||
| clusters4 <- extract_dbscan_clusters(result4, eps_cluster = 0.3) | ||
| cat("Clusters found:", length(unique(clusters4[clusters4 != 0])), "\n") | ||
| cat("Noise points:", sum(clusters4 == 0), "\n") | ||
|
|
||
| if (length(unique(clusters4[clusters4 != 0])) >= 2) { | ||
| silhouette4 <- calculate_silhouette(moons_data, clusters4) | ||
| cat("Silhouette score:", round(silhouette4, 4), "\n") | ||
| } | ||
|
|
||
| # Example 5: Performance comparison | ||
| cat("\n\nExample 5: Performance Test\n") | ||
| sizes <- c(100, 200, 500) | ||
| cat("Dataset size vs execution time:\n") | ||
|
|
||
| for (size in sizes) { | ||
| test_data <- matrix(rnorm(size * 2), ncol = 2) | ||
| start_time <- Sys.time() | ||
| test_result <- optics(test_data, eps = 1.0, minPts = 5) | ||
| end_time <- Sys.time() | ||
| elapsed <- as.numeric(difftime(end_time, start_time, units = "secs")) | ||
| cat(sprintf(" %d points: %.3f seconds\n", size, elapsed)) | ||
| } | ||
|
|
||
| cat("\n=== All tests completed successfully ===\n") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| # join_multiple_datasets.r | ||
| # | ||
| # Algorithm Description: | ||
| # This script provides a function to join multiple datasets (data frames or CSV file paths) | ||
| # by their common columns using inner joins. The function reads input datasets, removes | ||
| # empty or invalid ones, and then sequentially merges them on their shared columns. | ||
| # The merging is performed using dplyr's inner_join, and missing values are replaced with | ||
| # empty strings. The algorithm iteratively joins datasets, so its complexity is O(n * m), | ||
| # where n is the number of datasets and m is the average number of rows in each dataset. | ||
| # | ||
| # Example usage: | ||
| # result <- join_multiple_datasets(list("data1.csv", "data2.csv", df3)) | ||
| # head(result) | ||
| # | ||
|
|
||
| #' Join Multiple Datasets by Common Columns | ||
| #' | ||
| #' This function takes a list of data frames or CSV file paths and joins them sequentially | ||
| #' on their common columns using inner joins. It reads CSV files if paths are provided, | ||
| #' removes empty or invalid datasets, and merges the remaining datasets. Missing values | ||
| #' in the result are replaced with empty strings. | ||
| #' | ||
| #' @param inputs A list of data frames and/or character strings representing CSV file paths. | ||
| #' @return A data frame resulting from the inner join of all valid input datasets on their common columns. | ||
| #' @examples | ||
| #' # Example 1: Joining three data frames | ||
| #' df1 <- data.frame(id = 1:3, val1 = c("A", "B", "C")) | ||
| #' df2 <- data.frame(id = 2:3, val2 = c("X", "Y")) | ||
| #' df3 <- data.frame(id = 3, val3 = "Z") | ||
| #' result <- join_multiple_datasets(list(df1, df2, df3)) | ||
| #' print(result) | ||
| #' | ||
| #' # Example 2: Joining CSV files and a data frame | ||
| #' result <- join_multiple_datasets(list("file1.csv", "file2.csv", df3)) | ||
| #' head(result) | ||
| library(dplyr) | ||
| library(purrr) | ||
|
|
||
| join_multiple_datasets <- function(inputs) { | ||
| if (is.character(x)) read.csv(x, stringsAsFactors = FALSE) else x | ||
| }) | ||
|
|
||
| # Remove empty or invalid datasets | ||
| datasets <- datasets[!sapply(datasets, function(df) is.null(df) || nrow(df) == 0)] | ||
| if (length(datasets) < 2) stop(sprintf("At least two valid datasets are required, but only %d valid dataset(s) found.", length(datasets))) | ||
|
|
||
| # Function to find common columns between two datasets | ||
| get_common_cols <- function(df1, df2) intersect(names(df1), names(df2)) | ||
|
|
||
| # Sequentially join all datasets on their common columns | ||
| merged <- reduce(datasets, function(df1, df2) { | ||
| common_cols <- get_common_cols(df1, df2) | ||
| if (length(common_cols) == 0) stop("No common columns found for joining.") | ||
| suppressWarnings( | ||
| inner_join(df1, df2, by = common_cols) %>% | ||
| mutate_all(~ ifelse(is.na(.), "", .)) | ||
| ) | ||
| }) | ||
|
|
||
| return(merged) | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.