diff --git a/.github/workflows/check_on_branch.yml b/.github/workflows/check_on_branch.yml index 2f09f44..bc635a0 100644 --- a/.github/workflows/check_on_branch.yml +++ b/.github/workflows/check_on_branch.yml @@ -17,4 +17,4 @@ jobs: permissions: contents: read steps: - - uses: inbo/actions/check_pkg@checklist-0.3.6 + - uses: inbo/actions/check_pkg@main diff --git a/.github/workflows/check_on_main.yml b/.github/workflows/check_on_main.yml index d5a137b..6e8268d 100644 --- a/.github/workflows/check_on_main.yml +++ b/.github/workflows/check_on_main.yml @@ -16,4 +16,4 @@ jobs: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: inbo/actions/check_pkg@checklist-0.3.6 + - uses: inbo/actions/check_pkg@main diff --git a/.zenodo.json b/.zenodo.json index fcf7c32..3ff4445 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -1,9 +1,9 @@ { "title": "territoria: Clustering Observations from Breeding Birds into Territoria", - "version": "0.0.3", + "version": "0.1.0", "license": "GPL-3.0", "upload_type": "software", - "description": "
Clusters individual observations based on breeding indication and\ndistance between observations.<\/p>", + "description": "
Clusters individual observations based on breeding indication and distance between observations.<\/p>", "keywords": [ "breeding bird", "cluster" @@ -15,11 +15,11 @@ "name": "Onkelinx, Thierry", "affiliation": "Research Institute for Nature and Forest (INBO)", "orcid": "0000-0001-8804-4216", - "type": "ContactPerson" + "type": "contactperson" }, { "name": "Research Institute for Nature and Forest (INBO)", - "type": "RightsHolder" + "type": "rightsholder" } ], "creators": [ diff --git a/CITATION.cff b/CITATION.cff index 2925793..398e4b1 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,4 +20,4 @@ repository-code: https://github.com/inbo/territoria/ type: software abstract: "Clusters individual observations based on breeding indication and distance between observations." -version: 0.0.3 +version: 0.1.0 diff --git a/DESCRIPTION b/DESCRIPTION index e5cbdb6..828cbf9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: territoria Title: Clustering Observations from Breeding Birds into Territoria -Version: 0.0.3 +Version: 0.1.0 Authors@R: c( person("Thierry", "Onkelinx", , "thierry.onkelinx@inbo.be", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-8804-4216", affiliation = "Research Institute for Nature and Forest (INBO)")), @@ -12,10 +12,17 @@ Description: Clusters individual observations based on breeding indication License: GPL-3 URL: https://github.com/inbo/territoria BugReports: https://github.com/inbo/territoria/issues +Depends: + R (>= 4.1.0) Imports: assertthat, + deldir, + dplyr, + igraph, mvtnorm, + rlang, RSQLite, + sf, spatstat.geom, spatstat.random Config/checklist/communities: inbo @@ -23,4 +30,4 @@ Config/checklist/keywords: breeding bird; cluster Encoding: UTF-8 Language: en-GB Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 diff --git a/LICENSE.md b/LICENSE.md index 2fb2e74..379c1b2 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,4 @@ -### GNU GENERAL PUBLIC LICENSE +# GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 diff --git a/NAMESPACE b/NAMESPACE index 78aa76c..f9dce5c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,9 +3,16 @@ export(cluster_observation) export(connect_db) export(distance_matrix) +export(edge_distribution) export(get_cluster) +export(get_total) +export(get_unlikely_observation) +export(get_unlikely_summary) export(import_observations) export(simulate_observations) +export(unlikely_edge_distribution) +export(unlikely_status) +export(unlikely_survey_area) importFrom(RSQLite,SQLite) importFrom(RSQLite,dbClearResult) importFrom(RSQLite,dbConnect) @@ -20,7 +27,28 @@ importFrom(assertthat,is.flag) importFrom(assertthat,is.number) importFrom(assertthat,is.string) importFrom(assertthat,noNA) +importFrom(deldir,deldir) +importFrom(dplyr,filter) +importFrom(dplyr,group_by) +importFrom(dplyr,mutate) +importFrom(dplyr,summarise) +importFrom(dplyr,transmute) +importFrom(igraph,V) +importFrom(igraph,decompose) +importFrom(igraph,graph_from_data_frame) importFrom(mvtnorm,rmvnorm) +importFrom(rlang,.data) +importFrom(sf,st_area) +importFrom(sf,st_as_sf) +importFrom(sf,st_buffer) +importFrom(sf,st_concave_hull) +importFrom(sf,st_drop_geometry) +importFrom(sf,st_intersection) +importFrom(sf,st_union) importFrom(spatstat.geom,owin) importFrom(spatstat.random,rStrauss) +importFrom(stats,aggregate) +importFrom(stats,binom.test) importFrom(stats,rbinom) +importFrom(stats,rnorm) +importFrom(stats,wilcox.test) diff --git a/NEWS.md b/NEWS.md index 44ce1af..ba80664 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,24 @@ +# 
territoria 0.1.0
+
+* `import_observations()` splits surveys with clearly distinct groups of
+  observations into multiple surveys.
+  This is done by clustering the observations and checking whether the
+  clusters are too far apart geographically.
+  It now also requires a user and region id.
+* `edge_distribution()` returns the distribution of the lengths of the edges
+  of a Delaunay triangulation of the observations per survey.
+* Add `unlikely_survey_area()`, `unlikely_status()` and
+  `unlikely_edge_distribution()` to detect surveys which were unlikely carried
+  out according to the sampling protocol.
+* `distance_matrix()` ignores observations from unlikely surveys.
+  Hence `cluster_observation()` ignores them during the clustering and
+  `get_cluster()` omits them from its output.
+* `get_total()` returns the total number of individuals per region based on
+  the clustering.
+* `get_unlikely_summary()` and `get_unlikely_observation()` return the
+  unlikely surveys and observations.
+* Update [`checklist`](https://inbo.github.io/checklist/) machinery.
+
 # territoria 0.0.3
 
 * Update [`checklist`](https://inbo.github.io/checklist/) machinery.
diff --git a/R/distance_matrix.R b/R/distance_matrix.R
index d28756c..180dc9c 100644
--- a/R/distance_matrix.R
+++ b/R/distance_matrix.R
@@ -27,9 +27,11 @@ ON distance (distance)"
   sql <- sprintf(
     "WITH cte_obs AS (
   SELECT
-    id, x, y, group_x, group_y, group_x + 1 AS group_xp,
-    group_y + 1 AS group_yp, group_y - 1 AS group_ym
-  FROM observation
+    o.id, o.x, o.y, o.group_x, o.group_y, o.group_x + 1 AS group_xp,
+    o.group_y + 1 AS group_yp, o.group_y - 1 AS group_ym
+  FROM observation AS o
+  LEFT JOIN unlikely AS u ON o.survey = u.survey
+  WHERE u.value IS NULL
 ),
 cte_distance AS (
   SELECT
diff --git a/R/edge_distribution.R b/R/edge_distribution.R
new file mode 100644
index 0000000..1c92636
--- /dev/null
+++ b/R/edge_distribution.R
@@ -0,0 +1,40 @@
+#' @title Edge distribution
+#' @description
+#' This function returns the length of the edges of the Delaunay triangulation
+#' per survey.
+#' The function returns only edges no longer than twice `max_dist`.
+#' @inheritParams import_observations
+#' @importFrom assertthat assert_that is.number noNA
+#' @importFrom deldir deldir
+#' @importFrom RSQLite dbGetQuery
+#' @importFrom stats rnorm
+#' @export
+edge_distribution <- function(conn, max_dist = 336) {
+  assert_that(
+    inherits(conn, "SQLiteConnection"),
+    is.number(max_dist), noNA(max_dist), max_dist > 0
+  )
+  "SELECT id, survey, x, y FROM observation" |>
+    dbGetQuery(conn = conn) -> observations
+  table(observations$survey) |>
+    as.data.frame() -> n_survey
+  n_survey[n_survey$Freq > 1, ] |>
+    merge(x = observations, by.y = "Var1", by.x = "survey") -> observations
+  edges <- data.frame(survey = integer(0), length = numeric(0))
+  for (i in unique(observations$survey)) {
+    candidate <- which(observations$survey == i)
+    # jitter the coordinates slightly so deldir() never sees duplicated points
+    dd <- deldir(
+      x = rnorm(length(candidate), mean = observations$x[candidate], sd = 0.01),
+      y = rnorm(length(candidate), mean = observations$y[candidate], sd = 0.01),
+      id = observations$id[candidate]
+    )
+    extra <- data.frame(
+      survey = i,
+      length = sqrt(
+        (dd$delsgs$x2 - dd$delsgs$x1) ^ 2 + (dd$delsgs$y2 - dd$delsgs$y1) ^ 2
+      )
+    )
+    extra[extra$length <= 2 * max_dist, ] |>
+      rbind(edges) -> edges
+  }
+  return(edges)
+}
diff --git a/R/get_cluster.R b/R/get_cluster.R
index 67c641a..95049be 100644
--- a/R/get_cluster.R
+++ b/R/get_cluster.R
@@ -9,14 +9,19 @@ get_cluster <- function(conn) {
     "observation" %in% dbListTables(conn, "observation"),
     msg = "No observations found. 
Did you run `import_observations()`?"
   )
-  obs <- dbGetQuery(
-    conn, "SELECT id, x, y, survey, status, cluster FROM observation"
-  )
+  "SELECT
+o.id, o.x, o.y, o.survey, o.status, o.cluster
+FROM observation AS o
+LEFT JOIN unlikely AS u ON o.survey = u.survey
+WHERE u.value IS NULL" |>
+    dbGetQuery(conn = conn) -> obs
   cluster <- dbGetQuery(conn, "
 SELECT
-  cluster, COUNT(cluster) AS n_obs, MAX(status) AS max_status,
-  AVG(x) AS centroid_x, AVG(y) AS centroid_y
-FROM observation
-GROUP BY cluster")
+  o.cluster, COUNT(o.cluster) AS n_obs, MAX(status) AS max_status,
+  AVG(o.x) AS centroid_x, AVG(o.y) AS centroid_y
+FROM observation AS o
+LEFT JOIN unlikely AS u ON o.survey = u.survey
+WHERE u.value IS NULL
+GROUP BY o.cluster")
   return(list(cluster = cluster, observations = obs))
 }
diff --git a/R/get_total.R b/R/get_total.R
new file mode 100644
index 0000000..307faa7
--- /dev/null
+++ b/R/get_total.R
@@ -0,0 +1,52 @@
+#' Get the total number of individuals per region
+#'
+#' Returns the sum of `value` per region.
+#' Clusters spanning more than one region each get a fraction of `value` based
+#' on the number of observations of the cluster in a region divided by the
+#' total number of observations in the cluster.
+#' @inheritParams import_observations
+#' @param value A string representing the value per cluster to be calculated.
+#' Default is `"IIF(max_status >= 2, 1, iif(n_total > 1, 0.5, 0))"`.
+#' Available variables are
+#' - `max_status`: the highest status of a cluster.
+#' - `n_total`: the total number of observations per cluster.
+#' - `n_region`: the number of observations in a cluster from the same region.
+#' @export
+#' @importFrom assertthat assert_that is.string noNA
+#' @importFrom RSQLite dbGetQuery
+get_total <- function(
+  conn, value = "IIF(max_status >= 2, 1, iif(n_total > 1, 0.5, 0))"
+) {
+  assert_that(
+    inherits(conn, "SQLiteConnection"), is.string(value), noNA(value)
+  )
+  sprintf(
+    "WITH cte_obs AS (
+  SELECT o.region, o.cluster, o.status
+  FROM observation AS o
+  LEFT JOIN unlikely AS u ON o.survey = u.survey
+  WHERE u.value IS NULL
+),
+cte_region AS (
+  SELECT cluster, region, COUNT(cluster) AS n_region
+  FROM cte_obs
+  WHERE region IS NOT NULL
+  GROUP BY cluster, region
+),
+cte_cluster AS (
+  SELECT cluster, MAX(status) AS max_status, COUNT(status) AS n_total
+  FROM cte_obs
+  GROUP BY cluster
+),
+cte AS (
+  SELECT
+    r.region, c.cluster, 1.0 * r.n_region / c.n_total AS fraction,
+    %s AS value
+  FROM cte_region AS r
+  INNER JOIN cte_cluster AS c ON r.cluster = c.cluster
+)
+
+SELECT region, SUM(value * fraction) AS individuals FROM cte GROUP BY region",
+    value
+  ) |>
+    dbGetQuery(conn = conn)
+}
diff --git a/R/get_unlikely.R b/R/get_unlikely.R
new file mode 100644
index 0000000..846b1d7
--- /dev/null
+++ b/R/get_unlikely.R
@@ -0,0 +1,39 @@
+#' Get an overview of the unlikely reasons for each survey and region
+#'
+#' Explains why the data of each survey and/or region is unlikely to be the
+#' result of the counting protocol.
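+#' @examples
+#' # A minimal sketch (not part of the original patch): assumes the `unlikely`
+#' # table was already filled by one of the `unlikely_*()` checks.
+#' \dontrun{
+#' get_unlikely_summary(conn)
+#' }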
+#' @inheritParams import_observations
+#' @export
+#' @importFrom assertthat assert_that
+#' @importFrom RSQLite dbGetQuery
+get_unlikely_summary <- function(conn) {
+  assert_that(inherits(conn, "SQLiteConnection"))
+  "WITH cte AS (
+  SELECT survey, region, COUNT(id) AS n
+  FROM observation
+  WHERE region IS NOT NULL
+  GROUP BY survey, region
+),
+cte_survey AS (
+  SELECT survey, region FROM cte GROUP BY survey HAVING n = MAX(n)
+)
+SELECT c.region, c.survey, u.reason
+FROM cte_survey AS c
+INNER JOIN unlikely AS u ON c.survey = u.survey
+ORDER BY c.region, c.survey" |>
+    dbGetQuery(conn = conn)
+}
+
+#' Get the unlikely observations
+#' @inheritParams import_observations
+#' @inheritParams sf::st_as_sf
+#' @export
+#' @importFrom assertthat assert_that
+#' @importFrom RSQLite dbGetQuery
+#' @importFrom sf st_as_sf
+get_unlikely_observation <- function(conn, crs = 31370) {
+  assert_that(inherits(conn, "SQLiteConnection"))
+  "SELECT o.id, o.x, o.y, o.status, o.region, o.survey, s.original, s.user
+FROM observation AS o
+INNER JOIN unlikely AS u ON o.survey = u.survey
+INNER JOIN survey AS s ON o.survey = s.id" |>
+    dbGetQuery(conn = conn) |>
+    st_as_sf(coords = c("x", "y"), crs = crs)
+}
diff --git a/R/import_observations.R b/R/import_observations.R
index 78a7d04..d96b93c 100644
--- a/R/import_observations.R
+++ b/R/import_observations.R
@@ -3,28 +3,51 @@
 #' The function overwrites any existing table with observations.
 #' @param observations a data.frame with the observations.
 #' @param max_dist maximum clustering distance in m.
+#' @param max_edge The maximum edge length in m.
+#' We apply a Delaunay triangulation to the observations per survey.
+#' After removing the edges longer than `max_edge`, we check which
+#' observations still form a connected graph.
+#' Every connected graph is assigned a new survey id.
 #' @param conn a DBI connection to an SQLite database. 
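+#' @examples
+#' # A minimal sketch (not part of the original patch): assumes an in-memory
+#' # SQLite database and `simulate_observations()` with default arguments.
+#' \dontrun{
+#' conn <- RSQLite::dbConnect(RSQLite::SQLite(), ":memory:")
+#' obs <- simulate_observations()$observations
+#' import_observations(obs[obs$observed, ], conn = conn)
+#' RSQLite::dbDisconnect(conn)
+#' }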
#' @export #' @importFrom assertthat assert_that has_name is.number is.string noNA +#' @importFrom deldir deldir +#' @importFrom igraph decompose graph_from_data_frame V #' @importFrom RSQLite dbClearResult dbSendQuery dbWriteTable -import_observations <- function(observations, conn, max_dist = 336) { - assert_that(inherits(observations, "data.frame")) +#' @importFrom stats aggregate rnorm +import_observations <- function( + observations, conn, max_dist = 336, max_edge = 1500 +) { assert_that( - has_name(observations, "id"), + inherits(observations, "data.frame"), + is.number(max_dist), noNA(max_dist), max_dist > 0, + is.number(max_edge), noNA(max_edge), max_edge > 0, max_dist < max_edge + ) + assert_that( + has_name(observations, "id"), has_name(observations, "user"), has_name(observations, "x"), has_name(observations, "y"), - has_name(observations, "survey"), has_name(observations, "status") + has_name(observations, "survey"), has_name(observations, "status"), + has_name(observations, "region") ) assert_that(is.numeric(observations$x), is.numeric(observations$y)) assert_that( noNA(observations$id), noNA(observations$x), noNA(observations$y), - noNA(observations$survey), noNA(observations$status) + noNA(observations$survey), noNA(observations$status), + noNA(observations$user) ) observations$id <- make_integer(observations$id) observations$survey <- make_integer(observations$survey) observations$status <- make_integer(observations$status) + observations$user <- make_integer(observations$user) + observations$region <- make_integer(observations$region) assert_that( anyDuplicated(observations$id) == 0, msg = "duplicate values in id" ) + unique_combo <- unique(observations[, c("survey", "user")]) + assert_that( + anyDuplicated(unique_combo) == 0, msg = "`survey` with multiple `user`" + ) + diagonal <- diff(range(observations$x)) ^ 2 + diff(range(observations$y)) ^ 2 assert_that( max_dist < sqrt(diagonal), @@ -33,34 +56,120 @@ import_observations <- function(observations, conn, max_dist = 336) { assert_that(inherits(conn, "SQLiteConnection")) - assert_that(is.number(max_dist), max_dist > 0) - sql <- "DROP TABLE IF EXISTS distance" res <- dbSendQuery(conn, sql) dbClearResult(res) + sql <- "DROP TABLE IF EXISTS survey" + res <- dbSendQuery(conn, sql) + dbClearResult(res) + sql <- "DROP TABLE IF EXISTS observation" res <- dbSendQuery(conn, sql) dbClearResult(res) + sql <- "DROP TABLE IF EXISTS unlikely" + res <- dbSendQuery(conn, sql) + dbClearResult(res) + + sql <- "CREATE TABLE survey ( + id INTEGER PRIMARY KEY, original INTEGER NOT NULL, user INTEGER NOT NULL +)" + res <- dbSendQuery(conn, sql) + dbClearResult(res) + + sql <- "CREATE TABLE unlikely ( + survey INTEGER NOT NULL, reason character NOT NULL, value NUMERIC NOT NULL +)" + res <- dbSendQuery(conn, sql) + dbClearResult(res) + sql <- "CREATE TABLE observation ( id INTEGER PRIMARY KEY, x REAL NOT NULL, y REAL NOT NULL, group_x INTEGER NOT NULL, group_y INTEGER NOT NULL, survey INTEGER NOT NULL, - status INTEGER NOT NULL, cluster INTEGER NOT NULL)" + status INTEGER NOT NULL, cluster INTEGER NOT NULL, region INTEGER +)" res <- dbSendQuery(conn, sql) dbClearResult(res) # force observations into a data.frame to avoid problems with sf objects observations <- as.data.frame(observations) - observations$cluster <- observations$id - observations$group_x <- floor(observations$x / max_dist / 2) - observations$group_y <- floor(observations$y / max_dist / 2) - cols <- c("id", "x", "y", "survey", "status", "cluster", "group_x", "group_y") + # splits 
surveys into clearly distinct groups
+  observations$original <- observations$survey
+  # surveys with a bounding box diagonal smaller than the max_edge are OK
+  bb_min <- aggregate(cbind(x, y) ~ survey, data = observations, FUN = min)
+  bb_max <- aggregate(cbind(x, y) ~ survey, data = observations, FUN = max)
+  diagonal <- sqrt((bb_max$x - bb_min$x) ^ 2 + (bb_max$y - bb_min$y) ^ 2)
+  to_do <- bb_min$survey[diagonal >= max_edge]
+  done <- observations[!observations$survey %in% to_do, ]
+  # handle surveys with a bounding box diagonal larger than the max_edge
+  for (i in to_do) {
+    candidate <- which(observations$survey == i)
+    # make Delaunay triangulation
+    dd <- deldir(
+      x = rnorm(length(candidate), mean = observations$x[candidate], sd = 0.01),
+      y = rnorm(length(candidate), mean = observations$y[candidate], sd = 0.01),
+      id = observations$id[candidate]
+    )
+    # calculate length of edges
+    edges <- data.frame(
+      id1 = dd$delsgs$ind1, id2 = dd$delsgs$ind2,
+      length = sqrt(
+        (dd$delsgs$x2 - dd$delsgs$x1) ^ 2 + (dd$delsgs$y2 - dd$delsgs$y1) ^ 2
+      )
+    )
+    # do nothing when every edge is smaller than the max_edge
+    if (max(edges$length) < max_edge) {
+      done <- rbind(done, observations[candidate, ])
+      next
+    }
+    # remove edges larger than the max_edge and decompose the graph
+    edges[edges$length < max_edge, ] |>
+      rbind(
+        data.frame(
+          id1 = observations$id[candidate], id2 = observations$id[candidate],
+          length = 0
+        )
+      ) |>
+      graph_from_data_frame(directed = FALSE) |>
+      decompose() -> sg
+    # do nothing when there is only one graph
+    if (length(sg) == 1) {
+      done <- rbind(done, observations[candidate, ])
+      next
+    }
+    # split the survey into groups when there are multiple graphs
+    for (j in seq_along(sg)) {
+      V(sg[[j]]) |>
+        names() |>
+        as.integer() -> relevant
+      extra <- observations[observations$id %in% relevant, ]
+      extra$survey <- j
+      done <- rbind(done, extra)
+    }
+  }
+  done$survey <- interaction(done$original, done$survey, drop = TRUE) |>
+    as.integer()
+
+  # store the original survey id and user id
+  surveys <- unique(done[, c("survey", "original", "user")])
+  colnames(surveys) <- c("id", "original", "user")
+  dbWriteTable(conn, name = "survey", append = TRUE, value = surveys)
+
+  # store the observations with new survey id
+  done$cluster <- done$id
+  done$group_x <- floor(done$x / max_dist / 2)
+  done$group_y <- floor(done$y / max_dist / 2)
+  cols <- c(
+    "id", "x", "y", "survey", "status", "cluster", "group_x", "group_y",
+    "region"
+  )
   dbWriteTable(
-    conn, name = "observation", append = TRUE, value = observations[, cols]
+    conn, name = "observation", append = TRUE, value = done[, cols]
   )
+  # create an index on the observation table
   sql <- "CREATE INDEX IF NOT EXISTS observation_idx
 ON observation (group_x, group_y)"
   res <- dbSendQuery(conn, sql)
diff --git a/R/simulate_observations.R b/R/simulate_observations.R
index fd35fa1..c2e6994 100644
--- a/R/simulate_observations.R
+++ b/R/simulate_observations.R
@@ -43,5 +43,7 @@ simulate_observations <- function(
     nrow(observations), size = 1, prob = p_detection
   ) == 1
   observations$id <- seq_along(observations$x)
+  observations$user <- 1
+  observations$region <- 1
   return(list(observations = observations, centroids = centroids))
 }
diff --git a/R/unlikely_edge_distribution.R b/R/unlikely_edge_distribution.R
new file mode 100644
index 0000000..4d59e46
--- /dev/null
+++ b/R/unlikely_edge_distribution.R
@@ -0,0 +1,45 @@
+#' Check for unlikely distribution of Delaunay edges
+#'
+#' This function compares the distribution of Delaunay edges per 
survey in the
+#' `conn` database with the distribution of Delaunay edges of all surveys in the
+#' `conn_reference` database.
+#' It uses a Wilcoxon test to check whether the distribution of edges is
+#' significantly different.
+#' The function returns the proportion of surveys with an unlikely distribution
+#' and writes the list of unlikely surveys to the database.
+#' @inheritParams import_observations
+#' @inheritParams unlikely_status
+#' @param conn_reference A connection to the reference database.
+#' @export
+#' @importFrom assertthat assert_that is.number noNA
+#' @importFrom RSQLite dbWriteTable
+#' @importFrom stats aggregate wilcox.test
+unlikely_edge_distribution <- function(
+  conn, conn_reference, max_dist = 336, alpha = 0.01
+) {
+  assert_that(is.number(alpha), noNA(alpha), alpha > 0, alpha < 1)
+  edges <- edge_distribution(conn = conn, max_dist = max_dist)
+  reference <- edge_distribution(conn = conn_reference, max_dist = max_dist)
+  survey <- unique(edges$survey)
+  p <- vapply(
+    survey, FUN.VALUE = numeric(1), edges = edges, reference = reference,
+    FUN = function(i, edges, reference) {
+      wilcox.test(
+        x = edges$length[edges$survey == i], y = reference$length
+      )$p.value
+    }
+  )
+  dist_test <- data.frame(survey = survey, p_value = p)
+  dist_test <- dist_test[order(dist_test$p_value), ]
+  dist_test$log_p <- log(1 - dist_test$p_value)
+  c(0, diff(dist_test$p_value) > 0) |>
+    cumsum() -> dist_test$group
+  dist_group <- aggregate(log_p ~ group, data = dist_test, FUN = sum)
+  dist_group$value <- 1 - exp(cumsum(dist_group$log_p))
+  dist_group[dist_group$value < alpha, c("group", "value")] |>
+    merge(dist_test[, c("survey", "group")], by = "group") -> unlikely
+  unlikely$reason <- "Difference in distribution of Delaunay edges"
+  unlikely[, c("survey", "reason", "value")] |>
+    dbWriteTable(conn = conn, name = "unlikely", append = TRUE)
+  return(nrow(unlikely) / nrow(dist_test))
+}
diff --git a/R/unlikely_status.R b/R/unlikely_status.R
new file mode 100644
index 0000000..be33c04
--- /dev/null
+++ b/R/unlikely_status.R
@@ -0,0 +1,65 @@
+#' Detect surveys with an unlikely status distribution
+#' @inheritParams import_observations
+#' @param threshold A numeric value between 0 and 1 indicating the average
+#' proportion of the observations with status above or equal to the
+#' `status_split`.
+#' Defaults to 0.5.
+#' @param status_split An integer value indicating which status level splits
+#' the status into two groups.
+#' The default is 2.
+#' @param alpha A numeric value between 0 and 1 indicating the family-wise Type
+#' I error.
+#' Defaults to 0.01.
+#' @return A numeric value between 0 and 1 indicating the proportion of
+#' surveys with an unlikely status distribution.
+#' The list of unlikely surveys is written to the database. 
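+#' @examples
+#' # A minimal sketch (not part of the original patch): assumes `conn` points
+#' # at a database already filled by `import_observations()`.
+#' \dontrun{
+#' unlikely_status(conn, threshold = 0.5, status_split = 2)
+#' }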
+#' @export
+#' @importFrom assertthat assert_that is.count is.number noNA
+#' @importFrom RSQLite dbGetQuery dbWriteTable
+#' @importFrom stats aggregate binom.test
+unlikely_status <- function(
+  conn, threshold = 0.5, status_split = 2, alpha = 0.01
+) {
+  assert_that(
+    is.number(threshold), noNA(threshold), threshold > 0,
+    is.count(status_split), noNA(status_split),
+    is.number(alpha), noNA(alpha), alpha > 0, alpha < 1,
+    inherits(conn, "SQLiteConnection")
+  )
+  sprintf(
+    "SELECT survey, status >= %1$i AS above, COUNT(id) AS n
+FROM observation
+GROUP BY survey, status >= %1$i",
+    status_split
+  ) |>
+    dbGetQuery(conn = conn) -> status_obs
+  merge(
+    status_obs[status_obs$above == 1, c("survey", "n")],
+    status_obs[status_obs$above == 0, c("survey", "n")],
+    by = "survey", all = TRUE
+  ) -> status_obs
+  status_obs$n.x[is.na(status_obs$n.x)] <- 0
+  status_obs$n.y[is.na(status_obs$n.y)] <- 0
+  status_obs[, c("n.x", "n.y")] |>
+    apply(
+      1, threshold = threshold,
+      FUN = function(x, threshold) {
+        binom.test(x = x, alternative = "greater", p = threshold)$p.value
+      }
+    ) -> status_obs$p_value
+  status_obs <- status_obs[order(status_obs$p_value, -status_obs$n.y), ]
+  status_obs$log_p <- log(1 - status_obs$p_value)
+  c(0, diff(status_obs$n.x) != 0 | diff(status_obs$n.y) != 0) |>
+    cumsum() -> status_obs$group
+  status_group <- aggregate(log_p ~ group, data = status_obs, FUN = sum)
+  status_group$value <- 1 - exp(cumsum(status_group$log_p))
+  status_group[status_group$value < alpha, c("group", "value")] |>
+    merge(status_obs[, c("survey", "group")], by = "group") -> unlikely
+  unlikely$reason <- sprintf(
+    "fraction status above or equal to %i greater than %.0f%%", status_split,
+    100 * threshold
+  )
+  unlikely[, c("survey", "reason", "value")] |>
+    dbWriteTable(conn = conn, name = "unlikely", append = TRUE)
+  return(nrow(unlikely) / nrow(status_obs))
+}
diff --git a/R/unlikely_survey_area.R b/R/unlikely_survey_area.R
new file mode 100644
index 0000000..a86a692
--- /dev/null
+++ b/R/unlikely_survey_area.R
@@ -0,0 +1,58 @@
+#' Detect surveys with an unlikely surveyed area
+#'
+#' First we buffer the observations of each survey by `dist` and take the
+#' concave hull (with the given `ratio`) of the buffered points.
+#' Then we intersect this hull with the `relevant_area`.
+#' When the area of the intersection is at least `max_area`, we consider the
+#' survey area unlikely.
+#' @inheritParams import_observations
+#' @inheritParams sf::st_as_sf
+#' @inheritParams sf::st_buffer
+#' @inheritParams sf::st_concave_hull
+#' @param relevant_area An `sf` object with the relevant area.
+#' @param max_area the maximum area in m² of the concave hull within the
+#' relevant area. 
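+#' @examples
+#' # A minimal sketch (not part of the original patch): `study_area` stands in
+#' # for any sf polygon in EPSG:31370 covering the region of interest.
+#' \dontrun{
+#' unlikely_survey_area(conn, relevant_area = study_area, max_area = 3e6)
+#' }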
+#' @importFrom assertthat assert_that is.number noNA +#' @importFrom dplyr filter group_by mutate summarise transmute +#' @importFrom RSQLite dbGetQuery dbWriteTable +#' @importFrom rlang .data +#' @importFrom sf st_area st_as_sf st_buffer st_concave_hull st_drop_geometry +#' st_intersection st_union +#' @export +unlikely_survey_area <- function( + conn, relevant_area, max_area = 3e6, dist = 50, crs = 31370, ratio = 0.95 +) { + assert_that( + inherits(conn, "SQLiteConnection"), inherits(relevant_area, "sf"), + is.number(ratio), noNA(ratio), 0 <= ratio, ratio <= 1, + is.number(dist), noNA(dist), dist > 0, + is.number(max_area), noNA(max_area), max_area > 0 + ) + "SELECT survey, x, y FROM observation" |> + dbGetQuery(conn = conn) |> + st_as_sf(coords = c("x", "y"), crs = crs) |> + group_by(.data$survey) |> + summarise( + geometry = st_union(.data$geometry) |> + st_buffer(dist = dist) |> + st_concave_hull(ratio = ratio), + .groups = "drop" + ) |> + st_intersection(relevant_area) |> + group_by(.data$survey) |> + summarise(geometry = st_union(.data$geometry), .groups = "drop") |> + mutate( + area = st_area(.data$geometry) |> + as.numeric() + ) -> survey + survey |> + st_drop_geometry() |> + filter(.data$area >= max_area) |> + transmute( + .data$survey, value = .data$area, + reason = sprintf( + "Area of relevant concave hull larger than %.0f ha", max_area / 1e4 + ) + ) -> unlikely + dbWriteTable(conn = conn, name = "unlikely", value = unlikely, append = TRUE) + return(nrow(unlikely) / nrow(survey)) +} diff --git a/README.md b/README.md index 1f29e3b..08e4372 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ summary(obs$centroids) #> 1st Qu.: 544.72 1st Qu.: 946.10 #> Median : 927.92 Median :1246.24 #> Mean : 955.15 Mean :1224.73 -#> 3rd Qu.:1349.57 3rd Qu.:1669.61 +#> 3rd Qu.:1349.57 3rd Qu.:1669.60 #> Max. :1897.31 Max. :1963.82 summary(obs$observations) #> x y survey status @@ -68,15 +68,15 @@ summary(obs$observations) #> 1st Qu.: 550.47 1st Qu.: 910.56 1st Qu.:1.75 1st Qu.:2.000 #> Median : 899.91 Median :1272.53 Median :2.50 Median :2.000 #> Mean : 946.40 Mean :1238.04 Mean :2.50 Mean :2.096 -#> 3rd Qu.:1362.48 3rd Qu.:1764.88 3rd Qu.:3.25 3rd Qu.:3.000 +#> 3rd Qu.:1362.48 3rd Qu.:1764.89 3rd Qu.:3.25 3rd Qu.:3.000 #> Max. :2012.23 Max. :2082.18 Max. :4.00 Max. :3.000 -#> observed id -#> Mode :logical Min. : 1.00 -#> FALSE:44 1st Qu.: 26.75 -#> TRUE :60 Median : 52.50 -#> Mean : 52.50 -#> 3rd Qu.: 78.25 -#> Max. :104.00 +#> observed id user region +#> Mode :logical Min. : 1.00 Min. :1 Min. :1 +#> FALSE:44 1st Qu.: 26.75 1st Qu.:1 1st Qu.:1 +#> TRUE :60 Median : 52.50 Median :1 Median :1 +#> Mean : 52.50 Mean :1 Mean :1 +#> 3rd Qu.: 78.25 3rd Qu.:1 3rd Qu.:1 +#> Max. :104.00 Max. :1 Max. :1 obs <- obs$observations[obs$observations$observed, ] ``` diff --git a/inst/CITATION b/inst/CITATION index 25fab8f..9f3b31f 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -2,12 +2,12 @@ citHeader("To cite `territoria` in publications please use:") # begin checklist entry bibentry( bibtype = "Manual", - title = "territoria: Clustering Observations from Breeding Birds into Territoria. Version 0.0.3", + title = "territoria: Clustering Observations from Breeding Birds into Territoria. 
Version 0.1.0", author = c( author = c(person(given = "Thierry", family = "Onkelinx"))), - year = 2023, + year = 2025, url = "https://github.com/inbo/territoria/", abstract = "Clusters individual observations based on breeding indication and distance between observations.", - textVersion = "Onkelinx, Thierry (2023) territoria: Clustering Observations from Breeding Birds into Territoria. Version 0.0.3. https://github.com/inbo/territoria/", + textVersion = "Onkelinx, Thierry (2025) territoria: Clustering Observations from Breeding Birds into Territoria. Version 0.1.0. https://github.com/inbo/territoria/", keywords = "breeding bird; cluster", ) # end checklist entry diff --git a/inst/en_gb.dic b/inst/en_gb.dic index 700598b..3703ec6 100644 --- a/inst/en_gb.dic +++ b/inst/en_gb.dic @@ -1 +1,2 @@ +Delaunay SQLite diff --git a/man/edge_distribution.Rd b/man/edge_distribution.Rd new file mode 100644 index 0000000..a3a3367 --- /dev/null +++ b/man/edge_distribution.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edge_distribution.R +\name{edge_distribution} +\alias{edge_distribution} +\title{Edge distribution} +\usage{ +edge_distribution(conn, max_dist = 336) +} +\arguments{ +\item{conn}{a DBI connection to an SQLite database.} + +\item{max_dist}{maximum clustering distance in m.} +} +\description{ +This function return the length of the edges of the Delaunay triangulation +per survey. +The function returns only edges with length smaller than twice \code{max_dist}. +} diff --git a/man/get_total.Rd b/man/get_total.Rd new file mode 100644 index 0000000..91a20a9 --- /dev/null +++ b/man/get_total.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_total.R +\name{get_total} +\alias{get_total} +\title{Get the total number of individuals per region +Returns the sum of \code{value} per region. +Clusters spanning more than one region get each a fraction of \code{value} based +on the number of observations of the cluster in a region divided by the total +number of observations in the cluster.} +\usage{ +get_total(conn, value = "IIF(max_status >= 2, 1, iif(n_total > 1, 0.5, 0))") +} +\arguments{ +\item{conn}{a DBI connection to an SQLite database.} + +\item{value}{A string representing the value per cluster to be calculated. +Default is \code{"IIF(max_status >= 2, 1, iif(n_total > 1, 0.5, 0))"}. +Available variables are +\itemize{ +\item \code{max_status}: the highest status of a cluster. +\item \code{n_total}: the total number of observations per cluster +\item \code{n_region}: the number of observations in a cluster from the same region. +}} +} +\description{ +Get the total number of individuals per region +Returns the sum of \code{value} per region. +Clusters spanning more than one region get each a fraction of \code{value} based +on the number of observations of the cluster in a region divided by the total +number of observations in the cluster. 
+}
diff --git a/man/get_unlikely_observation.Rd b/man/get_unlikely_observation.Rd
new file mode 100644
index 0000000..9bf422f
--- /dev/null
+++ b/man/get_unlikely_observation.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_unlikely.R
+\name{get_unlikely_observation}
+\alias{get_unlikely_observation}
+\title{Get the unlikely observations}
+\usage{
+get_unlikely_observation(conn, crs = 31370)
+}
+\arguments{
+\item{conn}{a DBI connection to an SQLite database.}
+
+\item{crs}{coordinate reference system to be assigned; object of class \code{crs}}
+}
+\description{
+Get the unlikely observations
+}
diff --git a/man/get_unlikely_summary.Rd b/man/get_unlikely_summary.Rd
new file mode 100644
index 0000000..b72b5db
--- /dev/null
+++ b/man/get_unlikely_summary.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_unlikely.R
+\name{get_unlikely_summary}
+\alias{get_unlikely_summary}
+\title{Get an overview of the unlikely reasons for each survey and region}
+\usage{
+get_unlikely_summary(conn)
+}
+\arguments{
+\item{conn}{a DBI connection to an SQLite database.}
+}
+\description{
+Get an overview of the unlikely reasons for each survey and region
+}
diff --git a/man/import_observations.Rd b/man/import_observations.Rd
index 388a4e0..8cb6b91 100644
--- a/man/import_observations.Rd
+++ b/man/import_observations.Rd
@@ -4,7 +4,7 @@
 \alias{import_observations}
 \title{Import the observations}
 \usage{
-import_observations(observations, conn, max_dist = 336)
+import_observations(observations, conn, max_dist = 336, max_edge = 1500)
 }
 \arguments{
 \item{observations}{a data.frame with the observations.}
@@ -12,6 +12,12 @@ import_observations(observations, conn, max_dist = 336)
 \item{conn}{a DBI connection to an SQLite database.}
 
 \item{max_dist}{maximum clustering distance in m.}
+
+\item{max_edge}{The maximum edge length in m.
+We apply a Delaunay triangulation to the observations per survey.
+After removing the edges longer than \code{max_edge}, we check which
+observations still form a connected graph.
+Every connected graph is assigned a new survey id.}
 }
 \description{
 The function overwrites any existing table with observations.
diff --git a/man/unlikely_edge_distribution.Rd b/man/unlikely_edge_distribution.Rd
new file mode 100644
index 0000000..82d6a58
--- /dev/null
+++ b/man/unlikely_edge_distribution.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/unlikely_edge_distribution.R
+\name{unlikely_edge_distribution}
+\alias{unlikely_edge_distribution}
+\title{Check for unlikely distribution of Delaunay edges}
+\usage{
+unlikely_edge_distribution(conn, conn_reference, max_dist = 336, alpha = 0.01)
+}
+\arguments{
+\item{conn}{a DBI connection to an SQLite database.}
+
+\item{conn_reference}{A connection to the reference database.}
+
+\item{max_dist}{maximum clustering distance in m.}
+
+\item{alpha}{A numeric value between 0 and 1 indicating the family-wise Type
+I error.
+Defaults to 0.01.}
+}
+\description{
+This function compares the distribution of Delaunay edges per survey in the
+\code{conn} database with the distribution of Delaunay edges of all surveys in the
+\code{conn_reference} database.
+It uses a Wilcoxon test to check whether the distribution of edges is
+significantly different.
+The function returns the proportion of surveys with an unlikely distribution
+and writes the list of unlikely surveys to the database. 
+}
diff --git a/man/unlikely_status.Rd b/man/unlikely_status.Rd
new file mode 100644
index 0000000..9e71907
--- /dev/null
+++ b/man/unlikely_status.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/unlikely_status.R
+\name{unlikely_status}
+\alias{unlikely_status}
+\title{Detect surveys with an unlikely status distribution}
+\usage{
+unlikely_status(conn, threshold = 0.5, status_split = 2, alpha = 0.01)
+}
+\arguments{
+\item{conn}{a DBI connection to an SQLite database.}
+
+\item{threshold}{A numeric value between 0 and 1 indicating the average
+proportion of the observations with status above or equal to the
+\code{status_split}.
+Defaults to 0.5.}
+
+\item{status_split}{An integer value indicating which status level splits
+the status into two groups.
+The default is 2.}
+
+\item{alpha}{A numeric value between 0 and 1 indicating the family-wise Type
+I error.
+Defaults to 0.01.}
+}
+\value{
+A numeric value between 0 and 1 indicating the proportion of
+surveys with an unlikely status distribution.
+The list of unlikely surveys is written to the database.
+}
+\description{
+Detect surveys with an unlikely status distribution
+}
diff --git a/man/unlikely_survey_area.Rd b/man/unlikely_survey_area.Rd
new file mode 100644
index 0000000..ef35e65
--- /dev/null
+++ b/man/unlikely_survey_area.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/unlikely_survey_area.R
+\name{unlikely_survey_area}
+\alias{unlikely_survey_area}
+\title{Detect surveys with an unlikely surveyed area}
+\usage{
+unlikely_survey_area(
+  conn,
+  relevant_area,
+  max_area = 3e+06,
+  dist = 50,
+  crs = 31370,
+  ratio = 0.95
+)
+}
+\arguments{
+\item{conn}{a DBI connection to an SQLite database.}
+
+\item{relevant_area}{An \code{sf} object with the relevant area.}
+
+\item{max_area}{the maximum area in m² of the concave hull within the
+relevant area.}
+
+\item{dist}{numeric or object of class \code{units}; buffer distance(s) for all, or for each of the elements in \code{x}.
+In case \code{x} has geodetic coordinates (lon/lat) and \code{sf_use_s2()} is \code{TRUE}, a numeric
+\code{dist} is taken as distance in meters and a \code{units} object in \code{dist} is converted to meters.
+In case \code{x} has geodetic coordinates (lon/lat) and \code{sf_use_s2()} is \code{FALSE}, a numeric
+\code{dist} is taken as degrees, and a \code{units} object in \code{dist} is converted to \code{arc_degree} (and warnings are issued).
+In case \code{x} does not have geodetic coordinates (projected) then
+numeric \code{dist} is assumed to have the units of the coordinates, and a \code{units} \code{dist} is converted to those if \code{st_crs(x)} is not \code{NA}.}
+
+\item{crs}{coordinate reference system to be assigned; object of class \code{crs}}
+
+\item{ratio}{numeric; fraction convex: 1 returns the convex hulls, 0 maximally concave hulls}
+}
+\description{
+First we buffer the observations of each survey by \code{dist} and take the
+concave hull (with the given \code{ratio}) of the buffered points.
+Then we intersect this hull with the \code{relevant_area}.
+When the area of the intersection is at least \code{max_area}, we consider the
+survey area unlikely.
+}
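+\examples{
+# A minimal sketch (not part of the original patch): `study_area` stands in
+# for any sf polygon in EPSG:31370 covering the region of interest.
+\dontrun{
+unlikely_survey_area(conn, relevant_area = study_area, max_area = 3e6)
+}
+}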