inbo · ThierryO · May 9, 2025 · May 9, 2025 · May 10, 2025 · May 11, 2025
diff --git a/.github/workflows/check_on_branch.yml b/.github/workflows/check_on_branch.yml
@@ -17,4 +17,4 @@ jobs:
     permissions:
       contents: read
     steps:
-      - uses: inbo/actions/check_pkg@checklist-0.3.6
+      - uses: inbo/actions/check_pkg@main
diff --git a/.github/workflows/check_on_main.yml b/.github/workflows/check_on_main.yml
@@ -16,4 +16,4 @@ jobs:
       CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: inbo/actions/check_pkg@checklist-0.3.6
+      - uses: inbo/actions/check_pkg@main
diff --git a/.zenodo.json b/.zenodo.json
@@ -1,9 +1,9 @@
 {
   "title": "territoria: Clustering Observations from Breeding Birds into Territoria",
-  "version": "0.0.3",
+  "version": "0.1.0",
   "license": "GPL-3.0",
   "upload_type": "software",
-  "description": "<p>Clusters individual observations based on breeding indication and\ndistance between observations.<\/p>",
+  "description": "<p>Clusters individual observations based on breeding indication and distance between observations.<\/p>",
   "keywords": [
     "breeding bird",
     "cluster"
@@ -15,11 +15,11 @@
       "name": "Onkelinx, Thierry",
       "affiliation": "Research Institute for Nature and Forest (INBO)",
       "orcid": "0000-0001-8804-4216",
-      "type": "ContactPerson"
+      "type": "contactperson"
     },
     {
       "name": "Research Institute for Nature and Forest (INBO)",
-      "type": "RightsHolder"
+      "type": "rightsholder"
     }
   ],
   "creators": [

diff --git a/CITATION.cff b/CITATION.cff
@@ -20,4 +20,4 @@ repository-code: https://github.com/inbo/territoria/
 type: software
 abstract: "Clusters individual observations based on breeding indication and distance
   between observations."
-version: 0.0.3
+version: 0.1.0
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: territoria
 Title: Clustering Observations from Breeding Birds into Territoria
-Version: 0.0.3
+Version: 0.1.0
 Authors@R: c(
     person("Thierry", "Onkelinx", , "thierry.onkelinx@inbo.be", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-8804-4216", affiliation = "Research Institute for Nature and Forest (INBO)")),
@@ -12,15 +12,22 @@ Description: Clusters individual observations based on breeding indication
 License: GPL-3
 URL: https://github.com/inbo/territoria
 BugReports: https://github.com/inbo/territoria/issues
+Depends: 
+    R (>= 4.1.0)
 Imports:
     assertthat,
+    deldir,
+    dplyr,
+    igraph,
     mvtnorm,
+    rlang,
     RSQLite,
+    sf,
     spatstat.geom,
     spatstat.random
 Config/checklist/communities: inbo
 Config/checklist/keywords: breeding bird; cluster
 Encoding: UTF-8
 Language: en-GB
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
diff --git a/LICENSE.md b/LICENSE.md
@@ -1,4 +1,4 @@
-### GNU GENERAL PUBLIC LICENSE
+# GNU GENERAL PUBLIC LICENSE
 
 Version 3, 29 June 2007
 

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,9 +3,16 @@
 export(cluster_observation)
 export(connect_db)
 export(distance_matrix)
+export(edge_distribution)
 export(get_cluster)
+export(get_total)
+export(get_unlikely_observation)
+export(get_unlikely_summary)
 export(import_observations)
 export(simulate_observations)
+export(unlikely_edge_distribution)
+export(unlikely_status)
+export(unlikely_survey_area)
 importFrom(RSQLite,SQLite)
 importFrom(RSQLite,dbClearResult)
 importFrom(RSQLite,dbConnect)
@@ -20,7 +27,28 @@ importFrom(assertthat,is.flag)
 importFrom(assertthat,is.number)
 importFrom(assertthat,is.string)
 importFrom(assertthat,noNA)
+importFrom(deldir,deldir)
+importFrom(dplyr,filter)
+importFrom(dplyr,group_by)
+importFrom(dplyr,mutate)
+importFrom(dplyr,summarise)
+importFrom(dplyr,transmute)
+importFrom(igraph,V)
+importFrom(igraph,decompose)
+importFrom(igraph,graph_from_data_frame)
 importFrom(mvtnorm,rmvnorm)
+importFrom(rlang,.data)
+importFrom(sf,st_area)
+importFrom(sf,st_as_sf)
+importFrom(sf,st_buffer)
+importFrom(sf,st_concave_hull)
+importFrom(sf,st_drop_geometry)
+importFrom(sf,st_intersection)
+importFrom(sf,st_union)
 importFrom(spatstat.geom,owin)
 importFrom(spatstat.random,rStrauss)
+importFrom(stats,aggregate)
+importFrom(stats,binom.test)
 importFrom(stats,rbinom)
+importFrom(stats,rnorm)
+importFrom(stats,wilcox.test)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,24 @@
+# territoria 0.1.0
+
+* `import_observations()` splits surveys with clearly distinct groups of
+  observations into multiple surveys.
+  This is done by clustering the observations and checking if the clusters are
+  too far apart geographically.
+  It also requires to provide a user and region id.
+* `edge_distribution()` return the distribution of the length of the edges of a Delaunay
+  triangulation on the observations per survey.
+* Add `unlikely_survey_area()`, `unlikely_status()` and
+  `unlikely_edge_distribution()` to detect surveys which are unlikely done
+  according to the sampling protocol.
+* `distance_matrix()` ignores observations from unlikely surveys.
+  Hence `cluster_observation()` will ignore them during the clustering and
+  `get_cluster()` ignores them.
+* `get_total()` returns the total number of individuals per region based on the
+  clustering.
+* `get_unlikely_summary()` and `get_unlikely_observation()` return the unlikely
+  surveys and observations.
+* Update [`checklist`](https://inbo.github.io/checklist/) machinery.
+
 # territoria 0.0.3
 
 * Update [`checklist`](https://inbo.github.io/checklist/) machinery.

diff --git a/R/distance_matrix.R b/R/distance_matrix.R
@@ -27,9 +27,11 @@ ON distance (distance)"
   sql <- sprintf(
     "WITH cte_obs AS (
   SELECT
-    id, x, y, group_x, group_y, group_x + 1 AS group_xp,
-  group_y + 1 AS group_yp, group_y - 1 AS group_ym
-  FROM observation
+    o.id, o.x, o.y, o.group_x, o.group_y, o.group_x + 1 AS group_xp,
+    o.group_y + 1 AS group_yp, o.group_y - 1 AS group_ym
+  FROM observation AS o
+  LEFT JOIN unlikely AS u ON o.survey = u.survey
+  WHERE u.value IS NULL
 ),
 cte_distance AS (
   SELECT

diff --git a/R/edge_distribution.R b/R/edge_distribution.R
@@ -0,0 +1,40 @@
+#' @title Edge distribution
+#' @description
+#' This function returns the length of the edges of the Delaunay triangulation
+#' per survey.
+#' The function returns only edges with length smaller than twice `max_dist`.
+#' @inheritParams import_observations
+#' @importFrom assertthat assert_that is.number noNA
+#' @importFrom deldir deldir
+#' @importFrom RSQLite dbGetQuery
+#' @export
+edge_distribution <- function(conn, max_dist = 336) {
+  assert_that(
+    inherits(conn, "SQLiteConnection"),
+    is.number(max_dist), noNA(max_dist), max_dist > 0
+  )
+  "SELECT survey, x, y FROM observation" |>
+    dbGetQuery(conn = conn) -> observations
+  table(observations$survey) |>
+    as.data.frame() -> n_survey
+  n_survey[n_survey$Freq > 1, ] |>
+    merge(x = observations, by.y = "Var1", by.x = "survey") -> observations
+  edges <- data.frame(survey = integer(0), length = numeric(0))
+  for (i in unique(observations$survey)) {
+    candidate <- which(observations$survey == i)
+    dd <- deldir(
+      x = rnorm(length(candidate), mean = observations$x[candidate], sd = 0.01),
+      y = rnorm(length(candidate), mean = observations$y[candidate], sd = 0.01),
+      id = observations$id[candidate]
+    )
+    extra <- data.frame(
+      survey = i,
+      length = sqrt(
+        (dd$delsgs$x2 - dd$delsgs$x1) ^ 2 + (dd$delsgs$y2 - dd$delsgs$y1) ^ 2
+      )
+    )
+    extra[extra$length <= 2 * max_dist, ] |>
+      rbind(edges) -> edges
+  }
+  return(edges)
+}
diff --git a/R/get_cluster.R b/R/get_cluster.R
@@ -9,14 +9,19 @@ get_cluster <- function(conn) {
     "observation" %in% dbListTables(conn, "observation"),
     msg = "No observations found. Did you run `import_observations()`?"
   )
-  obs <- dbGetQuery(
-    conn, "SELECT id, x, y, survey, status, cluster FROM observation"
-  )
+  "SELECT
+o.id, o.x, o.y, o.survey, o.status, o.cluster
+FROM observation AS o
+LEFT JOIN unlikely AS u ON o.survey = u.survey
+WHERE u.value IS NULL" |>
+    dbGetQuery(conn = conn) -> obs
   cluster <- dbGetQuery(conn, "
 SELECT
-  cluster, COUNT(cluster) AS n_obs, MAX(status) AS max_status,
-  AVG(x) AS centroid_x, AVG(y) AS centroid_y
-FROM observation
-GROUP BY cluster")
+  o.cluster, COUNT(o.cluster) AS n_obs, MAX(status) AS max_status,
+  AVG(o.x) AS centroid_x, AVG(o.y) AS centroid_y
+FROM observation AS o
+LEFT JOIN unlikely AS u ON o.survey = u.survey
+WHERE u.value IS NULL
+GROUP BY o.cluster")
   return(list(cluster = cluster, observations = obs))
 }
diff --git a/R/get_total.R b/R/get_total.R
@@ -0,0 +1,52 @@
+#' Get the total number of individuals per region
+#' Returns the sum of `value` per region.
+#' Clusters spanning more than one region get each a fraction of `value` based
+#' on the number of observations of the cluster in a region divided by the total
+#' number of observations in the cluster.
+#' @inheritParams import_observations
+#' @param value A string representing the value per cluster to be calculated.
+#' Default is `"IIF(max_status >= 2, 1, iif(n_total > 1, 0.5, 0))"`.
+#' Available variables are
+#'  - `max_status`: the highest status of a cluster.
+#'  - `n_total`: the total number of observations per cluster
+#'  - `n_region`: the number of observations in a cluster from the same region.
+#' @export
+#' @importFrom assertthat assert_that is.string noNA
+#' @importFrom RSQLite dbGetQuery
+get_total <- function(
+  conn, value = "IIF(max_status >= 2, 1, iif(n_total > 1, 0.5, 0))"
+) {
+  assert_that(
+    inherits(conn, "SQLiteConnection"), is.string(value), noNA(value)
+  )
+  sprintf(
+    "WITH cte_obs AS (
+  SELECT o.region, o.cluster, o.status
+  FROM observation AS o
+  LEFT JOIN unlikely AS u ON o.survey = u.survey
+  WHERE u.value IS NULL
+),
+cte_region AS (
+  SELECT cluster, region, COUNT(cluster) AS n_region
+  FROM cte_obs
+  WHERE region IS NOT NULL
+  GROUP BY cluster, region
+),
+cte_cluster AS (
+  SELECT cluster, MAX(status) AS max_status, COUNT(status) AS n_total
+  FROM cte_obs
+  GROUP BY cluster
+),
+cte AS (
+  SELECT
+    r.region, c.cluster, 1.0 * r.n_region / c.n_total AS fraction,
+    %s AS value
+  FROM cte_region AS r
+  INNER JOIN cte_cluster AS c ON r.cluster = c.cluster
+)
+
+SELECT region, SUM(value * fraction) AS individuals FROM cte GROUP BY region",
+    value
+  ) |>
+    dbGetQuery(conn = conn)
+}
diff --git a/R/get_unlikely.R b/R/get_unlikely.R
@@ -0,0 +1,39 @@
+#' Get an overview of why the data of each survey and/or region is unlikely to be the result of the counting protocol.
+#' @inheritParams import_observations
+#' @export
+#' @importFrom assertthat assert_that
+#' @importFrom RSQLite dbGetQuery
+get_unlikely_summary <- function(conn = conn) {
+  assert_that(inherits(conn, "SQLiteConnection"))
+  "WITH cte AS (
+  SELECT survey, region, COUNT(id) n
+  FROM observation
+  WHERE region IS NOT NULL
+  GROUP BY survey, region
+),
+cte_survey AS (
+  SELECT survey, region FROM cte GROUP BY survey HAVING n = MAX(n)
+)
+SELECT c.region, c.survey, u.reason
+FROM cte_survey AS c
+INNER JOIN unlikely AS u ON c.survey = u.survey
+ORDER BY c.region, c.survey" |>
+    dbGetQuery(conn = conn)
+}
+
+#' Get the unlikely observations
+#' @inheritParams import_observations
+#' @inheritParams sf::st_as_sf
+#' @export
+#' @importFrom assertthat assert_that
+#' @importFrom RSQLite dbGetQuery
+#' @importFrom sf st_as_sf
+get_unlikely_observation <- function(conn, crs = 31370) {
+  assert_that(inherits(conn, "SQLiteConnection"))
+  "SELECT o.id, o.x, o.y, o.status, o.region, o.survey, s.original, s.user
+FROM observation AS o
+INNER JOIN unlikely AS u ON o.survey = u.survey
+INNER JOIN survey AS s on o.survey = s.id" |>
+    dbGetQuery(conn = conn) |>
+    st_as_sf(coords = c("x", "y"), crs = crs)
+}