fix: improve RT deviation estimation for subset alignment

jorainer · jorainer · commit 6b90c2943012 · 2025-05-14T13:00:39.000+02:00
- This fixes issue #795 by using `approxfun()` instead of `stepfun()` for the subset-based alignment (and only there!). For adjustment of chrom peaks' retention times we still use the `stepfun()` that will ensure the RT will coincide with the RT of an actual spectrum (not interpolated).
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: xcms
-Version: 4.7.0
+Version: 4.7.1
 Title: LC-MS and GC-MS Data Analysis
 Description: Framework for processing and visualization of chromatographically
     separated and single-spectra mass spectral data. Imports from AIA/ANDI NetCDF,
diff --git a/NEWS.md b/NEWS.md
@@ -1,11 +1,17 @@
+# xcms 4.7
+
+## Changes in version 4.7.1
+
+- Fix retention time deviation estimation for subset-based alignment.
+
 # xcms 4.5
 
 ## Changes in version 4.5.4
 
 - Replace usage of deprecated (and removed) class `NAnnotatedDataFrame` with
   `AnnotatedDataFrame`.
 - Fix a bug in `manualChromPeaks()` that caused an error when only a single
-  chrom peak was added. 
+  chrom peak was added.
 
 ## Changes in version 4.5.3
 
diff --git a/R/XcmsExperiment-plotting.R b/R/XcmsExperiment-plotting.R
@@ -123,7 +123,8 @@ plotAdjustedRtime <- function(object, col = "#00000080", lty = 1, lwd = 1,
     peak_group_adj <- peak_group
     for (i in seq_len(ncol(peak_group)))
         peak_group_adj[, i] <- .applyRtAdjustment(peak_group[, i],
-                                                  rt[[i]], rtadj[[i]])
+                                                  rt[[i]], rtadj[[i]],
+                                                  method = "approxfun")
     diff_rt <- peak_group_adj - peak_group
     if (adjustedRtime)
         xrt <- peak_group_adj
diff --git a/R/do_adjustRtime-functions.R b/R/do_adjustRtime-functions.R
@@ -322,6 +322,9 @@ do_adjustRtime_peakGroups <-
 #'
 #' @param rtadj `numeric` with adjusted retention times.
 #'
+#' @param method `character(1)` either `"stepfun"` (the default) or
+#'     `"approxfun"` to avoid the artifacts observed in issue #795.
+#'
 #' @noRd
 #'
 #' @author Johannes Rainer
@@ -338,14 +341,19 @@ do_adjustRtime_peakGroups <-
 #' ## adjFts[, c("rt", "rtmin", "rtmax")] <- .applyRtAdjustment(feats[, c("rt", "rtmin", "rtmax")], rtr, rtc)
 #'
 #' ## To revert the adjustment: just switch the order of rtr and rtc
-.applyRtAdjustment <- function(x, rtraw, rtadj) {
+.applyRtAdjustment <- function(x, rtraw, rtadj,
+                               method = c("stepfun", "approxfun")) {
+    method <- match.arg(method)
     ## re-order everything if rtraw is not sorted; issue #146
     if (is.unsorted(rtraw)) {
         idx <- order(rtraw)
         rtraw <- rtraw[idx]
         rtadj <- rtadj[idx]
     }
-    adjFun <- stepfun(rtraw[-1] - diff(rtraw) / 2, rtadj)
+    if (method == "approxfun")
+        adjFun <- approxfun(rtraw, rtadj)
+    else
+        adjFun <- stepfun(rtraw[-1] - diff(rtraw) / 2, rtadj)
     res <- adjFun(x)
     ## Fix margins.
     idx_low <- which(x < rtraw[1])
@@ -375,12 +383,16 @@ do_adjustRtime_peakGroups <-
         stop("'rtraw' and 'rtadj' have to have the same length!")
     ## Going to adjust the columns rt, rtmin and rtmax in x.
     ## Using a for loop here.
+    ## Note: we are using `"stepfun"` here on purpose, to be consistent with
+    ## the original code, and as it will adjust retention times to the actual
+    ## adjusted retention times, not mean or interpolated ones.
     for (i in seq_along(rtraw)) {
         whichSample <- which(x[, "sample"] == i)
         if (length(whichSample) && any(rtraw[[i]] != rtadj[[i]])) {
             x[whichSample, c("rt", "rtmin", "rtmax")] <-
                 .applyRtAdjustment(x[whichSample, c("rt", "rtmin", "rtmax")],
-                                   rtraw = rtraw[[i]], rtadj = rtadj[[i]])
+                                   rtraw = rtraw[[i]], rtadj = rtadj[[i]],
+                                   method = "stepfun")
         }
     }
     x
@@ -535,6 +547,11 @@ do_adjustRtime_peakGroups <-
 #' @param method `character` specifying the method with which the non-subset
 #'     samples are adjusted: either `"previous"` or `"average"`. See details.
 #'
+#' @param adjFun `character(1)` defining the function that should be used to
+#'     estimate the retention time deviation. Can be either
+#'     `adjFun = "approxfun"` (default) or `adjFun = "stepfun"` (the behavior
+#'     before version 4.7.1).
+#'
 #' @return `list` of adjusted retention times.
 #'
 #' @author Johannes Rainer
@@ -543,26 +560,28 @@ do_adjustRtime_peakGroups <-
 #'
 #' @md
 adjustRtimeSubset <- function(rtraw, rtadj, subset,
-                              method = c("average", "previous")) {
+                              method = c("average", "previous"),
+                              adjFun = c("approxfun", "stepfun")) {
     method <- match.arg(method)
+    adjFun <- match.arg(adjFun)
     if (length(rtraw) != length(rtadj))
         stop("Lengths of 'rtraw' and 'rtadj' have to match.")
     if (missing(subset))
         subset <- seq_along(rtraw)
     if (!all(subset %in% seq_along(rtraw)))
         stop("'subset' is out of bounds.")
-    ## if (length(subset) == length(rtraw)) {
-    ##     cat("return rtadj\n")
-    ##     return(rtadj)
-    ## }
     no_subset <- seq_len(length(rtraw))[-subset]
+    message("Aligning samples against subset")
+    pb <- progress_bar$new(format = paste0("[:bar] :current/:",
+                                           "total (:percent) in ",
+                                           ":elapsed"),
+                           total = (length(no_subset)), clear = FALSE)
     for (i in no_subset) {
-        message("Aligning sample number ", i, " against subset ... ",
-                appendLF = FALSE)
         if (method == "previous") {
             i_adj <- .get_closest_index(i, subset, method = "previous")
             rtadj[[i]] <- .applyRtAdjustment(rtraw[[i]], rtraw[[i_adj]],
-                                                 rtadj[[i_adj]])
+                                             rtadj[[i_adj]],
+                                             method = adjFun)
         }
         if (method == "average") {
             i_ref <- c(.get_closest_index(i, subset, method = "previous"),
@@ -576,9 +595,9 @@ adjustRtimeSubset <- function(rtraw, rtadj, subset,
             rt_raw_ref <- apply(rt_raw_ref, 1, weighted.mean, w = wghts)
             rt_adj_ref <- apply(rt_adj_ref, 1, weighted.mean, w = wghts)
             rtadj[[i]] <- .applyRtAdjustment(rtraw[[i]], rt_raw_ref,
-                                             rt_adj_ref)
+                                             rt_adj_ref, method = adjFun)
         }
-        message("OK")
+        pb$tick()
     }
     rtadj
 }
diff --git a/tests/testthat/test_do_adjustRtime-functions.R b/tests/testthat/test_do_adjustRtime-functions.R
@@ -45,13 +45,6 @@ test_that("do_adjustRtime_peakGroups works", {
 })
 
 test_that("applyRtAdjustment works", {
-    skip_on_os(os = "windows", arch = "i386")
-
-    xs <- faahko
-    ## group em.
-    ## xsg <- group(xs)
-    ## ## align em.
-    ## xsa <- retcor(xsg, method = "peakgroups")
     pksAdj <- .applyRtAdjToChromPeaks(chromPeaks(xod_xg),
                                       rtraw = rtime(xod_xg, bySample = TRUE),
                                       rtadj = rtime(xod_xgr, bySample = TRUE))
@@ -73,16 +66,86 @@ test_that("applyRtAdjustment works", {
     ## Artificial examples.
     a_raw <- c(1, 2, 3, 5, 6, 7, 8, 10, 12, 13, 14, 16)
     a_adj <- a_raw + 2 # shift by 2
-    b <- .applyRtAdjustment(a_raw, a_raw, a_adj)
+    b <- .applyRtAdjustment(a_raw, a_raw, a_adj, method = "approxfun")
     expect_equal(a_adj, b)
     b_2 <- .applyRtAdjustment(a_raw, a_raw[4:8], a_adj[4:8])
     expect_equal(b, b_2)
+    x <- c(2, 3, 5, 6, 8)
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "approxfun")
+    expect_equal(res, x + 2)
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "stepfun")
+    expect_equal(res, x + 2)
 
     a_adj <- a_raw - 2
     b <- .applyRtAdjustment(a_raw, a_raw, a_adj)
     expect_equal(a_adj, b)
     b_2 <- .applyRtAdjustment(a_raw, a_raw[4:8], a_adj[4:8])
     expect_equal(b, b_2)
+
+    ## Difference between stepfun (old default) and approxfun.
+    a_raw <- seq(1, 100, by = 0.3)
+    a_adj <- a_raw + 0.2
+    x <- seq(4, 20, by = 0.3)
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "stepfun")
+    expect_equal(res, x + 0.2)
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "approxfun")
+    expect_equal(res, x + 0.2)
+
+    x <- seq(1.4, 20, by = 0.3)
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "stepfun")
+    ## expect_equal(res, x + 0.2) # error!
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "approxfun")
+    expect_equal(res, x + 0.2)
+
+    a_adj <- a_raw + 1.3
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "stepfun")
+    ## expect_equal(res, x + 1.3) # error!
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "approxfun")
+    expect_equal(res, x + 1.3)
+
+    ## small increments
+    a_raw <- seq(0.1, 100, by = 0.03)
+    a_adj <- sort(a_raw + 0.2)
+
+    res <- .applyRtAdjustment(a_raw, a_raw, a_adj, method = "stepfun")
+    expect_equal(a_adj, res)
+    expect_equal(quantile(diff(a_adj)), quantile(diff(res)))
+
+    set.seed(123)
+    x <- seq(2.02, 90.02, by = 0.03)
+    x <- sort(x + rnorm(length(x), 0, 0.001))
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "stepfun")
+    ## expect_equal(res, x + 0.2)
+    ## plot(res, res - x, type = "l")
+
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "approxfun")
+    expect_equal(quantile(diff(res)), quantile(diff(x)))
+    expect_equal(res, x + 0.2)
+    ## plot(res, res - x, type = "l")
+
+    ## deviation is smaller than diff
+    a_raw <- seq(0.1, 100, by = 1.2)
+    a_adj <- sort(a_raw + 0.2)
+
+    res <- .applyRtAdjustment(a_raw, a_raw, a_adj, method = "stepfun")
+    expect_equal(a_adj, res)
+    expect_equal(diff(a_adj), diff(res))
+
+    ## Now, that's an issue:
+    ## we should have a constant shift by 0.2
+    x <- seq(2, 90, by = 1.2)
+    x <- sort(x + rnorm(length(x), mean = 0, sd = 0.001))
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "stepfun")
+    ## plot(res, res - x, type = "l")
+    ## For a constant shift we expect the difference between consecutive values
+    ## to stay the same, but the test below fails.
+    ## expect_equal(diff(x), diff(res))
+
+    res <- .applyRtAdjustment(x, a_raw, a_adj, method = "approxfun")
+    expect_equal(diff(x), diff(res))
+    expect_equal(mean(res - x), 0.2)
+    expect_equal(res, x + 0.2)
+    ## plot(res, res - x, type = "l")
 })
 
 test_that(".get_closest_index works", {
@@ -136,42 +199,65 @@ test_that(".match_trim_vectors and index works", {
 })
 
 test_that("adjustRtimeSubset works", {
-    skip_on_os(os = "windows", arch = "i386")
-
     rt_raw <- rtime(xod_xgr, adjusted = FALSE, bySample = TRUE)
     rt_adj <- rtime(xod_xgr, adjusted = TRUE, bySample = TRUE)
 
-    res <- adjustRtimeSubset(rt_raw, rt_adj, subset = c(1, 3),
-                             method = "previous")
+    res <- xcms:::adjustRtimeSubset(rt_raw, rt_adj, subset = c(1, 3),
+                             method = "previous", adjFun = "stepfun")
     expect_equal(res[[1]], rt_adj[[1]])
     expect_equal(res[[3]], rt_adj[[3]])
     expect_true(all(res[[2]] != rt_adj[[2]]))
     expect_equal(names(res[[2]]), names(rt_adj[[2]]))
     expect_equal(unname(res[[2]]), unname(rt_adj[[1]]))
 
-    a <- res[[1]] - rt_raw[[1]]
-    b <- res[[2]] - rt_raw[[2]]
-    c <- res[[3]] - rt_raw[[3]]
-    plot(res[[1]], a, type = "l", col = "#ff000040", lty = 2,
-         ylim = range(a, b, c))
-    points(res[[2]], b, type = "l", col = "#00ff0060", lty = 1)
-    points(res[[3]], c, type = "l", col = "#0000ff40", lty = 2)
+    res <- xcms:::adjustRtimeSubset(rt_raw, rt_adj, subset = c(1, 3),
+                             method = "previous", adjFun = "approxfun")
+    expect_equal(res[[1]], rt_adj[[1]])
+    expect_equal(res[[3]], rt_adj[[3]])
+    expect_true(all(res[[2]] != rt_adj[[2]]))
+    ## Values are no longer IDENTICAL, but highly similar:
+    expect_true(median(res[[2]] - rt_adj[[1]]) == 0)
+    expect_true(max(abs(res[[2]] - rt_adj[[1]])) < 0.002)
+
+    ## a <- res[[1]] - rt_raw[[1]]
+    ## b <- res[[2]] - rt_raw[[2]]
+    ## c <- res[[3]] - rt_raw[[3]]
+    ## plot(res[[1]], a, type = "l", col = "#ff000040", lty = 2,
+    ##      ylim = range(a, b, c))
+    ## points(res[[2]], b, type = "l", col = "#00ff0060", lty = 1)
+    ## points(res[[3]], c, type = "l", col = "#0000ff40", lty = 2)
+
+    res <- xcms:::adjustRtimeSubset(rt_raw, rt_adj, subset = c(1, 3),
+                             method = "average", adjFun = "stepfun")
+    expect_equal(res[[1]], rt_adj[[1]])
+    expect_equal(res[[3]], rt_adj[[3]])
+    expect_true(all(res[[2]] != rt_adj[[2]]))
+    expect_true(all(res[[2]] != rt_adj[[1]]))
+    expect_true(all(res[[2]] != rt_adj[[3]]))
+
+    ## a <- res[[1]] - rt_raw[[1]]
+    ## b <- res[[2]] - rt_raw[[2]]
+    ## c <- res[[3]] - rt_raw[[3]]
+    ## plot(res[[1]], a, type = "l", col = "#ff000040", lty = 2,
+    ##      ylim = range(a, b, c))
+    ## points(res[[2]], b, type = "l", col = "#00ff0060", lty = 1)
+    ## points(res[[3]], c, type = "l", col = "#0000ff40", lty = 2)
 
-    res <- adjustRtimeSubset(rt_raw, rt_adj, subset = c(1, 3),
-                             method = "average")
+    res <- xcms:::adjustRtimeSubset(rt_raw, rt_adj, subset = c(1, 3),
+                             method = "average", adjFun = "approxfun")
     expect_equal(res[[1]], rt_adj[[1]])
     expect_equal(res[[3]], rt_adj[[3]])
     expect_true(all(res[[2]] != rt_adj[[2]]))
     expect_true(all(res[[2]] != rt_adj[[1]]))
     expect_true(all(res[[2]] != rt_adj[[3]]))
 
-    a <- res[[1]] - rt_raw[[1]]
-    b <- res[[2]] - rt_raw[[2]]
-    c <- res[[3]] - rt_raw[[3]]
-    plot(res[[1]], a, type = "l", col = "#ff000040", lty = 2,
-         ylim = range(a, b, c))
-    points(res[[2]], b, type = "l", col = "#00ff0060", lty = 1)
-    points(res[[3]], c, type = "l", col = "#0000ff40", lty = 2)
+    ## a <- res[[1]] - rt_raw[[1]]
+    ## b <- res[[2]] - rt_raw[[2]]
+    ## c <- res[[3]] - rt_raw[[3]]
+    ## plot(res[[1]], a, type = "l", col = "#ff000040", lty = 2,
+    ##      ylim = range(a, b, c))
+    ## points(res[[2]], b, type = "l", col = "#00ff0060", lty = 1)
+    ## points(res[[3]], c, type = "l", col = "#0000ff40", lty = 2)
 })
 
 test_that(".adjustRtime_peakGroupsMatrix works", {
diff --git a/tests/testthat/test_methods-XCMSnExp.R b/tests/testthat/test_methods-XCMSnExp.R
@@ -1619,7 +1619,6 @@ test_that("calibrate,XCMSnExp works", {
 })
 
 test_that("adjustRtime,peakGroups works", {
-    skip_on_os(os = "windows", arch = "i386")
 
     xod <- faahko_xod
     xodg <- groupChromPeaks(
@@ -1694,19 +1693,24 @@ test_that("adjustRtime,peakGroups works", {
                     rtime(xodg, bySample = TRUE)[[2]]))
     expect_true(all(rtime(res_sub, bySample = TRUE)[[3]] !=
                     rtime(xodg, bySample = TRUE)[[3]]))
-    expect_equal(unname(rtime(res_sub, bySample = TRUE)[[1]]),
-                 unname(rtime(res_sub, bySample = TRUE)[[2]]))
+    ## With `adjFun = "approxfun"` it's no longer **identical** but highly
+    ## similar!
+    expect_true(max(abs(rtime(res_sub, bySample = TRUE)[[1L]] -
+                        rtime(res_sub, bySample = TRUE)[[2L]])) < 0.0015)
+    ## expect_equal(unname(rtime(res_sub, bySample = TRUE)[[1]]),
+    ##              unname(rtime(res_sub, bySample = TRUE)[[2]]))
     expect_equal(rtime(res_sub, bySample = TRUE)[[2]],
                  .applyRtAdjustment(rtime(xodg, bySample = TRUE)[[2]],
-                                           rtime(xodg, bySample = TRUE)[[1]],
-                                           rtime(res_sub, bySample = TRUE)[[1]]))
+                                    rtime(xodg, bySample = TRUE)[[1]],
+                                    rtime(res_sub, bySample = TRUE)[[1]],
+                                    method = "approxfun"))
     res_sub <- adjustRtime(
         xodg, param = PeakGroupsParam(subset = c(1, 3),
                                       subsetAdjust = "average"))
     expect_true(all(rtime(res_sub, bySample = TRUE)[[1]] !=
                     rtime(xodg, bySample = TRUE)[[1]]))
-    expect_true(all(rtime(res_sub, bySample = TRUE)[[2]] !=
-                    rtime(xodg, bySample = TRUE)[[2]]))
+    expect_false(all(rtime(res_sub, bySample = TRUE)[[2]] ==
+                     rtime(xodg, bySample = TRUE)[[2]]))
     expect_true(all(rtime(res_sub, bySample = TRUE)[[3]] !=
                     rtime(xodg, bySample = TRUE)[[3]]))
     expect_true(any(rtime(res_sub, bySample = TRUE)[[1]] !=