Skip to content

Commit 0e0e8fe

Browse files
Add biomedical statistical tests: Wilcoxon and Mann-Whitney U tests (#221)
1 parent 6f2a72c commit 0e0e8fe

File tree

3 files changed

+690
-0
lines changed

3 files changed

+690
-0
lines changed

DIRECTORY.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
## Association Algorithms
33
* [Apriori](https://github.com/TheAlgorithms/R/blob/HEAD/association_algorithms/apriori.r)
44

5+
## Biomedical
6+
* [Mann Whitney U Test](https://github.com/TheAlgorithms/R/blob/HEAD/biomedical/mann_whitney_u_test.r)
7+
* [Wilcoxon Signed Rank Test](https://github.com/TheAlgorithms/R/blob/HEAD/biomedical/wilcoxon_signed_rank_test.r)
8+
59
## Classification Algorithms
610
* [Decision Tree](https://github.com/TheAlgorithms/R/blob/HEAD/classification_algorithms/decision_tree.r)
711
* [Gradient Boosting Algorithms](https://github.com/TheAlgorithms/R/blob/HEAD/classification_algorithms/gradient_boosting_algorithms.r)
@@ -69,6 +73,7 @@
6973
## Machine Learning
7074
* [Gradient Boosting](https://github.com/TheAlgorithms/R/blob/HEAD/machine_learning/gradient_boosting.r)
7175

76+
7277
## Mathematics
7378
* [Amicable Numbers](https://github.com/TheAlgorithms/R/blob/HEAD/mathematics/amicable_numbers.r)
7479
* [Armstrong Number](https://github.com/TheAlgorithms/R/blob/HEAD/mathematics/armstrong_number.r)

biomedical/mann_whitney_u_test.r

Lines changed: 368 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,368 @@
1+
# Mann-Whitney U Test (Wilcoxon Rank-Sum Test) for Biomedical Data Analysis
2+
# Author: Contributor
3+
# Description: Implementation of Mann-Whitney U test for comparing two independent groups
4+
# Common applications: Comparing treatment vs control groups, comparing different populations
5+
6+
mann_whitney_u_test <- function(x, y, alternative = "two.sided", exact = FALSE) {
7+
#' Mann-Whitney U Test (Wilcoxon Rank-Sum Test)
8+
#'
9+
#' Performs the Mann-Whitney U test to compare two independent samples
10+
#'
11+
#' @param x numeric vector of data values for group 1
12+
#' @param y numeric vector of data values for group 2
13+
#' @param alternative character, alternative hypothesis ("two.sided", "less", "greater")
14+
#' @param exact logical, whether to compute exact p-values for small samples
15+
#'
16+
#' @return list containing:
17+
#' - statistic: test statistic (U)
18+
#' - p_value: p-value of the test
19+
#' - U1: U statistic for group 1
20+
#' - U2: U statistic for group 2
21+
#' - W1: sum of ranks for group 1
22+
#' - W2: sum of ranks for group 2
23+
#' - n1: sample size of group 1
24+
#' - n2: sample size of group 2
25+
#' - method: description of test performed
26+
#' - alternative: alternative hypothesis
27+
#' - data_summary: summary of input data
28+
29+
# Input validation
30+
if (!is.numeric(x) || !is.numeric(y)) {
31+
stop("Both x and y must be numeric vectors")
32+
}
33+
34+
if (length(x) == 0 || length(y) == 0) {
35+
stop("Both groups must contain at least one observation")
36+
}
37+
38+
if (!alternative %in% c("two.sided", "less", "greater")) {
39+
stop("alternative must be 'two.sided', 'less', or 'greater'")
40+
}
41+
42+
# Remove missing values
43+
x <- x[!is.na(x)]
44+
y <- y[!is.na(y)]
45+
46+
n1 <- length(x)
47+
n2 <- length(y)
48+
49+
if (n1 == 0 || n2 == 0) {
50+
stop("Groups cannot be empty after removing missing values")
51+
}
52+
53+
# Combine data and calculate ranks
54+
combined <- c(x, y)
55+
N <- n1 + n2
56+
ranks <- rank(combined, ties.method = "average")
57+
58+
# Sum of ranks for each group
59+
W1 <- sum(ranks[1:n1]) # Sum of ranks for group 1
60+
W2 <- sum(ranks[(n1 + 1):N]) # Sum of ranks for group 2
61+
62+
# Calculate U statistics
63+
U1 <- W1 - n1 * (n1 + 1) / 2
64+
U2 <- W2 - n2 * (n2 + 1) / 2
65+
66+
# Verify: U1 + U2 should equal n1 * n2
67+
if (abs(U1 + U2 - n1 * n2) > 1e-10) {
68+
warning("U statistics calculation may have numerical errors")
69+
}
70+
71+
# Test statistic (traditionally the smaller U, but depends on alternative)
72+
if (alternative == "greater") {
73+
# Testing if group 1 > group 2, so we want U1
74+
U_stat <- U1
75+
} else if (alternative == "less") {
76+
# Testing if group 1 < group 2, so we want U1 (smaller values)
77+
U_stat <- U1
78+
} else {
79+
# Two-sided: use the smaller U for traditional reporting
80+
U_stat <- min(U1, U2)
81+
}
82+
83+
# Calculate p-value
84+
if (n1 >= 8 && n2 >= 8 && !exact) {
85+
# Normal approximation for large samples
86+
mu_U <- n1 * n2 / 2
87+
88+
# Check for ties and adjust variance if necessary
89+
ties <- table(combined)
90+
tie_correction <- sum(ties^3 - ties) / (N * (N - 1))
91+
92+
var_U <- n1 * n2 * (N + 1) / 12 - n1 * n2 * tie_correction / (12 * (N - 1))
93+
sigma_U <- sqrt(var_U)
94+
95+
# Apply continuity correction
96+
if (alternative == "two.sided") {
97+
z <- (abs(U1 - mu_U) - 0.5) / sigma_U
98+
p_value <- 2 * pnorm(z, lower.tail = FALSE)
99+
} else if (alternative == "less") {
100+
z <- (U1 + 0.5 - mu_U) / sigma_U
101+
p_value <- pnorm(z)
102+
} else { # greater
103+
z <- (U1 - 0.5 - mu_U) / sigma_U
104+
p_value <- pnorm(z, lower.tail = FALSE)
105+
}
106+
107+
method <- "Mann-Whitney U test with normal approximation"
108+
109+
} else {
110+
# For small samples or when exact is requested
111+
if (exact && n1 <= 20 && n2 <= 20) {
112+
# Note: Exact calculation would require generating all possible rank combinations
113+
# This is computationally intensive and typically done using lookup tables
114+
p_value <- NA
115+
method <- "Mann-Whitney U test (exact method - requires lookup tables)"
116+
warning("Exact p-value calculation requires specialized tables. Using NA.")
117+
} else {
118+
p_value <- NA
119+
method <- "Mann-Whitney U test"
120+
warning("Sample sizes are small. Consider using exact tables for p-value calculation.")
121+
}
122+
}
123+
124+
# Prepare data summary
125+
data_summary <- list(
126+
n1 = n1,
127+
n2 = n2,
128+
median_x = median(x),
129+
mean_x = mean(x),
130+
sd_x = sd(x),
131+
median_y = median(y),
132+
mean_y = mean(y),
133+
sd_y = sd(y),
134+
combined_median = median(combined),
135+
combined_mean = mean(combined)
136+
)
137+
138+
# Return results
139+
result <- list(
140+
statistic = U_stat,
141+
p_value = p_value,
142+
U1 = U1,
143+
U2 = U2,
144+
W1 = W1,
145+
W2 = W2,
146+
n1 = n1,
147+
n2 = n2,
148+
method = method,
149+
alternative = alternative,
150+
data_summary = data_summary
151+
)
152+
153+
class(result) <- "mann_whitney_test"
154+
return(result)
155+
}
156+
157+
# Print method for mann_whitney_test objects
158+
print.mann_whitney_test <- function(x, ...) {
159+
cat("\n", x$method, "\n")
160+
cat("Data summary:\n")
161+
cat(" Group 1 (x): n =", x$n1, ", median =", round(x$data_summary$median_x, 3),
162+
", mean =", round(x$data_summary$mean_x, 3), ", SD =", round(x$data_summary$sd_x, 3), "\n")
163+
cat(" Group 2 (y): n =", x$n2, ", median =", round(x$data_summary$median_y, 3),
164+
", mean =", round(x$data_summary$mean_y, 3), ", SD =", round(x$data_summary$sd_y, 3), "\n")
165+
166+
cat("\nTest results:\n")
167+
cat(" W1 (sum of ranks, group 1) =", x$W1, "\n")
168+
cat(" W2 (sum of ranks, group 2) =", x$W2, "\n")
169+
cat(" U1 (U statistic, group 1) =", x$U1, "\n")
170+
cat(" U2 (U statistic, group 2) =", x$U2, "\n")
171+
cat(" Test statistic U =", x$statistic, "\n")
172+
cat(" Alternative hypothesis:", x$alternative, "\n")
173+
174+
if (!is.na(x$p_value)) {
175+
cat(" P-value =", round(x$p_value, 6), "\n")
176+
177+
# Interpretation
178+
if (x$p_value < 0.001) {
179+
significance <- "highly significant (p < 0.001)"
180+
} else if (x$p_value < 0.01) {
181+
significance <- "very significant (p < 0.01)"
182+
} else if (x$p_value < 0.05) {
183+
significance <- "significant (p < 0.05)"
184+
} else {
185+
significance <- "not significant (p >= 0.05)"
186+
}
187+
cat(" Result:", significance, "\n")
188+
} else {
189+
cat(" P-value: Not calculated (exact method required)\n")
190+
}
191+
192+
# Effect size estimate
193+
cat(" Effect size estimate (r) =", round(abs(x$U1 - x$n1 * x$n2 / 2) / sqrt(x$n1 * x$n2 * (x$n1 + x$n2 + 1) / 12), 3), "\n")
194+
}
195+
196+
# ==============================================================================
197+
# EXAMPLES WITH BIOMEDICAL DUMMY DATA
198+
# ==============================================================================
199+
200+
run_biomedical_examples <- function() {
201+
cat("=================================================================\n")
202+
cat("MANN-WHITNEY U TEST - BIOMEDICAL EXAMPLES\n")
203+
cat("=================================================================\n\n")
204+
205+
# Example 1: Treatment vs Control group - Drug efficacy
206+
cat("EXAMPLE 1: Drug Efficacy Study (Treatment vs Control)\n")
207+
cat("-----------------------------------------------------------------\n")
208+
209+
# Dummy data: Improvement scores (0-100)
210+
set.seed(123)
211+
treatment_group <- c(78, 85, 92, 73, 88, 91, 76, 83, 89, 95,
212+
80, 87, 94, 71, 86, 93, 79, 84, 90, 77)
213+
214+
control_group <- c(65, 58, 71, 62, 69, 54, 67, 60, 73, 56,
215+
63, 70, 59, 66, 52, 68, 61, 74, 57, 64)
216+
217+
cat("Treatment group scores (n=20):", treatment_group, "\n")
218+
cat("Control group scores (n=20):", control_group, "\n\n")
219+
220+
result1 <- mann_whitney_u_test(treatment_group, control_group,
221+
alternative = "greater")
222+
print(result1)
223+
224+
cat("\nClinical Interpretation:\n")
225+
cat("This test compares the effectiveness of a new drug vs placebo.\n")
226+
cat("H0: No difference between treatment and control groups\n")
227+
cat("H1: Treatment group shows greater improvement than control\n\n")
228+
229+
# Example 2: Male vs Female biomarker levels
230+
cat("\n=================================================================\n")
231+
cat("EXAMPLE 2: Biomarker Analysis by Gender (Male vs Female)\n")
232+
cat("-----------------------------------------------------------------\n")
233+
234+
# Dummy data: Protein biomarker levels (ng/mL)
235+
set.seed(456)
236+
male_levels <- c(12.3, 15.7, 11.8, 14.2, 16.1, 13.5, 12.9, 15.3, 14.8, 13.1,
237+
11.6, 16.4, 12.7, 14.9, 13.8, 15.2, 12.1, 14.6, 13.3, 15.9)
238+
239+
female_levels <- c(9.8, 10.5, 8.9, 11.2, 9.3, 10.8, 8.7, 10.1, 9.6, 11.5,
240+
8.4, 10.9, 9.1, 10.3, 8.8, 11.1, 9.4, 10.6, 8.6, 10.2)
241+
242+
cat("Male biomarker levels (ng/mL):", round(male_levels, 1), "\n")
243+
cat("Female biomarker levels (ng/mL):", round(female_levels, 1), "\n\n")
244+
245+
result2 <- mann_whitney_u_test(male_levels, female_levels,
246+
alternative = "two.sided")
247+
print(result2)
248+
249+
cat("\nClinical Interpretation:\n")
250+
cat("This test examines gender differences in biomarker expression.\n")
251+
cat("H0: No difference in biomarker levels between males and females\n")
252+
cat("H1: There is a difference in biomarker levels between genders\n\n")
253+
254+
# Example 3: Disease severity - Stage I vs Stage II
255+
cat("\n=================================================================\n")
256+
cat("EXAMPLE 3: Disease Severity (Stage I vs Stage II)\n")
257+
cat("-----------------------------------------------------------------\n")
258+
259+
# Dummy data: Disease severity scores
260+
set.seed(789)
261+
stage_1 <- c(18, 22, 15, 20, 25, 17, 19, 23, 16, 21, 24, 14, 18, 22, 19)
262+
stage_2 <- c(35, 42, 38, 29, 45, 33, 40, 37, 31, 44, 36, 39, 32, 43, 34, 41, 28, 46, 30)
263+
264+
cat("Stage I severity scores (n=15):", stage_1, "\n")
265+
cat("Stage II severity scores (n=19):", stage_2, "\n\n")
266+
267+
result3 <- mann_whitney_u_test(stage_1, stage_2, alternative = "less")
268+
print(result3)
269+
270+
cat("\nClinical Interpretation:\n")
271+
cat("This test compares disease severity between different stages.\n")
272+
cat("H0: No difference in severity between Stage I and Stage II\n")
273+
cat("H1: Stage I has lower severity scores than Stage II\n\n")
274+
275+
# Example 4: Age groups - Young vs Elderly immune response
276+
cat("\n=================================================================\n")
277+
cat("EXAMPLE 4: Immune Response by Age Group (Young vs Elderly)\n")
278+
cat("-----------------------------------------------------------------\n")
279+
280+
# Dummy data: Antibody titers (IU/mL)
281+
set.seed(321)
282+
young_adults <- c(450, 520, 380, 490, 560, 420, 480, 510, 440, 500,
283+
470, 530, 410, 485, 525, 395, 475, 515, 435, 495)
284+
285+
elderly <- c(280, 320, 250, 310, 290, 270, 300, 260, 330, 285,
286+
275, 315, 255, 305, 295, 265, 325, 245, 290, 280, 310, 270)
287+
288+
cat("Young adults antibody titers (IU/mL):", young_adults, "\n")
289+
cat("Elderly antibody titers (IU/mL):", elderly, "\n\n")
290+
291+
result4 <- mann_whitney_u_test(young_adults, elderly, alternative = "greater")
292+
print(result4)
293+
294+
cat("\nClinical Interpretation:\n")
295+
cat("This test compares immune response between age groups.\n")
296+
cat("H0: No difference in antibody titers between young and elderly\n")
297+
cat("H1: Young adults have higher antibody titers than elderly\n\n")
298+
299+
# Example 5: Pre-diabetic vs Diabetic glucose levels
300+
cat("\n=================================================================\n")
301+
cat("EXAMPLE 5: Glucose Levels (Pre-diabetic vs Diabetic)\n")
302+
cat("-----------------------------------------------------------------\n")
303+
304+
# Dummy data: Fasting glucose levels (mg/dL)
305+
set.seed(654)
306+
prediabetic <- c(108, 115, 102, 118, 125, 110, 112, 120, 105, 117,
307+
122, 106, 114, 119, 103, 116, 123, 107, 113, 121)
308+
309+
diabetic <- c(145, 165, 138, 172, 156, 149, 183, 142, 168, 151,
310+
178, 135, 161, 154, 175, 140, 164, 158, 181, 147, 170, 153)
311+
312+
cat("Pre-diabetic glucose levels (mg/dL):", prediabetic, "\n")
313+
cat("Diabetic glucose levels (mg/dL):", diabetic, "\n\n")
314+
315+
result5 <- mann_whitney_u_test(prediabetic, diabetic, alternative = "less")
316+
print(result5)
317+
318+
cat("\nClinical Interpretation:\n")
319+
cat("This test compares glucose levels between pre-diabetic and diabetic patients.\n")
320+
cat("H0: No difference in glucose levels between groups\n")
321+
cat("H1: Pre-diabetic patients have lower glucose levels than diabetic patients\n\n")
322+
323+
cat("=================================================================\n")
324+
cat("END OF EXAMPLES\n")
325+
cat("=================================================================\n")
326+
}
327+
328+
# Helper function to compare with built-in R function (if available)
329+
compare_with_r_builtin <- function() {
330+
cat("\n=================================================================\n")
331+
cat("COMPARISON WITH R's BUILT-IN wilcox.test() FUNCTION\n")
332+
cat("=================================================================\n\n")
333+
334+
# Generate sample data
335+
set.seed(123)
336+
group1 <- rnorm(15, mean = 50, sd = 10)
337+
group2 <- rnorm(18, mean = 45, sd = 12)
338+
339+
cat("Comparison using sample data:\n")
340+
cat("Group 1:", round(group1, 2), "\n")
341+
cat("Group 2:", round(group2, 2), "\n\n")
342+
343+
# Our implementation
344+
cat("Our implementation:\n")
345+
our_result <- mann_whitney_u_test(group1, group2, alternative = "two.sided")
346+
print(our_result)
347+
348+
# R's built-in function
349+
cat("\nR's built-in wilcox.test():\n")
350+
r_result <- wilcox.test(group1, group2, alternative = "two.sided", exact = FALSE)
351+
print(r_result)
352+
353+
cat("\nNote: Small differences may occur due to different tie-handling methods\n")
354+
cat("and continuity corrections, but results should be very similar.\n")
355+
}
356+
357+
# Examples are available but not run automatically to avoid side effects
358+
# To run examples, execute: run_biomedical_examples()
359+
# To compare with R built-in, execute: compare_with_r_builtin()
360+
if (interactive()) {
361+
cat("Loading Mann-Whitney U Test implementation...\n")
362+
cat("Run 'run_biomedical_examples()' to see biomedical examples.\n")
363+
cat("Run 'compare_with_r_builtin()' to compare with R's wilcox.test().\n")
364+
}
365+
366+
# Uncomment the following lines to run examples automatically:
367+
# run_biomedical_examples()
368+
# compare_with_r_builtin()

0 commit comments

Comments
 (0)