From 5a20d3d3fbccd81567ad2a611eff1a6418a6e269 Mon Sep 17 00:00:00 2001
From: Kenta Sato <bicycle1885@gmail.com>
Date: Sat, 28 Jun 2014 23:35:58 +0900
Subject: [PATCH 1/2] WIP: more accurate KSDist function

---
 src/univariate/ksdist.jl | 381 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 380 insertions(+), 1 deletion(-)

diff --git a/src/univariate/ksdist.jl b/src/univariate/ksdist.jl
index 0dc60f8f81..04bb61381c 100644
--- a/src/univariate/ksdist.jl
+++ b/src/univariate/ksdist.jl
@@ -1,5 +1,5 @@
 # Distribution of the (two-sided) Kolmogorov-Smirnoff statistic
-#   D_n = \sup_x |\hat{F}_n(x) -F(x)|
+#   D_n = \sup_x |\hat{F}_n(x) - F(x)|
 #   sqrt(n) D_n converges a.s. to the Kolmogorov distribution.
 immutable KSDist <: ContinuousUnivariateDistribution
     n::Int
@@ -15,6 +15,9 @@ minimum(d::KSDist) = 1 / (2 * d.n)
 maximum(d::KSDist) = 1.0
 insupport(d::KSDist, x::Real) = minimum(d) <= x <= 1.0
 
+const NEXACT = 500
+const NKOLMO = 100_000
+
 # TODO: implement Simard and L'Ecuyer (2011) meta-algorithm
 # requires Pomeranz and Pelz-Good algorithms
 function cdf(d::KSDist,x::Real)
@@ -46,6 +49,50 @@ function cdf(d::KSDist,x::Real)
     end
 end
 
+# Simard and L'Ecuyer (2011) meta-algorithm uses several algorithms
+# to compute exact or approximated values.  Dividing the argument
+# space into subspaces, a suitable algorithm is selected from the
+# viewpoint of precision and computing cost.
+#
+#   Simard, Richard, and Pierre L’Ecuyer.
+#   "Computing the two-sided Kolmogorov-Smirnov distribution."
+#   Journal of Statistical Software 39.11 (2011): 1-18.
+#
+#   http://www.jstatsoft.org/v39/i11/paper
+#
+#            Section                       Method
+#   ----------------------------------------------------------
+#   2. Exact methods
+#     2.1. Ruben and Gambino (1982)    cdf_ruben_gambino(d, x)
+#     2.2. Pomeranz (1974)             cdf_pomeranz(d, x)
+#     2.3. Durbin (1973)               cdf_durbin(d, x)
+#   3. Asymptotic approximations
+#          Pelz and Good (1976)        cdf_pelz_good(d, x)
+#
+function cdf2(d::KSDist, x::Real)
+    # debug
+    n = d.n
+    u = cdf_ruben_gambino(d, x)
+
+    if !isnan(u)
+        return u
+    end
+
+    w = n * x * x
+    if n <= NEXACT
+        if w < 0.754693
+            return cdf_durbin(d, x)
+        elseif w < 4.0
+            return cdf_pomeranz(d, x)
+        end
+        return 1.0 - ccdf2(d, x)
+    elseif w * x * n <= 7.0 && n <= NKOLMO
+        return cdf_durbin(d, x)
+    end
+
+    return cdf_pelz_good(d, x)
+end
+
 function ccdf(d::KSDist,x::Real)    
     n = d.n
     b = x*n
@@ -70,10 +117,33 @@ function ccdf(d::KSDist,x::Real)
     end
 end
 
+function ccdf2(d::KSDist, x::Real)
+    n = d.n
+    u = ccdf_ruben_gambino(d, x)
+
+    if !isnan(u)
+        return u
+    end
+
+    w = n * x * x
+    if n <= NEXACT
+        if w < 4.0
+            return 1.0 - cdf(d, x)
+        else
+            return ccdf_miller(d, x)
+        end
+    elseif w >= 2.65
+        return ccdf_miller(d, x)
+    end
+
+    return 1.0 - cdf2(d, x)
+end
 
 # Durbin matrix CDF method, based on Marsaglia, Tsang and Wang (2003)
 # modified to avoid need for exponent tracking
 function cdf_durbin(d::KSDist,x::Float64)
+    #debug
+    warn("cdf_durbin")
     n = d.n
     k, ch, h = ceil_rems_mult(n,x)
 
@@ -102,6 +172,306 @@ function cdf_durbin(d::KSDist,x::Float64)
     s*stirling(n)
 end
 
+# Ruben-Gambino's exact formulae
+function cdf_ruben_gambino(d::KSDist, x::Float64)
+    #debug
+    warn("cdf_ruben_gambino")
+    n = d.n
+
+    if n * x * x >= 18.0 || x >= 1.0
+        return 1.0
+    elseif x <= 0.5 / n
+        return 0.0
+    elseif n == 1
+        return 2x - 1.0
+    elseif x <= 1.0 / n
+        # n!(2x - 1/n)^n
+        t = 2.0 * x * n - 1.0
+        if n <= NEXACT
+            w = rapfac(n)
+            return w * t^n
+        else
+            w = lfact(n) + n * log(t / n)
+            return exp(w)
+        end
+    elseif x >= 1.0 - 1.0 / n
+        return 1.0 - 2(1.0 - x)^n
+    end
+
+    # cannot compute exact value, give up
+    return NaN
+end
+
+function ccdf_ruben_gambino(d::KSDist, x::Float64)
+    #debug
+    warn("ccdf_ruben_gambino")
+    n = d.n
+    w = n * x * x
+    if w >= 370.0 || x >= 1.0
+        return 0.0
+    elseif w <= 0.0274 || x <= 0.5 / n
+        return 1.0
+    elseif n == 1
+        return 2.0 - 2.0 * x
+    end
+
+    if x <= 1.0 / n
+        t = 2.0 * x * n - 1.0
+        if n <= NEXACT
+            z = rapfac(n)
+            warn("one")
+            return 1.0 - z * t^n
+        end
+        z = lfact(n) + n * log(t / n)
+        warn("two")
+        return 1.0 - exp(z)
+    elseif x >= 1.0 - 1.0 / n
+        warn("three")
+        return 2.0 * (1.0 - x)^n
+    end
+
+    # cannot compute exact value, give up
+    warn("four")
+    return NaN
+end
+
+# Pelz-Good's asymptotic approximation
+function cdf_pelz_good(d::KSDist, x::Float64)
+    #debug
+    warn("cdf_pelz_good")
+    # constants
+    n = d.n
+    JMAX = 20
+    EPS = 1.0e-10
+    C = 2.506628274631001  # sqrt(2*pi)
+    C2 = 1.2533141373155001  # sqrt(pi/2)
+    PI = pi
+    PI2 = PI * PI
+    PI4 = PI2 * PI2
+    RACN = sqrt(n)
+    z = RACN * x
+    z2 = z * z
+    z4 = z2 * z2
+    z6 = z4 * z2
+    w = PI2 / (2.0 * z * z)
+
+    term = 1.0
+    j = 0
+    sum = 0.0
+    local ti
+    while j <= JMAX && term > EPS * sum
+        ti = j + 0.5
+        term = exp(-ti * ti * w)
+        sum += term
+        j += 1
+    end
+    sum *= C / z
+
+    term = 1.0
+    tom = 0.0
+    j = 0
+    while j <= JMAX && abs(term) > EPS * abs(tom)
+        ti = j + 0.5
+        term = (PI2 * ti * ti - z2) * exp(-ti * ti * w)
+        tom += term
+        j += 1
+    end
+    sum += tom * C2 / (RACN * 3.0 * z4)
+
+    term = 1.0
+    tom = 0.0
+    j = 0
+    while j <= JMAX && abs(term) > EPS * abs(tom)
+        ti = j + 0.5
+        term = (6z6 + 2z4 + PI2 * (2z4 - 5z2) * ti * ti +
+            PI4 * (1.0 - 2z2) * ti * ti * ti * ti)
+        term *= exp(-ti * ti * w)
+        tom += term
+        j += 1
+    end
+    sum += tom * C2 / (n * 36.0 * z * z6)
+
+    term = 1.0
+    tom = 0.0
+    j = 1
+    while j <= JMAX && term > EPS * tom
+        ti = j
+        term = PI2 * ti * ti * exp(-ti * ti * w)
+        tom += term
+        j += 1
+    end
+    sum -= tom * C2 / (n * 18.0 * z * z2)
+
+    term = 1.0
+    tom = 0.0
+    j = 0
+    while j <= JMAX && abs(term) > EPS * abs(tom)
+        ti = j + 0.5
+        ti = ti * ti
+        term = (-30z6 - 90z6 * z2 + PI * (135z4 - 96z6) * ti +
+            PI4 * (212z4 - 60z2) * ti * ti + PI2 * PI4 * ti * ti * ti * (5.0 - 30z2))
+        term *= exp(-ti * w)
+        tom += term
+        j += 1
+    end
+    sum += tom * C2 / (RACN * n * 3240.0 * z4 * z6)
+
+    term = 1.0
+    tom = 0.0
+    j = 1
+    while j <= JMAX && abs(term) > EPS * abs(tom)
+        ti = j * j
+        term = (3.0 * PI2 * ti * z2 - PI4 * ti * ti) * exp(-ti * w)
+        tom += term
+        j += 1
+    end
+    sum += tom * C2 / (RACN * n * 108.0 * z6)
+
+    return sum
+end
+
+# The Pomeranz recursion formula (1974)
+function cdf_pomeranz(d::KSDist, x::Float64)
+    #debug
+    warn("cdf_pomeranz")
+    # constants
+    EPS = 1.0e-15
+    ENO = 350
+    RENO = ldexp(1.0, ENO)
+    n = d.n
+    t = n * x
+
+    coreno = 1
+    A = zeros(2 * n + 2)
+    Atflo = zeros(2 * n + 2)
+    Atcei = zeros(2 * n + 2)
+    V = zeros(Float64, (n + 2, 2))
+    H = Array(Float64, (n + 2, 4))
+
+    V[1,2] = RENO
+
+    # decompose t = l + f s.t.
+    #   l: non-negative integer
+    #   f: 0 ≤ f < 1
+    l = ifloor(t)
+    f = t - l
+    stop = 2n + 2
+
+    # calculate floors and ceils
+    if f > 0.5
+        # Case (iii): f = 0
+        for i in 1:2:stop
+            Atflo[i] = div(i, 2) - 1 - l
+            Atcei[i] = div(i, 2) + 1 + l
+        end
+
+        for i in 2:2:stop
+            Atflo[i] = div(i, 2) - 2 - l
+            Atcei[i] = div(i, 2) + l
+        end
+    elseif f > 0.0
+        # Case (ii): 0 < f ≤ 1/2
+        for i in 1:stop
+            Atflo[i] = div(i, 2) - 1 - l
+        end
+
+        Atcei[1] = 1 + l
+        for i in 2:stop
+            Atcei[i] = div(i, 2) + l
+        end
+    else
+        # Case (i): f = 0
+        for i in 1:2:stop
+            Atflo[i] = div(i, 2) - l
+            Atcei[i] = div(i, 2) + l
+        end
+
+        for i in 2:2:stop
+            Atflo[i] = div(i, 2) - 1 - l
+            Atcei[i] = div(i, 2) - 1 + l
+        end
+    end
+
+    # calculate A
+    A[1] = 0.0
+    A[2] = min(t - floor(t), ceil(t) - t)
+    A[3] = 1.0 - A[2]
+    for i in 4:stop-1
+        A[i] = A[i-2] + 1
+    end
+    A[stop] = n
+
+    H[1,1] = 1.0
+    w = 2.0 * A[2] / n
+    for j in 2:n+2
+        H[j,1] = w * H[j-1,1] / (j - 1)
+    end
+
+    H[1,2] = 1.0
+    w = (1.0 - 2.0 * A[2]) / n
+    for j in 2:n+2
+        H[j,2] = w * H[j-1,2] / (j - 1)
+    end
+
+    H[1,3] = 1.0
+    w = A[2] / n
+    for j in 2:n+2
+        H[j,3] = w * H[j-1,3] / (j - 1)
+    end
+
+    H[1,4] = 1.0
+    for j in 2:n+2
+        H[j,4] = 0.0
+    end
+
+    # indices i and i-1 for V[i][]
+    r1 = 1
+    r2 = 2
+
+    for i in 2:2n+2
+        # j = floor{Ai - t} + 2, …, ceil{Ai + t}
+        jlo = max(1, Atflo[i] + 2)
+        jhi = min(n + 1, Atcei[i])
+
+        # k = max(1, floor{Ai-1 - t} + 2), …, min(j, ceil{Ai-1 + t})
+        klo = max(1, Atflo[i-1] + 2)
+
+        w = (A[i] - A[i-1]) / n
+        s = 0
+        for j in 1:4
+            if abs(w - H[2,j]) <= EPS
+                s = j
+                break
+            end
+        end
+        @assert s > 0
+
+        minsum = RENO
+        r1, r2 = r2, r1  # r1: i-1, r2: i
+
+        for j in jlo:jhi
+            khi = min(j, Atcei[i-1])
+            sum = 0.0
+            for k in khi:-1:klo
+                sum += V[k,r1] * H[j-k+1,s]
+            end
+            V[j,r2] = sum
+            minsum = min(minsum, sum)
+        end
+
+        if minsum < 1.0e-280
+            for j in jlo:jhi
+                V[j,r2] *= RENO
+            end
+            coreno += 1
+        end
+    end
+
+    sum = V[n+1,r2]
+    w = lfact(n) - coreno * ENO * log(2) + log(sum)
+    return w >= 0.0 ? 1.0 : exp(w)
+end
+
 # Miller (1956) approximation
 function ccdf_miller(d::KSDist, x::Real)
     2.0*ccdf(KSOneSided(d.n),x)
@@ -138,3 +508,12 @@ function stirling(n)
         return sqrt(2.0*pi*n)*(1.0 + twn\(1 + (2.0*twn)\(1 - (15.0*twn)\139.0)))
     end
 end
+
+# n! / n^n
+function rapfac(n)
+    res = 1.0 / n
+    for i in 2:n
+        res *= i / n
+    end
+    return res
+end

From c9bc19e693a6d58a8e78b2a6d534e2803b7dab34 Mon Sep 17 00:00:00 2001
From: Kenta Sato <bicycle1885@gmail.com>
Date: Sun, 29 Jun 2014 07:41:57 +0900
Subject: [PATCH 2/2] add the license term

---
 src/univariate/ksdist.jl | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/univariate/ksdist.jl b/src/univariate/ksdist.jl
index 04bb61381c..b7243f12e7 100644
--- a/src/univariate/ksdist.jl
+++ b/src/univariate/ksdist.jl
@@ -1,3 +1,35 @@
+# This program is a Julia port of KolmogorovSmirnovDist.c, which is 
+# distributed under the GNU GPL v3 license.
+# The original code is available from http://simul.iro.umontreal.ca/ksdir/
+#
+# /********************************************************************
+#  *
+#  * File:          KolmogorovSmirnovDist.c
+#  * Environment:   ISO C99 or ANSI C89
+#  * Author:        Richard Simard
+#  * Organization:  DIRO, Université de Montréal
+#  * Date:          1 February 2012
+#  * Version        1.1
+# 
+#  * Copyright 1 march 2010 by Université de Montréal,
+#                              Richard Simard and Pierre L'Ecuyer
+#  =====================================================================
+# 
+#     This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#     the Free Software Foundation, version 3 of the License.
+# 
+#     This program is distributed in the hope that it will be useful,
+#     but WITHOUT ANY WARRANTY; without even the implied warranty of
+#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#     GNU General Public License for more details.
+# 
+#     You should have received a copy of the GNU General Public License
+#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# 
+#  =====================================================================*/
+
+
 # Distribution of the (two-sided) Kolmogorov-Smirnoff statistic
 #   D_n = \sup_x |\hat{F}_n(x) - F(x)|
 #   sqrt(n) D_n converges a.s. to the Kolmogorov distribution.