Skip to content

Commit e88baa4

Browse files
authored
Percentile (#1149)
* implementing quantile for use with percentile, adding tests, need to check accuracy of results
* checking quantile calculation with apache commons statistics in tests
* rewrote percentile api
* tiny quantile refactor
* fixed percentileBy and indices
* docs and checks for quantile
* median now also uses quantile implementation
* added issue to todo comments
1 parent 1010e22 commit e88baa4

File tree

14 files changed

+2277
-356
lines changed

14 files changed

+2277
-356
lines changed

core/api/core.api

+98-52
Large diffs are not rendered by default.

core/build.gradle.kts

+4
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ dependencies {
6464
kotlinCompilerPluginClasspathSamples(projects.plugins.expressionsConverter)
6565

6666
api(libs.commonsCsv)
67+
6768
implementation(libs.commonsIo)
6869
implementation(libs.serialization.core)
6970
implementation(libs.serialization.json)
@@ -82,6 +83,9 @@ dependencies {
8283
testImplementation(libs.jsoup)
8384
testImplementation(libs.sl4jsimple)
8485

86+
// for checking results
87+
testImplementation(libs.commonsStatisticsDescriptive)
88+
8589
// for samples.api
8690
testImplementation(projects.dataframeCsv)
8791
}

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/median.kt

+2
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ import kotlin.reflect.KProperty
3838
* This needs to be explained by KDocs
3939
*
4040
* medianBy is new for all overloads :)
41+
* Uses [QuantileEstimationMethod.R8] for primitive numbers, else [QuantileEstimationMethod.R3].
42+
* MedianBy also uses [QuantileEstimationMethod.R3].
4143
*/
4244

4345
// region DataColumn

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/percentile.kt

+501-131
Large diffs are not rendered by default.

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/Aggregators.kt

+32-6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import org.jetbrains.kotlinx.dataframe.impl.aggregation.aggregators.multipleColu
1111
import org.jetbrains.kotlinx.dataframe.math.indexOfMax
1212
import org.jetbrains.kotlinx.dataframe.math.indexOfMedian
1313
import org.jetbrains.kotlinx.dataframe.math.indexOfMin
14+
import org.jetbrains.kotlinx.dataframe.math.indexOfPercentile
1415
import org.jetbrains.kotlinx.dataframe.math.maxOrNull
1516
import org.jetbrains.kotlinx.dataframe.math.maxTypeConversion
1617
import org.jetbrains.kotlinx.dataframe.math.mean
@@ -19,7 +20,8 @@ import org.jetbrains.kotlinx.dataframe.math.medianConversion
1920
import org.jetbrains.kotlinx.dataframe.math.medianOrNull
2021
import org.jetbrains.kotlinx.dataframe.math.minOrNull
2122
import org.jetbrains.kotlinx.dataframe.math.minTypeConversion
22-
import org.jetbrains.kotlinx.dataframe.math.percentile
23+
import org.jetbrains.kotlinx.dataframe.math.percentileConversion
24+
import org.jetbrains.kotlinx.dataframe.math.percentileOrNull
2325
import org.jetbrains.kotlinx.dataframe.math.std
2426
import org.jetbrains.kotlinx.dataframe.math.stdTypeConversion
2527
import org.jetbrains.kotlinx.dataframe.math.sum
@@ -147,11 +149,35 @@ internal object Aggregators {
147149
}
148150
}
149151

150-
// T: Comparable<T>? -> T
151-
val percentile by withOneOption { percentile: Double ->
152-
flattenReducingForAny<Comparable<Any?>> { type ->
153-
asIterable().percentile(percentile, type)
154-
}
152+
// T : primitive Number? -> Double?
153+
// T : Comparable<T & Any>? -> T?
154+
fun <T> percentileCommon(
155+
percentile: Double,
156+
skipNaN: Boolean,
157+
): Aggregator<T & Any, T?>
158+
where T : Comparable<T & Any>? =
159+
this.percentile.invoke(percentile, skipNaN).cast2()
160+
161+
// T : Comparable<T & Any>? -> T?
162+
fun <T> percentileComparables(percentile: Double): Aggregator<T & Any, T?>
163+
where T : Comparable<T & Any>? =
164+
percentileCommon<T>(percentile, skipNaNDefault).cast2()
165+
166+
// T : primitive Number? -> Double?
167+
fun <T> percentileNumbers(
168+
percentile: Double,
169+
skipNaN: Boolean,
170+
): Aggregator<T & Any, Double?>
171+
where T : Comparable<T & Any>?, T : Number? =
172+
percentileCommon<T>(percentile, skipNaN).cast2()
173+
174+
@Suppress("UNCHECKED_CAST")
175+
private val percentile by withTwoOptions { percentile: Double, skipNaN: Boolean ->
176+
flattenHybridForAny<Comparable<Any>, Comparable<Any>?>(
177+
getReturnType = percentileConversion,
178+
reducer = { type -> percentileOrNull(percentile, type, skipNaN) as Comparable<Any>? },
179+
indexOfResult = { type -> indexOfPercentile(percentile, type, skipNaN) },
180+
)
155181
}
156182

157183
// T : primitive Number? -> Double?

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/describe.kt

-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
66
import org.jetbrains.kotlinx.dataframe.api.ColumnDescription
77
import org.jetbrains.kotlinx.dataframe.api.add
88
import org.jetbrains.kotlinx.dataframe.api.after
9-
import org.jetbrains.kotlinx.dataframe.api.any
109
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
1110
import org.jetbrains.kotlinx.dataframe.api.asComparable
1211
import org.jetbrains.kotlinx.dataframe.api.asNumbers

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/math/median.kt

+49-60
Original file line numberDiff line numberDiff line change
@@ -8,33 +8,34 @@ import org.jetbrains.kotlinx.dataframe.impl.isIntraComparable
88
import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveNumber
99
import org.jetbrains.kotlinx.dataframe.impl.nothingType
1010
import org.jetbrains.kotlinx.dataframe.impl.renderType
11-
import org.jetbrains.kotlinx.dataframe.math.quickSelect
1211
import java.math.BigDecimal
1312
import java.math.BigInteger
13+
import kotlin.math.round
1414
import kotlin.reflect.KType
1515
import kotlin.reflect.full.withNullability
1616
import kotlin.reflect.typeOf
1717

1818
private val logger = KotlinLogging.logger { }
1919

20-
// TODO median always returns the same type, but this can be confusing for iterables of even length
21-
// TODO (e.g. median of [1, 2] should be 1.5, but the type is Int, so it returns 1), Issue #558
22-
2320
/**
2421
* Returns the median of the comparable input:
2522
* - `null` if empty
2623
* - `Double` if primitive number
2724
* - `Double.NaN` if ![skipNaN] and contains NaN
2825
* - (lower) middle else
2926
*
30-
* TODO migrate back to percentile when it's flexible enough
27+
* Based on quantile implementation;
28+
* uses [QuantileEstimationMethod.R8] for primitive numbers, else [QuantileEstimationMethod.R3].
3129
*/
3230
@PublishedApi
3331
internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN: Boolean): Any? {
3432
when {
3533
type.isMarkedNullable ->
3634
error("Encountered nullable type ${renderType(type)} in median function. This should not occur.")
3735

36+
// this means the sequence is empty
37+
type == nothingType -> return null
38+
3839
!type.isIntraComparable() ->
3940
error(
4041
"Unable to compute the median for ${
@@ -49,44 +50,27 @@ internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN:
4950

5051
type == typeOf<Long>() ->
5152
logger.warn { "Converting Longs to Doubles to calculate the median, loss of precision may occur." }
52-
53-
// this means the sequence is empty
54-
type == nothingType -> return null
5553
}
5654

57-
// propagate NaN to return if they are not to be skipped
58-
if (type.canBeNaN && !skipNaN && any { it.isNaN }) return Double.NaN
59-
60-
val list = when {
61-
type.canBeNaN -> filter { !it.isNaN }
62-
else -> this
63-
}.toList()
55+
val p = 0.5
6456

65-
val size = list.size
66-
if (size == 0) return null
57+
// TODO make configurable? https://github.com/Kotlin/dataframe/issues/1121
58+
val (values, method) =
59+
when {
60+
type.isPrimitiveNumber() ->
61+
this.map { (it as Number).toDouble() } to QuantileEstimationMethod.Interpolating.R8
6762

68-
if (size == 1) {
69-
val single = list.single()
70-
return if (type.isPrimitiveNumber()) (single as Number).toDouble() else single
71-
}
63+
else ->
64+
this to QuantileEstimationMethod.Selecting.R3
65+
}
7266

73-
val isOdd = size % 2 != 0
74-
75-
val middleIndex = (size - 1) / 2
76-
val lower = list.quickSelect(middleIndex)
77-
val upper = list.quickSelect(middleIndex + 1)
78-
79-
return when {
80-
isOdd && type.isPrimitiveNumber() -> (lower as Number).toDouble()
81-
isOdd -> lower
82-
type == typeOf<Double>() -> (lower as Double + upper as Double) / 2.0
83-
type == typeOf<Float>() -> ((lower as Float).toDouble() + (upper as Float).toDouble()) / 2.0
84-
type == typeOf<Int>() -> ((lower as Int).toDouble() + (upper as Int).toDouble()) / 2.0
85-
type == typeOf<Short>() -> ((lower as Short).toDouble() + (upper as Short).toDouble()) / 2.0
86-
type == typeOf<Byte>() -> ((lower as Byte).toDouble() + (upper as Byte).toDouble()) / 2.0
87-
type == typeOf<Long>() -> ((lower as Long).toDouble() + (upper as Long).toDouble()) / 2.0
88-
else -> lower
89-
}
67+
return values.quantileOrNull(
68+
p = p,
69+
type = type,
70+
skipNaN = skipNaN,
71+
method = method,
72+
name = "median",
73+
)
9074
}
9175

9276
/**
@@ -95,29 +79,32 @@ internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN:
9579
*/
9680
internal val medianConversion: CalculateReturnType = { type, isEmpty ->
9781
when {
98-
// uses linear interpolation, number 7 of Hyndman and Fan "Sample quantiles in statistical packages"
82+
// uses linear interpolation, R8 of Hyndman and Fan "Sample quantiles in statistical packages"
9983
type.isPrimitiveNumber() -> typeOf<Double>()
10084

10185
// closest rank method, preferring lower middle,
102-
// number 3 of Hyndman and Fan "Sample quantiles in statistical packages"
86+
// R3 of Hyndman and Fan "Sample quantiles in statistical packages"
10387
type.isIntraComparable() -> type
10488

10589
else -> error("Can not calculate median for type ${renderType(type)}")
10690
}.withNullability(isEmpty)
10791
}
10892

10993
/**
110-
* Returns the index of the median of the comparable input:
94+
* Returns the index of the median in the comparable input:
11195
* - `-1` if empty or all `null`
11296
* - index of first NaN if ![skipNaN] and contains NaN
11397
* - index (lower) middle else
11498
* NOTE: For primitive numbers the `seq.elementAt(seq.indexOfMedian())` might be different from `seq.medianOrNull()`
11599
*
116-
* TODO migrate back to percentile when it's flexible enough
100+
* Based on quantile implementation; uses [QuantileEstimationMethod.R3].
117101
*/
118102
internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, skipNaN: Boolean): Int {
119103
val nonNullType = type.withNullability(false)
120104
when {
105+
// this means the sequence is empty
106+
nonNullType == nothingType -> return -1
107+
121108
!nonNullType.isIntraComparable() ->
122109
error(
123110
"Unable to compute the median for ${
@@ -129,9 +116,6 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, s
129116
throw IllegalArgumentException(
130117
"Cannot calculate the median for big numbers in DataFrame. Only primitive numbers are supported.",
131118
)
132-
133-
// this means the sequence is empty
134-
nonNullType == nothingType -> return -1
135119
}
136120

137121
// propagate NaN to return if they are not to be skipped
@@ -148,22 +132,27 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, s
148132
IndexedComparable(i, it)
149133
}
150134
}
151-
val list = when {
152-
nonNullType.canBeNaN -> indexedSequence.filterNot { it.value.isNaN }
153-
else -> indexedSequence
154-
}.toList()
155-
156-
val size = list.size
157-
if (size == 0) return -1
158-
if (size == 1) return 0
159135

160-
val middleIndex = (size - 1) / 2
161-
val lower = list.quickSelect(middleIndex)
136+
// TODO make configurable? https://github.com/Kotlin/dataframe/issues/1121
137+
val method = QuantileEstimationMethod.R3
138+
val p = 0.5
139+
140+
// get the index where the median can be found in the sorted sequence
141+
val indexEstimation = indexedSequence.quantileIndexEstimation(
142+
p = p,
143+
type = typeOf<IndexedComparable<Nothing>>(),
144+
skipNaN = skipNaN,
145+
method = method,
146+
name = "median",
147+
)
148+
if (indexEstimation.isNaN()) return this.indexOfFirst { it.isNaN }
149+
if (indexEstimation < 0.0) return -1
150+
require(indexEstimation == round(indexEstimation)) {
151+
"median expected a whole number index from quantileIndexEstimation but was $indexEstimation"
152+
}
162153

163-
return lower.index
164-
}
154+
val medianResult = indexedSequence.toList().quickSelect(k = indexEstimation.toInt())
165155

166-
private data class IndexedComparable<T : Comparable<T>>(val index: Int, val value: T) :
167-
Comparable<IndexedComparable<T>> {
168-
override fun compareTo(other: IndexedComparable<T>): Int = value.compareTo(other.value)
156+
// return the original unsorted index of the found result
157+
return medianResult.index
169158
}

0 commit comments

Comments
 (0)