Skip to content

Percentile #1149

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 98 additions & 52 deletions core/api/core.api

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions core/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ dependencies {
kotlinCompilerPluginClasspathSamples(projects.plugins.expressionsConverter)

api(libs.commonsCsv)

implementation(libs.commonsIo)
implementation(libs.serialization.core)
implementation(libs.serialization.json)
Expand All @@ -82,6 +83,9 @@ dependencies {
testImplementation(libs.jsoup)
testImplementation(libs.sl4jsimple)

// for checking results
testImplementation(libs.commonsStatisticsDescriptive)

// for samples.api
testImplementation(projects.dataframeCsv)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ import kotlin.reflect.KProperty
* This needs to be explained by KDocs
*
* medianBy is new for all overloads :)
* Uses [QuantileEstimationMethod.R8] for primitive numbers, else [QuantileEstimationMethod.R3].
* MedianBy also uses [QuantileEstimationMethod.R3].
*/

// region DataColumn
Expand Down
632 changes: 501 additions & 131 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/percentile.kt

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import org.jetbrains.kotlinx.dataframe.impl.aggregation.aggregators.multipleColu
import org.jetbrains.kotlinx.dataframe.math.indexOfMax
import org.jetbrains.kotlinx.dataframe.math.indexOfMedian
import org.jetbrains.kotlinx.dataframe.math.indexOfMin
import org.jetbrains.kotlinx.dataframe.math.indexOfPercentile
import org.jetbrains.kotlinx.dataframe.math.maxOrNull
import org.jetbrains.kotlinx.dataframe.math.maxTypeConversion
import org.jetbrains.kotlinx.dataframe.math.mean
Expand All @@ -19,7 +20,8 @@ import org.jetbrains.kotlinx.dataframe.math.medianConversion
import org.jetbrains.kotlinx.dataframe.math.medianOrNull
import org.jetbrains.kotlinx.dataframe.math.minOrNull
import org.jetbrains.kotlinx.dataframe.math.minTypeConversion
import org.jetbrains.kotlinx.dataframe.math.percentile
import org.jetbrains.kotlinx.dataframe.math.percentileConversion
import org.jetbrains.kotlinx.dataframe.math.percentileOrNull
import org.jetbrains.kotlinx.dataframe.math.std
import org.jetbrains.kotlinx.dataframe.math.stdTypeConversion
import org.jetbrains.kotlinx.dataframe.math.sum
Expand Down Expand Up @@ -147,11 +149,35 @@ internal object Aggregators {
}
}

// T: Comparable<T>? -> T
val percentile by withOneOption { percentile: Double ->
flattenReducingForAny<Comparable<Any?>> { type ->
asIterable().percentile(percentile, type)
}
// T : primitive Number? -> Double?
// T : Comparable<T & Any>? -> T?
fun <T> percentileCommon(
    percentile: Double,
    skipNaN: Boolean,
): Aggregator<T & Any, T?>
    where T : Comparable<T & Any>? {
    // `this.` is required: the `percentile` parameter shadows the aggregator property of the same name.
    val aggregatorProvider = this.percentile
    // Look up the aggregator for this option pair and cast it to the caller's generic types.
    return aggregatorProvider.invoke(percentile, skipNaN).cast2()
}

// T : Comparable<T & Any>? -> T?
fun <T> percentileComparables(percentile: Double): Aggregator<T & Any, T?>
    where T : Comparable<T & Any>? {
    // This overload has no skipNaN parameter, so skipNaNDefault is forwarded.
    return percentileCommon<T>(
        percentile = percentile,
        skipNaN = skipNaNDefault,
    ).cast2()
}

// T : primitive Number? -> Double?
fun <T> percentileNumbers(
    percentile: Double,
    skipNaN: Boolean,
): Aggregator<T & Any, Double?>
    where T : Comparable<T & Any>?, T : Number? {
    // Delegates to the common aggregator; the cast re-types its result from T? to Double?.
    return percentileCommon<T>(
        percentile = percentile,
        skipNaN = skipNaN,
    ).cast2()
}

@Suppress("UNCHECKED_CAST")
private val percentile by withTwoOptions { p: Double, ignoreNaN: Boolean ->
    // One aggregator per (percentile, skipNaN) pair: it can reduce values to a result
    // and can also report the index at which that result is found.
    flattenHybridForAny<Comparable<Any>, Comparable<Any>?>(
        getReturnType = percentileConversion,
        reducer = { valueType ->
            percentileOrNull(p, valueType, ignoreNaN) as Comparable<Any>?
        },
        indexOfResult = { valueType -> indexOfPercentile(p, valueType, ignoreNaN) },
    )
}

// T : primitive Number? -> Double?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.ColumnDescription
import org.jetbrains.kotlinx.dataframe.api.add
import org.jetbrains.kotlinx.dataframe.api.after
import org.jetbrains.kotlinx.dataframe.api.any
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
import org.jetbrains.kotlinx.dataframe.api.asComparable
import org.jetbrains.kotlinx.dataframe.api.asNumbers
Expand Down
109 changes: 49 additions & 60 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/math/median.kt
Original file line number Diff line number Diff line change
Expand Up @@ -8,33 +8,34 @@ import org.jetbrains.kotlinx.dataframe.impl.isIntraComparable
import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveNumber
import org.jetbrains.kotlinx.dataframe.impl.nothingType
import org.jetbrains.kotlinx.dataframe.impl.renderType
import org.jetbrains.kotlinx.dataframe.math.quickSelect
import java.math.BigDecimal
import java.math.BigInteger
import kotlin.math.round
import kotlin.reflect.KType
import kotlin.reflect.full.withNullability
import kotlin.reflect.typeOf

private val logger = KotlinLogging.logger { }

// TODO median always returns the same type, but this can be confusing for iterables of even length
// TODO (e.g. median of [1, 2] should be 1.5, but the type is Int, so it returns 1), Issue #558

/**
* Returns the median of the comparable input:
* - `null` if empty
* - `Double` if primitive number
* - `Double.NaN` if ![skipNaN] and contains NaN
* - (lower) middle else
*
* TODO migrate back to percentile when it's flexible enough
* Based on quantile implementation;
* uses [QuantileEstimationMethod.R8] for primitive numbers, else [QuantileEstimationMethod.R3].
*/
@PublishedApi
internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN: Boolean): Any? {
when {
type.isMarkedNullable ->
error("Encountered nullable type ${renderType(type)} in median function. This should not occur.")

// this means the sequence is empty
type == nothingType -> return null

!type.isIntraComparable() ->
error(
"Unable to compute the median for ${
Expand All @@ -49,44 +50,27 @@ internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN:

type == typeOf<Long>() ->
logger.warn { "Converting Longs to Doubles to calculate the median, loss of precision may occur." }

// this means the sequence is empty
type == nothingType -> return null
}

// propagate NaN to return if they are not to be skipped
if (type.canBeNaN && !skipNaN && any { it.isNaN }) return Double.NaN

val list = when {
type.canBeNaN -> filter { !it.isNaN }
else -> this
}.toList()
val p = 0.5

val size = list.size
if (size == 0) return null
// TODO make configurable? https://github.com/Kotlin/dataframe/issues/1121
val (values, method) =
when {
type.isPrimitiveNumber() ->
this.map { (it as Number).toDouble() } to QuantileEstimationMethod.Interpolating.R8

if (size == 1) {
val single = list.single()
return if (type.isPrimitiveNumber()) (single as Number).toDouble() else single
}
else ->
this to QuantileEstimationMethod.Selecting.R3
}

val isOdd = size % 2 != 0

val middleIndex = (size - 1) / 2
val lower = list.quickSelect(middleIndex)
val upper = list.quickSelect(middleIndex + 1)

return when {
isOdd && type.isPrimitiveNumber() -> (lower as Number).toDouble()
isOdd -> lower
type == typeOf<Double>() -> (lower as Double + upper as Double) / 2.0
type == typeOf<Float>() -> ((lower as Float).toDouble() + (upper as Float).toDouble()) / 2.0
type == typeOf<Int>() -> ((lower as Int).toDouble() + (upper as Int).toDouble()) / 2.0
type == typeOf<Short>() -> ((lower as Short).toDouble() + (upper as Short).toDouble()) / 2.0
type == typeOf<Byte>() -> ((lower as Byte).toDouble() + (upper as Byte).toDouble()) / 2.0
type == typeOf<Long>() -> ((lower as Long).toDouble() + (upper as Long).toDouble()) / 2.0
else -> lower
}
return values.quantileOrNull(
p = p,
type = type,
skipNaN = skipNaN,
method = method,
name = "median",
)
}

/**
Expand All @@ -95,29 +79,32 @@ internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN:
*/
internal val medianConversion: CalculateReturnType = { inputType, emptyInput ->
    val resultType = when {
        // primitive numbers: linear interpolation,
        // R8 of Hyndman and Fan "Sample quantiles in statistical packages"
        inputType.isPrimitiveNumber() -> typeOf<Double>()

        // other self-comparable types: closest rank method, preferring lower middle,
        // R3 of Hyndman and Fan "Sample quantiles in statistical packages"
        inputType.isIntraComparable() -> inputType

        else -> error("Can not calculate median for type ${renderType(inputType)}")
    }

    // the result type is nullable when the input is empty
    resultType.withNullability(emptyInput)
}

/**
* Returns the index of the median of the comparable input:
* Returns the index of the median in the comparable input:
* - `-1` if empty or all `null`
* - index of first NaN if ![skipNaN] and contains NaN
* - index (lower) middle else
* NOTE: For primitive numbers the `seq.elementAt(seq.indexOfMedian())` might be different from `seq.medianOrNull()`
*
* TODO migrate back to percentile when it's flexible enough
* Based on quantile implementation; uses [QuantileEstimationMethod.R3].
*/
internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, skipNaN: Boolean): Int {
val nonNullType = type.withNullability(false)
when {
// this means the sequence is empty
nonNullType == nothingType -> return -1

!nonNullType.isIntraComparable() ->
error(
"Unable to compute the median for ${
Expand All @@ -129,9 +116,6 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, s
throw IllegalArgumentException(
"Cannot calculate the median for big numbers in DataFrame. Only primitive numbers are supported.",
)

// this means the sequence is empty
nonNullType == nothingType -> return -1
}

// propagate NaN to return if they are not to be skipped
Expand All @@ -148,22 +132,27 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, s
IndexedComparable(i, it)
}
}
val list = when {
nonNullType.canBeNaN -> indexedSequence.filterNot { it.value.isNaN }
else -> indexedSequence
}.toList()

val size = list.size
if (size == 0) return -1
if (size == 1) return 0

val middleIndex = (size - 1) / 2
val lower = list.quickSelect(middleIndex)
// TODO make configurable? https://github.com/Kotlin/dataframe/issues/1121
val method = QuantileEstimationMethod.R3
val p = 0.5

// get the index where the median can be found in the sorted sequence
val indexEstimation = indexedSequence.quantileIndexEstimation(
p = p,
type = typeOf<IndexedComparable<Nothing>>(),
skipNaN = skipNaN,
method = method,
name = "median",
)
if (indexEstimation.isNaN()) return this.indexOfFirst { it.isNaN }
if (indexEstimation < 0.0) return -1
require(indexEstimation == round(indexEstimation)) {
"median expected a whole number index from quantileIndexEstimation but was $indexEstimation"
}

return lower.index
}
val medianResult = indexedSequence.toList().quickSelect(k = indexEstimation.toInt())

private data class IndexedComparable<T : Comparable<T>>(val index: Int, val value: T) :
Comparable<IndexedComparable<T>> {
override fun compareTo(other: IndexedComparable<T>): Int = value.compareTo(other.value)
// return the original unsorted index of the found result
return medianResult.index
}
Loading