@@ -8,33 +8,34 @@ import org.jetbrains.kotlinx.dataframe.impl.isIntraComparable
8
8
import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveNumber
9
9
import org.jetbrains.kotlinx.dataframe.impl.nothingType
10
10
import org.jetbrains.kotlinx.dataframe.impl.renderType
11
- import org.jetbrains.kotlinx.dataframe.math.quickSelect
12
11
import java.math.BigDecimal
13
12
import java.math.BigInteger
13
+ import kotlin.math.round
14
14
import kotlin.reflect.KType
15
15
import kotlin.reflect.full.withNullability
16
16
import kotlin.reflect.typeOf
17
17
18
18
private val logger = KotlinLogging .logger { }
19
19
20
- // TODO median always returns the same type, but this can be confusing for iterables of even length
21
- // TODO (e.g. median of [1, 2] should be 1.5, but the type is Int, so it returns 1), Issue #558
22
-
23
20
/* *
24
21
* Returns the median of the comparable input:
25
22
* - `null` if empty
26
23
* - `Double` if primitive number
27
24
* - `Double.NaN` if ![skipNaN] and contains NaN
28
25
* - (lower) middle else
29
26
*
30
- * TODO migrate back to percentile when it's flexible enough
27
+ * Based on quantile implementation;
28
+ * uses [QuantileEstimationMethod.R8] for primitive numbers, else [QuantileEstimationMethod.R3].
31
29
*/
32
30
@PublishedApi
33
31
internal fun <T : Comparable <T >> Sequence<T>.medianOrNull (type : KType , skipNaN : Boolean ): Any? {
34
32
when {
35
33
type.isMarkedNullable ->
36
34
error(" Encountered nullable type ${renderType(type)} in median function. This should not occur." )
37
35
36
+ // this means the sequence is empty
37
+ type == nothingType -> return null
38
+
38
39
! type.isIntraComparable() ->
39
40
error(
40
41
" Unable to compute the median for ${
@@ -49,44 +50,27 @@ internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN:
49
50
50
51
type == typeOf<Long >() ->
51
52
logger.warn { " Converting Longs to Doubles to calculate the median, loss of precision may occur." }
52
-
53
- // this means the sequence is empty
54
- type == nothingType -> return null
55
53
}
56
54
57
- // propagate NaN to return if they are not to be skipped
58
- if (type.canBeNaN && ! skipNaN && any { it.isNaN }) return Double .NaN
59
-
60
- val list = when {
61
- type.canBeNaN -> filter { ! it.isNaN }
62
- else -> this
63
- }.toList()
55
+ val p = 0.5
64
56
65
- val size = list.size
66
- if (size == 0 ) return null
57
+ // TODO make configurable? https://github.com/Kotlin/dataframe/issues/1121
58
+ val (values, method) =
59
+ when {
60
+ type.isPrimitiveNumber() ->
61
+ this .map { (it as Number ).toDouble() } to QuantileEstimationMethod .Interpolating .R8
67
62
68
- if (size == 1 ) {
69
- val single = list.single()
70
- return if (type.isPrimitiveNumber()) (single as Number ).toDouble() else single
71
- }
63
+ else ->
64
+ this to QuantileEstimationMethod .Selecting .R3
65
+ }
72
66
73
- val isOdd = size % 2 != 0
74
-
75
- val middleIndex = (size - 1 ) / 2
76
- val lower = list.quickSelect(middleIndex)
77
- val upper = list.quickSelect(middleIndex + 1 )
78
-
79
- return when {
80
- isOdd && type.isPrimitiveNumber() -> (lower as Number ).toDouble()
81
- isOdd -> lower
82
- type == typeOf<Double >() -> (lower as Double + upper as Double ) / 2.0
83
- type == typeOf<Float >() -> ((lower as Float ).toDouble() + (upper as Float ).toDouble()) / 2.0
84
- type == typeOf<Int >() -> ((lower as Int ).toDouble() + (upper as Int ).toDouble()) / 2.0
85
- type == typeOf<Short >() -> ((lower as Short ).toDouble() + (upper as Short ).toDouble()) / 2.0
86
- type == typeOf<Byte >() -> ((lower as Byte ).toDouble() + (upper as Byte ).toDouble()) / 2.0
87
- type == typeOf<Long >() -> ((lower as Long ).toDouble() + (upper as Long ).toDouble()) / 2.0
88
- else -> lower
89
- }
67
+ return values.quantileOrNull(
68
+ p = p,
69
+ type = type,
70
+ skipNaN = skipNaN,
71
+ method = method,
72
+ name = " median" ,
73
+ )
90
74
}
91
75
92
76
/* *
@@ -95,29 +79,32 @@ internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN:
95
79
*/
96
80
internal val medianConversion: CalculateReturnType = { type, isEmpty ->
97
81
when {
98
- // uses linear interpolation, number 7 of Hyndman and Fan "Sample quantiles in statistical packages"
82
+ // uses linear interpolation, R8 of Hyndman and Fan "Sample quantiles in statistical packages"
99
83
type.isPrimitiveNumber() -> typeOf<Double >()
100
84
101
85
// closest rank method, preferring lower middle,
102
- // number 3 of Hyndman and Fan "Sample quantiles in statistical packages"
86
+ // R3 of Hyndman and Fan "Sample quantiles in statistical packages"
103
87
type.isIntraComparable() -> type
104
88
105
89
else -> error(" Can not calculate median for type ${renderType(type)} " )
106
90
}.withNullability(isEmpty)
107
91
}
108
92
109
93
/* *
110
- * Returns the index of the median of the comparable input:
94
+ * Returns the index of the median in the comparable input:
111
95
* - `-1` if empty or all `null`
112
96
* - index of first NaN if ![skipNaN] and contains NaN
113
97
* - index (lower) middle else
114
98
* NOTE: For primitive numbers the `seq.elementAt(seq.indexOfMedian())` might be different from `seq.medianOrNull()`
115
99
*
116
- * TODO migrate back to percentile when it's flexible enough
100
+ * Based on quantile implementation; uses [QuantileEstimationMethod.R3].
117
101
*/
118
102
internal fun <T : Comparable <T & Any >? > Sequence<T>.indexOfMedian (type : KType , skipNaN : Boolean ): Int {
119
103
val nonNullType = type.withNullability(false )
120
104
when {
105
+ // this means the sequence is empty
106
+ nonNullType == nothingType -> return - 1
107
+
121
108
! nonNullType.isIntraComparable() ->
122
109
error(
123
110
" Unable to compute the median for ${
@@ -129,9 +116,6 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, s
129
116
throw IllegalArgumentException (
130
117
" Cannot calculate the median for big numbers in DataFrame. Only primitive numbers are supported." ,
131
118
)
132
-
133
- // this means the sequence is empty
134
- nonNullType == nothingType -> return - 1
135
119
}
136
120
137
121
// propagate NaN to return if they are not to be skipped
@@ -148,22 +132,27 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, s
148
132
IndexedComparable (i, it)
149
133
}
150
134
}
151
- val list = when {
152
- nonNullType.canBeNaN -> indexedSequence.filterNot { it.value.isNaN }
153
- else -> indexedSequence
154
- }.toList()
155
-
156
- val size = list.size
157
- if (size == 0 ) return - 1
158
- if (size == 1 ) return 0
159
135
160
- val middleIndex = (size - 1 ) / 2
161
- val lower = list.quickSelect(middleIndex)
136
+ // TODO make configurable? https://github.com/Kotlin/dataframe/issues/1121
137
+ val method = QuantileEstimationMethod .R3
138
+ val p = 0.5
139
+
140
+ // get the index where the median can be found in the sorted sequence
141
+ val indexEstimation = indexedSequence.quantileIndexEstimation(
142
+ p = p,
143
+ type = typeOf<IndexedComparable <Nothing >>(),
144
+ skipNaN = skipNaN,
145
+ method = method,
146
+ name = " median" ,
147
+ )
148
+ if (indexEstimation.isNaN()) return this .indexOfFirst { it.isNaN }
149
+ if (indexEstimation < 0.0 ) return - 1
150
+ require(indexEstimation == round(indexEstimation)) {
151
+ " median expected a whole number index from quantileIndexEstimation but was $indexEstimation "
152
+ }
162
153
163
- return lower.index
164
- }
154
+ val medianResult = indexedSequence.toList().quickSelect(k = indexEstimation.toInt())
165
155
166
- private data class IndexedComparable <T : Comparable <T >>(val index : Int , val value : T ) :
167
- Comparable <IndexedComparable <T >> {
168
- override fun compareTo (other : IndexedComparable <T >): Int = value.compareTo(other.value)
156
+ // return the original unsorted index of the found result
157
+ return medianResult.index
169
158
}
0 commit comments