Skip to content

Commit bcabfd0

Browse files
authored
Add statistics on DataFrame support (#1153)
* Add Compiler Plugin support for the mean and standard deviation functions on DataFrame.
* Add aggregation functions: min, max, median, and percentile. Implemented new functions for calculating min, max, median, and percentile on DataFrames. Added corresponding tests to ensure proper functionality across different scenarios, and enabled schema refinement through annotations such as `@Refine` and `@Interpretable`.
* Finish support for min/max/median.
* Refactor percentile handling for specific column scenarios. Replace `percentileArg` with a fixed value of 30.0 in test cases to demonstrate the functionality more clearly. Add `Arguments.percentile` with `ignore()` for improved handling and schema modification alignment in the `Percentile0` and `Percentile1` classes.
* Refactor schema generation in statistical operations. Replaced schema construction to include only newly generated columns instead of merging with existing ones. Updated test cases to validate schema consistency using `compareSchemas`.
* Fix linter and GroupBy ops.
* API dump.
* Fix review comments.
1 parent 90505a6 commit bcabfd0

File tree

24 files changed

+1177
-234
lines changed

24 files changed

+1177
-234
lines changed

core/api/core.api

+4
Original file line numberDiff line numberDiff line change
@@ -5681,7 +5681,11 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/impl/aggregation
56815681

56825682
public final class org/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/Aggregators {
56835683
public static final field INSTANCE Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/Aggregators;
5684+
public final fun getMax ()Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/AggregatorOptionSwitch1;
56845685
public final fun getMean ()Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/AggregatorOptionSwitch1;
5686+
public final fun getMedian ()Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/AggregatorOptionSwitch1;
5687+
public final fun getMin ()Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/AggregatorOptionSwitch1;
5688+
public final fun getPercentile ()Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/AggregatorOptionSwitch2;
56855689
public final fun getStd ()Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/AggregatorOptionSwitch2;
56865690
public final fun getSum ()Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/AggregatorOptionSwitch1;
56875691
public final fun max (Z)Lorg/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/Aggregator;

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/max.kt

+4-1
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,13 @@ public inline fun <reified T : Comparable<T>> AnyRow.rowMaxOf(skipNaN: Boolean =
7373
// endregion
7474

7575
// region DataFrame
76-
76+
@Refine
77+
@Interpretable("Max0")
7778
public fun <T> DataFrame<T>.max(skipNaN: Boolean = skipNaNDefault): DataRow<T> =
7879
maxFor(skipNaN, intraComparableColumns())
7980

81+
@Refine
82+
@Interpretable("Max1")
8083
public fun <T, C : Comparable<C & Any>?> DataFrame<T>.maxFor(
8184
skipNaN: Boolean = skipNaNDefault,
8285
columns: ColumnsForAggregateSelector<T, C>,

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/mean.kt

+4-1
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,13 @@ public inline fun <reified T : Number> AnyRow.rowMeanOf(skipNaN: Boolean = skipN
6060
// endregion
6161

6262
// region DataFrame
63-
63+
@Refine
64+
@Interpretable("Mean0")
6465
public fun <T> DataFrame<T>.mean(skipNaN: Boolean = skipNaNDefault): DataRow<T> =
6566
meanFor(skipNaN, primitiveOrMixedNumberColumns())
6667

68+
@Refine
69+
@Interpretable("Mean1")
6770
public fun <T, C : Number?> DataFrame<T>.meanFor(
6871
skipNaN: Boolean = skipNaNDefault,
6972
columns: ColumnsForAggregateSelector<T, C>,

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/median.kt

+4-1
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,13 @@ public inline fun <reified T> AnyRow.rowMedianOf(
130130
// endregion
131131

132132
// region DataFrame
133-
133+
@Refine
134+
@Interpretable("Median0")
134135
public fun <T> DataFrame<T>.median(skipNaN: Boolean = skipNaNDefault): DataRow<T> =
135136
medianFor(skipNaN, intraComparableColumns())
136137

138+
@Refine
139+
@Interpretable("Median1")
137140
public fun <T, C : Comparable<C & Any>?> DataFrame<T>.medianFor(
138141
skipNaN: Boolean = skipNaNDefault,
139142
columns: ColumnsForAggregateSelector<T, C>,

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/min.kt

+4-1
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,13 @@ public inline fun <reified T : Comparable<T>> AnyRow.rowMinOf(skipNaN: Boolean =
7373
// endregion
7474

7575
// region DataFrame
76-
76+
@Refine
77+
@Interpretable("Min0")
7778
public fun <T> DataFrame<T>.min(skipNaN: Boolean = skipNaNDefault): DataRow<T> =
7879
minFor(skipNaN, intraComparableColumns())
7980

81+
@Refine
82+
@Interpretable("Min1")
8083
public fun <T, C : Comparable<C & Any>?> DataFrame<T>.minFor(
8184
skipNaN: Boolean = skipNaNDefault,
8285
columns: ColumnsForAggregateSelector<T, C>,

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/percentile.kt

+4-1
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,13 @@ public inline fun <reified T> AnyRow.rowPercentileOf(
144144
// endregion
145145

146146
// region DataFrame
147-
147+
@Refine
148+
@Interpretable("Percentile0")
148149
public fun <T> DataFrame<T>.percentile(percentile: Double, skipNaN: Boolean = skipNaNDefault): DataRow<T> =
149150
percentileFor(percentile, skipNaN, intraComparableColumns())
150151

152+
@Refine
153+
@Interpretable("Percentile1")
151154
public fun <T, C : Comparable<C & Any>?> DataFrame<T>.percentileFor(
152155
percentile: Double,
153156
skipNaN: Boolean = skipNaNDefault,

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/std.kt

+4-1
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,13 @@ public inline fun <reified T : Number?> AnyRow.rowStdOf(
6565
// endregion
6666

6767
// region DataFrame
68-
68+
@Refine
69+
@Interpretable("Std0")
6970
public fun <T> DataFrame<T>.std(skipNaN: Boolean = skipNaNDefault, ddof: Int = ddofDefault): DataRow<T> =
7071
stdFor(skipNaN, ddof, primitiveOrMixedNumberColumns())
7172

73+
@Refine
74+
@Interpretable("Std1")
7275
public fun <T, C : Number?> DataFrame<T>.stdFor(
7376
skipNaN: Boolean = skipNaNDefault,
7477
ddof: Int = ddofDefault,

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/aggregation/aggregators/Aggregators.kt

+43-36
Original file line numberDiff line numberDiff line change
@@ -113,25 +113,27 @@ public object Aggregators {
113113
// T : Comparable<T & Any>? -> T?
114114
public fun <T : Comparable<T & Any>?> min(skipNaN: Boolean): Aggregator<T & Any, T?> = min.invoke(skipNaN).cast2()
115115

116-
private val min by withOneOption { skipNaN: Boolean ->
117-
twoStepSelectingForAny<Comparable<Any>, Comparable<Any>?>(
118-
getReturnType = minTypeConversion,
119-
stepOneSelector = { type -> minOrNull(type, skipNaN) },
120-
indexOfResult = { type -> indexOfMin(type, skipNaN) },
121-
)
122-
}
116+
public val min: AggregatorOptionSwitch1<Boolean, Comparable<Any>, Comparable<Any>?>
117+
by withOneOption { skipNaN: Boolean ->
118+
twoStepSelectingForAny<Comparable<Any>, Comparable<Any>?>(
119+
getReturnType = minTypeConversion,
120+
stepOneSelector = { type -> minOrNull(type, skipNaN) },
121+
indexOfResult = { type -> indexOfMin(type, skipNaN) },
122+
)
123+
}
123124

124125
// T: Comparable<T> -> T?
125126
// T : Comparable<T & Any>? -> T?
126127
public fun <T : Comparable<T & Any>?> max(skipNaN: Boolean): Aggregator<T & Any, T?> = max.invoke(skipNaN).cast2()
127128

128-
private val max by withOneOption { skipNaN: Boolean ->
129-
twoStepSelectingForAny<Comparable<Any>, Comparable<Any>?>(
130-
getReturnType = maxTypeConversion,
131-
stepOneSelector = { type -> maxOrNull(type, skipNaN) },
132-
indexOfResult = { type -> indexOfMax(type, skipNaN) },
133-
)
134-
}
129+
public val max: AggregatorOptionSwitch1<Boolean, Comparable<Any>, Comparable<Any>?>
130+
by withOneOption { skipNaN: Boolean ->
131+
twoStepSelectingForAny<Comparable<Any>, Comparable<Any>?>(
132+
getReturnType = maxTypeConversion,
133+
stepOneSelector = { type -> maxOrNull(type, skipNaN) },
134+
indexOfResult = { type -> indexOfMax(type, skipNaN) },
135+
)
136+
}
135137

136138
// T: Number? -> Double
137139
public val std: AggregatorOptionSwitch2<Boolean, Int, Number, Double> by withTwoOptions {
@@ -151,23 +153,23 @@ public object Aggregators {
151153
}
152154
}
153155

154-
// T : primitive Number? -> Double?
155-
// T : Comparable<T & Any>? -> T?
156+
// T: primitive Number? -> Double?
157+
// T: Comparable<T & Any>? -> T?
156158
public fun <T> percentileCommon(
157159
percentile: Double,
158160
skipNaN: Boolean,
159161
): Aggregator<T & Any, T?>
160162
where T : Comparable<T & Any>? =
161163
this.percentile.invoke(percentile, skipNaN).cast2()
162164

163-
// T : Comparable<T & Any>? -> T?
165+
// T: Comparable<T & Any>? -> T?
164166
public fun <T> percentileComparables(
165167
percentile: Double,
166168
): Aggregator<T & Any, T?>
167169
where T : Comparable<T & Any>? =
168170
percentileCommon<T>(percentile, skipNaNDefault).cast2()
169171

170-
// T : primitive Number? -> Double?
172+
// T: primitive Number? -> Double?
171173
public fun <T> percentileNumbers(
172174
percentile: Double,
173175
skipNaN: Boolean,
@@ -176,40 +178,45 @@ public object Aggregators {
176178
percentileCommon<T>(percentile, skipNaN).cast2()
177179

178180
@Suppress("UNCHECKED_CAST")
179-
private val percentile by withTwoOptions { percentile: Double, skipNaN: Boolean ->
180-
flattenHybridForAny<Comparable<Any>, Comparable<Any>?>(
181-
getReturnType = percentileConversion,
182-
reducer = { type -> percentileOrNull(percentile, type, skipNaN) as Comparable<Any>? },
183-
indexOfResult = { type -> indexOfPercentile(percentile, type, skipNaN) },
184-
)
185-
}
181+
public val percentile: AggregatorOptionSwitch2<Double, Boolean, Comparable<Any>, Comparable<Any>?>
182+
by withTwoOptions {
183+
percentile: Double,
184+
skipNaN: Boolean,
185+
->
186+
flattenHybridForAny<Comparable<Any>, Comparable<Any>?>(
187+
getReturnType = percentileConversion,
188+
reducer = { type -> percentileOrNull(percentile, type, skipNaN) as Comparable<Any>? },
189+
indexOfResult = { type -> indexOfPercentile(percentile, type, skipNaN) },
190+
)
191+
}
186192

187-
// T : primitive Number? -> Double?
188-
// T : Comparable<T & Any>? -> T?
193+
// T: primitive Number? -> Double?
194+
// T: Comparable<T & Any>? -> T?
189195
public fun <T> medianCommon(skipNaN: Boolean): Aggregator<T & Any, T?>
190196
where T : Comparable<T & Any>? =
191197
median.invoke(skipNaN).cast2()
192198

193-
// T : Comparable<T & Any>? -> T?
199+
// T: Comparable<T & Any>? -> T?
194200
public fun <T> medianComparables(): Aggregator<T & Any, T?>
195201
where T : Comparable<T & Any>? =
196202
medianCommon<T>(skipNaNDefault).cast2()
197203

198-
// T : primitive Number? -> Double?
204+
// T: primitive Number? -> Double?
199205
public fun <T> medianNumbers(
200206
skipNaN: Boolean,
201207
): Aggregator<T & Any, Double?>
202208
where T : Comparable<T & Any>?, T : Number? =
203209
medianCommon<T>(skipNaN).cast2()
204210

205211
@Suppress("UNCHECKED_CAST")
206-
private val median by withOneOption { skipNaN: Boolean ->
207-
flattenHybridForAny<Comparable<Any>, Comparable<Any>?>(
208-
getReturnType = medianConversion,
209-
reducer = { type -> medianOrNull(type, skipNaN) as Comparable<Any>? },
210-
indexOfResult = { type -> indexOfMedian(type, skipNaN) },
211-
)
212-
}
212+
public val median: AggregatorOptionSwitch1<Boolean, Comparable<Any>, Comparable<Any>?>
213+
by withOneOption { skipNaN: Boolean ->
214+
flattenHybridForAny<Comparable<Any>, Comparable<Any>?>(
215+
getReturnType = medianConversion,
216+
reducer = { type -> medianOrNull(type, skipNaN) as Comparable<Any>? },
217+
indexOfResult = { type -> indexOfMedian(type, skipNaN) },
218+
)
219+
}
213220

214221
// T: Number -> T
215222
// Byte -> Int

0 commit comments

Comments
 (0)