Skip to content

Commit 563d367

Browse files
committed
Added describe support for incomparable number types (converting them to either Double or BigDecimal) and added tests.
1 parent ac3eb9a commit 563d367

File tree

4 files changed

+137
-11
lines changed

4 files changed

+137
-11
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/DataColumnType.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.impl.isNothing
1111
import org.jetbrains.kotlinx.dataframe.impl.projectTo
1212
import org.jetbrains.kotlinx.dataframe.type
1313
import org.jetbrains.kotlinx.dataframe.typeClass
14+
import java.math.BigDecimal
15+
import java.math.BigInteger
1416
import kotlin.contracts.ExperimentalContracts
1517
import kotlin.contracts.contract
1618
import kotlin.reflect.KClass
@@ -44,6 +46,8 @@ public inline fun <reified T> AnyCol.isType(): Boolean = type() == typeOf<T>()
4446

4547
/**
 * Returns `true` when [isSubtypeOf] reports this column's type as a subtype of `Number?`.
 */
public fun AnyCol.isNumber(): Boolean = isSubtypeOf<Number?>()
4648

49+
/**
 * Returns `true` when this column's type is a subtype of `BigInteger?` or of `BigDecimal?`,
 * as reported by [isSubtypeOf].
 */
public fun AnyCol.isBigNumber(): Boolean = isSubtypeOf<BigInteger?>() || isSubtypeOf<BigDecimal?>()
50+
4751
/** Returns `true` when this column's [typeClass] is exactly `List::class`. */
public fun AnyCol.isList(): Boolean = typeClass == List::class
4852

4953
/**

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/TypeUtils.kt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.DataRow
1111
import org.jetbrains.kotlinx.dataframe.api.Infer
1212
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
1313
import org.jetbrains.kotlinx.dataframe.util.GUESS_VALUE_TYPE
14+
import java.math.BigDecimal
15+
import java.math.BigInteger
1416
import kotlin.reflect.KClass
1517
import kotlin.reflect.KType
1618
import kotlin.reflect.KTypeParameter
@@ -29,6 +31,7 @@ import kotlin.reflect.full.superclasses
2931
import kotlin.reflect.full.withNullability
3032
import kotlin.reflect.jvm.jvmErasure
3133
import kotlin.reflect.typeOf
34+
import kotlin.toBigDecimal as toBigDecimalKotlin
3235

3336
/**
 * Takes the reified type [T] (via `typeOf<T>()`) and projects it onto this class with [projectTo].
 */
internal inline fun <reified T> KClass<*>.createTypeUsing() = typeOf<T>().projectTo(this)
3437

@@ -646,3 +649,18 @@ internal fun Any.asArrayAsListOrNull(): List<*>? =
646649
is Array<*> -> asList()
647650
else -> null
648651
}
652+
653+
/**
 * Tells whether this value is one of the JVM arbitrary-precision number types,
 * i.e. a [BigInteger] or a [BigDecimal].
 */
internal fun Any.isBigNumber(): Boolean =
    when (this) {
        is BigInteger, is BigDecimal -> true
        else -> false
    }
654+
655+
/**
 * Converts any [Number] to a [BigDecimal].
 *
 * - A [BigDecimal] is returned as-is (same instance).
 * - A [BigInteger] is wrapped without loss.
 * - [Byte], [Short], [Int] and [Long] are widened to `Long` and converted with [BigDecimal.valueOf].
 * - Everything else (including [Float] and [Double]) goes through its decimal `toString()`
 *   representation, so `2.5` becomes `BigDecimal("2.5")`.
 *
 * @throws NumberFormatException if the string form is not a valid decimal number,
 *   which is the case for `NaN` and the infinities of [Float] and [Double].
 */
internal fun Number.toBigDecimal(): BigDecimal =
    when (this) {
        is BigDecimal -> this

        is BigInteger -> BigDecimal(this)

        // All fixed-width integral types fit into a Long exactly.
        is Byte, is Short, is Int, is Long -> BigDecimal.valueOf(toLong())

        // Float, Double and unknown Number implementations: parse the textual form.
        else -> BigDecimal(toString())
    }

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/describe.kt

Lines changed: 57 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
55
import org.jetbrains.kotlinx.dataframe.api.ColumnDescription
66
import org.jetbrains.kotlinx.dataframe.api.add
77
import org.jetbrains.kotlinx.dataframe.api.after
8+
import org.jetbrains.kotlinx.dataframe.api.any
89
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
910
import org.jetbrains.kotlinx.dataframe.api.asComparable
1011
import org.jetbrains.kotlinx.dataframe.api.asNumbers
1112
import org.jetbrains.kotlinx.dataframe.api.cast
1213
import org.jetbrains.kotlinx.dataframe.api.concat
1314
import org.jetbrains.kotlinx.dataframe.api.isComparable
1415
import org.jetbrains.kotlinx.dataframe.api.isNumber
16+
import org.jetbrains.kotlinx.dataframe.api.map
1517
import org.jetbrains.kotlinx.dataframe.api.maxOrNull
1618
import org.jetbrains.kotlinx.dataframe.api.mean
1719
import org.jetbrains.kotlinx.dataframe.api.medianOrNull
@@ -25,7 +27,9 @@ import org.jetbrains.kotlinx.dataframe.columns.size
2527
import org.jetbrains.kotlinx.dataframe.columns.values
2628
import org.jetbrains.kotlinx.dataframe.impl.columns.addPath
2729
import org.jetbrains.kotlinx.dataframe.impl.columns.asAnyFrameColumn
30+
import org.jetbrains.kotlinx.dataframe.impl.isBigNumber
2831
import org.jetbrains.kotlinx.dataframe.impl.renderType
32+
import org.jetbrains.kotlinx.dataframe.impl.toBigDecimal
2933
import org.jetbrains.kotlinx.dataframe.index
3034
import org.jetbrains.kotlinx.dataframe.kind
3135
import org.jetbrains.kotlinx.dataframe.type
@@ -55,12 +59,12 @@ internal fun describeImpl(cols: List<AnyCol>): DataFrame<ColumnDescription> {
5559
}
5660
}
5761

58-
val all = cols.collectAll(false)
62+
val allCols = cols.collectAll(false)
5963

60-
val hasNumeric = all.any { it.isNumber() }
61-
val hasComparable = all.any { it.isComparable() }
62-
val hasLongPaths = all.any { it.path().size > 1 }
63-
var df = all.toDataFrame {
64+
val hasNumericCols = allCols.any { it.isNumber() }
65+
val hasInterComparableCols = allCols.any { it.isComparable() }
66+
val hasLongPaths = allCols.any { it.path().size > 1 }
67+
var df = allCols.toDataFrame {
6468
ColumnDescription::name from { it.name() }
6569
if (hasLongPaths) {
6670
ColumnDescription::path from { it.path() }
@@ -74,21 +78,63 @@ internal fun describeImpl(cols: List<AnyCol>): DataFrame<ColumnDescription> {
7478
.groupBy { it }.maxByOrNull { it.value.size }
7579
?.key
7680
}
77-
if (hasNumeric) {
81+
if (hasNumericCols) {
7882
ColumnDescription::mean from { if (it.isNumber()) it.asNumbers().mean() else null }
7983
ColumnDescription::std from { if (it.isNumber()) it.asNumbers().std() else null }
8084
}
81-
if (hasComparable) {
82-
ColumnDescription::min from inferType { if (it.isComparable()) it.asComparable().minOrNull() else null }
85+
if (hasInterComparableCols || hasNumericCols) {
86+
ColumnDescription::min from inferType {
87+
when {
88+
it.isComparable() ->
89+
it.asComparable().minOrNull()
90+
91+
// Found incomparable number types, convert all to Double or BigDecimal first
92+
it.isNumber() ->
93+
if (it.any { it?.isBigNumber() == true }) {
94+
it.map { (it as Number?)?.toBigDecimal() }.minOrNull()
95+
} else {
96+
it.map { (it as Number?)?.toDouble() }.minOrNull()
97+
}
98+
99+
else -> null
100+
}
101+
}
83102
ColumnDescription::median from inferType {
84-
if (it.isComparable()) it.asComparable().medianOrNull() else null
103+
when {
104+
it.isComparable() ->
105+
it.asComparable().medianOrNull()
106+
107+
// Found incomparable number types, convert all to Double or BigDecimal first
108+
it.isNumber() ->
109+
if (it.any { it?.isBigNumber() == true }) {
110+
it.map { (it as Number?)?.toBigDecimal() }.medianOrNull()
111+
} else {
112+
it.map { (it as Number?)?.toDouble() }.medianOrNull()
113+
}
114+
115+
else -> null
116+
}
117+
}
118+
ColumnDescription::max from inferType {
119+
when {
120+
it.isComparable() -> it.asComparable().maxOrNull()
121+
122+
// Found incomparable number types, convert all to Double or BigDecimal first
123+
it.isNumber() ->
124+
if (it.any { it?.isBigNumber() == true }) {
125+
it.map { (it as Number?)?.toBigDecimal() }.maxOrNull()
126+
} else {
127+
it.map { (it as Number?)?.toDouble() }.maxOrNull()
128+
}
129+
130+
else -> null
131+
}
85132
}
86-
ColumnDescription::max from inferType { if (it.isComparable()) it.asComparable().maxOrNull() else null }
87133
}
88134
}
89135
df = df.add(ColumnDescription::freq) {
90136
val top = it[ColumnDescription::top]
91-
val data = all[index]
137+
val data = allCols[index]
92138
data.values.count { it == top }
93139
}.move(ColumnDescription::freq).after(ColumnDescription::top)
94140

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/describe.kt

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
package org.jetbrains.kotlinx.dataframe.api
22

33
import io.kotest.matchers.shouldBe
4+
import org.jetbrains.kotlinx.dataframe.alsoDebug
45
import org.junit.Test
6+
import kotlin.reflect.typeOf
57

68
class DescribeTests {
79

@@ -11,4 +13,60 @@ class DescribeTests {
1113
val df = dataFrameOf(a).drop(1)
1214
df.describe()["min"][0] shouldBe null
1315
}
16+
17+
// Describes one `Number?` column that mixes every primitive number type with
// BigInteger, BigDecimal and a null, and pins each reported statistic.
// min/median/max are asserted as BigDecimal values.
@Test
fun `describe nullable Number column`() {
    val a by columnOf(
        1,
        2.0,
        3f,
        4L,
        5.toShort(),
        6.toByte(),
        7.toBigInteger(),
        8.toBigDecimal(),
        null,
    )

    dataFrameOf(a)
        .describe()
        .alsoDebug()
        .single()
        .run {
            // general column info
            name shouldBe "a"
            type shouldBe "Number?"
            count shouldBe 9
            unique shouldBe 9
            nulls shouldBe 1

            // most frequent value
            top shouldBe 1
            freq shouldBe 1

            // numeric statistics
            mean shouldBe 4.5
            std shouldBe 2.449489742783178

            // order statistics
            min shouldBe 1.toBigDecimal()
            median shouldBe 4.toBigDecimal()
            max shouldBe 8.toBigDecimal()
        }
}
49+
50+
// Describes a Double column containing a single NaN and pins each reported statistic:
// mean, std and max are asserted to be NaN, while min and median are asserted as finite values.
@Test
fun `describe with NaNs`() {
    val a by columnOf(1.0, 2.0, Double.NaN, 4.0)

    dataFrameOf(a)
        .describe()
        .alsoDebug()
        .single()
        .run {
            // general column info
            name shouldBe "a"
            type shouldBe "Double"
            count shouldBe 4
            unique shouldBe 4
            nulls shouldBe 0

            // most frequent value
            top shouldBe 1
            freq shouldBe 1

            // numeric statistics
            mean.isNaN() shouldBe true
            std.isNaN() shouldBe true

            // order statistics
            min shouldBe 1.0 // TODO should be NaN too?
            median shouldBe 3.0
            max.isNaN shouldBe true
        }
}
1472
}

0 commit comments

Comments
 (0)