Skip to content

Commit 9d0fe3d

Browse files
Merge pull request #1075 from Kotlin/sxss_writing
add SXSS writing
2 parents 749dbf3 + 088277e commit 9d0fe3d

File tree

1 file changed

+83
-9
lines changed
  • dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io

1 file changed

+83
-9
lines changed

dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt

Lines changed: 83 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import org.apache.poi.util.DefaultTempFileCreationStrategy
2020
import org.apache.poi.util.LocaleUtil
2121
import org.apache.poi.util.LocaleUtil.getUserTimeZone
2222
import org.apache.poi.util.TempFile
23+
import org.apache.poi.xssf.streaming.SXSSFWorkbook
2324
import org.apache.poi.xssf.usermodel.XSSFWorkbook
2425
import org.jetbrains.kotlinx.dataframe.AnyFrame
2526
import org.jetbrains.kotlinx.dataframe.AnyRow
@@ -470,6 +471,28 @@ private fun Cell?.cellValue(sheetName: String): Any? {
470471
return getValueFromType(cellType)
471472
}
472473

474+
public enum class WorkBookType {
475+
XLS,
476+
XLSX,
477+
}
478+
479+
/**
480+
* Writes this DataFrame to an Excel file as a single sheet.
481+
*
482+
* Implemented with [Apache POI](https://poi.apache.org) using `HSSFWorkbook` for XLS files,
483+
* `XSSFWorkbook` for standard XLSX files, and `SXSSFWorkbook` for memory-efficient streaming when creating new XLSX files.
484+
*
485+
* @param path The path to the file where the data will be written.
486+
* @param columnsSelector A [selector][ColumnsSelector] to determine which columns to include in the file. The default is all columns.
487+
* @param sheetName The name of the sheet in the Excel file. If null, the default name will be used.
488+
* @param writeHeader A flag indicating whether to write the header row in the Excel file. Defaults to true.
489+
* @param workBookType The [type of workbook][WorkBookType] to create (e.g., XLS or XLSX). Defaults to XLSX.
490+
* @param keepFile If `true` and the file already exists, a new sheet will be appended instead of overwriting the file.
491+
* This may result in higher memory usage and slower performance compared to creating a new file.
492+
* Defaults to `false`.
493+
*
494+
* @throws [IllegalArgumentException] if the [sheetName] is invalid or the workbook already contains a sheet with this name.
495+
*/
473496
public fun <T> DataFrame<T>.writeExcel(
474497
path: String,
475498
columnsSelector: ColumnsSelector<T, *> = { all() },
@@ -479,11 +502,24 @@ public fun <T> DataFrame<T>.writeExcel(
479502
keepFile: Boolean = false,
480503
): Unit = writeExcel(File(path), columnsSelector, sheetName, writeHeader, workBookType, keepFile)
481504

482-
public enum class WorkBookType {
483-
XLS,
484-
XLSX,
485-
}
486-
505+
/**
506+
* Writes this DataFrame to an Excel file as a single sheet.
507+
*
508+
* Implemented with [Apache POI](https://poi.apache.org) using `HSSFWorkbook` for XLS files,
509+
* `XSSFWorkbook` for standard XLSX files,
510+
* and `SXSSFWorkbook` for memory-efficient streaming when creating new XLSX files.
511+
*
512+
* @param file The file where the data will be written.
513+
* @param columnsSelector A [selector][ColumnsSelector] to determine which columns to include in the file. The default is all columns.
514+
* @param sheetName The name of the sheet in the Excel file. If null, the default name will be used.
515+
* @param writeHeader A flag indicating whether to write the header row in the Excel file. Defaults to true.
516+
* @param workBookType The [type of workbook][WorkBookType] to create (e.g., XLS or XLSX). Defaults to XLSX.
517+
* @param keepFile If `true` and the file already exists, a new sheet will be appended instead of overwriting the file.
518+
* This may result in higher memory usage and slower performance compared to creating a new file.
519+
* Defaults to `false`.
520+
*
521+
* @throws [IllegalArgumentException] if the [sheetName] is invalid or the workbook already contains a sheet with this name.
522+
*/
487523
public fun <T> DataFrame<T>.writeExcel(
488524
file: File,
489525
columnsSelector: ColumnsSelector<T, *> = { all() },
@@ -493,22 +529,41 @@ public fun <T> DataFrame<T>.writeExcel(
493529
keepFile: Boolean = false,
494530
) {
495531
val factory =
496-
if (keepFile) {
532+
// Write to an existing file with `keepFile` flag
533+
if (keepFile && file.exists() && file.length() > 0L) {
534+
val fis = file.inputStream()
497535
when (workBookType) {
498-
WorkBookType.XLS -> HSSFWorkbook(file.inputStream())
499-
WorkBookType.XLSX -> XSSFWorkbook(file.inputStream())
536+
WorkBookType.XLS -> HSSFWorkbook(fis)
537+
WorkBookType.XLSX -> XSSFWorkbook(fis)
500538
}
501539
} else {
502540
when (workBookType) {
503541
WorkBookType.XLS -> HSSFWorkbook()
504-
WorkBookType.XLSX -> XSSFWorkbook()
542+
543+
// Use streaming mode for a new XLSX file
544+
WorkBookType.XLSX -> SXSSFWorkbook()
505545
}
506546
}
507547
return file.outputStream().use {
508548
writeExcel(it, columnsSelector, sheetName, writeHeader, factory)
509549
}
510550
}
511551

552+
/**
553+
* Writes this DataFrame as a new sheet of an existing [Workbook] instance and saves that workbook to the given [OutputStream].
554+
*
555+
* Uses [Apache POI](https://poi.apache.org).
556+
* Supports [XSSFWorkbook] and [SXSSFWorkbook] for XLSX and [HSSFWorkbook] for XLS,
557+
* and allows users to manage the workbook externally.
558+
*
559+
* @param outputStream The output stream where the Excel data will be written.
560+
* @param columnsSelector A [selector][ColumnsSelector] to determine which columns to include in the file. The default is all columns.
561+
* @param sheetName The name of the sheet in the Excel file. If null, the default name will be used.
562+
* @param writeHeader A flag indicating whether to write the header row in the Excel file. Defaults to true.
563+
* @param factory The [Workbook] instance, allowing integration with an existing workbook.
564+
*
565+
* @throws [IllegalArgumentException] if the [sheetName] is invalid or the workbook already contains a sheet with this name.
566+
*/
512567
public fun <T> DataFrame<T>.writeExcel(
513568
outputStream: OutputStream,
514569
columnsSelector: ColumnsSelector<T, *> = { all() },
@@ -522,6 +577,25 @@ public fun <T> DataFrame<T>.writeExcel(
522577
wb.close()
523578
}
524579

580+
/**
581+
* Creates a new [Sheet] in the given [Workbook] and writes this DataFrame content into it.
582+
*
583+
* Uses [Apache POI](https://poi.apache.org).
584+
* Supports [XSSFWorkbook] and [SXSSFWorkbook] for XLSX and [HSSFWorkbook] for XLS,
585+
* and allows users to manage the workbook externally.
586+
*
587+
* Automatically handles datetime types.
588+
* Skips null values to prevent Apache POI from treating empty cells incorrectly.
589+
*
590+
* @param wb The [Workbook] where the sheet will be created.
591+
* @param columnsSelector A [selector][ColumnsSelector] to determine which columns to include. Defaults to all columns.
592+
* @param sheetName The name of the sheet. If null, a default sheet name is used.
593+
* @param writeHeader Whether to include a header row with column names. Defaults to true.
594+
*
595+
* @return The created [Sheet] instance containing the DataFrame data.
596+
*
597+
* @throws [IllegalArgumentException] if the [sheetName] is invalid or the workbook already contains a sheet with this name.
598+
*/
525599
public fun <T> DataFrame<T>.writeExcel(
526600
wb: Workbook,
527601
columnsSelector: ColumnsSelector<T, *> = { all() },

0 commit comments

Comments
 (0)