Skip to content

Commit 33634c4

Browse files
committed
support importing parquet files
1 parent 16002e7 commit 33634c4

File tree

1 file changed

+26
-1
lines changed

1 file changed

+26
-1
lines changed

fsspark/utils/io.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def import_table(path: str,
1616
Import tsv file as Spark DataFrame.
1717
1818
:param path: File path
19-
:param header:
19+
:param header: True if the first row is header.
2020
:param sep: Column separator
2121
:param n_partitions: Minimal number of partitions
2222
@@ -39,6 +39,31 @@ def import_table(path: str,
3939
return sdf
4040

4141

42+
def import_parquet(path: str,
                   header: bool = True) -> pyspark.sql.DataFrame:
    """
    Import parquet file as Spark DataFrame.

    Parquet files store column names and types in their own metadata, so
    no header or schema-inference settings are passed to the reader.

    :param path: File path
    :param header: Ignored. Parquet has no header row; the parameter is
                   kept only so that existing callers keep working.

    :return: Spark DataFrame
    :raises ValueError: If there is no active Spark session.
    """
    spark = pyspark.sql.SparkSession.getActiveSession()

    if spark is None:
        raise ValueError("Active Spark Session not found...")

    # "header" and "inferSchema" are CSV reader options and have no effect
    # on the Parquet source, so they are intentionally not set here.
    return spark.read.parquet(path)
65+
66+
4267
def import_table_as_psdf(path: str,
4368
sep: str = "\t",
4469
n_partitions: int = 5) -> pyspark.pandas.DataFrame:

0 commit comments

Comments
 (0)