@@ -735,6 +735,37 @@ def open_zipfile(
735
735
raise ValueError
736
736
737
737
738
@contextlib.contextmanager
def open_tarfile(
    path: str | Path,
    inner_path: str,
    *,
    operation: Operation = "read",
    representation: Representation = "binary",
) -> Generator[typing.IO[bytes], None, None]:
    """Open a single member of a tar archive as a binary file object.

    :param path: The path to the tar archive.
    :param inner_path: The name of the member inside the archive.
    :param operation: Either ``"read"``, to yield a file object for reading the
        existing member, or ``"write"``, to yield an in-memory buffer whose
        content is stored as the member once the ``with`` block exits normally.
    :param representation: Only ``"binary"`` is implemented.

    :yields: A binary file object for the member.

    :raises NotImplementedError: If ``representation`` is not ``"binary"``.
    :raises KeyError: If reading and the archive has no member named
        ``inner_path`` (raised by :meth:`tarfile.TarFile.getmember`).
    :raises FileNotFoundError: If reading and the member can not be extracted
        as a file object (e.g., it is a directory).
    :raises ValueError: If ``operation`` is neither ``"read"`` nor ``"write"``.

    .. note::

        Writing opens the archive with mode ``"w"``, so the resulting archive
        contains only ``inner_path``. If the body of the ``with`` block raises,
        nothing is written.
    """
    if representation != "binary":
        raise NotImplementedError(
            f"unsupported representation: {representation!r}, only 'binary' is implemented"
        )

    if operation == "read":
        with tarfile.open(path, "r") as tar:
            member = tar.getmember(inner_path)
            file = tar.extractfile(member)
            if file is None:
                raise FileNotFoundError(f"could not find {inner_path} in tarfile {path}")
            # close the extracted member explicitly instead of relying on
            # the enclosing archive being closed
            with file:
                yield file
    elif operation == "write":
        # the size of a member has to be known before it is added to the
        # archive, so the content is buffered in memory first
        buffer = BytesIO()
        yield buffer
        tarinfo = tarfile.TarInfo(name=inner_path)
        tarinfo.size = buffer.getbuffer().nbytes
        buffer.seek(0)
        with tarfile.TarFile(path, mode="w") as tar_file:
            tar_file.addfile(tarinfo, buffer)
    else:
        raise ValueError(f"invalid operation: {operation!r}, expected 'read' or 'write'")
767
+
768
+
738
769
@contextlib .contextmanager
739
770
def open_zip_reader (
740
771
path : str | Path , inner_path : str , delimiter : str = "\t " , ** kwargs : Any
@@ -888,11 +919,8 @@ def write_tarfile_csv(
888
919
:param kwargs: Additional kwargs to pass to :func:`get_df_io` and transitively to
889
920
:func:`pandas.DataFrame.to_csv`.
890
921
"""
891
- s = df .to_csv (sep = sep , index = index , ** kwargs )
892
- tarinfo = tarfile .TarInfo (name = inner_path )
893
- tarinfo .size = len (s )
894
- with tarfile .TarFile (path , mode = "w" ) as tar_file :
895
- tar_file .addfile (tarinfo , BytesIO (s .encode ("utf-8" )))
922
+ with open_tarfile (path , inner_path , operation = "write" ) as file :
923
+ df .to_csv (file , sep = sep , index = index , ** kwargs )
896
924
897
925
898
926
def write_tarfile_xml (
@@ -911,11 +939,9 @@ def write_tarfile_xml(
911
939
from lxml import etree
912
940
913
941
kwargs .setdefault ("pretty_print" , True )
914
- s = etree .tostring (element_tree , ** kwargs )
915
- tarinfo = tarfile .TarInfo (name = inner_path )
916
- tarinfo .size = len (s )
917
- with tarfile .TarFile (path , mode = "w" ) as tar_file :
918
- tar_file .addfile (tarinfo , BytesIO (s ))
942
+
943
+ with open_tarfile (path , inner_path , operation = "write" ) as file :
944
+ file .write (etree .tostring (element_tree , ** kwargs ))
919
945
920
946
921
947
def read_tarfile_csv (
@@ -932,9 +958,8 @@ def read_tarfile_csv(
932
958
"""
933
959
import pandas as pd
934
960
935
- with tarfile .open (path ) as tar_file :
936
- with tar_file .extractfile (inner_path ) as file : # type: ignore
937
- return pd .read_csv (file , sep = sep , ** kwargs )
961
+ with open_tarfile (path , inner_path ) as file :
962
+ return pd .read_csv (file , sep = sep , ** kwargs )
938
963
939
964
940
965
def read_tarfile_xml (path : str | Path , inner_path : str , ** kwargs : Any ) -> lxml .etree .ElementTree :
@@ -948,9 +973,8 @@ def read_tarfile_xml(path: str | Path, inner_path: str, **kwargs: Any) -> lxml.e
948
973
"""
949
974
from lxml import etree
950
975
951
- with tarfile .open (path ) as tar_file :
952
- with tar_file .extractfile (inner_path ) as file : # type: ignore
953
- return etree .parse (file , ** kwargs )
976
+ with open_tarfile (path , inner_path ) as file :
977
+ return etree .parse (file , ** kwargs )
954
978
955
979
956
980
def read_rdf (path : str | Path , ** kwargs : Any ) -> rdflib .Graph :
0 commit comments