From eeaef7436acbe31eae7c9556ed68ad115949c0da Mon Sep 17 00:00:00 2001 From: Mikhail Beliansky Date: Sun, 1 Mar 2020 17:16:23 +0200 Subject: [PATCH 1/5] Added append class --- pandas_to_postgres/__init__.py | 1 + pandas_to_postgres/append_df.py | 73 +++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 pandas_to_postgres/append_df.py diff --git a/pandas_to_postgres/__init__.py b/pandas_to_postgres/__init__.py index cfc068d..2ecd337 100644 --- a/pandas_to_postgres/__init__.py +++ b/pandas_to_postgres/__init__.py @@ -1,5 +1,6 @@ from .copy_df import DataFrameCopy from .copy_hdf import HDFTableCopy, SmallHDFTableCopy, BigHDFTableCopy +from .append_df import DataFrameCopyAppend from .hdf_to_postgres import hdf_to_postgres, create_hdf_table_objects, copy_worker from .utilities import ( hdf_metadata, diff --git a/pandas_to_postgres/append_df.py b/pandas_to_postgres/append_df.py new file mode 100644 index 0000000..a8b34a7 --- /dev/null +++ b/pandas_to_postgres/append_df.py @@ -0,0 +1,73 @@ +from .utilities import create_file_object, df_generator, cast_pandas +from ._base_copy import BaseCopy + + +class DataFrameCopyAppend(BaseCopy): + """ + Class for handling a standard case of iterating over a pandas DataFrame in chunks + and COPYing to PostgreSQL via StringIO CSV. + + Differs from main DataFrameCopy class with that it doesn't drop fks, pks and indexes, and appends df + to existing data in table. This will cause it to lose a lot of performance. + """ + + def __init__( + self, df, defer_sql_objs=False, conn=None, table_obj=None, csv_chunksize=10 ** 6 + ): + """ + Parameters + ---------- + df: pandas DataFrame + Data to copy to database table + defer_sql_objs: bool + multiprocessing has issue with passing SQLALchemy objects, so if + True, defer attributing these to the object until after pickled by Pool + conn: SQlAlchemy Connection + Managed outside of the object + table_obj: SQLAlchemy model object + Destination SQL Table + csv_chunksize: int + Max rows to keep in memory when generating CSV for COPY + """ + super().__init__(defer_sql_objs, conn, table_obj, csv_chunksize) + + self.df = df + self.rows = self.df.shape[0] + + def truncate(self): + pass + + def create_pk(self): + pass + + def create_fks(self): + pass + + def drop_fks(self): + pass + + def drop_pk(self): + pass + + def copy(self, functions=[cast_pandas]): + self.drop_fks() + self.drop_pk() + self.df = self.data_formatting(self.df, functions=functions) + with self.conn.begin(): + self.truncate() + + self.logger.info("Creating generator for chunking dataframe") + for chunk in df_generator(self.df, self.csv_chunksize): + + self.logger.info("Creating CSV in memory") + fo = create_file_object(chunk) + + self.logger.info("Copying chunk to database") + self.copy_from_file(fo) + del fo + + self.logger.info("All chunks copied ({} rows)".format(self.rows)) + + self.create_pk() + self.create_fks() + self.analyze() From b3c9b82d6fce8bbd696de80f29f0671b7027cdd7 Mon Sep 17 00:00:00 2001 From: Mikhail Beliansky Date: Sun, 1 Mar 2020 17:23:30 +0200 Subject: [PATCH 2/5] Update requirements --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e38c8b6..b8d2eac 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def read(fname, lines=False): ), url="http://github.com/cid-harvard/pandas-to-postgres", packages=find_packages(), - install_requires=["SQLAlchemy", "pandas", "psycopg2", "tables"], + install_requires=["SQLAlchemy", "pandas", "psycopg2-binary", "tables"], long_description=read("README.md"), classifiers=[ "Topic :: Database", From 5c4a3886ef927008cf335ef53e5d611f4b574c1f Mon Sep 17 00:00:00 2001 From: Mikhail Beliansky Date: Sun, 1 Mar 2020 17:55:38 +0200 Subject: [PATCH 3/5] Remove freeze --- pandas_to_postgres/append_df.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas_to_postgres/append_df.py b/pandas_to_postgres/append_df.py index a8b34a7..9e226ae 100644 --- a/pandas_to_postgres/append_df.py +++ b/pandas_to_postgres/append_df.py @@ -12,7 +12,7 @@ class DataFrameCopyAppend(BaseCopy): """ def __init__( - self, df, defer_sql_objs=False, conn=None, table_obj=None, csv_chunksize=10 ** 6 + self, df, defer_sql_objs=False, conn=None, table_obj=None, csv_chunksize=10 ** 6 ): """ Parameters @@ -49,6 +49,22 @@ def drop_fks(self): def drop_pk(self): pass + def copy_from_file(self, file_object): + """ + COPY to PostgreSQL table using StringIO CSV object + Parameters + ---------- + file_object: StringIO + CSV formatted data to COPY from DataFrame to PostgreSQL + """ + cur = self.conn.connection.cursor() + file_object.seek(0) + columns = file_object.readline() + sql = "COPY {table} ({columns}) FROM STDIN WITH CSV".format( + table=self.sql_table, columns=columns + ) + cur.copy_expert(sql=sql, file=file_object) + def copy(self, functions=[cast_pandas]): self.drop_fks() self.drop_pk() @@ -58,7 +74,6 @@ def copy(self, functions=[cast_pandas]): self.logger.info("Creating generator for chunking dataframe") for chunk in df_generator(self.df, self.csv_chunksize): - self.logger.info("Creating CSV in memory") fo = create_file_object(chunk) From a1e78a19becc02549ec859e0679bdd96eef3038f Mon Sep 17 00:00:00 2001 From: Mikhail Beliansky Date: Sun, 1 Mar 2020 17:57:15 +0200 Subject: [PATCH 4/5] Bumped version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8d2eac..2cce7c6 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ def read(fname, lines=False): setup( name="pandas_to_postgres", - version="v0.0.3", + version="v0.0.4", author="Brendan Leonard ", description=( "Utility to copy Pandas DataFrames and DataFrames stored in HDF5 files " From c93292c8c0ac5d3aa9f9977e1e82fc6592c1379d Mon Sep 17 00:00:00 2001 From: Mikhail Beliansky Date: Sun, 1 Mar 2020 18:02:55 +0200 Subject: [PATCH 5/5] Edit docstring --- pandas_to_postgres/append_df.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas_to_postgres/append_df.py b/pandas_to_postgres/append_df.py index 9e226ae..8328ac7 100644 --- a/pandas_to_postgres/append_df.py +++ b/pandas_to_postgres/append_df.py @@ -4,9 +4,6 @@ class DataFrameCopyAppend(BaseCopy): """ - Class for handling a standard case of iterating over a pandas DataFrame in chunks - and COPYing to PostgreSQL via StringIO CSV. - Differs from main DataFrameCopy class with that it doesn't drop fks, pks and indexes, and appends df to existing data in table. This will cause it to lose a lot of performance. """