Skip to content

Commit bca122b

Browse files
jaouenajaouenadel
andauthored
build_database.py / Add option to drop EDC tables (#25)
* Drop edc_table when refresh_type='all' or 'reset-table' set to True * rename 'reset-tables' param to 'drop-tables' --------- Co-authored-by: jaouenadel <jaouenadel@protonmail.com>
1 parent 069ee96 commit bca122b

File tree

2 files changed

+41
-5
lines changed

2 files changed

+41
-5
lines changed

pipelines/run.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,14 @@ def run():
6161
type=str,
6262
help="Comma-separated list of years to process (for custom refresh type)",
6363
)
64-
def run_build_database(refresh_type, custom_years):
64+
@click.option(
65+
"--drop-tables",
66+
is_flag=True,
67+
show_default=True,
68+
default=False,
69+
help="Drop and re-create edc tables in the database before data insertion.",
70+
)
71+
def run_build_database(refresh_type, custom_years, drop_tables):
6572
"""Run build_database task."""
6673
module = importlib.import_module("tasks.build_database")
6774
task_func = getattr(module, "execute")
@@ -70,7 +77,11 @@ def run_build_database(refresh_type, custom_years):
7077
if custom_years:
7178
custom_years_list = [year.strip() for year in custom_years.split(",")]
7279

73-
task_func(refresh_type=refresh_type, custom_years=custom_years_list)
80+
task_func(
81+
refresh_type=refresh_type,
82+
custom_years=custom_years_list,
83+
drop_tables=drop_tables,
84+
)
7485

7586

7687
@run.command("download_database")

pipelines/tasks/build_database.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,17 +114,32 @@ def download_extract_insert_yearly_edc_data(year: str):
114114
return True
115115

116116

117+
def drop_edc_tables():
118+
"""Drop tables using tables names defined in _config_edc.py"""
119+
conn = duckdb.connect(DUCKDB_FILE)
120+
tables_names = [
121+
file_info["table_name"] for file_info in edc_config["files"].values()
122+
]
123+
for table_name in tables_names:
124+
query = f"DROP TABLE IF EXISTS {table_name};"
125+
logger.info(f"Drop table {table_name} (query: {query})")
126+
conn.execute(query)
127+
return True
128+
129+
117130
def process_edc_datasets(
118131
refresh_type: Literal["all", "last", "custom"] = "last",
119132
custom_years: List[str] = None,
133+
drop_tables: bool = False,
120134
):
121135
"""
122136
Process the EDC datasets.
123137
:param refresh_type: Refresh type to run
124-
- "all": Refresh the data for every possible year
138+
- "all": Drop edc tables and import the data for every possible year.
125139
- "last": Refresh the data only for the last available year
126140
- "custom": Refresh the data for the years specified in the list custom_years
127141
:param custom_years: years to update
142+
:param drop_tables: Whether to drop edc tables in the database before data insertion.
128143
:return:
129144
"""
130145
available_years = edc_config["source"]["available_years"]
@@ -156,6 +171,9 @@ def process_edc_datasets(
156171

157172
logger.info(f"Launching processing of EDC datasets for years: {years_to_update}")
158173

174+
if drop_tables or (refresh_type == "all"):
175+
drop_edc_tables()
176+
159177
for year in years_to_update:
160178
download_extract_insert_yearly_edc_data(year=year)
161179

@@ -164,11 +182,18 @@ def process_edc_datasets(
164182
return True
165183

166184

167-
def execute(refresh_type: str = "all", custom_years: List[str] = None):
185+
def execute(
186+
refresh_type: str = "all",
187+
custom_years: List[str] = None,
188+
drop_tables: bool = False,
189+
):
168190
"""
169191
Execute the EDC dataset processing with specified parameters.
170192
171193
:param refresh_type: Type of refresh to perform ("all", "last", or "custom")
172194
:param custom_years: List of years to process when refresh_type is "custom"
195+
:param drop_tables: Whether to drop edc tables in the database before data insertion.
173196
"""
174-
process_edc_datasets(refresh_type=refresh_type, custom_years=custom_years)
197+
process_edc_datasets(
198+
refresh_type=refresh_type, custom_years=custom_years, drop_tables=drop_tables
199+
)

0 commit comments

Comments
 (0)