diff --git a/CHANGELOG.md b/CHANGELOG.md index a3cbddb..9d070fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ # Changelog -## [Version 1.1.4](https://github.com/dataiku/dss-plugin-sharepoint-online/releases/tag/v1.1.4) - Feature release - 2024-07-16 +## [Version 1.1.5](https://github.com/dataiku/dss-plugin-sharepoint-online/releases/tag/v1.1.5) - Feature release - 2024-10-15 + +- Update of individual list records + +## [Version 1.1.4](https://github.com/dataiku/dss-plugin-sharepoint-online/releases/tag/v1.1.4) - Feature and bugfix release - 2024-07-16 - Fix writing when using presets with no root folder defined - Limit string length to the 255 characters SharePoint limit @@ -10,6 +14,10 @@ - Add login with Azure AD app certificate +## [Version 1.1.3](https://github.com/dataiku/dss-plugin-sharepoint-online/releases/tag/v1.1.3) - Feature release - 2024-06-04 + +- Add login with Azure AD app certificate + ## [Version 1.1.2](https://github.com/dataiku/dss-plugin-sharepoint-online/releases/tag/v1.1.2) - Bugfix release - 2024-05-28 - Fix path creation inside read-only parent directory diff --git a/custom-recipes/sharepoint-online-append-list/recipe.json b/custom-recipes/sharepoint-online-append-list/recipe.json index 18a5740..821f8ae 100644 --- a/custom-recipes/sharepoint-online-append-list/recipe.json +++ b/custom-recipes/sharepoint-online-append-list/recipe.json @@ -142,14 +142,25 @@ "name": "write_mode", "label": "Write mode", "type": "SELECT", - "defaultValue": "append", "selectChoices": [ { "value": "append", "label": "Append to existing list" + }, + { + "value": "update", + "label": "Update items in existing list" } ], - "visibilityCondition": false + "defaultValue": "append", + "visibilityCondition": "model.advanced_parameters == true" + }, + { + "name": "columns_to_update", + "label": "Columns to update", + "type": "COLUMNS", + "columnRole": "input_dataset", + "visibilityCondition": "model.advanced_parameters == true && model.write_mode == 'update'" }, { "name": "max_workers", diff --git a/custom-recipes/sharepoint-online-append-list/recipe.py b/custom-recipes/sharepoint-online-append-list/recipe.py index 502c3ef..fec59a7 100644 --- a/custom-recipes/sharepoint-online-append-list/recipe.py +++ b/custom-recipes/sharepoint-online-append-list/recipe.py @@ -21,6 +21,13 @@ def convert_date_format(json_row): return json_row +def get_write_mode(config): + write_mode = config.get("write_mode", "append") + if write_mode == "create": + write_mode = "append" + return write_mode + + input_dataset_names = get_input_names_for_role('input_dataset') input_dataset = dataiku.Dataset(input_dataset_names[0]) input_dataframe = input_dataset.get_dataframe() @@ -43,7 +50,11 @@ def convert_date_format(json_row): expand_lookup = config.get("expand_lookup", False) metadata_to_retrieve = config.get("metadata_to_retrieve", []) advanced_parameters = config.get("advanced_parameters", False) -write_mode = "append" +write_mode = get_write_mode(config) +columns_to_update = config.get("columns_to_update", []) +if columns_to_update and "ID" not in columns_to_update: + columns_to_update.append("ID") + if not advanced_parameters: max_workers = 1 # no multithread per default batch_size = 100 @@ -57,11 +68,13 @@ def convert_date_format(json_row): display_metadata = len(metadata_to_retrieve) > 0 client = SharePointClient(config) -sharepoint_writer = client.get_writer({"columns": input_schema}, None, None, max_workers, batch_size, write_mode) +sharepoint_writer = client.get_writer({"columns": input_schema}, None, None, max_workers, + batch_size, write_mode, columns_to_update) + + with output_dataset.get_writer() as writer: for index, input_parameters_row in input_dataframe.iterrows(): - json_row = input_parameters_row.to_dict() - json_row = convert_date_format(json_row) - sharepoint_writer.write_row_dict(json_row) - writer.write_row_dict(json_row) + straighten_json_row = sharepoint_writer.pandas_row_to_json(input_parameters_row) + sharepoint_writer.write_row_dict(straighten_json_row) + writer.write_row_dict(sharepoint_writer.fix_dates_for_pandas_output(input_parameters_row)) sharepoint_writer.close() diff --git a/plugin.json b/plugin.json index c357c1b..cdb516f 100644 --- a/plugin.json +++ b/plugin.json @@ -1,6 +1,6 @@ { "id": "sharepoint-online", - "version": "1.1.4", + "version": "1.1.5", "meta": { "label": "SharePoint Online", "description": "Read and write data from/to your SharePoint Online account", diff --git a/python-lib/dss_constants.py b/python-lib/dss_constants.py index 89fedcc..f6254da 100644 --- a/python-lib/dss_constants.py +++ b/python-lib/dss_constants.py @@ -37,7 +37,7 @@ class DSSConstants(object): "sharepoint_oauth": "The access token is missing" } PATH = 'path' - PLUGIN_VERSION = "1.1.4" + PLUGIN_VERSION = "1.1.5" SECRET_PARAMETERS_KEYS = ["Authorization", "sharepoint_username", "sharepoint_password", "client_secret", "client_certificate", "passphrase"] SITE_APP_DETAILS = { "sharepoint_tenant": "The tenant name is missing", diff --git a/python-lib/sharepoint_client.py b/python-lib/sharepoint_client.py index 656b3e0..98dceb0 100644 --- a/python-lib/sharepoint_client.py +++ b/python-lib/sharepoint_client.py @@ -542,6 +542,21 @@ def get_item_structure(self, list_title, item): "checkInComment": None } + def get_update_list_item_kwargs(self, list_title, list_item_entity_type_full_name, item_id, item_update): + # https://sharepoint.stackexchange.com/questions/250659/how-implement-rest-queries-in-sharepoint + list_item_update_info = self.get_list_item_update(item_update, list_item_entity_type_full_name) + headers = DSSConstants.JSON_HEADERS + headers["X-HTTP-Method"] = "MERGE" + headers["If-Match"] = "*" + update_item_url = self.get_update_item_url(list_title, item_id) + kwargs = { + "verb": "PATCH", + "url": update_item_url, + "headers": headers, + "json": list_item_update_info + } + return kwargs + @staticmethod def get_form_value(field_name, field_value): return { @@ -564,6 +579,17 @@ def get_list_item_create_info(self, list_title): } } + @staticmethod + def get_list_item_update(item, list_item_entity_type_full_name): + # https://learn.microsoft.com/en-us/sharepoint/dev/sp-add-ins/working-with-lists-and-list-items-with-rest + ret = {} + ret["__metadata"] = { + "type": "{}".format(list_item_entity_type_full_name) + } + for field_name in item: + ret[field_name] = item.get(field_name) + return ret + def process_batch(self, kwargs_array): batch_id = self.get_random_guid() change_set_id = self.get_random_guid() @@ -628,6 +654,7 @@ def log_batch_errors(self, response, kwargs_array): logger.info("Batch error analysis") statuses = re.findall('HTTP/1.1 (.*?) ', str(response.content)) dump_response_content = False + reason_to_raise = None for status, kwarg in zip(statuses, kwargs_array): if not status.startswith("20"): if dump_response_content: @@ -644,6 +671,7 @@ def log_batch_errors(self, response, kwargs_array): error_messages = re.findall('"ErrorMessage":"(.*?)}', str(response.content)) for error_message in error_messages: logger.warning("Error:'{}'".format(error_message)) + reason_to_raise = error_message if dump_response_content: if self.number_dumped_logs == 0: logger.warning("response.content={}".format(response.content)) @@ -652,6 +680,8 @@ def log_batch_errors(self, response, kwargs_array): self.number_dumped_logs += 1 else: logger.info("Batch error analysis OK") + if reason_to_raise: + raise SharePointClientError("There was at least one issue during batch processing ({}). Look into the logs for more details.".format(reason_to_raise)) def get_base_url(self): return "{}/{}/_api/Web".format( @@ -689,6 +719,13 @@ def get_list_add_item_using_path_url(self, list_title): self.escape_path(list_title) ) + def get_update_item_url(self, list_title, item_id): + return self.get_base_url() + "/GetList(@a1)/items({})?@a1='/{}/Lists/{}'".format( + item_id, + self.sharepoint_site, + self.escape_path(list_title) + ) + def get_list_fields_url(self, list_title): return self.get_lists_by_title_url(list_title) + "/fields" @@ -716,7 +753,7 @@ def get_file_url(self, full_path): ) def get_file_content_url(self, full_path): - return self.get_file_url(full_path) + "/$value" + return self.get_file_url(full_path.replace("#", "%23")) + "/$value" def get_move_url(self, from_path, to_path): # Using the new method leads to 403. @@ -918,7 +955,7 @@ def escape_path(path): return path.replace("'", "''") def get_writer(self, dataset_schema, dataset_partitioning, - partition_id, max_workers, batch_size, write_mode): + partition_id, max_workers, batch_size, write_mode, columns_to_update=[]): return SharePointListWriter( self.config, self, @@ -927,7 +964,8 @@ def get_writer(self, dataset_schema, dataset_partitioning, partition_id, max_workers=max_workers, batch_size=batch_size, - write_mode=write_mode + write_mode=write_mode, + columns_to_update=columns_to_update ) def get_read_schema(self, display_metadata=False, metadata_to_retrieve=[]): diff --git a/python-lib/sharepoint_lists.py b/python-lib/sharepoint_lists.py index 508a2b7..697e0a7 100644 --- a/python-lib/sharepoint_lists.py +++ b/python-lib/sharepoint_lists.py @@ -1,4 +1,5 @@ import datetime +import pandas from concurrent.futures import ThreadPoolExecutor, as_completed from sharepoint_constants import SharePointConstants from dss_constants import DSSConstants @@ -54,6 +55,8 @@ def assert_list_title(list_title): def dss_to_sharepoint_date(date): + if "T" not in date or "Z" not in date: + return date return format_date(date, DSSConstants.DATE_FORMAT, SharePointConstants.DATE_FORMAT) @@ -78,9 +81,18 @@ def format_date(date, from_format, to_format): return date +def get_columns_types(input_schema): + columns_types = {} + for column_schema in input_schema: + column_name = column_schema.get("name", "") + column_type = column_schema.get("type", "string") + columns_types[column_name] = column_type + return columns_types + + class SharePointListWriter(object): - def __init__(self, config, client, dataset_schema, dataset_partitioning, partition_id, max_workers=5, batch_size=100, write_mode="create"): + def __init__(self, config, client, dataset_schema, dataset_partitioning, partition_id, max_workers=5, batch_size=100, write_mode="create", columns_to_update=[]): self.client = client self.config = config self.dataset_schema = dataset_schema @@ -89,10 +101,13 @@ def __init__(self, config, client, dataset_schema, dataset_partitioning, partiti self.buffer = [] logger.info('init SharepointListWriter with {} workers and batch size of {}'.format(max_workers, batch_size)) self.columns = dataset_schema[SharePointConstants.COLUMNS] + self.dss_columns_types = get_columns_types(self.columns) self.sharepoint_column_ids = {} self.sharepoint_existing_column_names = {} self.sharepoint_existing_column_entity_property_names = {} self.web_name = self.client.sharepoint_list_title + self.write_mode = write_mode + self.columns_to_update = columns_to_update if write_mode == SharePointConstants.WRITE_MODE_CREATE: logger.info('flush:recycle_list "{}"'.format(self.client.sharepoint_list_title)) @@ -132,14 +147,52 @@ def write_row(self, row): def write_row_dict(self, row_dict): row = [] for element in row_dict: - row.append(str(row_dict.get(element))) + row.append(row_dict.get(element)) self.write_row(row) + def pandas_row_to_json(self, input_pandas_row): + input_row = input_pandas_row.to_dict() + # Now lets fix what has been savaged by the panda + output_row = {} + for key in input_row: + target_type = self.dss_columns_types.get(key) + value = input_row.get(key) + if not value or pandas.isna(value): + straighten_value = None + elif target_type in ["int", "bigint"]: + if isinstance(value, int): + straighten_value = str(value) + else: + # If there was one NaN in the int column, the whole column has been converted in float + # Because, obviously... + straighten_value = str(int(value)) + else: + straighten_value = str(value) + output_row[key] = straighten_value + return output_row + + def fix_dates_for_pandas_output(self, input_pandas_row): + input_row = input_pandas_row.to_dict() + fixed_output_row = {} + for key in input_row: + target_type = self.dss_columns_types.get(key) + value = input_row.get(key) + if pandas.isna(value): + fixed_output_row[key] = None + elif target_type == "date": + fixed_output_row[key] = value.strftime("%Y-%m-%dT%H:%M:%S.%fZ") + else: + fixed_output_row[key] = value + return fixed_output_row + def flush(self): - if self.max_workers > 1: - self.upload_rows_multithreaded() + if self.write_mode == "update": + self.update_rows() else: - self.upload_rows() + if self.max_workers > 1: + self.upload_rows_multithreaded() + else: + self.upload_rows() def upload_rows_multithreaded(self): logger.info("Starting multithreaded rows adding") @@ -171,6 +224,20 @@ def upload_rows(self): self.client.process_batch(kwargs) logger.info("{} items written".format(len(kwargs))) + def update_rows(self): + logger.info("Starting updating items") + kwargs = [] + for row in self.buffer: + item = self.build_row_dictionary(row, self.columns_to_update) + item_id = item.pop("ID", None) + if item_id is None: + kwargs.append(self.client.get_add_list_item_kwargs(self.web_name, item)) + # raise Exception("Item in column 'ID' cannot be left empty") + else: + kwargs.append(self.client.get_update_list_item_kwargs(self.web_name, self.list_item_entity_type_full_name, item_id, item)) + self.client.process_batch(kwargs) + logger.info("{} items written".format(len(kwargs))) + def create_sharepoint_columns(self): """ Create the list's columns on SP, retrieve their SP id and map it to their DSS column name """ logger.info("create_sharepoint_columns") @@ -198,9 +265,12 @@ def create_sharepoint_columns(self): else: self.sharepoint_column_ids[dss_column_name] = dss_column_name - def build_row_dictionary(self, row): + def build_row_dictionary(self, row, columns_to_update=None): ret = {} for column, structure in zip(row, self.columns): + if columns_to_update: + if structure[SharePointConstants.NAME_COLUMN] not in columns_to_update: + continue key_to_use = self.sharepoint_existing_column_names.get( structure[SharePointConstants.NAME_COLUMN], self.sharepoint_column_ids[structure[SharePointConstants.NAME_COLUMN]] diff --git a/tests/python/integration/test_scenario.py b/tests/python/integration/test_scenario.py index b1d364a..7789940 100644 --- a/tests/python/integration/test_scenario.py +++ b/tests/python/integration/test_scenario.py @@ -39,6 +39,10 @@ def test_run_sharepoint_online_append_to_list_recipe(user_dss_clients): dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="APPENDTOLISTRECIPE") +def test_run_sharepoint_online_update_individual_list_rows(user_dss_clients): + dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="UPDATEINDIVIDUALLISTROWS") + + def test_run_sharepoint_online_write_file_in_path_w_ro_parent(user_dss_clients): dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="SC169288_WRITE_FILE_WITH_RO_PARENT_FOLDER")