Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
0d0dbea
feat(logic): strip white space;
JVickery-TBS May 7, 2024
616c948
fix(logic): strip white space for load table;
JVickery-TBS May 7, 2024
860ca9e
fix(logic): strip white space for load table;
JVickery-TBS May 7, 2024
88f96a8
fix(logic): strip white space;
JVickery-TBS May 8, 2024
341ec1e
Merge branch 'master' into feature/strip-white-space
JVickery-TBS May 8, 2024
21a1ece
fix(tests): new code for tests;
JVickery-TBS May 8, 2024
a6ab0a0
fix(logic): load csv white space;
JVickery-TBS May 12, 2024
54f87e0
feat(logic): added `strip_extra_white` info field;
JVickery-TBS May 14, 2024
50080ea
feat(logic): added `strip_extra_white` field;
JVickery-TBS May 14, 2024
116c29f
fix(logic): minor logic fixes;
JVickery-TBS May 14, 2024
43b9f94
feat(tests,i18n): updated tests;
JVickery-TBS Jul 15, 2024
06ee48a
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Jul 15, 2024
c00fb5a
fix(tests,logic): misc fixes;
JVickery-TBS Jul 15, 2024
669930e
fix(tests,logic): new output and parody;
JVickery-TBS Jul 15, 2024
3263bab
fix(logic): ckan versioning;
JVickery-TBS Jul 16, 2024
7cb6a84
fix(logic): ckan versioning;
JVickery-TBS Jul 16, 2024
bf2e939
feat(templates,logic): pre-datadictionary implement;
JVickery-TBS Jul 16, 2024
d6de1b1
feat(tests): add coverage;
JVickery-TBS Jul 22, 2024
8564cdd
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Jul 23, 2024
9b65844
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Dec 9, 2024
bf7efc6
fix(tests): ds keys;
JVickery-TBS Dec 9, 2024
4a35fc8
fix(tests): datadictionary;
JVickery-TBS Dec 9, 2024
4886902
debug tests...
JVickery-TBS Dec 9, 2024
6587421
debug tests...
JVickery-TBS Dec 9, 2024
20a33f6
fix(logic): loader;
JVickery-TBS Dec 9, 2024
e4aac5d
fix(logic): datadict versions;
JVickery-TBS Dec 9, 2024
dd137f3
fix(templates): selected value;
JVickery-TBS Dec 9, 2024
c7fb399
feat(misc): readme, changelog;
JVickery-TBS Dec 9, 2024
ec8ef04
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Dec 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
================

Feat:
* Adds Strip White Space fields to the Data Dictionary (defaults to `True` for each field). This will strip surrounding white space from data values prior to inserting them into the database.
* Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report` controls whether a resource requires a successful validation report to be XLoadered. By default, a resource would also require a Validation Schema, which can be turned off with `ckanext.xloader.validation.enforce_schema`.


Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,12 @@ Default value: `True`

Controls whether or not a resource requires a Validation Schema to be present from the ckanext-validation plugin to be XLoadered.

## Data Dictionary Fields

#### strip_extra_white

This plugin adds the `Strip Extra Leading and Trailing White Space` field to Data Dictionary fields. This controls whether or not to trim whitespace from data values prior to inserting into the database. Default for each field is `True` (it will trim whitespace).

## Developer installation

To install XLoader for development, activate your CKAN virtualenv and in
Expand Down
78 changes: 63 additions & 15 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,28 +171,23 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
csv_filepath = f_write.name

# datastore db connection
engine = get_write_engine()

# get column info from existing table
existing = datastore_resource_exists(resource_id)
existing_info = {}
if existing:
existing_fields = existing.get('fields', [])
if p.toolkit.check_ckan_version(min_version='2.11'):
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
else:
existing_fields = existing.get('fields', [])
existing_info = dict((f['id'], f['info'])
for f in existing_fields
if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Column types are either set (overridden) in the Data Dictionary page
# or default to text type (which is robust)
Expand All @@ -207,6 +202,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
for f in fields:
if f['id'] in existing_info:
f['info'] = existing_info[f['id']]
f['strip_extra_white'] = existing_info[f['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[f['id']] \
else existing_fields_by_headers[f['id']].get('strip_extra_white', True)

'''
Delete or truncate existing datastore table before proceeding,
Expand All @@ -223,11 +220,43 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
else:
fields = [
{'id': header_name,
'type': 'text'}
'type': 'text',
'strip_extra_white': True,}
for header_name in headers]

logger.info('Fields: %s', fields)

save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
if len(row) == len(fields):
for _index, _cell in enumerate(row):
# only strip white space if strip_extra_white is True
if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
if len(row) == len(fields):
for _index, _cell in enumerate(row):
# only strip white space if strip_extra_white is True
if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
csv_filepath = f_write.name

# Create table
from ckan import model
context = {'model': model, 'ignore_auth': True}
Expand Down Expand Up @@ -383,10 +412,16 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
existing = datastore_resource_exists(resource_id)
existing_info = None
if existing:
existing_fields = existing.get('fields', [])
if p.toolkit.check_ckan_version(min_version='2.11'):
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
else:
existing_fields = existing.get('fields', [])
existing_info = dict(
(f['id'], f['info'])
for f in existing_fields if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
Expand All @@ -400,6 +435,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
strict_guessing = p.toolkit.asbool(
config.get('ckanext.xloader.strict_type_guessing', True))
types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing)
fields = []

# override with types user requested
if existing_info:
Expand All @@ -410,6 +446,12 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
'timestamp': datetime.datetime,
}.get(existing_info.get(h, {}).get('type_override'), t)
for t, h in zip(types, headers)]
for h in headers:
fields.append(existing_fields_by_headers.get(h, {}))
else:
# default strip_extra_white
for h in headers:
fields.append({'strip_extra_white': True})

# Strip leading and trailing whitespace, then truncate to maximum length,
# then strip again in case the truncation exposed a space.
Expand All @@ -419,7 +461,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
if header and header.strip()
]
header_count = len(headers)
type_converter = TypeConverter(types=types)
type_converter = TypeConverter(types=types, fields=fields)

with UnknownEncodingStream(table_filepath, file_format, decoding_result,
skip_rows=skip_rows,
Expand Down Expand Up @@ -451,10 +493,16 @@ def row_iterator():
for h in headers_dicts:
if h['id'] in existing_info:
h['info'] = existing_info[h['id']]
h['strip_extra_white'] = existing_info[h['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[h['id']] \
else existing_fields_by_headers[h['id']].get('strip_extra_white', True)
# create columns with types user requested
type_override = existing_info[h['id']].get('type_override')
if type_override in list(_TYPE_MAPPING.values()):
h['type'] = type_override
else:
# default strip_extra_white
for h in headers_dicts:
h['strip_extra_white'] = True

# preserve any types that we have sniffed unless told otherwise
_save_type_overrides(headers_dicts)
Expand Down
12 changes: 11 additions & 1 deletion ckanext/xloader/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ class TypeConverter:
as desired.
"""

def __init__(self, types=None):
def __init__(self, types=None, fields=None):
self.types = types
self.fields = fields

def convert_types(self, extended_rows):
""" Try converting cells to numbers or timestamps if applicable.
Expand All @@ -31,7 +32,16 @@ def convert_types(self, extended_rows):
for cell_index, cell_value in enumerate(row):
if cell_value is None:
row[cell_index] = ''
if self.fields:
# only strip white space if strip_extra_white is True
if self.fields[cell_index].get('info', {}).get('strip_extra_white', True) and isinstance(cell_value, six.text_type):
cell_value = cell_value.strip()
row[cell_index] = cell_value.strip()
if not cell_value:
# load_csv parity: empty of string type should be None
if self.types and self.types[cell_index] == six.text_type:
cell_value = None
row[cell_index] = None
continue
cell_type = self.types[cell_index] if self.types else None
if cell_type in [Decimal, None]:
Expand Down
25 changes: 25 additions & 0 deletions ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
def config_declarations(cls):
return cls

if toolkit.check_ckan_version(min_version='2.11'):
from ckanext.datastore.interfaces import IDataDictionaryForm
has_idata_dictionary_form = True
else:
has_idata_dictionary_form = False

log = logging.getLogger(__name__)


Expand All @@ -39,6 +45,8 @@ class xloaderPlugin(plugins.SingletonPlugin):
plugins.implements(plugins.IResourceController, inherit=True)
plugins.implements(plugins.IClick)
plugins.implements(plugins.IBlueprint)
if has_idata_dictionary_form:
plugins.implements(IDataDictionaryForm, inherit=True)
if HAS_IPIPE_VALIDATION:
plugins.implements(IPipeValidation)

Expand Down Expand Up @@ -252,6 +260,23 @@ def get_helpers(self):
"xloader_badge": xloader_helpers.xloader_badge,
}

# IDataDictionaryForm

def update_datastore_create_schema(self, schema):
default = toolkit.get_validator('default')
boolean_validator = toolkit.get_validator('boolean_validator')
to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data')
schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')]
return schema

def update_datastore_info_field(self, field, plugin_data):
# expose all our non-secret plugin data in the field
field.update(plugin_data.get('xloader', {}))
# CKAN version parity
if '_info' in plugin_data:
field.update({'info': plugin_data['_info']})
return field


def _should_remove_unsupported_resource_from_datastore(res_dict):
if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)):
Expand Down
17 changes: 17 additions & 0 deletions ckanext/xloader/templates/datastore/snippets/dictionary_form.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{# Extends the core datastore Data Dictionary form to add a
   "Strip Extra Leading and Trailing White Space" selector per field. #}
{% ckan_extends %}
{% import 'macros/form.html' as form %}

{% block additional_fields %}
{{ super() }}
{# Field name prefix differs by CKAN version: 2.11+ submits dictionary
   fields under 'fields__', earlier versions under 'info__'. #}
{% if h.check_ckan_version(min_version='2.11') %}
{% set field_prefix = 'fields__' %}
{% else %}
{% set field_prefix = 'info__' %}
{% endif %}
{# Prefer a value stored in the field's 'info' dict, then a top-level
   'strip_extra_white' key, defaulting to true (strip whitespace). #}
{% set selected_value = field.get('info', {}).get('strip_extra_white', field.get('strip_extra_white', true)) %}
{{ form.select(field_prefix ~ position ~ '__strip_extra_white',
label=_('Strip Extra Leading and Trailing White Space'), options=[
{'text': _('Yes'), 'value': true},
{'text': _('No'), 'value': false},
], selected=selected_value) }}
{% endblock %}
8 changes: 4 additions & 4 deletions ckanext/xloader/tests/samples/boston_311_sample.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
2 changes: 1 addition & 1 deletion ckanext/xloader/tests/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_xloader_data_into_datastore(self, cli, data):
with mock.patch("ckanext.xloader.jobs.get_response", get_response):
stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout
assert "Fields: [{'id': 'x', 'type': 'text'}, {'id': 'y', 'type': 'text'}]" in stdout
assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout
assert "Copying to database..." in stdout
assert "Creating search index..." in stdout
assert "Express Load completed" in stdout
Expand Down
Loading
Loading