Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
0d0dbea
feat(logic): strip white space;
JVickery-TBS May 7, 2024
616c948
fix(logic): strip white space for load table;
JVickery-TBS May 7, 2024
860ca9e
fix(logic): strip white space for load table;
JVickery-TBS May 7, 2024
88f96a8
fix(logic): strip white space;
JVickery-TBS May 8, 2024
341ec1e
Merge branch 'master' into feature/strip-white-space
JVickery-TBS May 8, 2024
21a1ece
fix(tests): new code for tests;
JVickery-TBS May 8, 2024
a6ab0a0
fix(logic): load csv white space;
JVickery-TBS May 12, 2024
54f87e0
feat(logic): added `strip_extra_white` info field;
JVickery-TBS May 14, 2024
50080ea
feat(logic): added `strip_extra_white` field;
JVickery-TBS May 14, 2024
116c29f
fix(logic): minor logic fixes;
JVickery-TBS May 14, 2024
43b9f94
feat(tests,i18n): updated tests;
JVickery-TBS Jul 15, 2024
06ee48a
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Jul 15, 2024
c00fb5a
fix(tests,logic): misc fixes;
JVickery-TBS Jul 15, 2024
669930e
fix(tests,logic): new output and parody;
JVickery-TBS Jul 15, 2024
3263bab
fix(logic): ckan versioning;
JVickery-TBS Jul 16, 2024
7cb6a84
fix(logic): ckan versioning;
JVickery-TBS Jul 16, 2024
bf2e939
feat(templates,logic): pre-datadictionary implement;
JVickery-TBS Jul 16, 2024
d6de1b1
feat(tests): add coverage;
JVickery-TBS Jul 22, 2024
8564cdd
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Jul 23, 2024
9b65844
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Dec 9, 2024
bf7efc6
fix(tests): ds keys;
JVickery-TBS Dec 9, 2024
4a35fc8
fix(tests): datadictionary;
JVickery-TBS Dec 9, 2024
4886902
debug tests...
JVickery-TBS Dec 9, 2024
6587421
debug tests...
JVickery-TBS Dec 9, 2024
20a33f6
fix(logic): loader;
JVickery-TBS Dec 9, 2024
e4aac5d
fix(logic): datadict versions;
JVickery-TBS Dec 9, 2024
dd137f3
fix(templates): selected value;
JVickery-TBS Dec 9, 2024
c7fb399
feat(misc): readme, changelog;
JVickery-TBS Dec 9, 2024
ec8ef04
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Dec 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
================

Feat:
* Adds Strip White Space fields to the Data Dictionary (defaults to `True` for each field). This will strip surrounding white space from data values prior to inserting them into the database.
* Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report` controls whether a resource requires a successful validation report to be XLoadered. By default, a resource would also require a Validation Schema, which can be turned off with `ckanext.xloader.validation.enforce_schema`.


Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,12 @@ Default value: `True`

Controls whether or not a resource requires a Validation Schema to be present from the ckanext-validation plugin to be XLoadered.

## Data Dictionary Fields

#### strip_extra_white

This plugin adds the `Strip Extra Leading and Trailing White Space` field to Data Dictionary fields. This controls whether or not to trim whitespace from data values prior to inserting into the database. Default for each field is `True` (it will trim whitespace).

## Developer installation

To install XLoader for development, activate your CKAN virtualenv and in
Expand Down
78 changes: 63 additions & 15 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,28 +171,23 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
csv_filepath = f_write.name

# datastore db connection
engine = get_write_engine()

# get column info from existing table
existing = datastore_resource_exists(resource_id)
existing_info = {}
if existing:
existing_fields = existing.get('fields', [])
if p.toolkit.check_ckan_version(min_version='2.11'):
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
else:
existing_fields = existing.get('fields', [])
existing_info = dict((f['id'], f['info'])
for f in existing_fields
if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Column types are either set (overridden) in the Data Dictionary page
# or default to text type (which is robust)
Expand All @@ -207,6 +202,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
for f in fields:
if f['id'] in existing_info:
f['info'] = existing_info[f['id']]
f['strip_extra_white'] = existing_info[f['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[f['id']] \
else existing_fields_by_headers[f['id']].get('strip_extra_white', True)

'''
Delete or truncate existing datastore table before proceeding,
Expand All @@ -223,11 +220,43 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
else:
fields = [
{'id': header_name,
'type': 'text'}
'type': 'text',
'strip_extra_white': True,}
for header_name in headers]

logger.info('Fields: %s', fields)

save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
if len(row) == len(fields):
for _index, _cell in enumerate(row):
# only strip white space if strip_extra_white is True
if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
if len(row) == len(fields):
for _index, _cell in enumerate(row):
# only strip white space if strip_extra_white is True
if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
csv_filepath = f_write.name

# Create table
from ckan import model
context = {'model': model, 'ignore_auth': True}
Expand Down Expand Up @@ -383,10 +412,16 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
existing = datastore_resource_exists(resource_id)
existing_info = None
if existing:
existing_fields = existing.get('fields', [])
if p.toolkit.check_ckan_version(min_version='2.11'):
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
else:
existing_fields = existing.get('fields', [])
existing_info = dict(
(f['id'], f['info'])
for f in existing_fields if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
Expand All @@ -400,6 +435,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
strict_guessing = p.toolkit.asbool(
config.get('ckanext.xloader.strict_type_guessing', True))
types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing)
fields = []

# override with types user requested
if existing_info:
Expand All @@ -410,6 +446,12 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
'timestamp': datetime.datetime,
}.get(existing_info.get(h, {}).get('type_override'), t)
for t, h in zip(types, headers)]
for h in headers:
fields.append(existing_fields_by_headers.get(h, {}))
else:
# default strip_extra_white
for h in headers:
fields.append({'strip_extra_white': True})

# Strip leading and trailing whitespace, then truncate to maximum length,
# then strip again in case the truncation exposed a space.
Expand All @@ -419,7 +461,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
if header and header.strip()
]
header_count = len(headers)
type_converter = TypeConverter(types=types)
type_converter = TypeConverter(types=types, fields=fields)

with UnknownEncodingStream(table_filepath, file_format, decoding_result,
skip_rows=skip_rows,
Expand Down Expand Up @@ -451,10 +493,16 @@ def row_iterator():
for h in headers_dicts:
if h['id'] in existing_info:
h['info'] = existing_info[h['id']]
h['strip_extra_white'] = existing_info[h['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[h['id']] \
else existing_fields_by_headers[h['id']].get('strip_extra_white', True)
# create columns with types user requested
type_override = existing_info[h['id']].get('type_override')
if type_override in list(_TYPE_MAPPING.values()):
h['type'] = type_override
else:
# default strip_extra_white
for h in headers_dicts:
h['strip_extra_white'] = True

# preserve any types that we have sniffed unless told otherwise
_save_type_overrides(headers_dicts)
Expand Down
12 changes: 11 additions & 1 deletion ckanext/xloader/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ class TypeConverter:
as desired.
"""

def __init__(self, types=None):
def __init__(self, types=None, fields=None):
self.types = types
self.fields = fields

def convert_types(self, extended_rows):
""" Try converting cells to numbers or timestamps if applicable.
Expand All @@ -31,7 +32,16 @@ def convert_types(self, extended_rows):
for cell_index, cell_value in enumerate(row):
if cell_value is None:
row[cell_index] = ''
if self.fields:
# only strip white space if strip_extra_white is True
if self.fields[cell_index].get('info', {}).get('strip_extra_white', True) and isinstance(cell_value, six.text_type):
cell_value = cell_value.strip()
row[cell_index] = cell_value.strip()
if not cell_value:
# load_csv parity: empty of string type should be None
if self.types and self.types[cell_index] == six.text_type:
cell_value = None
row[cell_index] = None
continue
cell_type = self.types[cell_index] if self.types else None
if cell_type in [Decimal, None]:
Expand Down
25 changes: 25 additions & 0 deletions ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
def config_declarations(cls):
return cls

if toolkit.check_ckan_version(min_version='2.11'):
from ckanext.datastore.interfaces import IDataDictionaryForm
has_idata_dictionary_form = True
else:
has_idata_dictionary_form = False

log = logging.getLogger(__name__)


Expand All @@ -39,6 +45,8 @@ class xloaderPlugin(plugins.SingletonPlugin):
plugins.implements(plugins.IResourceController, inherit=True)
plugins.implements(plugins.IClick)
plugins.implements(plugins.IBlueprint)
if has_idata_dictionary_form:
plugins.implements(IDataDictionaryForm, inherit=True)
if HAS_IPIPE_VALIDATION:
plugins.implements(IPipeValidation)

Expand Down Expand Up @@ -252,6 +260,23 @@ def get_helpers(self):
"xloader_badge": xloader_helpers.xloader_badge,
}

# IDataDictionaryForm

def update_datastore_create_schema(self, schema):
default = toolkit.get_validator('default')
boolean_validator = toolkit.get_validator('boolean_validator')
to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data')
schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')]
return schema

def update_datastore_info_field(self, field, plugin_data):
# expose all our non-secret plugin data in the field
field.update(plugin_data.get('xloader', {}))
# CKAN version parity
if '_info' in plugin_data:
field.update({'info': plugin_data['_info']})
return field


def _should_remove_unsupported_resource_from_datastore(res_dict):
if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)):
Expand Down
17 changes: 17 additions & 0 deletions ckanext/xloader/templates/datastore/snippets/dictionary_form.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{# Extends the core datastore Data Dictionary form to add a
   "Strip Extra Leading and Trailing White Space" selector per field. #}
{% ckan_extends %}
{% import 'macros/form.html' as form %}

{% block additional_fields %}
{{ super() }}
{# Field name prefix differs by CKAN version: 2.11+ submits dictionary
   fields under 'fields__', earlier versions under 'info__'. #}
{% if h.check_ckan_version(min_version='2.11') %}
{% set field_prefix = 'fields__' %}
{% else %}
{% set field_prefix = 'info__' %}
{% endif %}
{# Prefer a value stored in the field's 'info' dict, then a top-level
   'strip_extra_white' key, defaulting to true (strip whitespace). #}
{% set selected_value = field.get('info', {}).get('strip_extra_white', field.get('strip_extra_white', true)) %}
{{ form.select(field_prefix ~ position ~ '__strip_extra_white',
label=_('Strip Extra Leading and Trailing White Space'), options=[
{'text': _('Yes'), 'value': true},
{'text': _('No'), 'value': false},
], selected=selected_value) }}
{% endblock %}
8 changes: 4 additions & 4 deletions ckanext/xloader/tests/samples/boston_311_sample.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
2 changes: 1 addition & 1 deletion ckanext/xloader/tests/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_xloader_data_into_datastore(self, cli, data):
with mock.patch("ckanext.xloader.jobs.get_response", get_response):
stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout
assert "Fields: [{'id': 'x', 'type': 'text'}, {'id': 'y', 'type': 'text'}]" in stdout
assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout
assert "Copying to database..." in stdout
assert "Creating search index..." in stdout
assert "Express Load completed" in stdout
Expand Down
Loading
Loading