Add tabdata encoding (#466)

enekomartinmartinez · enekomartinmartinez · commit 2cd61a0d1f68 · 2025-03-23T15:42:40.000+01:00
* Allow passing encoding for TabData files
diff --git a/docs/whats_new.rst b/docs/whats_new.rst
@@ -1,6 +1,6 @@
 What's New
 ==========
-v3.14.3 (2025/XX/XX)
+v3.14.3 (2025/03/XX)
 --------------------
 New Features
 ~~~~~~~~~~~~
@@ -23,6 +23,7 @@ Performance
 
 Internal Changes
 ~~~~~~~~~~~~~~~~
+- Allow passing encoding for TabData files (:issue:`455`). (`@enekomartinmartinez <https://github.com/enekomartinmartinez>`_)
 
 v3.14.2 (2024/11/12)
 --------------------
diff --git a/pysd/py_backend/data.py b/pysd/py_backend/data.py
@@ -125,8 +125,9 @@ def get_columns(cls, file_name, vars=None, encoding=None):
         file_name: str
             Output file to read. Must be csv or tab.
 
-        vars: list
-            List of var names to find in the file.
+        vars: list or None (optional)
+            List of var names to find in the file. If None all variables
+            will be returned. Default is None.
 
         encoding: str or None (optional)
             Encoding type to read output file. Needed if the file has special
@@ -268,14 +269,20 @@ def __init__(self, real_name, py_name, coords, interp="interpolate"):
                              + "'raw', 'interpolate', "
                              + "'look_forward' or 'hold_backward'")
 
-    def load_data(self, file_names):
+    def load_data(self, file_names, encoding=None):
         """
         Load data values from files.
 
         Parameters
         ----------
         file_names: list or str or pathlib.Path
             Name of the files to search the variable in.
+        encoding: list or str or None (optional)
+            Encoding to be used by the data readers. If a list is given,
+            then file_names should be a list of the same length. If
+            None or a string is given, this value will be used for all
+            of them. See documentation from pandas.read_table for
+            further information. Default is None.
 
         Returns
         -------
@@ -285,9 +292,11 @@ def load_data(self, file_names):
         """
         if isinstance(file_names, (str, Path)):
             file_names = [file_names]
+        if isinstance(encoding, str) or encoding is None:
+            encoding = [encoding]*len(file_names)
 
-        for file_name in file_names:
-            self.data = self._load_data(Path(file_name))
+        for file_name, encoding_df in zip(file_names, encoding):
+            self.data = self._load_data(Path(file_name), encoding_df)
             if self.data is not None:
                 break
 
@@ -297,7 +306,7 @@ def load_data(self, file_names):
                 f"Data for {self.real_name} not found in "
                 f"{', '.join([str(file_name) for file_name in file_names])}")
 
-    def _load_data(self, file_name):
+    def _load_data(self, file_name, encoding):
         """
         Load data values from output
 
@@ -317,7 +326,10 @@ def _load_data(self, file_name):
         if file_name.suffix in [".csv", ".tab"]:
 
             columns, transpose = Columns.get_columns(
-                file_name, vars=[self.real_name, self.py_name])
+                file_name,
+                vars=[self.real_name, self.py_name],
+                encoding=encoding
+            )
 
             if not columns:
                 # the variable is not in the passed file
diff --git a/pysd/py_backend/model.py b/pysd/py_backend/model.py
@@ -73,7 +73,7 @@ class Macro(DynamicStateful):
     """
     def __init__(self, py_model_file, params=None, return_func=None,
                  time=None, time_initialization=None, data_files=None,
-                 py_name=None):
+                 data_files_encoding=None, py_name=None):
         super().__init__()
         self.time = time
         self.time_initialization = time_initialization
@@ -158,7 +158,7 @@ def __init__(self, py_model_file, params=None, return_func=None,
 
         # Load data files
         if data_files:
-            self._get_data(data_files)
+            self._get_data(data_files, data_files_encoding)
 
         # Assign the cache type to each variable
         self._assign_cache_type()
@@ -221,14 +221,19 @@ def clean_caches(self):
         # if nested macros
         [macro.clean_caches() for macro in self._macro_elements]
 
-    def _get_data(self, data_files):
+    def _get_data(self, data_files, encoding):
+        """Load Data for TabData objects"""
         if isinstance(data_files, dict):
             for data_file, vars in data_files.items():
+                if isinstance(encoding, dict):
+                    encoding_df = encoding.get(data_file, None)
+                else:
+                    encoding_df = encoding
                 for var in vars:
                     found = False
                     for element in self._data_elements:
                         if var in [element.py_name, element.real_name]:
-                            element.load_data(data_file)
+                            element.load_data(data_file, encoding_df)
                             found = True
                             break
                     if not found:
@@ -237,7 +242,7 @@ def _get_data(self, data_files):
 
         else:
             for element in self._data_elements:
-                element.load_data(data_files)
+                element.load_data(data_files, encoding)
 
     def _get_initialize_order(self):
         """
@@ -1396,11 +1401,13 @@ class Model(Macro):
     :class:`pysd.py_backend.model.Macro`
 
     """
-    def __init__(self, py_model_file, data_files, initialize, missing_values):
+    def __init__(self, py_model_file, data_files, data_files_encoding,
+                 initialize, missing_values):
         """ Sets up the Python objects """
         super().__init__(py_model_file, None, None, Time(),
                          data_files=data_files)
         self.data_files = data_files
+        self.data_files_encoding = data_files_encoding
         self.missing_values = missing_values
         # set time component
         self.time.stage = 'Load'
@@ -2159,6 +2166,7 @@ def copy(self, reload=False):
         new_model = type(self)(
             py_model_file=deepcopy(self.py_model_file),
             data_files=deepcopy(self.data_files),
+            data_files_encoding=deepcopy(self.data_files_encoding),
             initialize=initialize,
             missing_values=deepcopy(self.missing_values)
         )
@@ -2194,6 +2202,7 @@ def reload(self):
 
         """
         self.__init__(self.py_model_file, data_files=self.data_files,
+                      data_files_encoding=self.data_files_encoding,
                       initialize=True,
                       missing_values=self.missing_values)
 
diff --git a/pysd/pysd.py b/pysd/pysd.py
@@ -24,8 +24,8 @@
     )
 
 
-def read_xmile(xmile_file, data_files=None, initialize=True,
-               missing_values="warning"):
+def read_xmile(xmile_file, data_files=None, data_files_encoding=None,
+               initialize=True, missing_values="warning"):
     """
     Construct a model from a Xmile file.
 
@@ -38,9 +38,20 @@ def read_xmile(xmile_file, data_files=None, initialize=True,
         If False, the model will not be initialize when it is loaded.
         Default is True.
 
-    data_files: list or str or None (optional)
-        If given the list of files where the necessary data to run the model
-        is given. Default is None.
+    data_files: dict or list or str or None
+        The dictionary with keys the name of file and variables to
+        load the data from there. Or the list of names or name of the
+        file to search the data in. Only works for TabData type object
+        and it is necessary to provide it. Default is None.
+
+    data_files_encoding: list or str or dict or None (optional)
+        Encoding for data_files. If a string or None is passed this
+        value will be used for all the files. If data_files is a list,
+        a list of the same length could be used to specify different
+        encodings. If data_files is a dictionary, a dictionary with the
+        same keys could be used, being the values the encodings. See
+        documentation from pandas.read_table for further information.
+        Default is None.
 
     missing_values: str ("warning", "error", "ignore", "keep") (optional)
         What to do with missing values. If "warning" (default)
@@ -75,15 +86,20 @@ def read_xmile(xmile_file, data_files=None, initialize=True,
     py_model_file = ModelBuilder(abs_model).build_model()
 
     # load Python file
-    model = load(py_model_file, data_files, initialize, missing_values)
+    model = load(
+        py_model_file,
+        data_files, data_files_encoding,
+        initialize,
+        missing_values
+    )
     model.xmile_file = str(xmile_file)
 
     return model
 
 
-def read_vensim(mdl_file, data_files=None, initialize=True,
-                missing_values="warning", split_views=False,
-                encoding=None, **kwargs):
+def read_vensim(mdl_file, data_files=None, data_files_encoding=None,
+                initialize=True, missing_values="warning",
+                split_views=False, encoding=None, **kwargs):
     """
     Construct a model from Vensim `.mdl` file.
 
@@ -96,9 +112,29 @@ def read_vensim(mdl_file, data_files=None, initialize=True,
         If False, the model will not be initialize when it is loaded.
         Default is True.
 
-    data_files: list or str or None (optional)
-        If given the list of files where the necessary data to run the model
-        is given. Default is None.
+    data_files: dict or list or str or None
+        The dictionary with keys the name of file and variables to
+        load the data from there. Or the list of names or name of the
+        file to search the data in. Only works for TabData type object
+        and it is necessary to provide it. Default is None.
+
+    data_files_encoding: list or str or dict or None (optional)
+        Encoding for data_files. If a string or None is passed this
+        value will be used for all the files. If data_files is a list,
+        a list of the same length could be used to specify different
+        encodings. If data_files is a dictionary, a dictionary with the
+        same keys could be used, being the values the encodings. See
+        documentation from pandas.read_table for further information.
+        Default is None.
+
+    data_files_encoding: list or str or dict or None (optional)
+        Encoding for data_files. If a string or None is passed this
+        value will be used for all the files. If data_files is a list,
+        a list of the same length could be used to specify different
+        encodings. If data_files is a dictionary, a dictionary with the
+        same keys could be used, being the values the encodings. See
+        documentation from pandas.read_table for further information.
+        Default is None.
 
     missing_values: str ("warning", "error", "ignore", "keep") (optional)
         What to do with missing values. If "warning" (default)
@@ -155,14 +191,19 @@ def read_vensim(mdl_file, data_files=None, initialize=True,
     py_model_file = ModelBuilder(abs_model).build_model()
 
     # load Python file
-    model = load(py_model_file, data_files, initialize, missing_values)
+    model = load(
+        py_model_file,
+        data_files, data_files_encoding,
+        initialize,
+        missing_values
+    )
     model.mdl_file = str(mdl_file)
 
     return model
 
 
-def load(py_model_file, data_files=None, initialize=True,
-         missing_values="warning"):
+def load(py_model_file, data_files=None, data_files_encoding=None,
+         initialize=True, missing_values="warning"):
     """
     Load a Python-converted model file.
 
@@ -182,6 +223,15 @@ def load(py_model_file, data_files=None, initialize=True,
         file to search the data in. Only works for TabData type object
         and it is neccessary to provide it. Default is None.
 
+    data_files_encoding: list or str or dict or None (optional)
+        Encoding for data_files. If a string or None is passed this
+        value will be used for all the files. If data_files is a list,
+        a list of the same length could be used to specify different
+        encodings. If data_files is a dictionary, a dictionary with the
+        same keys could be used, being the values the encodings. See
+        documentation from pandas.read_table for further information.
+        Default is None.
+
     missing_values : str ("warning", "error", "ignore", "keep") (optional)
         What to do with missing values. If "warning" (default)
         shows a warning message and interpolates the values.
@@ -195,4 +245,9 @@ def load(py_model_file, data_files=None, initialize=True,
     >>> model = load('../tests/test-models/samples/teacup/teacup.py')
 
     """
-    return Model(py_model_file, data_files, initialize, missing_values)
+    return Model(
+        py_model_file,
+        data_files, data_files_encoding,
+        initialize,
+        missing_values
+    )
diff --git a/tests/pytest_types/external/pytest_external.py b/tests/pytest_types/external/pytest_external.py
@@ -1382,7 +1382,9 @@ def test_constant_0d(self, _root):
         data2.initialize()
 
         assert data() == -1
+        assert type(data()) == float
         assert data2() == 0
+        assert type(data2()) == float
 
     def test_constant_n0d(self, _root):
         """