FIX: slow read_off method (#353)

safesintesi · daavoo · web-flow · commit cab0c7f5a51f · 2023-11-23T12:16:47.000+01:00
* FIXED `read_off` method from `off.py`

ADDED `comment` parameter to skip comment rows and end-of-line comments
CHANGED to the C engine to drastically increase performance
ADDED `nrows` and REMOVED `skipfooter` so that the C engine can be used
ADDED `dtype` parameter to cast the element types directly
SET points `read_csv` to read from the open `off` buffer: no need to skip the first lines

* Deleted commented section

* FIX off.py

Fixed typo in parameter name
Added default value of 0 for n_points and n_faces

* REFACTOR changed deprecated jit to njit

* FIX ValueError when using `read_obj`

* Raise ValueError on empty file

* Update pyntcloud/io/obj.py

---------

Co-authored-by: Edoardo Bortolozzo <edoardo.bortolozzo@studenti.unipd.it>
Co-authored-by: David de la Iglesia Castro <daviddelaiglesiacastro@gmail.com>
diff --git a/pyntcloud/io/obj.py b/pyntcloud/io/obj.py
@@ -72,7 +72,8 @@ def read_obj(filename):
         for i in range(sum(c.isdigit() for c in f[0].split(" "))):
             mesh_columns.append("v{}".format(i + 1))
 
-    mesh = pd.DataFrame([re.split(r'\D+', x) for x in f], dtype='i4', columns=mesh_columns).astype('i4')
+    # passing dtype='i4' to the DataFrame constructor raises an error; casting afterwards with astype passes the tests
+    mesh = pd.DataFrame([re.split(r'\D+', x) for x in f], columns=mesh_columns).astype('i4')
     mesh -= 1  # index starts with 1 in obj file
 
     data["mesh"] = mesh
diff --git a/pyntcloud/io/off.py b/pyntcloud/io/off.py
@@ -8,9 +8,12 @@ def read_off(filename):
 
         first_line = off.readline()
         if "OFF" not in first_line:
-            raise ValueError('The file does not start whith the word OFF')
+            raise ValueError('The file does not start with the word OFF')
         color = True if "C" in first_line else False
 
+        n_points = 0
+        n_faces = 0
+
         count = 1
         for line in off:
             count += 1
@@ -22,22 +25,38 @@ def read_off(filename):
                 n_faces = int(line[1])
                 break
 
+        if (n_points == 0):
+            raise ValueError('The file has no points')
+
         data = {}
         point_names = ["x", "y", "z"]
-        if color:
-            point_names.extend(["red", "green", "blue"])
-
-        data["points"] = pd.read_csv(filename, sep=" ", header=None, engine="python",
-                                     skiprows=count, skipfooter=n_faces,
-                                     names=point_names, index_col=False)
-        for n in ["x", "y", "z"]:
-            data["points"][n] = data["points"][n].astype(np.float32)
+        point_types = {'x': np.float32, 'y': np.float32, 'z': np.float32}
 
         if color:
-            for n in ["red", "green", "blue"]:
-                data["points"][n] = data["points"][n].astype(np.uint8)
-
-        data["mesh"] = pd.read_csv(filename, sep=" ", header=None, engine="python",
-                                   skiprows=(count + n_points), usecols=[1, 2, 3],
-                                   names=["v1", "v2", "v3"])
+            point_names.extend(["red", "green", "blue"])
+            point_types = dict(point_types, **{'red': np.uint8, 'green': np.uint8, 'blue': np.uint8})
+
+        data["points"] = pd.read_csv(
+            off,
+            sep=" ",
+            header=None,
+            engine="c",
+            nrows=n_points,
+            names=point_names,
+            dtype=point_types,
+            index_col=False,
+            comment="#"
+        )
+
+        data["mesh"] = pd.read_csv(
+            filename,
+            sep=" ",
+            header=None,
+            engine="c",
+            skiprows=(count + n_points),
+            nrows=n_faces,
+            usecols=[1, 2, 3],
+            names=["v1", "v2", "v3"],
+            comment="#"
+        )
         return data
diff --git a/pyntcloud/utils/numba.py b/pyntcloud/utils/numba.py
@@ -1,21 +1,21 @@
-from numba import jit
+from numba import njit
 
 
-@jit
+@njit
 def groupby_count(xyz, indices, out):
     for i in range(xyz.shape[0]):
         out[indices[i]] += 1
     return out
 
 
-@jit
+@njit
 def groupby_sum(xyz, indices, N, out):
     for i in range(xyz.shape[0]):
         out[indices[i]] += xyz[i][N]
     return out
 
 
-@jit
+@njit
 def groupby_max(xyz, indices, N, out):
     for i in range(xyz.shape[0]):
         if xyz[i][N] > out[indices[i]]: