Improve error messages (#57)

jeffreylovitz · web-flow · commit 26ac800f4cbd · 2021-04-23T10:11:14.000-04:00
* Accept NULL values in CSVs with schemas

* Report filename and line number in exceptions
diff --git a/redisgraph_bulk_loader/entity_file.py b/redisgraph_bulk_loader/entity_file.py
@@ -65,6 +65,11 @@ def typed_prop_to_binary(prop_val, prop_type):
     # Remove leading and trailing whitespace
     prop_val = prop_val.strip()
 
+    if prop_val == "":
+        # An empty string indicates a NULL property.
+        # TODO: This is not allowed in Cypher; consider how to handle it here rather than in-module.
+        return struct.pack(format_str, 0)
+
     # TODO allow ID type specification
     if prop_type == Type.LONG:
         try:
@@ -107,7 +112,7 @@ def typed_prop_to_binary(prop_val, prop_type):
         return array_prop_to_binary(format_str, prop_val)
 
     # If it hasn't returned by this point, it is trying to set it to a type that it can't adopt
-    raise Exception("unable to parse [" + prop_val + "] with type ["+repr(prop_type)+"]")
+    raise SchemaError("unable to parse [" + prop_val + "] with type ["+repr(prop_type)+"]")
 
 
 # Convert a single CSV property field with an inferred type into a binary stream.
@@ -227,14 +232,14 @@ def convert_header_with_schema(self, header):
             # Multiple colons found in column name, emit error.
             # TODO might need to check for backtick escapes
             if len(pair) > 2:
-                raise CSVError("Field '%s' had %d colons" % field, len(field))
+                raise CSVError("%s: Field '%s' had %d colons" % (self.infile.name, field, len(field)))
 
             # Convert the column type.
             col_type = convert_schema_type(pair[1].upper().strip())
 
             # If the column did not have a name but the type requires one, emit an error.
             if len(pair[0]) == 0 and col_type not in (Type.ID, Type.START_ID, Type.END_ID, Type.IGNORE):
-                raise SchemaError("Each property in the header should be a colon-separated pair")
+                raise SchemaError("%s: Each property in the header should be a colon-separated pair" % (self.infile.name))
             else:
                 # We have a column name and a type.
                 # Only store the name if the column's values should be added as properties.
diff --git a/redisgraph_bulk_loader/label.py b/redisgraph_bulk_loader/label.py
@@ -63,7 +63,11 @@ def process_entities(self):
                         id_field = self.id_namespace + '.' + str(id_field)
                     self.update_node_dictionary(id_field)
 
-                row_binary = self.pack_props(row)
+                try:
+                    row_binary = self.pack_props(row)
+                except SchemaError as e:
+                    # TODO: Why is line_num off by one?
+                    raise SchemaError("%s:%d %s" % (self.infile.name, self.reader.line_num - 1, str(e)))
                 row_binary_len = len(row_binary)
                 # If the addition of this entity will make the binary token grow too large,
                 # send the buffer now.
diff --git a/redisgraph_bulk_loader/relation_type.py b/redisgraph_bulk_loader/relation_type.py
@@ -61,12 +61,16 @@ def process_entities(self):
                     src = self.query_buffer.nodes[start_id]
                     dest = self.query_buffer.nodes[end_id]
                 except KeyError as e:
-                    print("Relationship specified a non-existent identifier. src: %s; dest: %s" % (row[self.start_id], row[self.end_id]))
+                    print("%s:%d Relationship specified a non-existent identifier. src: %s; dest: %s" %
+                          (self.infile.name, self.reader.line_num - 1, row[self.start_id], row[self.end_id]))
                     if self.config.skip_invalid_edges is False:
                         raise e
                     continue
                 fmt = "=QQ" # 8-byte unsigned ints for src and dest
-                row_binary = struct.pack(fmt, src, dest) + self.pack_props(row)
+                try:
+                    row_binary = struct.pack(fmt, src, dest) + self.pack_props(row)
+                except SchemaError as e:
+                    raise SchemaError("%s:%d %s" % (self.infile.name, self.reader.line_num, str(e)))
                 row_binary_len = len(row_binary)
                 # If the addition of this entity will make the binary token grow too large,
                 # send the buffer now.
diff --git a/test/test_bulk_loader.py b/test/test_bulk_loader.py
@@ -660,7 +660,7 @@ def test16_error_on_schema_failure(self):
         except Exception as e:
             # Verify that the correct exception is raised.
             self.assertEqual(sys.exc_info()[0].__name__, 'SchemaError')
-            self.assertIn("Could not parse 'strval' as an array", e.args)
+            self.assertIn("Could not parse 'strval' as an array", str(e))
 
     def test17_ensure_index_is_created(self):
         graphname = "index_test"