
Commit 3ad9655

Merge pull request #5 from maliedvp/new_similarities
New similarities
2 parents ddf661f + 99e034c commit 3ad9655

File tree

src/neer_match/__init__.py
src/neer_match/matching_model.py
src/neer_match/similarity_map.py
test/__init__.py

4 files changed: +79 -5 lines changed
src/neer_match/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 Neural-symbolic Entity Reasoning and Matching.
 """
 
-__version__ = '0.7.34'
+__version__ = '0.7.35'

src/neer_match/matching_model.py

Lines changed: 67 additions & 2 deletions
@@ -13,6 +13,8 @@
 import pandas as pd
 import tensorflow as tf
 import typing
+import pickle
+from pathlib import Path
 
 
 def _suggest(
@@ -222,6 +224,32 @@ def suggest(
         """
         return _suggest(self, left, right, count, batch_size=batch_size, **kwargs)
 
+    def save(self, target_directory: Path, name: str, include_optimizer: bool = True) -> None:
+        """Save the deep learning model to disk.
+
+        Saves the model architecture, weights, and optimizer state (optional),
+        along with the similarity map.
+
+        Args:
+            target_directory: The directory where the model will be saved.
+            name: The name of the model (used as a subdirectory).
+            include_optimizer: Whether to save the optimizer state.
+        """
+        # Ensure target_directory is a Path object
+        target_directory = Path(target_directory) / name / 'model'
+
+        # Ensure the directory exists
+        target_directory.mkdir(parents=True, exist_ok=True)
+
+        # Save the model architecture and weights
+        super().save(target_directory / "model.h5", include_optimizer=include_optimizer)
+
+        # Save the similarity map
+        with open(target_directory / "similarity_map.pkl", "wb") as f:
+            pickle.dump(self.similarity_map, f)
+
+        print(f"Model successfully saved to {target_directory}")
+
     @property
     def similarity_map(self) -> SimilarityMap:
         """Similarity Map of the Model."""
@@ -476,6 +504,7 @@ def fit(
         right: pd.DataFrame,
         matches: pd.DataFrame,
         epochs: int,
+        mismatch_share: float = 0.1,
         satisfiability_weight: float = 1.0,
         verbose: int = 1,
         log_mod_n: int = 1,
@@ -496,6 +525,7 @@ def fit(
             right: The right data frame.
             matches: The matches data frame.
             epochs: The number of epochs to train.
+            mismatch_share: The mismatch share.
             satisfiability_weight: The weight of the satisfiability loss.
             verbose: The verbosity level.
             log_mod_n: The log modulo.
@@ -512,7 +542,12 @@ def fit(
         # The remaining arguments are validated in the DataGenerator
 
         data_generator = DataGenerator(
-            self.record_pair_network.similarity_map, left, right, matches, **kwargs
+            self.record_pair_network.similarity_map,
+            left,
+            right,
+            matches,
+            mismatch_share=mismatch_share,
+            **kwargs
         )
 
         axioms = self._make_axioms(data_generator)
@@ -529,6 +564,7 @@ def evaluate(
         right: pd.DataFrame,
         matches: pd.DataFrame,
         batch_size: int = 16,
+        mismatch_share: float = 1.0,
         satisfiability_weight: float = 1.0,
     ) -> dict:
         """Evaluate the model.
@@ -542,14 +578,15 @@
             right: The right data frame.
             matches: The matches data frame.
             batch_size: Batch size.
+            mismatch_share: The mismatch share.
             satisfiability_weight: The weight of the satisfiability loss.
         """
         data_generator = DataGenerator(
             self.record_pair_network.similarity_map,
             left,
             right,
             matches,
-            mismatch_share=1.0,
+            mismatch_share=mismatch_share,
             batch_size=batch_size,
             shuffle=False,
         )
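
With these changes, the share of generated non-matching pairs is configurable in both training and evaluation rather than hard-coded; evaluate keeps its previous behavior through the default of 1.0. A hedged sketch of the new call sites; the model and data-frame variables are placeholders, only the mismatch_share keyword comes from this diff:

# Placeholder objects; only the mismatch_share keyword is taken from this diff.
model.fit(left, right, matches, epochs=10, mismatch_share=0.1)   # matches the new default in fit
metrics = model.evaluate(left, right, matches, batch_size=16,
                         mismatch_share=1.0)                     # same behavior as before this commit
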
@@ -634,6 +671,34 @@ def suggest(
         """
         return _suggest(self, left, right, count, batch_size=batch_size)
 
+    def save(self, target_directory: Path, name: str) -> None:
+        """Save the neural-symbolic model to disk.
+
+        Saves the record pair network, similarity map, and optimizer.
+
+        Args:
+            target_directory: The directory where the model will be saved.
+            name: The name of the model (used as a subdirectory).
+        """
+        # Ensure target_directory is a Path object
+        target_directory = Path(target_directory) / name / 'model'
+
+        # Ensure the directory exists
+        target_directory.mkdir(parents=True, exist_ok=True)
+
+        # Save the record pair network weights
+        self.record_pair_network.save_weights(target_directory / "record_pair_network.weights.h5")
+
+        # Save the similarity map
+        with open(target_directory / "similarity_map.pkl", "wb") as f:
+            pickle.dump(self.record_pair_network.similarity_map, f)
+
+        # Save the optimizer state
+        with open(target_directory / "optimizer.pkl", "wb") as f:
+            pickle.dump(self.optimizer.get_config(), f)
+
+        print(f"Model successfully saved to {target_directory}")
+
     @property
     def similarity_map(self) -> SimilarityMap:
         """Similarity Map of the Model."""

src/neer_match/similarity_map.py

Lines changed: 10 additions & 1 deletion
@@ -5,7 +5,7 @@
 records of two datasets.
 """
 
-from rapidfuzz import distance
+from rapidfuzz import distance, fuzz
 import numpy
 import typing
 
@@ -30,6 +30,7 @@ def gaussian(x: typing.Union[float, int], y: typing.Union[float, int]) -> float:
 def available_similarities() -> typing.Dict[str, typing.Callable]:
     """Return the list of available similarities."""
     return {
+        "basic_ratio": fuzz.ratio,
         "damerau_levenshtein": distance.DamerauLevenshtein.normalized_similarity,
         "discrete": discrete,
         "euclidean": euclidean,
@@ -41,8 +42,16 @@ def available_similarities() -> typing.Dict[str, typing.Callable]:
         "lcsseq": distance.LCSseq.normalized_similarity,
         "levenshtein": distance.Levenshtein.normalized_similarity,
         "osa": distance.OSA.normalized_similarity,
+        "partial_ratio": fuzz.partial_ratio,
+        "partial_ratio_alignment": fuzz.partial_ratio_alignment,
+        "partial_token_ratio": fuzz.partial_token_ratio,
+        "partial_token_set_ratio": fuzz.partial_token_set_ratio,
+        "partial_token_sort_ratio": fuzz.partial_token_sort_ratio,
         "postfix": distance.Postfix.normalized_similarity,
         "prefix": distance.Prefix.normalized_similarity,
+        "token_ratio": fuzz.token_ratio,
+        "token_set_ratio": fuzz.token_set_ratio,
+        "token_sort_ratio": fuzz.token_sort_ratio,
     }
 
 
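The new entries expose RapidFuzz's fuzz scorers alongside the existing normalized edit distances. One practical difference worth noting: fuzz.ratio and its token variants return scores on a 0-100 scale, while the distance.*.normalized_similarity functions return values in [0, 1], and partial_ratio_alignment returns an alignment object rather than a plain score. A small illustration of why the token-sorted scorers help with reordered fields; the strings are illustrative only and not taken from the package's tests:

# Quick comparison of order-sensitive vs. token-based scorers (illustrative values).
from rapidfuzz import distance, fuzz

left, right = "Nintendo EAD", "EAD Nintendo"

print(distance.Levenshtein.normalized_similarity(left, right))  # order-sensitive, value in [0, 1]
print(fuzz.token_sort_ratio(left, right))                       # 100.0 after sorting tokens, 0-100 scale
print(fuzz.token_set_ratio("Rockstar North", "Rockstar North Ltd."))  # high despite the extra token
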
test/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
     "title": ["jaro_winkler"],
     "platform": ["levenshtein", "jaro"],
     "year": ["euclidean", "discrete"],
-    "developer~dev": ["jaro"],
+    "developer~dev": ["jaro", "token_sort_ratio"],
 }
 
 items = [
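
For context, the dictionary above is the similarity-map instructions used by the test suite: each key names a column (or a left~right column pair such as developer~dev), and each value lists the similarity functions to apply to that pair. A hedged sketch of how such instructions might be turned into a SimilarityMap; the constructor call is an assumption about the package's API, not something shown in this diff:

# Assumed construction; SimilarityMap(instructions) is inferred from the
# package's API and is not part of this diff.
from neer_match.similarity_map import SimilarityMap

instructions = {
    "title": ["jaro_winkler"],
    "platform": ["levenshtein", "jaro"],
    "year": ["euclidean", "discrete"],
    "developer~dev": ["jaro", "token_sort_ratio"],  # "developer" in the left data, "dev" in the right
}
smap = SimilarityMap(instructions)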
