Skip to content

Commit 851bede

Browse files
authored
Merge pull request #18 from armingh2000/fix/load-score
Fix/load score
2 parents 67a0f28 + 81d9c60 commit 851bede

File tree

6 files changed

+95
-40
lines changed

6 files changed

+95
-40
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ All notable changes to this project will be documented in this file.
6161

6262
- Renamed test_scorer to test_fact_scorer.
6363

64+
## v1.0.1 - 2024-04-14
65+
66+
- Fix score calculation when loading from dumped data.
67+
- Add tests for the fix.
68+
- Remove unnecessary code.
69+
6470
<!--
6571
### Added
6672

FactScoreLite/atomic_facts.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -186,19 +186,3 @@ def fix_sentence_splitter(self, sentences: list, initials: list) -> list:
186186
results.append(sent)
187187

188188
return results
189-
190-
191-
if __name__ == "__main__":
192-
generator = AtomicFactGenerator()
193-
text = """
194-
To winterize your battery and prevent damage:
195-
196-
1. **For the Li-ion battery**:
197-
- Avoid storing the vehicle in temperatures below -13°F (-25°C) for more than seven days to prevent the Li-ion battery from freezing.
198-
- Move the vehicle to a warm location if the outside temperature is -13°F (-25°C) or below, as it may freeze and be unable to charge or power the vehicle.
199-
200-
2. **For the 12-volt battery**:
201-
- Ensure it is fully charged during extremely cold weather conditions to prevent the battery fluid from freezing and possibly causing damage to the battery.
202-
""".strip()
203-
204-
print(generator.run(text))

FactScoreLite/fact_scorer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def get_score(self, facts: list, knowledge_source: str) -> list:
3434

3535
prompt += "\n\n"
3636

37-
prompt += f"Input:\n{atom} True or False?\nOutput:\n"
37+
prompt += f"Input: {atom} True or False?\nOutput:\n"
3838

3939
output = self.openai_agent.generate(prompt)
4040

FactScoreLite/factscore.py

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from . import FactScorer, AtomicFactGenerator
33
from .state_handler import StateHandler
44
from . import configs
5+
from tqdm import tqdm
56

67

78
class FactScore:
@@ -25,9 +26,11 @@ def get_facts(self, generations: list) -> list:
2526
list: A list of generation-facts pairs dictionaries.
2627
"""
2728

29+
print("Extracting facts from generations...")
30+
2831
generation_facts_pairs = self.facts_handler.load()
2932

30-
for generation in generations[len(generation_facts_pairs) :]:
33+
for generation in tqdm(generations[len(generation_facts_pairs) :]):
3134
atomic_facts_of_generation = self.atomic_fact_generator.run(generation)
3235
atomic_facts_of_generation = [
3336
fact
@@ -48,6 +51,30 @@ def get_facts(self, generations: list) -> list:
4851

4952
return generation_facts_pairs
5053

54+
def calculate_score(self, decision: list) -> tuple:
55+
"""
56+
Calculates the score of a generation based on whether its facts are supported by the knowledge source.
57+
58+
Args:
59+
decision (list): A list containing dictionaries of {output, is_supported, fact} for each fact of a generation.
60+
61+
Returns:
62+
tuple: A tuple containing the score and the original score (without applying gamma penalty).
63+
"""
64+
65+
score = np.mean([d["is_supported"] for d in decision])
66+
init_score = score
67+
68+
if self.gamma:
69+
penalty = (
70+
1.0
71+
if len(decision) >= self.gamma
72+
else np.exp(1 - self.gamma / len(decision))
73+
)
74+
score = penalty * score
75+
76+
return score, init_score
77+
5178
def get_decisions(
5279
self, generation_facts_pairs: list, knowledge_sources: list
5380
) -> list:
@@ -66,31 +93,27 @@ def get_decisions(
6693
and initial scores (original score without applying gamma penalty).
6794
"""
6895

96+
print("Generating decisions...")
97+
6998
decisions = self.decisions_handler.load()
7099
scores = []
71100
init_scores = []
72101

73-
for entry in generation_facts_pairs[len(decisions) :]:
74-
generation, facts = entry["generation"], entry["facts"]
75-
score = None
76-
77-
if facts:
78-
decision = self.fact_scorer.get_score(facts, knowledge_sources)
79-
score = np.mean([d["is_supported"] for d in decision])
102+
for entry in decisions:
103+
score, init_score = self.calculate_score(entry["decision"])
104+
init_scores.append(init_score)
105+
scores.append(score)
80106

81-
if self.gamma:
82-
init_scores.append(score)
83-
penalty = (
84-
1.0
85-
if len(facts) > self.gamma
86-
else np.exp(1 - self.gamma / len(facts))
87-
)
88-
score = penalty * score
107+
for entry in tqdm(generation_facts_pairs[len(decisions) :]):
108+
generation, facts = entry["generation"], entry["facts"]
89109

90-
decisions.append({"generation": generation, "decision": decision})
91-
self.decisions_handler.save(decisions)
110+
decision = self.fact_scorer.get_score(facts, knowledge_sources)
111+
score, init_score = self.calculate_score(decision)
92112

113+
init_scores.append(init_score)
93114
scores.append(score)
115+
decisions.append({"generation": generation, "decision": decision})
116+
self.decisions_handler.save(decisions)
94117

95118
assert len(facts) == len(
96119
decision
@@ -100,7 +123,7 @@ def get_decisions(
100123
generation_facts_pairs
101124
), "Number of decisions and generation-facts pairs should be the same."
102125

103-
return scores, decisions, init_scores
126+
return scores, init_scores
104127

105128
def get_factscore(
106129
self,
@@ -124,6 +147,6 @@ def get_factscore(
124147
), "`generations` and `knowledge_sources` should have the same length."
125148

126149
facts = self.get_facts(generations)
127-
scores, decisions, init_scores = self.get_decisions(facts, knowledge_sources)
150+
scores, init_scores = self.get_decisions(facts, knowledge_sources)
128151

129152
return np.mean(scores), np.mean(init_scores)

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = FactScoreLite
3-
version = 1.0.0
3+
version = 1.0.1
44
author = armingh2000
55
author_email =
66
license = MIT

tests/test_factscore.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def test_get_decisions_with_valid_input(
6565
{"is_supported": True},
6666
{"is_supported": False},
6767
]
68-
scores, decisions, init_scores = fact_score.get_decisions(
68+
scores, init_scores = fact_score.get_decisions(
6969
generation_facts_pairs, knowledge_sources
7070
)
7171

@@ -85,7 +85,12 @@ def test_get_factscore_from_saved_states(
8585
):
8686
mock_state_handler.load.side_effect = [
8787
[{"generation": "gen1", "facts": ["fact1", "fact2"]}],
88-
[{"generation": "gen1", "facts": [{"fact": "fact1", "is_supported": True}]}],
88+
[
89+
{
90+
"generation": "gen1",
91+
"decision": [{"fact": "fact1", "is_supported": True, "output": "True"}],
92+
}
93+
],
8994
] # First for facts, second for decisions
9095
generations = ["generation1", "generation2"]
9196
knowledge_sources = ["source1", "source2"]
@@ -97,3 +102,40 @@ def test_get_factscore_from_saved_states(
97102
avg_score, avg_init_score = fact_score.get_factscore(generations, knowledge_sources)
98103
assert isinstance(avg_score, float)
99104
assert isinstance(avg_init_score, float)
105+
106+
107+
@pytest.mark.parametrize(
108+
"decision, expected_score",
109+
[
110+
([{"is_supported": True} for _ in range(10)], 1.0),
111+
([{"is_supported": True} for _ in range(5)], 1.0),
112+
([{"is_supported": False} for _ in range(10)], 0.0),
113+
(
114+
[
115+
{"is_supported": True} if i % 2 == 0 else {"is_supported": False}
116+
for i in range(10)
117+
],
118+
0.5,
119+
),
120+
],
121+
)
122+
def test_calculate_score_various_decisions(fact_score, decision, expected_score):
123+
score, init_score = fact_score.calculate_score(decision)
124+
assert (
125+
init_score == expected_score
126+
), "Initial score should match expected mean of decisions"
127+
if len(decision) >= fact_score.gamma:
128+
assert (
129+
score == expected_score
130+
), "Score should not be penalized when decision count exceeds gamma"
131+
else:
132+
assert (
133+
score != expected_score
134+
), "Score should be penalized when decision count is below gamma"
135+
136+
137+
def test_gamma_zero(fact_score):
138+
fact_score.gamma = 0 # Setting gamma to zero
139+
decision = [{"is_supported": True} for _ in range(5)]
140+
score, init_score = fact_score.calculate_score(decision)
141+
assert score == init_score, "No penalty should apply when gamma is zero"

0 commit comments

Comments
 (0)