feat: ✨ Allow customization of LLM question (#252)

chadell · web-flow · commit 57237f08016d · 2023-12-14T19:26:31.000+01:00
* feat: ✨ Allow customization of LLM question

* fix env variable name

* Add an option to get the LLM question from a file

* Add a `PARSER_` prefix in the ENVs to disambiguate them

* fix property access
diff --git a/README.md b/README.md
@@ -88,7 +88,7 @@ By default, there is a `GenericProvider` that supports a `SimpleProcessor` using
 
 #### LLM-powered Parsers
 
-The library supports an optional parser option leveraging Large Language Model (LLM) to provide best-effort parsing when the specific parsers have not been successful.
+The library supports an optional parser option leveraging Large Language Models (LLM) to provide best-effort parsing when the specific parsers have not been successful.
 
 > Warning: Some of these integrations, such as OpenAI, require of extras installations parameters. Check the [extras section](#extras)
 
@@ -98,9 +98,12 @@ When the appropriate environment variable(s) are set (see below), these LLM pars
 
 These are the currently supported LLM integrations:
 
+- `PARSER_LLM_QUESTION_STR` (Optional), question to overwrite the default one. Change it carefully. It has precedence over `PARSER_LLM_QUESTION_FILEPATH`
+- `PARSER_LLM_QUESTION_FILEPATH` (Optional), a path to a file that contains a question to overwrite the default one.
+
 - [OpenAI](https://openai.com/product), these are the supported ENVs:
-  - `OPENAI_API_KEY` (Required): OpenAI API Key.
-  - `OPENAI_MODEL` (Optional): The LLM model to use, defaults to "gpt-3.5-turbo".
+  - `PARSER_OPENAI_API_KEY` (Required): OpenAI API Key.
+  - `PARSER_OPENAI_MODEL` (Optional): The LLM model to use, defaults to "gpt-3.5-turbo".
 
 ### Metadata
 
diff --git a/circuit_maintenance_parser/parser.py b/circuit_maintenance_parser/parser.py
@@ -1,5 +1,6 @@
 """Definition of Mainentance Notification base classes."""
 import logging
+import os
 import base64
 import calendar
 import datetime
@@ -346,6 +347,23 @@ def get_key_with_string(dictionary: dict, string: str):
                 return key
         return None
 
+    @property
+    def llm_question(self):
+        """Return the LLM question."""
+        custom_llm_question = os.getenv("PARSER_LLM_QUESTION_STR")
+        if custom_llm_question:
+            return custom_llm_question
+
+        custom_llm_question_path = os.getenv("PARSER_LLM_QUESTION_FILEPATH")
+        if custom_llm_question_path:
+            try:
+                with open(custom_llm_question_path, mode="r", encoding="utf-8") as llm_question_file:
+                    return llm_question_file.read()
+            except OSError as err:
+                logger.warning("The file %s can't be read: %s", custom_llm_question_path, err)
+
+        return self._llm_question
+
     def get_llm_response(self, content):
         """Method to retrieve the response from the LLM for some content."""
         raise NotImplementedError
diff --git a/circuit_maintenance_parser/parsers/openai.py b/circuit_maintenance_parser/parsers/openai.py
@@ -24,15 +24,15 @@ def get_llm_response(self, content) -> Optional[List]:
         if not _HAS_OPENAI:
             raise ImportError("openai extra is required to use OpenAIParser.")
 
-        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-        model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
+        client = OpenAI(api_key=os.getenv("PARSER_OPENAI_API_KEY"))
+        model = os.getenv("PARSER_OPENAI_MODEL", "gpt-3.5-turbo")
         try:
             response = client.chat.completions.create(
                 model=model,
                 messages=[
                     {  # type: ignore
                         "role": "system",
-                        "content": self._llm_question,
+                        "content": self.llm_question,
                     },
                     {  # type: ignore
                         "role": "user",
diff --git a/circuit_maintenance_parser/provider.py b/circuit_maintenance_parser/provider.py
@@ -123,7 +123,7 @@ def get_maintenances(self, data: NotificationData) -> Iterable[Maintenance]:
             logger.debug("Skipping notification %s due filtering policy for %s.", data, self.__class__.__name__)
             return []
 
-        if os.getenv("OPENAI_API_KEY"):
+        if os.getenv("PARSER_OPENAI_API_KEY"):
             self._processors.append(CombinedProcessor(data_parsers=[EmailDateParser, OpenAIParser]))
 
         for processor in self._processors:
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -2,16 +2,16 @@
 import os
 
 
-token_openai = os.getenv("OPENAI_API_KEY")
+token_openai = os.getenv("PARSER_OPENAI_API_KEY")
 
 
 def pytest_configure(config):  # pylint: disable=unused-argument
     """Clean environment for tests."""
     if token_openai:
-        del os.environ["OPENAI_API_KEY"]
+        del os.environ["PARSER_OPENAI_API_KEY"]
 
 
 def pytest_sessionfinish(session, exitstatus):  # pylint: disable=unused-argument
     """Recove environment after tests."""
     if token_openai:
-        os.environ["OPENAI_API_KEY"] = token_openai
+        os.environ["PARSER_OPENAI_API_KEY"] = token_openai
diff --git a/tests/unit/test_providers.py b/tests/unit/test_providers.py
@@ -117,7 +117,7 @@ class ProviderWithIncludeFilter(GenericProvider):
 )
 def test_provider_gets_mlparser(provider_class):
     """Test to check the any provider gets a default ML parser when ENV is activated."""
-    os.environ["OPENAI_API_KEY"] = "some_api_key"
+    os.environ["PARSER_OPENAI_API_KEY"] = "some_api_key"
     data = NotificationData.init_from_raw("text/plain", b"fake data")
     data.add_data_part("text/html", b"other data")
 

Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,7 @@ class ProviderWithIncludeFilter(GenericProvider):`
`117`	`117`	`)`
`118`	`118`	`def test_provider_gets_mlparser(provider_class):`
`119`	`119`	`"""Test to check the any provider gets a default ML parser when ENV is activated."""`
`120`		`- os.environ["OPENAI_API_KEY"] = "some_api_key"`
	`120`	`+ os.environ["PARSER_OPENAI_API_KEY"] = "some_api_key"`
`121`	`121`	`data = NotificationData.init_from_raw("text/plain", b"fake data")`
`122`	`122`	`data.add_data_part("text/html", b"other data")`
`123`	`123`