From 5519fc08cbc45d40779f8bd12924382920072564 Mon Sep 17 00:00:00 2001 From: Svenja Date: Mon, 4 Sep 2023 14:46:24 +0200 Subject: [PATCH 1/4] First draft --- extractors/__init__.py | 2 + .../llm/insurance_email_extraction/README.md | 2 + .../insurance_email_extraction/__init__.py | 76 +++++++++ .../code_snippet_common.md | 155 ++++++++++++++++++ .../code_snippet_refinery.md | 66 ++++++++ .../llm/insurance_email_extraction/config.py | 62 +++++++ 6 files changed, 363 insertions(+) create mode 100644 extractors/llm/insurance_email_extraction/README.md create mode 100644 extractors/llm/insurance_email_extraction/__init__.py create mode 100644 extractors/llm/insurance_email_extraction/code_snippet_common.md create mode 100644 extractors/llm/insurance_email_extraction/code_snippet_refinery.md create mode 100644 extractors/llm/insurance_email_extraction/config.py diff --git a/extractors/__init__.py b/extractors/__init__.py index 032c6d8b..d8c0dd47 100644 --- a/extractors/__init__.py +++ b/extractors/__init__.py @@ -18,6 +18,7 @@ gpt_information_extraction, deberta_ner_extraction, bert_ner_extraction, + insurance_email_extraction, ) from .media import work_of_art_extraction @@ -109,6 +110,7 @@ bic_extraction, deberta_ner_extraction, bert_ner_extraction, + insurance_email_extraction, ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/extractors/llm/insurance_email_extraction/README.md b/extractors/llm/insurance_email_extraction/README.md new file mode 100644 index 00000000..25730c83 --- /dev/null +++ b/extractors/llm/insurance_email_extraction/README.md @@ -0,0 +1,2 @@ +Uses OpenAI's `GPT-3.5-turbo` model to extract certain information from a insurance email. The informations are: insurance companies, insured company, website of insured company, address of insured company, type of coverage, date of submission, amount of revenue, description of insured company. +At a low temperature, the model extracts specified keywords. 
At a higher temperature, the model generates relevant keywords. An API key can be provided by us or be obtained directly from OpenAI. Contact us at info@kern.ai if you require an API key or need any support from us. \ No newline at end of file diff --git a/extractors/llm/insurance_email_extraction/__init__.py b/extractors/llm/insurance_email_extraction/__init__.py new file mode 100644 index 00000000..6998f400 --- /dev/null +++ b/extractors/llm/insurance_email_extraction/__init__.py @@ -0,0 +1,76 @@ +import re +import ast +import openai +from extractors.util.spacy import SpacySingleton +from pydantic import BaseModel + +INPUT_EXAMPLE = { + "apiKey": "", + "text" : "The Beatles were an English rock band, formed in Liverpool in 1960, that comprised John Lennon, Paul McCartney, George Harrison and Ringo Starr.", + "extractionKeyword": "names", + "temperature": 0.0, + "spacyTokenizer": "en_core_web_sm", +} + + +class InsuranceEmailExtractionModel(BaseModel): + apiKey: str + text: str + extractionKeyword: str + temperature: float + spacyTokenizer: str + + class Config: + schema_example = {"example": INPUT_EXAMPLE} + + +def gpt_information_extraction(req: InsuranceEmailExtractionModel): + """Uses OpenAI's GPT model to extract keyword from a text.""" + openai.api_key = req.apiKey + try: + response = openai.ChatCompletion.create( + model = "gpt-3.5-turbo", + messages = [ + { + "role": "system", + "content": f""" + Please extract all {req.extractionKeyword} from following text: + {req.text}- + Only return things that are linked to {req.extractionKeyword}. + Return only a valid JSON with this structure. + ```json + {{ + "keywords": ["list with keywords goes here"] + }} + ``` + Return nothing except this JSON. Make sure to only return {req.extractionKeyword} and nothing else. 
+ If you can't find any {req.extractionKeyword} in the text, just return nothing.""" + , + }, + ], + temperature=req.temperature, + ) + + out = response["choices"][0]["message"]["content"] + output_dict = ast.literal_eval(out) + + # check if the output is really a dictionary + if isinstance(output_dict, dict): + nlp = SpacySingleton.get_nlp(req.spacyTokenizer) + doc = nlp(req.text) + + char_positions = [] + if len(output_dict["keywords"]) > 0: + for found_keyword in output_dict["keywords"]: + regex = re.compile(f"{found_keyword}") + match = regex.search(req.text) + start, end = match.span() + span = doc.char_span(start, end, alignment_mode="expand") + char_positions.append((req.extractionKeyword, span.start, span.end)) + else: + return "No matching keywords found." + return {"extraction": char_positions} + else: + return f"GPT response was not a valid dictionary. The response was: {response}." + except Exception as e: + return f"That didn't work. Did you provide a valid API key? Go error: {e}" diff --git a/extractors/llm/insurance_email_extraction/code_snippet_common.md b/extractors/llm/insurance_email_extraction/code_snippet_common.md new file mode 100644 index 00000000..f044d5b6 --- /dev/null +++ b/extractors/llm/insurance_email_extraction/code_snippet_common.md @@ -0,0 +1,155 @@ +``` python +import openai +import re +import spacy +from typing import List, Tuple +import ast + +def insurance_email_extraction(text: str, extraction_keyword: str, api_key: str, temperature: float) -> List[Tuple[str, int]]: + openai.api_key = api_key + response = openai.ChatCompletion.create( + model = "gpt-3.5-turbo", + messages = [ + { + "role": "system", + "content": f""" + Please extract all {extraction_keyword} from following text: + {text}- + Only return things that are linked to {extraction_keyword}. + Return only a valid JSON with this structure. + json + {{ + "keywords": ["list with keywords goes here"] + }} + + Return nothing except this JSON. 
Make sure to only return {extraction_keyword} and nothing else. + If you can't find any {extraction_keyword} in the text, just return nothing.""" + , + }, + ], + temperature=temperature, + ) + try: + out = response["choices"][0]["message"]["content"] + output_dict = ast.literal_eval(out) + + # check if the output is really a dictionary + if isinstance(output_dict, dict): + nlp = spacy.load("en_core_web_sm") + doc = nlp(text) + + char_positions = [] + if len(output_dict["keywords"]) > 0: + for found_keyword in output_dict["keywords"]: + regex = re.compile(f"{found_keyword}") + match = regex.search(text) + start, end = match.span() + span = doc.char_span(start, end, alignment_mode="expand") + #char_positions.append((extraction_keyword, span.start, span.end)) + char_positions.append((match[0], span.start, span.end)) + else: + return "No matching keywords found." + return char_positions + else: + return f"GPT response was not a valid dictionary. The response was: {response}." + except: + return response["choices"][0]["message"]["content"] + + +# ↑ necessary bricks function +# ----------------------------------------------------------------------------------------- +# ↓ example implementation + +emails_generated= [""" Dear Chloe, + +Hope you're doing great! I have some exciting news regarding a new insurance policy we have secured for UnityCare Insure. Please find all the relevant details below. 
+ +Insured Company details: +Name: Yoga Health Life +Address: 123 Zen Way, Mindfulnessville, CA 98765, United States +Website: www.yogahealthlife.com +Number of Employees: Approximately 100 (estimated) + +Policy Information: +Type of Insurance: Yoga Health Life +Starting Date: 01.01.2024 (as requested by the insured company) +Duration of Submission: Four to seven sentences (to confirm further details) +Sum Insured: Car 2 bil + +Attached to this email, you will find additional information regarding the terms and conditions of the policy, including coverage details and premium amounts. Please make sure to go through it thoroughly and let me know if you have any questions or require any revisions before approving the policy. + +I believe this is an exceptional opportunity for UnityCare Insure to expand its coverage options and attract a diverse range of customers in the wellness and lifestyle industry. The Yoga Health Life policy offers comprehensive coverage for their yoga instructors, meditation facilities, and any liability arising from their business operations. + +If you require any further information or assistance, please do not hesitate to reach out. I will be more than happy to provide you with any additional details you may need. + +Looking forward to your positive response and approval for UnityCare Insure to proceed with the Yoga Health Life policy. + +Warm regards, + +Liam Brown +Insurance Broker""", +"""Dear Luna, + +I hope this email finds you well. It has been a while since we last caught up, but I have some exciting news to share with you today. Our team at SecureLife Underwriters has just taken up a new life well being case, with the insured company being none other than our own organization - SecureLife Underwriters itself! + +Let me provide you with the key details regarding this insurance opportunity: + +• Insured Company Name: SecureLife Underwriters +• Description: We are a large and renowned insurance and forwarding company. 
+• Number of Employees: Our organization currently employs around 500 staff members. +• Address: 123 Insurance Avenue, Central City, 56789. +• Website: www.securelifeunderwriters.com + +The Life Well Being coverage will come into effect from October 1st, 2023. It aims to provide comprehensive protection for our valued employees, ensuring their overall well-being both in and out of the workplace. + +We would like to submit the necessary information for this coverage within the next week to ensure prompt processing. Our submission will include details such as employee information, demographic data, specific health-related requirements, and any other information deemed necessary to provide utmost security to our personnel. + +Additionally, we have attached our organizational revenue report for reference purposes. Please note that our estimated revenue for the current fiscal year stands at around $100 million. + +Should you require any further details prior to processing this application, please feel free to reach out to me directly. You know me well, Luna, and I assure you that our organization greatly values the partnership we have with SecureLife Underwriters. + +Looking forward to working closely with you on securing this life well being insurance for our organization. + +Best regards, + +Zoe Martinez +Insurance Broker""","""Dear Daniel, + +I hope this email finds you well. It has been a pleasure working with you over the past years, and I trust this continued collaboration will strengthen the relationship between our companies. + +I am writing to inform you of a new client that we, HeritageShield Assurance, have acquired. They are a reputable IT company called IT and More located at 123 Main Street, Cityville. With over 100 employees, IT and More specializes in providing comprehensive information technology solutions to businesses across various sectors. + +To gain an insight into their operations, you can visit IT and More's website at www.itandmore.com. 
I encourage you to explore their corporate values and achievements to better understand their unique organizational culture. + +Given their forthcoming expansion plans, IT and More is seeking your assistance in obtaining insurance coverage for their evolving needs. They prioritize getting back to us with a proposal as soon as possible to ensure they meet their targets. + +The submission deadline for their custom-tailored insurance plan is [date]. The insurance package should include liability coverage, business interruption coverage, employee benefits, and property insurance among other standard provisions. The estimated annual revenue for IT and More is $10 million. + +Please note that I have attached an additional document containing more specific details about IT and More's requirements and preferences. It will facilitate your team in crafting a policy proposal that aligns with their business objectives. + +If you have any questions or require further clarification, please do not hesitate to reach out. I am available to discuss any concerns that you may have or coordinate a follow-up meeting with the decision-makers at IT and More. + +Thank you for your prompt attention to this matter, Daniel. I fully trust that HeritageShield Assurance will provide the top-notch insurance coverage tailored to accommodate IT and More's needs. I am looking forward to a fruitful partnership between our organizations. 
+ +Warm regards, + +Charlotte Taylor +Chief Insurance Broker""" +] + +def example_integration(): + # replace this list with a list containing your data + api_key = "" + texts = emails_generated + extraction_keywords = ["insurance companies", "insured company", "website of insured company", "address of insured company", "type of coverage", "date of submission", "amount of revenue", "description of insured company"] + + for text in texts: + line = text.split('\n', 1)[0:3] + print(f"The email {line} has:\n") + for extract in extraction_keywords: + extraction = insurance_email_extraction(text, extract, api_key, temperature=0.0) + print(f"{extract} -> {extraction}") + +example_integration() + +``` \ No newline at end of file diff --git a/extractors/llm/insurance_email_extraction/code_snippet_refinery.md b/extractors/llm/insurance_email_extraction/code_snippet_refinery.md new file mode 100644 index 00000000..fad9394a --- /dev/null +++ b/extractors/llm/insurance_email_extraction/code_snippet_refinery.md @@ -0,0 +1,66 @@ +``` python +import openai +import re +import spacy +from typing import List, Tuple +import ast +import re + +API_KEY: str = "" +ATTRIBUTE: str = "text" # only text attributes +EXTRACTION_KEYWORDS: list = ["insurance companies", "insured company", "website of insured company", "address of insured company", "type of coverage", "date of submission", "amount of revenue", "description of insured company"] +TEMPERATURE: float = 0.0 + +def insurance_email_extraction(record): + text = record[ATTRIBUTE].text + openai.api_key = API_KEY + for extraction in EXTRACTION_KEYWORDS: + response = openai.ChatCompletion.create( + model = "gpt-3.5-turbo", + messages = [ + { + "role": "system", + "content": f""" + Please extract all {extraction} from following text: + {text}- + Only return things that are linked to {extraction}. + Return only a valid JSON with this structure. + json + {{ + "keywords": ["list with keywords goes here"] + }} + + Return nothing except this JSON. 
Make sure to only return {extraction} and nothing else. + If you can't find any {extraction} in the text, just return nothing.""" + , + }, + ], + temperature=TEMPERATURE, + ) + try: + out = response["choices"][0]["message"]["content"] + output_dict = ast.literal_eval(out) + + # check if the output is really a dictionary + if isinstance(output_dict, dict): + nlp = spacy.load("en_core_web_sm") + doc = nlp(text) + + char_positions = [] + if len(output_dict["keywords"]) > 0: + for found_keyword in output_dict["keywords"]: + regex = re.compile(f"{found_keyword}") + match = regex.search(text) + start, end = match.span() + span = doc.char_span(start, end, alignment_mode="expand") + #char_positions.append((extraction_keyword, span.start, span.end)) + char_positions.append((match[0], span.start, span.end)) + else: + return "No matching keywords found." + return char_positions + else: + return f"GPT response was not a valid dictionary. The response was: {response}." + except: + return response["choices"][0]["message"]["content"] + +``` \ No newline at end of file diff --git a/extractors/llm/insurance_email_extraction/config.py b/extractors/llm/insurance_email_extraction/config.py new file mode 100644 index 00000000..55f7f5f6 --- /dev/null +++ b/extractors/llm/insurance_email_extraction/config.py @@ -0,0 +1,62 @@ +from util.configs import build_extractor_premium_config +from util.enums import State, RefineryDataType, BricksVariableType, SelectionType +from . 
import insurance_email_extraction, INPUT_EXAMPLE + + +def get_config(): + return build_extractor_premium_config( + function=insurance_email_extraction, + input_example=INPUT_EXAMPLE, + issue_id=332, + tabler_icon="mail-opened-filled", + min_refinery_version="1.8.0", + state=State.PUBLIC.value, + type="python_function", + kern_token_proxy_usable="false", + docker_image="none", + available_for=["refinery", "common"], + part_of_group=[ + "llm", + ], # first entry should be parent directory + # bricks integrator information + integrator_inputs={ + "name": "insurance_email_extraction", + "refineryDataType": RefineryDataType.TEXT.value, + "variables": { + "API_KEY": { + "selectionType": SelectionType.STRING.value, + "defaultValue": "", + "addInfo": [BricksVariableType.GENERIC_STRING.value], + }, + "ATTRIBUTE": { + "selectionType": SelectionType.CHOICE.value, + "addInfo": [ + BricksVariableType.ATTRIBUTE.value, + BricksVariableType.GENERIC_STRING.value, + ], + }, + "LABEL": { + "selectionType": SelectionType.CHOICE.value, + "addInfo": [ + BricksVariableType.LABEL.value, + BricksVariableType.GENERIC_STRING.value, + ], + "defaultValue": "names", + }, + "EXTRACTION_KEYWORD": { + "selectionType": SelectionType.CHOICE.value, + "defaultValue": "names", + "addInfo": [ + BricksVariableType.LABEL.value, + BricksVariableType.GENERIC_STRING.value, + ], + }, + "TEMPERATURE": { + "selectionType": SelectionType.INTEGER.value, + "defaultValue": 0, + "allowedValues": [0, 100], + "addInfo": [BricksVariableType.GENERIC_INT.value], + }, + }, + }, + ) From 5e294a6e2abfd6d60098197b1d80c35c88514b9f Mon Sep 17 00:00:00 2001 From: Svenja Date: Tue, 5 Sep 2023 12:00:00 +0200 Subject: [PATCH 2/4] Update to gpt-3.5-turbo-16k --- extractors/llm/insurance_email_extraction/README.md | 2 +- extractors/llm/insurance_email_extraction/__init__.py | 2 +- .../llm/insurance_email_extraction/code_snippet_refinery.md | 2 +- extractors/llm/insurance_email_extraction/config.py | 2 +- 4 files changed, 4 
insertions(+), 4 deletions(-) diff --git a/extractors/llm/insurance_email_extraction/README.md b/extractors/llm/insurance_email_extraction/README.md index 25730c83..eac05cef 100644 --- a/extractors/llm/insurance_email_extraction/README.md +++ b/extractors/llm/insurance_email_extraction/README.md @@ -1,2 +1,2 @@ -Uses OpenAI's `GPT-3.5-turbo` model to extract certain information from a insurance email. The informations are: insurance companies, insured company, website of insured company, address of insured company, type of coverage, date of submission, amount of revenue, description of insured company. +Uses OpenAI's `gpt-3.5-turbo-16k` model to extract certain information from an insurance email. The extracted fields are: insurance companies, insured company, website of insured company, address of insured company, type of coverage, date of submission, amount of revenue, description of insured company. At a low temperature, the model extracts specified keywords. At a higher temperature, the model generates relevant keywords. An API key can be provided by us or be obtained directly from OpenAI. Contact us at info@kern.ai if you require an API key or need any support from us. 
\ No newline at end of file diff --git a/extractors/llm/insurance_email_extraction/__init__.py b/extractors/llm/insurance_email_extraction/__init__.py index 6998f400..ac63a852 100644 --- a/extractors/llm/insurance_email_extraction/__init__.py +++ b/extractors/llm/insurance_email_extraction/__init__.py @@ -29,7 +29,7 @@ def gpt_information_extraction(req: InsuranceEmailExtractionModel): openai.api_key = req.apiKey try: response = openai.ChatCompletion.create( - model = "gpt-3.5-turbo", + model = "gpt-3.5-turbo-16k", messages = [ { "role": "system", diff --git a/extractors/llm/insurance_email_extraction/code_snippet_refinery.md b/extractors/llm/insurance_email_extraction/code_snippet_refinery.md index fad9394a..f1d2e256 100644 --- a/extractors/llm/insurance_email_extraction/code_snippet_refinery.md +++ b/extractors/llm/insurance_email_extraction/code_snippet_refinery.md @@ -8,7 +8,7 @@ import re API_KEY: str = "" ATTRIBUTE: str = "text" # only text attributes -EXTRACTION_KEYWORDS: list = ["insurance companies", "insured company", "website of insured company", "address of insured company", "type of coverage", "date of submission", "amount of revenue", "description of insured company"] +EXTRACTION_KEYWORDS: List = ["insurance companies", "insured company", "website of insured company", "address of insured company", "type of coverage", "date of submission", "amount of revenue", "description of insured company"] TEMPERATURE: float = 0.0 def insurance_email_extraction(record): diff --git a/extractors/llm/insurance_email_extraction/config.py b/extractors/llm/insurance_email_extraction/config.py index 55f7f5f6..4000a1e0 100644 --- a/extractors/llm/insurance_email_extraction/config.py +++ b/extractors/llm/insurance_email_extraction/config.py @@ -44,7 +44,7 @@ def get_config(): "defaultValue": "names", }, "EXTRACTION_KEYWORD": { - "selectionType": SelectionType.CHOICE.value, + "selectionType": SelectionType.LIST.value, "defaultValue": "names", "addInfo": [ 
BricksVariableType.LABEL.value, From e2d7ab7c83090c4a78dd0e19996c979e76ac6740 Mon Sep 17 00:00:00 2001 From: Svenja Date: Tue, 5 Sep 2023 12:12:46 +0200 Subject: [PATCH 3/4] Update to gpt-3.5-turbo-16k common --- .../llm/insurance_email_extraction/code_snippet_common.md | 2 +- .../llm/insurance_email_extraction/code_snippet_refinery.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extractors/llm/insurance_email_extraction/code_snippet_common.md b/extractors/llm/insurance_email_extraction/code_snippet_common.md index f044d5b6..0bcc8723 100644 --- a/extractors/llm/insurance_email_extraction/code_snippet_common.md +++ b/extractors/llm/insurance_email_extraction/code_snippet_common.md @@ -8,7 +8,7 @@ import ast def insurance_email_extraction(text: str, extraction_keyword: str, api_key: str, temperature: float) -> List[Tuple[str, int]]: openai.api_key = api_key response = openai.ChatCompletion.create( - model = "gpt-3.5-turbo", + model = "gpt-3.5-turbo-16k", messages = [ { "role": "system", diff --git a/extractors/llm/insurance_email_extraction/code_snippet_refinery.md b/extractors/llm/insurance_email_extraction/code_snippet_refinery.md index f1d2e256..05030bba 100644 --- a/extractors/llm/insurance_email_extraction/code_snippet_refinery.md +++ b/extractors/llm/insurance_email_extraction/code_snippet_refinery.md @@ -16,7 +16,7 @@ def insurance_email_extraction(record): openai.api_key = API_KEY for extraction in EXTRACTION_KEYWORDS: response = openai.ChatCompletion.create( - model = "gpt-3.5-turbo", + model = "gpt-3.5-turbo-16k", messages = [ { "role": "system", From 1bb4f82b82ae854c0cc79e499692b855ea38f194 Mon Sep 17 00:00:00 2001 From: Svenja Date: Wed, 6 Sep 2023 11:52:24 +0200 Subject: [PATCH 4/4] Update __init__.py --- extractors/llm/insurance_email_extraction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extractors/llm/insurance_email_extraction/__init__.py b/extractors/llm/insurance_email_extraction/__init__.py 
index ac63a852..d7b203c6 100644 --- a/extractors/llm/insurance_email_extraction/__init__.py +++ b/extractors/llm/insurance_email_extraction/__init__.py @@ -24,7 +24,7 @@ class Config: schema_example = {"example": INPUT_EXAMPLE} -def gpt_information_extraction(req: InsuranceEmailExtractionModel): +def insurance_email_extraction(req: InsuranceEmailExtractionModel): """Uses OpenAI's GPT model to extract keyword from a text.""" openai.api_key = req.apiKey try: