Commit 3ba31f0

Added support for faster and exhaustive regeneration #4
1 parent 427ad8b commit 3ba31f0

File tree

6 files changed: +83 -35 lines changed

- about.md
- app.toml
- sidekick/prompter.py
- sidekick/query.py
- sidekick/utils.py
- ui/app.py

about.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@

 **Actively Being Maintained:** Yes (Demo release: _In active RnD_)

-**Last Updated:** August, 2023
+**Last Updated:** September, 2023

 **Allows uploading and using new model and data:** Yes

app.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ title = "SQL-Sidekick"
 description = "QnA with tabular data using NLI"
 LongDescription = "about.md"
 Tags = ["DATA_SCIENCE", "MACHINE_LEARNING", "NLP"]
-Version = "0.0.6"
+Version = "0.0.7"

 [Runtime]
 MemoryLimit = "64Gi"

sidekick/prompter.py

Lines changed: 5 additions & 9 deletions
@@ -321,6 +321,7 @@ def query_api(
     sample_queries_path: str,
     table_name: str,
     is_regenerate: bool = False,
+    is_regen_with_options: bool = False,
     is_command: bool = False,
 ):
     """Asks question and returns SQL."""
@@ -400,7 +401,8 @@ def query_api(
         job_path=base_path,
         data_input_path=table_info_path,
         sample_queries_path=sample_queries_path,
-        is_regenerate = is_regenerate
+        is_regenerate_with_options=is_regen_with_options,
+        is_regenerate=is_regenerate,
     )
     if "h2ogpt-sql" not in model_name:
         sql_g._tasks = sql_g.generate_tasks(table_names, question)
@@ -418,9 +420,7 @@ def query_api(
     if updated_tasks is not None:
         sql_g._tasks = updated_tasks
     alt_res = None
-    res, alt_res = sql_g.generate_sql(
-        table_names, question, model_name=model_name, _dialect=db_dialect, is_regenerate=is_regenerate
-    )
+    res, alt_res = sql_g.generate_sql(table_names, question, model_name=model_name, _dialect=db_dialect)
     logger.info(f"Input query: {question}")
     logger.info(f"Generated response:\n\n{res}")
@@ -439,11 +439,7 @@ def query_api(
         elif res_val.lower() == "r" or res_val.lower() == "regenerate":
             click.echo("Attempting to regenerate...")
             res, alt_res = sql_g.generate_sql(
-                table_names,
-                question,
-                model_name=model_name,
-                _dialect=db_dialect,
-                is_regenerate=is_regenerate,
+                table_names, question, model_name=model_name, _dialect=db_dialect
             )
             logger.info(f"Input query: {question}")
             logger.info(f"Generated response:\n\n{res}")

sidekick/query.py

Lines changed: 36 additions & 15 deletions
@@ -1,6 +1,6 @@
+import gc
 import json
 import os
-import gc
 import random
 import sys
 from pathlib import Path
@@ -18,14 +18,13 @@
 from sidekick.utils import (
     _check_file_info,
     filter_samples,
+    is_resource_low,
     load_causal_lm_model,
     load_embedding_model,
     read_sample_pairs,
     remove_duplicates,
-    is_resource_low,
 )
 from sqlalchemy import create_engine
-from transformers import AutoModelForCausalLM, AutoTokenizer


 class SQLGenerator:
@@ -41,6 +40,7 @@ def __new__(
         job_path: str = "./",
         device: str = "auto",
         is_regenerate: bool = False,
+        is_regenerate_with_options: bool = False,
     ):
         offloading = is_resource_low()
         if offloading and is_regenerate:
@@ -73,6 +73,7 @@ def __init__(
         job_path: str = "./",
         device: str = "cpu",
         is_regenerate: bool = False,
+        is_regenerate_with_options: bool = False,
     ):
         self.db_url = db_url
         self.engine = create_engine(db_url)
@@ -86,6 +87,9 @@ def __init__(
         self.model_name = model_name
         self.openai_key = openai_key
         self.content_queries = None
+        self.is_regenerate_with_options = is_regenerate_with_options
+        self.is_regenerate = is_regenerate
+        self.device = device

     def clear(self):
         del SQLGenerator._instance
@@ -252,12 +256,7 @@ def generate_tasks(self, table_names: list, input_question: str):
             raise se

     def generate_sql(
-        self,
-        table_names: list,
-        input_question: str,
-        _dialect: str = "sqlite",
-        model_name: str = "h2ogpt-sql",
-        is_regenerate: bool = False,
+        self, table_names: list, input_question: str, _dialect: str = "sqlite", model_name: str = "h2ogpt-sql"
     ):
         context_file = f"{self.path}/var/lib/tmp/data/context.json"
         additional_context = json.load(open(context_file, "r")) if Path(context_file).exists() else {}
@@ -361,8 +360,8 @@ def generate_sql(
         logger.info(f"Number of possible contextual queries to question: {len(filtered_context)}")
         # If QnA pairs > 5, we keep top 5 for focused context
         _samples = filtered_context
-        if len(filtered_context) > 3:
-            _samples = filtered_context[0:3][::-1]
+        if len(filtered_context) > 5:
+            _samples = filtered_context[0:5][::-1]

         qna_samples = "\n".join(_samples)

@@ -431,7 +430,8 @@ def generate_sql(
         device_type = "cuda" if torch.cuda.is_available() else "cpu"

         alternate_queries = []
-        if not is_regenerate:
+        if not self.is_regenerate_with_options and not self.is_regenerate:
+            # Greedy decoding
             output = self.model.generate(
                 **inputs.to(device_type),
                 max_new_tokens=300,
@@ -442,17 +442,37 @@ def generate_sql(
             )

             generated_tokens = output.sequences[:, input_length:][0]
-        else:
+        elif self.is_regenerate and not self.is_regenerate_with_options:
+            # Throttle the temperature for a different result
             logger.info("Regeneration requested on previous query ...")
             random_seed = random.randint(0, 50)
             torch.manual_seed(random_seed)
-            random_temperature = round(random.uniform(0.5, 0.75), 2)
+            possible_temp_choice = [0.1, 0.2, 0.3, 0.6, 0.75, 0.9]
+            random_temperature = np.random.choice(possible_temp_choice, 1)[0]
+            logger.debug(f"Selected temperature for fast regeneration : {random_temperature}")
+            output = self.model.generate(
+                **inputs.to(device_type),
+                max_new_tokens=300,
+                temperature=random_temperature,
+                output_scores=True,
+                do_sample=True,
+                return_dict_in_generate=True,
+            )
+            generated_tokens = output.sequences[:, input_length:][0]
+        else:
+            logger.info("Regeneration with options requested on previous query ...")
+            # Diverse beam search decoding to explore more options
+            random_seed = random.randint(0, 50)
+            torch.manual_seed(random_seed)
+            possible_temp_choice = [0.1, 0.3, 0.5, 0.6, 0.75]
+            random_temperature = np.random.choice(possible_temp_choice, 1)[0]
+            logger.debug(f"Selected temperature for diverse beam search: {random_temperature}")
             output_re = self.model.generate(
                 **inputs.to(device_type),
                 max_new_tokens=300,
                 temperature=random_temperature,
                 top_k=5,
-                top_p=0.95,
+                top_p=0.9,
                 num_beams=5,
                 num_beam_groups=5,
                 num_return_sequences=5,
@@ -465,6 +485,7 @@ def generate_sql(
             transition_scores = self.model.compute_transition_scores(
                 output_re.sequences, output_re.scores, output_re.beam_indices, normalize_logits=False
             )
+
             # Create a boolean tensor where elements are True if the corresponding element in transition_scores is less than 0
             mask = transition_scores < 0
             # Sum the True values along axis 1
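
The three branches above amount to three decoding regimes: greedy for the first answer, temperature-throttled sampling for fast regeneration, and diverse beam search for the exhaustive "try harder" path. Below is a self-contained sketch of the same idea on a toy checkpoint; gpt2 stands in for the app's own causal LM, the diversity_penalty value is illustrative, and the ranking at the end is one reading of the mask/sum logic in the last hunk.

# Toy illustration of the three decoding regimes; not the commit's exact calls.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("SELECT", return_tensors="pt")
input_length = inputs.input_ids.shape[1]
pad = tokenizer.eos_token_id  # gpt2 has no pad token; reuse EOS

# 1. First answer: greedy decoding, deterministic.
out = model.generate(**inputs, max_new_tokens=20, pad_token_id=pad,
                     return_dict_in_generate=True, output_scores=True)

# 2. Fast regeneration: sample with a (randomly picked) temperature so a
#    repeated click yields a different candidate.
out = model.generate(**inputs, max_new_tokens=20, pad_token_id=pad,
                     do_sample=True, temperature=0.6,
                     return_dict_in_generate=True, output_scores=True)

# 3. "Try harder": diverse beam search returns several distinct candidates;
#    diversity_penalty > 0 is what pushes the beam groups apart.
out = model.generate(**inputs, max_new_tokens=20, pad_token_id=pad,
                     num_beams=5, num_beam_groups=5, num_return_sequences=5,
                     diversity_penalty=0.6,
                     return_dict_in_generate=True, output_scores=True)

# Score the alternates along the lines of the last hunk: count the tokens
# with negative transition scores and prefer sequences with fewer of them.
scores = model.compute_transition_scores(
    out.sequences, out.scores, out.beam_indices, normalize_logits=False
)
penalized = (scores < 0).sum(dim=1)  # per-sequence count of negative-score steps
candidates = tokenizer.batch_decode(out.sequences[:, input_length:], skip_special_tokens=True)
best_first = [candidates[i] for i in penalized.argsort()]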

sidekick/utils.py

Lines changed: 4 additions & 4 deletions
@@ -8,14 +8,14 @@
 import numpy as np
 import pandas as pd
 import torch
+from accelerate import infer_auto_device_map, init_empty_weights
 from InstructorEmbedding import INSTRUCTOR
 from pandasql import sqldf
 from sentence_transformers import SentenceTransformer
 from sidekick.logger import logger
 from sklearn.metrics.pairwise import cosine_similarity
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
-from accelerate import init_empty_weights, infer_auto_device_map
-from transformers import BitsAndBytesConfig
+from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig)


 def generate_sentence_embeddings(model_path: str, x, batch_size: int = 32, device: Optional[str] = None):
@@ -324,7 +324,7 @@ def load_causal_lm_model(
             model_name, cache_dir=cache_path, device_map=device, quantization_config=nf4_config
         )

-        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_path, device_map=device)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_path, device_map=device, use_fast=True)

         return model, tokenizer
     except Exception as e:
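
Side note on the tokenizer change: use_fast=True requests the Rust-backed tokenizer when the checkpoint ships one, which mostly matters for batch-encoding throughput (recent transformers releases already default to it, so this makes the choice explicit). A minimal check, with gpt2 as a placeholder checkpoint:

from transformers import AutoTokenizer

# use_fast=True selects the Rust-backed implementation when available.
tok = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
print(tok.is_fast)  # True when a fast tokenizer exists for this checkpoint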

ui/app.py

Lines changed: 36 additions & 5 deletions
@@ -72,7 +72,7 @@ async def chat(q: Q):
     table_names = []
     tables, _ = get_table_keys(f"{tmp_path}/data/tables.json", None)
     for table in tables:
-        table_names.append(ui.choice(table, f"Table: {table}"))
+        table_names.append(ui.choice(table, f"{table}"))

     add_card(q, "background_card", ui.form_card(box="horizontal", items=[ui.text("Ask your questions:")]))

@@ -100,7 +100,15 @@ async def chat(q: Q):
             box=ui.box("vertical", height="500px"),
             name="chatbot",
             data=data(fields="content from_user", t="list", size=-50),
-            commands=[ui.command(name=f"regenerate_event", icon="RepeatAll", caption="Regenerate", label="Regenerate")],
+            commands=[
+                ui.command(name=f"regenerate", icon="RepeatOne", caption="Attempts regeneration", label="Regenerate"),
+                ui.command(
+                    name=f"regenerate_with_options",
+                    icon="RepeatAll",
+                    caption="Regenerates with options",
+                    label="Try Harder",
+                ),
+            ],
         ),
     )

@@ -121,6 +129,10 @@ async def chatbot(q: Q):
     question = f"{q.args.chatbot}"
     logging.info(f"Question: {question}")

+    # For regeneration, there are currently 2 modes:
+    # 1. Quick, fast approach by throttling the temperature
+    # 2. "Try harder mode (THM)": slower approach using diverse beam search
+
     try:
         if q.args.chatbot.lower() == "db setup":
             llm_response, err = db_setup_api(
@@ -133,15 +145,30 @@ async def chatbot(q: Q):
                 table_samples_path=q.user.table_samples_path,
                 table_name=q.user.table_name,
             )
-        elif q.args.chatbot.lower() == "regenerate" or q.args.regenerate_event:
-            # Attempts to regenerate response on the last supplie query
+        elif q.args.chatbot.lower() == "regenerate" or q.args.regenerate:
+            # Attempts to regenerate response on the last supplied query
+            logging.info(f"Attempt for regeneration")
             if q.client.query is not None and q.client.query.strip() != "":
                 llm_response, alt_response, err = query_api(
                     question=q.client.query,
                     sample_queries_path=q.user.sample_qna_path,
                     table_info_path=q.user.table_info_path,
                     table_name=q.user.table_name,
                     is_regenerate=True,
+                    is_regen_with_options=False,
+                )
+                llm_response = "\n".join(llm_response)
+        elif q.args.chatbot.lower() == "try harder" or q.args.regenerate_with_options:
+            # Attempts to regenerate response on the last supplied query, with options
+            logging.info(f"Attempt for regeneration with options.")
+            if q.client.query is not None and q.client.query.strip() != "":
+                llm_response, alt_response, err = query_api(
+                    question=q.client.query,
+                    sample_queries_path=q.user.sample_qna_path,
+                    table_info_path=q.user.table_info_path,
+                    table_name=q.user.table_name,
+                    is_regenerate=False,
+                    is_regen_with_options=True,
                 )
                 response = "\n".join(llm_response)
                 if alt_response:
@@ -412,8 +439,12 @@ async def on_event(q: Q):
     logging.info(f"Event handled ... ")
     args_dict = expando_to_dict(q.args)
     logging.debug(f"Args dict {args_dict}")
-    if q.args.regenerate_event:
+    if q.args.regenerate_with_options:
+        q.args.chatbot = "try harder"
+    elif q.args.regenerate:
         q.args.chatbot = "regenerate"
+
+    if q.args.regenerate_with_options or q.args.regenerate:
         await chatbot(q)
         event_handled = True
     else:  # default chatbot event
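
The wiring above leans on how Wave surfaces menu commands: a clicked ui.command's name arrives as a truthy attribute on q.args, and on_event rewrites it into a chatbot message before dispatching. A stripped-down sketch of that dispatch (handler body simplified; card and error handling omitted):

from h2o_wave import Q, app, main  # noqa: F401  (importing main registers the server entry point)

@app("/sketch")
async def serve(q: Q):
    # Each ui.command's `name` shows up as a truthy q.args attribute on click.
    if q.args.regenerate_with_options:  # the "Try Harder" menu item
        q.args.chatbot = "try harder"
    elif q.args.regenerate:             # the "Regenerate" menu item
        q.args.chatbot = "regenerate"
    # ...then fall through to the chatbot handler with the rewritten message,
    # as on_event does above.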
