diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 15d49e6..bf62e90 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,19 +1,19 @@ -# Use the official Python image from Microsoft devcontainers -FROM mcr.microsoft.com/devcontainers/python:3 - -# Install ODBC drivers and dependencies -RUN apt-get update && \ - apt-get install -y apt-transport-https curl && \ - curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ - curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list && \ - apt-get update && \ - ACCEPT_EULA=Y apt-get install -y msodbcsql18 unixodbc-dev - -# Set the working directory -WORKDIR /app - -# Copy the requirements file into the container -COPY requirements.txt . - -# Install Python dependencies -RUN pip install -r requirements.txt +# Use the official Python image from Microsoft devcontainers +FROM mcr.microsoft.com/devcontainers/python:3 + +# Install ODBC drivers and dependencies +RUN apt-get update && \ + apt-get install -y apt-transport-https curl && \ + curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ + curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list && \ + apt-get update && \ + ACCEPT_EULA=Y apt-get install -y msodbcsql18 unixodbc-dev + +# Set the working directory +WORKDIR /app + +# Copy the requirements file into the container +COPY requirements-dev.txt . 
+ +# Install Python dependencies +RUN pip install -r requirements-dev.txt diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index d32bb0b..93640eb 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,23 +1,23 @@ -{ - "build": { - "context": "..", - "dockerfile": "Dockerfile" - }, - "customizations": { - "vscode": { - "extensions": [ - "ms-python.python", - "ms-python.vscode-pylance", - "ms-toolsai.jupyter", - "ms-azuretools.vscode-docker" - ], - "settings": { - "editor.formatOnSave": true - } - } - }, - "features": { - "ghcr.io/devcontainers-contrib/features/black:2": {} - }, - "name": "Dev Container" +{ + "build": { + "context": "..", + "dockerfile": "Dockerfile" + }, + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-python.vscode-pylance", + "ms-toolsai.jupyter", + "ms-azuretools.vscode-docker" + ], + "settings": { + "editor.formatOnSave": true + } + } + }, + "features": { + "ghcr.io/devcontainers-contrib/features/black:2": {} + }, + "name": "Dev Container" } diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2ad92e3..91d48e2 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,2 +1,2 @@ -# Default owner for everything in the repo -* @microsoft/dstoolkit-text2sql-and-imageprocessing-admins +# Default owner for everything in the repo +* @microsoft/dstoolkit-text2sql-and-imageprocessing-admins diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml index a5d49cc..41edcc6 100644 --- a/.github/dependabot.yaml +++ b/.github/dependabot.yaml @@ -1,14 +1,14 @@ -version: 2 -updates: - - package-ecosystem: "pip" - directory: "/" - schedule: - interval: "weekly" - commit-message: - prefix: "deps" - include: "scope" - rebase-strategy: "auto" - open-pull-requests-limit: 5 - labels: - - "dependencies" - - "automated-update" +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + commit-message: + prefix: "deps" 
+ include: "scope" + rebase-strategy: "auto" + open-pull-requests-limit: 5 + labels: + - "dependencies" + - "automated-update" diff --git a/.github/workflows/codeql-analysis.yaml b/.github/workflows/codeql-analysis.yaml index 88c153f..bdab3bf 100644 --- a/.github/workflows/codeql-analysis.yaml +++ b/.github/workflows/codeql-analysis.yaml @@ -1,73 +1,73 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# -name: "CodeQL" - -on: - push: - branches: [main] - pull_request: - # The branches below must be a subset of the branches above - branches: [main] - schedule: - - cron: "38 19 * * 1" - -permissions: read-all - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - permissions: - actions: read - contents: read - security-events: write - - strategy: - fail-fast: false - matrix: - language: ["python"] - # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] - # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v2 - with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. 
- - # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs - # queries: security-extended,security-and-quality - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). - # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v2 - - # â„šī¸ Command-line programs to run using the OS shell. - # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - - # If the Autobuild fails above, remove it and uncomment the following three lines. - # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. - - # - run: | - # echo "Run, Build Application using script" - # ./location_of_script_within_repo/buildscript.sh - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. 
+# +name: "CodeQL" + +on: + push: + branches: [main] + pull_request: + # The branches below must be a subset of the branches above + branches: [main] + schedule: + - cron: "38 19 * * 1" + +permissions: read-all + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: ["python"] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # â„šī¸ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
+ + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.gitignore b/.gitignore index 82f9275..090fef4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,162 +1,162 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. 
-#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7b8f549..0c74906 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,45 +1,45 @@ -fail_fast: true -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 - hooks: - # Invalid file checks - - id: check-yaml - - id: check-added-large-files - - id: check-symlinks - - id: check-toml - - # File quality checks - - id: end-of-file-fixer - - id: trailing-whitespace - - # Git checks - - id: check-merge-conflict - - # Python checks - - id: name-tests-test - - # JSON files - - id: pretty-format-json - args: [--autofix] - - - id: check-json - - - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 - hooks: - - id: codespell - - - repo: https://github.com/psf/black-pre-commit-mirror - rev: 23.12.1 - hooks: - - id: black - - - repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: v0.1.9 - hooks: - # Run the linter. - - id: ruff - args: [--fix, --ignore, UP007] - exclude: samples +fail_fast: true +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + # Invalid file checks + - id: check-yaml + - id: check-added-large-files + - id: check-symlinks + - id: check-toml + + # File quality checks + - id: end-of-file-fixer + - id: trailing-whitespace + + # Git checks + - id: check-merge-conflict + + # Python checks + - id: name-tests-test + + # JSON files + - id: pretty-format-json + args: [--autofix] + + - id: check-json + + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 23.12.1 + hooks: + - id: black + + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.9 + hooks: + # Run the linter. 
+ - id: ruff + args: [--fix, --ignore, UP007] + exclude: samples diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f9ba8cf..c72a574 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,9 +1,9 @@ -# Microsoft Open Source Code of Conduct - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). - -Resources: - -- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) -- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) -- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0c5c1ee..8637528 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,13 +1,13 @@ -## Contributing - -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. - -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
-For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or -contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. diff --git a/LICENSE b/LICENSE index 9e841e7..3d8b93b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ - MIT License - - Copyright (c) Microsoft Corporation. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/README.md b/README.md index 5cd7cec..835664b 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,30 @@ -# Project - -> This repo has been populated by an initial template to help get you started. Please -> make sure to update the content to build a great experience for community-building. 
- -As the maintainer of this project, please make a few updates: - -- Improving this README.MD file to provide a great experience -- Updating SUPPORT.MD with content about this project's support experience -- Understanding the security reporting process in SECURITY.MD -- Remove this section from the README - -## Contributing - -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. - -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). -For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or -contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. - -## Trademarks - -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow -[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). -Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. -Any use of third-party trademarks or logos are subject to those third-party's policies. +# Text2SQL and Image Processing in AI Search + +This repo provides sample code for improving RAG applications with rich data sources. 
+ +- `./text2sql` contains an Multi-Shot implementation for Text2SQL generation and querying which can be used to answer questions backed by a database as a knowledge base. +- `./ai_search_with_adi` contains code for linking Azure Document Intelligence with AI Search to process complex documents with charts and images, and uses multi-modal models to interpret and understand these. + +The above components have been successfully used on production RAG projects to increase the quality of responses. The code provided in this repo is a sample of the implementation and should be adjusted before being used in production. + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 
+Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/SECURITY.md b/SECURITY.md index b3c89ef..84bc961 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,41 +1,41 @@ - - -## Security - -Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). - -If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. - -## Reporting Security Issues - -**Please do not report security vulnerabilities through public GitHub issues.** - -Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). - -If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). - -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). - -Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: - - * Type of issue (e.g. 
buffer overflow, SQL injection, cross-site scripting, etc.) - * Full paths of source file(s) related to the manifestation of the issue - * The location of the affected source code (tag/branch/commit or direct URL) - * Any special configuration required to reproduce the issue - * Step-by-step instructions to reproduce the issue - * Proof-of-concept or exploit code (if possible) - * Impact of the issue, including how an attacker might exploit the issue - -This information will help us triage your report more quickly. - -If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. - -## Preferred Languages - -We prefer all communications to be in English. - -## Policy - -Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). - - + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 
+ + diff --git a/requirements-dev.txt b/requirements-dev.txt index 6b7920c..3d469bc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,4 @@ -pre-commit -ruff -black +pre-commit +ruff +black +ipykernel diff --git a/text2sql/.env b/text2sql/.env new file mode 100644 index 0000000..6b28e2c --- /dev/null +++ b/text2sql/.env @@ -0,0 +1,10 @@ +OPEN_AI_CONVERSATION_MODEL= +OPEN_AI_EMBEDDING_MODEL= +OPEN_AI_ENDPOINT= +OPEN_AI_KEY= +SQL_DB_ENGINE= +SQL_DB_NAME= +SQL_DB_CONNECTION_STRING= +AI_SEARCH_ENDPOINT= +AI_SEARCH_INDEX= +AI_SEARCH_SEMANTIC_CONFIG= diff --git a/text2sql/README.md b/text2sql/README.md index e69de29..7760929 100644 --- a/text2sql/README.md +++ b/text2sql/README.md @@ -0,0 +1,179 @@ +# Multi-Shot Text2SQL Component + +This portion of the repo contains code to implement a multi-shot approach to Text2SQL generation. This code can be integrated into a RAG application to allow the application to intelligently switch between different data sources (SQL, AI Search etc) to answer the question with the best possible information. + +The implementation is written for [Semantic Kernel](https://github.com/microsoft/semantic-kernel) in Python, although it can easily be adapted for C# or another framework such as LangChain. + +The sample provided works with Azure SQL Server, although it has been easily adapted to other SQL sources such as Snowflake. + +## High Level Workflow + +The following diagram shows a workflow for how the Text2SQL plugin would be incorporated into a RAG application. Using the plugins available, alongside the [Function Calling](https://platform.openai.com/docs/guides/function-calling) capabilities of LLMs, the LLM can do [Chain of Thought](https://learn.microsoft.com/en-us/dotnet/ai/conceptual/chain-of-thought-prompting) reasoning to determine the steps needed to answer the question. This allows the LLM to recognise intent and therefore pick appropriate data sources based on the intent of the question. 
+ +![High level workflow for a plugin driven RAG application](./images/Plugin%20Based%20RAG%20Flow.png "High Level Workflow") + +## Why Text2SQL instead of indexing the database contents? + +Generating SQL queries and executing them to provide context for the RAG application provided several benefits in the use case this was designed for. + +- Automatic report generation did not have to be built to automatically index the contents of the database and chunk it accordingly. +- By retaining the original table structure rather than indexing the contents, we are able to perform aggregations and calculations on the data quickly and accurately to answer numerical or statistic based questions. On a pure document based system, some of these questions are not easily answerable without pre-computing reports or extracting all the content + - e.g. *What is our top performing sales person by quantity of units sold this month? What item did they sell the most of?* is answerable with a few simple SQL queries if the correct views are exposed. + - Without Text2SQL, a document needs to contain the top sales information for each month and be updated regularly. Additionally, we need to then store in a document all the relevant information for what they have sold that month and add into the chunk information that they are the top performing sales person. +- Pushing numerical calculations onto the source SQL engine ensures accuracy in the maths. +- Data can be updated real-time in the source database and be immediately accessible to the LLM. + +## Multi-Shot Approach + +A common way to perform Text2SQL generation is to provide the complete schema information (either a full schema or a plain text description) inside the initial prompt. Whilst this works for small databases, there are issues with scalability as the number of tables and views exposed to the LLM increases: + +- More tables / views significantly increases the number of tokens used within the prompt and the cost of inference. 
+- More schema information can cause confusion with the LLM. In our original use case, when exceeding 5 complex tables / views, we found that the LLM could get confused between which columns belonged to which entity and as such, would generate invalid SQL queries. + +To solve these issues, a Multi-Shot approach is used: + +![Comparison between a common Text2SQL approach and a Multi-Shot Text2SQL approach.](./images/OneShot%20SQL%20vs%20TwoShot%20SQL%20OpenAI.png "Multi Shot SQL Approach") + +Instead of inserting the entire database schema into the prompt, a brief description of the available entities is injected into the prompt. This limits the number of tokens used and avoids filling the prompt with confusing schema information. + +Using Auto-Function calling capabilities, the LLM is able to retrieve from the plugin the full schema information for the views / tables that it considers useful for answering the question. Once retrieved, the full SQL query can then be generated. The schemas for multiple views / tables can be retrieved to allow the LLM to perform joins and other complex queries. + +## Provided Notebooks + +- `./rag_with_text_2_sql.ipynb` provides example of how to utilise the Text2SQL plugin to query the database. +- `./rag_with_ai_searchandtext_2_sql.ipynb` provides an example of how to use the Text2SQL and an AISearch plugin in parallel to automatically retrieve data from the most relevant source to answer the query. + - This setup is useful for a production application as the SQL Database is unlikely to be able to answer all the questions a user may ask. + +## SQL Plugin + +`./plugins/sql_plugin` contains all the relevant Semantic Kernel code for the plugin. + +### entities.json + +To power the knowledge of the LLM, a data dictionary containing all the SQL views / table metadata is used. 
Whilst the LLM could query the database at runtime to find out the schemas for the database, storing them in a text file reduces the overall latency of the system and allows the metadata for each table to be adjusted in a form of prompt engineering. + +The data dictionary is stored in `./plugins/sql_plugin/entities.json`. Below is a sample entry for a view / table that we wish to expose to the LLM. The Microsoft SQL Server [Adventure Works Database](https://learn.microsoft.com/en-us/sql/samples/adventureworks-install-configure?view=sql-server-ver16) is used as a sample. + +```json +{ + "view_name": "Get All Categories", + "entity": "vGetAllCategories", + "description": "This view provides a comprehensive list of all product categories and their corresponding subcategories in the SalesLT schema of the AdventureWorksLT database. It is used to understand the hierarchical structure of product categories, facilitating product organization and categorization.", + "selector": "Use this view to retrieve information about product categories and subcategories. It is useful for scenarios where product categorization is required, such as generating reports based on product categories or filtering products by category.", + "columns": [ + { + "definition": "A unique identifier for each product category. This ID is used to reference specific categories.", + "name": "ProductCategoryID", + "type": "INT" + }, + { + "definition": "The name of the parent product category. This represents the top-level category under which subcategories are grouped.", + "name": "ParentProductCategoryName", + "type": "NVARCHAR(50)" + }, + { + "definition": "The name of the product category. This can refer to either a top-level category or a subcategory, depending on the context.", + "name": "ProductCategoryName", + "type": "NVARCHAR(50)" + } + ] +} +``` + +#### Property Definitions +- **view_name** or **table_name** is a human readable name for the entity. 
+- **entity** is the actual name for the entity that is used in the SQL query. +- **description** provides a comprehensive description of what information the entity contains. +- **selector** provides reasoning to the LLM of in which scenarios it should select this entity. +- **columns** contains a list of the columns exposed for querying. Each column contains: + - **definition** a short definition of what information the column contains. Here you can add extra metadata to **prompt engineer** the LLM to select the right columns or interpret the data in the column correctly. + - **name** is the actual column name. + - **type** is the datatype for the column. + - **sample_values (optional)** is a list of sample values that are in the column. This is useful for instructing the LLM of what format the data may be in. + - **allowed_values (optional)** is a list of absolute allowed values for the column. This instructs the LLM only to use these values if filtering against this column. + +A full data dictionary must be built for all the views / tables you wish to expose to the LLM. The metadata provided directly influences the accuracy of the Text2SQL component. + +### sql_plugin.py + +The `./plugins/sql_plugin/sql_plugin.py` contains 3 key methods to power the Text2SQL engine. + +#### system_prompt() + +This method takes the loaded `entities.json` file and generates a system prompt based on it. Here, the **entity_name**, **description** and **selector** are used to build a list of available entities for the LLM to select. + +This is then inserted into a pre-made Text2SQL generation prompt that already contains optimised and working instructions for the LLM. This system prompt for the plugin is added to the main prompt file at runtime. + +The **target_engine** is passed to the prompt, along with **engine_specific_rules** to ensure that the SQL queries generated work on the target engine. 
+ +#### get_entity_schema() + +This method is called by the Semantic Kernel framework automatically, when instructed to do so by the LLM, to fetch the full schema definitions for a given entity. This returns a JSON string of the chosen entity which allows the LLM to understand the column definitions and their associated metadata. This can be called in parallel for multiple entities. + +#### run_sql_query() + +This method is called by the Semantic Kernel framework automatically, when instructed to do so by the LLM, to run a SQL query against the given database. It returns a JSON string containing a row wise dump of the results returned. These results are then interpreted to answer the question. + +## Sample Usage + +### What is the top performing product by quantity of units sold? + +#### SQL Query Generated + +*SELECT TOP 1 ProductID, SUM(OrderQty) AS TotalUnitsSold FROM SalesLT.SalesOrderDetail GROUP BY ProductID ORDER BY TotalUnitsSold DESC* + +#### JSON Result + +```json +{ + "answer": "The top-performing product by quantity of units sold is the **Classic Vest, S** from the **Classic Vest** product model, with a total of 87 units sold [1][2].", + "sources": [ + { + "title": "Sales Order Detail", + "chunk": "| ProductID | TotalUnitsSold |\n|-----------|----------------|\n| 864 | 87 |\n", + "reference": "SELECT TOP 1 ProductID, SUM(OrderQty) AS TotalUnitsSold FROM SalesLT.SalesOrderDetail GROUP BY ProductID ORDER BY TotalUnitsSold DESC;" + }, + { + "title": "Product and Description", + "chunk": "| Name | ProductModel |\n|----------------|---------------|\n| Classic Vest, S| Classic Vest |\n", + "reference": "SELECT Name, ProductModel FROM SalesLT.vProductAndDescription WHERE ProductID = 864;" + } + ] +} +``` + +The **answer** and **sources** properties can be rendered to the user to visualize the results. Markdown support is useful for complex answer outputs and explaining the source of the information. 
+ +#### Rendered Output + +The top-performing product by quantity of units sold is the **Classic Vest, S** from the **Classic Vest** product model, with a total of 87 units sold [1][2]. + +#### Rendered Sources + +| ProductID | TotalUnitsSold | +|-----------|----------------| +| 864 | 87 | + +| Name | ProductModel | +|----------------|---------------| +| Classic Vest, S| Classic Vest | + +## Tips for good Text2SQL performance. + +- Pre-assemble views to avoid the LLM having to make complex joins between multiple tables +- Give all columns and views / tables good names that are descriptive. +- Spend time providing good descriptions in the metadata for all entities and columns e.g. + - If a column contains a value in a given currency, give the currency information in the metadata. + - Clearly state in the **selector** what sorts of questions a given view / table can provide answers for. +- Use common codes for columns that need filtering e.g. + - A country can have multiple text representations e.g. United Kingdom or UK. Use ISO codes for countries, instead of text descriptions to increase the likelihood of correct and valid SQL queries. + +## Production Considerations + +Below are some of the considerations that should be made before using this plugin in production: + +- Despite prompting to only produce **SELECT** statements, there is a danger that dangerous SQL statements could be generated. + - Consider adding validation of the SQL query before it is executed to check it is only performing actions that you allow. + - Consider limiting the permissions of the identity or connection string to only allow access to certain tables or perform certain query types. +- If possible, run the queries under the identity of the end user so that any row or column level security is applied to the data. +- Consider data masking for sensitive columns that you do not wish to be exposed. 
diff --git a/text2sql/images/OneShot SQL vs TwoShot SQL OpenAI.png b/text2sql/images/OneShot SQL vs TwoShot SQL OpenAI.png new file mode 100644 index 0000000..e89a5bb Binary files /dev/null and b/text2sql/images/OneShot SQL vs TwoShot SQL OpenAI.png differ diff --git a/text2sql/images/Plugin Based RAG Flow.png b/text2sql/images/Plugin Based RAG Flow.png new file mode 100644 index 0000000..97c02d4 Binary files /dev/null and b/text2sql/images/Plugin Based RAG Flow.png differ diff --git a/text2sql/plugins/ai_search_plugin/__init__.py b/text2sql/plugins/ai_search_plugin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/text2sql/plugins/ai_search_plugin/ai_search_plugin.py b/text2sql/plugins/ai_search_plugin/ai_search_plugin.py new file mode 100644 index 0000000..ebec719 --- /dev/null +++ b/text2sql/plugins/ai_search_plugin/ai_search_plugin.py @@ -0,0 +1,79 @@ +from semantic_kernel.functions import kernel_function +from typing import Annotated +from azure.identity import DefaultAzureCredential +from openai import AsyncAzureOpenAI +from azure.search.documents.models import VectorizedQuery +from azure.search.documents.aio import SearchClient +import os +import json +import logging + + +class AISearchPlugin: + """A plugin that allows for the execution of AI Search queries against a text input.""" + + @staticmethod + def system_prompt() -> str: + """Get the system prompt for the AI Search Plugin.""" + return """Use the AI Search to return documents that have been indexed, that might be relevant for a piece of text to aid understanding. AI Search should always be used, even if you believe it might not be relevant. 
Execute this in parallel to any other functions that might be relevant.""" + + @kernel_function( + description="Runs an hybrid semantic search against some text to return relevant documents that are indexed within AI Search.", + name="QueryDocumentStorage", + ) + async def query_document_storage( + self, text: Annotated[str, "The text to run a semantic search against."] + ) -> str: + """Sends an text query to AI Search and uses Semantic Ranking to return a result. + + Args: + ---- + text (str): The text to run the search against. + + Returns: + ---- + str: The JSON representation of the search results. + """ + + async with AsyncAzureOpenAI( + # This is the default and can be omitted + api_key=os.environ["OPEN_AI_KEY"], + azure_endpoint=os.environ["OPEN_AI_ENDPOINT"], + api_version=os.environ["OPEN_AI_VERSION"], + ) as open_ai_client: + embeddings = await open_ai_client.embeddings.create( + model=os.environ["OPEN_AI_EMBEDDING_MODEL"], input=text + ) + + # Extract the embedding vector + embedding_vector = embeddings.data[0].embedding + + vector_query = VectorizedQuery( + vector=embedding_vector, + k_nearest_neighbors=5, + fields="chunk_vector", + ) + + credential = DefaultAzureCredential() + async with SearchClient( + endpoint=os.environ["AI_SEARCH_ENDPOINT"], + index_name=os.environ["AI_SEARCH_INDEX"], + credential=credential, + ) as search_client: + results = await search_client.search( + top=5, + query_type="semantic", + semantic_configuration_name=os.environ["AI_SEARCH_SEMANTIC_CONFIG"], + search_text=text, + select="title,chunk,source", + vector_queries=[vector_query], + ) + + documents = [ + document + async for result in results.by_page() + async for document in result + ] + + logging.debug("Results: %s", documents) + return json.dumps(documents, default=str) diff --git a/text2sql/plugins/sql_plugin/__init__.py b/text2sql/plugins/sql_plugin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/text2sql/plugins/sql_plugin/entities.json 
b/text2sql/plugins/sql_plugin/entities.json new file mode 100644 index 0000000..4638f41 --- /dev/null +++ b/text2sql/plugins/sql_plugin/entities.json @@ -0,0 +1,394 @@ +{ + "tables": [ + { + "columns": [ + { + "definition": "A unique identifier for each sales order ticket. This ID is auto-generated and serves as the primary key for the SalesOrderTicket table.", + "name": "SalesOrderID", + "type": "INT" + }, + { + "definition": "The date and time when the sales order was created. This is used to track when the order was initiated.", + "name": "OrderDate", + "type": "DATETIME" + }, + { + "definition": "The date by which the order is expected to be fulfilled or delivered. It helps in managing delivery timelines.", + "name": "DueDate", + "type": "DATETIME" + }, + { + "definition": "The date when the order was shipped to the customer. This is used for tracking shipping and fulfillment status.", + "name": "ShipDate", + "type": "DATETIME" + }, + { + "allowed_values": [ + 1, + 2, + 3 + ], + "definition": "The current status of the order, represented as a numeric code (e.g., 1 for In Progress, 2 for Completed, 3 for Canceled).", + "name": "Status", + "type": "TINYINT" + }, + { + "definition": "The total amount due for the order, including all line items, taxes, and shipping charges.", + "name": "TotalDue", + "type": "MONEY" + }, + { + "definition": "The date and time when the sales order ticket record was last modified. This is used for tracking updates and changes to the order.", + "name": "ModifiedDate", + "type": "DATETIME" + } + ], + "description": "This table stores detailed information about sales order tickets, including the order details, customer information, order status, and timestamps. It is used to manage and track sales orders throughout the order lifecycle, from creation to fulfillment.", + "entity": "SalesOrderDetail", + "selector": "Use this table to retrieve or store information related to individual sales order tickets. 
This is applicable when processing sales orders, managing customer transactions, or tracking order fulfillment status.", + "table_name": "Sales Order Detail" + }, + { + "columns": [ + { + "definition": "A unique identifier for each sales order. This ID is auto-generated and serves as the primary key for the SalesOrderHeader table.", + "name": "SalesOrderID", + "type": "INT" + }, + { + "definition": "The date and time when the sales order was created. This field is used to track when the order was initiated.", + "name": "OrderDate", + "type": "DATETIME" + }, + { + "definition": "The date by which the order is expected to be fulfilled or delivered. It helps in managing delivery timelines.", + "name": "DueDate", + "type": "DATETIME" + }, + { + "definition": "The date when the order was shipped to the customer. This is used for tracking shipping and fulfillment status.", + "name": "ShipDate", + "type": "DATETIME" + }, + { + "allowed_values": [ + 1, + 2, + 3 + ], + "definition": "The current status of the order, represented as a numeric code (e.g., 1 for In Progress, 2 for Completed, 3 for Canceled).", + "name": "Status", + "type": "TINYINT" + }, + { + "allowed_values": [ + "True", + "False" + ], + "definition": "Indicates whether the order was placed online.", + "name": "OnlineOrderFlag", + "type": "BIT" + }, + { + "definition": "A unique order number assigned to the sales order. This is used for tracking and identification purposes.", + "name": "SalesOrderNumber", + "type": "NVARCHAR(25)" + }, + { + "definition": "The purchase order number provided by the customer. This field links the sales order to the customer's purchase order.", + "name": "PurchaseOrderNumber", + "type": "NVARCHAR(25)" + }, + { + "definition": "The account number of the customer placing the order. 
This helps link the order to the customer's account.", + "name": "AccountNumber", + "type": "NVARCHAR(15)" + }, + { + "definition": "A foreign key that links to the Customer table, representing the customer who placed the order.", + "name": "CustomerID", + "type": "INT" + }, + { + "definition": "A foreign key that links to the Address table, representing the shipping address for the order.", + "name": "ShipToAddressID", + "type": "INT" + }, + { + "definition": "A foreign key that links to the Address table, representing the billing address for the order.", + "name": "BillToAddressID", + "type": "INT" + }, + { + "definition": "The shipping method used for the order (e.g., UPS, FedEx). This field helps track shipping preferences.", + "name": "ShipMethod", + "type": "NVARCHAR(50)" + }, + { + "definition": "The total cost of the order before taxes and shipping charges. This field is used to calculate the final total. The currency is pound sterling (GBP).", + "name": "SubTotal", + "type": "MONEY" + }, + { + "definition": "The tax amount applied to the order. This is calculated based on the order subtotal and applicable tax rates. The currency is pound sterling (GBP).", + "name": "TaxAmt", + "type": "MONEY" + }, + { + "definition": "The shipping charge applied to the order. This field represents the cost of shipping the order to the customer. The currency is pound sterling (GBP).", + "name": "Freight", + "type": "MONEY" + }, + { + "definition": "The total amount due for the order, including all line items, taxes, and shipping charges. The currency is pound sterling (GBP).", + "name": "TotalDue", + "type": "MONEY" + }, + { + "definition": "Any additional comments or notes related to the sales order. This field can include special instructions or remarks.", + "name": "Comment", + "type": "NVARCHAR(255)" + }, + { + "definition": "The date and time when the sales order header record was last modified. 
This is used for tracking updates and changes to the order.", + "name": "ModifiedDate", + "type": "DATETIME" + } + ], + "description": "This table contains high-level information about sales orders, including order dates, customer details, shipping information, and order status. It is used to manage and track sales orders from initiation to fulfillment.", + "entity": "SalesOrderHeader", + "selector": "Use this table to retrieve or store information related to the overall details of a sales order. It is applicable when you need to track order status, manage order dates, or relate orders to customers and shipping information.", + "table_name": "Sales Order Header" + }, + { + "columns": [ + { + "definition": "A unique identifier for each address. This ID is auto-generated and serves as the primary key for the Address table.", + "name": "AddressID", + "type": "INT" + }, + { + "definition": "The city in which the address is located. This is used to specify the city for the address.", + "name": "City", + "type": "NVARCHAR(30)" + }, + { + "definition": "The state or province in which the address is located. This is used to specify the state or province for the address.", + "name": "StateProvince", + "type": "NVARCHAR(50)" + }, + { + "definition": "The country or region in which the address is located. This is used to specify the country or region for the address.", + "name": "CountryRegion", + "type": "NVARCHAR(50)" + }, + { + "definition": "The postal code associated with the address. This is used to specify the postal code for the address, which helps in geographical sorting and shipping.", + "name": "PostalCode", + "type": "NVARCHAR(15)" + }, + { + "definition": "The date and time when the address record was last modified. 
This is used for tracking updates and changes to the address information.", + "name": "ModifiedDate", + "type": "DATETIME" + } + ], + "description": "This table stores address information for customers, including street addresses, city, state, postal code, and country/region. It is used to maintain contact and shipping information for orders, as well as to manage customer locations.", + "entity": "Address", + "selector": "Use this table to retrieve or store address details for customers, shipping locations, or billing addresses. It is applicable in scenarios where location information is required, such as shipping orders, verifying customer addresses, or managing geographical data.", + "table_name": "Address" + } + ], + "views": [ + { + "columns": [ + { + "definition": "A unique identifier for each product category. This ID is used to reference specific categories.", + "name": "ProductCategoryID", + "type": "INT" + }, + { + "definition": "The name of the parent product category. This represents the top-level category under which subcategories are grouped.", + "name": "ParentProductCategoryName", + "type": "NVARCHAR(50)" + }, + { + "definition": "The name of the product category. This can refer to either a top-level category or a subcategory, depending on the context.", + "name": "ProductCategoryName", + "type": "NVARCHAR(50)" + } + ], + "description": "This view provides a comprehensive list of all product categories and their corresponding subcategories in the SalesLT schema of the AdventureWorksLT database. It is used to understand the hierarchical structure of product categories, facilitating product organization and categorization.", + "entity": "vGetAllCategories", + "selector": "Use this view to retrieve information about product categories and subcategories. 
It is useful for scenarios where product categorization is required, such as generating reports based on product categories or filtering products by category.", + "view_name": "Get All Categories" + }, + { + "columns": [ + { + "definition": "A unique identifier for each product. This ID is used to distinguish individual products.", + "name": "ProductID", + "type": "INT" + }, + { + "definition": "The name of the product. This provides a brief and identifiable name for each product.", + "name": "Name", + "type": "NVARCHAR(50)" + }, + { + "definition": "The model name associated with the product. This indicates the specific model type or version of the product.", + "name": "ProductModel", + "type": "NVARCHAR(50)" + }, + { + "definition": "The culture or language code for the product description. This is used to localize the product description, such as 'en' for English or 'fr' for French.", + "name": "Culture", + "sample_values": [ + "en", + "fr", + "es", + "de" + ], + "type": "NVARCHAR(6)" + }, + { + "definition": "A detailed description of the product. This text provides additional information about the product, which can vary based on the culture or language.", + "name": "Description", + "type": "NVARCHAR(400)" + } + ], + "description": "This view provides detailed information about products, including their names, associated product models, descriptions, and the specific culture or language of the description. It is useful for understanding product details and translating product descriptions for different cultures.", + "entity": "vProductAndDescription", + "selector": "Use this view when you need comprehensive details about products, including their descriptions in different languages. This view is particularly useful for multilingual product catalogs or when creating localized content.", + "view_name": "Product and Description" + }, + { + "columns": [ + { + "definition": "A unique identifier for each product model. 
This ID is used to distinguish different product models.", + "name": "ProductModelID", + "type": "INT" + }, + { + "definition": "The name of the product model, providing a recognizable title for each model.", + "name": "Name", + "type": "NVARCHAR(50)" + }, + { + "definition": "A brief summary of the product model, highlighting key features and characteristics.", + "name": "Summary", + "type": "NVARCHAR(MAX)" + }, + { + "definition": "The name of the manufacturer of the product model.", + "name": "Manufacturer", + "type": "NVARCHAR(50)" + }, + { + "definition": "Copyright information related to the product model, indicating the legal ownership of the product design and content.", + "name": "Copyright", + "type": "NVARCHAR(30)" + }, + { + "definition": "The URL for the product model, providing a link to more information or to purchase the product.", + "name": "ProductURL", + "type": "NVARCHAR(256)" + }, + { + "definition": "The duration of the warranty period for the product model, specifying how long the warranty is valid.", + "name": "WarrantyPeriod", + "type": "NVARCHAR(30)" + }, + { + "definition": "A description of the warranty provided for the product model, detailing what is covered under the warranty.", + "name": "WarrantyDescription", + "type": "NVARCHAR(255)" + }, + { + "definition": "The number of years the warranty is valid for the product model.", + "name": "NoOfYears", + "type": "INT" + }, + { + "definition": "A description of the maintenance requirements and recommendations for the product model.", + "name": "MaintenanceDescription", + "type": "NVARCHAR(MAX)" + }, + { + "definition": "Details about the type of wheels used in the product model.", + "name": "Wheel", + "type": "NVARCHAR(50)" + }, + { + "definition": "Information about the saddle of the product model, such as material and design.", + "name": "Saddle", + "type": "NVARCHAR(50)" + }, + { + "definition": "Details regarding the pedal design and specifications of the product model.", + "name": 
"Pedal", + "type": "NVARCHAR(50)" + }, + { + "definition": "Description of the bike frame used in the product model, including material and type.", + "name": "BikeFrame", + "type": "NVARCHAR(50)" + }, + { + "definition": "Information about the crankset of the product model, specifying its design and features.", + "name": "Crankset", + "type": "NVARCHAR(50)" + }, + { + "definition": "The angle at which the product model is photographed, providing a visual perspective of the product.", + "name": "PictureAngle", + "type": "NVARCHAR(20)" + }, + { + "definition": "The size of the product model's picture, specifying dimensions or resolution.", + "name": "PictureSize", + "type": "NVARCHAR(20)" + }, + { + "definition": "An identifier linking to the product photo, which provides a visual representation of the product model.", + "name": "ProductPhotoID", + "type": "INT" + }, + { + "definition": "The material used in the construction of the product model, indicating durability and quality.", + "name": "Material", + "type": "NVARCHAR(50)" + }, + { + "definition": "The color of the product model, providing information about the appearance of the product.", + "name": "Color", + "type": "NVARCHAR(15)" + }, + { + "definition": "A code representing the product line to which the model belongs, categorizing the product within a broader product range.", + "name": "ProductLine", + "type": "NVARCHAR(2)" + }, + { + "definition": "The style of the product model, indicating design and aesthetic aspects.", + "name": "Style", + "type": "NVARCHAR(50)" + }, + { + "definition": "A description of the target rider's experience level for which the product model is designed, such as beginner, intermediate, or expert.", + "name": "RiderExperience", + "type": "NVARCHAR(50)" + }, + { + "definition": "The date and time when the product model information was last modified, indicating the currency of the data.", + "name": "ModifiedDate", + "type": "DATETIME" + } + ], + "description": "This view provides 
detailed catalog information about product models, including descriptions, manufacturing details, warranty information, and specifications related to product design and features. It is useful for generating comprehensive product catalogs and providing detailed product information to customers.", + "entity": "vProductModelCatalogDescription", + "selector": "Use this view when you need to retrieve detailed product model descriptions, manufacturing information, and specifications. It is particularly useful for creating product catalogs, detailing product features, and providing warranty information.", + "view_name": "Product Model Catalog Description" + } + ] +} diff --git a/text2sql/plugins/sql_plugin/entities_schema.json b/text2sql/plugins/sql_plugin/entities_schema.json new file mode 100644 index 0000000..27efcff --- /dev/null +++ b/text2sql/plugins/sql_plugin/entities_schema.json @@ -0,0 +1,155 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "tables": { + "items": { + "additionalProperties": false, + "properties": { + "columns": { + "items": { + "additionalProperties": false, + "properties": { + "allowed_values": { + "description": "Absolute list of values for the column, if applicable.", + "items": { + "type": "string" + }, + "type": "array" + }, + "definition": { + "description": "A brief definition of the column's purpose.", + "type": "string" + }, + "name": { + "description": "The name of the column.", + "type": "string" + }, + "sample_values": { + "description": "Sample values for the column, if applicable.", + "items": { + "type": "string" + }, + "type": "array" + }, + "type": { + "description": "The data type of the column.", + "type": "string" + } + }, + "required": [ + "name", + "type", + "definition" + ], + "type": "object" + }, + "type": "array" + }, + "description": { + "description": "A brief description of what the table contains.", + "type": "string" + }, + "entity": { + 
"description": "The technical name of the table entity.", + "type": "string" + }, + "selector": { + "description": "Guidance on when and how to use this table.", + "type": "string" + }, + "table_name": { + "description": "The name of the table.", + "type": "string" + } + }, + "required": [ + "table_name", + "entity", + "description", + "selector", + "columns" + ], + "type": "object" + }, + "type": "array" + }, + "views": { + "items": { + "additionalProperties": false, + "properties": { + "columns": { + "items": { + "additionalProperties": false, + "properties": { + "allowed_values": { + "description": "Absolute list of values for the column, if applicable.", + "items": { + "type": "string" + }, + "type": "array" + }, + "definition": { + "description": "A brief definition of the column's purpose.", + "type": "string" + }, + "name": { + "description": "The name of the column.", + "type": "string" + }, + "sample_values": { + "description": "Sample values for the column, if applicable.", + "items": { + "type": "string" + }, + "type": "array" + }, + "type": { + "description": "The data type of the column.", + "type": "string" + } + }, + "required": [ + "name", + "type", + "definition" + ], + "type": "object" + }, + "type": "array" + }, + "description": { + "description": "A brief description of what the view contains.", + "type": "string" + }, + "entity": { + "description": "The technical name of the view entity.", + "type": "string" + }, + "selector": { + "description": "Guidance on when and how to use this view.", + "type": "string" + }, + "view_name": { + "description": "The name of the view.", + "type": "string" + } + }, + "required": [ + "view_name", + "entity", + "description", + "selector", + "columns" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "tables", + "views" + ], + "type": "object" +} diff --git a/text2sql/plugins/sql_plugin/sql_plugin.py b/text2sql/plugins/sql_plugin/sql_plugin.py new file mode 100644 index 0000000..e8d7218 
--- /dev/null +++ b/text2sql/plugins/sql_plugin/sql_plugin.py @@ -0,0 +1,159 @@ +from semantic_kernel.functions import kernel_function +import aioodbc +from typing import Annotated +import os +import json +import logging + + +class SQLPlugin: + """A plugin that allows for the execution of SQL queries against a SQL Database.""" + + def __init__(self, database: str, target_engine: str = "Microsoft TSQL Server"): + """Initialize the SQL Plugin. + + Args: + ---- + database (str): The name of the database to connect to. + target_engine (str): The target database engine to run the queries against. Default is 'Microsoft TSQL Server'. + """ + self.entities = {} + self.database = database + self.target_engine = target_engine + + self.load_entities() + + def load_entities(self): + """Load the views and tables from the JSON file and format them into a common in-memory dictionary.""" + with open("./plugins/sql_plugin/entities.json", "r", encoding="utf-8") as file: + entities = json.load(file) + + # Load views + for view in entities["views"]: + entity_object = view.copy() + + entity_object["entity_name"] = entity_object["view_name"] + del entity_object["view_name"] + entity = entity_object["entity"] + entity_object["select_from_entity"] = f"{self.database}.{entity}" + self.entities[entity_object["entity_name"].lower()] = entity_object + + # Load tables + for table in entities["tables"]: + entity_object = table.copy() + + entity_object["entity_name"] = entity_object["table_name"] + del entity_object["table_name"] + entity = entity_object["entity"] + entity_object["select_from_entity"] = f"{self.database}.{entity}" + self.entities[entity_object["entity_name"].lower()] = entity_object + + def system_prompt(self, engine_specific_rules: str | None = None) -> str: + """Get the schemas for the database entities and provide a system prompt for the user. + + Returns: + str: The system prompt for the user. 
+ """ + + entity_descriptions = [] + for entity in self.entities.values(): + entity_string = " [BEGIN ENTITY = '{}']\n Name='{}'\n Description='{} {}'\n [END ENTITY = '{}']".format( + entity["entity_name"].upper(), + entity["entity_name"], + entity["description"], + entity["selector"], + entity["entity_name"].upper(), + ) + entity_descriptions.append(entity_string) + + entity_descriptions = "\n\n ".join(entity_descriptions) + + if engine_specific_rules: + engine_specific_rules = f"\n The following {self.target_engine} Syntax rules must be adhered to.\n {engine_specific_rules}" + + system_prompt = f"""Use the names and descriptions of {self.target_engine} entities provided in ENTITIES LIST to decide which entities to query if you need to retrieve information from the database. Use the 'GetEntitySchema()' function to get more details of the schema of the view you want to query. Use the 'RunSQLQuery()' function to run the SQL query against the database. + + You must always examine the provided {self.target_engine} entity descriptions to determine if they can answer the question. + + [BEGIN ENTITIES LIST] + {entity_descriptions} + [END ENTITIES LIST] + + Output corresponding text values in the answer for columns where there is an ID. For example, if the column is 'ProductID', output the corresponding 'ProductModel' in the response. Do not include the ID in the response. + If a user is asking for a comparison, always compare the relevant values in the database. + + The target database engine is {self.target_engine}, SQL queries must be able compatible to run on {self.target_engine}. {engine_specific_rules} + Always generate the SQL query based on the GetEntitySchema() function output, do not use the chat history data to generate the SQL query. + Do not use any other entities and columns in your SQL query, other than those defined above. Only use the column names obtained from GetEntitySchema() when constructing a SQL query, do not make up column names. 
+ You must only provide SELECT SQL queries. + For a given entity, use the 'select_from_entity' property returned from 'GetEntitySchema()' function in the SELECT FROM part of the SQL query. If the property is {{'select_from_entity': 'test_schema.test_table'}}, the select statement will be formulated from 'SELECT FROM test_schema.test_table WHERE . + + If you don't know how the value is formatted in a column, run a query against the column to get the unique values that might match your query. + Some columns returned from 'GetEntitySchema()' may have the properties 'allowed_values' or 'sample_values'. Use these values to determine the possible values that can be used in the SQL query. + + The source title to cite is the 'entity_name' property. The source reference is the SQL query used. The source chunk is the result of the SQL query used to answer the user query in Markdown table format. e.g. {{ 'title': "vProductAndDescription", 'chunk': '| ProductID | Name | ProductModel | Culture | Description |\\n|-----------|-------------------|--------------|---------|----------------------------------|\\n| 101 | Mountain Bike | MT-100 | en | A durable bike for mountain use. |\\n| 102 | Road Bike | RB-200 | en | Lightweight bike for road use. |\\n| 103 | Hybrid Bike | HB-300 | fr | VÊlo hybride pour usage mixte. |\\n', 'reference': 'SELECT ProductID, Name, ProductModel, Culture, Description FROM vProductAndDescription WHERE Culture = \"en\";' }}""" + + return system_prompt + + @kernel_function( + description="Get the detailed schema of an entity in the Database. Use the entity and the column returned to formulate a SQL query. The view name or table name must be one of the ENTITY NAMES defined in the [ENTITIES LIST]. 
Only use the column names obtained from GetEntitySchema() when constructing a SQL query, do not make up column names.", + name="GetEntitySchema", + ) + async def get_entity_schema( + self, + entity_name: Annotated[ + str, + "The view or table name to get the schema for. It must be one of the ENTITY NAMES defined in the [ENTITIES LIST] function.", + ], + ) -> str: + """Get the schema of a view or table in the SQL Database. + + Args: + ---- + entity_name (str): A view or table name to get the schema for. + + Returns: + str: The schema of the views or tables in JSON format. + """ + + if entity_name.lower() not in self.entities: + return json.dumps( + { + "error": f"The view or table {entity_name} does not exist in the database. Refer to the previously provided list of entities. Allowed values are: {', '.join(self.entities.keys())}." + } + ) + + return json.dumps({entity_name: self.entities[entity_name.lower()]}) + + @kernel_function( + description="Runs an SQL query against the SQL Database to extract information.", + name="RunSQLQuery", + ) + async def run_sql_query( + self, sql_query: Annotated[str, "The SQL query to run against the DB"] + ) -> str: + """Sends an SQL Query to the SQL Database and returns the result. + + Args: + ---- + sql_query (str): The query to run against the DB. 
+ + Returns: + str: The JSON representation of the query results.""" + + logging.info("Executing SQL Query") + logging.debug("SQL Query: %s", sql_query) + + connection_string = os.environ["SQL_DB_CONNECTION_STRING"] + async with await aioodbc.connect(dsn=connection_string) as sql_db_client: + async with sql_db_client.cursor() as cursor: + await cursor.execute(sql_query) + + columns = [column[0] for column in cursor.description] + + rows = await cursor.fetchall() + results = [dict(zip(columns, returned_row)) for returned_row in rows] + + logging.debug("Results: %s", results) + + return json.dumps(results, default=str) diff --git a/text2sql/prompt.yaml b/text2sql/prompt.yaml new file mode 100644 index 0000000..3facc69 --- /dev/null +++ b/text2sql/prompt.yaml @@ -0,0 +1,117 @@ +template_format: semantic-kernel +template: | + + As a senior analyst, your primary responsibility is to provide precise and thorough answers to the user's queries. Utilize all the provided functions to craft your responses. You must deliver detailed and accurate final answers with clear explanations and actionable insights. + + Always use the provided functions to obtain key information in order to answer the question. + If you are asked to always use a function, you must use that function to complement the answer. + Always use multiple functions to formulate the answer. + Always execute multiple functions in parallel to complement the results. + + The response to the user must meet the requirements in RESPONSE OUTPUT REQUIREMENTS. + IMPORTANT INFORMATION contains useful information that you can use to aid your knowledge. + CONVERSATION HISTORY contains the previous question and answer pairs in the conversation in JSON format. Do not use this information to answer the question, but to provide context on what was asked previously. 
+ + [IMPORTANT INFORMATION] + + {{$important_information}} + + [END IMPORTANT INFORMATION] + + [RESPONSE OUTPUT REQUIREMENTS] + + The answer MUST be returned in JSON format as { "answer": "", "sources": [ {"title": , "chunk": , "reference": ""}, {"title": , "chunk": , "reference": ""} ] }. + + The 'answer' property MUST meet the requirements in the ANSWER PROPERTY REQUIREMENTS. + The 'sources' property MUST meet the requirements in the SOURCES PROPERTY REQUIREMENTS. + + Do NOT return anything outside of the provided JSON property. + + [ANSWER PROPERTY REQUIREMENTS] + 1. Language and Tone: + Use only British English throughout the response. + Employ a business-friendly language that is professional and easy to understand. + + 2. Content Restrictions: + Do not use any profanity, offensive language, hate speech, or code in the response. + If you encounter any such content, handle it gracefully by omitting or rephrasing it appropriately. + + 3. Information Sources: + Use only information from the provided functions and specified important information. + Do not use any external sources or the conversation history for constructing the response. + In case of conflicting information, prioritize data from the SQL Database as the primary source of truth. + + 4. Calculations: + For any required calculations, use only the values provided in the context. + Provide a brief, clear explanation of the calculations beneath the results. + + 5. Response Structure: + Ensure the response is direct, easy to understand, and well-structured. + Format the response using Markdown for clarity and readability. + Use bold sub-headings for clarity where needed. Only use Markdown headings Level 3 (###) and Level 4 (####). + Use bullet points or numbered lists when appropriate. + Do not vary the font size within the same sentence. + + 6. Citations: + All factual information used in the answer must be cited with numbered references. For example, [1] should be used to refer to the first source. 
+ Each citation in the answer must correspond to a single entry in the 'sources' object. + The same citation and corresponding context chunk may be used multiple times if needed. + Place the numbered citation at the end of each relevant sentence that uses information from the sources. + Ensure that each source listed in the 'sources' property is cited at least once in the answer. + Do not provide a list of definitions from the business glossary; use such information only to enhance the answer contextually. + + 7. Citations Format: + Citations should be embedded within the text, not as a separate list at the end of the 'answer' property. + [END ANSWER PROPERTY REQUIREMENTS] + + [SOURCES PROPERTY REQUIREMENTS] + 1. Reference Inclusion: + Include all corresponding references for all cited content in the 'answer' property. + Place the references in the 'sources' property. + + 2. Source Format: + Each entry in the 'sources' property must be formatted as: {"title": "", "chunk": "", "reference": ""} + For example, a complete response with two citations would be formatted as: { "answer": "", "sources": [ {"title": , "chunk": , "reference": ""}, {"title": , "chunk": , "reference": ""} ] } + + 3. Source Chunk: + The 'chunk' property should contain a concise, unedited snippet of the relevant context that supports the answer. + + 4. Mandatory References: + Ensure that every citation in the 'answer' has a corresponding entry in the 'sources' property. + Every entry in the 'sources' property must be cited at least once in the answer. 
+ [END SOURCES PROPERTY REQUIREMENTS] + + [END RESPONSE OUTPUT REQUIREMENTS] + + [CONVERSATION HISTORY] + + {{$chat_history}} + + [END CONVERSATION HISTORY] + + {{$user_input}} +description: Chatbot +name: ChatBot +input_variables: + - name: user_input + description: The user input + is_required: true + - name: chat_history + description: The history of the conversation for the last 3 messages + is_required: true + - name: important_information + description: Useful information for the chatbot + is_required: true +output_variable: + description: The chatbot response formatted in JSON as defined in the FINAL ANSWER OUTPUT REQUIREMENTS. +execution_settings: + chat: + function_choice_behavior: + type: auto + maximum_auto_invoke_attempts: 5 + filters: + excluded_plugins: + - ChatBot + response_format: + type: json_object + temperature: 0.5 diff --git a/text2sql/rag_with_ai_search_and_text_2_sql.ipynb b/text2sql/rag_with_ai_search_and_text_2_sql.ipynb new file mode 100644 index 0000000..e0e797e --- /dev/null +++ b/text2sql/rag_with_ai_search_and_text_2_sql.ipynb @@ -0,0 +1,305 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Text2SQL & AI Search with Semantic Kernel & Azure OpenAI\n", + "\n", + "This notebook demonstrates how the SQL plugin can be integrated with Semantic Kernel and Azure OpenAI to answer questions from the database based on the schemas provided. Additionally, it integrates with an AI Search plugin to show how both can be used in parallel to answer questions based on the best possible source. The prompt may need to be tweaked for your usage.\n", + "\n", + "A multi-shot approach is used for SQL generation for more reliable results and reduced token usage. More details can be found in the README.md." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1718623217703 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "import yaml\n", + "import dotenv\n", + "import json\n", + "from semantic_kernel.connectors.ai.open_ai import (\n", + " AzureChatCompletion,\n", + ")\n", + "from semantic_kernel.contents.chat_history import ChatHistory\n", + "from semantic_kernel.kernel import Kernel\n", + "from plugins.sql_plugin.sql_plugin import SQLPlugin\n", + "from plugins.ai_search_plugin.ai_search_plugin import AISearchPlugin\n", + "from semantic_kernel.functions.kernel_arguments import KernelArguments\n", + "from semantic_kernel.prompt_template.prompt_template_config import PromptTemplateConfig\n", + "from IPython.display import display, Markdown\n", + "\n", + "logging.basicConfig(level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Kernel Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dotenv.load_dotenv()\n", + "kernel = Kernel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up GPT connections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1718623218006 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "service_id = \"chat\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1718623218267 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "chat_service 
= AzureChatCompletion(\n", + " service_id=service_id,\n", + " deployment_name=os.environ[\"OPEN_AI_CONVERSATION_MODEL\"],\n", + " endpoint=os.environ[\"OPEN_AI_ENDPOINT\"],\n", + " api_key=os.environ[\"OPEN_AI_KEY\"],\n", + ")\n", + "kernel.add_service(chat_service)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1718623218614 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# Register the SQL Plugin with the Database name to use.\n", + "sql_plugin = SQLPlugin(database=os.environ[\"SQL_DB_NAME\"])\n", + "kernel.add_plugin(sql_plugin, \"SQL\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ai_search_plugin = AISearchPlugin()\n", + "kernel.add_plugin(ai_search_plugin, \"AISearch\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Prompt Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load prompt and execution settings from the file\n", + "with open(\"./prompt.yaml\", \"r\") as file:\n", + " data = yaml.safe_load(file.read())\n", + " prompt_template_config = PromptTemplateConfig(**data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chat_function = kernel.add_function(\n", + " prompt_template_config=prompt_template_config,\n", + " plugin_name=\"ChatBot\",\n", + " function_name=\"Chat\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ChatBot setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "history = ChatHistory()" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "async def ask_question(question: str, chat_history: ChatHistory) -> str:\n", + " \"\"\"Asks a question to the chatbot and returns the answer.\n", + " \n", + " Args:\n", + " question (str): The question to ask the chatbot.\n", + " chat_history (ChatHistory): The chat history object.\n", + " \n", + " Returns:\n", + " str: The answer from the chatbot.\n", + " \"\"\"\n", + "\n", + " # Create important information prompt that contains the SQL database information and the AI search description (useful if you have multiple indexes).\n", + " engine_specific_rules = \"Use TOP X to limit the number of rows returned instead of LIMIT X. NEVER USE LIMIT X as it produces a syntax error.\"\n", + " important_information_prompt = f\"\"\"\n", + " [AI SEARCH INFORMATION]\n", + " {ai_search_plugin.system_prompt()}\n", + " [END AI SEARCH INFORMATION]\n", + "\n", + " [SQL DATABASE INFORMATION]\n", + " {sql_plugin.system_prompt(engine_specific_rules=engine_specific_rules)}\n", + " [END SQL DATABASE INFORMATION]\n", + " \"\"\"\n", + "\n", + " arguments = KernelArguments()\n", + " arguments[\"chat_history\"] = chat_history\n", + " arguments[\"important_information\"] = important_information_prompt\n", + " arguments[\"user_input\"] = question\n", + "\n", + " logging.info(\"Question: %s\", question)\n", + "\n", + " answer = await kernel.invoke(\n", + " function_name=\"Chat\",\n", + " plugin_name=\"ChatBot\",\n", + " arguments=arguments,\n", + " )\n", + "\n", + " logging.info(\"Answer: %s\", answer)\n", + "\n", + " # Log the question and answer to the chat history.\n", + " chat_history.add_user_message(question)\n", + " chat_history.add_message({\"role\": \"assistant\", \"message\": answer})\n", + "\n", + " json_answer = json.loads(str(answer))\n", + "\n", + " display(Markdown(json_answer[\"answer\"]))" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3 
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/text2sql/rag_with_text_2_sql.ipynb b/text2sql/rag_with_text_2_sql.ipynb new file mode 100644 index 0000000..ea5a307 --- /dev/null +++ b/text2sql/rag_with_text_2_sql.ipynb @@ -0,0 +1,971 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Text2SQL with Semantic Kernel & Azure OpenAI\n", + "\n", + "This notebook demonstrates how the SQL plugin can be integrated with Semantic Kernel and Azure OpenAI to answer questions from the database based on the schemas provided. \n", + "\n", + "A multi-shot approach is used for SQL generation for more reliable results and reduced token usage. More details can be found in the README.md." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "gather": { + "logged": 1718623217703 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "import yaml\n", + "import dotenv\n", + "import json\n", + "from semantic_kernel.connectors.ai.open_ai import (\n", + " AzureChatCompletion,\n", + ")\n", + "from semantic_kernel.contents.chat_history import ChatHistory\n", + "from semantic_kernel.kernel import Kernel\n", + "from plugins.sql_plugin.sql_plugin import SQLPlugin\n", + "from semantic_kernel.functions.kernel_arguments import KernelArguments\n", + "from semantic_kernel.prompt_template.prompt_template_config import PromptTemplateConfig\n", + "from IPython.display import display, Markdown\n", + "\n", + "logging.basicConfig(level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Kernel Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dotenv.load_dotenv()\n", + "kernel = Kernel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up GPT connections" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "gather": { + "logged": 1718623218006 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "service_id = \"chat\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "gather": { + "logged": 1718623218267 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "chat_service = AzureChatCompletion(\n", + " service_id=service_id,\n", + " 
deployment_name=os.environ[\"OPEN_AI_CONVERSATION_MODEL\"],\n", + " endpoint=os.environ[\"OPEN_AI_ENDPOINT\"],\n", + " api_key=os.environ[\"OPEN_AI_KEY\"],\n", + ")\n", + "kernel.add_service(chat_service)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "gather": { + "logged": 1718623218614 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "KernelPlugin(name='SQL', description=None, functions={'GetEntitySchema': KernelFunctionFromMethod(metadata=KernelFunctionMetadata(name='GetEntitySchema', plugin_name='SQL', description='Get the detailed schema of an entity in the Database. Use the entity and the column returned to formulate a SQL query. The view name or table name must be one of the ENTITY NAMES defined in the [ENTITIES LIST]. Only use the column names obtained from GetEntitySchema() when constructing a SQL query, do not make up column names.', parameters=[KernelParameterMetadata(name='entity_name', description='The view or table name to get the schema for. It must be one of the ENTITY NAMES defined in the [ENTITIES LIST] function.', default_value=None, type_='str', is_required=True, type_object=, schema_data={'type': 'string', 'description': 'The view or table name to get the schema for. 
It must be one of the ENTITY NAMES defined in the [ENTITIES LIST] function.'}, function_schema_include=True)], is_prompt=False, is_asynchronous=True, return_parameter=KernelParameterMetadata(name='return', description='', default_value=None, type_='str', is_required=True, type_object=, schema_data={'type': 'string'}, function_schema_include=True), additional_properties={}), invocation_duration_histogram=, streaming_duration_histogram=, method=>, stream_method=None), 'RunSQLQuery': KernelFunctionFromMethod(metadata=KernelFunctionMetadata(name='RunSQLQuery', plugin_name='SQL', description='Runs an SQL query against the SQL Database to extract information.', parameters=[KernelParameterMetadata(name='sql_query', description='The SQL query to run against the DB', default_value=None, type_='str', is_required=True, type_object=, schema_data={'type': 'string', 'description': 'The SQL query to run against the DB'}, function_schema_include=True)], is_prompt=False, is_asynchronous=True, return_parameter=KernelParameterMetadata(name='return', description='', default_value=None, type_='str', is_required=True, type_object=, schema_data={'type': 'string'}, function_schema_include=True), additional_properties={}), invocation_duration_histogram=, streaming_duration_histogram=, method=>, stream_method=None)})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Register the SQL Plugin with the Database name to use.\n", + "sql_plugin = SQLPlugin(database=os.environ[\"SQL_DB_NAME\"])\n", + "kernel.add_plugin(sql_plugin, \"SQL\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Prompt Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Load prompt and execution settings from the file\n", + "with open(\"./prompt.yaml\", \"r\") as file:\n", + " data = 
yaml.safe_load(file.read())\n", + " prompt_template_config = PromptTemplateConfig(**data)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "chat_function = kernel.add_function(\n", + " prompt_template_config=prompt_template_config,\n", + " plugin_name=\"ChatBot\",\n", + " function_name=\"Chat\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ChatBot setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "history = ChatHistory()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "async def ask_question(question: str, chat_history: ChatHistory) -> str:\n", + " \"\"\"Asks a question to the chatbot and returns the answer.\n", + " \n", + " Args:\n", + " question (str): The question to ask the chatbot.\n", + " chat_history (ChatHistory): The chat history object.\n", + " \n", + " Returns:\n", + " str: The answer from the chatbot.\n", + " \"\"\"\n", + "\n", + " # Create important information prompt that contains the SQL database information.\n", + " engine_specific_rules = \"Use TOP X to limit the number of rows returned instead of LIMIT X. 
NEVER USE LIMIT X as it produces a syntax error.\"\n", + " important_information_prompt = f\"\"\"\n", + " [SQL DATABASE INFORMATION]\n", + " {sql_plugin.system_prompt(engine_specific_rules=engine_specific_rules)}\n", + " [END SQL DATABASE INFORMATION]\n", + " \"\"\"\n", + "\n", + " arguments = KernelArguments()\n", + " arguments[\"chat_history\"] = chat_history\n", + " arguments[\"important_information\"] = important_information_prompt\n", + " arguments[\"user_input\"] = question\n", + "\n", + " logging.info(\"Question: %s\", question)\n", + "\n", + " answer = await kernel.invoke(\n", + " function_name=\"Chat\",\n", + " plugin_name=\"ChatBot\",\n", + " arguments=arguments,\n", + " )\n", + "\n", + " logging.info(\"Answer: %s\", answer)\n", + "\n", + " # Log the question and answer to the chat history.\n", + " chat_history.add_user_message(question)\n", + " chat_history.add_message({\"role\": \"assistant\", \"message\": answer})\n", + "\n", + " json_answer = json.loads(str(answer))\n", + "\n", + " display(Markdown(json_answer[\"answer\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Question: What are the different product categories we have?\n", + "INFO:semantic_kernel.functions.kernel_function:Function ChatBot-Chat invoking.\n", + "INFO:semantic_kernel.contents.chat_history:Could not parse prompt \n", + "As a senior analyst, your primary responsibility is to provide precise and thorough answers to the user's queries. Utilize all the provided functions to craft your responses. 
You must deliver detailed and accurate final answers with clear explanations and actionable insights.\n", + "\n", + "Always use the provided functions to obtain key information in order to answer the question.\n", + "If you are asked to use always use a function, you must use that function to compliment the answer.\n", + "Always use multiple functions to formulate the answer.\n", + "Always execute multiple functions in parallel to compliment the results.\n", + "\n", + "The response to the user must meet the requirements in RESPONSE OUTPUT REQUIREMENTS.\n", + "IMPORTANT INFORMATION contains useful information that you can use to aid your knowledge.\n", + "CONVERSATION HISTORY contains the previous question and answer pairs in the conversation in JSON format. Do not use this information to answer the question, but to provide context on what was asked previously.\n", + "\n", + "[IMPORTANT INFORMATION]\n", + "\n", + "\n", + " [SQL DATABASE INFORMATION]\n", + " Use the names and descriptions of Microsoft TSQL Server entities provided in ENTITIES LIST to decide which entities to query if you need to retrieve information from the database. Use the 'GetEntitySchema()' function to get more details of the schema of the view you want to query. Use the 'RunSQLQuery()' function to run the SQL query against the database.\n", + "\n", + " You must always examine the provided Microsoft TSQL Server entity descriptions to determine if they can answer the question.\n", + "\n", + " [BEGIN ENTITIES LIST]\n", + " [BEGIN ENTITY = 'GET ALL CATEGORIES']\n", + " Name='Get All Categories'\n", + " Description='This view provides a comprehensive list of all product categories and their corresponding subcategories in the SalesLT schema of the AdventureWorksLT database. It is used to understand the hierarchical structure of product categories, facilitating product organization and categorization. Use this view to retrieve information about product categories and subcategories. 
It is useful for scenarios where product categorization is required, such as generating reports based on product categories or filtering products by category.'\n", + " [END ENTITY = 'GET ALL CATEGORIES']\n", + "\n", + " [BEGIN ENTITY = 'PRODUCT AND DESCRIPTION']\n", + " Name='Product and Description'\n", + " Description='This view provides detailed information about products, including their names, associated product models, descriptions, and the specific culture or language of the description. It is useful for understanding product details and translating product descriptions for different cultures. Use this view when you need comprehensive details about products, including their descriptions in different languages. This view is particularly useful for multilingual product catalogs or when creating localized content.'\n", + " [END ENTITY = 'PRODUCT AND DESCRIPTION']\n", + "\n", + " [BEGIN ENTITY = 'PRODUCT MODEL CATALOG DESCRIPTION']\n", + " Name='Product Model Catalog Description'\n", + " Description='This view provides detailed catalog information about product models, including descriptions, manufacturing details, warranty information, and specifications related to product design and features. It is useful for generating comprehensive product catalogs and providing detailed product information to customers. Use this view when you need to retrieve detailed product model descriptions, manufacturing information, and specifications. It is particularly useful for creating product catalogs, detailing product features, and providing warranty information.'\n", + " [END ENTITY = 'PRODUCT MODEL CATALOG DESCRIPTION']\n", + "\n", + " [BEGIN ENTITY = 'SALES ORDER DETAIL']\n", + " Name='Sales Order Detail'\n", + " Description='This table stores detailed information about sales order tickets, including the order details, customer information, order status, and timestamps. It is used to manage and track sales orders throughout the order lifecycle, from creation to fulfillment. 
Use this table to retrieve or store information related to individual sales order tickets. This is applicable when processing sales orders, managing customer transactions, or tracking order fulfillment status.'\n", + " [END ENTITY = 'SALES ORDER DETAIL']\n", + "\n", + " [BEGIN ENTITY = 'SALES ORDER HEADER']\n", + " Name='Sales Order Header'\n", + " Description='This table contains high-level information about sales orders, including order dates, customer details, shipping information, and order status. It is used to manage and track sales orders from initiation to fulfillment. Use this table to retrieve or store information related to the overall details of a sales order. It is applicable when you need to track order status, manage order dates, or relate orders to customers and shipping information.'\n", + " [END ENTITY = 'SALES ORDER HEADER']\n", + "\n", + " [BEGIN ENTITY = 'ADDRESS']\n", + " Name='Address'\n", + " Description='This table stores address information for customers, including street addresses, city, state, postal code, and country/region. It is used to maintain contact and shipping information for orders, as well as to manage customer locations. Use this table to retrieve or store address details for customers, shipping locations, or billing addresses. It is applicable in scenarios where location information is required, such as shipping orders, verifying customer addresses, or managing geographical data.'\n", + " [END ENTITY = 'ADDRESS']\n", + " [END ENTITIES LIST]\n", + "\n", + " Output corresponding text values in the answer for columns where there is an ID. For example, if the column is 'ProductID', output the corresponding 'ProductModel' in the response. Do not include the ID in the response.\n", + " If a user is asking for a comparison, always compare the relevant values in the database.\n", + "\n", + " The target database engine is Microsoft TSQL Server, SQL queries must be able compatible to run on Microsoft TSQL Server. 
\n", + " The following Microsoft TSQL Server Syntax rules must be adhered to.\n", + " Use TOP X to limit the number of rows returned instead of LIMIT X. NEVER USE LIMIT X as it produces a syntax error.\n", + " Always generate the SQL query based on the GetEntitySchema() function output, do not use the chat history data to generate the SQL query.\n", + " Do not use any other entities and columns in your SQL query, other than those defined above. Only use the column names obtained from GetEntitySchema() when constructing a SQL query, do not make up column names.\n", + " You must only provide SELECT SQL queries.\n", + " For a given entity, use the 'select_from_entity' property returned from 'GetEntitySchema()' function in the SELECT FROM part of the SQL query. If the property is {'select_from_entity': 'test_schema.test_table'}, the select statement will be formulated from 'SELECT <VALUES> FROM test_schema.test_table WHERE <CONDITION>.\n", + "\n", + " If you don't know how the value is formatted in a column, run a query against the column to get the unique values that might match your query.\n", + " Some columns returned from 'GetEntitySchema()' may have the properties 'allowed_values' or 'sample_values'. Use these values to determine the possible values that can be used in the SQL query.\n", + "\n", + " The source title to cite is the 'entity_name' property. The source reference is the SQL query used. The source chunk is the result of the SQL query used to answer the user query in Markdown table format. e.g. { 'title': "vProductAndDescription", 'chunk': '| ProductID | Name | ProductModel | Culture | Description |\\n|-----------|-------------------|--------------|---------|----------------------------------|\\n| 101 | Mountain Bike | MT-100 | en | A durable bike for mountain use. |\\n| 102 | Road Bike | RB-200 | en | Lightweight bike for road use. |\\n| 103 | Hybrid Bike | HB-300 | fr | V\u00e9lo hybride pour usage mixte. 
|\\n', 'reference': 'SELECT ProductID, Name, ProductModel, Culture, Description FROM vProductAndDescription WHERE Culture = "en";' }\n", + " [END SQL DATABASE INFORMATION]\n", + " \n", + "\n", + "[END IMPORTANT INFORMATION]\n", + "\n", + "[RESPONSE OUTPUT REQUIREMENTS]\n", + "\n", + " The answer MUST be returned in JSON format as { \"answer\": \"\", \"sources\": [ {\"title\": , \"chunk\": , \"reference\": \"\"}, {\"title\": , \"chunk\": , \"reference\": \"\"} ] }.\n", + "\n", + " The 'answer' property MUST meet the requirements in the ANSWER PROPERTY REQUIREMENTS.\n", + " The 'sources' property MUST meet the requirements in the SOURCES PROPERTY REQUIREMENTS.\n", + "\n", + " Do NOT return anything outside of the provided JSON property.\n", + "\n", + " [ANSWER PROPERTY REQUIREMENTS]\n", + " 1. Language and Tone:\n", + " Use only British English throughout the response.\n", + " Employ a business-friendly language that is professional and easy to understand.\n", + "\n", + " 2. Content Restrictions:\n", + " Do not use any profanity, offensive language, hate speech, or code in the response.\n", + " If you encounter any such content, handle it gracefully by omitting or rephrasing it appropriately.\n", + "\n", + " 3. Information Sources:\n", + " Use only information from the provided functions and specified important information.\n", + " Do not use any external sources or the conversation history for constructing the response.\n", + " In case of conflicting information, prioritize data from the SQL Database as the primary source of truth.\n", + "\n", + " 4. Calculations:\n", + " For any required calculations, use only the values provided in the context.\n", + " Provide a brief, clear explanation of the calculations beneath the results.\n", + "\n", + " 5. 
Response Structure:\n", + " Ensure the response is direct, easy to understand, and well-structured.\n", + " Format the response using Markdown for clarity and readability.\n", + " Use bold sub-headings for clarity where needed. Only use Markdown headings Level 3 (###) and Level 4 (####).\n", + " Use bullet points or numbered lists when appropriate.\n", + " Do not vary the font size within the same sentence.\n", + "\n", + " 6. Citations:\n", + " All factual information used in the answer must be cited with numbered references. For example, [1] should be used to refer to the first source.\n", + " Each citation in the answer must correspond to a single entry in the 'sources' object.\n", + " The same citation and corresponding context chunk may be used multiple times if needed.\n", + " Place the numbered citation at the end of each relevant sentence that uses information from the sources.\n", + " Ensure that each source listed in the 'sources' property is cited at least once in the answer.\n", + " Do not provide a list of definitions from the business glossary; use such information only to enhance the answer contextually.\n", + "\n", + " 7. Citations Format:\n", + " Citations should be embedded within the text, not as a separate list at the end of the 'answer' property.\n", + " [END ANSWER PROPERTY REQUIREMENTS]\n", + "\n", + " [SOURCES PROPERTY REQUIREMENTS]\n", + " 1. Reference Inclusion:\n", + " Include all corresponding references for all cited content in the 'answer' property.\n", + " Place the references in the 'sources' property.\n", + "\n", + " 2. Source Format:\n", + " Each entry in the 'sources' property must be formatted as: {\"title\": \"\", \"chunk\": \"\", \"reference\": \"\"}\n", + " For example, a complete response with two citations would be formatted as: { \"answer\": \"\", \"sources\": [ {\"title\": , \"chunk\": , \"reference\": \"\"}, {\"title\": , \"chunk\": , \"reference\": \"\"} ] }\n", + "\n", + " 3. 
Source Chunk:\n", + " The 'chunk' property should contain a concise, unedited snippet of the relevant context that supports the answer.\n", + "\n", + " 4. Mandatory References:\n", + " Ensure that every citation in the 'answer' has a corresponding entry in the 'sources' property.\n", + " Every entry in the 'sources' property must be cited at least once in the answer.\n", + " [END SOURCES PROPERTY REQUIREMENTS]\n", + "\n", + "[END RESPONSE OUTPUT REQUIREMENTS]\n", + "\n", + "[CONVERSATION HISTORY]\n", + "\n", + " \n", + "\n", + "[END CONVERSATION HISTORY]\n", + "\n", + "What are the different product categories we have? as xml, treating as text, error was: not well-formed (invalid token): line 75, column 78\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=37, prompt_tokens=2621, total_tokens=2658)\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion_base:processing 1 tool calls in parallel.\n", + "INFO:semantic_kernel.kernel:Calling SQL-GetEntitySchema function with args: {\"entity_name\": \"GET ALL CATEGORIES\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema invoking.\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. 
Duration: 0.000754s\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=38, prompt_tokens=2914, total_tokens=2952)\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion_base:processing 1 tool calls in parallel.\n", + "INFO:semantic_kernel.kernel:Calling SQL-RunSQLQuery function with args: {\"sql_query\":\"SELECT ProductCategoryID, ParentProductCategoryName, ProductCategoryName FROM SalesLT.vGetAllCategories;\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-RunSQLQuery invoking.\n", + "INFO:root:Executing SQL Query\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-RunSQLQuery succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. Duration: 0.564825s\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=786, prompt_tokens=3952, total_tokens=4738)\n", + "INFO:semantic_kernel.functions.kernel_function:Function ChatBot-Chat succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. Duration: 10.977118s\n", + "INFO:root:Answer: \n", + "{ \n", + " \"answer\": \"The product categories available in our database are organised under four main parent categories: Accessories, Clothing, Components, and Bikes. 
Below is a detailed list of these categories and their subcategories:\\n\\n### Accessories\\n- Bike Racks\\n- Bike Stands\\n- Bottles and Cages\\n- Cleaners\\n- Fenders\\n- Helmets\\n- Hydration Packs\\n- Lights\\n- Locks\\n- Panniers\\n- Pumps\\n- Tires and Tubes\\n\\n### Clothing\\n- Bib-Shorts\\n- Caps\\n- Gloves\\n- Jerseys\\n- Shorts\\n- Socks\\n- Tights\\n- Vests\\n\\n### Components\\n- Handlebars\\n- Bottom Brackets\\n- Brakes\\n- Chains\\n- Cranksets\\n- Derailleurs\\n- Forks\\n- Headsets\\n- Mountain Frames\\n- Pedals\\n- Road Frames\\n- Saddles\\n- Touring Frames\\n- Wheels\\n\\n### Bikes\\n- Mountain Bikes\\n- Road Bikes\\n- Touring Bikes\\n\", \n", + " \"sources\": [ \n", + " {\n", + " \"title\": \"Get All Categories\", \n", + " \"chunk\": \"| ProductCategoryID | ParentProductCategoryName | ProductCategoryName |\\n|-------------------|----------------------------|---------------------|\\n| 30 | Accessories | Bike Racks |\\n| 31 | Accessories | Bike Stands |\\n| 32 | Accessories | Bottles and Cages |\\n| 33 | Accessories | Cleaners |\\n| 34 | Accessories | Fenders |\\n| 35 | Accessories | Helmets |\\n| 36 | Accessories | Hydration Packs |\\n| 37 | Accessories | Lights |\\n| 38 | Accessories | Locks |\\n| 39 | Accessories | Panniers |\\n| 40 | Accessories | Pumps |\\n| 41 | Accessories | Tires and Tubes |\\n| 22 | Clothing | Bib-Shorts |\\n| 23 | Clothing | Caps |\\n| 24 | Clothing | Gloves |\\n| 25 | Clothing | Jerseys |\\n| 26 | Clothing | Shorts |\\n| 27 | Clothing | Socks |\\n| 28 | Clothing | Tights |\\n| 29 | Clothing | Vests |\\n| 8 | Components | Handlebars |\\n| 9 | Components | Bottom Brackets |\\n| 10 | Components | Brakes |\\n| 11 | Components | Chains |\\n| 12 | Components | Cranksets |\\n| 13 | Components | Derailleurs |\\n| 14 | Components | Forks |\\n| 15 | Components | Headsets |\\n| 16 | Components | Mountain Frames |\\n| 17 | Components | Pedals |\\n| 18 | Components | Road Frames |\\n| 19 | Components | Saddles |\\n| 20 | Components | 
Touring Frames |\\n| 21 | Components | Wheels |\\n| 5 | Bikes | Mountain Bikes |\\n| 6 | Bikes | Road Bikes |\\n| 7 | Bikes | Touring Bikes |\\n\", \n", + " \"reference\": \"SELECT ProductCategoryID, ParentProductCategoryName, ProductCategoryName FROM SalesLT.vGetAllCategories;\" \n", + " } \n", + " ] \n", + "}\n" + ] + }, + { + "data": { + "text/markdown": [ + "The product categories available in our database are organised under four main parent categories: Accessories, Clothing, Components, and Bikes. Below is a detailed list of these categories and their subcategories:\n", + "\n", + "### Accessories\n", + "- Bike Racks\n", + "- Bike Stands\n", + "- Bottles and Cages\n", + "- Cleaners\n", + "- Fenders\n", + "- Helmets\n", + "- Hydration Packs\n", + "- Lights\n", + "- Locks\n", + "- Panniers\n", + "- Pumps\n", + "- Tires and Tubes\n", + "\n", + "### Clothing\n", + "- Bib-Shorts\n", + "- Caps\n", + "- Gloves\n", + "- Jerseys\n", + "- Shorts\n", + "- Socks\n", + "- Tights\n", + "- Vests\n", + "\n", + "### Components\n", + "- Handlebars\n", + "- Bottom Brackets\n", + "- Brakes\n", + "- Chains\n", + "- Cranksets\n", + "- Derailleurs\n", + "- Forks\n", + "- Headsets\n", + "- Mountain Frames\n", + "- Pedals\n", + "- Road Frames\n", + "- Saddles\n", + "- Touring Frames\n", + "- Wheels\n", + "\n", + "### Bikes\n", + "- Mountain Bikes\n", + "- Road Bikes\n", + "- Touring Bikes\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "await ask_question(\"What are the different product categories we have?\", history)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Question: What is the top performing product by quantity of units sold?\n", + "INFO:semantic_kernel.functions.kernel_function:Function ChatBot-Chat invoking.\n", + "INFO:semantic_kernel.contents.chat_history:Could not parse prompt \n", 
+ "As a senior analyst, your primary responsibility is to provide precise and thorough answers to the user's queries. Utilize all the provided functions to craft your responses. You must deliver detailed and accurate final answers with clear explanations and actionable insights.\n", + "\n", + "Always use the provided functions to obtain key information in order to answer the question.\n", + "If you are asked to use always use a function, you must use that function to compliment the answer.\n", + "Always use multiple functions to formulate the answer.\n", + "Always execute multiple functions in parallel to compliment the results.\n", + "\n", + "The response to the user must meet the requirements in RESPONSE OUTPUT REQUIREMENTS.\n", + "IMPORTANT INFORMATION contains useful information that you can use to aid your knowledge.\n", + "CONVERSATION HISTORY contains the previous question and answer pairs in the conversation in JSON format. Do not use this information to answer the question, but to provide context on what was asked previously.\n", + "\n", + "[IMPORTANT INFORMATION]\n", + "\n", + "\n", + " [SQL DATABASE INFORMATION]\n", + " Use the names and descriptions of Microsoft TSQL Server entities provided in ENTITIES LIST to decide which entities to query if you need to retrieve information from the database. Use the 'GetEntitySchema()' function to get more details of the schema of the view you want to query. Use the 'RunSQLQuery()' function to run the SQL query against the database.\n", + "\n", + " You must always examine the provided Microsoft TSQL Server entity descriptions to determine if they can answer the question.\n", + "\n", + " [BEGIN ENTITIES LIST]\n", + " [BEGIN ENTITY = 'GET ALL CATEGORIES']\n", + " Name='Get All Categories'\n", + " Description='This view provides a comprehensive list of all product categories and their corresponding subcategories in the SalesLT schema of the AdventureWorksLT database. 
It is used to understand the hierarchical structure of product categories, facilitating product organization and categorization. Use this view to retrieve information about product categories and subcategories. It is useful for scenarios where product categorization is required, such as generating reports based on product categories or filtering products by category.'\n", + " [END ENTITY = 'GET ALL CATEGORIES']\n", + "\n", + " [BEGIN ENTITY = 'PRODUCT AND DESCRIPTION']\n", + " Name='Product and Description'\n", + " Description='This view provides detailed information about products, including their names, associated product models, descriptions, and the specific culture or language of the description. It is useful for understanding product details and translating product descriptions for different cultures. Use this view when you need comprehensive details about products, including their descriptions in different languages. This view is particularly useful for multilingual product catalogs or when creating localized content.'\n", + " [END ENTITY = 'PRODUCT AND DESCRIPTION']\n", + "\n", + " [BEGIN ENTITY = 'PRODUCT MODEL CATALOG DESCRIPTION']\n", + " Name='Product Model Catalog Description'\n", + " Description='This view provides detailed catalog information about product models, including descriptions, manufacturing details, warranty information, and specifications related to product design and features. It is useful for generating comprehensive product catalogs and providing detailed product information to customers. Use this view when you need to retrieve detailed product model descriptions, manufacturing information, and specifications. 
It is particularly useful for creating product catalogs, detailing product features, and providing warranty information.'\n", + " [END ENTITY = 'PRODUCT MODEL CATALOG DESCRIPTION']\n", + "\n", + " [BEGIN ENTITY = 'SALES ORDER DETAIL']\n", + " Name='Sales Order Detail'\n", + " Description='This table stores detailed information about sales order tickets, including the order details, customer information, order status, and timestamps. It is used to manage and track sales orders throughout the order lifecycle, from creation to fulfillment. Use this table to retrieve or store information related to individual sales order tickets. This is applicable when processing sales orders, managing customer transactions, or tracking order fulfillment status.'\n", + " [END ENTITY = 'SALES ORDER DETAIL']\n", + "\n", + " [BEGIN ENTITY = 'SALES ORDER HEADER']\n", + " Name='Sales Order Header'\n", + " Description='This table contains high-level information about sales orders, including order dates, customer details, shipping information, and order status. It is used to manage and track sales orders from initiation to fulfillment. Use this table to retrieve or store information related to the overall details of a sales order. It is applicable when you need to track order status, manage order dates, or relate orders to customers and shipping information.'\n", + " [END ENTITY = 'SALES ORDER HEADER']\n", + "\n", + " [BEGIN ENTITY = 'ADDRESS']\n", + " Name='Address'\n", + " Description='This table stores address information for customers, including street addresses, city, state, postal code, and country/region. It is used to maintain contact and shipping information for orders, as well as to manage customer locations. Use this table to retrieve or store address details for customers, shipping locations, or billing addresses. 
It is applicable in scenarios where location information is required, such as shipping orders, verifying customer addresses, or managing geographical data.'\n", + " [END ENTITY = 'ADDRESS']\n", + " [END ENTITIES LIST]\n", + "\n", + " Output corresponding text values in the answer for columns where there is an ID. For example, if the column is 'ProductID', output the corresponding 'ProductModel' in the response. Do not include the ID in the response.\n", + " If a user is asking for a comparison, always compare the relevant values in the database.\n", + "\n", + " The target database engine is Microsoft TSQL Server, SQL queries must be able compatible to run on Microsoft TSQL Server. \n", + " The following Microsoft TSQL Server Syntax rules must be adhered to.\n", + " Use TOP X to limit the number of rows returned instead of LIMIT X. NEVER USE LIMIT X as it produces a syntax error.\n", + " Always generate the SQL query based on the GetEntitySchema() function output, do not use the chat history data to generate the SQL query.\n", + " Do not use any other entities and columns in your SQL query, other than those defined above. Only use the column names obtained from GetEntitySchema() when constructing a SQL query, do not make up column names.\n", + " You must only provide SELECT SQL queries.\n", + " For a given entity, use the 'select_from_entity' property returned from 'GetEntitySchema()' function in the SELECT FROM part of the SQL query. If the property is {'select_from_entity': 'test_schema.test_table'}, the select statement will be formulated from 'SELECT <VALUES> FROM test_schema.test_table WHERE <CONDITION>.\n", + "\n", + " If you don't know how the value is formatted in a column, run a query against the column to get the unique values that might match your query.\n", + " Some columns returned from 'GetEntitySchema()' may have the properties 'allowed_values' or 'sample_values'. 
Use these values to determine the possible values that can be used in the SQL query.\n", + "\n", + " The source title to cite is the 'entity_name' property. The source reference is the SQL query used. The source chunk is the result of the SQL query used to answer the user query in Markdown table format. e.g. { 'title': "vProductAndDescription", 'chunk': '| ProductID | Name | ProductModel | Culture | Description |\\n|-----------|-------------------|--------------|---------|----------------------------------|\\n| 101 | Mountain Bike | MT-100 | en | A durable bike for mountain use. |\\n| 102 | Road Bike | RB-200 | en | Lightweight bike for road use. |\\n| 103 | Hybrid Bike | HB-300 | fr | V\u00e9lo hybride pour usage mixte. |\\n', 'reference': 'SELECT ProductID, Name, ProductModel, Culture, Description FROM vProductAndDescription WHERE Culture = "en";' }\n", + " [END SQL DATABASE INFORMATION]\n", + " \n", + "\n", + "[END IMPORTANT INFORMATION]\n", + "\n", + "[RESPONSE OUTPUT REQUIREMENTS]\n", + "\n", + " The answer MUST be returned in JSON format as { \"answer\": \"\", \"sources\": [ {\"title\": , \"chunk\": , \"reference\": \"\"}, {\"title\": , \"chunk\": , \"reference\": \"\"} ] }.\n", + "\n", + " The 'answer' property MUST meet the requirements in the ANSWER PROPERTY REQUIREMENTS.\n", + " The 'sources' property MUST meet the requirements in the SOURCES PROPERTY REQUIREMENTS.\n", + "\n", + " Do NOT return anything outside of the provided JSON property.\n", + "\n", + " [ANSWER PROPERTY REQUIREMENTS]\n", + " 1. Language and Tone:\n", + " Use only British English throughout the response.\n", + " Employ a business-friendly language that is professional and easy to understand.\n", + "\n", + " 2. Content Restrictions:\n", + " Do not use any profanity, offensive language, hate speech, or code in the response.\n", + " If you encounter any such content, handle it gracefully by omitting or rephrasing it appropriately.\n", + "\n", + " 3. 
Information Sources:\n", + " Use only information from the provided functions and specified important information.\n", + " Do not use any external sources or the conversation history for constructing the response.\n", + " In case of conflicting information, prioritize data from the SQL Database as the primary source of truth.\n", + "\n", + " 4. Calculations:\n", + " For any required calculations, use only the values provided in the context.\n", + " Provide a brief, clear explanation of the calculations beneath the results.\n", + "\n", + " 5. Response Structure:\n", + " Ensure the response is direct, easy to understand, and well-structured.\n", + " Format the response using Markdown for clarity and readability.\n", + " Use bold sub-headings for clarity where needed. Only use Markdown headings Level 3 (###) and Level 4 (####).\n", + " Use bullet points or numbered lists when appropriate.\n", + " Do not vary the font size within the same sentence.\n", + "\n", + " 6. Citations:\n", + " All factual information used in the answer must be cited with numbered references. For example, [1] should be used to refer to the first source.\n", + " Each citation in the answer must correspond to a single entry in the 'sources' object.\n", + " The same citation and corresponding context chunk may be used multiple times if needed.\n", + " Place the numbered citation at the end of each relevant sentence that uses information from the sources.\n", + " Ensure that each source listed in the 'sources' property is cited at least once in the answer.\n", + " Do not provide a list of definitions from the business glossary; use such information only to enhance the answer contextually.\n", + "\n", + " 7. Citations Format:\n", + " Citations should be embedded within the text, not as a separate list at the end of the 'answer' property.\n", + " [END ANSWER PROPERTY REQUIREMENTS]\n", + "\n", + " [SOURCES PROPERTY REQUIREMENTS]\n", + " 1. 
Reference Inclusion:\n", + " Include all corresponding references for all cited content in the 'answer' property.\n", + " Place the references in the 'sources' property.\n", + "\n", + " 2. Source Format:\n", + " Each entry in the 'sources' property must be formatted as: {\"title\": \"\", \"chunk\": \"\", \"reference\": \"\"}\n", + " For example, a complete response with two citations would be formatted as: { \"answer\": \"\", \"sources\": [ {\"title\": , \"chunk\": , \"reference\": \"\"}, {\"title\": , \"chunk\": , \"reference\": \"\"} ] }\n", + "\n", + " 3. Source Chunk:\n", + " The 'chunk' property should contain a concise, unedited snippet of the relevant context that supports the answer.\n", + "\n", + " 4. Mandatory References:\n", + " Ensure that every citation in the 'answer' has a corresponding entry in the 'sources' property.\n", + " Every entry in the 'sources' property must be cited at least once in the answer.\n", + " [END SOURCES PROPERTY REQUIREMENTS]\n", + "\n", + "[END RESPONSE OUTPUT REQUIREMENTS]\n", + "\n", + "[CONVERSATION HISTORY]\n", + "\n", + " What are the different product categories we have?\n", + "\n", + "[END CONVERSATION HISTORY]\n", + "\n", + "What is the top performing product by quantity of units sold? 
as xml, treating as text, error was: not well-formed (invalid token): line 75, column 78\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=57, prompt_tokens=2653, total_tokens=2710)\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion_base:processing 2 tool calls in parallel.\n", + "INFO:semantic_kernel.kernel:Calling SQL-GetEntitySchema function with args: {\"entity_name\": \"SALES ORDER DETAIL\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema invoking.\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. Duration: 0.001386s\n", + "INFO:semantic_kernel.kernel:Calling SQL-GetEntitySchema function with args: {\"entity_name\": \"PRODUCT AND DESCRIPTION\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema invoking.\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. 
Duration: 0.000964s\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=50, prompt_tokens=3507, total_tokens=3557)\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion_base:processing 1 tool calls in parallel.\n", + "INFO:semantic_kernel.kernel:Calling SQL-RunSQLQuery function with args: {\"sql_query\":\"SELECT TOP 1 ProductID, SUM(OrderQty) AS TotalUnitsSold FROM SalesLT.SalesOrderDetail GROUP BY ProductID ORDER BY TotalUnitsSold DESC;\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-RunSQLQuery invoking.\n", + "INFO:root:Executing SQL Query\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-RunSQLQuery succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. Duration: 0.261263s\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=36, prompt_tokens=3584, total_tokens=3620)\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion_base:processing 1 tool calls in parallel.\n", + "INFO:semantic_kernel.kernel:Calling SQL-RunSQLQuery function with args: {\"sql_query\":\"SELECT Name, ProductModel FROM SalesLT.vProductAndDescription WHERE ProductID = 864;\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-RunSQLQuery invoking.\n", + "INFO:root:Executing SQL Query\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-RunSQLQuery succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. 
Duration: 1.947046s\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=191, prompt_tokens=3735, total_tokens=3926)\n", + "INFO:semantic_kernel.functions.kernel_function:Function ChatBot-Chat succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. Duration: 6.849564s\n", + "INFO:root:Answer: { \"answer\": \"The top-performing product by quantity of units sold is the **Classic Vest, S** from the **Classic Vest** product model, with a total of 87 units sold [1][2].\", \"sources\": [ { \"title\": \"Sales Order Detail\", \"chunk\": \"| ProductID | TotalUnitsSold |\\n|-----------|----------------|\\n| 864 | 87 |\\n\", \"reference\": \"SELECT TOP 1 ProductID, SUM(OrderQty) AS TotalUnitsSold FROM SalesLT.SalesOrderDetail GROUP BY ProductID ORDER BY TotalUnitsSold DESC;\" }, { \"title\": \"Product and Description\", \"chunk\": \"| Name | ProductModel |\\n|----------------|---------------|\\n| Classic Vest, S| Classic Vest |\\n\", \"reference\": \"SELECT Name, ProductModel FROM SalesLT.vProductAndDescription WHERE ProductID = 864;\" } ] }\n" + ] + }, + { + "data": { + "text/markdown": [ + "The top-performing product by quantity of units sold is the **Classic Vest, S** from the **Classic Vest** product model, with a total of 87 units sold [1][2]." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "await ask_question(\"What is the top performing product by quantity of units sold?\", history)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Question: Which country did we sell the most to in June 2008?\n", + "INFO:semantic_kernel.functions.kernel_function:Function ChatBot-Chat invoking.\n", + "INFO:semantic_kernel.contents.chat_history:Could not parse prompt \n", + "As a senior analyst, your primary responsibility is to provide precise and thorough answers to the user's queries. Utilize all the provided functions to craft your responses. You must deliver detailed and accurate final answers with clear explanations and actionable insights.\n", + "\n", + "Always use the provided functions to obtain key information in order to answer the question.\n", + "If you are asked to use always use a function, you must use that function to compliment the answer.\n", + "Always use multiple functions to formulate the answer.\n", + "Always execute multiple functions in parallel to compliment the results.\n", + "\n", + "The response to the user must meet the requirements in RESPONSE OUTPUT REQUIREMENTS.\n", + "IMPORTANT INFORMATION contains useful information that you can use to aid your knowledge.\n", + "CONVERSATION HISTORY contains the previous question and answer pairs in the conversation in JSON format. Do not use this information to answer the question, but to provide context on what was asked previously.\n", + "\n", + "[IMPORTANT INFORMATION]\n", + "\n", + "\n", + " [SQL DATABASE INFORMATION]\n", + " Use the names and descriptions of Microsoft TSQL Server entities provided in ENTITIES LIST to decide which entities to query if you need to retrieve information from the database. 
Use the 'GetEntitySchema()' function to get more details of the schema of the view you want to query. Use the 'RunSQLQuery()' function to run the SQL query against the database.\n", + "\n", + " You must always examine the provided Microsoft TSQL Server entity descriptions to determine if they can answer the question.\n", + "\n", + " [BEGIN ENTITIES LIST]\n", + " [BEGIN ENTITY = 'GET ALL CATEGORIES']\n", + " Name='Get All Categories'\n", + " Description='This view provides a comprehensive list of all product categories and their corresponding subcategories in the SalesLT schema of the AdventureWorksLT database. It is used to understand the hierarchical structure of product categories, facilitating product organization and categorization. Use this view to retrieve information about product categories and subcategories. It is useful for scenarios where product categorization is required, such as generating reports based on product categories or filtering products by category.'\n", + " [END ENTITY = 'GET ALL CATEGORIES']\n", + "\n", + " [BEGIN ENTITY = 'PRODUCT AND DESCRIPTION']\n", + " Name='Product and Description'\n", + " Description='This view provides detailed information about products, including their names, associated product models, descriptions, and the specific culture or language of the description. It is useful for understanding product details and translating product descriptions for different cultures. Use this view when you need comprehensive details about products, including their descriptions in different languages. 
This view is particularly useful for multilingual product catalogs or when creating localized content.'\n", + " [END ENTITY = 'PRODUCT AND DESCRIPTION']\n", + "\n", + " [BEGIN ENTITY = 'PRODUCT MODEL CATALOG DESCRIPTION']\n", + " Name='Product Model Catalog Description'\n", + " Description='This view provides detailed catalog information about product models, including descriptions, manufacturing details, warranty information, and specifications related to product design and features. It is useful for generating comprehensive product catalogs and providing detailed product information to customers. Use this view when you need to retrieve detailed product model descriptions, manufacturing information, and specifications. It is particularly useful for creating product catalogs, detailing product features, and providing warranty information.'\n", + " [END ENTITY = 'PRODUCT MODEL CATALOG DESCRIPTION']\n", + "\n", + " [BEGIN ENTITY = 'SALES ORDER DETAIL']\n", + " Name='Sales Order Detail'\n", + " Description='This table stores detailed information about sales order tickets, including the order details, customer information, order status, and timestamps. It is used to manage and track sales orders throughout the order lifecycle, from creation to fulfillment. Use this table to retrieve or store information related to individual sales order tickets. This is applicable when processing sales orders, managing customer transactions, or tracking order fulfillment status.'\n", + " [END ENTITY = 'SALES ORDER DETAIL']\n", + "\n", + " [BEGIN ENTITY = 'SALES ORDER HEADER']\n", + " Name='Sales Order Header'\n", + " Description='This table contains high-level information about sales orders, including order dates, customer details, shipping information, and order status. It is used to manage and track sales orders from initiation to fulfillment. Use this table to retrieve or store information related to the overall details of a sales order. 
It is applicable when you need to track order status, manage order dates, or relate orders to customers and shipping information.'\n", + " [END ENTITY = 'SALES ORDER HEADER']\n", + "\n", + " [BEGIN ENTITY = 'ADDRESS']\n", + " Name='Address'\n", + " Description='This table stores address information for customers, including street addresses, city, state, postal code, and country/region. It is used to maintain contact and shipping information for orders, as well as to manage customer locations. Use this table to retrieve or store address details for customers, shipping locations, or billing addresses. It is applicable in scenarios where location information is required, such as shipping orders, verifying customer addresses, or managing geographical data.'\n", + " [END ENTITY = 'ADDRESS']\n", + " [END ENTITIES LIST]\n", + "\n", + " Output corresponding text values in the answer for columns where there is an ID. For example, if the column is 'ProductID', output the corresponding 'ProductModel' in the response. Do not include the ID in the response.\n", + " If a user is asking for a comparison, always compare the relevant values in the database.\n", + "\n", + " The target database engine is Microsoft TSQL Server, SQL queries must be able compatible to run on Microsoft TSQL Server. \n", + " The following Microsoft TSQL Server Syntax rules must be adhered to.\n", + " Use TOP X to limit the number of rows returned instead of LIMIT X. NEVER USE LIMIT X as it produces a syntax error.\n", + " Always generate the SQL query based on the GetEntitySchema() function output, do not use the chat history data to generate the SQL query.\n", + " Do not use any other entities and columns in your SQL query, other than those defined above. 
Only use the column names obtained from GetEntitySchema() when constructing a SQL query, do not make up column names.\n", + " You must only provide SELECT SQL queries.\n", + " For a given entity, use the 'select_from_entity' property returned from 'GetEntitySchema()' function in the SELECT FROM part of the SQL query. If the property is {'select_from_entity': 'test_schema.test_table'}, the select statement will be formulated from 'SELECT <VALUES> FROM test_schema.test_table WHERE <CONDITION>.\n", + "\n", + " If you don't know how the value is formatted in a column, run a query against the column to get the unique values that might match your query.\n", + " Some columns returned from 'GetEntitySchema()' may have the properties 'allowed_values' or 'sample_values'. Use these values to determine the possible values that can be used in the SQL query.\n", + "\n", + " The source title to cite is the 'entity_name' property. The source reference is the SQL query used. The source chunk is the result of the SQL query used to answer the user query in Markdown table format. e.g. { 'title': "vProductAndDescription", 'chunk': '| ProductID | Name | ProductModel | Culture | Description |\\n|-----------|-------------------|--------------|---------|----------------------------------|\\n| 101 | Mountain Bike | MT-100 | en | A durable bike for mountain use. |\\n| 102 | Road Bike | RB-200 | en | Lightweight bike for road use. |\\n| 103 | Hybrid Bike | HB-300 | fr | V\u00e9lo hybride pour usage mixte. 
|\\n', 'reference': 'SELECT ProductID, Name, ProductModel, Culture, Description FROM vProductAndDescription WHERE Culture = "en";' }\n", + " [END SQL DATABASE INFORMATION]\n", + " \n", + "\n", + "[END IMPORTANT INFORMATION]\n", + "\n", + "[RESPONSE OUTPUT REQUIREMENTS]\n", + "\n", + " The answer MUST be returned in JSON format as { \"answer\": \"\", \"sources\": [ {\"title\": , \"chunk\": , \"reference\": \"\"}, {\"title\": , \"chunk\": , \"reference\": \"\"} ] }.\n", + "\n", + " The 'answer' property MUST meet the requirements in the ANSWER PROPERTY REQUIREMENTS.\n", + " The 'sources' property MUST meet the requirements in the SOURCES PROPERTY REQUIREMENTS.\n", + "\n", + " Do NOT return anything outside of the provided JSON property.\n", + "\n", + " [ANSWER PROPERTY REQUIREMENTS]\n", + " 1. Language and Tone:\n", + " Use only British English throughout the response.\n", + " Employ a business-friendly language that is professional and easy to understand.\n", + "\n", + " 2. Content Restrictions:\n", + " Do not use any profanity, offensive language, hate speech, or code in the response.\n", + " If you encounter any such content, handle it gracefully by omitting or rephrasing it appropriately.\n", + "\n", + " 3. Information Sources:\n", + " Use only information from the provided functions and specified important information.\n", + " Do not use any external sources or the conversation history for constructing the response.\n", + " In case of conflicting information, prioritize data from the SQL Database as the primary source of truth.\n", + "\n", + " 4. Calculations:\n", + " For any required calculations, use only the values provided in the context.\n", + " Provide a brief, clear explanation of the calculations beneath the results.\n", + "\n", + " 5. 
Response Structure:\n", + " Ensure the response is direct, easy to understand, and well-structured.\n", + " Format the response using Markdown for clarity and readability.\n", + " Use bold sub-headings for clarity where needed. Only use Markdown headings Level 3 (###) and Level 4 (####).\n", + " Use bullet points or numbered lists when appropriate.\n", + " Do not vary the font size within the same sentence.\n", + "\n", + " 6. Citations:\n", + " All factual information used in the answer must be cited with numbered references. For example, [1] should be used to refer to the first source.\n", + " Each citation in the answer must correspond to a single entry in the 'sources' object.\n", + " The same citation and corresponding context chunk may be used multiple times if needed.\n", + " Place the numbered citation at the end of each relevant sentence that uses information from the sources.\n", + " Ensure that each source listed in the 'sources' property is cited at least once in the answer.\n", + " Do not provide a list of definitions from the business glossary; use such information only to enhance the answer contextually.\n", + "\n", + " 7. Citations Format:\n", + " Citations should be embedded within the text, not as a separate list at the end of the 'answer' property.\n", + " [END ANSWER PROPERTY REQUIREMENTS]\n", + "\n", + " [SOURCES PROPERTY REQUIREMENTS]\n", + " 1. Reference Inclusion:\n", + " Include all corresponding references for all cited content in the 'answer' property.\n", + " Place the references in the 'sources' property.\n", + "\n", + " 2. Source Format:\n", + " Each entry in the 'sources' property must be formatted as: {\"title\": \"\", \"chunk\": \"\", \"reference\": \"\"}\n", + " For example, a complete response with two citations would be formatted as: { \"answer\": \"\", \"sources\": [ {\"title\": , \"chunk\": , \"reference\": \"\"}, {\"title\": , \"chunk\": , \"reference\": \"\"} ] }\n", + "\n", + " 3. 
Source Chunk:\n", + " The 'chunk' property should contain a concise, unedited snippet of the relevant context that supports the answer.\n", + "\n", + " 4. Mandatory References:\n", + " Ensure that every citation in the 'answer' has a corresponding entry in the 'sources' property.\n", + " Every entry in the 'sources' property must be cited at least once in the answer.\n", + " [END SOURCES PROPERTY REQUIREMENTS]\n", + "\n", + "[END RESPONSE OUTPUT REQUIREMENTS]\n", + "\n", + "[CONVERSATION HISTORY]\n", + "\n", + " What are the different product categories we have?What is the top performing product by quantity of units sold?\n", + "\n", + "[END CONVERSATION HISTORY]\n", + "\n", + "Which country did we sell the most to in June 2008? as xml, treating as text, error was: not well-formed (invalid token): line 75, column 78\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=55, prompt_tokens=2684, total_tokens=2739)\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion_base:processing 2 tool calls in parallel.\n", + "INFO:semantic_kernel.kernel:Calling SQL-GetEntitySchema function with args: {\"entity_name\": \"SALES ORDER HEADER\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema invoking.\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. 
Duration: 0.001063s\n", + "INFO:semantic_kernel.kernel:Calling SQL-GetEntitySchema function with args: {\"entity_name\": \"ADDRESS\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema invoking.\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-GetEntitySchema succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. Duration: 0.000960s\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=108, prompt_tokens=4086, total_tokens=4194)\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion_base:processing 1 tool calls in parallel.\n", + "INFO:semantic_kernel.kernel:Calling SQL-RunSQLQuery function with args: {\"sql_query\":\"SELECT a.CountryRegion, SUM(sod.OrderQty) AS TotalQuantity FROM SalesLT.SalesOrderHeader soh JOIN SalesLT.SalesOrderDetail sod ON soh.SalesOrderID = sod.SalesOrderID JOIN SalesLT.Address a ON soh.ShipToAddressID = a.AddressID WHERE soh.OrderDate BETWEEN '2008-06-01' AND '2008-06-30' GROUP BY a.CountryRegion ORDER BY TotalQuantity DESC;\"}\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-RunSQLQuery invoking.\n", + "INFO:root:Executing SQL Query\n", + "INFO:semantic_kernel.functions.kernel_function:Function SQL-RunSQLQuery succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. 
Duration: 0.248642s\n", + "INFO:httpx:HTTP Request: POST https://open-ai-gpt-001.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 \"HTTP/1.1 200 OK\"\n", + "INFO:semantic_kernel.connectors.ai.open_ai.services.open_ai_handler:OpenAI usage: CompletionUsage(completion_tokens=205, prompt_tokens=4237, total_tokens=4442)\n", + "INFO:semantic_kernel.functions.kernel_function:Function ChatBot-Chat succeeded.\n", + "INFO:semantic_kernel.functions.kernel_function:Function completed. Duration: 5.787196s\n", + "INFO:root:Answer: \n", + "{ \n", + " \"answer\": \"In June 2008, the country to which we sold the most was the United Kingdom, with a total of 1,172 units sold [1].\", \n", + " \"sources\": [ \n", + " {\n", + " \"title\": \"Sales Order Header\", \n", + " \"chunk\": \"| CountryRegion | TotalQuantity |\\n|----------------|---------------|\\n| United Kingdom | 1172 |\\n| United States | 915 |\\n\", \n", + " \"reference\": \"SELECT a.CountryRegion, SUM(sod.OrderQty) AS TotalQuantity FROM SalesLT.SalesOrderHeader soh JOIN SalesLT.SalesOrderDetail sod ON soh.SalesOrderID = sod.SalesOrderID JOIN SalesLT.Address a ON soh.ShipToAddressID = a.AddressID WHERE soh.OrderDate BETWEEN '2008-06-01' AND '2008-06-30' GROUP BY a.CountryRegion ORDER BY TotalQuantity DESC;\"\n", + " } \n", + " ] \n", + "}\n" + ] + }, + { + "data": { + "text/markdown": [ + "In June 2008, the country to which we sold the most was the United Kingdom, with a total of 1,172 units sold [1]." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "await ask_question(\"Which country did we sell the most to in June 2008?\", history)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/text2sql/requirements.txt b/text2sql/requirements.txt index e69de29..9a73c72 100644 --- a/text2sql/requirements.txt +++ b/text2sql/requirements.txt @@ -0,0 +1,6 @@ +semantic-kernel==1.8.0 +azure-search +azure-search-documents==11.6.0b3 +aioodbc +azure-identity +python-dotenv