From 027a7ec3120bf20e639ad10d49ef065a96314f10 Mon Sep 17 00:00:00 2001 From: "David L. Qiu" Date: Wed, 16 Oct 2024 09:42:40 -0700 Subject: [PATCH 1/2] use `RecursiveJsonSplitter` when learning JSON files --- packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index e0c6139c0..1de49fe63 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -27,6 +27,7 @@ MarkdownTextSplitter, PythonCodeTextSplitter, RecursiveCharacterTextSplitter, + RecursiveJsonSplitter, ) from langchain_community.vectorstores import FAISS @@ -240,6 +241,7 @@ async def learn_dir( ".md": MarkdownTextSplitter(**splitter_kwargs), ".tex": LatexTextSplitter(**splitter_kwargs), ".ipynb": NotebookSplitter(**splitter_kwargs), + ".json": RecursiveJsonSplitter(**splitter_kwargs), } splitter = ExtensionSplitter( splitters=splitters, From 7be57ef7aaf101f8e95fefcd7e389682d925a8f7 Mon Sep 17 00:00:00 2001 From: "David L. Qiu" Date: Wed, 16 Oct 2024 14:53:11 -0700 Subject: [PATCH 2/2] do not pass unsupported arguments to `RecursiveJsonSplitter` --- packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index 1de49fe63..8b1c5c104 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -84,7 +84,7 @@ def __init__(self, *args, **kwargs): action="store", default=DEFAULT_CHUNK_SIZE, type=int, - help="Max number of characters in chunk", + help="Maximum number of characters per chunk. This argument is not supported on JSON files.", ) self.parser.add_argument( "-o", @@ -92,7 +92,7 @@ def __init__(self, *args, **kwargs): action="store", default=DEFAULT_CHUNK_OVERLAP, type=int, - help="Number of characters overlapping between chunks, helpful to ensure text is not split mid-word or mid-sentence", + help="Number of characters that may overlap between chunks, which can help ensure a document is not split mid-word or mid-sentence. This argument is not supported on JSON files.", ) self.parser.add_argument("path", nargs=argparse.REMAINDER) self.index_name = "default" @@ -241,7 +241,7 @@ async def learn_dir( ".md": MarkdownTextSplitter(**splitter_kwargs), ".tex": LatexTextSplitter(**splitter_kwargs), ".ipynb": NotebookSplitter(**splitter_kwargs), - ".json": RecursiveJsonSplitter(**splitter_kwargs), + ".json": RecursiveJsonSplitter(), } splitter = ExtensionSplitter( splitters=splitters,