feat: replace milvus-lite with nano-vectordb as the default vector storage

gusye1234 · gusye1234 · commit 2ae85476e34a · 2024-08-21T16:54:54.000+08:00
diff --git a/examples/using_milvus_as_vectorDB.py b/examples/using_milvus_as_vectorDB.py
@@ -0,0 +1,94 @@
+import os
+import asyncio
+import numpy as np
+from nano_graphrag import GraphRAG, QueryParam
+from nano_graphrag._utils import logger
+from nano_graphrag.base import BaseVectorStorage
+from dataclasses import dataclass
+
+
+@dataclass
+class MilvusLiteStorge(BaseVectorStorage):
+
+    @staticmethod
+    def create_collection_if_not_exist(client, collection_name: str, **kwargs):
+        if client.has_collection(collection_name):
+            return
+        # TODO: move the ID max length (32) into a named constant
+        client.create_collection(
+            collection_name, max_length=32, id_type="string", **kwargs
+        )
+
+    def __post_init__(self):
+        from pymilvus import MilvusClient
+
+        self._client_file_name = os.path.join(
+            self.global_config["working_dir"], "milvus_lite.db"
+        )
+        self._client = MilvusClient(self._client_file_name)
+        self._max_batch_size = self.global_config["embedding_batch_num"]
+        MilvusLiteStorge.create_collection_if_not_exist(
+            self._client,
+            self.namespace,
+            dimension=self.embedding_func.embedding_dim,
+        )
+
+    async def upsert(self, data: dict[str, dict]):
+        logger.info(f"Inserting {len(data)} vectors to {self.namespace}")
+        list_data = [
+            {
+                "id": k,
+                **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
+            }
+            for k, v in data.items()
+        ]
+        contents = [v["content"] for v in data.values()]
+        batches = [
+            contents[i : i + self._max_batch_size]
+            for i in range(0, len(contents), self._max_batch_size)
+        ]
+        embeddings_list = await asyncio.gather(
+            *[self.embedding_func(batch) for batch in batches]
+        )
+        embeddings = np.concatenate(embeddings_list)
+        for i, d in enumerate(list_data):
+            d["vector"] = embeddings[i]
+        results = self._client.upsert(collection_name=self.namespace, data=list_data)
+        return results
+
+    async def query(self, query, top_k=5):
+        embedding = await self.embedding_func([query])
+        results = self._client.search(
+            collection_name=self.namespace,
+            data=embedding,
+            limit=top_k,
+            output_fields=list(self.meta_fields),
+            search_params={"metric_type": "COSINE", "params": {"radius": 0.2}},
+        )
+        return [
+            {**dp["entity"], "id": dp["id"], "distance": dp["distance"]}
+            for dp in results[0]
+        ]
+
+
+def insert():
+    data = ["YOUR TEXT DATA HERE", "YOUR TEXT DATA HERE"]
+    rag = GraphRAG(
+        working_dir="./nano_graphrag_cache_milvus_TEST",
+        enable_llm_cache=True,
+        vector_db_storage_cls=MilvusLiteStorge,
+    )
+    rag.insert(data)
+
+
+def query():
+    rag = GraphRAG(
+        working_dir="./nano_graphrag_cache_milvus_TEST",
+        enable_llm_cache=True,
+        vector_db_storage_cls=MilvusLiteStorge,
+    )
+    print(rag.query("YOUR QUERY HERE", param=QueryParam(mode="local")))
+
+
+insert()
+query()
diff --git a/nano_graphrag/__init__.py b/nano_graphrag/__init__.py
@@ -1,6 +1,6 @@
 from .graphrag import GraphRAG, QueryParam
 
-__version__ = "0.0.3"
+__version__ = "0.0.4.dev"
 __author__ = "Jianbai Ye"
 __url__ = "https://github.com/gusye1234/nano-graphrag"
 
diff --git a/nano_graphrag/_storage.py b/nano_graphrag/_storage.py
@@ -8,7 +8,7 @@
 
 import networkx as nx
 import numpy as np
-from pymilvus import MilvusClient
+from nano_vectordb import NanoVectorDB
 
 from ._utils import load_json, logger, write_json
 from .base import (
@@ -62,37 +62,23 @@ async def drop(self):
 
 
 @dataclass
-class MilvusLiteStorge(BaseVectorStorage):
-
-    @staticmethod
-    def create_collection_if_not_exist(
-        client: "MilvusClient", collection_name: str, **kwargs
-    ):
-        if client.has_collection(collection_name):
-            return
-        # TODO add constants for ID max length to 32
-        client.create_collection(
-            collection_name, max_length=32, id_type="string", **kwargs
-        )
+class NanoVectorDBStorage(BaseVectorStorage):
 
     def __post_init__(self):
 
         self._client_file_name = os.path.join(
-            self.global_config["working_dir"], "milvus_lite.db"
+            self.global_config["working_dir"], f"vdb_{self.namespace}.json"
         )
-        self._client = MilvusClient(self._client_file_name)
         self._max_batch_size = self.global_config["embedding_batch_num"]
-        MilvusLiteStorge.create_collection_if_not_exist(
-            self._client,
-            self.namespace,
-            dimension=self.embedding_func.embedding_dim,
+        self._client = NanoVectorDB(
+            self.embedding_func.embedding_dim, storage_file=self._client_file_name
         )
 
     async def upsert(self, data: dict[str, dict]):
         logger.info(f"Inserting {len(data)} vectors to {self.namespace}")
         list_data = [
             {
-                "id": k,
+                "__id__": k,
                 **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
             }
             for k, v in data.items()
@@ -107,23 +93,23 @@ async def upsert(self, data: dict[str, dict]):
         )
         embeddings = np.concatenate(embeddings_list)
         for i, d in enumerate(list_data):
-            d["vector"] = embeddings[i]
-        results = self._client.upsert(collection_name=self.namespace, data=list_data)
+            d["__vector__"] = embeddings[i]
+        results = self._client.upsert(datas=list_data)
         return results
 
-    async def query(self, query, top_k=5):
+    async def query(self, query: str, top_k=5):
         embedding = await self.embedding_func([query])
-        results = self._client.search(
-            collection_name=self.namespace,
-            data=embedding,
-            limit=top_k,
-            output_fields=list(self.meta_fields),
-            search_params={"metric_type": "COSINE", "params": {"radius": 0.2}},
+        embedding = embedding[0]
+        results = self._client.query(
+            query=embedding, top_k=top_k, better_than_threshold=0.2
         )
-        return [
-            {**dp["entity"], "id": dp["id"], "distance": dp["distance"]}
-            for dp in results[0]
+        results = [
+            {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
         ]
+        return results
+
+    async def index_done_callback(self):
+        self._client.save()
 
 
 @dataclass
diff --git a/nano_graphrag/graphrag.py b/nano_graphrag/graphrag.py
@@ -14,7 +14,11 @@
     local_query,
     global_query,
 )
-from ._storage import JsonKVStorage, MilvusLiteStorge, NetworkXStorage
+from ._storage import (
+    JsonKVStorage,
+    NanoVectorDBStorage,
+    NetworkXStorage,
+)
 from ._utils import EmbeddingFunc, compute_mdhash_id, limit_async_func_call, logger
 from .base import (
     BaseGraphStorage,
@@ -81,7 +85,7 @@ class GraphRAG:
 
     # storage
     key_string_value_json_storage_cls: Type[BaseKVStorage] = JsonKVStorage
-    vector_db_storage_cls: Type[BaseVectorStorage] = MilvusLiteStorge
+    vector_db_storage_cls: Type[BaseVectorStorage] = NanoVectorDBStorage
     graph_storage_cls: Type[BaseGraphStorage] = NetworkXStorage
     enable_llm_cache: bool = True
 
diff --git a/readme.md b/readme.md
@@ -122,22 +122,6 @@ Some important prompts:
 - `PROMPTS["global_reduce_rag_response"]` is the system prompt template of the global search generation.
 - `PROMPTS["fail_response"]` is the fallback response when nothing is related to the user query.
 
-### Storage
-
-You can replace all storage-related components to your own implementation, `nano-graphrag` mainly uses three kinds of storage:
-
-- `base.BaseKVStorage` for storing key-json pairs of data. 
-  - By default we use disk file storage as the backend. 
-  -  `GraphRAG(.., key_string_value_json_storage_cls=YOURS,...)`
-- `base.BaseVectorStorage` for indexing embeddings. 
-  - By default we use [`milvus-lite`](https://github.com/milvus-io/milvus-lite) as the backend.
-  - `GraphRAG(.., vector_db_storage_cls=YOURS,...)`
-- `base.BaseGraphStorage` for storing knowledge graph. 
-  - By default we use [`networkx`](https://github.com/networkx/networkx) as the backend.
-  - `GraphRAG(.., graph_storage_cls=YOURS,...)`
-
-You can refer to `nano_graphrag.base` to see detailed interfaces for each components.
-
 ### LLM
 
 In `nano-graphrag`, we requires two types of LLM, a great one and a cheap one. The former is used to plan and respond, the latter is used to summary. By default, the great one is `gpt-4o` and the cheap one is `gpt-4o-mini`
@@ -191,6 +175,28 @@ GraphRAG(embedding_func=your_embed_func, embedding_batch_num=..., embedding_func
 
 You can refer to an [example](./examples/using_local_embedding_model.py) that use `sentence-transformer` to locally compute embeddings.
 
+### Storage
+
+You can replace all storage-related components with your own implementation. `nano-graphrag` mainly uses three kinds of storage:
+
+**`base.BaseKVStorage` for storing key-json pairs of data** 
+
+- By default we use disk file storage as the backend. 
+- `GraphRAG(.., key_string_value_json_storage_cls=YOURS,...)`
+
+**`base.BaseVectorStorage` for indexing embeddings**
+
+- By default we use [`nano-vectordb`](https://github.com/gusye1234/nano-vectordb) as the backend.
+- Check out this [example](./examples/using_milvus_as_vectorDB.py) that uses [`milvus-lite`](https://github.com/milvus-io/milvus-lite) as the backend (not available on Windows).
+- `GraphRAG(.., vector_db_storage_cls=YOURS,...)`
+
+**`base.BaseGraphStorage` for storing knowledge graph**
+
+- By default we use [`networkx`](https://github.com/networkx/networkx) as the backend.
+- `GraphRAG(.., graph_storage_cls=YOURS,...)`
+
+You can refer to `nano_graphrag.base` to see detailed interfaces for each component.
+
 
 
 ## Benchmark
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 openai
 tiktoken
-pymilvus
 networkx
-graspologic
+graspologic
+nano-vectordb