From a00f40797aa26123ccc9d09fe84177e98d85ae37 Mon Sep 17 00:00:00 2001 From: dhruv-anand-aintech Date: Wed, 15 May 2024 16:54:22 +0530 Subject: [PATCH 1/6] boilerplate code for connect to db --- README.md | 9 +- src/vdf_io/export_vdf/pgvector_export.py | 157 ++++++++++++++++ src/vdf_io/import_vdf/pgvector_import.py | 189 ++++++++++++++++++++ src/vdf_io/import_vdf/pgvector_util.py | 84 +++++++++ src/vdf_io/names.py | 1 + src/vdf_io/notebooks/jsonl_to_parquet.ipynb | 158 +++++++++++----- 6 files changed, 551 insertions(+), 47 deletions(-) create mode 100644 src/vdf_io/export_vdf/pgvector_export.py create mode 100644 src/vdf_io/import_vdf/pgvector_import.py create mode 100644 src/vdf_io/import_vdf/pgvector_util.py diff --git a/README.md b/README.md index 5d67d4a..e0f30be 100644 --- a/README.md +++ b/README.md @@ -60,13 +60,12 @@ See the [Contributing](#contributing) section to add support for your favorite v | Vector Database | Import | Export | |--------------------------------|--------|--------| +| pgvector | ❌ | ❌ | | Azure AI Search | ❌ | ❌ | | Weaviate | ❌ | ❌ | | MongoDB Atlas | ❌ | ❌ | -| OpenSearch | ❌ | ❌ | | Apache Cassandra | ❌ | ❌ | | txtai | ❌ | ❌ | -| pgvector | ❌ | ❌ | | SQLite-VSS | ❌ | ❌ | @@ -78,8 +77,12 @@ See the [Contributing](#contributing) section to add support for your favorite v | Vector Database | Import | Export | |--------------------------------|--------|--------| | Vespa | ❌ | ❌ | +| AWS Neptune | ❌ | ❌ | +| Neo4j | ❌ | ❌ | | Marqo | ❌ | ❌ | +| OpenSearch | ❌ | ❌ | | Elasticsearch | ❌ | ❌ | +| Apache Solr | ❌ | ❌ | | Redis Search | ❌ | ❌ | | ClickHouse | ❌ | ❌ | | USearch | ❌ | ❌ | @@ -90,13 +93,11 @@ See the [Contributing](#contributing) section to add support for your favorite v | CrateDB | ❌ | ❌ | | Meilisearch | ❌ | ❌ | | MyScale | ❌ | ❌ | -| Neo4j | ❌ | ❌ | | Nuclia DB | ❌ | ❌ | | OramaSearch | ❌ | ❌ | | Typesense | ❌ | ❌ | | Anari AI | ❌ | ❌ | | Vald | ❌ | ❌ | -| Apache Solr | ❌ | ❌ | ## Installation diff --git a/src/vdf_io/export_vdf/pgvector_export.py b/src/vdf_io/export_vdf/pgvector_export.py new file mode 100644 index 0000000..cc2deea --- /dev/null +++ b/src/vdf_io/export_vdf/pgvector_export.py @@ -0,0 +1,157 @@ +import json +import os +from typing import Dict, List +import pandas as pd +import pyarrow +from tqdm import tqdm + +import psycopg2 + +from vdf_io.import_vdf.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt +from vdf_io.meta_types import NamespaceMeta +from vdf_io.names import DBNames +from vdf_io.util import set_arg_from_input, set_arg_from_password +from vdf_io.export_vdf.vdb_export_cls import ExportVDB + + +class ExportPGVector(ExportVDB): + DB_NAME_SLUG = DBNames.PGVECTOR + + @classmethod + def make_parser(cls, subparsers): + parser_pgvector = make_pgv_parser(cls.DB_NAME_SLUG, subparsers) + parser_pgvector.add_argument( + "--tables", type=str, help="PGVector tables to export (comma-separated)" + ) + parser_pgvector.add_argument( + "--batch_size", + type=int, + help="Batch size for exporting data", + default=10_000, + ) + + @classmethod + def export_vdb(cls, args): + set_pgv_args_from_prompt(args) + pgvector_export = ExportPGVector(args) + pgvector_export.get_all_table_names() + pgvector_export.get_all_schemas() + set_arg_from_input( + args, + "schema", + "Enter the name of the schema of the Postgres instance (default: public): ", + str, + choices=pgvector_export.all_schemas, + ) + set_arg_from_input( + args, + "tables", + "Enter the name of tables to import (comma-separated, all will be imported by default): ", + str, + choices=pgvector_export.all_tables, + ) + pgvector_export.get_data() + return pgvector_export + + def __init__(self, args): + super().__init__(args) + if args.get("connection_string"): + self.conn = psycopg2.connect(args["connection_string"]) + else: + self.conn = psycopg2.connect( + user=args["user"], + password=args["password"], + host=args["host"] if args.get("host", "") != "" else "localhost", + port=args["port"] if args.get("port", "") != "" else "5432", + dbname=args["dbname"] if args.get("dbname", "") != "" else "postgres", + ) + + def get_all_schemas(self): + schemas = self.conn.execute( + "SELECT schema_name FROM information_schema.schemata" + ) + self.all_schemas = [schema[0] for schema in schemas] + return [schema[0] for schema in schemas] + + def get_all_table_names(self): + tables = self.conn.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema='public'" + ) + self.all_tables = [table[0] for table in tables] + return [table[0] for table in tables] + + def get_all_index_names(self): + return self.db.table_names() + + def get_index_names(self): + if self.args.get("tables", None) is not None: + return self.args["tables"].split(",") + return self.db.table_names() + + def get_data(self): + index_names = self.get_index_names() + BATCH_SIZE = self.args["batch_size"] + total = 0 + index_metas: Dict[str, List[NamespaceMeta]] = {} + for index_name in index_names: + namespace_metas = [] + vectors_directory = self.create_vec_dir(index_name) + table = self.db.open_table(index_name) + offset = 0 + remainder_df = None + j = 0 + for batch in tqdm(table.to_lance().to_batches()): + df = batch.to_pandas() + if remainder_df is not None: + df = pd.concat([remainder_df, df]) + while len(df) >= BATCH_SIZE: + # TODO: use save_vectors_to_parquet + df[:BATCH_SIZE].to_parquet( + os.path.join(vectors_directory, f"{j}.parquet") + ) + j += 1 + total += BATCH_SIZE + df = df[BATCH_SIZE:] + offset += BATCH_SIZE + remainder_df = df + if remainder_df is not None and len(remainder_df) > 0: + # TODO: use save_vectors_to_parquet + remainder_df.to_parquet(os.path.join(vectors_directory, f"{j}.parquet")) + total += len(remainder_df) + dim = -1 + for name in table.schema.names: + if pyarrow.types.is_fixed_size_list(table.schema.field(name).type): + dim = table.schema.field(name).type.list_size + vector_columns = [ + name + for name in table.schema.names + if pyarrow.types.is_fixed_size_list(table.schema.field(name).type) + ] + distance = "Cosine" + try: + for index in table.list_indices(): + if index.vector_column_name == vector_columns[0]: + distance = vector_columns[0] + except Exception: + pass + + namespace_metas = [ + self.get_namespace_meta( + index_name, + vectors_directory, + total=total, + num_vectors_exported=total, + dim=dim, + vector_columns=vector_columns, + distance=distance, + ) + ] + index_metas[index_name] = namespace_metas + self.file_structure.append(os.path.join(self.vdf_directory, "VDF_META.json")) + internal_metadata = self.get_basic_vdf_meta(index_metas) + meta_text = json.dumps(internal_metadata.model_dump(), indent=4) + tqdm.write(meta_text) + with open(os.path.join(self.vdf_directory, "VDF_META.json"), "w") as json_file: + json_file.write(meta_text) + # print internal metadata properly + return True diff --git a/src/vdf_io/import_vdf/pgvector_import.py b/src/vdf_io/import_vdf/pgvector_import.py new file mode 100644 index 0000000..e5509f6 --- /dev/null +++ b/src/vdf_io/import_vdf/pgvector_import.py @@ -0,0 +1,189 @@ +from typing import Dict, List +from dotenv import load_dotenv +import pandas as pd +from tqdm import tqdm +import pyarrow.parquet as pq + +from pgvector.psycopg import register_vector +import psycopg2 + +from vdf_io.constants import DEFAULT_BATCH_SIZE, INT_MAX +from vdf_io.import_vdf.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt +from vdf_io.meta_types import NamespaceMeta +from vdf_io.names import DBNames +from vdf_io.util import ( + cleanup_df, + divide_into_batches, + set_arg_from_input, + set_arg_from_password, +) +from vdf_io.import_vdf.vdf_import_cls import ImportVDB + + +load_dotenv() + + +class ImportPGVector(ImportVDB): + DB_NAME_SLUG = DBNames.PGVECTOR + + @classmethod + def import_vdb(cls, args): + """ + Import data to PGVector + """ + set_pgv_args_from_prompt(args) + pgvector_import = ImportPGVector(args) + pgvector_import.get_all_table_names() + pgvector_import.get_all_schemas() + set_arg_from_input( + args, + "schema", + "Enter the name of the schema of the Postgres instance (default: public): ", + str, + choices=pgvector_import.all_schemas, + ) + set_arg_from_input( + args, + "tables", + "Enter the name of tables to import (comma-separated, all will be imported by default): ", + str, + choices=pgvector_import.all_tables, + ) + pgvector_import.upsert_data() + return pgvector_import + + @classmethod + def make_parser(cls, subparsers): + parser_pgvector = make_pgv_parser(cls.DB_NAME_SLUG, subparsers) + parser_pgvector.add_argument( + "--tables", type=str, help="Postgres tables to export (comma-separated)" + ) + + def __init__(self, args): + # call super class constructor + super().__init__(args) + # use connection_string + if args.get("connection_string"): + self.conn = psycopg2.connect(args["connection_string"]) + else: + self.conn = psycopg2.connect( + user=args["user"], + password=args["password"], + host=args["host"] if args.get("host", "") != "" else "localhost", + port=args["port"] if args.get("port", "") != "" else "5432", + dbname=args["dbname"] if args.get("dbname", "") != "" else "postgres", + ) + + def get_all_schemas(self): + schemas = self.conn.execute( + "SELECT schema_name FROM information_schema.schemata" + ) + self.all_schemas = [schema[0] for schema in schemas] + return [schema[0] for schema in schemas] + + def get_all_table_names(self): + tables = self.conn.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema='public'" + ) + self.all_tables = [table[0] for table in tables] + return [table[0] for table in tables] + + def upsert_data(self): + register_vector(self.conn) + self.conn.execute("CREATE EXTENSION IF NOT EXISTS vector") + + max_hit = False + self.total_imported_count = 0 + indexes_content: Dict[str, List[NamespaceMeta]] = self.vdf_meta["indexes"] + index_names: List[str] = list(indexes_content.keys()) + if len(index_names) == 0: + raise ValueError("No indexes found in VDF_META.json") + tables = self.get_all_table_names() + # Load Parquet file + # print(indexes_content[index_names[0]]):List[NamespaceMeta] + for index_name, index_meta in tqdm( + indexes_content.items(), desc="Importing indexes" + ): + for namespace_meta in tqdm(index_meta, desc="Importing namespaces"): + self.set_dims(namespace_meta, index_name) + data_path = namespace_meta["data_path"] + final_data_path = self.get_final_data_path(data_path) + # Load the data from the parquet files + parquet_files = self.get_parquet_files(final_data_path) + + new_index_name = index_name + ( + f'_{namespace_meta["namespace"]}' + if namespace_meta["namespace"] + else "" + ) + new_index_name = self.create_new_name(new_index_name, tables) + if new_index_name not in tables: + table = self.conn.create_table( + new_index_name, schema=pq.read_schema(parquet_files[0]) + ) + tqdm.write(f"Created table {new_index_name}") + else: + table = self.conn.open_table(new_index_name) + tqdm.write(f"Opened table {new_index_name}") + + for file in tqdm(parquet_files, desc="Iterating parquet files"): + file_path = self.get_file_path(final_data_path, file) + df = self.read_parquet_progress( + file_path, + max_num_rows=( + (self.args.get("max_num_rows") or INT_MAX) + - self.total_imported_count + ), + ) + df = cleanup_df(df) + # if there are additional columns in the parquet file, add them to the table + for col in df.columns: + if col not in [field.name for field in table.schema]: + col_type = df[col].dtype + tqdm.write( + f"Adding column {col} of type {col_type} to {new_index_name}" + ) + table.add_columns( + { + col: get_default_value(col_type), + } + ) + # split in batches + BATCH_SIZE = self.args.get("batch_size") or DEFAULT_BATCH_SIZE + for batch in tqdm( + divide_into_batches(df, BATCH_SIZE), + desc="Importing batches", + total=len(df) // BATCH_SIZE, + ): + if self.total_imported_count + len(batch) >= ( + self.args.get("max_num_rows") or INT_MAX + ): + batch = batch[ + : (self.args.get("max_num_rows") or INT_MAX) + - self.total_imported_count + ] + max_hit = True + # convert df into list of dicts + table.add(batch) + self.total_imported_count += len(batch) + if max_hit: + break + tqdm.write(f"Imported {self.total_imported_count} rows") + tqdm.write(f"New table size: {table.count_rows()}") + if max_hit: + break + print("Data imported successfully") + + +def get_default_value(data_type): + # Define default values for common data types + default_values = { + "object": "", + "int64": 0, + "float64": 0.0, + "bool": False, + "datetime64[ns]": pd.Timestamp("NaT"), + "timedelta64[ns]": pd.Timedelta("NaT"), + } + # Return the default value for the specified data type, or None if not specified + return default_values.get(data_type.name, None) diff --git a/src/vdf_io/import_vdf/pgvector_util.py b/src/vdf_io/import_vdf/pgvector_util.py new file mode 100644 index 0000000..90f59f4 --- /dev/null +++ b/src/vdf_io/import_vdf/pgvector_util.py @@ -0,0 +1,84 @@ +from vdf_io.util import set_arg_from_input, set_arg_from_password + + +def make_pgv_parser(DB_NAME_SLUG, subparsers): + parser_pgvector = subparsers.add_parser( + DB_NAME_SLUG, help="Import data to pgvector" + ) + parser_pgvector.add_argument( + "--platform", + type=str, + choices=["supabase", "neon", "tembo", "any"], + default="any", + help="Platform to connect to", + ) + parser_pgvector.add_argument( + "--schema", + type=str, + help="Schema of the Postgres instance (default: public)", + default="public", + ) + parser_pgvector.add_argument( + "--connection_string", + type=str, + help="Connection string to Postgres instance", + ) + parser_pgvector.add_argument("--user", type=str, help="User of Postgres instance") + parser_pgvector.add_argument( + "--password", type=str, help="Password of Postgres instance" + ) + parser_pgvector.add_argument("--host", type=str, help="Host of Postgres instance") + parser_pgvector.add_argument("--port", type=str, help="Port of Postgres instance") + parser_pgvector.add_argument( + "--dbname", type=str, help="Database name of Postgres instance" + ) + parser_pgvector.add_argument( + "--tables", type=str, help="Postgres tables to export (comma-separated)" + ) + return parser_pgvector + + +def set_pgv_args_from_prompt(args): + set_arg_from_input( + args, + "connection_string", + "Enter the connection string to Postgres instance: ", + str, + ) + if not args.get("connection_string"): + set_arg_from_input( + args, + "user", + "Enter the user of Postgres instance (default: postgres): ", + str, + default_value="postgres", + ) + set_arg_from_password( + args, + "password", + "Enter the password of Postgres instance (default: postgres): ", + ) + if not args.get("password"): + # If password is not provided, set it to "postgres" + args["password"] = "postgres" + set_arg_from_input( + args, + "host", + "Enter the host of Postgres instance: ", + str, + default_value="localhost", + ) + set_arg_from_input( + args, + "port", + "Enter the port of Postgres instance: ", + str, + default_value="5432", + ) + set_arg_from_input( + args, + "dbname", + "Enter the database name of Postgres instance: ", + str, + default_value="postgres", + ) diff --git a/src/vdf_io/names.py b/src/vdf_io/names.py index 6697e05..e0e33f8 100644 --- a/src/vdf_io/names.py +++ b/src/vdf_io/names.py @@ -12,3 +12,4 @@ class DBNames: ASTRADB = "astradb" AZUREAI = "azureai" TURBOPUFFER = "turbopuffer" + PGVECTOR = "pgvector" diff --git a/src/vdf_io/notebooks/jsonl_to_parquet.ipynb b/src/vdf_io/notebooks/jsonl_to_parquet.ipynb index e9b952f..b4a3e9e 100644 --- a/src/vdf_io/notebooks/jsonl_to_parquet.ipynb +++ b/src/vdf_io/notebooks/jsonl_to_parquet.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -30,12 +30,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load JSONL File\n", - "jsonl_file = '/Users/dhruvanand/Code/datasets-dumps/shard-00000.jsonl 2'\n" + "jsonl_file = '/Users/dhruvanand/Downloads/579aac087579b5acbc881164eb7af8b8-662d020154513ab1ef571e325a8fc649ebf64561/o200k_tokens.jsonl'\n" ] }, { @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -58,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -82,70 +82,75 @@ " \n", " \n", " \n", - " query\n", - " document\n", - " metadata\n", + " token\n", + " token_id\n", + " type\n", + " language\n", + " definition\n", " \n", " \n", " \n", " \n", " 0\n", - " What animals prey on meerkats?\n", - " What animals prey of the meerkats?\n", - " {'topic': 'Animals', 'objective': {'self': [],...\n", + " ا\n", + " 496\n", + " subword\n", + " Arabic\n", + " This is a letter in the Arabic alphabet, repre...\n", " \n", " \n", " 1\n", - " What is the most pointless question you have s...\n", - " What is the most idiotic, or pointless, questi...\n", - " {'topic': 'Quora (2)', 'objective': {'self': [...\n", + " ال\n", + " 589\n", + " subword\n", + " Arabic\n", + " The (definite article)\n", " \n", " \n", " 2\n", - " Can we apply online for net banking in sbi?\n", - " Can I apply for internet banking in SBI withou...\n", - " {'topic': 'Personal finance', 'objective': {'s...\n", + " п\n", + " 648\n", + " subword\n", + " Russian\n", + " This is the Cyrillic letter 'п', which is the ...\n", " \n", " \n", " 3\n", - " What are the best SEO strategies for 2017?\n", - " What is the best SEO strategies in 2017?\n", - " {'topic': 'SEO', 'objective': {'self': [], 'pa...\n", + " с\n", + " 669\n", + " subword\n", + " Russian\n", + " The Cyrillic letter 'с', which represents the ...\n", " \n", " \n", " 4\n", - " What is particle spin?\n", - " What is spin? What are the causes for a partic...\n", - " {'topic': 'Chromatography', 'objective': {'sel...\n", + " ра\n", + " 714\n", + " Subword\n", + " Russian\n", + " This is a Cyrillic character that is part of a...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " query \\\n", - "0 What animals prey on meerkats? \n", - "1 What is the most pointless question you have s... \n", - "2 Can we apply online for net banking in sbi? \n", - "3 What are the best SEO strategies for 2017? \n", - "4 What is particle spin? \n", - "\n", - " document \\\n", - "0 What animals prey of the meerkats? \n", - "1 What is the most idiotic, or pointless, questi... \n", - "2 Can I apply for internet banking in SBI withou... \n", - "3 What is the best SEO strategies in 2017? \n", - "4 What is spin? What are the causes for a partic... \n", + " token token_id type language \\\n", + "0 ا 496 subword Arabic \n", + "1 ال 589 subword Arabic \n", + "2 п 648 subword Russian \n", + "3 с 669 subword Russian \n", + "4 ра 714 Subword Russian \n", "\n", - " metadata \n", - "0 {'topic': 'Animals', 'objective': {'self': [],... \n", - "1 {'topic': 'Quora (2)', 'objective': {'self': [... \n", - "2 {'topic': 'Personal finance', 'objective': {'s... \n", - "3 {'topic': 'SEO', 'objective': {'self': [], 'pa... \n", - "4 {'topic': 'Chromatography', 'objective': {'sel... " + " definition \n", + "0 This is a letter in the Arabic alphabet, repre... \n", + "1 The (definite article) \n", + "2 This is the Cyrillic letter 'п', which is the ... \n", + "3 The Cyrillic letter 'с', which represents the ... \n", + "4 This is a Cyrillic character that is part of a... " ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -154,6 +159,73 @@ "df.head()" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "64465" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 64465 entries, 0 to 64464\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 token 64465 non-null object\n", + " 1 token_id 64465 non-null int64 \n", + " 2 type 64420 non-null object\n", + " 3 language 64420 non-null object\n", + " 4 definition 64416 non-null object\n", + "dtypes: int64(1), object(4)\n", + "memory usage: 3.0+ MB\n" + ] + } + ], + "source": [ + "# summary of df\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df['token_length'] = df['token'].apply(lambda x: len(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_parquet('/Users/dhruvanand/Downloads/579aac087579b5acbc881164eb7af8b8-662d020154513ab1ef571e325a8fc649ebf64561/'+'o200k_tokens.parquet', engine='pyarrow')" + ] + }, { "cell_type": "code", "execution_count": 12, From 2069c1ac50c71c970e7fd518bd7ff57a8799e544 Mon Sep 17 00:00:00 2001 From: dhruv-anand-aintech Date: Wed, 15 May 2024 17:28:51 +0530 Subject: [PATCH 2/6] fix syntax, move file --- src/vdf_io/export_vdf/pgvector_export.py | 4 ++-- src/vdf_io/import_vdf/pgvector_import.py | 3 +-- src/vdf_io/{import_vdf => }/pgvector_util.py | 0 3 files changed, 3 insertions(+), 4 deletions(-) rename src/vdf_io/{import_vdf => }/pgvector_util.py (100%) diff --git a/src/vdf_io/export_vdf/pgvector_export.py b/src/vdf_io/export_vdf/pgvector_export.py index cc2deea..c85b86e 100644 --- a/src/vdf_io/export_vdf/pgvector_export.py +++ b/src/vdf_io/export_vdf/pgvector_export.py @@ -7,10 +7,10 @@ import psycopg2 -from vdf_io.import_vdf.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt +from vdf_io.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt from vdf_io.meta_types import NamespaceMeta from vdf_io.names import DBNames -from vdf_io.util import set_arg_from_input, set_arg_from_password +from vdf_io.util import set_arg_from_input from vdf_io.export_vdf.vdb_export_cls import ExportVDB diff --git a/src/vdf_io/import_vdf/pgvector_import.py b/src/vdf_io/import_vdf/pgvector_import.py index e5509f6..078a65c 100644 --- a/src/vdf_io/import_vdf/pgvector_import.py +++ b/src/vdf_io/import_vdf/pgvector_import.py @@ -8,14 +8,13 @@ import psycopg2 from vdf_io.constants import DEFAULT_BATCH_SIZE, INT_MAX -from vdf_io.import_vdf.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt +from vdf_io.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt from vdf_io.meta_types import NamespaceMeta from vdf_io.names import DBNames from vdf_io.util import ( cleanup_df, divide_into_batches, set_arg_from_input, - set_arg_from_password, ) from vdf_io.import_vdf.vdf_import_cls import ImportVDB diff --git a/src/vdf_io/import_vdf/pgvector_util.py b/src/vdf_io/pgvector_util.py similarity index 100% rename from src/vdf_io/import_vdf/pgvector_util.py rename to src/vdf_io/pgvector_util.py From 91cdf0e8883c709e55f46e01743457528c07fc43 Mon Sep 17 00:00:00 2001 From: dhruv-anand-aintech Date: Tue, 21 May 2024 16:54:57 +0530 Subject: [PATCH 3/6] adding a bunch of import logic quickstart notebook --- requirements.txt | 4 +- src/vdf_io/export_vdf/pgvector_export.py | 93 ++-- src/vdf_io/import_vdf/pgvector_import.py | 104 ++-- src/vdf_io/notebooks/neon.ipynb | 662 +++++++++++++++++++++++ src/vdf_io/pgvector_util.py | 23 +- src/vdf_io/util.py | 15 +- 6 files changed, 805 insertions(+), 96 deletions(-) create mode 100644 src/vdf_io/notebooks/neon.ipynb diff --git a/requirements.txt b/requirements.txt index eb64017..903b181 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,4 +33,6 @@ datasets~=2.16,>=2.19.0 mlx_embedding_models azure-search-documents azure-identity -turbopuffer[fast] \ No newline at end of file +turbopuffer[fast] +pgvector +psycopg \ No newline at end of file diff --git a/src/vdf_io/export_vdf/pgvector_export.py b/src/vdf_io/export_vdf/pgvector_export.py index c85b86e..8a72b02 100644 --- a/src/vdf_io/export_vdf/pgvector_export.py +++ b/src/vdf_io/export_vdf/pgvector_export.py @@ -20,15 +20,15 @@ class ExportPGVector(ExportVDB): @classmethod def make_parser(cls, subparsers): parser_pgvector = make_pgv_parser(cls.DB_NAME_SLUG, subparsers) - parser_pgvector.add_argument( - "--tables", type=str, help="PGVector tables to export (comma-separated)" - ) parser_pgvector.add_argument( "--batch_size", type=int, help="Batch size for exporting data", default=10_000, ) + parser_pgvector.add_argument( + "--tables", type=str, help="Postgres tables to export (comma-separated)" + ) @classmethod def export_vdb(cls, args): @@ -65,28 +65,33 @@ def __init__(self, args): port=args["port"] if args.get("port", "") != "" else "5432", dbname=args["dbname"] if args.get("dbname", "") != "" else "postgres", ) + self.cur = self.conn.cursor() def get_all_schemas(self): - schemas = self.conn.execute( + schemas = self.cur.execute( "SELECT schema_name FROM information_schema.schemata" ) self.all_schemas = [schema[0] for schema in schemas] return [schema[0] for schema in schemas] def get_all_table_names(self): - tables = self.conn.execute( + self.schema_name = self.args.get("schema") or "public" + tables = self.cur.execute( "SELECT table_name FROM information_schema.tables WHERE table_schema='public'" ) self.all_tables = [table[0] for table in tables] return [table[0] for table in tables] def get_all_index_names(self): - return self.db.table_names() + # get all tables in the schema + return self.cur.execute( + f"SELECT table_name FROM information_schema.tables WHERE table_schema='{self.schema_name}'" + ) def get_index_names(self): if self.args.get("tables", None) is not None: return self.args["tables"].split(",") - return self.db.table_names() + return self.get_all_index_names() def get_data(self): index_names = self.get_index_names() @@ -100,40 +105,40 @@ def get_data(self): offset = 0 remainder_df = None j = 0 - for batch in tqdm(table.to_lance().to_batches()): - df = batch.to_pandas() - if remainder_df is not None: - df = pd.concat([remainder_df, df]) - while len(df) >= BATCH_SIZE: - # TODO: use save_vectors_to_parquet - df[:BATCH_SIZE].to_parquet( - os.path.join(vectors_directory, f"{j}.parquet") - ) - j += 1 - total += BATCH_SIZE - df = df[BATCH_SIZE:] - offset += BATCH_SIZE - remainder_df = df - if remainder_df is not None and len(remainder_df) > 0: - # TODO: use save_vectors_to_parquet - remainder_df.to_parquet(os.path.join(vectors_directory, f"{j}.parquet")) - total += len(remainder_df) - dim = -1 - for name in table.schema.names: - if pyarrow.types.is_fixed_size_list(table.schema.field(name).type): - dim = table.schema.field(name).type.list_size - vector_columns = [ - name - for name in table.schema.names - if pyarrow.types.is_fixed_size_list(table.schema.field(name).type) - ] - distance = "Cosine" - try: - for index in table.list_indices(): - if index.vector_column_name == vector_columns[0]: - distance = vector_columns[0] - except Exception: - pass + # for batch in tqdm(table.to_lance().to_batches()): + # df = batch.to_pandas() + # if remainder_df is not None: + # df = pd.concat([remainder_df, df]) + # while len(df) >= BATCH_SIZE: + # # TODO: use save_vectors_to_parquet + # df[:BATCH_SIZE].to_parquet( + # os.path.join(vectors_directory, f"{j}.parquet") + # ) + # j += 1 + # total += BATCH_SIZE + # df = df[BATCH_SIZE:] + # offset += BATCH_SIZE + # remainder_df = df + # if remainder_df is not None and len(remainder_df) > 0: + # # TODO: use save_vectors_to_parquet + # remainder_df.to_parquet(os.path.join(vectors_directory, f"{j}.parquet")) + # total += len(remainder_df) + # dim = -1 + # for name in table.schema.names: + # if pyarrow.types.is_fixed_size_list(table.schema.field(name).type): + # dim = table.schema.field(name).type.list_size + # vector_columns = [ + # name + # for name in table.schema.names + # if pyarrow.types.is_fixed_size_list(table.schema.field(name).type) + # ] + # distance = "Cosine" + # try: + # for index in table.list_indices(): + # if index.vector_column_name == vector_columns[0]: + # distance = vector_columns[0] + # except Exception: + # pass namespace_metas = [ self.get_namespace_meta( @@ -141,9 +146,9 @@ def get_data(self): vectors_directory, total=total, num_vectors_exported=total, - dim=dim, - vector_columns=vector_columns, - distance=distance, + # dim=dim, + # vector_columns=vector_columns, + # distance=distance, ) ] index_metas[index_name] = namespace_metas diff --git a/src/vdf_io/import_vdf/pgvector_import.py b/src/vdf_io/import_vdf/pgvector_import.py index 078a65c..de240f3 100644 --- a/src/vdf_io/import_vdf/pgvector_import.py +++ b/src/vdf_io/import_vdf/pgvector_import.py @@ -41,22 +41,12 @@ def import_vdb(cls, args): str, choices=pgvector_import.all_schemas, ) - set_arg_from_input( - args, - "tables", - "Enter the name of tables to import (comma-separated, all will be imported by default): ", - str, - choices=pgvector_import.all_tables, - ) pgvector_import.upsert_data() return pgvector_import @classmethod def make_parser(cls, subparsers): - parser_pgvector = make_pgv_parser(cls.DB_NAME_SLUG, subparsers) - parser_pgvector.add_argument( - "--tables", type=str, help="Postgres tables to export (comma-separated)" - ) + _parser_pgvector = make_pgv_parser(cls.DB_NAME_SLUG, subparsers) def __init__(self, args): # call super class constructor @@ -74,22 +64,37 @@ def __init__(self, args): ) def get_all_schemas(self): - schemas = self.conn.execute( - "SELECT schema_name FROM information_schema.schemata" + with self.conn.cursor() as cur: + cur.execute("SELECT schema_name FROM information_schema.schemata") + schemas_response = cur.fetchall() + self.all_schemas = ( + [schema[0] for schema in schemas_response] if schemas_response else [] ) - self.all_schemas = [schema[0] for schema in schemas] - return [schema[0] for schema in schemas] + return self.all_schemas def get_all_table_names(self): - tables = self.conn.execute( - "SELECT table_name FROM information_schema.tables WHERE table_schema='public'" + self.schema_name = self.args.get("schema") or "public" + with self.conn.cursor() as cur: + cur.execute( + f"SELECT table_name FROM information_schema.tables WHERE table_schema='{self.schema_name}'" + ) + tables_response = cur.fetchall() + tqdm.write(f"Tables in schema {self.schema_name}: {tables_response}") + self.all_tables = ( + [table[0] for table in tables_response] if tables_response else [] ) - self.all_tables = [table[0] for table in tables] - return [table[0] for table in tables] + return self.all_tables def upsert_data(self): + # create pgvector extension if not exists + with self.conn.cursor() as cur: + cur.execute("CREATE EXTENSION IF NOT EXISTS vector") + # register vector type register_vector(self.conn) - self.conn.execute("CREATE EXTENSION IF NOT EXISTS vector") + + # use the schema + with self.conn.cursor() as cur: + cur.execute(f"SET search_path TO {self.schema_name}") max_hit = False self.total_imported_count = 0 @@ -97,7 +102,7 @@ def upsert_data(self): index_names: List[str] = list(indexes_content.keys()) if len(index_names) == 0: raise ValueError("No indexes found in VDF_META.json") - tables = self.get_all_table_names() + self.tables = self.get_all_table_names() # Load Parquet file # print(indexes_content[index_names[0]]):List[NamespaceMeta] for index_name, index_meta in tqdm( @@ -115,16 +120,33 @@ def upsert_data(self): if namespace_meta["namespace"] else "" ) - new_index_name = self.create_new_name(new_index_name, tables) - if new_index_name not in tables: - table = self.conn.create_table( - new_index_name, schema=pq.read_schema(parquet_files[0]) - ) + new_index_name = self.create_new_name(new_index_name, self.tables) + if new_index_name not in self.tables: + # create postgres table + with self.conn.cursor() as cur: + cur.execute( + f"CREATE TABLE {new_index_name} (id SERIAL PRIMARY KEY)" + ) + # use parquet file's schema + schema = pq.read_schema(parquet_files[0]) + for field in schema: + col_type = field.type + col_name = field.name + with self.conn.cursor() as cur: + cur.execute( + f"ALTER TABLE {new_index_name} ADD COLUMN {col_name} {col_type}" + ) + # check if the column is a vector column + if col_name == namespace_meta["vector_column"]: + with self.conn.cursor() as cur: + cur.execute( + f"CREATE INDEX {new_index_name}_{col_name}_idx ON {new_index_name} USING vector ({col_name})" + ) tqdm.write(f"Created table {new_index_name}") + table_name = new_index_name else: - table = self.conn.open_table(new_index_name) - tqdm.write(f"Opened table {new_index_name}") - + # set table name + table_name = new_index_name for file in tqdm(parquet_files, desc="Iterating parquet files"): file_path = self.get_file_path(final_data_path, file) df = self.read_parquet_progress( @@ -136,17 +158,6 @@ def upsert_data(self): ) df = cleanup_df(df) # if there are additional columns in the parquet file, add them to the table - for col in df.columns: - if col not in [field.name for field in table.schema]: - col_type = df[col].dtype - tqdm.write( - f"Adding column {col} of type {col_type} to {new_index_name}" - ) - table.add_columns( - { - col: get_default_value(col_type), - } - ) # split in batches BATCH_SIZE = self.args.get("batch_size") or DEFAULT_BATCH_SIZE for batch in tqdm( @@ -163,12 +174,21 @@ def upsert_data(self): ] max_hit = True # convert df into list of dicts - table.add(batch) + with self.conn.cursor() as cur: + cur.execute( + f"""INSERT INTO {table_name + } ({', '.join(batch.columns) + }) VALUES {', '.join([str(tuple(row)) for row in batch.itertuples(index=False)]) + }""" + ) self.total_imported_count += len(batch) if max_hit: break tqdm.write(f"Imported {self.total_imported_count} rows") - tqdm.write(f"New table size: {table.count_rows()}") + with self.conn.cursor() as cur: + cur.execute(f"SELECT COUNT(*) FROM {table_name}") + new_table_size = cur.fetchone()[0] + tqdm.write(f"New table size: {new_table_size}") if max_hit: break print("Data imported successfully") diff --git a/src/vdf_io/notebooks/neon.ipynb b/src/vdf_io/notebooks/neon.ipynb new file mode 100644 index 0000000..1a9481d --- /dev/null +++ b/src/vdf_io/notebooks/neon.ipynb @@ -0,0 +1,662 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pgvector in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (0.2.5)\n", + "Collecting psycopg\n", + " Obtaining dependency information for psycopg from https://files.pythonhosted.org/packages/f2/cf/172701ea48987548c681e011681dda1d2605131f723e89225477fe7802e9/psycopg-3.1.19-py3-none-any.whl.metadata\n", + " Downloading psycopg-3.1.19-py3-none-any.whl.metadata (4.2 kB)\n", + "Requirement already satisfied: numpy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pgvector) (1.26.4)\n", + "Requirement already satisfied: typing-extensions>=4.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from psycopg) (4.9.0)\n", + "Downloading psycopg-3.1.19-py3-none-any.whl (179 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.4/179.4 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: psycopg\n", + "Successfully installed psycopg-3.1.19\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install pgvector psycopg" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import json\n", + "import os\n", + "from dotenv import load_dotenv, find_dotenv\n", + "from typing import List, Dict, Any\n", + "from rich import print as rprint\n", + "\n", + "load_dotenv(find_dotenv(), override=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "NEON_CONNECTION_STRING = os.environ.get(\"NEON_CONNECTION_STRING\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import psycopg\n", + "import pgvector\n", + "from pgvector.psycopg import register_vector\n", + "\n", + "conn = psycopg.connect(NEON_CONNECTION_STRING)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
['public', 'information_schema', 'pg_catalog', 'pg_toast']\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\u001b[32m'public'\u001b[0m, \u001b[32m'information_schema'\u001b[0m, \u001b[32m'pg_catalog'\u001b[0m, \u001b[32m'pg_toast'\u001b[0m\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# list schemas\n", + "with conn.cursor() as cur:\n", + " cur.execute(\"SELECT schema_name FROM information_schema.schemata;\")\n", + " schemas = cur.fetchall()\n", + " schemas = [schema[0] for schema in schemas]\n", + " rprint(schemas)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# list tables in public\n", + "with conn.cursor() as cur:\n", + " cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';\")\n", + " tables = cur.fetchall()\n", + " tables = [table[0] for table in tables]\n", + " rprint(tables)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
public\n",
+       "[]\n",
+       "
\n" + ], + "text/plain": [ + "public\n", + "\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
information_schema\n",
+       "[\n",
+       "    'role_column_grants',\n",
+       "    'information_schema_catalog_name',\n",
+       "    'column_domain_usage',\n",
+       "    'applicable_roles',\n",
+       "    'administrable_role_authorizations',\n",
+       "    'domain_constraints',\n",
+       "    'attributes',\n",
+       "    'column_privileges',\n",
+       "    'character_sets',\n",
+       "    'check_constraint_routine_usage',\n",
+       "    'check_constraints',\n",
+       "    'column_udt_usage',\n",
+       "    'collations',\n",
+       "    'collation_character_set_applicability',\n",
+       "    'key_column_usage',\n",
+       "    'column_column_usage',\n",
+       "    'columns',\n",
+       "    'domain_udt_usage',\n",
+       "    'constraint_column_usage',\n",
+       "    'constraint_table_usage',\n",
+       "    'domains',\n",
+       "    'referential_constraints',\n",
+       "    'enabled_roles',\n",
+       "    'parameters',\n",
+       "    'routine_column_usage',\n",
+       "    'routine_privileges',\n",
+       "    'role_routine_grants',\n",
+       "    'routine_routine_usage',\n",
+       "    'table_privileges',\n",
+       "    'routine_sequence_usage',\n",
+       "    'routine_table_usage',\n",
+       "    'udt_privileges',\n",
+       "    'routines',\n",
+       "    'role_table_grants',\n",
+       "    'schemata',\n",
+       "    'sequences',\n",
+       "    'sql_features',\n",
+       "    'tables',\n",
+       "    'sql_implementation_info',\n",
+       "    'sql_parts',\n",
+       "    'sql_sizing',\n",
+       "    'transforms',\n",
+       "    'table_constraints',\n",
+       "    'view_routine_usage',\n",
+       "    'role_udt_grants',\n",
+       "    'triggered_update_columns',\n",
+       "    'triggers',\n",
+       "    'user_defined_types',\n",
+       "    'usage_privileges',\n",
+       "    'role_usage_grants',\n",
+       "    'view_column_usage',\n",
+       "    'view_table_usage',\n",
+       "    'views',\n",
+       "    'data_type_privileges',\n",
+       "    '_pg_foreign_table_columns',\n",
+       "    'element_types',\n",
+       "    'column_options',\n",
+       "    '_pg_foreign_data_wrappers',\n",
+       "    'foreign_data_wrapper_options',\n",
+       "    'foreign_data_wrappers',\n",
+       "    '_pg_foreign_servers',\n",
+       "    'foreign_server_options',\n",
+       "    'foreign_servers',\n",
+       "    '_pg_foreign_tables',\n",
+       "    'foreign_table_options',\n",
+       "    'foreign_tables',\n",
+       "    '_pg_user_mappings',\n",
+       "    'user_mapping_options',\n",
+       "    'user_mappings'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "information_schema\n", + "\u001b[1m[\u001b[0m\n", + " \u001b[32m'role_column_grants'\u001b[0m,\n", + " \u001b[32m'information_schema_catalog_name'\u001b[0m,\n", + " \u001b[32m'column_domain_usage'\u001b[0m,\n", + " \u001b[32m'applicable_roles'\u001b[0m,\n", + " \u001b[32m'administrable_role_authorizations'\u001b[0m,\n", + " \u001b[32m'domain_constraints'\u001b[0m,\n", + " \u001b[32m'attributes'\u001b[0m,\n", + " \u001b[32m'column_privileges'\u001b[0m,\n", + " \u001b[32m'character_sets'\u001b[0m,\n", + " \u001b[32m'check_constraint_routine_usage'\u001b[0m,\n", + " \u001b[32m'check_constraints'\u001b[0m,\n", + " \u001b[32m'column_udt_usage'\u001b[0m,\n", + " \u001b[32m'collations'\u001b[0m,\n", + " \u001b[32m'collation_character_set_applicability'\u001b[0m,\n", + " \u001b[32m'key_column_usage'\u001b[0m,\n", + " \u001b[32m'column_column_usage'\u001b[0m,\n", + " \u001b[32m'columns'\u001b[0m,\n", + " \u001b[32m'domain_udt_usage'\u001b[0m,\n", + " \u001b[32m'constraint_column_usage'\u001b[0m,\n", + " \u001b[32m'constraint_table_usage'\u001b[0m,\n", + " \u001b[32m'domains'\u001b[0m,\n", + " \u001b[32m'referential_constraints'\u001b[0m,\n", + " \u001b[32m'enabled_roles'\u001b[0m,\n", + " \u001b[32m'parameters'\u001b[0m,\n", + " \u001b[32m'routine_column_usage'\u001b[0m,\n", + " \u001b[32m'routine_privileges'\u001b[0m,\n", + " \u001b[32m'role_routine_grants'\u001b[0m,\n", + " \u001b[32m'routine_routine_usage'\u001b[0m,\n", + " \u001b[32m'table_privileges'\u001b[0m,\n", + " \u001b[32m'routine_sequence_usage'\u001b[0m,\n", + " \u001b[32m'routine_table_usage'\u001b[0m,\n", + " \u001b[32m'udt_privileges'\u001b[0m,\n", + " \u001b[32m'routines'\u001b[0m,\n", + " \u001b[32m'role_table_grants'\u001b[0m,\n", + " \u001b[32m'schemata'\u001b[0m,\n", + " \u001b[32m'sequences'\u001b[0m,\n", + " \u001b[32m'sql_features'\u001b[0m,\n", + " \u001b[32m'tables'\u001b[0m,\n", + " \u001b[32m'sql_implementation_info'\u001b[0m,\n", + " \u001b[32m'sql_parts'\u001b[0m,\n", + " \u001b[32m'sql_sizing'\u001b[0m,\n", + " \u001b[32m'transforms'\u001b[0m,\n", + " \u001b[32m'table_constraints'\u001b[0m,\n", + " \u001b[32m'view_routine_usage'\u001b[0m,\n", + " \u001b[32m'role_udt_grants'\u001b[0m,\n", + " \u001b[32m'triggered_update_columns'\u001b[0m,\n", + " \u001b[32m'triggers'\u001b[0m,\n", + " \u001b[32m'user_defined_types'\u001b[0m,\n", + " \u001b[32m'usage_privileges'\u001b[0m,\n", + " \u001b[32m'role_usage_grants'\u001b[0m,\n", + " \u001b[32m'view_column_usage'\u001b[0m,\n", + " \u001b[32m'view_table_usage'\u001b[0m,\n", + " \u001b[32m'views'\u001b[0m,\n", + " \u001b[32m'data_type_privileges'\u001b[0m,\n", + " \u001b[32m'_pg_foreign_table_columns'\u001b[0m,\n", + " \u001b[32m'element_types'\u001b[0m,\n", + " \u001b[32m'column_options'\u001b[0m,\n", + " \u001b[32m'_pg_foreign_data_wrappers'\u001b[0m,\n", + " \u001b[32m'foreign_data_wrapper_options'\u001b[0m,\n", + " \u001b[32m'foreign_data_wrappers'\u001b[0m,\n", + " \u001b[32m'_pg_foreign_servers'\u001b[0m,\n", + " \u001b[32m'foreign_server_options'\u001b[0m,\n", + " \u001b[32m'foreign_servers'\u001b[0m,\n", + " \u001b[32m'_pg_foreign_tables'\u001b[0m,\n", + " \u001b[32m'foreign_table_options'\u001b[0m,\n", + " \u001b[32m'foreign_tables'\u001b[0m,\n", + " \u001b[32m'_pg_user_mappings'\u001b[0m,\n", + " \u001b[32m'user_mapping_options'\u001b[0m,\n", + " \u001b[32m'user_mappings'\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
pg_catalog\n",
+       "[\n",
+       "    'pg_statistic',\n",
+       "    'pg_type',\n",
+       "    'pg_foreign_table',\n",
+       "    'pg_authid',\n",
+       "    'pg_shadow',\n",
+       "    'pg_roles',\n",
+       "    'pg_statistic_ext_data',\n",
+       "    'pg_settings',\n",
+       "    'pg_file_settings',\n",
+       "    'pg_hba_file_rules',\n",
+       "    'pg_ident_file_mappings',\n",
+       "    'pg_config',\n",
+       "    'pg_shmem_allocations',\n",
+       "    'pg_backend_memory_contexts',\n",
+       "    'pg_user_mapping',\n",
+       "    'pg_stat_activity',\n",
+       "    'pg_replication_origin_status',\n",
+       "    'pg_subscription',\n",
+       "    'pg_attribute',\n",
+       "    'pg_proc',\n",
+       "    'pg_class',\n",
+       "    'pg_attrdef',\n",
+       "    'pg_constraint',\n",
+       "    'pg_inherits',\n",
+       "    'pg_index',\n",
+       "    'pg_stat_replication',\n",
+       "    'pg_stat_slru',\n",
+       "    'pg_stat_wal_receiver',\n",
+       "    'pg_stat_recovery_prefetch',\n",
+       "    'pg_operator',\n",
+       "    'pg_opfamily',\n",
+       "    'pg_opclass',\n",
+       "    'pg_am',\n",
+       "    'pg_amop',\n",
+       "    'pg_amproc',\n",
+       "    'pg_language',\n",
+       "    'pg_largeobject_metadata',\n",
+       "    'pg_aggregate',\n",
+       "    'pg_statistic_ext',\n",
+       "    'pg_rewrite',\n",
+       "    'pg_trigger',\n",
+       "    'pg_event_trigger',\n",
+       "    'pg_description',\n",
+       "    'pg_cast',\n",
+       "    'pg_enum',\n",
+       "    'pg_namespace',\n",
+       "    'pg_conversion',\n",
+       "    'pg_depend',\n",
+       "    'pg_database',\n",
+       "    'pg_db_role_setting',\n",
+       "    'pg_tablespace',\n",
+       "    'pg_auth_members',\n",
+       "    'pg_shdepend',\n",
+       "    'pg_shdescription',\n",
+       "    'pg_ts_config',\n",
+       "    'pg_ts_config_map',\n",
+       "    'pg_ts_dict',\n",
+       "    'pg_ts_parser',\n",
+       "    'pg_ts_template',\n",
+       "    'pg_extension',\n",
+       "    'pg_foreign_data_wrapper',\n",
+       "    'pg_foreign_server',\n",
+       "    'pg_policy',\n",
+       "    'pg_replication_origin',\n",
+       "    'pg_default_acl',\n",
+       "    'pg_init_privs',\n",
+       "    'pg_seclabel',\n",
+       "    'pg_shseclabel',\n",
+       "    'pg_collation',\n",
+       "    'pg_parameter_acl',\n",
+       "    'pg_partitioned_table',\n",
+       "    'pg_range',\n",
+       "    'pg_transform',\n",
+       "    'pg_sequence',\n",
+       "    'pg_publication',\n",
+       "    'pg_publication_namespace',\n",
+       "    'pg_publication_rel',\n",
+       "    'pg_subscription_rel',\n",
+       "    'pg_group',\n",
+       "    'pg_user',\n",
+       "    'pg_policies',\n",
+       "    'pg_rules',\n",
+       "    'pg_views',\n",
+       "    'pg_tables',\n",
+       "    'pg_matviews',\n",
+       "    'pg_indexes',\n",
+       "    'pg_sequences',\n",
+       "    'pg_stats',\n",
+       "    'pg_stats_ext',\n",
+       "    'pg_stats_ext_exprs',\n",
+       "    'pg_publication_tables',\n",
+       "    'pg_locks',\n",
+       "    'pg_cursors',\n",
+       "    'pg_available_extensions',\n",
+       "    'pg_available_extension_versions',\n",
+       "    'pg_prepared_xacts',\n",
+       "    'pg_prepared_statements',\n",
+       "    'pg_seclabels',\n",
+       "    'pg_timezone_abbrevs',\n",
+       "    'pg_timezone_names',\n",
+       "    'pg_stat_sys_tables',\n",
+       "    'pg_stat_xact_sys_tables',\n",
+       "    'pg_stat_user_tables',\n",
+       "    'pg_stat_all_tables',\n",
+       "    'pg_stat_xact_all_tables',\n",
+       "    'pg_stat_xact_user_tables',\n",
+       "    'pg_statio_all_tables',\n",
+       "    'pg_statio_sys_tables',\n",
+       "    'pg_statio_user_tables',\n",
+       "    'pg_stat_all_indexes',\n",
+       "    'pg_stat_sys_indexes',\n",
+       "    'pg_stat_user_indexes',\n",
+       "    'pg_statio_all_indexes',\n",
+       "    'pg_statio_sys_indexes',\n",
+       "    'pg_statio_user_indexes',\n",
+       "    'pg_statio_all_sequences',\n",
+       "    'pg_statio_sys_sequences',\n",
+       "    'pg_statio_user_sequences',\n",
+       "    'pg_stat_subscription',\n",
+       "    'pg_stat_ssl',\n",
+       "    'pg_stat_gssapi',\n",
+       "    'pg_replication_slots',\n",
+       "    'pg_stat_replication_slots',\n",
+       "    'pg_stat_database',\n",
+       "    'pg_stat_database_conflicts',\n",
+       "    'pg_stat_user_functions',\n",
+       "    'pg_stat_xact_user_functions',\n",
+       "    'pg_stat_archiver',\n",
+       "    'pg_stat_bgwriter',\n",
+       "    'pg_stat_wal',\n",
+       "    'pg_stat_progress_analyze',\n",
+       "    'pg_stat_progress_vacuum',\n",
+       "    'pg_stat_progress_cluster',\n",
+       "    'pg_stat_progress_create_index',\n",
+       "    'pg_stat_progress_basebackup',\n",
+       "    'pg_stat_progress_copy',\n",
+       "    'pg_user_mappings',\n",
+       "    'pg_stat_subscription_stats',\n",
+       "    'pg_largeobject'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "pg_catalog\n", + "\u001b[1m[\u001b[0m\n", + " \u001b[32m'pg_statistic'\u001b[0m,\n", + " \u001b[32m'pg_type'\u001b[0m,\n", + " \u001b[32m'pg_foreign_table'\u001b[0m,\n", + " \u001b[32m'pg_authid'\u001b[0m,\n", + " \u001b[32m'pg_shadow'\u001b[0m,\n", + " \u001b[32m'pg_roles'\u001b[0m,\n", + " \u001b[32m'pg_statistic_ext_data'\u001b[0m,\n", + " \u001b[32m'pg_settings'\u001b[0m,\n", + " \u001b[32m'pg_file_settings'\u001b[0m,\n", + " \u001b[32m'pg_hba_file_rules'\u001b[0m,\n", + " \u001b[32m'pg_ident_file_mappings'\u001b[0m,\n", + " \u001b[32m'pg_config'\u001b[0m,\n", + " \u001b[32m'pg_shmem_allocations'\u001b[0m,\n", + " \u001b[32m'pg_backend_memory_contexts'\u001b[0m,\n", + " \u001b[32m'pg_user_mapping'\u001b[0m,\n", + " \u001b[32m'pg_stat_activity'\u001b[0m,\n", + " \u001b[32m'pg_replication_origin_status'\u001b[0m,\n", + " \u001b[32m'pg_subscription'\u001b[0m,\n", + " \u001b[32m'pg_attribute'\u001b[0m,\n", + " \u001b[32m'pg_proc'\u001b[0m,\n", + " \u001b[32m'pg_class'\u001b[0m,\n", + " \u001b[32m'pg_attrdef'\u001b[0m,\n", + " \u001b[32m'pg_constraint'\u001b[0m,\n", + " \u001b[32m'pg_inherits'\u001b[0m,\n", + " \u001b[32m'pg_index'\u001b[0m,\n", + " \u001b[32m'pg_stat_replication'\u001b[0m,\n", + " \u001b[32m'pg_stat_slru'\u001b[0m,\n", + " \u001b[32m'pg_stat_wal_receiver'\u001b[0m,\n", + " \u001b[32m'pg_stat_recovery_prefetch'\u001b[0m,\n", + " \u001b[32m'pg_operator'\u001b[0m,\n", + " \u001b[32m'pg_opfamily'\u001b[0m,\n", + " \u001b[32m'pg_opclass'\u001b[0m,\n", + " \u001b[32m'pg_am'\u001b[0m,\n", + " \u001b[32m'pg_amop'\u001b[0m,\n", + " \u001b[32m'pg_amproc'\u001b[0m,\n", + " \u001b[32m'pg_language'\u001b[0m,\n", + " \u001b[32m'pg_largeobject_metadata'\u001b[0m,\n", + " \u001b[32m'pg_aggregate'\u001b[0m,\n", + " \u001b[32m'pg_statistic_ext'\u001b[0m,\n", + " \u001b[32m'pg_rewrite'\u001b[0m,\n", + " \u001b[32m'pg_trigger'\u001b[0m,\n", + " \u001b[32m'pg_event_trigger'\u001b[0m,\n", + " \u001b[32m'pg_description'\u001b[0m,\n", + " \u001b[32m'pg_cast'\u001b[0m,\n", + " \u001b[32m'pg_enum'\u001b[0m,\n", + " \u001b[32m'pg_namespace'\u001b[0m,\n", + " \u001b[32m'pg_conversion'\u001b[0m,\n", + " \u001b[32m'pg_depend'\u001b[0m,\n", + " \u001b[32m'pg_database'\u001b[0m,\n", + " \u001b[32m'pg_db_role_setting'\u001b[0m,\n", + " \u001b[32m'pg_tablespace'\u001b[0m,\n", + " \u001b[32m'pg_auth_members'\u001b[0m,\n", + " \u001b[32m'pg_shdepend'\u001b[0m,\n", + " \u001b[32m'pg_shdescription'\u001b[0m,\n", + " \u001b[32m'pg_ts_config'\u001b[0m,\n", + " \u001b[32m'pg_ts_config_map'\u001b[0m,\n", + " \u001b[32m'pg_ts_dict'\u001b[0m,\n", + " \u001b[32m'pg_ts_parser'\u001b[0m,\n", + " \u001b[32m'pg_ts_template'\u001b[0m,\n", + " \u001b[32m'pg_extension'\u001b[0m,\n", + " \u001b[32m'pg_foreign_data_wrapper'\u001b[0m,\n", + " \u001b[32m'pg_foreign_server'\u001b[0m,\n", + " \u001b[32m'pg_policy'\u001b[0m,\n", + " \u001b[32m'pg_replication_origin'\u001b[0m,\n", + " \u001b[32m'pg_default_acl'\u001b[0m,\n", + " \u001b[32m'pg_init_privs'\u001b[0m,\n", + " \u001b[32m'pg_seclabel'\u001b[0m,\n", + " \u001b[32m'pg_shseclabel'\u001b[0m,\n", + " \u001b[32m'pg_collation'\u001b[0m,\n", + " \u001b[32m'pg_parameter_acl'\u001b[0m,\n", + " \u001b[32m'pg_partitioned_table'\u001b[0m,\n", + " \u001b[32m'pg_range'\u001b[0m,\n", + " \u001b[32m'pg_transform'\u001b[0m,\n", + " \u001b[32m'pg_sequence'\u001b[0m,\n", + " \u001b[32m'pg_publication'\u001b[0m,\n", + " \u001b[32m'pg_publication_namespace'\u001b[0m,\n", + " \u001b[32m'pg_publication_rel'\u001b[0m,\n", + " \u001b[32m'pg_subscription_rel'\u001b[0m,\n", + " \u001b[32m'pg_group'\u001b[0m,\n", + " \u001b[32m'pg_user'\u001b[0m,\n", + " \u001b[32m'pg_policies'\u001b[0m,\n", + " \u001b[32m'pg_rules'\u001b[0m,\n", + " \u001b[32m'pg_views'\u001b[0m,\n", + " \u001b[32m'pg_tables'\u001b[0m,\n", + " \u001b[32m'pg_matviews'\u001b[0m,\n", + " \u001b[32m'pg_indexes'\u001b[0m,\n", + " \u001b[32m'pg_sequences'\u001b[0m,\n", + " \u001b[32m'pg_stats'\u001b[0m,\n", + " \u001b[32m'pg_stats_ext'\u001b[0m,\n", + " \u001b[32m'pg_stats_ext_exprs'\u001b[0m,\n", + " \u001b[32m'pg_publication_tables'\u001b[0m,\n", + " \u001b[32m'pg_locks'\u001b[0m,\n", + " \u001b[32m'pg_cursors'\u001b[0m,\n", + " \u001b[32m'pg_available_extensions'\u001b[0m,\n", + " \u001b[32m'pg_available_extension_versions'\u001b[0m,\n", + " \u001b[32m'pg_prepared_xacts'\u001b[0m,\n", + " \u001b[32m'pg_prepared_statements'\u001b[0m,\n", + " \u001b[32m'pg_seclabels'\u001b[0m,\n", + " \u001b[32m'pg_timezone_abbrevs'\u001b[0m,\n", + " \u001b[32m'pg_timezone_names'\u001b[0m,\n", + " \u001b[32m'pg_stat_sys_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_xact_sys_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_user_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_all_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_xact_all_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_xact_user_tables'\u001b[0m,\n", + " \u001b[32m'pg_statio_all_tables'\u001b[0m,\n", + " \u001b[32m'pg_statio_sys_tables'\u001b[0m,\n", + " \u001b[32m'pg_statio_user_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_all_indexes'\u001b[0m,\n", + " \u001b[32m'pg_stat_sys_indexes'\u001b[0m,\n", + " \u001b[32m'pg_stat_user_indexes'\u001b[0m,\n", + " \u001b[32m'pg_statio_all_indexes'\u001b[0m,\n", + " \u001b[32m'pg_statio_sys_indexes'\u001b[0m,\n", + " \u001b[32m'pg_statio_user_indexes'\u001b[0m,\n", + " \u001b[32m'pg_statio_all_sequences'\u001b[0m,\n", + " \u001b[32m'pg_statio_sys_sequences'\u001b[0m,\n", + " \u001b[32m'pg_statio_user_sequences'\u001b[0m,\n", + " \u001b[32m'pg_stat_subscription'\u001b[0m,\n", + " \u001b[32m'pg_stat_ssl'\u001b[0m,\n", + " \u001b[32m'pg_stat_gssapi'\u001b[0m,\n", + " \u001b[32m'pg_replication_slots'\u001b[0m,\n", + " \u001b[32m'pg_stat_replication_slots'\u001b[0m,\n", + " \u001b[32m'pg_stat_database'\u001b[0m,\n", + " \u001b[32m'pg_stat_database_conflicts'\u001b[0m,\n", + " \u001b[32m'pg_stat_user_functions'\u001b[0m,\n", + " \u001b[32m'pg_stat_xact_user_functions'\u001b[0m,\n", + " \u001b[32m'pg_stat_archiver'\u001b[0m,\n", + " \u001b[32m'pg_stat_bgwriter'\u001b[0m,\n", + " \u001b[32m'pg_stat_wal'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_analyze'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_vacuum'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_cluster'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_create_index'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_basebackup'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_copy'\u001b[0m,\n", + " \u001b[32m'pg_user_mappings'\u001b[0m,\n", + " \u001b[32m'pg_stat_subscription_stats'\u001b[0m,\n", + " \u001b[32m'pg_largeobject'\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
pg_toast\n",
+       "[]\n",
+       "
\n" + ], + "text/plain": [ + "pg_toast\n", + "\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# for each schema, list tables\n", + "for schema in schemas:\n", + " with conn.cursor() as cur:\n", + " cur.execute(\n", + " f\"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema}';\"\n", + " )\n", + " tables = cur.fetchall()\n", + " tables = [table[0] for table in tables]\n", + " rprint(schema, tables)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/vdf_io/pgvector_util.py b/src/vdf_io/pgvector_util.py index 90f59f4..adbe58b 100644 --- a/src/vdf_io/pgvector_util.py +++ b/src/vdf_io/pgvector_util.py @@ -8,8 +8,7 @@ def make_pgv_parser(DB_NAME_SLUG, subparsers): parser_pgvector.add_argument( "--platform", type=str, - choices=["supabase", "neon", "tembo", "any"], - default="any", + choices=["supabase", "neon", "tembo", "aws", "custom"], help="Platform to connect to", ) parser_pgvector.add_argument( @@ -32,18 +31,32 @@ def make_pgv_parser(DB_NAME_SLUG, subparsers): parser_pgvector.add_argument( "--dbname", type=str, help="Database name of Postgres instance" ) - parser_pgvector.add_argument( - "--tables", type=str, help="Postgres tables to export (comma-separated)" - ) return parser_pgvector def set_pgv_args_from_prompt(args): + set_arg_from_input( + args, + "platform", + "Enter the platform to connect to (default: custom): ", + str, + default_value="custom", + ) + env_var_name = "POSTGRES_CONNECTION_STRING" + if args["platform"] == "supabase": + env_var_name = "SUPABASE_CONNECTION_STRING" + elif args["platform"] == "neon": + env_var_name = "NEON_CONNECTION_STRING" + elif args["platform"] == "tembo": + env_var_name = "TEMBO_CONNECTION_STRING" + elif args["platform"] == "aws": + env_var_name = "AURORA_CONNECTION_STRING" set_arg_from_input( args, "connection_string", "Enter the connection string to Postgres instance: ", str, + env_var=env_var_name, ) if not args.get("connection_string"): set_arg_from_input( diff --git a/src/vdf_io/util.py b/src/vdf_io/util.py index 18554a8..d5dd038 100644 --- a/src/vdf_io/util.py +++ b/src/vdf_io/util.py @@ -92,20 +92,27 @@ def set_arg_from_input( """ Set the value of an argument from user input if it is not already present """ + read_from_env = False if ( (default_value is None) and (env_var is not None) and (os.getenv(env_var) is not None) ): default_value = os.getenv(env_var) + read_from_env = True if arg_name not in args or ( args[arg_name] is None and default_value != "DO_NOT_PROMPT" ): while True: - inp = input( - prompt - + (" " + str(list(choices)) + ": " if choices is not None else "") - ) + default_suffix = "" + choices_suffix = "" + if read_from_env: + default_suffix = f" (default: Read from env var {env_var})" + else: + default_suffix = f" (default: {default_value})" + if choices is not None: + choices_suffix = f" (choose from: {str(list(choices))})" + inp = input(f"{prompt}{choices_suffix}{default_suffix}: ") if len(inp) >= 2: if inp[0] == '"' and inp[-1] == '"': inp = inp[1:-1] From 9d4bef8a63483528ac43bfc9de13457c60a18ba7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 11:26:21 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/vdf_io/export_vdf/pgvector_export.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/vdf_io/export_vdf/pgvector_export.py b/src/vdf_io/export_vdf/pgvector_export.py index 8a72b02..8c760db 100644 --- a/src/vdf_io/export_vdf/pgvector_export.py +++ b/src/vdf_io/export_vdf/pgvector_export.py @@ -1,8 +1,6 @@ import json import os from typing import Dict, List -import pandas as pd -import pyarrow from tqdm import tqdm import psycopg2 From 9bf15372d188b7a1df45530070e9ec4622c4153d Mon Sep 17 00:00:00 2001 From: dhruv-anand-aintech Date: Fri, 7 Jun 2024 17:13:19 +0530 Subject: [PATCH 5/6] fix psycopg version try duckdb --- requirements.txt | 3 +- src/vdf_io/import_vdf/pgvector_import.py | 71 ++++-- src/vdf_io/import_vdf/vdf_import_cls.py | 9 +- src/vdf_io/notebooks/duckdb.ipynb | 296 +++++++++++++++++++++++ src/vdf_io/notebooks/neon.ipynb | 241 +++++++++++++++++- 5 files changed, 590 insertions(+), 30 deletions(-) create mode 100644 src/vdf_io/notebooks/duckdb.ipynb diff --git a/requirements.txt b/requirements.txt index 903b181..1b0cc09 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,4 +35,5 @@ azure-search-documents azure-identity turbopuffer[fast] pgvector -psycopg \ No newline at end of file +psycopg +duckdb \ No newline at end of file diff --git a/src/vdf_io/import_vdf/pgvector_import.py b/src/vdf_io/import_vdf/pgvector_import.py index de240f3..0d147eb 100644 --- a/src/vdf_io/import_vdf/pgvector_import.py +++ b/src/vdf_io/import_vdf/pgvector_import.py @@ -5,7 +5,7 @@ import pyarrow.parquet as pq from pgvector.psycopg import register_vector -import psycopg2 +import psycopg from vdf_io.constants import DEFAULT_BATCH_SIZE, INT_MAX from vdf_io.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt @@ -53,9 +53,9 @@ def __init__(self, args): super().__init__(args) # use connection_string if args.get("connection_string"): - self.conn = psycopg2.connect(args["connection_string"]) + self.conn = psycopg.connect(args["connection_string"]) else: - self.conn = psycopg2.connect( + self.conn = psycopg.connect( user=args["user"], password=args["password"], host=args["host"] if args.get("host", "") != "" else "localhost", @@ -104,7 +104,6 @@ def upsert_data(self): raise ValueError("No indexes found in VDF_META.json") self.tables = self.get_all_table_names() # Load Parquet file - # print(indexes_content[index_names[0]]):List[NamespaceMeta] for index_name, index_meta in tqdm( indexes_content.items(), desc="Importing indexes" ): @@ -122,26 +121,60 @@ def upsert_data(self): ) new_index_name = self.create_new_name(new_index_name, self.tables) if new_index_name not in self.tables: - # create postgres table - with self.conn.cursor() as cur: - cur.execute( - f"CREATE TABLE {new_index_name} (id SERIAL PRIMARY KEY)" - ) - # use parquet file's schema + # assemble schema using parquet file's schema schema = pq.read_schema(parquet_files[0]) + schema_dict = {} + if "model_map" not in namespace_meta: + namespace_meta["model_map"] = {} +""" +┌──────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐ +│ column_name │ column_type │ null │ key │ default │ extra │ +│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ +├──────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤ +│ id │ BIGINT │ YES │ │ │ │ +│ vector │ DOUBLE[] │ YES │ │ │ │ +│ claps │ BIGINT │ YES │ │ │ │ +│ title │ VARCHAR │ YES │ │ │ │ +│ responses │ BIGINT │ YES │ │ │ │ +│ reading_time │ BIGINT │ YES │ │ │ │ +│ publication │ VARCHAR │ YES │ │ │ │ +│ link │ VARCHAR │ YES │ │ │ │ +└──────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘ +id: int64 +vector: list + child 0, element: double +claps: int64 +title: string +responses: int64 +reading_time: int64 +publication: string +link: string +""" + parquet_to_sql_type_map = { + + "int64": "BIGINT", + "float64": "DOUBLE PRECISION", + "bool": "BOOLEAN", + "datetime64[ns]": "TIMESTAMP", + "timedelta64[ns]": "INTERVAL", + "object": "VARCHAR", + } for field in schema: col_type = field.type col_name = field.name - with self.conn.cursor() as cur: - cur.execute( - f"ALTER TABLE {new_index_name} ADD COLUMN {col_name} {col_type}" - ) + schema_dict[col_name] = col_type # check if the column is a vector column - if col_name == namespace_meta["vector_column"]: - with self.conn.cursor() as cur: - cur.execute( - f"CREATE INDEX {new_index_name}_{col_name}_idx ON {new_index_name} USING vector ({col_name})" - ) + if col_name in namespace_meta["model_map"]: + schema_dict[col_name] = "vector()" + # create schema string + schema_str = ", ".join( + [f"{col_name} {col_type}" for col_name, col_type in schema_dict.items()] + ) + # create postgres table + with self.conn.cursor() as cur: + cur.execute( + f"CREATE TABLE {new_index_name} (id SERIAL PRIMARY KEY)" + ) tqdm.write(f"Created table {new_index_name}") table_name = new_index_name else: diff --git a/src/vdf_io/import_vdf/vdf_import_cls.py b/src/vdf_io/import_vdf/vdf_import_cls.py index 31ef4d6..4c7f4d1 100644 --- a/src/vdf_io/import_vdf/vdf_import_cls.py +++ b/src/vdf_io/import_vdf/vdf_import_cls.py @@ -149,10 +149,10 @@ def get_file_path(self, final_data_path, parquet_file): return parquet_file return os.path.join(final_data_path, parquet_file) - def set_dims(self, namespace_meta, index_name): + def set_dims(self, namespace_meta, index_name, vector_column_name=None): if namespace_meta["dimensions"] == -1: tqdm.write(f"Resolving dimensions for index '{index_name}'") - dims = self.resolve_dims(namespace_meta, index_name) + dims = self.resolve_dims(namespace_meta, index_name, vector_column_name) if dims != -1: namespace_meta["dimensions"] = dims tqdm.write(f"Resolved dimensions: {dims}") @@ -162,10 +162,11 @@ def set_dims(self, namespace_meta, index_name): f"Failed to resolve dimensions for index '{index_name}'" ) - def resolve_dims(self, namespace_meta, index_name): + def resolve_dims(self, namespace_meta, index_name, vector_column_name = None): final_data_path = self.get_final_data_path(namespace_meta["data_path"]) parquet_files = self.get_parquet_files(final_data_path) - _, vector_column_name = self.get_vector_column_name(index_name, namespace_meta) + if vector_column_name is None: + _, vector_column_name = self.get_vector_column_name(index_name, namespace_meta) dims = -1 found = False for file in tqdm(parquet_files, desc="Iterating parquet files"): diff --git a/src/vdf_io/notebooks/duckdb.ipynb b/src/vdf_io/notebooks/duckdb.ipynb new file mode 100644 index 0000000..572333d --- /dev/null +++ b/src/vdf_io/notebooks/duckdb.ipynb @@ -0,0 +1,296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: numpy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 1)) (1.26.4)\n", + "Requirement already satisfied: pandas in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 2)) (2.1.4)\n", + "Requirement already satisfied: pinecone-client~=4.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 3)) (4.0.0)\n", + "Requirement already satisfied: pyarrow in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 4)) (16.1.0)\n", + "Requirement already satisfied: qdrant_client in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 5)) (1.7.1)\n", + "Requirement already satisfied: tqdm in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 6)) (4.66.4)\n", + "Requirement already satisfied: python-dotenv in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 7)) (1.0.0)\n", + "Requirement already satisfied: huggingface_hub in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 8)) (0.23.0)\n", + "Requirement already satisfied: pymilvus>=2.3.6 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 9)) (2.3.6)\n", + "Requirement already satisfied: openai>=1.10.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 10)) (1.12.0)\n", + "Requirement already satisfied: litellm>=1.20.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 11)) (1.34.20)\n", + "Requirement already satisfied: google-cloud-aiplatform in /Users/dhruvanand/.local/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 12)) (1.40.0)\n", + "Requirement already satisfied: google-api-python-client in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 13)) (2.118.0)\n", + "Requirement already satisfied: google-auth-httplib2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 14)) (0.2.0)\n", + "Requirement already satisfied: google-auth-oauthlib in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 15)) (1.2.0)\n", + "Requirement already satisfied: ratelimit in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 16)) (2.2.1)\n", + "Requirement already satisfied: backoff in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 17)) (2.2.1)\n", + "Requirement already satisfied: kdbai-client in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 18)) (0.1.2)\n", + "Requirement already satisfied: sentry-sdk[opentelemetry] in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 19)) (1.40.3)\n", + "Requirement already satisfied: halo in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 20)) (0.0.31)\n", + "Requirement already satisfied: sentence-transformers>=2.6.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 21)) (2.6.1)\n", + "Requirement already satisfied: pyvespa in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 22)) (0.39.0)\n", + "Requirement already satisfied: lancedb in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 23)) (0.5.7)\n", + "Requirement already satisfied: chromadb in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 24)) (0.4.24)\n", + "Requirement already satisfied: txtai in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 25)) (7.0.0)\n", + "Requirement already satisfied: weaviate-client>=4.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 26)) (4.5.2)\n", + "Requirement already satisfied: astrapy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 27)) (1.0.0)\n", + "Requirement already satisfied: cassandra-driver in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 28)) (3.29.0)\n", + "Requirement already satisfied: tenacity in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 29)) (8.2.3)\n", + "Requirement already satisfied: torch in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 30)) (2.1.2)\n", + "Requirement already satisfied: twine in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 31)) (5.0.0)\n", + "Requirement already satisfied: datasets>=2.19.0,~=2.16 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 32)) (2.19.0)\n", + "Requirement already satisfied: mlx_embedding_models in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 33)) (0.0.9)\n", + "Requirement already satisfied: azure-search-documents in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 34)) (11.4.0)\n", + "Requirement already satisfied: azure-identity in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 35)) (1.16.0)\n", + "Requirement already satisfied: turbopuffer[fast] in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 36)) (0.1.7)\n", + "Requirement already satisfied: pgvector in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 37)) (0.2.5)\n", + "Requirement already satisfied: psycopg in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 38)) (3.1.19)\n", + "Collecting duckdb (from -r ../../../requirements.txt (line 39))\n", + " Obtaining dependency information for duckdb from https://files.pythonhosted.org/packages/a4/a3/ff1b0127acc0d7cfe72005df80431225f056f463f6d179c6175cb95fec15/duckdb-0.10.2-cp310-cp310-macosx_11_0_arm64.whl.metadata\n", + " Downloading duckdb-0.10.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (763 bytes)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pandas->-r ../../../requirements.txt (line 2)) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pandas->-r ../../../requirements.txt (line 2)) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pandas->-r ../../../requirements.txt (line 2)) (2023.4)\n", + "Requirement already satisfied: certifi>=2019.11.17 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pinecone-client~=4.0.0->-r ../../../requirements.txt (line 3)) (2023.11.17)\n", + "Requirement already satisfied: typing-extensions>=3.7.4 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pinecone-client~=4.0.0->-r ../../../requirements.txt (line 3)) (4.9.0)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pinecone-client~=4.0.0->-r ../../../requirements.txt (line 3)) (2.0.7)\n", + "Requirement already satisfied: grpcio>=1.41.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (1.60.0)\n", + "Requirement already satisfied: grpcio-tools>=1.41.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (1.60.0)\n", + "Requirement already satisfied: httpx[http2]>=0.14.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (0.27.0)\n", + "Requirement already satisfied: portalocker<3.0.0,>=2.7.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (2.8.2)\n", + "Requirement already satisfied: pydantic>=1.10.8 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (2.7.1)\n", + "Requirement already satisfied: filelock in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (3.13.1)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (2023.12.2)\n", + "Requirement already satisfied: packaging>=20.9 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (6.0.1)\n", + "Requirement already satisfied: requests in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (2.31.0)\n", + "Requirement already satisfied: setuptools>=67 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (68.1.2)\n", + "Requirement already satisfied: protobuf>=3.20.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (4.25.3)\n", + "Requirement already satisfied: environs<=9.5.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (9.5.0)\n", + "Requirement already satisfied: ujson>=2.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (5.9.0)\n", + "Requirement already satisfied: minio>=7.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (7.2.3)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from openai>=1.10.0->-r ../../../requirements.txt (line 10)) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from openai>=1.10.0->-r ../../../requirements.txt (line 10)) (1.9.0)\n", + "Requirement already satisfied: sniffio in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from openai>=1.10.0->-r ../../../requirements.txt (line 10)) (1.3.0)\n", + "Requirement already satisfied: aiohttp in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (3.9.1)\n", + "Requirement already satisfied: click in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (8.1.7)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (6.11.0)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (3.1.3)\n", + "Requirement already satisfied: tiktoken>=0.4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (0.7.0)\n", + "Requirement already satisfied: tokenizers in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (0.15.2)\n", + "Requirement already satisfied: google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.19.0)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.23.0)\n", + "Requirement already satisfied: google-cloud-storage<3.0.0dev,>=1.32.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.16.0)\n", + "Requirement already satisfied: google-cloud-bigquery<4.0.0dev,>=1.15.0 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (3.17.1)\n", + "Requirement already satisfied: google-cloud-resource-manager<3.0.0dev,>=1.3.3 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.11.0)\n", + "Requirement already satisfied: shapely<3.0.0dev in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.0.2)\n", + "Requirement already satisfied: httplib2<1.dev0,>=0.15.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-api-python-client->-r ../../../requirements.txt (line 13)) (0.22.0)\n", + "Requirement already satisfied: google-auth<3.0.0.dev0,>=1.19.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-api-python-client->-r ../../../requirements.txt (line 13)) (2.26.1)\n", + "Requirement already satisfied: uritemplate<5,>=3.0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-api-python-client->-r ../../../requirements.txt (line 13)) (4.1.1)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-auth-oauthlib->-r ../../../requirements.txt (line 15)) (1.3.1)\n", + "Requirement already satisfied: pykx<3.0.0,>=2.1.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from kdbai-client->-r ../../../requirements.txt (line 18)) (2.3.0)\n", + "Requirement already satisfied: opentelemetry-distro>=0.35b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentry-sdk[opentelemetry]->-r ../../../requirements.txt (line 19)) (0.45b0)\n", + "Requirement already satisfied: log-symbols>=0.0.14 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (0.0.14)\n", + "Requirement already satisfied: spinners>=0.0.24 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (0.0.24)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (2.4.0)\n", + "Requirement already satisfied: colorama>=0.3.9 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (0.4.6)\n", + "Requirement already satisfied: six>=1.12.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (1.16.0)\n", + "Requirement already satisfied: transformers<5.0.0,>=4.32.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (4.36.2)\n", + "Requirement already satisfied: scikit-learn in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (1.3.2)\n", + "Requirement already satisfied: scipy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (1.11.4)\n", + "Requirement already satisfied: Pillow in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (10.2.0)\n", + "Requirement already satisfied: docker in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pyvespa->-r ../../../requirements.txt (line 22)) (7.0.0)\n", + "Requirement already satisfied: cryptography in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pyvespa->-r ../../../requirements.txt (line 22)) (42.0.2)\n", + "Requirement already satisfied: deprecation in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (2.1.0)\n", + "Requirement already satisfied: pylance==0.9.18 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (0.9.18)\n", + "Requirement already satisfied: ratelimiter~=1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (1.2.0.post0)\n", + "Requirement already satisfied: retry>=0.9.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (0.9.2)\n", + "Requirement already satisfied: attrs>=21.3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (23.2.0)\n", + "Requirement already satisfied: semver>=3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (3.0.2)\n", + "Requirement already satisfied: cachetools in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (5.3.2)\n", + "Requirement already satisfied: overrides>=0.7 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (7.7.0)\n", + "Requirement already satisfied: build>=1.0.3 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.0.3)\n", + "Requirement already satisfied: chroma-hnswlib==0.7.3 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.7.3)\n", + "Requirement already satisfied: fastapi>=0.95.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.104.1)\n", + "Requirement already satisfied: uvicorn[standard]>=0.18.3 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.22.0)\n", + "Requirement already satisfied: posthog>=2.4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (3.4.2)\n", + "Requirement already satisfied: pulsar-client>=3.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (3.4.0)\n", + "Requirement already satisfied: onnxruntime>=1.14.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.17.3)\n", + "Requirement already satisfied: opentelemetry-api>=1.2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: opentelemetry-instrumentation-fastapi>=0.41b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.45b0)\n", + "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: pypika>=0.48.9 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.48.9)\n", + "Requirement already satisfied: importlib-resources in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (6.1.1)\n", + "Requirement already satisfied: bcrypt>=4.0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (4.1.2)\n", + "Requirement already satisfied: typer>=0.9.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.9.0)\n", + "Requirement already satisfied: kubernetes>=28.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (29.0.0)\n", + "Requirement already satisfied: mmh3>=4.0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (4.1.0)\n", + "Requirement already satisfied: orjson>=3.9.12 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (3.9.15)\n", + "Requirement already satisfied: faiss-cpu>=1.7.1.post2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from txtai->-r ../../../requirements.txt (line 25)) (1.8.0)\n", + "Requirement already satisfied: regex>=2022.8.17 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from txtai->-r ../../../requirements.txt (line 25)) (2023.12.25)\n", + "Requirement already satisfied: validators==0.22.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from weaviate-client>=4.0.0->-r ../../../requirements.txt (line 26)) (0.22.0)\n", + "Requirement already satisfied: authlib<2.0.0,>=1.2.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from weaviate-client>=4.0.0->-r ../../../requirements.txt (line 26)) (1.3.0)\n", + "Requirement already satisfied: grpcio-health-checking<2.0.0,>=1.57.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from weaviate-client>=4.0.0->-r ../../../requirements.txt (line 26)) (1.60.0)\n", + "Requirement already satisfied: httpcore==1.* in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (1.0.2)\n", + "Requirement already satisfied: idna in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (3.6)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httpcore==1.*->httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (0.14.0)\n", + "Requirement already satisfied: bson<0.6.0,>=0.5.10 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from astrapy->-r ../../../requirements.txt (line 27)) (0.5.10)\n", + "Requirement already satisfied: cassio<0.2.0,>=0.1.4 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from astrapy->-r ../../../requirements.txt (line 27)) (0.1.4)\n", + "Requirement already satisfied: toml<0.11.0,>=0.10.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from astrapy->-r ../../../requirements.txt (line 27)) (0.10.2)\n", + "Requirement already satisfied: uuid6<2024.2.0,>=2024.1.12 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from astrapy->-r ../../../requirements.txt (line 27)) (2024.1.12)\n", + "Requirement already satisfied: geomet<0.3,>=0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from cassandra-driver->-r ../../../requirements.txt (line 28)) (0.2.1.post1)\n", + "Requirement already satisfied: sympy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from torch->-r ../../../requirements.txt (line 30)) (1.12)\n", + "Requirement already satisfied: networkx in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from torch->-r ../../../requirements.txt (line 30)) (3.2.1)\n", + "Requirement already satisfied: pkginfo>=1.8.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (1.10.0)\n", + "Requirement already satisfied: readme-renderer>=35.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (43.0)\n", + "Requirement already satisfied: requests-toolbelt!=0.9.0,>=0.8.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (1.0.0)\n", + "Requirement already satisfied: keyring>=15.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (25.2.0)\n", + "Requirement already satisfied: rfc3986>=1.4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (2.0.0)\n", + "Requirement already satisfied: rich>=12.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (13.7.0)\n", + "Requirement already satisfied: pyarrow-hotfix in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from datasets>=2.19.0,~=2.16->-r ../../../requirements.txt (line 32)) (0.6)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from datasets>=2.19.0,~=2.16->-r ../../../requirements.txt (line 32)) (0.3.7)\n", + "Requirement already satisfied: xxhash in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from datasets>=2.19.0,~=2.16->-r ../../../requirements.txt (line 32)) (3.4.1)\n", + "Requirement already satisfied: multiprocess in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from datasets>=2.19.0,~=2.16->-r ../../../requirements.txt (line 32)) (0.70.15)\n", + "Requirement already satisfied: mlx>=0.6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from mlx_embedding_models->-r ../../../requirements.txt (line 33)) (0.13.0)\n", + "Requirement already satisfied: awkward in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from mlx_embedding_models->-r ../../../requirements.txt (line 33)) (2.6.3)\n", + "Requirement already satisfied: azure-core<2.0.0,>=1.28.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-search-documents->-r ../../../requirements.txt (line 34)) (1.30.1)\n", + "Requirement already satisfied: azure-common~=1.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-search-documents->-r ../../../requirements.txt (line 34)) (1.1.28)\n", + "Requirement already satisfied: isodate>=0.6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-search-documents->-r ../../../requirements.txt (line 34)) (0.6.1)\n", + "Requirement already satisfied: msal>=1.24.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-identity->-r ../../../requirements.txt (line 35)) (1.28.0)\n", + "Requirement already satisfied: msal-extensions>=0.3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-identity->-r ../../../requirements.txt (line 35)) (1.1.0)\n", + "Requirement already satisfied: iso8601<3.0.0,>=2.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from turbopuffer[fast]->-r ../../../requirements.txt (line 36)) (2.1.0)\n", + "Requirement already satisfied: exceptiongroup in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai>=1.10.0->-r ../../../requirements.txt (line 10)) (1.2.0)\n", + "Requirement already satisfied: pyproject_hooks in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from build>=1.0.3->chromadb->-r ../../../requirements.txt (line 24)) (1.0.0)\n", + "Requirement already satisfied: tomli>=1.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from build>=1.0.3->chromadb->-r ../../../requirements.txt (line 24)) (2.0.1)\n", + "Requirement already satisfied: cffi>=1.12 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from cryptography->pyvespa->-r ../../../requirements.txt (line 22)) (1.16.0)\n", + "Requirement already satisfied: marshmallow>=3.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from environs<=9.5.0->pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (3.20.2)\n", + "Requirement already satisfied: starlette<0.28.0,>=0.27.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from fastapi>=0.95.2->chromadb->-r ../../../requirements.txt (line 24)) (0.27.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (6.0.4)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (1.9.4)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (1.4.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (1.3.1)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (4.0.3)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.63.0)\n", + "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.60.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-auth<3.0.0.dev0,>=1.19.0->google-api-python-client->-r ../../../requirements.txt (line 13)) (0.3.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-auth<3.0.0.dev0,>=1.19.0->google-api-python-client->-r ../../../requirements.txt (line 13)) (4.9)\n", + "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.4.1)\n", + "Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.7.0)\n", + "Requirement already satisfied: grpc-google-iam-v1<1.0.0dev,>=0.12.4 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-resource-manager<3.0.0dev,>=1.3.3->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (0.13.0)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-storage<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.5.0)\n", + "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httplib2<1.dev0,>=0.15.0->google-api-python-client->-r ../../../requirements.txt (line 13)) (3.1.2)\n", + "Requirement already satisfied: h2<5,>=3 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (4.1.0)\n", + "Requirement already satisfied: zipp>=0.5 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from importlib-metadata>=6.8.0->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (3.17.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from jinja2<4.0.0,>=3.1.2->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (2.1.5)\n", + "Requirement already satisfied: jaraco.classes in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (3.4.0)\n", + "Requirement already satisfied: jaraco.functools in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (4.0.1)\n", + "Requirement already satisfied: jaraco.context in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (5.3.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r ../../../requirements.txt (line 24)) (1.7.0)\n", + "Requirement already satisfied: oauthlib>=3.2.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r ../../../requirements.txt (line 24)) (3.2.2)\n", + "Requirement already satisfied: argon2-cffi in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from minio>=7.0.0->pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (23.1.0)\n", + "Requirement already satisfied: pycryptodome in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from minio>=7.0.0->pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (3.20.0)\n", + "Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from msal>=1.24.0->azure-identity->-r ../../../requirements.txt (line 35)) (2.8.0)\n", + "Requirement already satisfied: coloredlogs in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from onnxruntime>=1.14.1->chromadb->-r ../../../requirements.txt (line 24)) (15.0.1)\n", + "Requirement already satisfied: flatbuffers in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from onnxruntime>=1.14.1->chromadb->-r ../../../requirements.txt (line 24)) (23.5.26)\n", + "Requirement already satisfied: deprecated>=1.2.6 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-api>=1.2.0->chromadb->-r ../../../requirements.txt (line 24)) (1.2.14)\n", + "Requirement already satisfied: opentelemetry-instrumentation==0.45b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-distro>=0.35b0->sentry-sdk[opentelemetry]->-r ../../../requirements.txt (line 19)) (0.45b0)\n", + "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-distro>=0.35b0->sentry-sdk[opentelemetry]->-r ../../../requirements.txt (line 19)) (1.16.0)\n", + "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.24.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: opentelemetry-proto==1.24.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: opentelemetry-instrumentation-asgi==0.45b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r ../../../requirements.txt (line 24)) (0.45b0)\n", + "Requirement already satisfied: opentelemetry-semantic-conventions==0.45b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r ../../../requirements.txt (line 24)) (0.45b0)\n", + "Requirement already satisfied: opentelemetry-util-http==0.45b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r ../../../requirements.txt (line 24)) (0.45b0)\n", + "Requirement already satisfied: asgiref~=3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation-asgi==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r ../../../requirements.txt (line 24)) (3.7.2)\n", + "Requirement already satisfied: monotonic>=1.5 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from posthog>=2.4.0->chromadb->-r ../../../requirements.txt (line 24)) (1.6)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pydantic>=1.10.8->qdrant_client->-r ../../../requirements.txt (line 5)) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.18.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pydantic>=1.10.8->qdrant_client->-r ../../../requirements.txt (line 5)) (2.18.2)\n", + "Requirement already satisfied: nh3>=0.2.14 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from readme-renderer>=35.0->twine->-r ../../../requirements.txt (line 31)) (0.2.17)\n", + "Requirement already satisfied: docutils>=0.13.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from readme-renderer>=35.0->twine->-r ../../../requirements.txt (line 31)) (0.20.1)\n", + "Requirement already satisfied: Pygments>=2.5.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from readme-renderer>=35.0->twine->-r ../../../requirements.txt (line 31)) (2.17.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from requests->huggingface_hub->-r ../../../requirements.txt (line 8)) (3.3.2)\n", + "Requirement already satisfied: decorator>=3.4.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from retry>=0.9.2->lancedb->-r ../../../requirements.txt (line 23)) (5.1.1)\n", + "Requirement already satisfied: py<2.0.0,>=1.4.26 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from retry>=0.9.2->lancedb->-r ../../../requirements.txt (line 23)) (1.11.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from rich>=12.0.0->twine->-r ../../../requirements.txt (line 31)) (3.0.0)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (0.4.2)\n", + "Requirement already satisfied: httptools>=0.5.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r ../../../requirements.txt (line 24)) (0.6.1)\n", + "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r ../../../requirements.txt (line 24)) (0.19.0)\n", + "Requirement already satisfied: watchfiles>=0.13 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r ../../../requirements.txt (line 24)) (0.21.0)\n", + "Requirement already satisfied: websockets>=10.4 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r ../../../requirements.txt (line 24)) (11.0.3)\n", + "Requirement already satisfied: awkward-cpp==32 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from awkward->mlx_embedding_models->-r ../../../requirements.txt (line 33)) (32)\n", + "Requirement already satisfied: joblib>=1.1.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from scikit-learn->sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from scikit-learn->sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (3.2.0)\n", + "Requirement already satisfied: mpmath>=0.19 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sympy->torch->-r ../../../requirements.txt (line 30)) (1.3.0)\n", + "Requirement already satisfied: pycparser in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from cffi>=1.12->cryptography->pyvespa->-r ../../../requirements.txt (line 22)) (2.21)\n", + "Requirement already satisfied: hyperframe<7,>=6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from h2<5,>=3->httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (6.0.1)\n", + "Requirement already satisfied: hpack<5,>=4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from h2<5,>=3->httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (4.0.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=12.0.0->twine->-r ../../../requirements.txt (line 31)) (0.1.2)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.0.dev0,>=1.19.0->google-api-python-client->-r ../../../requirements.txt (line 13)) (0.5.1)\n", + "Requirement already satisfied: argon2-cffi-bindings in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from argon2-cffi->minio>=7.0.0->pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (21.2.0)\n", + "Requirement already satisfied: humanfriendly>=9.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from coloredlogs->onnxruntime>=1.14.1->chromadb->-r ../../../requirements.txt (line 24)) (10.0)\n", + "Requirement already satisfied: more-itertools in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from jaraco.classes->keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (10.2.0)\n", + "Requirement already satisfied: backports.tarfile in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from jaraco.context->keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (1.1.1)\n", + "Downloading duckdb-0.10.2-cp310-cp310-macosx_11_0_arm64.whl (14.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.4/14.4 MB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hInstalling collected packages: duckdb\n", + "Successfully installed duckdb-0.10.2\n" + ] + } + ], + "source": [ + "!pip install -r ../../../requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(42,)]\n" + ] + } + ], + "source": [ + "import duckdb\n", + "cursor = duckdb.connect()\n", + "print(cursor.execute('SELECT 42').fetchall())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "duckdb.connect" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/vdf_io/notebooks/neon.ipynb b/src/vdf_io/notebooks/neon.ipynb index 1a9481d..3bb8221 100644 --- a/src/vdf_io/notebooks/neon.ipynb +++ b/src/vdf_io/notebooks/neon.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -38,7 +38,7 @@ "True" ] }, - "execution_count": 1, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -76,6 +76,26 @@ "conn = psycopg.connect(NEON_CONNECTION_STRING)\n" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "psycopg.Connection" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(conn)" + ] + }, { "cell_type": "code", "execution_count": 12, @@ -106,17 +126,17 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
[]\n",
+       "
['test_table']\n",
        "
\n" ], "text/plain": [ - "\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n" + "\u001b[1m[\u001b[0m\u001b[32m'test_table'\u001b[0m\u001b[1m]\u001b[0m\n" ] }, "metadata": {}, @@ -630,6 +650,215 @@ " rprint(schema, tables)" ] }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "with conn.cursor() as cur:\n", + " # Execute the CREATE EXTENSION command\n", + " try:\n", + " cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector;\")\n", + " conn.commit()\n", + " except Exception as e:\n", + " conn.rollback()\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "register_vector(conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# create a table in public\n", + "with conn.cursor() as cur:\n", + " cur.execute(\n", + " \"\"\"\n", + " CREATE TABLE public.test_table (\n", + " id serial PRIMARY KEY,\n", + " name VARCHAR ( 50 ) UNIQUE NOT NULL,\n", + " vector VECTOR(3)\n", + " );\n", + " \"\"\"\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "ename": "OperationalError", + "evalue": "connection failed: connection to server at \"3.209.98.240\", port 5432 failed: could not receive data from server: Connection refused\ncould not send SSL negotiation packet: Connection refused", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOperationalError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m conn \u001b[38;5;241m=\u001b[39m \u001b[43mpsycopg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menviron\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mAURORA_POSTGRES_CONNECTION_STRING\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/psycopg/connection.py:748\u001b[0m, in \u001b[0;36mConnection.connect\u001b[0;34m(cls, conninfo, autocommit, prepare_threshold, row_factory, cursor_factory, context, **kwargs)\u001b[0m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m rv:\n\u001b[1;32m 747\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m last_ex\n\u001b[0;32m--> 748\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m last_ex\u001b[38;5;241m.\u001b[39mwith_traceback(\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 750\u001b[0m rv\u001b[38;5;241m.\u001b[39m_autocommit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mbool\u001b[39m(autocommit)\n\u001b[1;32m 751\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m row_factory:\n", + "\u001b[0;31mOperationalError\u001b[0m: connection failed: connection to server at \"3.209.98.240\", port 5432 failed: could not receive data from server: Connection refused\ncould not send SSL negotiation packet: Connection refused" + ] + } + ], + "source": [ + "conn = psycopg.connect(os.environ['AURORA_POSTGRES_CONNECTION_STRING'])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table 'test_table' does not exist in schema 'public'. Creating the table...\n", + "Data inserted successfully.\n" + ] + } + ], + "source": [ + "# Ensure table exists and use the correct schema\n", + "with conn.cursor() as cur:\n", + " try:\n", + " # Set search path to public schema\n", + " cur.execute(\"SET search_path TO public;\")\n", + " conn.commit()\n", + " \n", + " # Verify table creation\n", + " cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'test_table';\")\n", + " table_exists = cur.fetchone()\n", + " if not table_exists:\n", + " print(\"Table 'test_table' does not exist in schema 'public'. Creating the table...\")\n", + " cur.execute(\n", + " \"\"\"\n", + " CREATE TABLE public.test_table (\n", + " id serial PRIMARY KEY,\n", + " name text NOT NULL,\n", + " vector vector(3) -- assuming the vector has 3 dimensions\n", + " );\n", + " \"\"\"\n", + " )\n", + " conn.commit()\n", + " else:\n", + " print(\"Table 'test_table' already exists.\")\n", + "\n", + " # Insert data into the table\n", + " cur.execute(\n", + " \"\"\"\n", + " INSERT INTO public.test_table (name, vector)\n", + " VALUES\n", + " ('A', ARRAY[1, 2, 3]),\n", + " ('B', ARRAY[4, 5, 6]),\n", + " ('C', ARRAY[7, 8, 9]);\n", + " \"\"\"\n", + " )\n", + " conn.commit()\n", + " print(\"Data inserted successfully.\")\n", + " except Exception as e:\n", + " conn.rollback()\n", + " print(f\"Error occurred: {e}\")\n", + "\n", + "# Close the connection\n", + "conn.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa\n", + "import pyarrow.parquet as pq\n", + "\n", + "schema = pq.read_schema('/Users/dhruvanand/Code/vector-io/vdf_20240515_224025_95696/medium_articles/1.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id: int64\n", + "vector: list\n", + " child 0, element: double\n", + "claps: int64\n", + "title: string\n", + "responses: int64\n", + "reading_time: int64\n", + "publication: string\n", + "link: string\n", + "-- schema metadata --\n", + "pandas: '{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"' + 1163" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schema" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "double\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "for field in schema:\n", + " # print(field,field.name,field.type, type(field.type))\n", + " if isinstance(field.type, pa.lib.ListType):\n", + " print(field.type.value_type)\n", + " print(field.type)\n", + " # switch case for field.type\n", + " if field.type == pa.lib.ListType:\n", + " print(field.type)\n", + " if field.type == pa.lib.StructType:\n", + " print(field.type)\n", + " if field.type == pa.lib.DictionaryType:\n", + " print(field.type)\n", + " if field.type == pa.lib.TimestampType:\n", + " print(field.type)\n", + " if field.type == pa.lib.TimeType:\n", + " print(field.type)\n", + " " + ] + }, { "cell_type": "code", "execution_count": null, From c9bcada150b7f22bbd134cc79020b52cab82f9ab Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 7 Jun 2024 11:44:27 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/vdf_io/import_vdf/vdf_import_cls.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/vdf_io/import_vdf/vdf_import_cls.py b/src/vdf_io/import_vdf/vdf_import_cls.py index 4c7f4d1..72e4428 100644 --- a/src/vdf_io/import_vdf/vdf_import_cls.py +++ b/src/vdf_io/import_vdf/vdf_import_cls.py @@ -162,11 +162,13 @@ def set_dims(self, namespace_meta, index_name, vector_column_name=None): f"Failed to resolve dimensions for index '{index_name}'" ) - def resolve_dims(self, namespace_meta, index_name, vector_column_name = None): + def resolve_dims(self, namespace_meta, index_name, vector_column_name=None): final_data_path = self.get_final_data_path(namespace_meta["data_path"]) parquet_files = self.get_parquet_files(final_data_path) if vector_column_name is None: - _, vector_column_name = self.get_vector_column_name(index_name, namespace_meta) + _, vector_column_name = self.get_vector_column_name( + index_name, namespace_meta + ) dims = -1 found = False for file in tqdm(parquet_files, desc="Iterating parquet files"):