diff --git a/README.md b/README.md index 9bab88a..68f75cc 100644 --- a/README.md +++ b/README.md @@ -66,13 +66,12 @@ See the [Contributing](#contributing) section to add support for your favorite v | Vector Database | Import | Export | |--------------------------------|--------|--------| +| pgvector | ❌ | ❌ | | Azure AI Search | ❌ | ❌ | | Weaviate | ❌ | ❌ | | MongoDB Atlas | ❌ | ❌ | -| OpenSearch | ❌ | ❌ | | Apache Cassandra | ❌ | ❌ | | txtai | ❌ | ❌ | -| pgvector | ❌ | ❌ | | SQLite-VSS | ❌ | ❌ | @@ -84,8 +83,12 @@ See the [Contributing](#contributing) section to add support for your favorite v | Vector Database | Import | Export | |--------------------------------|--------|--------| | Vespa | ❌ | ❌ | +| AWS Neptune | ❌ | ❌ | +| Neo4j | ❌ | ❌ | | Marqo | ❌ | ❌ | +| OpenSearch | ❌ | ❌ | | Elasticsearch | ❌ | ❌ | +| Apache Solr | ❌ | ❌ | | Redis Search | ❌ | ❌ | | ClickHouse | ❌ | ❌ | | USearch | ❌ | ❌ | @@ -96,13 +99,11 @@ See the [Contributing](#contributing) section to add support for your favorite v | CrateDB | ❌ | ❌ | | Meilisearch | ❌ | ❌ | | MyScale | ❌ | ❌ | -| Neo4j | ❌ | ❌ | | Nuclia DB | ❌ | ❌ | | OramaSearch | ❌ | ❌ | | Typesense | ❌ | ❌ | | Anari AI | ❌ | ❌ | | Vald | ❌ | ❌ | -| Apache Solr | ❌ | ❌ | ## Installation diff --git a/requirements.txt b/requirements.txt index 260025b..f75c915 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,4 +34,6 @@ mlx_embedding_models azure-search-documents azure-identity turbopuffer[fast] -psycopg2 \ No newline at end of file +pgvector +psycopg +duckdb diff --git a/src/vdf_io/export_vdf/pgvector_export.py b/src/vdf_io/export_vdf/pgvector_export.py new file mode 100644 index 0000000..8c760db --- /dev/null +++ b/src/vdf_io/export_vdf/pgvector_export.py @@ -0,0 +1,160 @@ +import json +import os +from typing import Dict, List +from tqdm import tqdm + +import psycopg2 + +from vdf_io.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt +from vdf_io.meta_types import NamespaceMeta +from vdf_io.names import DBNames +from vdf_io.util import set_arg_from_input +from vdf_io.export_vdf.vdb_export_cls import ExportVDB + + +class ExportPGVector(ExportVDB): + DB_NAME_SLUG = DBNames.PGVECTOR + + @classmethod + def make_parser(cls, subparsers): + parser_pgvector = make_pgv_parser(cls.DB_NAME_SLUG, subparsers) + parser_pgvector.add_argument( + "--batch_size", + type=int, + help="Batch size for exporting data", + default=10_000, + ) + parser_pgvector.add_argument( + "--tables", type=str, help="Postgres tables to export (comma-separated)" + ) + + @classmethod + def export_vdb(cls, args): + set_pgv_args_from_prompt(args) + pgvector_export = ExportPGVector(args) + pgvector_export.get_all_table_names() + pgvector_export.get_all_schemas() + set_arg_from_input( + args, + "schema", + "Enter the name of the schema of the Postgres instance (default: public): ", + str, + choices=pgvector_export.all_schemas, + ) + set_arg_from_input( + args, + "tables", + "Enter the name of tables to import (comma-separated, all will be imported by default): ", + str, + choices=pgvector_export.all_tables, + ) + pgvector_export.get_data() + return pgvector_export + + def __init__(self, args): + super().__init__(args) + if args.get("connection_string"): + self.conn = psycopg2.connect(args["connection_string"]) + else: + self.conn = psycopg2.connect( + user=args["user"], + password=args["password"], + host=args["host"] if args.get("host", "") != "" else "localhost", + port=args["port"] if args.get("port", "") != "" else "5432", + dbname=args["dbname"] if args.get("dbname", "") != "" else "postgres", + ) + self.cur = self.conn.cursor() + + def get_all_schemas(self): + schemas = self.cur.execute( + "SELECT schema_name FROM information_schema.schemata" + ) + self.all_schemas = [schema[0] for schema in schemas] + return [schema[0] for schema in schemas] + + def get_all_table_names(self): + self.schema_name = self.args.get("schema") or "public" + tables = self.cur.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema='public'" + ) + self.all_tables = [table[0] for table in tables] + return [table[0] for table in tables] + + def get_all_index_names(self): + # get all tables in the schema + return self.cur.execute( + f"SELECT table_name FROM information_schema.tables WHERE table_schema='{self.schema_name}'" + ) + + def get_index_names(self): + if self.args.get("tables", None) is not None: + return self.args["tables"].split(",") + return self.get_all_index_names() + + def get_data(self): + index_names = self.get_index_names() + BATCH_SIZE = self.args["batch_size"] + total = 0 + index_metas: Dict[str, List[NamespaceMeta]] = {} + for index_name in index_names: + namespace_metas = [] + vectors_directory = self.create_vec_dir(index_name) + table = self.db.open_table(index_name) + offset = 0 + remainder_df = None + j = 0 + # for batch in tqdm(table.to_lance().to_batches()): + # df = batch.to_pandas() + # if remainder_df is not None: + # df = pd.concat([remainder_df, df]) + # while len(df) >= BATCH_SIZE: + # # TODO: use save_vectors_to_parquet + # df[:BATCH_SIZE].to_parquet( + # os.path.join(vectors_directory, f"{j}.parquet") + # ) + # j += 1 + # total += BATCH_SIZE + # df = df[BATCH_SIZE:] + # offset += BATCH_SIZE + # remainder_df = df + # if remainder_df is not None and len(remainder_df) > 0: + # # TODO: use save_vectors_to_parquet + # remainder_df.to_parquet(os.path.join(vectors_directory, f"{j}.parquet")) + # total += len(remainder_df) + # dim = -1 + # for name in table.schema.names: + # if pyarrow.types.is_fixed_size_list(table.schema.field(name).type): + # dim = table.schema.field(name).type.list_size + # vector_columns = [ + # name + # for name in table.schema.names + # if pyarrow.types.is_fixed_size_list(table.schema.field(name).type) + # ] + # distance = "Cosine" + # try: + # for index in table.list_indices(): + # if index.vector_column_name == vector_columns[0]: + # distance = vector_columns[0] + # except Exception: + # pass + + namespace_metas = [ + self.get_namespace_meta( + index_name, + vectors_directory, + total=total, + num_vectors_exported=total, + # dim=dim, + # vector_columns=vector_columns, + # distance=distance, + ) + ] + index_metas[index_name] = namespace_metas + self.file_structure.append(os.path.join(self.vdf_directory, "VDF_META.json")) + internal_metadata = self.get_basic_vdf_meta(index_metas) + meta_text = json.dumps(internal_metadata.model_dump(), indent=4) + tqdm.write(meta_text) + with open(os.path.join(self.vdf_directory, "VDF_META.json"), "w") as json_file: + json_file.write(meta_text) + # print internal metadata properly + return True diff --git a/src/vdf_io/import_vdf/pgvector_import.py b/src/vdf_io/import_vdf/pgvector_import.py new file mode 100644 index 0000000..0d147eb --- /dev/null +++ b/src/vdf_io/import_vdf/pgvector_import.py @@ -0,0 +1,241 @@ +from typing import Dict, List +from dotenv import load_dotenv +import pandas as pd +from tqdm import tqdm +import pyarrow.parquet as pq + +from pgvector.psycopg import register_vector +import psycopg + +from vdf_io.constants import DEFAULT_BATCH_SIZE, INT_MAX +from vdf_io.pgvector_util import make_pgv_parser, set_pgv_args_from_prompt +from vdf_io.meta_types import NamespaceMeta +from vdf_io.names import DBNames +from vdf_io.util import ( + cleanup_df, + divide_into_batches, + set_arg_from_input, +) +from vdf_io.import_vdf.vdf_import_cls import ImportVDB + + +load_dotenv() + + +class ImportPGVector(ImportVDB): + DB_NAME_SLUG = DBNames.PGVECTOR + + @classmethod + def import_vdb(cls, args): + """ + Import data to PGVector + """ + set_pgv_args_from_prompt(args) + pgvector_import = ImportPGVector(args) + pgvector_import.get_all_table_names() + pgvector_import.get_all_schemas() + set_arg_from_input( + args, + "schema", + "Enter the name of the schema of the Postgres instance (default: public): ", + str, + choices=pgvector_import.all_schemas, + ) + pgvector_import.upsert_data() + return pgvector_import + + @classmethod + def make_parser(cls, subparsers): + _parser_pgvector = make_pgv_parser(cls.DB_NAME_SLUG, subparsers) + + def __init__(self, args): + # call super class constructor + super().__init__(args) + # use connection_string + if args.get("connection_string"): + self.conn = psycopg.connect(args["connection_string"]) + else: + self.conn = psycopg.connect( + user=args["user"], + password=args["password"], + host=args["host"] if args.get("host", "") != "" else "localhost", + port=args["port"] if args.get("port", "") != "" else "5432", + dbname=args["dbname"] if args.get("dbname", "") != "" else "postgres", + ) + + def get_all_schemas(self): + with self.conn.cursor() as cur: + cur.execute("SELECT schema_name FROM information_schema.schemata") + schemas_response = cur.fetchall() + self.all_schemas = ( + [schema[0] for schema in schemas_response] if schemas_response else [] + ) + return self.all_schemas + + def get_all_table_names(self): + self.schema_name = self.args.get("schema") or "public" + with self.conn.cursor() as cur: + cur.execute( + f"SELECT table_name FROM information_schema.tables WHERE table_schema='{self.schema_name}'" + ) + tables_response = cur.fetchall() + tqdm.write(f"Tables in schema {self.schema_name}: {tables_response}") + self.all_tables = ( + [table[0] for table in tables_response] if tables_response else [] + ) + return self.all_tables + + def upsert_data(self): + # create pgvector extension if not exists + with self.conn.cursor() as cur: + cur.execute("CREATE EXTENSION IF NOT EXISTS vector") + # register vector type + register_vector(self.conn) + + # use the schema + with self.conn.cursor() as cur: + cur.execute(f"SET search_path TO {self.schema_name}") + + max_hit = False + self.total_imported_count = 0 + indexes_content: Dict[str, List[NamespaceMeta]] = self.vdf_meta["indexes"] + index_names: List[str] = list(indexes_content.keys()) + if len(index_names) == 0: + raise ValueError("No indexes found in VDF_META.json") + self.tables = self.get_all_table_names() + # Load Parquet file + for index_name, index_meta in tqdm( + indexes_content.items(), desc="Importing indexes" + ): + for namespace_meta in tqdm(index_meta, desc="Importing namespaces"): + self.set_dims(namespace_meta, index_name) + data_path = namespace_meta["data_path"] + final_data_path = self.get_final_data_path(data_path) + # Load the data from the parquet files + parquet_files = self.get_parquet_files(final_data_path) + + new_index_name = index_name + ( + f'_{namespace_meta["namespace"]}' + if namespace_meta["namespace"] + else "" + ) + new_index_name = self.create_new_name(new_index_name, self.tables) + if new_index_name not in self.tables: + # assemble schema using parquet file's schema + schema = pq.read_schema(parquet_files[0]) + schema_dict = {} + if "model_map" not in namespace_meta: + namespace_meta["model_map"] = {} +""" +┌──────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐ +│ column_name │ column_type │ null │ key │ default │ extra │ +│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ +├──────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤ +│ id │ BIGINT │ YES │ │ │ │ +│ vector │ DOUBLE[] │ YES │ │ │ │ +│ claps │ BIGINT │ YES │ │ │ │ +│ title │ VARCHAR │ YES │ │ │ │ +│ responses │ BIGINT │ YES │ │ │ │ +│ reading_time │ BIGINT │ YES │ │ │ │ +│ publication │ VARCHAR │ YES │ │ │ │ +│ link │ VARCHAR │ YES │ │ │ │ +└──────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘ +id: int64 +vector: list + child 0, element: double +claps: int64 +title: string +responses: int64 +reading_time: int64 +publication: string +link: string +""" + parquet_to_sql_type_map = { + + "int64": "BIGINT", + "float64": "DOUBLE PRECISION", + "bool": "BOOLEAN", + "datetime64[ns]": "TIMESTAMP", + "timedelta64[ns]": "INTERVAL", + "object": "VARCHAR", + } + for field in schema: + col_type = field.type + col_name = field.name + schema_dict[col_name] = col_type + # check if the column is a vector column + if col_name in namespace_meta["model_map"]: + schema_dict[col_name] = "vector()" + # create schema string + schema_str = ", ".join( + [f"{col_name} {col_type}" for col_name, col_type in schema_dict.items()] + ) + # create postgres table + with self.conn.cursor() as cur: + cur.execute( + f"CREATE TABLE {new_index_name} (id SERIAL PRIMARY KEY)" + ) + tqdm.write(f"Created table {new_index_name}") + table_name = new_index_name + else: + # set table name + table_name = new_index_name + for file in tqdm(parquet_files, desc="Iterating parquet files"): + file_path = self.get_file_path(final_data_path, file) + df = self.read_parquet_progress( + file_path, + max_num_rows=( + (self.args.get("max_num_rows") or INT_MAX) + - self.total_imported_count + ), + ) + df = cleanup_df(df) + # if there are additional columns in the parquet file, add them to the table + # split in batches + BATCH_SIZE = self.args.get("batch_size") or DEFAULT_BATCH_SIZE + for batch in tqdm( + divide_into_batches(df, BATCH_SIZE), + desc="Importing batches", + total=len(df) // BATCH_SIZE, + ): + if self.total_imported_count + len(batch) >= ( + self.args.get("max_num_rows") or INT_MAX + ): + batch = batch[ + : (self.args.get("max_num_rows") or INT_MAX) + - self.total_imported_count + ] + max_hit = True + # convert df into list of dicts + with self.conn.cursor() as cur: + cur.execute( + f"""INSERT INTO {table_name + } ({', '.join(batch.columns) + }) VALUES {', '.join([str(tuple(row)) for row in batch.itertuples(index=False)]) + }""" + ) + self.total_imported_count += len(batch) + if max_hit: + break + tqdm.write(f"Imported {self.total_imported_count} rows") + with self.conn.cursor() as cur: + cur.execute(f"SELECT COUNT(*) FROM {table_name}") + new_table_size = cur.fetchone()[0] + tqdm.write(f"New table size: {new_table_size}") + if max_hit: + break + print("Data imported successfully") + + +def get_default_value(data_type): + # Define default values for common data types + default_values = { + "object": "", + "int64": 0, + "float64": 0.0, + "bool": False, + "datetime64[ns]": pd.Timestamp("NaT"), + "timedelta64[ns]": pd.Timedelta("NaT"), + } + # Return the default value for the specified data type, or None if not specified + return default_values.get(data_type.name, None) diff --git a/src/vdf_io/import_vdf/vdf_import_cls.py b/src/vdf_io/import_vdf/vdf_import_cls.py index 31ef4d6..72e4428 100644 --- a/src/vdf_io/import_vdf/vdf_import_cls.py +++ b/src/vdf_io/import_vdf/vdf_import_cls.py @@ -149,10 +149,10 @@ def get_file_path(self, final_data_path, parquet_file): return parquet_file return os.path.join(final_data_path, parquet_file) - def set_dims(self, namespace_meta, index_name): + def set_dims(self, namespace_meta, index_name, vector_column_name=None): if namespace_meta["dimensions"] == -1: tqdm.write(f"Resolving dimensions for index '{index_name}'") - dims = self.resolve_dims(namespace_meta, index_name) + dims = self.resolve_dims(namespace_meta, index_name, vector_column_name) if dims != -1: namespace_meta["dimensions"] = dims tqdm.write(f"Resolved dimensions: {dims}") @@ -162,10 +162,13 @@ def set_dims(self, namespace_meta, index_name): f"Failed to resolve dimensions for index '{index_name}'" ) - def resolve_dims(self, namespace_meta, index_name): + def resolve_dims(self, namespace_meta, index_name, vector_column_name=None): final_data_path = self.get_final_data_path(namespace_meta["data_path"]) parquet_files = self.get_parquet_files(final_data_path) - _, vector_column_name = self.get_vector_column_name(index_name, namespace_meta) + if vector_column_name is None: + _, vector_column_name = self.get_vector_column_name( + index_name, namespace_meta + ) dims = -1 found = False for file in tqdm(parquet_files, desc="Iterating parquet files"): diff --git a/src/vdf_io/names.py b/src/vdf_io/names.py index 6697e05..e0e33f8 100644 --- a/src/vdf_io/names.py +++ b/src/vdf_io/names.py @@ -12,3 +12,4 @@ class DBNames: ASTRADB = "astradb" AZUREAI = "azureai" TURBOPUFFER = "turbopuffer" + PGVECTOR = "pgvector" diff --git a/src/vdf_io/notebooks/duckdb.ipynb b/src/vdf_io/notebooks/duckdb.ipynb new file mode 100644 index 0000000..572333d --- /dev/null +++ b/src/vdf_io/notebooks/duckdb.ipynb @@ -0,0 +1,296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: numpy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 1)) (1.26.4)\n", + "Requirement already satisfied: pandas in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 2)) (2.1.4)\n", + "Requirement already satisfied: pinecone-client~=4.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 3)) (4.0.0)\n", + "Requirement already satisfied: pyarrow in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 4)) (16.1.0)\n", + "Requirement already satisfied: qdrant_client in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 5)) (1.7.1)\n", + "Requirement already satisfied: tqdm in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 6)) (4.66.4)\n", + "Requirement already satisfied: python-dotenv in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 7)) (1.0.0)\n", + "Requirement already satisfied: huggingface_hub in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 8)) (0.23.0)\n", + "Requirement already satisfied: pymilvus>=2.3.6 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 9)) (2.3.6)\n", + "Requirement already satisfied: openai>=1.10.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 10)) (1.12.0)\n", + "Requirement already satisfied: litellm>=1.20.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 11)) (1.34.20)\n", + "Requirement already satisfied: google-cloud-aiplatform in /Users/dhruvanand/.local/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 12)) (1.40.0)\n", + "Requirement already satisfied: google-api-python-client in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 13)) (2.118.0)\n", + "Requirement already satisfied: google-auth-httplib2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 14)) (0.2.0)\n", + "Requirement already satisfied: google-auth-oauthlib in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 15)) (1.2.0)\n", + "Requirement already satisfied: ratelimit in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 16)) (2.2.1)\n", + "Requirement already satisfied: backoff in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 17)) (2.2.1)\n", + "Requirement already satisfied: kdbai-client in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 18)) (0.1.2)\n", + "Requirement already satisfied: sentry-sdk[opentelemetry] in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 19)) (1.40.3)\n", + "Requirement already satisfied: halo in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 20)) (0.0.31)\n", + "Requirement already satisfied: sentence-transformers>=2.6.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 21)) (2.6.1)\n", + "Requirement already satisfied: pyvespa in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 22)) (0.39.0)\n", + "Requirement already satisfied: lancedb in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 23)) (0.5.7)\n", + "Requirement already satisfied: chromadb in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 24)) (0.4.24)\n", + "Requirement already satisfied: txtai in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 25)) (7.0.0)\n", + "Requirement already satisfied: weaviate-client>=4.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 26)) (4.5.2)\n", + "Requirement already satisfied: astrapy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 27)) (1.0.0)\n", + "Requirement already satisfied: cassandra-driver in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 28)) (3.29.0)\n", + "Requirement already satisfied: tenacity in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 29)) (8.2.3)\n", + "Requirement already satisfied: torch in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 30)) (2.1.2)\n", + "Requirement already satisfied: twine in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 31)) (5.0.0)\n", + "Requirement already satisfied: datasets>=2.19.0,~=2.16 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 32)) (2.19.0)\n", + "Requirement already satisfied: mlx_embedding_models in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 33)) (0.0.9)\n", + "Requirement already satisfied: azure-search-documents in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 34)) (11.4.0)\n", + "Requirement already satisfied: azure-identity in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 35)) (1.16.0)\n", + "Requirement already satisfied: turbopuffer[fast] in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 36)) (0.1.7)\n", + "Requirement already satisfied: pgvector in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 37)) (0.2.5)\n", + "Requirement already satisfied: psycopg in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from -r ../../../requirements.txt (line 38)) (3.1.19)\n", + "Collecting duckdb (from -r ../../../requirements.txt (line 39))\n", + " Obtaining dependency information for duckdb from https://files.pythonhosted.org/packages/a4/a3/ff1b0127acc0d7cfe72005df80431225f056f463f6d179c6175cb95fec15/duckdb-0.10.2-cp310-cp310-macosx_11_0_arm64.whl.metadata\n", + " Downloading duckdb-0.10.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (763 bytes)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pandas->-r ../../../requirements.txt (line 2)) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pandas->-r ../../../requirements.txt (line 2)) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pandas->-r ../../../requirements.txt (line 2)) (2023.4)\n", + "Requirement already satisfied: certifi>=2019.11.17 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pinecone-client~=4.0.0->-r ../../../requirements.txt (line 3)) (2023.11.17)\n", + "Requirement already satisfied: typing-extensions>=3.7.4 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pinecone-client~=4.0.0->-r ../../../requirements.txt (line 3)) (4.9.0)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pinecone-client~=4.0.0->-r ../../../requirements.txt (line 3)) (2.0.7)\n", + "Requirement already satisfied: grpcio>=1.41.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (1.60.0)\n", + "Requirement already satisfied: grpcio-tools>=1.41.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (1.60.0)\n", + "Requirement already satisfied: httpx[http2]>=0.14.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (0.27.0)\n", + "Requirement already satisfied: portalocker<3.0.0,>=2.7.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (2.8.2)\n", + "Requirement already satisfied: pydantic>=1.10.8 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from qdrant_client->-r ../../../requirements.txt (line 5)) (2.7.1)\n", + "Requirement already satisfied: filelock in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (3.13.1)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (2023.12.2)\n", + "Requirement already satisfied: packaging>=20.9 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (6.0.1)\n", + "Requirement already satisfied: requests in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from huggingface_hub->-r ../../../requirements.txt (line 8)) (2.31.0)\n", + "Requirement already satisfied: setuptools>=67 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (68.1.2)\n", + "Requirement already satisfied: protobuf>=3.20.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (4.25.3)\n", + "Requirement already satisfied: environs<=9.5.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (9.5.0)\n", + "Requirement already satisfied: ujson>=2.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (5.9.0)\n", + "Requirement already satisfied: minio>=7.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (7.2.3)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from openai>=1.10.0->-r ../../../requirements.txt (line 10)) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from openai>=1.10.0->-r ../../../requirements.txt (line 10)) (1.9.0)\n", + "Requirement already satisfied: sniffio in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from openai>=1.10.0->-r ../../../requirements.txt (line 10)) (1.3.0)\n", + "Requirement already satisfied: aiohttp in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (3.9.1)\n", + "Requirement already satisfied: click in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (8.1.7)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (6.11.0)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (3.1.3)\n", + "Requirement already satisfied: tiktoken>=0.4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (0.7.0)\n", + "Requirement already satisfied: tokenizers in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (0.15.2)\n", + "Requirement already satisfied: google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.19.0)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.23.0)\n", + "Requirement already satisfied: google-cloud-storage<3.0.0dev,>=1.32.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.16.0)\n", + "Requirement already satisfied: google-cloud-bigquery<4.0.0dev,>=1.15.0 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (3.17.1)\n", + "Requirement already satisfied: google-cloud-resource-manager<3.0.0dev,>=1.3.3 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.11.0)\n", + "Requirement already satisfied: shapely<3.0.0dev in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.0.2)\n", + "Requirement already satisfied: httplib2<1.dev0,>=0.15.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-api-python-client->-r ../../../requirements.txt (line 13)) (0.22.0)\n", + "Requirement already satisfied: google-auth<3.0.0.dev0,>=1.19.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-api-python-client->-r ../../../requirements.txt (line 13)) (2.26.1)\n", + "Requirement already satisfied: uritemplate<5,>=3.0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-api-python-client->-r ../../../requirements.txt (line 13)) (4.1.1)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-auth-oauthlib->-r ../../../requirements.txt (line 15)) (1.3.1)\n", + "Requirement already satisfied: pykx<3.0.0,>=2.1.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from kdbai-client->-r ../../../requirements.txt (line 18)) (2.3.0)\n", + "Requirement already satisfied: opentelemetry-distro>=0.35b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentry-sdk[opentelemetry]->-r ../../../requirements.txt (line 19)) (0.45b0)\n", + "Requirement already satisfied: log-symbols>=0.0.14 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (0.0.14)\n", + "Requirement already satisfied: spinners>=0.0.24 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (0.0.24)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (2.4.0)\n", + "Requirement already satisfied: colorama>=0.3.9 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (0.4.6)\n", + "Requirement already satisfied: six>=1.12.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from halo->-r ../../../requirements.txt (line 20)) (1.16.0)\n", + "Requirement already satisfied: transformers<5.0.0,>=4.32.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (4.36.2)\n", + "Requirement already satisfied: scikit-learn in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (1.3.2)\n", + "Requirement already satisfied: scipy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (1.11.4)\n", + "Requirement already satisfied: Pillow in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (10.2.0)\n", + "Requirement already satisfied: docker in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pyvespa->-r ../../../requirements.txt (line 22)) (7.0.0)\n", + "Requirement already satisfied: cryptography in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pyvespa->-r ../../../requirements.txt (line 22)) (42.0.2)\n", + "Requirement already satisfied: deprecation in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (2.1.0)\n", + "Requirement already satisfied: pylance==0.9.18 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (0.9.18)\n", + "Requirement already satisfied: ratelimiter~=1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (1.2.0.post0)\n", + "Requirement already satisfied: retry>=0.9.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (0.9.2)\n", + "Requirement already satisfied: attrs>=21.3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (23.2.0)\n", + "Requirement already satisfied: semver>=3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (3.0.2)\n", + "Requirement already satisfied: cachetools in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (5.3.2)\n", + "Requirement already satisfied: overrides>=0.7 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from lancedb->-r ../../../requirements.txt (line 23)) (7.7.0)\n", + "Requirement already satisfied: build>=1.0.3 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.0.3)\n", + "Requirement already satisfied: chroma-hnswlib==0.7.3 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.7.3)\n", + "Requirement already satisfied: fastapi>=0.95.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.104.1)\n", + "Requirement already satisfied: uvicorn[standard]>=0.18.3 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.22.0)\n", + "Requirement already satisfied: posthog>=2.4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (3.4.2)\n", + "Requirement already satisfied: pulsar-client>=3.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (3.4.0)\n", + "Requirement already satisfied: onnxruntime>=1.14.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.17.3)\n", + "Requirement already satisfied: opentelemetry-api>=1.2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: opentelemetry-instrumentation-fastapi>=0.41b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.45b0)\n", + "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: pypika>=0.48.9 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.48.9)\n", + "Requirement already satisfied: importlib-resources in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (6.1.1)\n", + "Requirement already satisfied: bcrypt>=4.0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (4.1.2)\n", + "Requirement already satisfied: typer>=0.9.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (0.9.0)\n", + "Requirement already satisfied: kubernetes>=28.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (29.0.0)\n", + "Requirement already satisfied: mmh3>=4.0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (4.1.0)\n", + "Requirement already satisfied: orjson>=3.9.12 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from chromadb->-r ../../../requirements.txt (line 24)) (3.9.15)\n", + "Requirement already satisfied: faiss-cpu>=1.7.1.post2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from txtai->-r ../../../requirements.txt (line 25)) (1.8.0)\n", + "Requirement already satisfied: regex>=2022.8.17 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from txtai->-r ../../../requirements.txt (line 25)) (2023.12.25)\n", + "Requirement already satisfied: validators==0.22.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from weaviate-client>=4.0.0->-r ../../../requirements.txt (line 26)) (0.22.0)\n", + "Requirement already satisfied: authlib<2.0.0,>=1.2.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from weaviate-client>=4.0.0->-r ../../../requirements.txt (line 26)) (1.3.0)\n", + "Requirement already satisfied: grpcio-health-checking<2.0.0,>=1.57.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from weaviate-client>=4.0.0->-r ../../../requirements.txt (line 26)) (1.60.0)\n", + "Requirement already satisfied: httpcore==1.* in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (1.0.2)\n", + "Requirement already satisfied: idna in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (3.6)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httpcore==1.*->httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (0.14.0)\n", + "Requirement already satisfied: bson<0.6.0,>=0.5.10 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from astrapy->-r ../../../requirements.txt (line 27)) (0.5.10)\n", + "Requirement already satisfied: cassio<0.2.0,>=0.1.4 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from astrapy->-r ../../../requirements.txt (line 27)) (0.1.4)\n", + "Requirement already satisfied: toml<0.11.0,>=0.10.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from astrapy->-r ../../../requirements.txt (line 27)) (0.10.2)\n", + "Requirement already satisfied: uuid6<2024.2.0,>=2024.1.12 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from astrapy->-r ../../../requirements.txt (line 27)) (2024.1.12)\n", + "Requirement already satisfied: geomet<0.3,>=0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from cassandra-driver->-r ../../../requirements.txt (line 28)) (0.2.1.post1)\n", + "Requirement already satisfied: sympy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from torch->-r ../../../requirements.txt (line 30)) (1.12)\n", + "Requirement already satisfied: networkx in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from torch->-r ../../../requirements.txt (line 30)) (3.2.1)\n", + "Requirement already satisfied: pkginfo>=1.8.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (1.10.0)\n", + "Requirement already satisfied: readme-renderer>=35.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (43.0)\n", + "Requirement already satisfied: requests-toolbelt!=0.9.0,>=0.8.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (1.0.0)\n", + "Requirement already satisfied: keyring>=15.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (25.2.0)\n", + "Requirement already satisfied: rfc3986>=1.4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (2.0.0)\n", + "Requirement already satisfied: rich>=12.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from twine->-r ../../../requirements.txt (line 31)) (13.7.0)\n", + "Requirement already satisfied: pyarrow-hotfix in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from datasets>=2.19.0,~=2.16->-r ../../../requirements.txt (line 32)) (0.6)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from datasets>=2.19.0,~=2.16->-r ../../../requirements.txt (line 32)) (0.3.7)\n", + "Requirement already satisfied: xxhash in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from datasets>=2.19.0,~=2.16->-r ../../../requirements.txt (line 32)) (3.4.1)\n", + "Requirement already satisfied: multiprocess in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from datasets>=2.19.0,~=2.16->-r ../../../requirements.txt (line 32)) (0.70.15)\n", + "Requirement already satisfied: mlx>=0.6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from mlx_embedding_models->-r ../../../requirements.txt (line 33)) (0.13.0)\n", + "Requirement already satisfied: awkward in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from mlx_embedding_models->-r ../../../requirements.txt (line 33)) (2.6.3)\n", + "Requirement already satisfied: azure-core<2.0.0,>=1.28.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-search-documents->-r ../../../requirements.txt (line 34)) (1.30.1)\n", + "Requirement already satisfied: azure-common~=1.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-search-documents->-r ../../../requirements.txt (line 34)) (1.1.28)\n", + "Requirement already satisfied: isodate>=0.6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-search-documents->-r ../../../requirements.txt (line 34)) (0.6.1)\n", + "Requirement already satisfied: msal>=1.24.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-identity->-r ../../../requirements.txt (line 35)) (1.28.0)\n", + "Requirement already satisfied: msal-extensions>=0.3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from azure-identity->-r ../../../requirements.txt (line 35)) (1.1.0)\n", + "Requirement already satisfied: iso8601<3.0.0,>=2.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from turbopuffer[fast]->-r ../../../requirements.txt (line 36)) (2.1.0)\n", + "Requirement already satisfied: exceptiongroup in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai>=1.10.0->-r ../../../requirements.txt (line 10)) (1.2.0)\n", + "Requirement already satisfied: pyproject_hooks in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from build>=1.0.3->chromadb->-r ../../../requirements.txt (line 24)) (1.0.0)\n", + "Requirement already satisfied: tomli>=1.1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from build>=1.0.3->chromadb->-r ../../../requirements.txt (line 24)) (2.0.1)\n", + "Requirement already satisfied: cffi>=1.12 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from cryptography->pyvespa->-r ../../../requirements.txt (line 22)) (1.16.0)\n", + "Requirement already satisfied: marshmallow>=3.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from environs<=9.5.0->pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (3.20.2)\n", + "Requirement already satisfied: starlette<0.28.0,>=0.27.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from fastapi>=0.95.2->chromadb->-r ../../../requirements.txt (line 24)) (0.27.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (6.0.4)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (1.9.4)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (1.4.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (1.3.1)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from aiohttp->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (4.0.3)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.63.0)\n", + "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.60.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-auth<3.0.0.dev0,>=1.19.0->google-api-python-client->-r ../../../requirements.txt (line 13)) (0.3.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-auth<3.0.0.dev0,>=1.19.0->google-api-python-client->-r ../../../requirements.txt (line 13)) (4.9)\n", + "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.4.1)\n", + "Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (2.7.0)\n", + "Requirement already satisfied: grpc-google-iam-v1<1.0.0dev,>=0.12.4 in /Users/dhruvanand/.local/lib/python3.10/site-packages (from google-cloud-resource-manager<3.0.0dev,>=1.3.3->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (0.13.0)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from google-cloud-storage<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r ../../../requirements.txt (line 12)) (1.5.0)\n", + "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httplib2<1.dev0,>=0.15.0->google-api-python-client->-r ../../../requirements.txt (line 13)) (3.1.2)\n", + "Requirement already satisfied: h2<5,>=3 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (4.1.0)\n", + "Requirement already satisfied: zipp>=0.5 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from importlib-metadata>=6.8.0->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (3.17.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from jinja2<4.0.0,>=3.1.2->litellm>=1.20.0->-r ../../../requirements.txt (line 11)) (2.1.5)\n", + "Requirement already satisfied: jaraco.classes in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (3.4.0)\n", + "Requirement already satisfied: jaraco.functools in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (4.0.1)\n", + "Requirement already satisfied: jaraco.context in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (5.3.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r ../../../requirements.txt (line 24)) (1.7.0)\n", + "Requirement already satisfied: oauthlib>=3.2.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r ../../../requirements.txt (line 24)) (3.2.2)\n", + "Requirement already satisfied: argon2-cffi in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from minio>=7.0.0->pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (23.1.0)\n", + "Requirement already satisfied: pycryptodome in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from minio>=7.0.0->pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (3.20.0)\n", + "Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from msal>=1.24.0->azure-identity->-r ../../../requirements.txt (line 35)) (2.8.0)\n", + "Requirement already satisfied: coloredlogs in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from onnxruntime>=1.14.1->chromadb->-r ../../../requirements.txt (line 24)) (15.0.1)\n", + "Requirement already satisfied: flatbuffers in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from onnxruntime>=1.14.1->chromadb->-r ../../../requirements.txt (line 24)) (23.5.26)\n", + "Requirement already satisfied: deprecated>=1.2.6 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-api>=1.2.0->chromadb->-r ../../../requirements.txt (line 24)) (1.2.14)\n", + "Requirement already satisfied: opentelemetry-instrumentation==0.45b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-distro>=0.35b0->sentry-sdk[opentelemetry]->-r ../../../requirements.txt (line 19)) (0.45b0)\n", + "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-distro>=0.35b0->sentry-sdk[opentelemetry]->-r ../../../requirements.txt (line 19)) (1.16.0)\n", + "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.24.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: opentelemetry-proto==1.24.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb->-r ../../../requirements.txt (line 24)) (1.24.0)\n", + "Requirement already satisfied: opentelemetry-instrumentation-asgi==0.45b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r ../../../requirements.txt (line 24)) (0.45b0)\n", + "Requirement already satisfied: opentelemetry-semantic-conventions==0.45b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r ../../../requirements.txt (line 24)) (0.45b0)\n", + "Requirement already satisfied: opentelemetry-util-http==0.45b0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r ../../../requirements.txt (line 24)) (0.45b0)\n", + "Requirement already satisfied: asgiref~=3.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from opentelemetry-instrumentation-asgi==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r ../../../requirements.txt (line 24)) (3.7.2)\n", + "Requirement already satisfied: monotonic>=1.5 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from posthog>=2.4.0->chromadb->-r ../../../requirements.txt (line 24)) (1.6)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pydantic>=1.10.8->qdrant_client->-r ../../../requirements.txt (line 5)) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.18.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pydantic>=1.10.8->qdrant_client->-r ../../../requirements.txt (line 5)) (2.18.2)\n", + "Requirement already satisfied: nh3>=0.2.14 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from readme-renderer>=35.0->twine->-r ../../../requirements.txt (line 31)) (0.2.17)\n", + "Requirement already satisfied: docutils>=0.13.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from readme-renderer>=35.0->twine->-r ../../../requirements.txt (line 31)) (0.20.1)\n", + "Requirement already satisfied: Pygments>=2.5.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from readme-renderer>=35.0->twine->-r ../../../requirements.txt (line 31)) (2.17.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from requests->huggingface_hub->-r ../../../requirements.txt (line 8)) (3.3.2)\n", + "Requirement already satisfied: decorator>=3.4.2 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from retry>=0.9.2->lancedb->-r ../../../requirements.txt (line 23)) (5.1.1)\n", + "Requirement already satisfied: py<2.0.0,>=1.4.26 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from retry>=0.9.2->lancedb->-r ../../../requirements.txt (line 23)) (1.11.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from rich>=12.0.0->twine->-r ../../../requirements.txt (line 31)) (3.0.0)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (0.4.2)\n", + "Requirement already satisfied: httptools>=0.5.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r ../../../requirements.txt (line 24)) (0.6.1)\n", + "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r ../../../requirements.txt (line 24)) (0.19.0)\n", + "Requirement already satisfied: watchfiles>=0.13 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r ../../../requirements.txt (line 24)) (0.21.0)\n", + "Requirement already satisfied: websockets>=10.4 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r ../../../requirements.txt (line 24)) (11.0.3)\n", + "Requirement already satisfied: awkward-cpp==32 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from awkward->mlx_embedding_models->-r ../../../requirements.txt (line 33)) (32)\n", + "Requirement already satisfied: joblib>=1.1.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from scikit-learn->sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from scikit-learn->sentence-transformers>=2.6.1->-r ../../../requirements.txt (line 21)) (3.2.0)\n", + "Requirement already satisfied: mpmath>=0.19 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from sympy->torch->-r ../../../requirements.txt (line 30)) (1.3.0)\n", + "Requirement already satisfied: pycparser in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from cffi>=1.12->cryptography->pyvespa->-r ../../../requirements.txt (line 22)) (2.21)\n", + "Requirement already satisfied: hyperframe<7,>=6.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from h2<5,>=3->httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (6.0.1)\n", + "Requirement already satisfied: hpack<5,>=4.0 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from h2<5,>=3->httpx[http2]>=0.14.0->qdrant_client->-r ../../../requirements.txt (line 5)) (4.0.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=12.0.0->twine->-r ../../../requirements.txt (line 31)) (0.1.2)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.0.dev0,>=1.19.0->google-api-python-client->-r ../../../requirements.txt (line 13)) (0.5.1)\n", + "Requirement already satisfied: argon2-cffi-bindings in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from argon2-cffi->minio>=7.0.0->pymilvus>=2.3.6->-r ../../../requirements.txt (line 9)) (21.2.0)\n", + "Requirement already satisfied: humanfriendly>=9.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from coloredlogs->onnxruntime>=1.14.1->chromadb->-r ../../../requirements.txt (line 24)) (10.0)\n", + "Requirement already satisfied: more-itertools in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from jaraco.classes->keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (10.2.0)\n", + "Requirement already satisfied: backports.tarfile in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from jaraco.context->keyring>=15.1->twine->-r ../../../requirements.txt (line 31)) (1.1.1)\n", + "Downloading duckdb-0.10.2-cp310-cp310-macosx_11_0_arm64.whl (14.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.4/14.4 MB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hInstalling collected packages: duckdb\n", + "Successfully installed duckdb-0.10.2\n" + ] + } + ], + "source": [ + "!pip install -r ../../../requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(42,)]\n" + ] + } + ], + "source": [ + "import duckdb\n", + "cursor = duckdb.connect()\n", + "print(cursor.execute('SELECT 42').fetchall())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "duckdb.connect" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/vdf_io/notebooks/jsonl_to_parquet.ipynb b/src/vdf_io/notebooks/jsonl_to_parquet.ipynb index e9b952f..b4a3e9e 100644 --- a/src/vdf_io/notebooks/jsonl_to_parquet.ipynb +++ b/src/vdf_io/notebooks/jsonl_to_parquet.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -30,12 +30,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load JSONL File\n", - "jsonl_file = '/Users/dhruvanand/Code/datasets-dumps/shard-00000.jsonl 2'\n" + "jsonl_file = '/Users/dhruvanand/Downloads/579aac087579b5acbc881164eb7af8b8-662d020154513ab1ef571e325a8fc649ebf64561/o200k_tokens.jsonl'\n" ] }, { @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -58,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -82,70 +82,75 @@ " \n", " \n", " \n", - " query\n", - " document\n", - " metadata\n", + " token\n", + " token_id\n", + " type\n", + " language\n", + " definition\n", " \n", " \n", " \n", " \n", " 0\n", - " What animals prey on meerkats?\n", - " What animals prey of the meerkats?\n", - " {'topic': 'Animals', 'objective': {'self': [],...\n", + " ا\n", + " 496\n", + " subword\n", + " Arabic\n", + " This is a letter in the Arabic alphabet, repre...\n", " \n", " \n", " 1\n", - " What is the most pointless question you have s...\n", - " What is the most idiotic, or pointless, questi...\n", - " {'topic': 'Quora (2)', 'objective': {'self': [...\n", + " ال\n", + " 589\n", + " subword\n", + " Arabic\n", + " The (definite article)\n", " \n", " \n", " 2\n", - " Can we apply online for net banking in sbi?\n", - " Can I apply for internet banking in SBI withou...\n", - " {'topic': 'Personal finance', 'objective': {'s...\n", + " п\n", + " 648\n", + " subword\n", + " Russian\n", + " This is the Cyrillic letter 'п', which is the ...\n", " \n", " \n", " 3\n", - " What are the best SEO strategies for 2017?\n", - " What is the best SEO strategies in 2017?\n", - " {'topic': 'SEO', 'objective': {'self': [], 'pa...\n", + " с\n", + " 669\n", + " subword\n", + " Russian\n", + " The Cyrillic letter 'с', which represents the ...\n", " \n", " \n", " 4\n", - " What is particle spin?\n", - " What is spin? What are the causes for a partic...\n", - " {'topic': 'Chromatography', 'objective': {'sel...\n", + " ра\n", + " 714\n", + " Subword\n", + " Russian\n", + " This is a Cyrillic character that is part of a...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " query \\\n", - "0 What animals prey on meerkats? \n", - "1 What is the most pointless question you have s... \n", - "2 Can we apply online for net banking in sbi? \n", - "3 What are the best SEO strategies for 2017? \n", - "4 What is particle spin? \n", - "\n", - " document \\\n", - "0 What animals prey of the meerkats? \n", - "1 What is the most idiotic, or pointless, questi... \n", - "2 Can I apply for internet banking in SBI withou... \n", - "3 What is the best SEO strategies in 2017? \n", - "4 What is spin? What are the causes for a partic... \n", + " token token_id type language \\\n", + "0 ا 496 subword Arabic \n", + "1 ال 589 subword Arabic \n", + "2 п 648 subword Russian \n", + "3 с 669 subword Russian \n", + "4 ра 714 Subword Russian \n", "\n", - " metadata \n", - "0 {'topic': 'Animals', 'objective': {'self': [],... \n", - "1 {'topic': 'Quora (2)', 'objective': {'self': [... \n", - "2 {'topic': 'Personal finance', 'objective': {'s... \n", - "3 {'topic': 'SEO', 'objective': {'self': [], 'pa... \n", - "4 {'topic': 'Chromatography', 'objective': {'sel... " + " definition \n", + "0 This is a letter in the Arabic alphabet, repre... \n", + "1 The (definite article) \n", + "2 This is the Cyrillic letter 'п', which is the ... \n", + "3 The Cyrillic letter 'с', which represents the ... \n", + "4 This is a Cyrillic character that is part of a... " ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -154,6 +159,73 @@ "df.head()" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "64465" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 64465 entries, 0 to 64464\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 token 64465 non-null object\n", + " 1 token_id 64465 non-null int64 \n", + " 2 type 64420 non-null object\n", + " 3 language 64420 non-null object\n", + " 4 definition 64416 non-null object\n", + "dtypes: int64(1), object(4)\n", + "memory usage: 3.0+ MB\n" + ] + } + ], + "source": [ + "# summary of df\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df['token_length'] = df['token'].apply(lambda x: len(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_parquet('/Users/dhruvanand/Downloads/579aac087579b5acbc881164eb7af8b8-662d020154513ab1ef571e325a8fc649ebf64561/'+'o200k_tokens.parquet', engine='pyarrow')" + ] + }, { "cell_type": "code", "execution_count": 12, diff --git a/src/vdf_io/notebooks/neon.ipynb b/src/vdf_io/notebooks/neon.ipynb new file mode 100644 index 0000000..3bb8221 --- /dev/null +++ b/src/vdf_io/notebooks/neon.ipynb @@ -0,0 +1,891 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pgvector in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (0.2.5)\n", + "Collecting psycopg\n", + " Obtaining dependency information for psycopg from https://files.pythonhosted.org/packages/f2/cf/172701ea48987548c681e011681dda1d2605131f723e89225477fe7802e9/psycopg-3.1.19-py3-none-any.whl.metadata\n", + " Downloading psycopg-3.1.19-py3-none-any.whl.metadata (4.2 kB)\n", + "Requirement already satisfied: numpy in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from pgvector) (1.26.4)\n", + "Requirement already satisfied: typing-extensions>=4.1 in /Users/dhruvanand/miniforge3/lib/python3.10/site-packages (from psycopg) (4.9.0)\n", + "Downloading psycopg-3.1.19-py3-none-any.whl (179 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.4/179.4 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: psycopg\n", + "Successfully installed psycopg-3.1.19\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install pgvector psycopg" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import json\n", + "import os\n", + "from dotenv import load_dotenv, find_dotenv\n", + "from typing import List, Dict, Any\n", + "from rich import print as rprint\n", + "\n", + "load_dotenv(find_dotenv(), override=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "NEON_CONNECTION_STRING = os.environ.get(\"NEON_CONNECTION_STRING\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import psycopg\n", + "import pgvector\n", + "from pgvector.psycopg import register_vector\n", + "\n", + "conn = psycopg.connect(NEON_CONNECTION_STRING)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "psycopg.Connection" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
['public', 'information_schema', 'pg_catalog', 'pg_toast']\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\u001b[32m'public'\u001b[0m, \u001b[32m'information_schema'\u001b[0m, \u001b[32m'pg_catalog'\u001b[0m, \u001b[32m'pg_toast'\u001b[0m\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# list schemas\n", + "with conn.cursor() as cur:\n", + " cur.execute(\"SELECT schema_name FROM information_schema.schemata;\")\n", + " schemas = cur.fetchall()\n", + " schemas = [schema[0] for schema in schemas]\n", + " rprint(schemas)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
['test_table']\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\u001b[32m'test_table'\u001b[0m\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# list tables in public\n", + "with conn.cursor() as cur:\n", + " cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';\")\n", + " tables = cur.fetchall()\n", + " tables = [table[0] for table in tables]\n", + " rprint(tables)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
public\n",
+       "[]\n",
+       "
\n" + ], + "text/plain": [ + "public\n", + "\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
information_schema\n",
+       "[\n",
+       "    'role_column_grants',\n",
+       "    'information_schema_catalog_name',\n",
+       "    'column_domain_usage',\n",
+       "    'applicable_roles',\n",
+       "    'administrable_role_authorizations',\n",
+       "    'domain_constraints',\n",
+       "    'attributes',\n",
+       "    'column_privileges',\n",
+       "    'character_sets',\n",
+       "    'check_constraint_routine_usage',\n",
+       "    'check_constraints',\n",
+       "    'column_udt_usage',\n",
+       "    'collations',\n",
+       "    'collation_character_set_applicability',\n",
+       "    'key_column_usage',\n",
+       "    'column_column_usage',\n",
+       "    'columns',\n",
+       "    'domain_udt_usage',\n",
+       "    'constraint_column_usage',\n",
+       "    'constraint_table_usage',\n",
+       "    'domains',\n",
+       "    'referential_constraints',\n",
+       "    'enabled_roles',\n",
+       "    'parameters',\n",
+       "    'routine_column_usage',\n",
+       "    'routine_privileges',\n",
+       "    'role_routine_grants',\n",
+       "    'routine_routine_usage',\n",
+       "    'table_privileges',\n",
+       "    'routine_sequence_usage',\n",
+       "    'routine_table_usage',\n",
+       "    'udt_privileges',\n",
+       "    'routines',\n",
+       "    'role_table_grants',\n",
+       "    'schemata',\n",
+       "    'sequences',\n",
+       "    'sql_features',\n",
+       "    'tables',\n",
+       "    'sql_implementation_info',\n",
+       "    'sql_parts',\n",
+       "    'sql_sizing',\n",
+       "    'transforms',\n",
+       "    'table_constraints',\n",
+       "    'view_routine_usage',\n",
+       "    'role_udt_grants',\n",
+       "    'triggered_update_columns',\n",
+       "    'triggers',\n",
+       "    'user_defined_types',\n",
+       "    'usage_privileges',\n",
+       "    'role_usage_grants',\n",
+       "    'view_column_usage',\n",
+       "    'view_table_usage',\n",
+       "    'views',\n",
+       "    'data_type_privileges',\n",
+       "    '_pg_foreign_table_columns',\n",
+       "    'element_types',\n",
+       "    'column_options',\n",
+       "    '_pg_foreign_data_wrappers',\n",
+       "    'foreign_data_wrapper_options',\n",
+       "    'foreign_data_wrappers',\n",
+       "    '_pg_foreign_servers',\n",
+       "    'foreign_server_options',\n",
+       "    'foreign_servers',\n",
+       "    '_pg_foreign_tables',\n",
+       "    'foreign_table_options',\n",
+       "    'foreign_tables',\n",
+       "    '_pg_user_mappings',\n",
+       "    'user_mapping_options',\n",
+       "    'user_mappings'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "information_schema\n", + "\u001b[1m[\u001b[0m\n", + " \u001b[32m'role_column_grants'\u001b[0m,\n", + " \u001b[32m'information_schema_catalog_name'\u001b[0m,\n", + " \u001b[32m'column_domain_usage'\u001b[0m,\n", + " \u001b[32m'applicable_roles'\u001b[0m,\n", + " \u001b[32m'administrable_role_authorizations'\u001b[0m,\n", + " \u001b[32m'domain_constraints'\u001b[0m,\n", + " \u001b[32m'attributes'\u001b[0m,\n", + " \u001b[32m'column_privileges'\u001b[0m,\n", + " \u001b[32m'character_sets'\u001b[0m,\n", + " \u001b[32m'check_constraint_routine_usage'\u001b[0m,\n", + " \u001b[32m'check_constraints'\u001b[0m,\n", + " \u001b[32m'column_udt_usage'\u001b[0m,\n", + " \u001b[32m'collations'\u001b[0m,\n", + " \u001b[32m'collation_character_set_applicability'\u001b[0m,\n", + " \u001b[32m'key_column_usage'\u001b[0m,\n", + " \u001b[32m'column_column_usage'\u001b[0m,\n", + " \u001b[32m'columns'\u001b[0m,\n", + " \u001b[32m'domain_udt_usage'\u001b[0m,\n", + " \u001b[32m'constraint_column_usage'\u001b[0m,\n", + " \u001b[32m'constraint_table_usage'\u001b[0m,\n", + " \u001b[32m'domains'\u001b[0m,\n", + " \u001b[32m'referential_constraints'\u001b[0m,\n", + " \u001b[32m'enabled_roles'\u001b[0m,\n", + " \u001b[32m'parameters'\u001b[0m,\n", + " \u001b[32m'routine_column_usage'\u001b[0m,\n", + " \u001b[32m'routine_privileges'\u001b[0m,\n", + " \u001b[32m'role_routine_grants'\u001b[0m,\n", + " \u001b[32m'routine_routine_usage'\u001b[0m,\n", + " \u001b[32m'table_privileges'\u001b[0m,\n", + " \u001b[32m'routine_sequence_usage'\u001b[0m,\n", + " \u001b[32m'routine_table_usage'\u001b[0m,\n", + " \u001b[32m'udt_privileges'\u001b[0m,\n", + " \u001b[32m'routines'\u001b[0m,\n", + " \u001b[32m'role_table_grants'\u001b[0m,\n", + " \u001b[32m'schemata'\u001b[0m,\n", + " \u001b[32m'sequences'\u001b[0m,\n", + " \u001b[32m'sql_features'\u001b[0m,\n", + " \u001b[32m'tables'\u001b[0m,\n", + " \u001b[32m'sql_implementation_info'\u001b[0m,\n", + " \u001b[32m'sql_parts'\u001b[0m,\n", + " \u001b[32m'sql_sizing'\u001b[0m,\n", + " \u001b[32m'transforms'\u001b[0m,\n", + " \u001b[32m'table_constraints'\u001b[0m,\n", + " \u001b[32m'view_routine_usage'\u001b[0m,\n", + " \u001b[32m'role_udt_grants'\u001b[0m,\n", + " \u001b[32m'triggered_update_columns'\u001b[0m,\n", + " \u001b[32m'triggers'\u001b[0m,\n", + " \u001b[32m'user_defined_types'\u001b[0m,\n", + " \u001b[32m'usage_privileges'\u001b[0m,\n", + " \u001b[32m'role_usage_grants'\u001b[0m,\n", + " \u001b[32m'view_column_usage'\u001b[0m,\n", + " \u001b[32m'view_table_usage'\u001b[0m,\n", + " \u001b[32m'views'\u001b[0m,\n", + " \u001b[32m'data_type_privileges'\u001b[0m,\n", + " \u001b[32m'_pg_foreign_table_columns'\u001b[0m,\n", + " \u001b[32m'element_types'\u001b[0m,\n", + " \u001b[32m'column_options'\u001b[0m,\n", + " \u001b[32m'_pg_foreign_data_wrappers'\u001b[0m,\n", + " \u001b[32m'foreign_data_wrapper_options'\u001b[0m,\n", + " \u001b[32m'foreign_data_wrappers'\u001b[0m,\n", + " \u001b[32m'_pg_foreign_servers'\u001b[0m,\n", + " \u001b[32m'foreign_server_options'\u001b[0m,\n", + " \u001b[32m'foreign_servers'\u001b[0m,\n", + " \u001b[32m'_pg_foreign_tables'\u001b[0m,\n", + " \u001b[32m'foreign_table_options'\u001b[0m,\n", + " \u001b[32m'foreign_tables'\u001b[0m,\n", + " \u001b[32m'_pg_user_mappings'\u001b[0m,\n", + " \u001b[32m'user_mapping_options'\u001b[0m,\n", + " \u001b[32m'user_mappings'\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
pg_catalog\n",
+       "[\n",
+       "    'pg_statistic',\n",
+       "    'pg_type',\n",
+       "    'pg_foreign_table',\n",
+       "    'pg_authid',\n",
+       "    'pg_shadow',\n",
+       "    'pg_roles',\n",
+       "    'pg_statistic_ext_data',\n",
+       "    'pg_settings',\n",
+       "    'pg_file_settings',\n",
+       "    'pg_hba_file_rules',\n",
+       "    'pg_ident_file_mappings',\n",
+       "    'pg_config',\n",
+       "    'pg_shmem_allocations',\n",
+       "    'pg_backend_memory_contexts',\n",
+       "    'pg_user_mapping',\n",
+       "    'pg_stat_activity',\n",
+       "    'pg_replication_origin_status',\n",
+       "    'pg_subscription',\n",
+       "    'pg_attribute',\n",
+       "    'pg_proc',\n",
+       "    'pg_class',\n",
+       "    'pg_attrdef',\n",
+       "    'pg_constraint',\n",
+       "    'pg_inherits',\n",
+       "    'pg_index',\n",
+       "    'pg_stat_replication',\n",
+       "    'pg_stat_slru',\n",
+       "    'pg_stat_wal_receiver',\n",
+       "    'pg_stat_recovery_prefetch',\n",
+       "    'pg_operator',\n",
+       "    'pg_opfamily',\n",
+       "    'pg_opclass',\n",
+       "    'pg_am',\n",
+       "    'pg_amop',\n",
+       "    'pg_amproc',\n",
+       "    'pg_language',\n",
+       "    'pg_largeobject_metadata',\n",
+       "    'pg_aggregate',\n",
+       "    'pg_statistic_ext',\n",
+       "    'pg_rewrite',\n",
+       "    'pg_trigger',\n",
+       "    'pg_event_trigger',\n",
+       "    'pg_description',\n",
+       "    'pg_cast',\n",
+       "    'pg_enum',\n",
+       "    'pg_namespace',\n",
+       "    'pg_conversion',\n",
+       "    'pg_depend',\n",
+       "    'pg_database',\n",
+       "    'pg_db_role_setting',\n",
+       "    'pg_tablespace',\n",
+       "    'pg_auth_members',\n",
+       "    'pg_shdepend',\n",
+       "    'pg_shdescription',\n",
+       "    'pg_ts_config',\n",
+       "    'pg_ts_config_map',\n",
+       "    'pg_ts_dict',\n",
+       "    'pg_ts_parser',\n",
+       "    'pg_ts_template',\n",
+       "    'pg_extension',\n",
+       "    'pg_foreign_data_wrapper',\n",
+       "    'pg_foreign_server',\n",
+       "    'pg_policy',\n",
+       "    'pg_replication_origin',\n",
+       "    'pg_default_acl',\n",
+       "    'pg_init_privs',\n",
+       "    'pg_seclabel',\n",
+       "    'pg_shseclabel',\n",
+       "    'pg_collation',\n",
+       "    'pg_parameter_acl',\n",
+       "    'pg_partitioned_table',\n",
+       "    'pg_range',\n",
+       "    'pg_transform',\n",
+       "    'pg_sequence',\n",
+       "    'pg_publication',\n",
+       "    'pg_publication_namespace',\n",
+       "    'pg_publication_rel',\n",
+       "    'pg_subscription_rel',\n",
+       "    'pg_group',\n",
+       "    'pg_user',\n",
+       "    'pg_policies',\n",
+       "    'pg_rules',\n",
+       "    'pg_views',\n",
+       "    'pg_tables',\n",
+       "    'pg_matviews',\n",
+       "    'pg_indexes',\n",
+       "    'pg_sequences',\n",
+       "    'pg_stats',\n",
+       "    'pg_stats_ext',\n",
+       "    'pg_stats_ext_exprs',\n",
+       "    'pg_publication_tables',\n",
+       "    'pg_locks',\n",
+       "    'pg_cursors',\n",
+       "    'pg_available_extensions',\n",
+       "    'pg_available_extension_versions',\n",
+       "    'pg_prepared_xacts',\n",
+       "    'pg_prepared_statements',\n",
+       "    'pg_seclabels',\n",
+       "    'pg_timezone_abbrevs',\n",
+       "    'pg_timezone_names',\n",
+       "    'pg_stat_sys_tables',\n",
+       "    'pg_stat_xact_sys_tables',\n",
+       "    'pg_stat_user_tables',\n",
+       "    'pg_stat_all_tables',\n",
+       "    'pg_stat_xact_all_tables',\n",
+       "    'pg_stat_xact_user_tables',\n",
+       "    'pg_statio_all_tables',\n",
+       "    'pg_statio_sys_tables',\n",
+       "    'pg_statio_user_tables',\n",
+       "    'pg_stat_all_indexes',\n",
+       "    'pg_stat_sys_indexes',\n",
+       "    'pg_stat_user_indexes',\n",
+       "    'pg_statio_all_indexes',\n",
+       "    'pg_statio_sys_indexes',\n",
+       "    'pg_statio_user_indexes',\n",
+       "    'pg_statio_all_sequences',\n",
+       "    'pg_statio_sys_sequences',\n",
+       "    'pg_statio_user_sequences',\n",
+       "    'pg_stat_subscription',\n",
+       "    'pg_stat_ssl',\n",
+       "    'pg_stat_gssapi',\n",
+       "    'pg_replication_slots',\n",
+       "    'pg_stat_replication_slots',\n",
+       "    'pg_stat_database',\n",
+       "    'pg_stat_database_conflicts',\n",
+       "    'pg_stat_user_functions',\n",
+       "    'pg_stat_xact_user_functions',\n",
+       "    'pg_stat_archiver',\n",
+       "    'pg_stat_bgwriter',\n",
+       "    'pg_stat_wal',\n",
+       "    'pg_stat_progress_analyze',\n",
+       "    'pg_stat_progress_vacuum',\n",
+       "    'pg_stat_progress_cluster',\n",
+       "    'pg_stat_progress_create_index',\n",
+       "    'pg_stat_progress_basebackup',\n",
+       "    'pg_stat_progress_copy',\n",
+       "    'pg_user_mappings',\n",
+       "    'pg_stat_subscription_stats',\n",
+       "    'pg_largeobject'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "pg_catalog\n", + "\u001b[1m[\u001b[0m\n", + " \u001b[32m'pg_statistic'\u001b[0m,\n", + " \u001b[32m'pg_type'\u001b[0m,\n", + " \u001b[32m'pg_foreign_table'\u001b[0m,\n", + " \u001b[32m'pg_authid'\u001b[0m,\n", + " \u001b[32m'pg_shadow'\u001b[0m,\n", + " \u001b[32m'pg_roles'\u001b[0m,\n", + " \u001b[32m'pg_statistic_ext_data'\u001b[0m,\n", + " \u001b[32m'pg_settings'\u001b[0m,\n", + " \u001b[32m'pg_file_settings'\u001b[0m,\n", + " \u001b[32m'pg_hba_file_rules'\u001b[0m,\n", + " \u001b[32m'pg_ident_file_mappings'\u001b[0m,\n", + " \u001b[32m'pg_config'\u001b[0m,\n", + " \u001b[32m'pg_shmem_allocations'\u001b[0m,\n", + " \u001b[32m'pg_backend_memory_contexts'\u001b[0m,\n", + " \u001b[32m'pg_user_mapping'\u001b[0m,\n", + " \u001b[32m'pg_stat_activity'\u001b[0m,\n", + " \u001b[32m'pg_replication_origin_status'\u001b[0m,\n", + " \u001b[32m'pg_subscription'\u001b[0m,\n", + " \u001b[32m'pg_attribute'\u001b[0m,\n", + " \u001b[32m'pg_proc'\u001b[0m,\n", + " \u001b[32m'pg_class'\u001b[0m,\n", + " \u001b[32m'pg_attrdef'\u001b[0m,\n", + " \u001b[32m'pg_constraint'\u001b[0m,\n", + " \u001b[32m'pg_inherits'\u001b[0m,\n", + " \u001b[32m'pg_index'\u001b[0m,\n", + " \u001b[32m'pg_stat_replication'\u001b[0m,\n", + " \u001b[32m'pg_stat_slru'\u001b[0m,\n", + " \u001b[32m'pg_stat_wal_receiver'\u001b[0m,\n", + " \u001b[32m'pg_stat_recovery_prefetch'\u001b[0m,\n", + " \u001b[32m'pg_operator'\u001b[0m,\n", + " \u001b[32m'pg_opfamily'\u001b[0m,\n", + " \u001b[32m'pg_opclass'\u001b[0m,\n", + " \u001b[32m'pg_am'\u001b[0m,\n", + " \u001b[32m'pg_amop'\u001b[0m,\n", + " \u001b[32m'pg_amproc'\u001b[0m,\n", + " \u001b[32m'pg_language'\u001b[0m,\n", + " \u001b[32m'pg_largeobject_metadata'\u001b[0m,\n", + " \u001b[32m'pg_aggregate'\u001b[0m,\n", + " \u001b[32m'pg_statistic_ext'\u001b[0m,\n", + " \u001b[32m'pg_rewrite'\u001b[0m,\n", + " \u001b[32m'pg_trigger'\u001b[0m,\n", + " \u001b[32m'pg_event_trigger'\u001b[0m,\n", + " \u001b[32m'pg_description'\u001b[0m,\n", + " \u001b[32m'pg_cast'\u001b[0m,\n", + " \u001b[32m'pg_enum'\u001b[0m,\n", + " \u001b[32m'pg_namespace'\u001b[0m,\n", + " \u001b[32m'pg_conversion'\u001b[0m,\n", + " \u001b[32m'pg_depend'\u001b[0m,\n", + " \u001b[32m'pg_database'\u001b[0m,\n", + " \u001b[32m'pg_db_role_setting'\u001b[0m,\n", + " \u001b[32m'pg_tablespace'\u001b[0m,\n", + " \u001b[32m'pg_auth_members'\u001b[0m,\n", + " \u001b[32m'pg_shdepend'\u001b[0m,\n", + " \u001b[32m'pg_shdescription'\u001b[0m,\n", + " \u001b[32m'pg_ts_config'\u001b[0m,\n", + " \u001b[32m'pg_ts_config_map'\u001b[0m,\n", + " \u001b[32m'pg_ts_dict'\u001b[0m,\n", + " \u001b[32m'pg_ts_parser'\u001b[0m,\n", + " \u001b[32m'pg_ts_template'\u001b[0m,\n", + " \u001b[32m'pg_extension'\u001b[0m,\n", + " \u001b[32m'pg_foreign_data_wrapper'\u001b[0m,\n", + " \u001b[32m'pg_foreign_server'\u001b[0m,\n", + " \u001b[32m'pg_policy'\u001b[0m,\n", + " \u001b[32m'pg_replication_origin'\u001b[0m,\n", + " \u001b[32m'pg_default_acl'\u001b[0m,\n", + " \u001b[32m'pg_init_privs'\u001b[0m,\n", + " \u001b[32m'pg_seclabel'\u001b[0m,\n", + " \u001b[32m'pg_shseclabel'\u001b[0m,\n", + " \u001b[32m'pg_collation'\u001b[0m,\n", + " \u001b[32m'pg_parameter_acl'\u001b[0m,\n", + " \u001b[32m'pg_partitioned_table'\u001b[0m,\n", + " \u001b[32m'pg_range'\u001b[0m,\n", + " \u001b[32m'pg_transform'\u001b[0m,\n", + " \u001b[32m'pg_sequence'\u001b[0m,\n", + " \u001b[32m'pg_publication'\u001b[0m,\n", + " \u001b[32m'pg_publication_namespace'\u001b[0m,\n", + " \u001b[32m'pg_publication_rel'\u001b[0m,\n", + " \u001b[32m'pg_subscription_rel'\u001b[0m,\n", + " \u001b[32m'pg_group'\u001b[0m,\n", + " \u001b[32m'pg_user'\u001b[0m,\n", + " \u001b[32m'pg_policies'\u001b[0m,\n", + " \u001b[32m'pg_rules'\u001b[0m,\n", + " \u001b[32m'pg_views'\u001b[0m,\n", + " \u001b[32m'pg_tables'\u001b[0m,\n", + " \u001b[32m'pg_matviews'\u001b[0m,\n", + " \u001b[32m'pg_indexes'\u001b[0m,\n", + " \u001b[32m'pg_sequences'\u001b[0m,\n", + " \u001b[32m'pg_stats'\u001b[0m,\n", + " \u001b[32m'pg_stats_ext'\u001b[0m,\n", + " \u001b[32m'pg_stats_ext_exprs'\u001b[0m,\n", + " \u001b[32m'pg_publication_tables'\u001b[0m,\n", + " \u001b[32m'pg_locks'\u001b[0m,\n", + " \u001b[32m'pg_cursors'\u001b[0m,\n", + " \u001b[32m'pg_available_extensions'\u001b[0m,\n", + " \u001b[32m'pg_available_extension_versions'\u001b[0m,\n", + " \u001b[32m'pg_prepared_xacts'\u001b[0m,\n", + " \u001b[32m'pg_prepared_statements'\u001b[0m,\n", + " \u001b[32m'pg_seclabels'\u001b[0m,\n", + " \u001b[32m'pg_timezone_abbrevs'\u001b[0m,\n", + " \u001b[32m'pg_timezone_names'\u001b[0m,\n", + " \u001b[32m'pg_stat_sys_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_xact_sys_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_user_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_all_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_xact_all_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_xact_user_tables'\u001b[0m,\n", + " \u001b[32m'pg_statio_all_tables'\u001b[0m,\n", + " \u001b[32m'pg_statio_sys_tables'\u001b[0m,\n", + " \u001b[32m'pg_statio_user_tables'\u001b[0m,\n", + " \u001b[32m'pg_stat_all_indexes'\u001b[0m,\n", + " \u001b[32m'pg_stat_sys_indexes'\u001b[0m,\n", + " \u001b[32m'pg_stat_user_indexes'\u001b[0m,\n", + " \u001b[32m'pg_statio_all_indexes'\u001b[0m,\n", + " \u001b[32m'pg_statio_sys_indexes'\u001b[0m,\n", + " \u001b[32m'pg_statio_user_indexes'\u001b[0m,\n", + " \u001b[32m'pg_statio_all_sequences'\u001b[0m,\n", + " \u001b[32m'pg_statio_sys_sequences'\u001b[0m,\n", + " \u001b[32m'pg_statio_user_sequences'\u001b[0m,\n", + " \u001b[32m'pg_stat_subscription'\u001b[0m,\n", + " \u001b[32m'pg_stat_ssl'\u001b[0m,\n", + " \u001b[32m'pg_stat_gssapi'\u001b[0m,\n", + " \u001b[32m'pg_replication_slots'\u001b[0m,\n", + " \u001b[32m'pg_stat_replication_slots'\u001b[0m,\n", + " \u001b[32m'pg_stat_database'\u001b[0m,\n", + " \u001b[32m'pg_stat_database_conflicts'\u001b[0m,\n", + " \u001b[32m'pg_stat_user_functions'\u001b[0m,\n", + " \u001b[32m'pg_stat_xact_user_functions'\u001b[0m,\n", + " \u001b[32m'pg_stat_archiver'\u001b[0m,\n", + " \u001b[32m'pg_stat_bgwriter'\u001b[0m,\n", + " \u001b[32m'pg_stat_wal'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_analyze'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_vacuum'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_cluster'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_create_index'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_basebackup'\u001b[0m,\n", + " \u001b[32m'pg_stat_progress_copy'\u001b[0m,\n", + " \u001b[32m'pg_user_mappings'\u001b[0m,\n", + " \u001b[32m'pg_stat_subscription_stats'\u001b[0m,\n", + " \u001b[32m'pg_largeobject'\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
pg_toast\n",
+       "[]\n",
+       "
\n" + ], + "text/plain": [ + "pg_toast\n", + "\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# for each schema, list tables\n", + "for schema in schemas:\n", + " with conn.cursor() as cur:\n", + " cur.execute(\n", + " f\"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema}';\"\n", + " )\n", + " tables = cur.fetchall()\n", + " tables = [table[0] for table in tables]\n", + " rprint(schema, tables)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "with conn.cursor() as cur:\n", + " # Execute the CREATE EXTENSION command\n", + " try:\n", + " cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector;\")\n", + " conn.commit()\n", + " except Exception as e:\n", + " conn.rollback()\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "register_vector(conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# create a table in public\n", + "with conn.cursor() as cur:\n", + " cur.execute(\n", + " \"\"\"\n", + " CREATE TABLE public.test_table (\n", + " id serial PRIMARY KEY,\n", + " name VARCHAR ( 50 ) UNIQUE NOT NULL,\n", + " vector VECTOR(3)\n", + " );\n", + " \"\"\"\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "ename": "OperationalError", + "evalue": "connection failed: connection to server at \"3.209.98.240\", port 5432 failed: could not receive data from server: Connection refused\ncould not send SSL negotiation packet: Connection refused", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOperationalError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m conn \u001b[38;5;241m=\u001b[39m \u001b[43mpsycopg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menviron\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mAURORA_POSTGRES_CONNECTION_STRING\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/psycopg/connection.py:748\u001b[0m, in \u001b[0;36mConnection.connect\u001b[0;34m(cls, conninfo, autocommit, prepare_threshold, row_factory, cursor_factory, context, **kwargs)\u001b[0m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m rv:\n\u001b[1;32m 747\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m last_ex\n\u001b[0;32m--> 748\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m last_ex\u001b[38;5;241m.\u001b[39mwith_traceback(\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 750\u001b[0m rv\u001b[38;5;241m.\u001b[39m_autocommit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mbool\u001b[39m(autocommit)\n\u001b[1;32m 751\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m row_factory:\n", + "\u001b[0;31mOperationalError\u001b[0m: connection failed: connection to server at \"3.209.98.240\", port 5432 failed: could not receive data from server: Connection refused\ncould not send SSL negotiation packet: Connection refused" + ] + } + ], + "source": [ + "conn = psycopg.connect(os.environ['AURORA_POSTGRES_CONNECTION_STRING'])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table 'test_table' does not exist in schema 'public'. Creating the table...\n", + "Data inserted successfully.\n" + ] + } + ], + "source": [ + "# Ensure table exists and use the correct schema\n", + "with conn.cursor() as cur:\n", + " try:\n", + " # Set search path to public schema\n", + " cur.execute(\"SET search_path TO public;\")\n", + " conn.commit()\n", + " \n", + " # Verify table creation\n", + " cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'test_table';\")\n", + " table_exists = cur.fetchone()\n", + " if not table_exists:\n", + " print(\"Table 'test_table' does not exist in schema 'public'. Creating the table...\")\n", + " cur.execute(\n", + " \"\"\"\n", + " CREATE TABLE public.test_table (\n", + " id serial PRIMARY KEY,\n", + " name text NOT NULL,\n", + " vector vector(3) -- assuming the vector has 3 dimensions\n", + " );\n", + " \"\"\"\n", + " )\n", + " conn.commit()\n", + " else:\n", + " print(\"Table 'test_table' already exists.\")\n", + "\n", + " # Insert data into the table\n", + " cur.execute(\n", + " \"\"\"\n", + " INSERT INTO public.test_table (name, vector)\n", + " VALUES\n", + " ('A', ARRAY[1, 2, 3]),\n", + " ('B', ARRAY[4, 5, 6]),\n", + " ('C', ARRAY[7, 8, 9]);\n", + " \"\"\"\n", + " )\n", + " conn.commit()\n", + " print(\"Data inserted successfully.\")\n", + " except Exception as e:\n", + " conn.rollback()\n", + " print(f\"Error occurred: {e}\")\n", + "\n", + "# Close the connection\n", + "conn.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa\n", + "import pyarrow.parquet as pq\n", + "\n", + "schema = pq.read_schema('/Users/dhruvanand/Code/vector-io/vdf_20240515_224025_95696/medium_articles/1.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id: int64\n", + "vector: list\n", + " child 0, element: double\n", + "claps: int64\n", + "title: string\n", + "responses: int64\n", + "reading_time: int64\n", + "publication: string\n", + "link: string\n", + "-- schema metadata --\n", + "pandas: '{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"' + 1163" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schema" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "double\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "for field in schema:\n", + " # print(field,field.name,field.type, type(field.type))\n", + " if isinstance(field.type, pa.lib.ListType):\n", + " print(field.type.value_type)\n", + " print(field.type)\n", + " # switch case for field.type\n", + " if field.type == pa.lib.ListType:\n", + " print(field.type)\n", + " if field.type == pa.lib.StructType:\n", + " print(field.type)\n", + " if field.type == pa.lib.DictionaryType:\n", + " print(field.type)\n", + " if field.type == pa.lib.TimestampType:\n", + " print(field.type)\n", + " if field.type == pa.lib.TimeType:\n", + " print(field.type)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/vdf_io/pgvector_util.py b/src/vdf_io/pgvector_util.py new file mode 100644 index 0000000..adbe58b --- /dev/null +++ b/src/vdf_io/pgvector_util.py @@ -0,0 +1,97 @@ +from vdf_io.util import set_arg_from_input, set_arg_from_password + + +def make_pgv_parser(DB_NAME_SLUG, subparsers): + parser_pgvector = subparsers.add_parser( + DB_NAME_SLUG, help="Import data to pgvector" + ) + parser_pgvector.add_argument( + "--platform", + type=str, + choices=["supabase", "neon", "tembo", "aws", "custom"], + help="Platform to connect to", + ) + parser_pgvector.add_argument( + "--schema", + type=str, + help="Schema of the Postgres instance (default: public)", + default="public", + ) + parser_pgvector.add_argument( + "--connection_string", + type=str, + help="Connection string to Postgres instance", + ) + parser_pgvector.add_argument("--user", type=str, help="User of Postgres instance") + parser_pgvector.add_argument( + "--password", type=str, help="Password of Postgres instance" + ) + parser_pgvector.add_argument("--host", type=str, help="Host of Postgres instance") + parser_pgvector.add_argument("--port", type=str, help="Port of Postgres instance") + parser_pgvector.add_argument( + "--dbname", type=str, help="Database name of Postgres instance" + ) + return parser_pgvector + + +def set_pgv_args_from_prompt(args): + set_arg_from_input( + args, + "platform", + "Enter the platform to connect to (default: custom): ", + str, + default_value="custom", + ) + env_var_name = "POSTGRES_CONNECTION_STRING" + if args["platform"] == "supabase": + env_var_name = "SUPABASE_CONNECTION_STRING" + elif args["platform"] == "neon": + env_var_name = "NEON_CONNECTION_STRING" + elif args["platform"] == "tembo": + env_var_name = "TEMBO_CONNECTION_STRING" + elif args["platform"] == "aws": + env_var_name = "AURORA_CONNECTION_STRING" + set_arg_from_input( + args, + "connection_string", + "Enter the connection string to Postgres instance: ", + str, + env_var=env_var_name, + ) + if not args.get("connection_string"): + set_arg_from_input( + args, + "user", + "Enter the user of Postgres instance (default: postgres): ", + str, + default_value="postgres", + ) + set_arg_from_password( + args, + "password", + "Enter the password of Postgres instance (default: postgres): ", + ) + if not args.get("password"): + # If password is not provided, set it to "postgres" + args["password"] = "postgres" + set_arg_from_input( + args, + "host", + "Enter the host of Postgres instance: ", + str, + default_value="localhost", + ) + set_arg_from_input( + args, + "port", + "Enter the port of Postgres instance: ", + str, + default_value="5432", + ) + set_arg_from_input( + args, + "dbname", + "Enter the database name of Postgres instance: ", + str, + default_value="postgres", + ) diff --git a/src/vdf_io/util.py b/src/vdf_io/util.py index 18554a8..d5dd038 100644 --- a/src/vdf_io/util.py +++ b/src/vdf_io/util.py @@ -92,20 +92,27 @@ def set_arg_from_input( """ Set the value of an argument from user input if it is not already present """ + read_from_env = False if ( (default_value is None) and (env_var is not None) and (os.getenv(env_var) is not None) ): default_value = os.getenv(env_var) + read_from_env = True if arg_name not in args or ( args[arg_name] is None and default_value != "DO_NOT_PROMPT" ): while True: - inp = input( - prompt - + (" " + str(list(choices)) + ": " if choices is not None else "") - ) + default_suffix = "" + choices_suffix = "" + if read_from_env: + default_suffix = f" (default: Read from env var {env_var})" + else: + default_suffix = f" (default: {default_value})" + if choices is not None: + choices_suffix = f" (choose from: {str(list(choices))})" + inp = input(f"{prompt}{choices_suffix}{default_suffix}: ") if len(inp) >= 2: if inp[0] == '"' and inp[-1] == '"': inp = inp[1:-1]