Adds delta count methods (#169)

JWittmeyer · web-flow · commit 42fa11b74903 · 2025-05-23T12:56:54.000+02:00
* Adds delta count methods

* Model update for new fields

* Adds record selection for missing embedding tensors

* Adds distinct count options

* removes print

* Sanitized embedding id
diff --git a/business_objects/embedding.py b/business_objects/embedding.py
@@ -9,6 +9,7 @@
 from .. import enums
 
 from ..util import prevent_sql_injection
+from sqlalchemy import distinct, func
 
 
 ALL_EMBEDDINGS_WHITELIST = {
@@ -587,6 +588,15 @@ def get_tensor_count(embedding_id: str) -> EmbeddingTensor:
     )
 
 
+def get_record_ids_count(embedding_id: str) -> int:
+    # note that this is not the same as tensors since e.g. embedding lists are stored with sub_key
+    return (
+        session.query(func.count(distinct(models.EmbeddingTensor.record_id)))
+        .filter(models.EmbeddingTensor.embedding_id == embedding_id)
+        .scalar()
+    )
+
+
 def get_tensor(
     embedding_id: str, record_id: Optional[str] = None, sub_key: Optional[int] = None
 ) -> EmbeddingTensor:
@@ -782,6 +792,22 @@ def delete_tensors(embedding_id: str, with_commit: bool = False) -> None:
     general.flush_or_commit(with_commit)
 
 
+def delete_tensors_by_record_ids(
+    project_id: str,
+    record_ids: List[str],
+    embedding_id: Optional[str] = None,
+    with_commit: bool = False,
+) -> None:
+    query = session.query(EmbeddingTensor).filter(
+        EmbeddingTensor.project_id == project_id,
+        EmbeddingTensor.record_id.in_(record_ids),
+    )
+    if embedding_id:
+        query = query.filter(EmbeddingTensor.embedding_id == embedding_id)
+    query.delete()
+    general.flush_or_commit(with_commit)
+
+
 def delete_by_record_ids(
     project_id: str,
     embedding_id: str,
diff --git a/business_objects/record.py b/business_objects/record.py
@@ -451,29 +451,51 @@ def get_full_record_data_for_id_group(
 
 
 def get_attribute_data(
-    project_id: str, attribute_name: str
+    project_id: str,
+    attribute_name: str,
+    only_missing: bool = False,
+    embedding_id: Optional[str] = None,
 ) -> Tuple[List[str], List[str]]:
     project_id = prevent_sql_injection(project_id, isinstance(project_id, str))
     attribute_name = prevent_sql_injection(
         attribute_name, isinstance(attribute_name, str)
     )
+    if embedding_id:
+        embedding_id = prevent_sql_injection(
+            embedding_id, isinstance(embedding_id, str)
+        )
     query = None
-    order = __get_order_by(project_id)
+    order = __get_order_by(project_id, prefix="r.")
+    join_extension, where_add = "", ""
+    if only_missing:
+        if not embedding_id:
+            raise ValueError("embedding_id must be provided if only_missing is True")
+        join_extension, where_add = (
+            f"""
+        LEFT JOIN embedding_tensor et
+            ON et.project_id = r.project_id
+            AND et.record_id = r.id
+            AND et.project_id = '{project_id}' AND et.embedding_id = '{embedding_id}'  
+        """,
+            "AND et.id IS NULL",
+        )
     if attribute.get_by_name(project_id, attribute_name).data_type == "EMBEDDING_LIST":
         query = f"""
         SELECT id::TEXT || '@' || sub_key id, att AS "{attribute_name}"
         FROM (
-            SELECT id, value as att, ordinality - 1 as sub_key
-            FROM record
-            cross join json_array_elements_text((data::JSON->'{attribute_name}')) with ordinality
-            WHERE project_id = '{project_id}'
+            SELECT r.id, value as att, ordinality - 1 as sub_key
+            FROM record r
+            {join_extension}
+            cross join json_array_elements_text((r.data::JSON->'{attribute_name}')) with ordinality
+            WHERE r.project_id = '{project_id}' {where_add}
             {order} 
         )x """
     else:
         query = f"""
-        SELECT id::TEXT, data::JSON->'{attribute_name}' AS "{attribute_name}"
-        FROM record
-        WHERE project_id = '{project_id}'
+        SELECT r.id::TEXT, r.data::JSON->'{attribute_name}' AS "{attribute_name}"
+        FROM record r
+        {join_extension}
+        WHERE r.project_id = '{project_id}' {where_add}
         {order}
         """
     result = general.execute_all(query)
@@ -485,6 +507,43 @@ def count(project_id: str) -> int:
     return session.query(Record).filter(Record.project_id == project_id).count()
 
 
+def count_missing_delta(project_id: str, attribute_id: str) -> int:
+    project_id = prevent_sql_injection(project_id, isinstance(project_id, str))
+    attribute_id = prevent_sql_injection(attribute_id, isinstance(attribute_id, str))
+    query = f"""
+    WITH n AS (
+        SELECT NAME
+        FROM attribute a
+        WHERE id = '{attribute_id}'
+    )
+    SELECT COUNT(*)
+    FROM record r, n
+    WHERE r.project_id = '{project_id}'
+    AND r.data->>n.name IS NULL
+    """
+    value = general.execute_first(query)
+    if not value or not value[0]:
+        return 0
+    return value[0]
+
+
+def get_missing_delta_record_ids(project_id: str, attribute_id: str) -> List[str]:
+    project_id = prevent_sql_injection(project_id, isinstance(project_id, str))
+    attribute_id = prevent_sql_injection(attribute_id, isinstance(attribute_id, str))
+    query = f"""
+    WITH n AS (
+        SELECT NAME
+        FROM attribute a
+        WHERE id = '{attribute_id}'
+    )
+    SELECT r.id::TEXT
+    FROM record r, n
+    WHERE r.project_id = '{project_id}'
+    AND r.data->>n.name IS NULL
+    """
+    return [row[0] for row in general.execute_all(query)]
+
+
 def count_attribute_list_entries(project_id: str, attribute_name: str) -> int:
     project_id = prevent_sql_injection(project_id, isinstance(project_id, str))
     attribute_name = prevent_sql_injection(
@@ -809,7 +868,7 @@ def get_tokenized_records_from_db(
     )
 
 
-def __get_order_by(project_id: str, first_x: int = 3) -> str:
+def __get_order_by(project_id: str, first_x: int = 3, prefix: str = "") -> str:
     query = f"""
     SELECT name, data_type
     FROM attribute a
@@ -823,7 +882,7 @@ def __get_order_by(project_id: str, first_x: int = 3) -> str:
     for x in values:
         if order != "":
             order += ", "
-        tmp = f"data->>'{x.name}'"
+        tmp = f"{prefix}data->>'{x.name}'"
 
         r_id = attribute.get_running_id_name(project_id)
         if x.data_type == "INTEGER" and x.name == r_id:
diff --git a/models.py b/models.py
@@ -807,6 +807,13 @@ class Embedding(Base):
     )
     additional_data = Column(JSON)
 
+    # threshold indicates when the embedding should be completely recalculated
+    delta_full_recalculation_threshold = Column(Float, default=0.5)
+    # holds the current number of records that were calculated with the previous PCA; if new records + current delta > threshold, we recreate completely
+    # note that this number can be higher than expected because of updated records being recalculated as well
+    # meaning in theory if someone updates the same record over and over again at some point the full recalculation will be triggered
+    current_delta_record_count = Column(Integer, default=0)
+
 
 class EmbeddingTensor(Base):
     __tablename__ = Tablenames.EMBEDDING_TENSOR.value