Commit f91bfda

emma-sg and ikreymer authored
Allow searching by multiple tags & profiles with "and"/"or" options for tags (#2717)
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
1 parent 6f1ced0 commit f91bfda

6 files changed: +450 −219 lines changed
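For context, a minimal sketch of how a client might exercise the new filters added by this commit, assuming a deployment base URL, bearer token, and org ID (all placeholders here): multiple `tag` values combined with `tagMatch=or`, plus multiple `profileIds`.

```python
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder base URL
headers = {"Authorization": "Bearer <token>"}  # placeholder auth token
org_id = "<org-uuid>"  # placeholder org ID

# Workflows matching ANY of the given tags (tagMatch=or)
# and using ANY of the given browser profiles
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs",
    headers=headers,
    params={
        "tag": ["news", "archive"],  # repeated query param -> List[str]
        "tagMatch": "or",            # defaults to "and" if omitted
        "profileIds": ["<profile-uuid-1>", "<profile-uuid-2>"],
    },
)
r.raise_for_status()
print(r.json()["total"])
```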

backend/btrixcloud/crawlconfigs.py

Lines changed: 54 additions & 28 deletions
```diff
@@ -4,7 +4,7 @@
 
 # pylint: disable=too-many-lines
 
-from typing import List, Union, Optional, TYPE_CHECKING, cast, Dict, Tuple
+from typing import List, Optional, TYPE_CHECKING, cast, Dict, Tuple, Annotated
 
 import asyncio
 import json
@@ -46,6 +46,7 @@
     CrawlerProxies,
     ValidateCustomBehavior,
     RawCrawlConfig,
+    ListFilterType,
 )
 from .utils import (
     dt_now,
@@ -597,13 +598,14 @@ async def get_crawl_configs(
         page: int = 1,
         created_by: Optional[UUID] = None,
         modified_by: Optional[UUID] = None,
-        profileid: Optional[UUID] = None,
+        profile_ids: Optional[List[UUID]] = None,
         first_seed: Optional[str] = None,
         name: Optional[str] = None,
         description: Optional[str] = None,
         tags: Optional[List[str]] = None,
+        tag_match: Optional[ListFilterType] = ListFilterType.AND,
         schedule: Optional[bool] = None,
-        isCrawlRunning: Optional[bool] = None,
+        is_crawl_running: Optional[bool] = None,
         sort_by: str = "lastRun",
         sort_direction: int = -1,
     ) -> tuple[list[CrawlConfigOut], int]:
@@ -616,16 +618,17 @@ async def get_crawl_configs(
         match_query = {"oid": org.id, "inactive": {"$ne": True}}
 
         if tags:
-            match_query["tags"] = {"$all": tags}
+            query_type = "$all" if tag_match == ListFilterType.AND else "$in"
+            match_query["tags"] = {query_type: tags}
 
         if created_by:
             match_query["createdBy"] = created_by
 
         if modified_by:
             match_query["modifiedBy"] = modified_by
 
-        if profileid:
-            match_query["profileid"] = profileid
+        if profile_ids:
+            match_query["profileid"] = {"$in": profile_ids}
 
         if name:
             match_query["name"] = name
@@ -639,8 +642,8 @@
         else:
             match_query["schedule"] = {"$in": ["", None]}
 
-        if isCrawlRunning is not None:
-            match_query["isCrawlRunning"] = isCrawlRunning
+        if is_crawl_running is not None:
+            match_query["isCrawlRunning"] = is_crawl_running
 
         # pylint: disable=duplicate-code
         aggregate = [
@@ -1369,24 +1372,46 @@ def init_crawl_config_api(
     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
     async def get_crawl_configs(
         org: Organization = Depends(org_viewer_dep),
-        pageSize: int = DEFAULT_PAGE_SIZE,
+        page_size: Annotated[
+            int, Query(alias="pageSize", title="Page Size")
+        ] = DEFAULT_PAGE_SIZE,
         page: int = 1,
         # createdBy, kept as userid for API compatibility
-        userid: Optional[UUID] = None,
-        modifiedBy: Optional[UUID] = None,
-        profileid: Optional[UUID] = None,
-        firstSeed: Optional[str] = None,
+        user_id: Annotated[
+            Optional[UUID], Query(alias="userid", title="User ID")
+        ] = None,
+        modified_by: Annotated[
+            Optional[UUID], Query(alias="modifiedBy", title="Modified By User ID")
+        ] = None,
+        profile_ids: Annotated[
+            Optional[List[UUID]], Query(alias="profileIds", title="Profile IDs")
+        ] = None,
+        first_seed: Annotated[
+            Optional[str], Query(alias="firstSeed", title="First Seed")
+        ] = None,
         name: Optional[str] = None,
         description: Optional[str] = None,
-        tag: Union[List[str], None] = Query(default=None),
+        tag: Annotated[Optional[List[str]], Query(title="Tags")] = None,
+        tag_match: Annotated[
+            Optional[ListFilterType],
+            Query(
+                alias="tagMatch",
+                title="Tag Match Type",
+                description='Defaults to `"and"` if omitted',
+            ),
+        ] = ListFilterType.AND,
         schedule: Optional[bool] = None,
-        isCrawlRunning: Optional[bool] = None,
-        sortBy: str = "",
-        sortDirection: int = -1,
+        is_crawl_running: Annotated[
+            Optional[bool], Query(alias="isCrawlRunning", title="Is Crawl Running")
+        ] = None,
+        sort_by: Annotated[str, Query(alias="sortBy", title="Sort Field")] = "",
+        sort_direction: Annotated[
+            int, Query(alias="sortDirection", title="Sort Direction")
+        ] = -1,
     ):
         # pylint: disable=duplicate-code
-        if firstSeed:
-            firstSeed = urllib.parse.unquote(firstSeed)
+        if first_seed:
+            first_seed = urllib.parse.unquote(first_seed)
 
         if name:
             name = urllib.parse.unquote(name)
@@ -1396,21 +1421,22 @@ async def get_crawl_configs(
 
         crawl_configs, total = await ops.get_crawl_configs(
             org,
-            created_by=userid,
-            modified_by=modifiedBy,
-            profileid=profileid,
-            first_seed=firstSeed,
+            created_by=user_id,
+            modified_by=modified_by,
+            profile_ids=profile_ids,
+            first_seed=first_seed,
             name=name,
             description=description,
             tags=tag,
+            tag_match=tag_match,
             schedule=schedule,
-            isCrawlRunning=isCrawlRunning,
-            page_size=pageSize,
+            is_crawl_running=is_crawl_running,
+            page_size=page_size,
             page=page,
-            sort_by=sortBy,
-            sort_direction=sortDirection,
+            sort_by=sort_by,
+            sort_direction=sort_direction,
         )
-        return paginated_format(crawl_configs, total, page, pageSize)
+        return paginated_format(crawl_configs, total, page, page_size)
 
     @router.get("/tags", response_model=List[str], deprecated=True)
     async def get_crawl_config_tags(org: Organization = Depends(org_viewer_dep)):
```
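In effect, `tag_match` only switches which MongoDB operator is applied to the `tags` field, and multiple profile IDs become an `$in` clause on `profileid`. A sketch of the resulting `match_query` for each mode, with illustrative placeholder values:

```python
# Illustrative only: shape of match_query for each filter mode

# tagMatch="and" (default): workflow must carry ALL of the given tags
{"oid": org_id, "inactive": {"$ne": True}, "tags": {"$all": ["news", "archive"]}}

# tagMatch="or": workflow must carry AT LEAST ONE of the given tags
{"oid": org_id, "inactive": {"$ne": True}, "tags": {"$in": ["news", "archive"]}}

# profileIds: workflow's profileid must match ANY of the given IDs
{"oid": org_id, "inactive": {"$ne": True}, "profileid": {"$in": [profile_uuid_1, profile_uuid_2]}}
```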

backend/btrixcloud/models.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -2964,3 +2964,14 @@ class PageUrlCountResponse(BaseModel):
     """Response model for page count by url"""
 
     items: List[PageUrlCount]
+
+
+# FILTER UTILITIES
+
+
+# ============================================================================
+class ListFilterType(str, Enum):
+    """Combination type for query filters that accept lists"""
+
+    OR = "or"
+    AND = "and"
```

backend/test/conftest.py

Lines changed: 190 additions & 0 deletions
```diff
@@ -629,3 +629,193 @@ def echo_server():
     print(f"Echo server terminating", flush=True)
     p.terminate()
     print(f"Echo server terminated", flush=True)
+
+
+PROFILE_NAME = "Test profile"
+PROFILE_DESC = "Profile used for backend tests"
+
+PROFILE_NAME_UPDATED = "Updated test profile"
+PROFILE_DESC_UPDATED = "Updated profile used for backend tests"
+
+PROFILE_2_NAME = "Second test profile"
+PROFILE_2_DESC = "Second profile used to test list endpoint"
+
+
+def prepare_browser_for_profile_commit(
+    browser_id: str, headers: Dict[str, str], oid: UUID
+) -> None:
+    # Ping to make sure it doesn't expire
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/ping",
+        headers=headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data.get("success")
+    assert data.get("origins") or data.get("origins") == []
+
+    # Verify browser seems good
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}",
+        headers=headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["url"]
+    assert data["path"]
+    assert data["password"]
+    assert data["auth_bearer"]
+    assert data["scale"]
+    assert data["oid"] == oid
+
+    # Navigate to new URL
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/navigate",
+        headers=headers,
+        json={"url": "https://webrecorder.net/tools"},
+    )
+    assert r.status_code == 200
+    assert r.json()["success"]
+
+    # Ping browser until ready
+    max_attempts = 20
+    attempts = 1
+    while attempts <= max_attempts:
+        try:
+            r = requests.post(
+                f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/ping",
+                headers=headers,
+            )
+            data = r.json()
+            if data["success"]:
+                break
+            time.sleep(5)
+        except:
+            pass
+        attempts += 1
+
+
+@pytest.fixture(scope="session")
+def profile_id(admin_auth_headers, default_org_id, profile_browser_id):
+    prepare_browser_for_profile_commit(
+        profile_browser_id, admin_auth_headers, default_org_id
+    )
+
+    # Create profile
+    start_time = time.monotonic()
+    time_limit = 300
+    while True:
+        try:
+            r = requests.post(
+                f"{API_PREFIX}/orgs/{default_org_id}/profiles",
+                headers=admin_auth_headers,
+                json={
+                    "browserid": profile_browser_id,
+                    "name": PROFILE_NAME,
+                    "description": PROFILE_DESC,
+                },
+                timeout=10,
+            )
+            assert r.status_code == 200
+            data = r.json()
+            if data.get("detail") and data.get("detail") == "waiting_for_browser":
+                time.sleep(5)
+                continue
+            if data.get("added"):
+                assert data["storageQuotaReached"] in (True, False)
+                return data["id"]
+        except:
+            if time.monotonic() - start_time > time_limit:
+                raise
+            time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def profile_config_id(admin_auth_headers, default_org_id, profile_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/profiles/{profile_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["id"] == profile_id
+    assert data["name"] == PROFILE_NAME
+    assert data["description"] == PROFILE_DESC
+    assert data["userid"]
+    assert data["oid"] == default_org_id
+    assert data.get("origins") or data.get("origins") == []
+    assert data["createdBy"]
+    assert data["createdByName"] == "admin"
+    assert data["modifiedBy"]
+    assert data["modifiedByName"] == "admin"
+    assert not data["baseid"]
+
+    created = data["created"]
+    assert created
+    assert created.endswith("Z")
+
+    modified = data["modified"]
+    assert modified
+    assert modified.endswith("Z")
+
+    resource = data["resource"]
+    assert resource
+    assert resource["filename"]
+    assert resource["hash"]
+    assert resource["size"]
+    assert resource["storage"]
+    assert resource["storage"]["name"]
+    assert resource.get("replicas") or resource.get("replicas") == []
+
+    # Use profile in a workflow
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json={
+            "runNow": False,
+            "name": "Profile Test Crawl",
+            "description": "Crawl using browser profile",
+            "config": {
+                "seeds": [{"url": "https://webrecorder.net/"}],
+                "exclude": "community",
+            },
+            "profileid": profile_id,
+        },
+    )
+    data = r.json()
+    return data["id"]
+
+
+@pytest.fixture(scope="session")
+def profile_2_id(admin_auth_headers, default_org_id, profile_browser_2_id):
+    prepare_browser_for_profile_commit(
+        profile_browser_2_id, admin_auth_headers, default_org_id
+    )
+
+    # Create profile
+    start_time = time.monotonic()
+    time_limit = 300
+    while True:
+        try:
+            r = requests.post(
+                f"{API_PREFIX}/orgs/{default_org_id}/profiles",
+                headers=admin_auth_headers,
+                json={
+                    "browserid": profile_browser_2_id,
+                    "name": PROFILE_2_NAME,
+                    "description": PROFILE_2_DESC,
+                },
+                timeout=10,
+            )
+            assert r.status_code == 200
+            data = r.json()
+            if data.get("detail") and data.get("detail") == "waiting_for_browser":
+                time.sleep(5)
+            if data.get("added"):
+                assert data["storageQuotaReached"] in (True, False)
+
+                return data["id"]
+        except:
+            if time.monotonic() - start_time > time_limit:
+                raise
+            time.sleep(5)
```
0 commit comments