Commit f91bfda

emma-sg and ikreymer authored
Allow searching by multiple tags & profiles with "and"/"or" options for tags (#2717)
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
1 parent 6f1ced0 commit f91bfda

6 files changed: +450 −219 lines changed
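For context, a minimal sketch of how a client might exercise the new filters added by this commit, assuming a deployment base URL, bearer token, and org ID (all placeholders here): multiple `tag` values combined with `tagMatch=or`, plus multiple `profileIds`.

```python
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder base URL
headers = {"Authorization": "Bearer <token>"}  # placeholder auth token
org_id = "<org-uuid>"  # placeholder org ID

# Workflows matching ANY of the given tags (tagMatch=or)
# and using ANY of the given browser profiles
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs",
    headers=headers,
    params={
        "tag": ["news", "archive"],  # repeated query param -> List[str]
        "tagMatch": "or",            # defaults to "and" if omitted
        "profileIds": ["<profile-uuid-1>", "<profile-uuid-2>"],
    },
)
r.raise_for_status()
print(r.json()["total"])
```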

backend/btrixcloud/crawlconfigs.py

Lines changed: 54 additions & 28 deletions
```diff
@@ -4,7 +4,7 @@
 
 # pylint: disable=too-many-lines
 
-from typing import List, Union, Optional, TYPE_CHECKING, cast, Dict, Tuple
+from typing import List, Optional, TYPE_CHECKING, cast, Dict, Tuple, Annotated
 
 import asyncio
 import json
@@ -46,6 +46,7 @@
     CrawlerProxies,
     ValidateCustomBehavior,
     RawCrawlConfig,
+    ListFilterType,
 )
 from .utils import (
     dt_now,
@@ -597,13 +598,14 @@ async def get_crawl_configs(
         page: int = 1,
         created_by: Optional[UUID] = None,
         modified_by: Optional[UUID] = None,
-        profileid: Optional[UUID] = None,
+        profile_ids: Optional[List[UUID]] = None,
         first_seed: Optional[str] = None,
         name: Optional[str] = None,
         description: Optional[str] = None,
         tags: Optional[List[str]] = None,
+        tag_match: Optional[ListFilterType] = ListFilterType.AND,
         schedule: Optional[bool] = None,
-        isCrawlRunning: Optional[bool] = None,
+        is_crawl_running: Optional[bool] = None,
         sort_by: str = "lastRun",
         sort_direction: int = -1,
     ) -> tuple[list[CrawlConfigOut], int]:
@@ -616,16 +618,17 @@ async def get_crawl_configs(
         match_query = {"oid": org.id, "inactive": {"$ne": True}}
 
         if tags:
-            match_query["tags"] = {"$all": tags}
+            query_type = "$all" if tag_match == ListFilterType.AND else "$in"
+            match_query["tags"] = {query_type: tags}
 
         if created_by:
             match_query["createdBy"] = created_by
 
         if modified_by:
             match_query["modifiedBy"] = modified_by
 
-        if profileid:
-            match_query["profileid"] = profileid
+        if profile_ids:
+            match_query["profileid"] = {"$in": profile_ids}
 
         if name:
             match_query["name"] = name
@@ -639,8 +642,8 @@
         else:
             match_query["schedule"] = {"$in": ["", None]}
 
-        if isCrawlRunning is not None:
-            match_query["isCrawlRunning"] = isCrawlRunning
+        if is_crawl_running is not None:
+            match_query["isCrawlRunning"] = is_crawl_running
 
         # pylint: disable=duplicate-code
         aggregate = [
@@ -1369,24 +1372,46 @@ def init_crawl_config_api(
     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
     async def get_crawl_configs(
         org: Organization = Depends(org_viewer_dep),
-        pageSize: int = DEFAULT_PAGE_SIZE,
+        page_size: Annotated[
+            int, Query(alias="pageSize", title="Page Size")
+        ] = DEFAULT_PAGE_SIZE,
         page: int = 1,
         # createdBy, kept as userid for API compatibility
-        userid: Optional[UUID] = None,
-        modifiedBy: Optional[UUID] = None,
-        profileid: Optional[UUID] = None,
-        firstSeed: Optional[str] = None,
+        user_id: Annotated[
+            Optional[UUID], Query(alias="userid", title="User ID")
+        ] = None,
+        modified_by: Annotated[
+            Optional[UUID], Query(alias="modifiedBy", title="Modified By User ID")
+        ] = None,
+        profile_ids: Annotated[
+            Optional[List[UUID]], Query(alias="profileIds", title="Profile IDs")
+        ] = None,
+        first_seed: Annotated[
+            Optional[str], Query(alias="firstSeed", title="First Seed")
+        ] = None,
         name: Optional[str] = None,
         description: Optional[str] = None,
-        tag: Union[List[str], None] = Query(default=None),
+        tag: Annotated[Optional[List[str]], Query(title="Tags")] = None,
+        tag_match: Annotated[
+            Optional[ListFilterType],
+            Query(
+                alias="tagMatch",
+                title="Tag Match Type",
+                description='Defaults to `"and"` if omitted',
+            ),
+        ] = ListFilterType.AND,
         schedule: Optional[bool] = None,
-        isCrawlRunning: Optional[bool] = None,
-        sortBy: str = "",
-        sortDirection: int = -1,
+        is_crawl_running: Annotated[
+            Optional[bool], Query(alias="isCrawlRunning", title="Is Crawl Running")
+        ] = None,
+        sort_by: Annotated[str, Query(alias="sortBy", title="Sort Field")] = "",
+        sort_direction: Annotated[
+            int, Query(alias="sortDirection", title="Sort Direction")
+        ] = -1,
     ):
         # pylint: disable=duplicate-code
-        if firstSeed:
-            firstSeed = urllib.parse.unquote(firstSeed)
+        if first_seed:
+            first_seed = urllib.parse.unquote(first_seed)
 
         if name:
             name = urllib.parse.unquote(name)
@@ -1396,21 +1421,22 @@ async def get_crawl_configs(
 
         crawl_configs, total = await ops.get_crawl_configs(
             org,
-            created_by=userid,
-            modified_by=modifiedBy,
-            profileid=profileid,
-            first_seed=firstSeed,
+            created_by=user_id,
+            modified_by=modified_by,
+            profile_ids=profile_ids,
+            first_seed=first_seed,
             name=name,
             description=description,
             tags=tag,
+            tag_match=tag_match,
             schedule=schedule,
-            isCrawlRunning=isCrawlRunning,
-            page_size=pageSize,
+            is_crawl_running=is_crawl_running,
+            page_size=page_size,
             page=page,
-            sort_by=sortBy,
-            sort_direction=sortDirection,
+            sort_by=sort_by,
+            sort_direction=sort_direction,
         )
-        return paginated_format(crawl_configs, total, page, pageSize)
+        return paginated_format(crawl_configs, total, page, page_size)
 
     @router.get("/tags", response_model=List[str], deprecated=True)
     async def get_crawl_config_tags(org: Organization = Depends(org_viewer_dep)):
```
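In effect, `tag_match` only switches which MongoDB operator is applied to the `tags` field, and multiple profile IDs become an `$in` clause on `profileid`. A sketch of the resulting `match_query` for each mode, with illustrative placeholder values:

```python
# Illustrative only: shape of match_query for each filter mode

# tagMatch="and" (default): workflow must carry ALL of the given tags
{"oid": org_id, "inactive": {"$ne": True}, "tags": {"$all": ["news", "archive"]}}

# tagMatch="or": workflow must carry AT LEAST ONE of the given tags
{"oid": org_id, "inactive": {"$ne": True}, "tags": {"$in": ["news", "archive"]}}

# profileIds: workflow's profileid must match ANY of the given IDs
{"oid": org_id, "inactive": {"$ne": True}, "profileid": {"$in": [profile_uuid_1, profile_uuid_2]}}
```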

backend/btrixcloud/models.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -2964,3 +2964,14 @@ class PageUrlCountResponse(BaseModel):
     """Response model for page count by url"""
 
     items: List[PageUrlCount]
+
+
+# FILTER UTILITIES
+
+
+# ============================================================================
+class ListFilterType(str, Enum):
+    """Combination type for query filters that accept lists"""
+
+    OR = "or"
+    AND = "and"
```

backend/test/conftest.py

Lines changed: 190 additions & 0 deletions
```diff
@@ -629,3 +629,193 @@ def echo_server():
     print(f"Echo server terminating", flush=True)
     p.terminate()
     print(f"Echo server terminated", flush=True)
+
+
+PROFILE_NAME = "Test profile"
+PROFILE_DESC = "Profile used for backend tests"
+
+PROFILE_NAME_UPDATED = "Updated test profile"
+PROFILE_DESC_UPDATED = "Updated profile used for backend tests"
+
+PROFILE_2_NAME = "Second test profile"
+PROFILE_2_DESC = "Second profile used to test list endpoint"
+
+
+def prepare_browser_for_profile_commit(
+    browser_id: str, headers: Dict[str, str], oid: UUID
+) -> None:
+    # Ping to make sure it doesn't expire
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/ping",
+        headers=headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data.get("success")
+    assert data.get("origins") or data.get("origins") == []
+
+    # Verify browser seems good
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}",
+        headers=headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["url"]
+    assert data["path"]
+    assert data["password"]
+    assert data["auth_bearer"]
+    assert data["scale"]
+    assert data["oid"] == oid
+
+    # Navigate to new URL
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/navigate",
+        headers=headers,
+        json={"url": "https://webrecorder.net/tools"},
+    )
+    assert r.status_code == 200
+    assert r.json()["success"]
+
+    # Ping browser until ready
+    max_attempts = 20
+    attempts = 1
+    while attempts <= max_attempts:
+        try:
+            r = requests.post(
+                f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/ping",
+                headers=headers,
+            )
+            data = r.json()
+            if data["success"]:
+                break
+            time.sleep(5)
+        except:
+            pass
+        attempts += 1
+
+
+@pytest.fixture(scope="session")
+def profile_id(admin_auth_headers, default_org_id, profile_browser_id):
+    prepare_browser_for_profile_commit(
+        profile_browser_id, admin_auth_headers, default_org_id
+    )
+
+    # Create profile
+    start_time = time.monotonic()
+    time_limit = 300
+    while True:
+        try:
+            r = requests.post(
+                f"{API_PREFIX}/orgs/{default_org_id}/profiles",
+                headers=admin_auth_headers,
+                json={
+                    "browserid": profile_browser_id,
+                    "name": PROFILE_NAME,
+                    "description": PROFILE_DESC,
+                },
+                timeout=10,
+            )
+            assert r.status_code == 200
+            data = r.json()
+            if data.get("detail") and data.get("detail") == "waiting_for_browser":
+                time.sleep(5)
+                continue
+            if data.get("added"):
+                assert data["storageQuotaReached"] in (True, False)
+                return data["id"]
+        except:
+            if time.monotonic() - start_time > time_limit:
+                raise
+            time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def profile_config_id(admin_auth_headers, default_org_id, profile_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/profiles/{profile_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["id"] == profile_id
+    assert data["name"] == PROFILE_NAME
+    assert data["description"] == PROFILE_DESC
+    assert data["userid"]
+    assert data["oid"] == default_org_id
+    assert data.get("origins") or data.get("origins") == []
+    assert data["createdBy"]
+    assert data["createdByName"] == "admin"
+    assert data["modifiedBy"]
+    assert data["modifiedByName"] == "admin"
+    assert not data["baseid"]
+
+    created = data["created"]
+    assert created
+    assert created.endswith("Z")
+
+    modified = data["modified"]
+    assert modified
+    assert modified.endswith("Z")
+
+    resource = data["resource"]
+    assert resource
+    assert resource["filename"]
+    assert resource["hash"]
+    assert resource["size"]
+    assert resource["storage"]
+    assert resource["storage"]["name"]
+    assert resource.get("replicas") or resource.get("replicas") == []
+
+    # Use profile in a workflow
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json={
+            "runNow": False,
+            "name": "Profile Test Crawl",
+            "description": "Crawl using browser profile",
+            "config": {
+                "seeds": [{"url": "https://webrecorder.net/"}],
+                "exclude": "community",
+            },
+            "profileid": profile_id,
+        },
+    )
+    data = r.json()
+    return data["id"]
+
+
+@pytest.fixture(scope="session")
+def profile_2_id(admin_auth_headers, default_org_id, profile_browser_2_id):
+    prepare_browser_for_profile_commit(
+        profile_browser_2_id, admin_auth_headers, default_org_id
+    )
+
+    # Create profile
+    start_time = time.monotonic()
+    time_limit = 300
+    while True:
+        try:
+            r = requests.post(
+                f"{API_PREFIX}/orgs/{default_org_id}/profiles",
+                headers=admin_auth_headers,
+                json={
+                    "browserid": profile_browser_2_id,
+                    "name": PROFILE_2_NAME,
+                    "description": PROFILE_2_DESC,
+                },
+                timeout=10,
+            )
+            assert r.status_code == 200
+            data = r.json()
+            if data.get("detail") and data.get("detail") == "waiting_for_browser":
+                time.sleep(5)
+            if data.get("added"):
+                assert data["storageQuotaReached"] in (True, False)
+
+                return data["id"]
+        except:
+            if time.monotonic() - start_time > time_limit:
+                raise
+            time.sleep(5)
```
0 commit comments