
Commit 1400c56

tw4l and SuaYoo authored

Additional tagCounts endpoints and options (#2923)

- Add `onlySuccessful` flag to `/all-crawls/tagCounts` endpoint (defaulting to true to avoid breaking change)
- Add `crawlType` option to `/all-crawls/tagCounts` (to match search-values endpoint)
- Add filtered `/crawls/tagCounts` and `/uploads/tagCounts` endpoints
- Rename `CrawlConfigTags` response model to `TagsResponse` now that it's used for crawls/uploads in addition to workflows
- Add tests

Co-authored-by: sua yoo <sua@webrecorder.org>

1 parent 3cd3079 commit 1400c56

File tree

9 files changed: +239 -19 lines changed
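
The commit is easiest to evaluate endpoint-first. Below is a minimal sketch of how a client might exercise the new query options, modeled on the tests in this commit; API_PREFIX, org_id, and auth_headers are placeholder assumptions, not values taken from the diff:

    import requests

    API_PREFIX = "http://localhost/api"  # assumed deployment URL; the test suite defines its own API_PREFIX
    org_id = "<org-uuid>"                # placeholder
    auth_headers = {"Authorization": "Bearer <token>"}  # any org viewer's token

    # Default behavior is unchanged: only items in SUCCESSFUL_STATES are counted.
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/all-crawls/tagCounts",
        headers=auth_headers,
    )

    # New options: include failed/canceled items, and filter by item type.
    # crawlType must be "crawl" or "upload"; anything else returns 400.
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/all-crawls/tagCounts",
        params={"onlySuccessful": "false", "crawlType": "crawl"},
        headers=auth_headers,
    )
    assert r.status_code == 200
    # Response shape: {"tags": [{"tag": "wr-test-1", "count": 3}, ...]}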

backend/btrixcloud/basecrawls.py

Lines changed: 28 additions & 9 deletions
@@ -25,7 +25,7 @@
 
 from .models import (
     SUCCESSFUL_STATES,
-    CrawlConfigTags,
+    TagsResponse,
     CrawlFile,
     CrawlFileOut,
     BaseCrawl,
@@ -984,13 +984,22 @@ async def get_org_last_crawl_finished(self, oid: UUID) -> Optional[datetime]:
 
         return last_crawl_finished
 
-    async def get_all_crawls_tag_counts(self, org: Organization):
-        """get distinct tags from all archived items for this org"""
+    async def get_all_crawls_tag_counts(
+        self,
+        org: Organization,
+        only_successful: bool = True,
+        type_: Optional[str] = None,
+    ):
+        """get distinct tags from archived items for this org"""
+        match_query: Dict[str, Any] = {"oid": org.id}
+        if only_successful:
+            match_query["state"] = {"$in": SUCCESSFUL_STATES}
+        if type_ in ("crawl", "upload"):
+            match_query["type"] = type_
+
         tags = await self.crawls.aggregate(
             [
-                # Match only against the states of archived items that might be
-                # displayed in the frontend
-                {"$match": {"oid": org.id, "state": {"$in": SUCCESSFUL_STATES}}},
+                {"$match": match_query},
                 {"$unwind": "$tags"},
                 {"$group": {"_id": "$tags", "count": {"$sum": 1}}},
                 {"$project": {"tag": "$_id", "count": "$count", "_id": 0}},
@@ -1094,10 +1103,20 @@ async def get_all_crawls_search_values(
     @app.get(
         "/orgs/{oid}/all-crawls/tagCounts",
         tags=["all-crawls"],
-        response_model=CrawlConfigTags,
+        response_model=TagsResponse,
    )
-    async def get_all_crawls_tag_counts(org: Organization = Depends(org_viewer_dep)):
-        return {"tags": await ops.get_all_crawls_tag_counts(org)}
+    async def get_all_crawls_tag_counts(
+        org: Organization = Depends(org_viewer_dep),
+        onlySuccessful: bool = True,
+        crawlType: Optional[str] = None,
+    ):
+        if crawlType and crawlType not in ("crawl", "upload"):
+            raise HTTPException(status_code=400, detail="invalid_crawl_type")
+
+        tags = await ops.get_all_crawls_tag_counts(
+            org, only_successful=onlySuccessful, type_=crawlType
+        )
+        return {"tags": tags}
 
     @app.get(
         "/orgs/{oid}/all-crawls/{crawl_id}",

backend/btrixcloud/crawlconfigs.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
     ConfigRevision,
     CrawlConfig,
     CrawlConfigOut,
-    CrawlConfigTags,
+    TagsResponse,
     CrawlOut,
     CrawlOutWithResources,
     UpdateCrawlConfig,
@@ -1641,7 +1641,7 @@ async def get_crawl_config_tags(org: Organization = Depends(org_viewer_dep)):
         """
         return await ops.get_crawl_config_tags(org)
 
-    @router.get("/tagCounts", response_model=CrawlConfigTags)
+    @router.get("/tagCounts", response_model=TagsResponse)
     async def get_crawl_config_tag_counts(org: Organization = Depends(org_viewer_dep)):
         return {"tags": await ops.get_crawl_config_tag_counts(org)}
 

backend/btrixcloud/crawls.py

Lines changed: 15 additions & 0 deletions
@@ -78,6 +78,7 @@
     CrawlQueueResponse,
     MatchCrawlQueueResponse,
     CrawlLogLine,
+    TagsResponse,
 )
 
 
@@ -1389,6 +1390,20 @@ async def delete_crawls(
             deleted=count, storageQuotaReached=quota_reached
         )
 
+    @app.get(
+        "/orgs/{oid}/crawls/tagCounts",
+        tags=["crawls"],
+        response_model=TagsResponse,
+    )
+    async def get_crawls_tag_counts(
+        org: Organization = Depends(org_viewer_dep),
+        onlySuccessful: bool = True,
+    ):
+        tags = await ops.get_all_crawls_tag_counts(
+            org, only_successful=onlySuccessful, type_="crawl"
+        )
+        return {"tags": tags}
+
     @app.get("/orgs/all/crawls/stats", tags=["crawls"], response_model=bytes)
     async def get_all_orgs_crawl_stats(
         user: User = Depends(user_dep),
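
Since this handler pins type_="crawl" on the shared op, GET /orgs/{oid}/crawls/tagCounts should return the same counts as /all-crawls/tagCounts?crawlType=crawl, with onlySuccessful behaving identically; compare the test expectations in test_uploads.py below.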

backend/btrixcloud/models.py

Lines changed: 5 additions & 5 deletions
@@ -603,18 +603,18 @@ class CrawlConfigAddedResponse(BaseModel):
 
 
 # ============================================================================
-class CrawlConfigTagCount(BaseModel):
-    """Response model for crawlconfig tag count"""
+class TagCount(BaseModel):
+    """Response model for crawlconfig/crawl tag count"""
 
     tag: str
     count: int
 
 
 # ============================================================================
-class CrawlConfigTags(BaseModel):
-    """Response model for crawlconfig tags"""
+class TagsResponse(BaseModel):
+    """Response model for crawlconfig/crawl tags"""
 
-    tags: List[CrawlConfigTagCount]
+    tags: List[TagCount]
 
 
 # ============================================================================
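
The rename is wire-compatible: only the Python class names change, not the serialized shape. A quick standalone sketch (pydantic v2 method names; use .json() on v1):

    from typing import List

    from pydantic import BaseModel

    # Standalone copies of the renamed models from the diff above.
    class TagCount(BaseModel):
        tag: str
        count: int

    class TagsResponse(BaseModel):
        tags: List[TagCount]

    resp = TagsResponse(tags=[TagCount(tag="qa", count=1)])
    print(resp.model_dump_json())  # {"tags":[{"tag":"qa","count":1}]}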

backend/btrixcloud/uploads.py

Lines changed: 14 additions & 0 deletions
@@ -28,6 +28,7 @@
     AddedResponseIdQuota,
     FilePreparer,
     MIN_UPLOAD_PART_SIZE,
+    TagsResponse,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
 from .utils import dt_now
@@ -362,6 +363,19 @@ async def list_uploads(
         )
         return paginated_format(uploads, total, page, pageSize)
 
+    @app.get(
+        "/orgs/{oid}/uploads/tagCounts",
+        tags=["uploads"],
+        response_model=TagsResponse,
+    )
+    async def get_uploads_tag_counts(
+        org: Organization = Depends(org_viewer_dep),
+    ):
+        tags = await ops.get_all_crawls_tag_counts(
+            org, only_successful=False, type_="upload"
+        )
+        return {"tags": tags}
+
     @app.get(
         "/orgs/{oid}/uploads/{crawlid}",
         tags=["uploads"],

backend/test/conftest.py

Lines changed: 62 additions & 0 deletions
@@ -31,6 +31,8 @@
 NON_DEFAULT_ORG_NAME = "Non-default org"
 NON_DEFAULT_ORG_SLUG = "non-default-org"
 
+RUNNING_STATES = ["running", "pending-wait", "generate-wacz", "uploading-wacz"]
+
 FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"]
 
 SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"]
@@ -266,6 +268,7 @@ def qa_crawl_id(crawler_auth_headers, default_org_id):
         "runNow": True,
         "name": "Crawler User Crawl for Testing QA",
         "description": "crawler test crawl for qa",
+        "tags": ["qa", "wr-test-1"],
         "config": {"seeds": [{"url": "https://old.webrecorder.net/"}], "limit": 1},
         "crawlerChannel": "test",
     }
@@ -295,6 +298,7 @@ def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
     crawl_data = {
         "runNow": True,
         "name": "Webrecorder Specs sample crawl",
+        "tags": ["wr-test-1"],
         "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
     }
     r = requests.post(
@@ -358,6 +362,7 @@ def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_id):
         "runNow": True,
         "name": "Auto Add",
         "description": "For testing auto-adding new workflow crawls to collections",
+        "tags": ["wr-test-1"],
         "autoAddCollections": [auto_add_collection_id],
         "config": {
             "seeds": [{"url": "https://old.webrecorder.net/"}],
@@ -399,6 +404,7 @@ def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
         "runNow": True,
         "name": "All Crawls Test Crawl",
         "description": "Lorem ipsum",
+        "tags": ["all-crawls", "wr-test-2"],
         "config": {
             "seeds": [{"url": "https://old.webrecorder.net/"}],
             "exclude": "community",
@@ -458,6 +464,7 @@ def all_crawls_delete_crawl_ids(admin_auth_headers, default_org_id):
         "runNow": True,
         "name": "All Crawls Delete Test Workflow",
         "description": "Lorem ipsum",
+        "tags": ["wr-test-1", "to-delete"],
         "config": {
             "seeds": [{"url": "https://old.webrecorder.net/"}],
             "exclude": "community",
@@ -520,6 +527,7 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
     crawl_data = {
         "runNow": True,
         "name": "Custom Behavior Logs",
+        "tags": ["behaviors", "wr-test-1"],
         "config": {
             "seeds": [{"url": "https://specs.webrecorder.net/"}],
             "customBehaviors": [
@@ -551,13 +559,67 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
     return crawl_id
 
 
+@pytest.fixture(scope="session")
+def canceled_crawl_id(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": True,
+        "name": "Canceled crawl",
+        "tags": ["canceled"],
+        "config": {
+            "seeds": [{"url": "https://old.webrecorder.net/"}],
+            "limit": 5,
+        },
+        "browserWindows": 1,
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+
+    # Cancel crawl after it's started
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in RUNNING_STATES:
+            break
+        time.sleep(5)
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["success"] == True
+
+    # Wait until crawl finishes
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in FINISHED_STATES:
+            break
+        time.sleep(5)
+
+    return crawl_id
+
+
 @pytest.fixture(scope="session")
 def url_list_config_id(crawler_auth_headers, default_org_id):
     # Start crawl.
     crawl_data = {
         "runNow": False,
         "name": "URL List config",
         "description": "Contains 3 seeds",
+        "tags": ["wr-test-1", "seed-list"],
         "config": {
             "seeds": [
                 {"url": "https://old.webrecorder.net"},

backend/test/test_uploads.py

Lines changed: 98 additions & 0 deletions
@@ -1065,6 +1065,104 @@ def test_clear_all_presigned_urls(
     assert r.json()["success"]
 
 
+def test_all_crawls_tag_counts(crawler_auth_headers, default_org_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "wr-test-1", "count": 3},
+            {"tag": "wr-test-2", "count": 2},
+            {"tag": "all-crawls", "count": 1},
+            {"tag": "behaviors", "count": 1},
+            {"tag": "four", "count": 1},
+            {"tag": "qa", "count": 1},
+            {"tag": "three", "count": 1},
+            {"tag": "wr-test-1-updated-again", "count": 1},
+            {"tag": "wr-test-2-updated-again", "count": 1},
+        ]
+    }
+
+
+def test_all_crawls_tag_counts_including_failed(
+    crawler_auth_headers, default_org_id, canceled_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts?onlySuccessful=false",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "wr-test-1", "count": 3},
+            {"tag": "wr-test-2", "count": 2},
+            {"tag": "all-crawls", "count": 1},
+            {"tag": "behaviors", "count": 1},
+            {"tag": "canceled", "count": 1},
+            {"tag": "four", "count": 1},
+            {"tag": "qa", "count": 1},
+            {"tag": "three", "count": 1},
+            {"tag": "wr-test-1-updated-again", "count": 1},
+            {"tag": "wr-test-2-updated-again", "count": 1},
+        ]
+    }
+
+
+def test_crawls_tag_counts(crawler_auth_headers, default_org_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "wr-test-1", "count": 3},
+            {"tag": "wr-test-2", "count": 2},
+            {"tag": "all-crawls", "count": 1},
+            {"tag": "behaviors", "count": 1},
+            {"tag": "qa", "count": 1},
+        ]
+    }
+
+
+def test_crawls_tag_counts_including_failed(
+    crawler_auth_headers, default_org_id, canceled_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts?onlySuccessful=false",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "wr-test-1", "count": 3},
+            {"tag": "wr-test-2", "count": 2},
+            {"tag": "all-crawls", "count": 1},
+            {"tag": "behaviors", "count": 1},
+            {"tag": "canceled", "count": 1},
+            {"tag": "qa", "count": 1},
+        ]
+    }
+
+
+def test_uploads_tag_counts(crawler_auth_headers, default_org_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/uploads/tagCounts",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "four", "count": 1},
+            {"tag": "three", "count": 1},
+            {"tag": "wr-test-1-updated-again", "count": 1},
+            {"tag": "wr-test-2-updated-again", "count": 1},
+        ]
+    }
+
+
 def test_delete_form_upload_and_crawls_from_all_crawls(
     admin_auth_headers,
     crawler_auth_headers,
