# ============================================================================
class OriginCount(BaseModel):
    """Page tally for a single origin.

    Serialized as an entry of the ``topPageOrigins`` list on collection
    responses.
    """

    # Origin string the pages were grouped under
    origin: str

    # How many pages were counted for that origin
    count: int
get_unique_page_count(self, crawl_ids: List[str]) -> int: res = await cursor.to_list(1) return res[0].get("urls") if res else 0 + async def get_top_page_origins( + self, crawl_ids: List[str] + ) -> List[dict[str, str | int]]: + """Get count of top page origins across all archived items""" + cursor = self.pages.aggregate( + [ + {"$match": {"crawl_id": {"$in": crawl_ids}}}, + { + "$addFields": { + "origin": { + "$regexFind": { + "input": "$url", + "regex": "^https?://([^/]+)", + } + } + } + }, + {"$group": {"_id": "$origin.match", "count": {"$count": {}}}}, + {"$sort": {"count": -1}}, + ] + ) + res = await cursor.to_list(10) + return [{"origin": x.get("_id"), "count": x.get("count")} for x in res] + async def set_archived_item_page_counts(self, crawl_id: str): """Store archived item page and unique page counts in crawl document""" page_count = await self.pages.count_documents({"crawl_id": crawl_id}) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 6eb7c45946..f52c14e103 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -563,6 +563,10 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id): assert data["dateEarliest"] assert data["dateLatest"] assert data["defaultThumbnailName"] + assert data["topPageOrigins"] + for origin in data["topPageOrigins"]: + assert origin["origin"] + assert origin["count"] # Verify it was added r = requests.get( @@ -625,6 +629,10 @@ def test_list_collections( assert first_coll["dateEarliest"] assert first_coll["dateLatest"] assert first_coll["defaultThumbnailName"] + assert first_coll["topPageOrigins"] + for origin in first_coll["topPageOrigins"]: + assert origin["origin"] + assert origin["count"] second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0] assert second_coll["id"] @@ -641,6 +649,10 @@ def test_list_collections( assert second_coll["access"] == "private" assert second_coll["dateEarliest"] assert 
second_coll["dateLatest"] + assert second_coll["topPageOrigins"] + for origin in second_coll["topPageOrigins"]: + assert origin["origin"] + assert origin["count"] def test_list_pages_in_collection(crawler_auth_headers, default_org_id): @@ -1137,6 +1149,10 @@ def test_list_public_collections( assert collection["pageCount"] > 0 assert collection["uniquePageCount"] > 0 assert collection["totalSize"] > 0 + assert collection["topPageOrigins"] + for origin in collection["topPageOrigins"]: + assert origin["origin"] + assert origin["count"] # Test non-existing slug - it should return a 404 but not reveal # whether or not an org exists with that slug @@ -1329,6 +1345,10 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["pageCount"] > 0 assert coll["uniquePageCount"] > 0 assert coll["totalSize"] > 0 + assert coll["topPageOrigins"] + for origin in coll["topPageOrigins"]: + assert origin["origin"] + assert origin["count"] for field in non_public_fields: assert field not in coll @@ -1380,6 +1400,10 @@ def test_get_public_collection(default_org_id): assert coll["pageCount"] > 0 assert coll["uniquePageCount"] > 0 assert coll["totalSize"] > 0 + assert coll["topPageOrigins"] + for origin in coll["topPageOrigins"]: + assert origin["origin"] + assert origin["count"] for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll @@ -1462,6 +1486,10 @@ def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): assert coll["totalSize"] > 0 assert coll["defaultThumbnailName"] == "orange-default.avif" assert coll["allowPublicDownload"] + assert coll["topPageOrigins"] + for origin in coll["topPageOrigins"]: + assert origin["origin"] + assert origin["count"] for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll @@ -1504,6 +1532,10 @@ def test_get_public_collection_unlisted_org_profile_disabled( assert coll["totalSize"] > 0 assert coll["defaultThumbnailName"] == "orange-default.avif" assert coll["allowPublicDownload"] + assert coll["topPageOrigins"] + 
for origin in coll["topPageOrigins"]: + assert origin["origin"] + assert origin["count"] for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll diff --git a/frontend/src/layouts/collections/metadataColumn.ts b/frontend/src/layouts/collections/metadataColumn.ts index 3207e92496..ebccc9bdf1 100644 --- a/frontend/src/layouts/collections/metadataColumn.ts +++ b/frontend/src/layouts/collections/metadataColumn.ts @@ -56,6 +56,22 @@ export function metadataColumn(collection?: Collection | PublicCollection) { label: metadata.totalSize, render: (col) => `${localize.bytes(col.totalSize)}`, })} + ${when(collection?.topPageOrigins.length, () => + metadataItem({ + label: metadata.topPageDomains, + render: (col) => + html` + ${col.topPageOrigins.map( + (x) => html` + + + + + `, + )} +
${x.origin}${localize.number(x.count)}
`, + }), + )} `; } diff --git a/frontend/src/strings/collections/metadata.ts b/frontend/src/strings/collections/metadata.ts index 39805563bd..fd348d8ce9 100644 --- a/frontend/src/strings/collections/metadata.ts +++ b/frontend/src/strings/collections/metadata.ts @@ -5,4 +5,5 @@ export const metadata = { uniquePageCount: msg("Unique Pages in Collection"), pageCount: msg("Total Pages Crawled"), totalSize: msg("Collection Size"), + topPageDomains: msg("Top Page Domains"), }; diff --git a/frontend/src/types/collection.ts b/frontend/src/types/collection.ts index 4bd6192fbb..136f6e223c 100644 --- a/frontend/src/types/collection.ts +++ b/frontend/src/types/collection.ts @@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({ crawlCount: z.number(), uniquePageCount: z.number(), pageCount: z.number(), + topPageOrigins: z.array( + z.object({ + origin: z.string(), + count: z.number(), + }), + ), totalSize: z.number(), allowPublicDownload: z.boolean(), homeUrl: z.string().url().nullable(),