diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
index ff9cc70959..c9d348b9dc 100644
--- a/backend/btrixcloud/colls.py
+++ b/backend/btrixcloud/colls.py
@@ -705,6 +705,8 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
 
         unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
 
+        top_page_hosts = await self.page_ops.get_top_page_hosts(crawl_ids)
+
         await self.collections.find_one_and_update(
             {"_id": collection_id},
             {
@@ -715,6 +717,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
                     "totalSize": total_size,
                     "tags": sorted_tags,
                     "preloadResources": preload_resources,
+                    "topPageHosts": top_page_hosts,
                 }
             },
         )
diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index 22df847515..510cef4991 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -32,7 +32,7 @@
 ) = PageOps = BackgroundJobOps = object
 
 
-CURR_DB_VERSION = "0043"
+CURR_DB_VERSION = "0044"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0044_coll_stats.py b/backend/btrixcloud/migrations/migration_0044_coll_stats.py
new file mode 100644
index 0000000000..ff6440a884
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0044_coll_stats.py
@@ -0,0 +1,44 @@
+"""
+Migration 0044 - Recalculate collection stats
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0044"
+
+
+# pylint: disable=duplicate-code
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.coll_ops = kwargs.get("coll_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recalculate collection stats to get top host names
+        """
+        colls_mdb = self.mdb["collections"]
+
+        if self.coll_ops is None:
+            print(
+                "Unable to set collection stats, missing coll_ops",
+                flush=True,
+            )
+            return
+
+        async for coll in colls_mdb.find({}):
+            coll_id = coll["_id"]
+            try:
+                await self.coll_ops.update_collection_counts_and_tags(coll_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Unable to update page stats for collection {coll_id}: {err}",
+                    flush=True,
+                )
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index fa333f832f..27405b451f 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -1417,6 +1417,14 @@ class PreloadResource(BaseModel):
     crawlId: str
 
 
+# ============================================================================
+class HostCount(BaseModel):
+    """Host Count"""
+
+    host: str
+    count: int
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1515,6 +1523,8 @@ class CollOut(BaseMongoModel):
     pagesQueryUrl: str = ""
     downloadUrl: Optional[str] = None
 
+    topPageHosts: List[HostCount] = []
+
 
 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -1550,6 +1560,8 @@ class PublicCollOut(BaseMongoModel):
 
     allowPublicDownload: bool = True
 
+    topPageHosts: List[HostCount] = []
+
 
 # ============================================================================
 class UpdateColl(BaseModel):
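For reference, `HostCount` is a plain pydantic model, so each `topPageHosts` entry serializes as a two-key object. A minimal standalone sketch of that shape (the `entry` value is illustrative; `.dict()` is pydantic v1-style serialization, `.model_dump()` on v2):

```python
from pydantic import BaseModel


class HostCount(BaseModel):
    """Host Count"""

    host: str
    count: int


# Illustrative entry, matching the shape asserted in the backend tests below
entry = HostCount(host="webrecorder.net", count=3)
print(entry.dict())  # {'host': 'webrecorder.net', 'count': 3}
```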
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 189c2108d9..07a8fda0db 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -923,6 +923,35 @@ async def get_unique_page_count(self, crawl_ids: List[str]) -> int:
         res = await cursor.to_list(1)
         return res[0].get("urls") if res else 0
 
+    async def get_top_page_hosts(
+        self, crawl_ids: List[str]
+    ) -> List[dict[str, str | int]]:
+        """Get count of top page hosts across all archived items"""
+        cursor = self.pages.aggregate(
+            [
+                {"$match": {"crawl_id": {"$in": crawl_ids}}},
+                {
+                    "$addFields": {
+                        "host": {
+                            "$regexFind": {
+                                "input": "$url",
+                                "regex": "^https?://([^/]+)",
+                            }
+                        }
+                    }
+                },
+                {
+                    "$group": {
+                        "_id": {"$first": "$host.captures"},
+                        "count": {"$count": {}},
+                    }
+                },
+                {"$sort": {"count": -1}},
+            ]
+        )
+        res = await cursor.to_list(10)
+        return [{"host": x.get("_id"), "count": x.get("count")} for x in res]
+
     async def set_archived_item_page_counts(self, crawl_id: str):
         """Store archived item page and unique page counts in crawl document"""
         page_count = await self.pages.count_documents({"crawl_id": crawl_id})
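The pipeline above matches pages in the collection's crawls, pulls the host out of each URL with `$regexFind` (the first capture group lands in `host.captures`), groups and counts per host, sorts descending, and `to_list(10)` caps the result at ten hosts. A rough pure-Python equivalent of those semantics, as a sketch (the `top_page_hosts` helper and its `urls` input are hypothetical; unlike the pipeline, it drops URLs that don't match the regex instead of grouping them under a null host):

```python
import re
from collections import Counter


def top_page_hosts(urls: list[str], limit: int = 10) -> list[dict[str, str | int]]:
    """Count pages per host, highest count first."""
    counts: Counter[str] = Counter()
    for url in urls:
        match = re.match(r"^https?://([^/]+)", url)
        if match:
            counts[match.group(1)] += 1
    return [{"host": host, "count": n} for host, n in counts.most_common(limit)]


print(top_page_hosts(["https://webrecorder.net/", "https://webrecorder.net/about"]))
# [{'host': 'webrecorder.net', 'count': 2}]
```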
data["dateEarliest"] assert data["dateLatest"] + assert data["topPageHosts"] == [{'count': 7, 'host': 'webrecorder.net'}] # Verify it was added r = requests.get( @@ -335,6 +339,7 @@ def test_add_remove_crawl_from_collection( assert data.get("tags", []) == [] assert data.get("dateEarliest") is None assert data.get("dateLatest") is None + assert data["topPageHosts"] == [] # Verify they were removed r = requests.get( @@ -366,6 +371,7 @@ def test_add_remove_crawl_from_collection( assert data["tags"] == ["wr-test-2", "wr-test-1"] assert data["dateEarliest"] assert data["dateLatest"] + assert data["topPageHosts"] def test_get_collection(crawler_auth_headers, default_org_id): @@ -1137,6 +1143,7 @@ def test_list_public_collections( assert collection["pageCount"] > 0 assert collection["uniquePageCount"] > 0 assert collection["totalSize"] > 0 + assert collection["topPageHosts"] # Test non-existing slug - it should return a 404 but not reveal # whether or not an org exists with that slug diff --git a/frontend/src/layouts/collections/metadataColumn.ts b/frontend/src/layouts/collections/metadataColumn.ts index 3207e92496..f687cf1ee9 100644 --- a/frontend/src/layouts/collections/metadataColumn.ts +++ b/frontend/src/layouts/collections/metadataColumn.ts @@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) { label: metadata.totalSize, render: (col) => `${localize.bytes(col.totalSize)}`, })} + ${metadataItem({ + label: metadata.topPageHosts, + render: (col) => + html` + ${col.topPageHosts.map( + (x) => html` + + + + + `, + )} +
diff --git a/frontend/src/layouts/collections/metadataColumn.ts b/frontend/src/layouts/collections/metadataColumn.ts
index 3207e92496..f687cf1ee9 100644
--- a/frontend/src/layouts/collections/metadataColumn.ts
+++ b/frontend/src/layouts/collections/metadataColumn.ts
@@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) {
       label: metadata.totalSize,
       render: (col) => `${localize.bytes(col.totalSize)}`,
     })}
+    ${metadataItem({
+      label: metadata.topPageHosts,
+      render: (col) =>
+        html`<table>
+          ${col.topPageHosts.map(
+            (x) => html`
+              <tr>
+                <td>${x.host}</td>
+                <td>${x.count}</td>
+              </tr>
+            `,
+          )}
+        </table>`,
+    })}
   `;
 }
`, + })} `; } diff --git a/frontend/src/strings/collections/metadata.ts b/frontend/src/strings/collections/metadata.ts index 39805563bd..258081c761 100644 --- a/frontend/src/strings/collections/metadata.ts +++ b/frontend/src/strings/collections/metadata.ts @@ -5,4 +5,5 @@ export const metadata = { uniquePageCount: msg("Unique Pages in Collection"), pageCount: msg("Total Pages Crawled"), totalSize: msg("Collection Size"), + topPageHosts: msg("Top Page Hostnames"), }; diff --git a/frontend/src/types/collection.ts b/frontend/src/types/collection.ts index 4bd6192fbb..c539e39ab9 100644 --- a/frontend/src/types/collection.ts +++ b/frontend/src/types/collection.ts @@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({ crawlCount: z.number(), uniquePageCount: z.number(), pageCount: z.number(), + topPageHosts: z.array( + z.object({ + host: z.string(), + count: z.number(), + }), + ), totalSize: z.number(), allowPublicDownload: z.boolean(), homeUrl: z.string().url().nullable(),