webrecorder · ikreymer · May 8, 2025 · Mar 14, 2025 · Mar 14, 2025 · May 7, 2025
diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
@@ -705,6 +705,8 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
 
         unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
 
+        top_page_hosts = await self.page_ops.get_top_page_hosts(crawl_ids)
+
         await self.collections.find_one_and_update(
             {"_id": collection_id},
             {
@@ -715,6 +717,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
                     "totalSize": total_size,
                     "tags": sorted_tags,
                     "preloadResources": preload_resources,
+                    "topPageHosts": top_page_hosts,
                 }
             },
         )

diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
@@ -32,7 +32,7 @@
     ) = PageOps = BackgroundJobOps = object
 
 
-CURR_DB_VERSION = "0043"
+CURR_DB_VERSION = "0044"
 
 
 # ============================================================================

diff --git a/backend/btrixcloud/migrations/migration_0044_coll_stats.py b/backend/btrixcloud/migrations/migration_0044_coll_stats.py
@@ -0,0 +1,53 @@
+"""
+Migration 0044 - Recalculate collection stats
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0044"
+
+
+# pylint: disable=duplicate-code
+class Migration(BaseMigration):
+    """Recalculate stored stats for every collection.
+
+    Re-running the stats update populates ``topPageHosts`` on collections
+    that were saved before that field was computed.
+    """
+
+    def __init__(self, mdb, **kwargs):
+        """Keep the optional ``coll_ops`` keyword argument for migrate_up."""
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.coll_ops = kwargs.get("coll_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recalculate collection stats to get top host names. A failure on
+        one collection is logged and the remaining ones are still processed.
+        """
+        if self.coll_ops is None:
+            print(
+                "Unable to set collection stats, missing coll_ops",
+                flush=True,
+            )
+            return
+
+        colls_mdb = self.mdb["collections"]
+
+        # Only the ids are needed, so fetch just those, and read them all
+        # before starting: each update can be slow, and a server-side cursor
+        # left idle between batches may time out.
+        coll_ids = [coll["_id"] async for coll in colls_mdb.find({}, {"_id": 1})]
+
+        for coll_id in coll_ids:
+            try:
+                await self.coll_ops.update_collection_counts_and_tags(coll_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Unable to update page stats for collection {coll_id}: {err}",
+                    flush=True,
+                )
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
@@ -1417,6 +1417,14 @@ class PreloadResource(BaseModel):
     crawlId: str
 
 
+# ============================================================================
+class HostCount(BaseModel):
+    """Number of pages (count) whose URL belongs to a given host"""
+
+    host: str
+    count: int
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1515,6 +1523,8 @@ class CollOut(BaseMongoModel):
     pagesQueryUrl: str = ""
     downloadUrl: Optional[str] = None
 
+    topPageHosts: List[HostCount] = []
+
 
 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -1550,6 +1560,8 @@ class PublicCollOut(BaseMongoModel):
 
     allowPublicDownload: bool = True
 
+    topPageHosts: List[HostCount] = []
+
 
 # ============================================================================
 class UpdateColl(BaseModel):

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
@@ -923,6 +923,51 @@ async def get_unique_page_count(self, crawl_ids: List[str]) -> int:
         res = await cursor.to_list(1)
         return res[0].get("urls") if res else 0
 
+    async def get_top_page_hosts(
+        self, crawl_ids: List[str]
+    ) -> List[dict[str, str | int]]:
+        """Get count of top page hosts across all archived items
+
+        Returns at most 10 {"host", "count"} dicts, highest count first
+        (ties ordered by host). Pages whose URL does not start with
+        http:// or https:// are not counted.
+        """
+        if not crawl_ids:
+            return []
+
+        max_hosts = 10
+        cursor = self.pages.aggregate(
+            [
+                {"$match": {"crawl_id": {"$in": crawl_ids}}},
+                {
+                    "$addFields": {
+                        "host": {
+                            "$regexFind": {
+                                "input": "$url",
+                                "regex": "^https?://([^/]+)",
+                            }
+                        }
+                    }
+                },
+                # $regexFind yields null for URLs that do not match; drop
+                # them so they are not grouped under a null host, which
+                # HostCount (host: str) cannot represent.
+                {"$match": {"host": {"$ne": None}}},
+                {
+                    "$group": {
+                        "_id": {"$first": "$host.captures"},
+                        "count": {"$count": {}},
+                    }
+                },
+                # Secondary key keeps the order stable when counts tie.
+                {"$sort": {"count": -1, "_id": 1}},
+                # Limit on the server so only the top hosts are returned.
+                {"$limit": max_hosts},
+            ]
+        )
+        res = await cursor.to_list(max_hosts)
+        return [{"host": x.get("_id"), "count": x.get("count")} for x in res]
+
     async def set_archived_item_page_counts(self, crawl_id: str):
         """Store archived item page and unique page counts in crawl document"""
         page_count = await self.pages.count_documents({"crawl_id": crawl_id})

diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
@@ -94,6 +94,8 @@ def test_create_collection(
     assert data["defaultThumbnailName"] == default_thumbnail_name
     assert data["allowPublicDownload"]
 
+    assert data["topPageHosts"] == [{'count': 3, 'host': 'webrecorder.net'}]
+
 
 def test_create_public_collection(
     crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
@@ -223,6 +225,7 @@ def test_update_collection(
     assert data["dateEarliest"]
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
+    assert data["topPageHosts"]
 
 
 def test_rename_collection(
@@ -310,6 +313,7 @@ def test_add_remove_crawl_from_collection(
     assert data["tags"] == ["wr-test-2", "wr-test-1"]
     assert data["dateEarliest"]
     assert data["dateLatest"]
+    assert data["topPageHosts"] == [{'count': 7, 'host': 'webrecorder.net'}]
 
     # Verify it was added
     r = requests.get(
@@ -335,6 +339,7 @@ def test_add_remove_crawl_from_collection(
     assert data.get("tags", []) == []
     assert data.get("dateEarliest") is None
     assert data.get("dateLatest") is None
+    assert data["topPageHosts"] == []
 
     # Verify they were removed
     r = requests.get(
@@ -366,6 +371,7 @@ def test_add_remove_crawl_from_collection(
     assert data["tags"] == ["wr-test-2", "wr-test-1"]
     assert data["dateEarliest"]
     assert data["dateLatest"]
+    assert data["topPageHosts"]
 
 
 def test_get_collection(crawler_auth_headers, default_org_id):
@@ -1137,6 +1143,7 @@ def test_list_public_collections(
         assert collection["pageCount"] > 0
         assert collection["uniquePageCount"] > 0
         assert collection["totalSize"] > 0
+        assert collection["topPageHosts"]
 
     # Test non-existing slug - it should return a 404 but not reveal
     # whether or not an org exists with that slug

diff --git a/frontend/src/layouts/collections/metadataColumn.ts b/frontend/src/layouts/collections/metadataColumn.ts
@@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) {
         label: metadata.totalSize,
         render: (col) => `${localize.bytes(col.totalSize)}`,
       })}
+      ${metadataItem({
+        label: metadata.topPageHosts,
+        render: (col) =>
+          html` <table>
+            ${col.topPageHosts.map(
+              (x) => html`
+                <tr>
+                  <td>${x.host}</td>
+                  <td class="pl-4">${x.count}</td>
+                </tr>
+              `,
+            )}
+          </table>`,
+      })}
     </btrix-desc-list>
   `;
 }
diff --git a/frontend/src/strings/collections/metadata.ts b/frontend/src/strings/collections/metadata.ts
@@ -5,4 +5,5 @@ export const metadata = {
   uniquePageCount: msg("Unique Pages in Collection"),
   pageCount: msg("Total Pages Crawled"),
   totalSize: msg("Collection Size"),
+  topPageHosts: msg("Top Page Hostnames"),
 };
diff --git a/frontend/src/types/collection.ts b/frontend/src/types/collection.ts
@@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({
   crawlCount: z.number(),
   uniquePageCount: z.number(),
   pageCount: z.number(),
+  topPageHosts: z.array(
+    z.object({
+      host: z.string(),
+      count: z.number(),
+    }),
+  ),
   totalSize: z.number(),
   allowPublicDownload: z.boolean(),
   homeUrl: z.string().url().nullable(),