Skip to content

Commit 3cec487

Browse files
committed
Compute top page origins for each collection:
- compute as part of the existing collection stats computation
- store the top 10 results in the collection
- display in the collection About sidebar
1 parent bcb7393 commit 3cec487

File tree

6 files changed

+58
-0
lines changed

6 files changed

+58
-0
lines changed

backend/btrixcloud/colls.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,8 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
705705

706706
unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
707707

708+
top_page_origins = await self.page_ops.get_top_page_origins(crawl_ids)
709+
708710
await self.collections.find_one_and_update(
709711
{"_id": collection_id},
710712
{
@@ -715,6 +717,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
715717
"totalSize": total_size,
716718
"tags": sorted_tags,
717719
"preloadResources": preload_resources,
720+
"topPageOrigins": top_page_origins,
718721
}
719722
},
720723
)

backend/btrixcloud/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1423,6 +1423,14 @@ class PreloadResource(BaseModel):
14231423
crawlId: str
14241424

14251425

1426+
# ============================================================================
1427+
class OriginCount(BaseModel):
    """Page count for a single URL origin"""

    # URL origin shared by the counted pages
    origin: str

    # number of pages counted for that origin
    count: int
1432+
1433+
14261434
# ============================================================================
14271435
class Collection(BaseMongoModel):
14281436
"""Org collection structure"""
@@ -1521,6 +1529,8 @@ class CollOut(BaseMongoModel):
15211529
pagesQueryUrl: str = ""
15221530
downloadUrl: Optional[str] = None
15231531

1532+
topPageOrigins: List[OriginCount] = []
1533+
15241534

15251535
# ============================================================================
15261536
class PublicCollOut(BaseMongoModel):

backend/btrixcloud/pages.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,30 @@ async def get_unique_page_count(self, crawl_ids: List[str]) -> int:
923923
res = await cursor.to_list(1)
924924
return res[0].get("urls") if res else 0
925925

926+
async def get_top_page_origins(
927+
self, crawl_ids: List[str]
928+
) -> List[dict[str, str | int]]:
929+
"""Get count of top page origins across all archived items"""
930+
cursor = self.pages.aggregate(
931+
[
932+
{"$match": {"crawl_id": {"$in": crawl_ids}}},
933+
{
934+
"$addFields": {
935+
"origin": {
936+
"$regexFind": {
937+
"input": "$url",
938+
"regex": "^https?://([^/])+",
939+
}
940+
}
941+
}
942+
},
943+
{"$group": {"_id": "$origin.match", "count": {"$count": {}}}},
944+
{"$sort": {"count": -1}},
945+
]
946+
)
947+
res = await cursor.to_list(10)
948+
return [{"origin": x.get("_id"), "count": x.get("count")} for x in res]
949+
926950
async def set_archived_item_page_counts(self, crawl_id: str):
927951
"""Store archived item page and unique page counts in crawl document"""
928952
page_count = await self.pages.count_documents({"crawl_id": crawl_id})

frontend/src/layouts/collections/metadataColumn.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) {
5656
label: metadata.totalSize,
5757
render: (col) => `${localize.bytes(col.totalSize)}`,
5858
})}
59+
${metadataItem({
60+
label: metadata.topPageDomains,
61+
render: (col) =>
62+
html` <table>
63+
${col.topPageOrigins.map(
64+
(x) => html`
65+
<tr>
66+
<td>${x.origin}</td>
67+
<td class="pl-4">${x.count}</td>
68+
</tr>
69+
`,
70+
)}
71+
</table>`,
72+
})}
5973
</btrix-desc-list>
6074
`;
6175
}

frontend/src/strings/collections/metadata.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ export const metadata = {
55
uniquePageCount: msg("Unique Pages in Collection"),
66
pageCount: msg("Total Pages Crawled"),
77
totalSize: msg("Collection Size"),
8+
topPageDomains: msg("Top Page Domains"),
89
};

frontend/src/types/collection.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({
4141
crawlCount: z.number(),
4242
uniquePageCount: z.number(),
4343
pageCount: z.number(),
44+
topPageOrigins: z.array(
45+
z.object({
46+
origin: z.string(),
47+
count: z.number(),
48+
}),
49+
),
4450
totalSize: z.number(),
4551
allowPublicDownload: z.boolean(),
4652
homeUrl: z.string().url().nullable(),

0 commit comments

Comments
 (0)