1
1
from asyncio import Protocol
2
2
from pathlib import Path
3
- from typing import Dict , Tuple
3
+ from typing import Tuple , Optional , TypedDict
4
4
5
5
from tarsier ._utils import load_js
6
6
from tarsier .adapter import AnyDriver , BrowserAdapter , adapter_factory
7
7
from tarsier .ocr import OCRService
8
8
from tarsier .text_format import format_text
9
9
10
- TagToXPath = Dict [int , str ]
10
+
11
class TagMetadata(TypedDict):
    """Metadata for one tagged element, as reported by the in-page tagging script.

    Instances are built in ``_tag_page`` from the records returned by
    ``window.tagifyWebpage``; each field mirrors the camelCase key noted
    beside it.
    """

    # From the record's "tarsierId" key.
    tarsier_id: int
    # From the record's "elementName" key.
    element_name: str
    # From the record's "openingTagHTML" key.
    opening_tag_html: str
    # From the record's "xpath" key.
    xpath: str
    # From "elementText"; None when the record does not carry that key.
    element_text: Optional[str]
    # From "textNodeIndex"; None when the record does not carry that key.
    text_node_index: Optional[int]
    # From the record's "idSymbol" key.
    id_symbol: str
    # From the record's "idString" key.
    id_string: str
11
20
12
21
13
22
class ITarsier(Protocol):
    """Interface for turning a browser page into an image and/or text.

    Each method also returns the list of ``TagMetadata`` entries for the
    page. The bodies here only raise ``NotImplementedError``; concrete
    implementations are expected to override them.
    """

    # NOTE(review): `Protocol` is imported from `asyncio` at the top of this
    # file, not from `typing` — confirm that base class is the intended one.

    async def page_to_image(
        self, driver: AnyDriver
    ) -> Tuple[bytes, list[TagMetadata]]:
        """Return the page image bytes together with the tag metadata list."""
        raise NotImplementedError()

    async def page_to_text(
        self, driver: AnyDriver
    ) -> Tuple[str, list[TagMetadata]]:
        """Return the page text together with the tag metadata list."""
        raise NotImplementedError()

    async def page_to_image_and_text(
        self, driver: AnyDriver
    ) -> Tuple[bytes, str, list[TagMetadata]]:
        """Return the page image bytes, the page text and the tag metadata list."""
        raise NotImplementedError()
19
33
20
34
@@ -31,10 +45,10 @@ async def page_to_image(
31
45
tag_text_elements : bool = False ,
32
46
tagless : bool = False ,
33
47
keep_tags_showing : bool = False ,
34
- ) -> Tuple [bytes , TagToXPath ]:
48
+ ) -> Tuple [bytes , list [ TagMetadata ] ]:
35
49
adapter = adapter_factory (driver )
36
50
tag_to_xpath = (
37
- await self ._tag_page (adapter , tag_text_elements ) if not tagless else {}
51
+ await self ._tag_page (adapter , tag_text_elements ) if not tagless else []
38
52
)
39
53
if tagless :
40
54
await self ._remove_tags (adapter )
@@ -44,15 +58,15 @@ async def page_to_image(
44
58
if not keep_tags_showing :
45
59
await self ._remove_tags (adapter )
46
60
47
- return screenshot , tag_to_xpath if not tagless else {}
61
+ return screenshot , tag_to_xpath if not tagless else []
48
62
49
63
async def page_to_text (
50
64
self ,
51
65
driver : AnyDriver ,
52
66
tag_text_elements : bool = False ,
53
67
tagless : bool = False ,
54
68
keep_tags_showing : bool = False ,
55
- ) -> Tuple [str , TagToXPath ]:
69
+ ) -> Tuple [str , list [ TagMetadata ] ]:
56
70
image , tag_to_xpath = await self .page_to_image (
57
71
driver , tag_text_elements , tagless , keep_tags_showing
58
72
)
@@ -65,7 +79,7 @@ async def page_to_image_and_text(
65
79
tag_text_elements : bool = False ,
66
80
tagless : bool = False ,
67
81
keep_tags_showing : bool = False ,
68
- ) -> Tuple [bytes , str , TagToXPath ]:
82
+ ) -> Tuple [bytes , str , list [ TagMetadata ] ]:
69
83
image , tag_to_xpath = await self .page_to_image (
70
84
driver , tag_text_elements , tagless , keep_tags_showing
71
85
)
@@ -90,13 +104,26 @@ def _run_ocr(self, image: bytes) -> str:
90
104
91
105
async def _tag_page(
    self, adapter: BrowserAdapter, tag_text_elements: bool = False
) -> list[TagMetadata]:
    """Run the in-page tagging script and convert its records to ``TagMetadata``.

    Args:
        adapter: Browser adapter used to execute JavaScript in the page.
        tag_text_elements: Forwarded to ``window.tagifyWebpage`` as its
            single boolean argument.

    Returns:
        One ``TagMetadata`` per record returned by the script, in the order
        the script returned them.

    Raises:
        KeyError: If a record lacks one of the required keys (every key
            except "elementText" and "textNodeIndex").
    """
    await self._load_tarsier_utils(adapter)

    # str(True) / str(False) lowercased gives a valid JavaScript boolean literal.
    script = f"return window.tagifyWebpage({str(tag_text_elements).lower()});"
    tag_to_meta = await adapter.run_js(script)

    # Translate the script's camelCase keys into the snake_case TypedDict.
    # The two optional keys are read with .get so a missing key becomes None.
    return [
        TagMetadata(
            tarsier_id=meta["tarsierId"],
            element_name=meta["elementName"],
            opening_tag_html=meta["openingTagHTML"],
            xpath=meta["xpath"],
            element_text=meta.get("elementText"),
            text_node_index=meta.get("textNodeIndex"),
            id_symbol=meta["idSymbol"],
            id_string=meta["idString"],
        )
        for meta in tag_to_meta
    ]
100
127
101
128
async def _remove_tags (self , adapter : BrowserAdapter ) -> None :
102
129
await self ._load_tarsier_utils (adapter )
0 commit comments