Skip to content

Commit 0a79261

Browse files
Breaking change: return metadata in page_to_text (#156)
* return metadata in page_to_text
* fix more tests, fix mypy error
* address comments
* address comments, add test
* Return full xpath (#157)
* remove early exit condition and return full xpath
* add test to make sure full xpath is returned
* reformat
* clean up nesting using ternary operator
* clean up getTagSymbol conditionals
1 parent 608a6a3 commit 0a79261

File tree

12 files changed

+893
-428
lines changed

12 files changed

+893
-428
lines changed

poetry.lock

Lines changed: 120 additions & 75 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "tarsier"
3-
version = "0.7.2"
3+
version = "0.8.0"
44
description = "Vision utilities for web interaction agents"
55
authors = ["Rohan Pandey", "Adam Watkins", "Asim Shrestha"]
66
readme = "README.md"
@@ -19,14 +19,14 @@ azure-ai-vision-imageanalysis = "^1.0.0b2"
1919
[tool.poetry.group.dev.dependencies]
2020
mypy = "^1.10.0"
2121
ruff = ">=0.4.8,<0.7.0"
22-
bananalyzer = "^0.9.17"
22+
bananalyzer = "0.10.8"
2323

2424

2525
[tool.poetry.group.test.dependencies]
2626
pytest = ">=7.4.3,<9.0.0"
2727
pytest-cov = ">=4.1,<6.0"
2828
pytest-playwright = ">=0.4.4,<0.6.0"
29-
pytest-asyncio = ">=0.21.2,<0.24.0"
29+
pytest-asyncio = ">=0.24.0,<0.25.0"
3030
nest-asyncio = "^1.6.0"
3131
webdriver-manager = "^4.0.1"
3232
pytest-mock = "^3.14.0"

tarsier-snapshots/poetry.lock

Lines changed: 124 additions & 139 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tarsier-snapshots/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,11 +7,11 @@ readme = "README.md"
77

88
[tool.poetry.dependencies]
99
python = "^3.11"
10-
bananalyzer = "0.9.12"
10+
bananalyzer = "0.10.8"
1111
playwright = "^1.41.1"
1212
python-dotenv = "^1.0.1"
1313
tarsier = { path = "../", develop = true }
14-
numpy = "^1.26.3"
14+
numpy = "2.1.0"
1515
tiktoken = "^0.6.0"
1616

1717

tarsier/core.py

Lines changed: 40 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,34 @@
11
from asyncio import Protocol
22
from pathlib import Path
3-
from typing import Dict, Tuple
3+
from typing import Tuple, Optional, TypedDict
44

55
from tarsier._utils import load_js
66
from tarsier.adapter import AnyDriver, BrowserAdapter, adapter_factory
77
from tarsier.ocr import OCRService
88
from tarsier.text_format import format_text
99

10-
TagToXPath = Dict[int, str]
10+
11+
class TagMetadata(TypedDict):
12+
tarsier_id: int
13+
element_name: str
14+
opening_tag_html: str
15+
xpath: str
16+
element_text: Optional[str]
17+
text_node_index: Optional[int]
18+
id_symbol: str
19+
id_string: str
1120

1221

1322
class ITarsier(Protocol):
14-
async def page_to_image(self, driver: AnyDriver) -> Tuple[bytes, Dict[int, str]]:
23+
async def page_to_image(self, driver: AnyDriver) -> Tuple[bytes, list[TagMetadata]]:
1524
raise NotImplementedError()
1625

17-
async def page_to_text(self, driver: AnyDriver) -> Tuple[str, Dict[int, str]]:
26+
async def page_to_text(self, driver: AnyDriver) -> Tuple[str, list[TagMetadata]]:
27+
raise NotImplementedError()
28+
29+
async def page_to_image_and_text(
30+
self, driver: AnyDriver
31+
) -> Tuple[bytes, str, list[TagMetadata]]:
1832
raise NotImplementedError()
1933

2034

@@ -31,10 +45,10 @@ async def page_to_image(
3145
tag_text_elements: bool = False,
3246
tagless: bool = False,
3347
keep_tags_showing: bool = False,
34-
) -> Tuple[bytes, TagToXPath]:
48+
) -> Tuple[bytes, list[TagMetadata]]:
3549
adapter = adapter_factory(driver)
3650
tag_to_xpath = (
37-
await self._tag_page(adapter, tag_text_elements) if not tagless else {}
51+
await self._tag_page(adapter, tag_text_elements) if not tagless else []
3852
)
3953
if tagless:
4054
await self._remove_tags(adapter)
@@ -44,15 +58,15 @@ async def page_to_image(
4458
if not keep_tags_showing:
4559
await self._remove_tags(adapter)
4660

47-
return screenshot, tag_to_xpath if not tagless else {}
61+
return screenshot, tag_to_xpath if not tagless else []
4862

4963
async def page_to_text(
5064
self,
5165
driver: AnyDriver,
5266
tag_text_elements: bool = False,
5367
tagless: bool = False,
5468
keep_tags_showing: bool = False,
55-
) -> Tuple[str, TagToXPath]:
69+
) -> Tuple[str, list[TagMetadata]]:
5670
image, tag_to_xpath = await self.page_to_image(
5771
driver, tag_text_elements, tagless, keep_tags_showing
5872
)
@@ -65,7 +79,7 @@ async def page_to_image_and_text(
6579
tag_text_elements: bool = False,
6680
tagless: bool = False,
6781
keep_tags_showing: bool = False,
68-
) -> Tuple[bytes, str, TagToXPath]:
82+
) -> Tuple[bytes, str, list[TagMetadata]]:
6983
image, tag_to_xpath = await self.page_to_image(
7084
driver, tag_text_elements, tagless, keep_tags_showing
7185
)
@@ -90,13 +104,26 @@ def _run_ocr(self, image: bytes) -> str:
90104

91105
async def _tag_page(
92106
self, adapter: BrowserAdapter, tag_text_elements: bool = False
93-
) -> Dict[int, str]:
107+
) -> list[TagMetadata]:
94108
await self._load_tarsier_utils(adapter)
95109

96110
script = f"return window.tagifyWebpage({str(tag_text_elements).lower()});"
97-
tag_to_xpath = await adapter.run_js(script)
98-
99-
return {int(key): value for key, value in tag_to_xpath.items()}
111+
tag_to_meta = await adapter.run_js(script)
112+
113+
tag_metadata_list = [
114+
TagMetadata(
115+
tarsier_id=meta["tarsierId"],
116+
element_name=meta["elementName"],
117+
opening_tag_html=meta["openingTagHTML"],
118+
xpath=meta["xpath"],
119+
element_text=meta.get("elementText"),
120+
text_node_index=meta.get("textNodeIndex"),
121+
id_symbol=meta["idSymbol"],
122+
id_string=meta["idString"],
123+
)
124+
for meta in tag_to_meta
125+
]
126+
return tag_metadata_list
100127

101128
async def _remove_tags(self, adapter: BrowserAdapter) -> None:
102129
await self._load_tarsier_utils(adapter)

0 commit comments

Comments
 (0)