Skip to content

Commit 608a6a3

Browse files
🏷️ Tag images using % symbol (#151)
1 parent d4d670b commit 608a6a3

File tree

8 files changed

+156
-5
lines changed

8 files changed

+156
-5
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "tarsier"
3-
version = "0.7.1"
3+
version = "0.7.2"
44
description = "Vision utilities for web interaction agents"
55
authors = ["Rohan Pandey", "Adam Watkins", "Asim Shrestha"]
66
readme = "README.md"

tarsier/tag_utils.ts

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ const isTextNodeAValidWord = (child: ChildNode) => {
8585
return trimmedWord && (trimmedWord.match(/\w/) || trimmedWord.length > 3); // Regex matches any character, number, or _
8686
};
8787

88+
/**
 * Reports whether an element is an `<img>` element.
 *
 * The tag name is lower-cased before the comparison, so the check does not
 * depend on the casing the DOM reports for `tagName`.
 *
 * @param el - Element to inspect.
 * @returns `true` when the element's tag name is `img`, otherwise `false`.
 */
const isImageElement = (el: HTMLElement): boolean => {
  return el.tagName.toLowerCase() === "img";
};
91+
8892
const inputs = ["a", "button", "textarea", "select", "details", "label"];
8993
const isInteractable = (el: HTMLElement) => {
9094
// If it is a label but has an input child that it is a label for, say not interactable
@@ -233,6 +237,8 @@ function create_tagged_span(idNum: number, el: HTMLElement) {
233237
if (isTextInsertable(el)) idStr = `[#${idNum}]`;
234238
else if (el.tagName.toLowerCase() == "a") idStr = `[@${idNum}]`;
235239
else idStr = `[$${idNum}]`;
240+
} else if (isImageElement(el)) {
241+
idStr = `[%${idNum}]`;
236242
} else {
237243
idStr = `[${idNum}]`;
238244
}
@@ -343,11 +349,11 @@ function getElementsToTag(
343349
const elementsToTag: HTMLElement[] = [];
344350

345351
for (let el of allElements) {
346-
if (isTextLess(el) || !elIsVisible(el)) {
352+
if ((isTextLess(el) && !isImageElement(el)) || !elIsVisible(el)) {
347353
continue;
348354
}
349355

350-
if (isInteractable(el)) {
356+
if (isInteractable(el) || isImageElement(el)) {
351357
elementsToTag.push(el);
352358
} else if (tagLeafTexts) {
353359
// Append the parent tag as it may have multiple individual child nodes with text
@@ -480,6 +486,14 @@ function insertTags(
480486
absolutelyPositionTagIfMisaligned(idSpan, insertionElement);
481487
}
482488
idNum++;
489+
} else if (isImageElement(el)) {
490+
// Handle image elements
491+
const idSpan = create_tagged_span(idNum, el);
492+
if (el.parentElement) {
493+
el.parentElement.insertBefore(idSpan, el);
494+
absolutelyPositionTagIfMisaligned(idSpan, el);
495+
}
496+
idNum++;
483497
} else if (tagLeafTexts) {
484498
trimTextNodeStart(el);
485499
const validTextNodes = Array.from(el.childNodes).filter(

tests/mock_html/different_image_sizes.html

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<img id="small" src="https://placehold.co/60x60?text=+" alt="Small Image" />
9+
<img
10+
id="medium"
11+
src="https://placehold.co/250x250?text=+"
12+
alt="Medium Image"
13+
/>
14+
<img
15+
id="large"
16+
src="https://placehold.co/600x600?text=+"
17+
alt="Large Image"
18+
/>
19+
</body>
20+
</html>

tests/mock_html/hidden_image.html

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<title>Title</title>
6+
<style>
7+
.hidden {
8+
display: none;
9+
}
10+
.visible {
11+
margin-left: 40px;
12+
}
13+
</style>
14+
</head>
15+
<body>
16+
<img
17+
src="https://placehold.co/100x100?text=+"
18+
alt="Visible Image"
19+
class="visible"
20+
id="visible-image"
21+
/>
22+
<img
23+
src="https://placehold.co/100x100?text='"
24+
alt="Hidden Image"
25+
class="hidden"
26+
id="hidden-image"
27+
/>
28+
</body>
29+
</html>

tests/mock_html/image_and_text.html

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<div>
9+
<img
10+
src="https://placehold.co/200x200?text=`"
11+
alt="An image"
12+
style="float: left; margin-right: 10px"
13+
/>
14+
<p>Some text next to an image</p>
15+
</div>
16+
</body>
17+
</html>

tests/mock_html/image_inside_button.html

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<button id="image-button">
9+
<img src="https://placehold.co/200x200?text=`" alt="Button Image" />
10+
</button>
11+
</body>
12+
</html>

tests/mock_html/image_inside_link.html

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<a href="http://example.com" id="link1">
9+
<img
10+
src="https://placehold.co/100x100?text='"
11+
alt="Linked Image"
12+
id="image1"
13+
/>
14+
</a>
15+
</body>
16+
</html>

tests/test_elements.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@
7777
),
7878
(
7979
"image.html",
80-
{},
80+
{0: "//html/body/img"},
8181
["Hello World"],
82-
[],
82+
["[ % 0 ]"],
8383
),
8484
pytest.param(
8585
"japanese.html",
@@ -152,6 +152,49 @@
152152
["This is some text content inside the iframe"],
153153
["[ 0 ]"],
154154
),
155+
(
156+
"image_inside_button.html",
157+
{
158+
0: '//html/body/button[@id="image-button"]',
159+
},
160+
[],
161+
["[ $ 0 ]"],
162+
),
163+
(
164+
"image_and_text.html",
165+
{
166+
0: "//html/body/div/img",
167+
1: "//html/body/div/p",
168+
},
169+
["Some text next to an image"],
170+
["[ % 0 ]", "[ 1 ]"],
171+
),
172+
(
173+
"different_image_sizes.html",
174+
{
175+
0: '//html/body/img[1][@id="small"]',
176+
1: '//html/body/img[2][@id="medium"]',
177+
2: '//html/body/img[3][@id="large"]',
178+
},
179+
[],
180+
["[ % 0 ]", "[ % 1 ]", "[ % 2 ]"],
181+
),
182+
(
183+
"hidden_image.html",
184+
{
185+
0: '//html/body/img[1][@id="visible-image"]',
186+
},
187+
[],
188+
["[ % 0 ]"],
189+
),
190+
(
191+
"image_inside_link.html",
192+
{
193+
0: '//html/body/a[@id="link1"]',
194+
},
195+
[],
196+
["[ @ 0 ]"],
197+
),
155198
],
156199
)
157200
async def test_combined_elements_page(

0 commit comments

Comments
 (0)