Skip to content

Commit d4d670b

Browse files
♻️ Added new context manager function to reduce code duplication (#148)
* ♻️ Added new context manager function to reduce code duplication
* address comments
* fix naming issue
1 parent 4e94453 commit d4d670b

File tree

5 files changed

+103
-107
lines changed

5 files changed

+103
-107
lines changed

tests/conftest.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import pytest
66
import pytest_asyncio
77
from dotenv import load_dotenv
8+
from contextlib import asynccontextmanager
89
from playwright.async_api import async_playwright
910
from playwright.sync_api import sync_playwright
1011
from selenium import webdriver
@@ -13,6 +14,8 @@
1314

1415
from tarsier import GoogleVisionOCRService, Tarsier, MicrosoftAzureOCRService
1516

17+
test_html_folder = "mock_html"
18+
1619
IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
1720

1821
if not IN_GITHUB_ACTIONS or os.getenv("NODE_ENV") == "development":
@@ -93,3 +96,26 @@ def tarsier(credentials):
9396
raise ValueError("Invalid OCR provider")
9497

9598
yield Tarsier(ocr_service)
99+
100+
101+
@asynccontextmanager
async def page_context_manager(html_file: str):
    """Open a mock HTML file in a fresh Chromium page and yield that page.

    Args:
        html_file: File name relative to the ``test_html_folder`` directory
            located next to this module.

    Yields:
        The Playwright page after it has navigated to the file's ``file://``
        URL.

    The page and the browser are closed when the ``async with`` block exits,
    whether or not the body raised.
    """
    # Local import: pathlib is only needed here to build the file URL.
    from pathlib import Path

    # Construct the absolute file path
    html_file_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), test_html_folder, html_file)
    )
    # Path.as_uri() percent-encodes spaces/reserved characters and handles
    # Windows drive letters, which a plain f"file://{path}" does not.
    file_url = Path(html_file_path).as_uri()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=IN_GITHUB_ACTIONS)
        try:
            page = await browser.new_page()
            try:
                await page.goto(file_url)
                yield page
            finally:
                await page.close()
        finally:
            # Runs even when new_page() or page.close() raised, so the
            # browser process is never left behind.
            await browser.close()
117+
118+
119+
@pytest.fixture
def page_context():
    """Expose ``page_context_manager`` to tests as a fixture.

    Returns the context-manager factory itself (not an opened page); a test
    calls it with an HTML file name inside ``async with`` to obtain the page.
    """
    factory = page_context_manager
    return factory

tests/test_artifact_removal.py

Lines changed: 3 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,4 @@
1-
import os
2-
31
import pytest
4-
from playwright.async_api import async_playwright
52

63
example_data = [
74
{
@@ -34,21 +31,12 @@ def create_tarsier_functions(tarsier):
3431

3532
@pytest.mark.parametrize("data", example_data)
3633
@pytest.mark.asyncio
37-
async def test_artifact_removal(data, tarsier):
34+
async def test_artifact_removal(data, tarsier, page_context):
3835
file_name = data["file_name"]
3936
artifact_selectors = data["artifact_selectors"]
4037

41-
html_file_path = os.path.abspath(
42-
os.path.join(os.path.dirname(__file__), "mock_html", file_name)
43-
)
44-
45-
tarsier_functions = create_tarsier_functions(tarsier)
46-
47-
async with async_playwright() as p:
48-
browser = await p.chromium.launch(headless=True)
49-
page = await browser.new_page()
50-
51-
await page.goto(f"file://{html_file_path}")
38+
async with page_context(file_name) as page:
39+
tarsier_functions = create_tarsier_functions(tarsier)
5240

5341
for tarsier_func, cleanup_funcs in tarsier_functions:
5442
await tarsier_func(page)
@@ -72,5 +60,3 @@ async def test_artifact_removal(data, tarsier):
7260
f"Tarsier artifact '{selector}' still exists in file: {file_name} "
7361
f"after applying cleanup functions"
7462
)
75-
76-
await browser.close()

tests/test_elements.py

Lines changed: 64 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -12,19 +12,19 @@
1212
"html_file, expected_tag_to_xpath, expected_page_text, expected_tag_string",
1313
[
1414
(
15-
"mock_html/text_only.html",
15+
"text_only.html",
1616
{0: "//html/body/h1"},
1717
["Hello, World!"],
1818
["[ 0 ]"],
1919
),
2020
(
21-
"mock_html/hyperlink_only.html",
21+
"hyperlink_only.html",
2222
{0: '//html/body/p/a[@id="link1"]'},
2323
["Example Link 1"],
2424
["[ @ 0 ]"],
2525
),
2626
(
27-
"mock_html/interactable_only.html",
27+
"interactable_only.html",
2828
{
2929
0: '//html/body/button[@id="button"]',
3030
1: '//html/body/input[@id="checkbox"]',
@@ -33,7 +33,7 @@
3333
["[ $ 0 ]", "[ $ 1 ]"],
3434
),
3535
(
36-
"mock_html/combination.html",
36+
"combination.html",
3737
{
3838
0: '//html/body/input[1][@id="text"]',
3939
1: '//html/body/input[2][@id="checkbox"]',
@@ -43,13 +43,13 @@
4343
["[ # 0 ]", "[ $ 1 ]", "[ 2 ]"],
4444
),
4545
(
46-
"mock_html/insertable_only.html",
46+
"insertable_only.html",
4747
{0: '//html/body/input[@id="text"]'},
4848
["Enter text here"],
4949
["[ # 0 ]"],
5050
),
5151
(
52-
"mock_html/br_elem.html",
52+
"br_elem.html",
5353
{
5454
0: "//html/body/div",
5555
1: "//html/body/div",
@@ -59,15 +59,15 @@
5959
["[ 0 ]", "[ 1 ]", "[ 2 ]"],
6060
),
6161
(
62-
"mock_html/display_contents.html",
62+
"display_contents.html",
6363
{
6464
0: "//html/body/div",
6565
},
6666
["Display contents"],
6767
["[ 0 ]"],
6868
),
6969
(
70-
"mock_html/icon_buttons.html",
70+
"icon_buttons.html",
7171
{
7272
0: "//html/body/button[1]",
7373
1: "//html/body/button[2]",
@@ -76,13 +76,13 @@
7676
["[ $ 0 ]", "[ $ 1 ]"],
7777
),
7878
(
79-
"mock_html/image.html",
79+
"image.html",
8080
{},
8181
["Hello World"],
8282
[],
8383
),
8484
pytest.param(
85-
"mock_html/japanese.html",
85+
"japanese.html",
8686
{
8787
0: '//html/body/p[@id="japanese"]',
8888
},
@@ -93,7 +93,7 @@
9393
),
9494
),
9595
pytest.param(
96-
"mock_html/russian.html",
96+
"russian.html",
9797
{
9898
0: '//html/body/p[@id="russian"]',
9999
},
@@ -104,7 +104,7 @@
104104
),
105105
),
106106
pytest.param(
107-
"mock_html/chinese.html",
107+
"chinese.html",
108108
{
109109
0: '//html/body/p[@id="chinese"]',
110110
},
@@ -115,7 +115,7 @@
115115
),
116116
),
117117
pytest.param(
118-
"mock_html/arabic.html",
118+
"arabic.html",
119119
{
120120
0: '//html/body/p[@id="arabic"]',
121121
},
@@ -126,7 +126,7 @@
126126
),
127127
),
128128
pytest.param(
129-
"mock_html/hindi.html",
129+
"hindi.html",
130130
{
131131
0: '//html/body/p[@id="hindi"]',
132132
},
@@ -137,15 +137,15 @@
137137
),
138138
),
139139
(
140-
"mock_html/dropdown.html",
140+
"dropdown.html",
141141
{
142142
0: "//html/body/label",
143143
},
144144
["Option 1"],
145145
["[ $ 0 ]"],
146146
),
147147
(
148-
"mock_html/iframe.html",
148+
"iframe.html",
149149
{
150150
0: "iframe[0]//html/body/p",
151151
},
@@ -156,74 +156,68 @@
156156
)
157157
async def test_combined_elements_page(
158158
tarsier,
159-
async_page,
159+
page_context,
160160
html_file,
161161
expected_tag_to_xpath,
162162
expected_page_text,
163163
expected_tag_string,
164164
):
165-
html_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), html_file))
166-
await async_page.goto(f"file://{html_file_path}")
167-
168-
page_text, tag_to_xpath = await tarsier.page_to_text(
169-
async_page, tag_text_elements=True
170-
)
165+
async with page_context(html_file) as page:
166+
page_text, tag_to_xpath = await tarsier.page_to_text(
167+
page, tag_text_elements=True
168+
)
171169

172-
assert tag_to_xpath == expected_tag_to_xpath, (
173-
f"tag_to_xpath does not match expected output for "
174-
f"{html_file}. Got: {tag_to_xpath}"
175-
)
170+
assert tag_to_xpath == expected_tag_to_xpath, (
171+
f"tag_to_xpath does not match expected output for "
172+
f"{html_file}. Got: {tag_to_xpath}"
173+
)
176174

177-
# TODO: revert to testing against entire string when colour tagging is merged
178-
for expected_text in expected_page_text:
179-
normalized_expected_text = "".join(
180-
expected_text.split()
181-
) # Remove whitespace from expected text
182-
page_text_combined = "".join(page_text).replace(" ", "")
175+
# TODO: revert to testing against entire string when colour tagging is merged
176+
for expected_text in expected_page_text:
177+
normalized_expected_text = "".join(
178+
expected_text.split()
179+
) # Remove whitespace from expected text
180+
page_text_combined = "".join(page_text).replace(" ", "")
183181

184-
assert all(char in page_text_combined for char in normalized_expected_text), (
185-
f"Expected text '{expected_text}' not found in page text for {html_file}. "
186-
f"Got: {page_text}"
187-
)
182+
assert all(
183+
char in page_text_combined for char in normalized_expected_text
184+
), (
185+
f"Expected text '{expected_text}' not found in page text for {html_file}. "
186+
f"Got: {page_text}"
187+
)
188188

189-
for expected_tag in expected_tag_string:
190-
assert expected_tag in page_text, (
191-
f"Expected tag '{expected_tag}' not found in page text for {html_file}. "
192-
f"Got: {page_text}"
193-
)
189+
for expected_tag in expected_tag_string:
190+
assert expected_tag in page_text, (
191+
f"Expected tag '{expected_tag}' not found in page text for {html_file}. "
192+
f"Got: {page_text}"
193+
)
194194

195195

196196
@pytest.mark.asyncio
197-
async def test_text_nodes_are_query_selectable(async_page):
198-
text_node_html_path = os.path.abspath(
199-
os.path.join(os.path.dirname(__file__), "mock_html/text_nodes.html")
200-
)
201-
await async_page.goto(f"file://{text_node_html_path}")
202-
tarsier = Tarsier(DummyOCRService())
203-
_, tag_to_xpath = await tarsier.page_to_text(async_page, tag_text_elements=True)
197+
async def test_text_nodes_are_query_selectable(page_context):
198+
async with page_context("text_nodes.html") as page:
199+
tarsier = Tarsier(DummyOCRService())
200+
_, tag_to_xpath = await tarsier.page_to_text(page, tag_text_elements=True)
204201

205-
# Query selector will specifically filter out TextNodes within XPath selectors
206-
# As a result, the tagged xpath for the text node should belong to the parent
207-
# https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/injected/xpathSelectorEngine.ts#L29-L30
208-
assert len(tag_to_xpath) == 2
209-
assert await async_page.query_selector(tag_to_xpath[0])
210-
assert await async_page.query_selector(tag_to_xpath[1])
202+
# Query selector will specifically filter out TextNodes within XPath selectors
203+
# As a result, the tagged xpath for the text node should belong to the parent
204+
# https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/injected/xpathSelectorEngine.ts#L29-L30
205+
assert len(tag_to_xpath) == 2
206+
assert await page.query_selector(tag_to_xpath[0])
207+
assert await page.query_selector(tag_to_xpath[1])
211208

212209

213210
@pytest.mark.asyncio
214-
async def test_dropdown_text_not_shown(tarsier, async_page):
215-
dropdown_html_path = os.path.abspath(
216-
os.path.join(os.path.dirname(__file__), "mock_html/dropdown.html")
217-
)
218-
await async_page.goto(f"file://{dropdown_html_path}")
219-
page_text, tag_to_xpath = await tarsier.page_to_text(
220-
async_page, tag_text_elements=True
221-
)
211+
async def test_dropdown_text_not_shown(tarsier, page_context):
212+
async with page_context("dropdown.html") as page:
213+
page_text, tag_to_xpath = await tarsier.page_to_text(
214+
page, tag_text_elements=True
215+
)
222216

223-
assert "[ $ 1 ]" not in page_text
224-
assert "[ $ 2 ]" not in page_text
225-
assert "[ $ 3 ]" not in page_text
226-
assert "[ $ 4 ]" not in page_text
227-
assert "Option 2" not in page_text
228-
assert "Option 3" not in page_text
229-
assert "Option 4" not in page_text
217+
assert "[ $ 1 ]" not in page_text
218+
assert "[ $ 2 ]" not in page_text
219+
assert "[ $ 3 ]" not in page_text
220+
assert "[ $ 4 ]" not in page_text
221+
assert "Option 2" not in page_text
222+
assert "Option 3" not in page_text
223+
assert "Option 4" not in page_text

tests/test_namespace.py

Lines changed: 5 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
import os
2-
31
import pytest
42
import pytest_asyncio
53
from playwright.async_api import Page
@@ -60,12 +58,8 @@ async def test_fix_namespaces(browser_adapter_with_js):
6058

6159

6260
@pytest.mark.asyncio
63-
async def test_xpath_namespace(tarsier, async_page):
64-
html_with_namespace_path = os.path.abspath(
65-
os.path.join(os.path.dirname(__file__), "mock_html/namespace.html")
66-
)
67-
await async_page.goto(f"file://{html_with_namespace_path}")
68-
69-
_, tag_to_xpath = await tarsier.page_to_text(async_page, tag_text_elements=True)
70-
assert len(tag_to_xpath) == 1, "The page contains only a single tag"
71-
assert tag_to_xpath[0] == '//html/body/*[name()="sc:visitoridentification"]/div'
61+
async def test_xpath_namespace(tarsier, page_context):
62+
async with page_context("namespace.html") as page:
63+
_, tag_to_xpath = await tarsier.page_to_text(page, tag_text_elements=True)
64+
assert len(tag_to_xpath) == 1, "The page contains only a single tag"
65+
assert tag_to_xpath[0] == '//html/body/*[name()="sc:visitoridentification"]/div'

tests/test_text_formatting.py

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
import os
21
import pytest
32

43

@@ -13,14 +12,11 @@
1312
("xx_large.html", ["**XXLarge**"]),
1413
],
1514
)
16-
async def test_font_formatting(tarsier, async_page, html_file, expected_text_content):
17-
font_formatting_html_path = os.path.abspath(
18-
os.path.join(os.path.dirname(__file__), f"mock_html/{html_file}")
19-
)
20-
await async_page.goto(f"file://{font_formatting_html_path}")
21-
page_text, tag_to_xpath = await tarsier.page_to_text(
22-
async_page, tagless=True, tag_text_elements=True
23-
)
15+
async def test_font_formatting(tarsier, page_context, html_file, expected_text_content):
16+
async with page_context(html_file) as page:
17+
page_text, tag_to_xpath = await tarsier.page_to_text(
18+
page, tagless=True, tag_text_elements=True
19+
)
2420

2521
for expected_line in expected_text_content:
2622
assert (

0 commit comments

Comments
 (0)