♻️ Added new context manager function to reduce code duplication (#148)

seanmcguire12 · web-flow · commit d4d670b8cb05 · 2024-09-06T21:36:40.000-07:00
* ♻️ Added new context manager function to reduce code duplication

* address comments

* fix naming issue
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,6 +5,7 @@
 import pytest
 import pytest_asyncio
 from dotenv import load_dotenv
+from contextlib import asynccontextmanager
 from playwright.async_api import async_playwright
 from playwright.sync_api import sync_playwright
 from selenium import webdriver
@@ -13,6 +14,8 @@
 
 from tarsier import GoogleVisionOCRService, Tarsier, MicrosoftAzureOCRService
 
+test_html_folder = "mock_html"
+
 IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
 
 if not IN_GITHUB_ACTIONS or os.getenv("NODE_ENV") == "development":
@@ -93,3 +96,26 @@ def tarsier(credentials):
             raise ValueError("Invalid OCR provider")
 
     yield Tarsier(ocr_service)
+
+
+@asynccontextmanager
+async def page_context_manager(html_file: str):
+    """Open a mock HTML file in a fresh Chromium page and yield that page.
+
+    Args:
+        html_file: File name, relative to the ``test_html_folder`` directory
+            that sits next to this conftest, of the page to load.
+
+    Yields:
+        The Playwright page after it has navigated to the file.
+
+    The browser is launched headless only when ``IN_GITHUB_ACTIONS`` is true.
+    """
+    # Local import so this helper does not rely on the module's import block.
+    from pathlib import Path
+
+    # Build the absolute path first, then let pathlib produce the URL. Unlike
+    # f"file://{path}", as_uri() percent-encodes characters such as spaces,
+    # '#' and '?', and emits a valid file:///C:/... form for Windows paths.
+    html_file_path = os.path.abspath(
+        os.path.join(os.path.dirname(__file__), test_html_folder, html_file)
+    )
+    html_file_url = Path(html_file_path).as_uri()
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=IN_GITHUB_ACTIONS)
+        try:
+            page = await browser.new_page()
+            try:
+                await page.goto(html_file_url)
+                yield page
+            finally:
+                await page.close()
+        finally:
+            # Reached even if new_page() or page.close() raises, so closing
+            # the browser is always attempted.
+            await browser.close()
+
+
+@pytest.fixture
+def page_context():
+    """Expose ``page_context_manager`` to tests as a fixture.
+
+    Returns the context-manager function itself rather than an opened page,
+    so each test chooses the HTML file to load, e.g.
+    ``async with page_context("dropdown.html") as page:``.
+    """
+    return page_context_manager
diff --git a/tests/test_artifact_removal.py b/tests/test_artifact_removal.py
@@ -1,7 +1,4 @@
-import os
-
 import pytest
-from playwright.async_api import async_playwright
 
 example_data = [
     {
@@ -34,21 +31,12 @@ def create_tarsier_functions(tarsier):
 
 @pytest.mark.parametrize("data", example_data)
 @pytest.mark.asyncio
-async def test_artifact_removal(data, tarsier):
+async def test_artifact_removal(data, tarsier, page_context):
     file_name = data["file_name"]
     artifact_selectors = data["artifact_selectors"]
 
-    html_file_path = os.path.abspath(
-        os.path.join(os.path.dirname(__file__), "mock_html", file_name)
-    )
-
-    tarsier_functions = create_tarsier_functions(tarsier)
-
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=True)
-        page = await browser.new_page()
-
-        await page.goto(f"file://{html_file_path}")
+    async with page_context(file_name) as page:
+        tarsier_functions = create_tarsier_functions(tarsier)
 
         for tarsier_func, cleanup_funcs in tarsier_functions:
             await tarsier_func(page)
@@ -72,5 +60,3 @@ async def test_artifact_removal(data, tarsier):
                     f"Tarsier artifact '{selector}' still exists in file: {file_name} "
                     f"after applying cleanup functions"
                 )
-
-        await browser.close()
diff --git a/tests/test_elements.py b/tests/test_elements.py
@@ -12,19 +12,19 @@
     "html_file, expected_tag_to_xpath, expected_page_text, expected_tag_string",
     [
         (
-            "mock_html/text_only.html",
+            "text_only.html",
             {0: "//html/body/h1"},
             ["Hello, World!"],
             ["[ 0 ]"],
         ),
         (
-            "mock_html/hyperlink_only.html",
+            "hyperlink_only.html",
             {0: '//html/body/p/a[@id="link1"]'},
             ["Example Link 1"],
             ["[ @ 0 ]"],
         ),
         (
-            "mock_html/interactable_only.html",
+            "interactable_only.html",
             {
                 0: '//html/body/button[@id="button"]',
                 1: '//html/body/input[@id="checkbox"]',
@@ -33,7 +33,7 @@
             ["[ $ 0 ]", "[ $ 1 ]"],
         ),
         (
-            "mock_html/combination.html",
+            "combination.html",
             {
                 0: '//html/body/input[1][@id="text"]',
                 1: '//html/body/input[2][@id="checkbox"]',
@@ -43,13 +43,13 @@
             ["[ # 0 ]", "[ $ 1 ]", "[ 2 ]"],
         ),
         (
-            "mock_html/insertable_only.html",
+            "insertable_only.html",
             {0: '//html/body/input[@id="text"]'},
             ["Enter text here"],
             ["[ # 0 ]"],
         ),
         (
-            "mock_html/br_elem.html",
+            "br_elem.html",
             {
                 0: "//html/body/div",
                 1: "//html/body/div",
@@ -59,15 +59,15 @@
             ["[ 0 ]", "[ 1 ]", "[ 2 ]"],
         ),
         (
-            "mock_html/display_contents.html",
+            "display_contents.html",
             {
                 0: "//html/body/div",
             },
             ["Display contents"],
             ["[ 0 ]"],
         ),
         (
-            "mock_html/icon_buttons.html",
+            "icon_buttons.html",
             {
                 0: "//html/body/button[1]",
                 1: "//html/body/button[2]",
@@ -76,13 +76,13 @@
             ["[ $ 0 ]", "[ $ 1 ]"],
         ),
         (
-            "mock_html/image.html",
+            "image.html",
             {},
             ["Hello World"],
             [],
         ),
         pytest.param(
-            "mock_html/japanese.html",
+            "japanese.html",
             {
                 0: '//html/body/p[@id="japanese"]',
             },
@@ -93,7 +93,7 @@
             ),
         ),
         pytest.param(
-            "mock_html/russian.html",
+            "russian.html",
             {
                 0: '//html/body/p[@id="russian"]',
             },
@@ -104,7 +104,7 @@
             ),
         ),
         pytest.param(
-            "mock_html/chinese.html",
+            "chinese.html",
             {
                 0: '//html/body/p[@id="chinese"]',
             },
@@ -115,7 +115,7 @@
             ),
         ),
         pytest.param(
-            "mock_html/arabic.html",
+            "arabic.html",
             {
                 0: '//html/body/p[@id="arabic"]',
             },
@@ -126,7 +126,7 @@
             ),
         ),
         pytest.param(
-            "mock_html/hindi.html",
+            "hindi.html",
             {
                 0: '//html/body/p[@id="hindi"]',
             },
@@ -137,15 +137,15 @@
             ),
         ),
         (
-            "mock_html/dropdown.html",
+            "dropdown.html",
             {
                 0: "//html/body/label",
             },
             ["Option 1"],
             ["[ $ 0 ]"],
         ),
         (
-            "mock_html/iframe.html",
+            "iframe.html",
             {
                 0: "iframe[0]//html/body/p",
             },
@@ -156,74 +156,68 @@
 )
 async def test_combined_elements_page(
     tarsier,
-    async_page,
+    page_context,
     html_file,
     expected_tag_to_xpath,
     expected_page_text,
     expected_tag_string,
 ):
-    html_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), html_file))
-    await async_page.goto(f"file://{html_file_path}")
-
-    page_text, tag_to_xpath = await tarsier.page_to_text(
-        async_page, tag_text_elements=True
-    )
+    async with page_context(html_file) as page:
+        page_text, tag_to_xpath = await tarsier.page_to_text(
+            page, tag_text_elements=True
+        )
 
-    assert tag_to_xpath == expected_tag_to_xpath, (
-        f"tag_to_xpath does not match expected output for "
-        f"{html_file}. Got: {tag_to_xpath}"
-    )
+        assert tag_to_xpath == expected_tag_to_xpath, (
+            f"tag_to_xpath does not match expected output for "
+            f"{html_file}. Got: {tag_to_xpath}"
+        )
 
-    # TODO: revert to testing against entire string when colour tagging is merged
-    for expected_text in expected_page_text:
-        normalized_expected_text = "".join(
-            expected_text.split()
-        )  # Remove whitespace from expected text
-        page_text_combined = "".join(page_text).replace(" ", "")
+        # TODO: revert to testing against entire string when colour tagging is merged
+        for expected_text in expected_page_text:
+            normalized_expected_text = "".join(
+                expected_text.split()
+            )  # Remove whitespace from expected text
+            page_text_combined = "".join(page_text).replace(" ", "")
 
-        assert all(char in page_text_combined for char in normalized_expected_text), (
-            f"Expected text '{expected_text}' not found in page text for {html_file}. "
-            f"Got: {page_text}"
-        )
+            assert all(
+                char in page_text_combined for char in normalized_expected_text
+            ), (
+                f"Expected text '{expected_text}' not found in page text for {html_file}. "
+                f"Got: {page_text}"
+            )
 
-    for expected_tag in expected_tag_string:
-        assert expected_tag in page_text, (
-            f"Expected tag '{expected_tag}' not found in page text for {html_file}. "
-            f"Got: {page_text}"
-        )
+        for expected_tag in expected_tag_string:
+            assert expected_tag in page_text, (
+                f"Expected tag '{expected_tag}' not found in page text for {html_file}. "
+                f"Got: {page_text}"
+            )
 
 
 @pytest.mark.asyncio
-async def test_text_nodes_are_query_selectable(async_page):
-    text_node_html_path = os.path.abspath(
-        os.path.join(os.path.dirname(__file__), "mock_html/text_nodes.html")
-    )
-    await async_page.goto(f"file://{text_node_html_path}")
-    tarsier = Tarsier(DummyOCRService())
-    _, tag_to_xpath = await tarsier.page_to_text(async_page, tag_text_elements=True)
+async def test_text_nodes_are_query_selectable(page_context):
+    async with page_context("text_nodes.html") as page:
+        tarsier = Tarsier(DummyOCRService())
+        _, tag_to_xpath = await tarsier.page_to_text(page, tag_text_elements=True)
 
-    # Query selector will specifically filter out TextNodes within XPath selectors
-    # As a result, the tagged xpath for the text node should belong to the parent
-    # https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/injected/xpathSelectorEngine.ts#L29-L30)
-    assert len(tag_to_xpath) == 2
-    assert await async_page.query_selector(tag_to_xpath[0])
-    assert await async_page.query_selector(tag_to_xpath[1])
+        # Query selector will specifically filter out TextNodes within XPath selectors
+        # As a result, the tagged xpath for the text node should belong to the parent
+        # https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/injected/xpathSelectorEngine.ts#L29-L30)
+        assert len(tag_to_xpath) == 2
+        assert await page.query_selector(tag_to_xpath[0])
+        assert await page.query_selector(tag_to_xpath[1])
 
 
 @pytest.mark.asyncio
-async def test_dropdown_text_not_shown(tarsier, async_page):
-    dropdown_html_path = os.path.abspath(
-        os.path.join(os.path.dirname(__file__), "mock_html/dropdown.html")
-    )
-    await async_page.goto(f"file://{dropdown_html_path}")
-    page_text, tag_to_xpath = await tarsier.page_to_text(
-        async_page, tag_text_elements=True
-    )
+async def test_dropdown_text_not_shown(tarsier, page_context):
+    async with page_context("dropdown.html") as page:
+        page_text, tag_to_xpath = await tarsier.page_to_text(
+            page, tag_text_elements=True
+        )
 
-    assert "[ $ 1 ]" not in page_text
-    assert "[ $ 2 ]" not in page_text
-    assert "[ $ 3 ]" not in page_text
-    assert "[ $ 4 ]" not in page_text
-    assert "Option 2" not in page_text
-    assert "Option 3" not in page_text
-    assert "Option 4" not in page_text
+        assert "[ $ 1 ]" not in page_text
+        assert "[ $ 2 ]" not in page_text
+        assert "[ $ 3 ]" not in page_text
+        assert "[ $ 4 ]" not in page_text
+        assert "Option 2" not in page_text
+        assert "Option 3" not in page_text
+        assert "Option 4" not in page_text
diff --git a/tests/test_namespace.py b/tests/test_namespace.py
@@ -1,5 +1,3 @@
-import os
-
 import pytest
 import pytest_asyncio
 from playwright.async_api import Page
@@ -60,12 +58,8 @@ async def test_fix_namespaces(browser_adapter_with_js):
 
 
 @pytest.mark.asyncio
-async def test_xpath_namespace(tarsier, async_page):
-    html_with_namespace_path = os.path.abspath(
-        os.path.join(os.path.dirname(__file__), "mock_html/namespace.html")
-    )
-    await async_page.goto(f"file://{html_with_namespace_path}")
-
-    _, tag_to_xpath = await tarsier.page_to_text(async_page, tag_text_elements=True)
-    assert len(tag_to_xpath) == 1, "The page contains only a single tag"
-    assert tag_to_xpath[0] == '//html/body/*[name()="sc:visitoridentification"]/div'
+async def test_xpath_namespace(tarsier, page_context):
+    async with page_context("namespace.html") as page:
+        _, tag_to_xpath = await tarsier.page_to_text(page, tag_text_elements=True)
+        assert len(tag_to_xpath) == 1, "The page contains only a single tag"
+        assert tag_to_xpath[0] == '//html/body/*[name()="sc:visitoridentification"]/div'
diff --git a/tests/test_text_formatting.py b/tests/test_text_formatting.py
@@ -1,4 +1,3 @@
-import os
 import pytest
 
 
@@ -13,14 +12,11 @@
         ("xx_large.html", ["**XXLarge**"]),
     ],
 )
-async def test_font_formatting(tarsier, async_page, html_file, expected_text_content):
-    font_formatting_html_path = os.path.abspath(
-        os.path.join(os.path.dirname(__file__), f"mock_html/{html_file}")
-    )
-    await async_page.goto(f"file://{font_formatting_html_path}")
-    page_text, tag_to_xpath = await tarsier.page_to_text(
-        async_page, tagless=True, tag_text_elements=True
-    )
+async def test_font_formatting(tarsier, page_context, html_file, expected_text_content):
+    async with page_context(html_file) as page:
+        page_text, tag_to_xpath = await tarsier.page_to_text(
+            page, tagless=True, tag_text_elements=True
+        )
 
     for expected_line in expected_text_content:
         assert (