diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a3b1f7bfe0..cb9d1557320 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -583,6 +583,14 @@ if(EXECUTORCH_BUILD_PYBIND) torch ) + if(EXECUTORCH_BUILD_EXTENSION_MODULE) + if(CMAKE_TOOLCHAIN_IOS OR CMAKE_TOOLCHAIN_ANDROID OR APPLE) + list(APPEND _dep_libs extension_module_static) + else() + list(APPEND _dep_libs extension_module) + endif() + endif() + if(EXECUTORCH_BUILD_TESTS) list(APPEND _dep_libs test_backend_compiler_lib) endif() diff --git a/devtools/bundled_program/test/test_end2end.py b/devtools/bundled_program/test/test_end2end.py index 7cee073be0e..3268a0df19a 100644 --- a/devtools/bundled_program/test/test_end2end.py +++ b/devtools/bundled_program/test/test_end2end.py @@ -5,21 +5,7 @@ # LICENSE file in the root directory of this source tree. # flake8: noqa: F401 -import functools -import inspect -import os -import random import unittest -from typing import Callable, Dict, Optional, Tuple, Type - -import executorch.exir as exir - -import executorch.exir.control_flow as control_flow - -# @manual=//executorch/extension/pytree:pybindings -import executorch.extension.pytree as pytree - -import torch from executorch.devtools.bundled_program.core import BundledProgram from executorch.devtools.bundled_program.serialize import ( @@ -35,8 +21,6 @@ try: from executorch.extension.pybindings.portable_lib import ( _load_bundled_program_from_buffer, - _load_for_executorch_from_buffer, - _load_for_executorch_from_bundled_program, ) kernel_mode = "lean" @@ -47,8 +31,6 @@ try: from executorch.extension.pybindings.aten_lib import ( # @manual=//executorch/extension/pybindings:aten_lib _load_bundled_program_from_buffer, - _load_for_executorch_from_buffer, - _load_for_executorch_from_bundled_program, ) assert kernel_mode is None @@ -75,19 +57,8 @@ def test_sample_model_e2e(self): bundled_program_buffer ) - executorch_module = _load_for_executorch_from_bundled_program( - executorch_bundled_program - ) - for method_name in 
eager_model.method_names: - executorch_module.load_bundled_input( - executorch_bundled_program, - method_name, - 0, - ) - executorch_module.plan_execute(method_name) - executorch_module.verify_result_with_bundled_expected_output( - executorch_bundled_program, + executorch_bundled_program.verify_result_with_bundled_expected_output( method_name, 0, ) diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index 3e449da5e14..f22688200da 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -53,3 +53,14 @@ def define_common_targets(): "//executorch/extension/module:module" + aten_suffix, ], ) + + runtime.cxx_library( + name = "module_wrapper" + aten_suffix, + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/extension/module:module" + aten_suffix, + "//executorch/extension/module:bundled_module" + aten_suffix, + ], + ) diff --git a/extension/pybindings/README.md b/extension/pybindings/README.md index 2cd680e7bb9..4a663a69b49 100644 --- a/extension/pybindings/README.md +++ b/extension/pybindings/README.md @@ -27,8 +27,6 @@ CMAKE_ARGS="-DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh - `_reset_profile_results()`: Reset profile results. ## Classes ### ExecuTorchModule -- `load_bundled_input()`: Load bundled input. -- `verify_result_with_bundled_expected_output(bundle: str, method_name: str, testset_idx: int, rtol: float = 1e-5, atol: float = 1e-8)`: Verify result with bundled expected output. - `plan_execute()`: Plan and execute. - `run_method()`: Run method. - `forward()`: Forward. This takes a pytree-flattend PyTorch-tensor-based input. @@ -37,5 +35,6 @@ CMAKE_ARGS="-DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh - `__call__()`: Call method. ### BundledModule This class is currently empty and serves as a placeholder for future methods and attributes. 
+- `verify_result_with_bundled_expected_output(method_name: str, testset_idx: int, rtol: float = 1e-5, atol: float = 1e-8)`: Verify result with bundled expected output. ## Note All functions and methods are guarded by a call guard that redirects `cout` and `cerr` to the Python environment. diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index db0871657f6..ae61a07fe99 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -425,13 +426,54 @@ inline std::unique_ptr load_module_from_file( program_verification); } +inline py::list get_outputs_as_py_list( + const std::vector& outputs, + bool clone_outputs = true) { + const auto outputs_size = outputs.size(); + py::list list(outputs_size); + for (size_t i = 0; i < outputs_size; ++i) { + auto& v = outputs[i]; + if (Tag::None == v.tag) { + list[i] = py::none(); + } else if (Tag::Int == v.tag) { + list[i] = py::cast(v.toInt()); + } else if (Tag::Double == v.tag) { + list[i] = py::cast(v.toDouble()); + } else if (Tag::Bool == v.tag) { + list[i] = py::cast(v.toBool()); + } else if (Tag::String == v.tag) { + list[i] = py::cast(std::string(v.toString().data())); + } else if (Tag::Tensor == v.tag) { +#ifdef USE_ATEN_LIB + // Clone so the outputs in python do not share a lifetime with the + // module object + if (clone_outputs) { + list[i] = py::cast(v.toTensor().clone()); + } else { + list[i] = py::cast(v.toTensor()); + } +#else + if (clone_outputs) { + list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()).clone()); + } else { + list[i] = py::cast(alias_attensor_to_etensor(v.toTensor())); + } +#endif + } else { + ET_ASSERT_UNREACHABLE_MSG("Invalid model output type"); + } + } + return list; +} + static constexpr size_t kDEFAULT_BUNDLED_INPUT_POOL_SIZE = 16 * 1024U; -struct PyBundledModule final { +struct PyBundledModule : public BundledModule { explicit 
PyBundledModule( const py::bytes& buffer, uint32_t bundled_input_pool_size) - : bundled_program_ptr_(buffer), + : BundledModule(buffer.cast<std::string_view>().data()), + bundled_program_ptr_(buffer), program_ptr_(static_cast<const void*>( bundled_program_flatbuffer::GetBundledProgram( get_bundled_program_ptr()) @@ -460,6 +502,33 @@ struct PyBundledModule final { return program_len_; } + py::list verify_result_with_bundled_expected_output( + const std::string& method_name, + size_t testset_idx, + double rtol = 1e-5, + double atol = 1e-8) { + // Execute the method + auto result = BundledModule::execute(method_name, testset_idx); + if (!result.ok()) { + THROW_IF_ERROR( + result.error(), + "Method execution failed with status 0x%" PRIx32, + static_cast<uint32_t>(result.error())); + } + + // Convert outputs to py::list + const auto& outputs = result.get(); + py::list py_outputs = get_outputs_as_py_list(outputs); + + Error status = BundledModule::verify_method_outputs( + method_name, testset_idx, rtol, atol); + THROW_IF_ERROR( + status, + "Result verification failed with status %" PRIu32, + static_cast<uint32_t>(status)); + return py_outputs; + } + private: // Store the bytes object instead of a raw pointer so that this module will // keep the bytes alive. 
@@ -816,43 +885,6 @@ struct PyModule final { } } - void load_bundled_input( - PyBundledModule& m, - const std::string method_name, - size_t testset_idx) { - const void* bundled_program_ptr = m.get_bundled_program_ptr(); - Error status = executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input( - module_->get_method(method_name), bundled_program_ptr, testset_idx); - THROW_IF_ERROR( - status, - "load_bundled_input failed with status 0x%" PRIx32, - static_cast(status)); - } - - py::list verify_result_with_bundled_expected_output( - PyBundledModule& m, - const std::string method_name, - size_t testset_idx, - double rtol = 1e-5, - double atol = 1e-8) { - const void* bundled_program_ptr = m.get_bundled_program_ptr(); - auto& method = module_->get_method(method_name); - Error status = executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input( - method, bundled_program_ptr, testset_idx); - THROW_IF_ERROR( - status, - "load_bundled_input failed with status 0x%" PRIx32, - static_cast(status)); - py::list outputs = plan_execute(method_name); - status = executorch::BUNDLED_PROGRAM_NAMESPACE::verify_method_outputs( - method, bundled_program_ptr, testset_idx, rtol, atol); - THROW_IF_ERROR( - status, - "Result verification failed with status %" PRIu32, - static_cast(status)); - return outputs; - } - py::list plan_execute( const std::string method_name, bool clone_outputs = true) { @@ -875,46 +907,6 @@ struct PyModule final { return get_outputs_as_py_list(outputs, clone_outputs); } - py::list get_outputs_as_py_list( - const std::vector& outputs, - bool clone_outputs = true) { - const auto outputs_size = outputs.size(); - py::list list(outputs_size); - for (size_t i = 0; i < outputs_size; ++i) { - auto& v = outputs[i]; - if (Tag::None == v.tag) { - list[i] = py::none(); - } else if (Tag::Int == v.tag) { - list[i] = py::cast(v.toInt()); - } else if (Tag::Double == v.tag) { - list[i] = py::cast(v.toDouble()); - } else if (Tag::Bool == v.tag) { - list[i] = py::cast(v.toBool()); - } 
else if (Tag::String == v.tag) { - list[i] = py::cast(std::string(v.toString().data())); - } else if (Tag::Tensor == v.tag) { -#ifdef USE_ATEN_LIB - // Clone so the outputs in python do not share a lifetime with the - // module object - if (clone_outputs) { - list[i] = py::cast(v.toTensor().clone()); - } else { - list[i] = py::cast(v.toTensor()); - } -#else - if (clone_outputs) { - list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()).clone()); - } else { - list[i] = py::cast(alias_attensor_to_etensor(v.toTensor())); - } -#endif - } else { - ET_ASSERT_UNREACHABLE_MSG("Invalid model output type"); - } - } - return list; - } - std::unique_ptr method_meta(const std::string method_name) { auto& method = module_->get_method(method_name); return std::make_unique(module_, method.method_meta()); @@ -1074,16 +1066,6 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { call_guard); py::class_(m, "ExecuTorchModule") - .def("load_bundled_input", &PyModule::load_bundled_input, call_guard) - .def( - "verify_result_with_bundled_expected_output", - &PyModule::verify_result_with_bundled_expected_output, - py::arg("bundle"), - py::arg("method_name"), - py::arg("testset_idx"), - py::arg("rtol") = 1e-5, - py::arg("atol") = 1e-8, - call_guard) .def( "plan_execute", &PyModule::plan_execute, @@ -1129,7 +1111,16 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { py::arg("clone_outputs") = true, call_guard); - py::class_(m, "BundledModule"); + py::class_(m, "BundledModule") + .def( + "verify_result_with_bundled_expected_output", + &PyBundledModule::verify_result_with_bundled_expected_output, + py::arg("method_name"), + py::arg("testset_idx"), + py::arg("rtol") = 1e-5, + py::arg("atol") = 1e-8, + call_guard); + py::class_(m, "TensorInfo") .def("sizes", &PyTensorInfo::sizes, call_guard) .def("dtype", &PyTensorInfo::dtype, call_guard) diff --git a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl index 
1616304c3ea..5b3bf18a09e 100644 --- a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl @@ -16,6 +16,7 @@ PORTABLE_MODULE_DEPS = [ "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", + "//executorch/extension/module:module_wrapper", "//executorch/runtime/executor/test:test_backend_compiler_lib", "//executorch/devtools/etdump:etdump_flatcc", ] + get_all_cpu_backend_targets() @@ -28,6 +29,7 @@ ATEN_MODULE_DEPS = [ "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", + "//executorch/extension/module:module_wrapper_aten", "//executorch/devtools/bundled_program:runtime_aten", "//executorch/runtime/executor/test:test_backend_compiler_lib_aten", "//executorch/devtools/etdump:etdump_flatcc",