Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9805c37
flwr basic example
adityapgupta Mar 9, 2025
abf9902
running flower with k8s
adityapgupta Mar 9, 2025
3702ca3
added flower
adityapgupta Mar 9, 2025
bc0b247
flower workload
adityapgupta Mar 10, 2025
615aaa1
added flower docker and k8s files
adityapgupta Mar 10, 2025
078388e
Merge remote-tracking branch 'upstream/main' into main
adityapgupta Mar 11, 2025
2d3b410
updated submodule
adityapgupta Mar 11, 2025
b91af01
testing edits
adityapgupta Mar 12, 2025
4f38345
Merge branch 'microsoft:main' into main
adityapgupta Apr 27, 2025
5ec9981
added llama client with working fault
adityapgupta Apr 27, 2025
c85fcaf
Merge https://github.com/xlab-uiuc/AIOpsLab into main
adityapgupta Apr 30, 2025
e27416b
added delay to supernode stop
adityapgupta May 1, 2025
bf71fcd
Adding LLaMa client
Armxyz1 May 1, 2025
50bbe00
Merge pull request #1 from adityapgupta/armaan
adityapgupta May 2, 2025
ee0beda
edited flower description
adityapgupta May 2, 2025
00bc2da
cleanup
adityapgupta May 8, 2025
2fbf911
added model misconfig fault
adityapgupta May 8, 2025
8300b2c
Merge https://github.com/xlab-uiuc/AIOpsLab into main
adityapgupta May 22, 2025
51da78a
fast forwarded aiopslab-applications
adityapgupta May 22, 2025
a6bd7c9
removed sleep calls
adityapgupta Jun 19, 2025
c08b7de
Merge https://github.com/microsoft/AIOpsLab into main
adityapgupta Jun 19, 2025
deb1478
prevent openebs and prometheus launch for docker problems
adityapgupta Jun 20, 2025
16d469e
updated requirements
adityapgupta Jun 20, 2025
bfc0b37
Merge https://github.com/microsoft/AIOpsLab into main
adityapgupta Aug 6, 2025
0dd1473
updating aiopslab-applications
adityapgupta Aug 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion aiopslab/generators/fault/inject_virtual.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from aiopslab.service.kubectl import KubeCtl
from aiopslab.service.helm import Helm
from aiopslab.service.dock import Docker
from aiopslab.generators.fault.base import FaultInjector
from aiopslab.service.apps.base import Application
from aiopslab.paths import TARGET_MICROSERVICES
Expand All @@ -18,6 +19,7 @@ def __init__(self, namespace: str):
super().__init__(namespace)
self.namespace = namespace
self.kubectl = KubeCtl()
self.docker = Docker()
self.mongo_service_pod_map = {
"url-shorten-mongodb": "url-shorten-service",
}
Expand Down Expand Up @@ -248,7 +250,35 @@ def recover_wrong_bin_usage(self, microservices: list[str]):
self.kubectl.exec_command(apply_command)

print(f"Recovered from wrong binary usage fault for service: {service}")


def inject_container_stop(self, microservices: list[str]):
"""Inject a fault to stop a container."""
for service in microservices:
self.docker.get_container(service).stop()
print(f"Stopped container {service}.")

print("Waiting for faults to propagate...")
time.sleep(15)
print("Faults propagated.")

def recover_container_stop(self, microservices: list[str]):
for service in microservices:
self.docker.get_container(service).start()
print(f"Started container {service}.")

def inject_model_misconfig(self, microservices: list[str]):
"""Inject a fault to misconfigure the model in the Flower application."""
for service in microservices:
command = f""" docker exec -it {service} sh -c "sed -i '24s/84/80/' /app/.flwr/apps/*/task.py" """
self.docker.exec_command(command)
print(f"Changed model configuration for service: {service}")

def recover_model_misconfig(self, microservices: list[str]):
for service in microservices:
command = f""" docker exec -it {service} sh -c "sed -i '24s/80/84/' /app/.flwr/apps/*/task.py" """
self.docker.exec_command(command)
print(f"Recovered model configuration for service: {service}")

############# HELPER FUNCTIONS ################
def _wait_for_pods_ready(self, microservices: list[str], timeout: int = 30):
for service in microservices:
Expand Down
55 changes: 34 additions & 21 deletions aiopslab/orchestrator/actions/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from datetime import datetime, timedelta
from aiopslab.utils.actions import action, read, write
from aiopslab.service.kubectl import KubeCtl
from aiopslab.service.dock import Docker
from aiopslab.service.shell import Shell

# from aiopslab.observer import initialize_pod_and_service_lists
Expand All @@ -22,7 +23,7 @@ class TaskActions:
@read
def get_logs(namespace: str, service: str) -> str:
"""
Collects relevant log data from a pod using Kubectl.
Collects relevant log data from a pod using Kubectl or from a container with Docker.

Args:
namespace (str): The namespace in which the service is running.
Expand All @@ -31,26 +32,35 @@ def get_logs(namespace: str, service: str) -> str:
Returns:
str | dict | list[dicts]: Log data as a structured object or a string.
"""
kubectl = KubeCtl()
try:
if namespace == "test-social-network":
user_service_pod = kubectl.get_pod_name(namespace, f"app={service}")
elif namespace == "test-hotel-reservation":
user_service_pod = kubectl.get_pod_name(
namespace, f"io.kompose.service={service}"
)
elif namespace == "astronomy-shop":
user_service_pod = kubectl.get_pod_name(
namespace, f"app.kubernetes.io/name={service}"
)
elif namespace == "default" and "wrk2-job" in service:
user_service_pod = kubectl.get_pod_name(namespace, f"job-name=wrk2-job")
else:
raise Exception
logs = kubectl.get_pod_logs(user_service_pod, namespace)
except Exception as e:
return "Error: Your service/namespace does not exist. Use kubectl to check."

if namespace == "docker":
docker = Docker()
try:
logs = docker.get_logs(service)
except Exception as e:
return "Error: Your service does not exist. Use docker to check."

else:
kubectl = KubeCtl()
try:
if namespace == "test-social-network":
user_service_pod = kubectl.get_pod_name(namespace, f"app={service}")
elif namespace == "test-hotel-reservation":
user_service_pod = kubectl.get_pod_name(
namespace, f"io.kompose.service={service}"
)
elif namespace == "astronomy-shop":
user_service_pod = kubectl.get_pod_name(
namespace, f"app.kubernetes.io/name={service}"
)
elif namespace == "default" and "wrk2-job" in service:
user_service_pod = kubectl.get_pod_name(namespace, f"job-name=wrk2-job")
else:
raise Exception
logs = kubectl.get_pod_logs(user_service_pod, namespace)
except Exception as e:
return "Error: Your service/namespace does not exist. Use kubectl to check."

print(logs)
logs = "\n".join(logs.split("\n"))

return logs
Expand All @@ -71,6 +81,9 @@ def exec_shell(command: str) -> str:
"""
if "kubectl edit" in command or "edit svc" in command:
return "Error: Cannot use `kubectl edit`. Use `kubectl patch` instead."

if "docker logs -f" in command:
return "Error: Cannot use `docker logs -f`. Use `docker logs` instead."

return Shell.exec(command)

Expand Down
44 changes: 24 additions & 20 deletions aiopslab/orchestrator/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,24 +45,26 @@ def init_problem(self, problem_id: str):
self.session = Session()
print(f"Session ID: {self.session.session_id}")
prob = self.probs.get_problem_instance(problem_id)
deployment = self.probs.get_problem_deployment(problem_id)
self.session.set_problem(prob, pid=problem_id)
self.session.set_agent(self.agent_name)

print("Setting up OpenEBS...")
if deployment != "docker":
print("Setting up OpenEBS...")

# Install OpenEBS
self.kubectl.exec_command(
"kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml"
)
self.kubectl.exec_command(
"kubectl patch storageclass openebs-hostpath -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'"
)
self.kubectl.wait_for_ready("openebs")
print("OpenEBS setup completed.")
# Install OpenEBS
self.kubectl.exec_command(
"kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml"
)
self.kubectl.exec_command(
"kubectl patch storageclass openebs-hostpath -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'"
)
self.kubectl.wait_for_ready("openebs")
print("OpenEBS setup completed.")

# Setup and deploy Prometheus
self.prometheus = Prometheus()
self.prometheus.deploy()
# Setup and deploy Prometheus
self.prometheus = Prometheus()
self.prometheus.deploy()

# deploy service
prob.app.delete()
Expand Down Expand Up @@ -200,13 +202,15 @@ async def start_problem(self, max_steps: int):
# But this will take more time.
# if not self.session.problem.sys_status_after_recovery():
self.session.problem.app.cleanup()
self.prometheus.teardown()
print("Uninstalling OpenEBS...")
self.kubectl.exec_command("kubectl delete sc openebs-hostpath openebs-device --ignore-not-found")
self.kubectl.exec_command(
"kubectl delete -f https://openebs.github.io/charts/openebs-operator.yaml"
)
self.kubectl.wait_for_namespace_deletion("openebs")

if self.session.problem.namespace != "docker":
self.prometheus.teardown()
print("Uninstalling OpenEBS...")
self.kubectl.exec_command("kubectl delete sc openebs-hostpath openebs-device --ignore-not-found")
self.kubectl.exec_command(
"kubectl delete -f https://openebs.github.io/charts/openebs-operator.yaml"
)
self.kubectl.wait_for_namespace_deletion("openebs")

self.execution_end_time = time.time()
total_execution_time = self.execution_end_time - self.execution_start_time
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from .model_misconfig import (
FlowerModelMisconfigDetection
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Model misconfiguration fault in the Flower application."""

import time
from typing import Any

from aiopslab.orchestrator.tasks import *
from aiopslab.service.dock import Docker
from aiopslab.service.apps.flower import Flower
from aiopslab.paths import TARGET_MICROSERVICES
from aiopslab.session import SessionItem
from aiopslab.generators.fault.inject_virtual import VirtualizationFaultInjector


class FlowerModelMisconfigBaseTask:
def __init__(self, faulty_service: str = "user-service"):
self.app = Flower()
self.docker = Docker()
self.namespace = self.app.namespace
self.faulty_service = faulty_service
self.train_dir = TARGET_MICROSERVICES / "flower"

def start_workload(self):
print("== Start Workload ==")
command = "flwr run train local-deployment"
self.docker.exec_command(command, cwd=self.train_dir)

path = "/app/.flwr/apps"
check = f""" docker exec -it {self.faulty_service} sh -c "test -d {path} && echo 'exists'" """

print("Waiting for workload to start...")
while True:
exists = self.docker.exec_command(check)
if exists.strip() == "exists":
break
time.sleep(1)
print("Workload started successfully.")

# Inject fault after workload starts, since the required files are created during the workload
print("Injecting fault...")
self.inject_fault(inject=True)

print("Waiting for faults to propagate...")
while True:
logs = self.docker.get_logs(self.faulty_service)
if "error" in logs.lower():
break
time.sleep(1)
print("Faults propagated.")

def inject_fault(self, inject: bool = False):
print("== Fault Injection ==")
if inject:
injector = VirtualizationFaultInjector(namespace=self.namespace)
injector._inject(
fault_type="model_misconfig",
microservices=[self.faulty_service],
)
print(f"Service: {self.faulty_service} | Namespace: {self.namespace}\n")
else:
print("Fault injection skipped.")

def recover_fault(self):
print("== Fault Recovery ==")
injector = VirtualizationFaultInjector(namespace=self.namespace)
injector._recover(
fault_type="model_misconfig",
microservices=[self.faulty_service],
)
print(f"Service: {self.faulty_service} | Namespace: {self.namespace}\n")


################## Detection Problem ##################
class FlowerModelMisconfigDetection(FlowerModelMisconfigBaseTask, DetectionTask):
def __init__(self, faulty_service: str = "clientapp-1"):
FlowerModelMisconfigBaseTask.__init__(self, faulty_service=faulty_service)
DetectionTask.__init__(self, self.app)

def eval(self, soln: Any, trace: list[SessionItem], duration: float):
print("== Evaluation ==")
expected_solution = "Yes"

if isinstance(soln, str):
if soln.strip().lower() == expected_solution.lower():
print(f"Correct detection: {soln}")
self.add_result("Detection Accuracy", "Correct")
else:
print(f"Incorrect detection: {soln}")
self.add_result("Detection Accuracy", "Incorrect")
else:
print("Invalid solution format")
self.add_result("Detection Accuracy", "Invalid Format")

return super().eval(soln, trace, duration)
6 changes: 6 additions & 0 deletions aiopslab/orchestrator/problems/flower_node_stop/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from .node_stop import (
FlowerNodeStopDetection
)
69 changes: 69 additions & 0 deletions aiopslab/orchestrator/problems/flower_node_stop/node_stop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Docker node stop fault problem in the Flower application."""

from typing import Any

from aiopslab.orchestrator.tasks import *
from aiopslab.service.dock import Docker
from aiopslab.service.apps.flower import Flower
from aiopslab.paths import TARGET_MICROSERVICES
from aiopslab.session import SessionItem
from aiopslab.generators.fault.inject_virtual import VirtualizationFaultInjector


class FlowerNodeStopBaseTask:
def __init__(self, faulty_service: str = "user-service"):
self.app = Flower()
self.docker = Docker()
self.namespace = self.app.namespace
self.faulty_service = faulty_service
self.train_dir = TARGET_MICROSERVICES / "flower"

def start_workload(self):
print("== Start Workload ==")
command = "flwr run train local-deployment"
self.docker.exec_command(command, cwd=self.train_dir)

def inject_fault(self):
print("== Fault Injection ==")
injector = VirtualizationFaultInjector(namespace=self.namespace)
injector._inject(
fault_type="container_stop",
microservices=[self.faulty_service],
)
print(f"Service: {self.faulty_service} | Namespace: {self.namespace}\n")

def recover_fault(self):
print("== Fault Recovery ==")
injector = VirtualizationFaultInjector(namespace=self.namespace)
injector._recover(
fault_type="container_stop",
microservices=[self.faulty_service],
)
print(f"Service: {self.faulty_service} | Namespace: {self.namespace}\n")


################## Detection Problem ##################
class FlowerNodeStopDetection(FlowerNodeStopBaseTask, DetectionTask):
def __init__(self, faulty_service: str = "supernode-1"):
FlowerNodeStopBaseTask.__init__(self, faulty_service=faulty_service)
DetectionTask.__init__(self, self.app)

def eval(self, soln: Any, trace: list[SessionItem], duration: float):
print("== Evaluation ==")
expected_solution = "Yes"

if isinstance(soln, str):
if soln.strip().lower() == expected_solution.lower():
print(f"Correct detection: {soln}")
self.add_result("Detection Accuracy", "Correct")
else:
print(f"Incorrect detection: {soln}")
self.add_result("Detection Accuracy", "Incorrect")
else:
print("Invalid solution format")
self.add_result("Detection Accuracy", "Invalid Format")

return super().eval(soln, trace, duration)
Loading