diff --git a/scripts/prod/restart_all_nodes_together.py b/scripts/prod/restart_all_nodes_together.py new file mode 100755 index 00000000000..01db58652cb --- /dev/null +++ b/scripts/prod/restart_all_nodes_together.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 + +import argparse +import json +import signal +import socket +import subprocess +import sys +from enum import Enum + +from time import sleep +import urllib.error +import urllib.request +from prometheus_client.parser import text_string_to_metric_families + +from update_config_and_restart_nodes_lib import ( + ApolloArgsParserBuilder, + Service, + get_context_list_from_args, + get_namespace_list_from_args, + get_pod_names, + print_colored, + print_error, + restart_node, + restart_all_nodes, + restart_pods, + update_config, +) + + +class RestartStrategy(Enum): + """Strategy for restarting nodes.""" + + All_At_Once = 1 + One_By_One = 2 + + +def restart_strategy_converter(strategy_name: str) -> RestartStrategy: + """Convert string to RestartStrategy enum with informative error message""" + RESTART_STRATEGY_PREFIX = f"{RestartStrategy.__name__}." + if strategy_name.startswith(RESTART_STRATEGY_PREFIX): + strategy_name = strategy_name[len(RESTART_STRATEGY_PREFIX) :] + + try: + return RestartStrategy[strategy_name] + except KeyError: + valid_strategies = ", ".join([strategy.name for strategy in RestartStrategy]) + raise argparse.ArgumentTypeError( + f"Invalid restart strategy '{strategy_name}'. Valid options are: {valid_strategies}" + ) + + +def get_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +def get_metrics(port: int, pod: str) -> str: + for attempt in range(10): + try: + with urllib.request.urlopen(f"http://localhost:{port}/monitoring/metrics") as response: + if response.status == 200: + return response.read().decode("utf-8") + else: + print_colored( + f"Failed to get metrics from for pod {pod}, attempt {attempt + 1}: {response.status}" + ) + except urllib.error.URLError as e: + print_colored(f"Failed to get metrics from for pod {pod}, attempt {attempt + 1}: {e}") + print_error(f"Failed to get metrics from for pod {pod}, after {attempt + 1} attempts") + sys.exit(1) + + +def poll_until_height_revert( + local_port: int, pod: str, polling_interval_seconds: int, storage_required_height: int +): + """Poll metrics until the storage height marker reaches the required height.""" + while True: + metrics = get_metrics(local_port, pod) + if metrics is None: + print_error(f"Failed to get metrics from for pod {pod}") + sys.exit(1) + + metric_families = text_string_to_metric_families(metrics) + val = None + # TODO: change to the real metric (proposal accepted as prposer) when we have a sequencer + # node (and the metric exists). + METRIC_NAME = "mempool_pending_queue_size" + for metric_family in metric_families: + if metric_family.name == METRIC_NAME: + val = metric_family.samples[0].value + break + + if val is None: + print_colored( + f"Metric '{METRIC_NAME}' not found in pod {pod}. Assuming the node is not ready." + ) + else: + if val < storage_required_height: + print_colored( + f"Storage height marker ({val}) has not reached {storage_required_height} yet, continuing to wait." + ) + else: + print_colored( + f"Storage height marker ({val}) has reached {storage_required_height}. Safe to continue." + ) + break + + sleep(polling_interval_seconds) + + +def wait_for_node( + pod: str, metrics_port: int, polling_interval_seconds: int, storage_required_height: int +): + """Wait for the node to be restarted and propose successfully.""" + local_port = get_free_port() + # Start kubectl port forwarding to the node and keep it running in the background. + cmd = [ + "kubectl", + "port-forward", + f"pod/{pod}", + f"{local_port}:{metrics_port}", + ] + + pf_process = None + try: + pf_process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + # Set up signal handler to ensure subprocess is terminated on interruption + def signal_handler(signum, frame): + if pf_process and pf_process.poll() is None: + print_colored(f"Terminating kubectl port-forward process (PID: {pf_process.pid})") + pf_process.terminate() + try: + pf_process.wait(timeout=5) + except subprocess.TimeoutExpired: + print_colored("Force killing kubectl port-forward process") + pf_process.kill() + pf_process.wait() + sys.exit(0) + + # Register signal handlers for graceful shutdown + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + poll_until_height_revert(local_port, pod, polling_interval_seconds, storage_required_height) + + finally: + # Ensure subprocess is always terminated + if pf_process and pf_process.poll() is None: + print_colored(f"Terminating kubectl port-forward process (PID: {pf_process.pid})") + pf_process.terminate() + try: + pf_process.wait(timeout=5) + except subprocess.TimeoutExpired: + print_colored("Force killing kubectl port-forward process") + pf_process.kill() + pf_process.wait() + + +def main(): + usage_example = """ +Examples: + # Restart all nodes to at the next block after current feeder block + %(prog)s --namespace-prefix apollo-sepolia-integration --num-nodes 3 --feeder_url feeder.integration-sepolia.starknet.io + %(prog)s -n apollo-sepolia-integration -m 3 -f feeder.integration-sepolia.starknet.io + + # Restart nodes with cluster prefix + %(prog)s -n apollo-sepolia-integration -m 3 -c my-cluster -f feeder.integration-sepolia.starknet.io + + # Update configuration without restarting nodes + %(prog)s -n apollo-sepolia-integration -m 3 -f feeder.integration-sepolia.starknet.io --no-restart + + # Restart nodes starting from specific node index + %(prog)s -n apollo-sepolia-integration -m 3 -s 5 -f feeder.integration-sepolia.starknet.io + + # Use different feeder URL + %(prog)s -n apollo-sepolia-integration -m 3 -f feeder.integration-sepolia.starknet.io + + # Use namespace list instead of prefix (restart specific namespaces) + %(prog)s --namespace-list apollo-sepolia-integration-0 apollo-sepolia-integration-2 -f feeder.integration-sepolia.starknet.io + %(prog)s -N apollo-sepolia-integration-0 apollo-sepolia-integration-2 -f feeder.integration-sepolia.starknet.io + + # Use cluster list for multiple clusters (only works with namespace-list, not namespace-prefix) + %(prog)s -N apollo-sepolia-integration-0 apollo-sepolia-integration-1 -C cluster1 cluster2 -f feeder.integration-sepolia.starknet.io + %(prog)s --namespace-list apollo-sepolia-integration-0 apollo-sepolia-integration-1 --cluster-list cluster1 cluster2 -f feeder.integration-sepolia.starknet.io + """ + + args_builder = ApolloArgsParserBuilder( + "Restart all nodes using the value from the feeder URL", usage_example + ) + + args_builder.add_argument( + "-f", + "--feeder_url", + required=True, + type=str, + help="The feeder URL to get the current block from", + ) + + args_builder.add_argument( + "-r", + "--restart-strategy", + type=restart_strategy_converter, + choices=list(RestartStrategy), + default=RestartStrategy.All_At_Once, + help="Strategy for restarting nodes (default: All_At_Once)", + ) + + # The port to connect to to get the metrics. + args_builder.add_argument( + "-p", + "--metrics-port", + type=int, + default=8082, + help="The port to connect to to get the metrics (default: 8082)", + ) + + args = args_builder.build() + + # Get current block number from feeder URL + try: + url = f"https://{args.feeder_url}/feeder_gateway/get_block" + with urllib.request.urlopen(url) as response: + if response.status != 200: + raise urllib.error.HTTPError( + url, response.status, "HTTP Error", response.headers, None + ) + data = json.loads(response.read().decode("utf-8")) + current_block_number = data["block_number"] + next_block_number = current_block_number + 1 + + print_colored(f"Current block number: {current_block_number}") + print_colored(f"Next block number: {next_block_number}") + + except urllib.error.URLError as e: + print_error(f"Failed to fetch block number from feeder URL: {e}") + sys.exit(1) + except KeyError as e: + print_error(f"Unexpected response format from feeder URL: {e}") + sys.exit(1) + except json.JSONDecodeError as e: + print_error(f"Failed to parse JSON response from feeder URL: {e}") + sys.exit(1) + + config_overrides = { + "consensus_manager_config.immediate_active_height": next_block_number, + "consensus_manager_config.cende_config.skip_write_height": next_block_number, + } + + namespace_list = get_namespace_list_from_args(args) + context_list = get_context_list_from_args(args) + # update_config( + # config_overrides, + # namespace_list, + # Service.Core, + # context_list, + # ) + + if args.no_restart: + print_colored("\nSkipping pod restart (--no-restart was specified)") + sys.exit(0) + + if args.restart_strategy == RestartStrategy.One_By_One: + for index, namespace in enumerate(namespace_list): + cluster = context_list[index] if context_list else None + try: + [pod] = get_pod_names(namespace, Service.Core, cluster) + except ValueError: + print_error(f"Expected 1 pod for namespace {namespace}, got: {pod}") + sys.exit(1) + # restart_pods(namespace, [pod], index, cluster) + wait_for_node(pod, args.metrics_port, 5, next_block_number) + elif args.restart_strategy == RestartStrategy.All_At_Once: + # restart_all_nodes( + # namespace_list, + # Service.Core, + # context_list, + # ) + pass + else: + print_error(f"Invalid restart strategy: {args.restart_strategy}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/set_node_revert_mode.py b/scripts/prod/set_node_revert_mode.py similarity index 100% rename from scripts/set_node_revert_mode.py rename to scripts/prod/set_node_revert_mode.py diff --git a/scripts/update_config_and_restart_nodes.py b/scripts/prod/update_config_and_restart_nodes.py similarity index 69% rename from scripts/update_config_and_restart_nodes.py rename to scripts/prod/update_config_and_restart_nodes.py index 01363d91188..6b07d612e61 100755 --- a/scripts/update_config_and_restart_nodes.py +++ b/scripts/prod/update_config_and_restart_nodes.py @@ -3,18 +3,21 @@ import argparse import json import sys +from typing import Any from update_config_and_restart_nodes_lib import ( ApolloArgsParserBuilder, Colors, Service, + get_context_list_from_args, + get_namespace_list_from_args, print_colored, print_error, update_config_and_restart_nodes, ) -def parse_config_overrides(config_overrides: list[str]) -> dict[str, any]: +def parse_config_overrides(config_overrides: list[str]) -> dict[str, Any]: """Parse config override strings in key=value format. Args: @@ -79,30 +82,36 @@ def service_type_converter(service_name: str) -> Service: def main(): usage_example = """ Examples: - # Update sequencer core configuration. - %(prog)s --namespace apollo-sepolia-integration --num-nodes 3 --cluster my-cluster --config-overrides consensus_manager_config.timeout=5000 --config-overrides validator_id=0x42 - %(prog)s -n apollo-sepolia-integration -N 3 --config-overrides consensus_manager_config.timeout=5000 --config-overrides validator_id=0x42 + # Basic usage with namespace prefix and node count + %(prog)s -n apollo-sepolia-integration -m 3 --config-overrides consensus_manager_config.timeout=5000 --config-overrides validator_id=0x42 - # Update gateway configuration - %(prog)s -n apollo-sepolia-integration -N 3 -j Gateway --config-overrides gateway_config.port=8080 + # Using namespace list mode (no num-nodes or start-index allowed) + %(prog)s -N apollo-sepolia-test-0 apollo-sepolia-test-1 apollo-sepolia-test-2 --config-overrides consensus_manager_config.timeout=5000 - # Update mempool configuration - %(prog)s -n apollo-sepolia-integration -N 3 -j Mempool --config-overrides mempool_config.max_size=1000 + # Using cluster prefix with namespace prefix + %(prog)s -n apollo-sepolia-integration -m 3 -c my-cluster --config-overrides validator_id=0x42 - # Update L1 provider configuration - %(prog)s -n apollo-sepolia-integration -N 3 -j L1 --config-overrides l1_config.endpoint=\"https://eth-mainnet.alchemyapi.io/v2/your-key\" + # Using cluster list with namespace list (must have same number of items) + %(prog)s -N apollo-sepolia-test-0 apollo-sepolia-test-2 -C cluster0 cluster2 --config-overrides validator_id=0x42 - # Update HTTP server configuration - %(prog)s -n apollo-sepolia-integration -N 3 -j HttpServer --config-overrides http_server_config.port=8081 + # Update different service types + %(prog)s -n apollo-sepolia-integration -m 3 -j Gateway --config-overrides gateway_config.port=8080 + %(prog)s -n apollo-sepolia-integration -m 3 -j Mempool --config-overrides mempool_config.max_size=1000 + %(prog)s -n apollo-sepolia-integration -m 3 -j L1 --config-overrides l1_config.endpoint=\"https://eth-mainnet.alchemyapi.io/v2/your-key\" + %(prog)s -n apollo-sepolia-integration -m 3 -j HttpServer --config-overrides http_server_config.port=8081 + %(prog)s -n apollo-sepolia-integration -m 3 -j SierraCompiler --config-overrides sierra_compiler_config.timeout=30000 + + # Update starting from specific node index + %(prog)s -n apollo-sepolia-integration -m 3 -s 5 --config-overrides validator_id=0x42 # Update without restart - %(prog)s -n apollo-sepolia-integration -N 3 --config-overrides validator_id=0x42 --no-restart + %(prog)s -n apollo-sepolia-integration -m 3 --config-overrides validator_id=0x42 --no-restart - # Update with explicit restart - %(prog)s -n apollo-sepolia-integration -N 3 --config-overrides validator_id=0x42 -r + # Update with explicit restart (default behavior) + %(prog)s -n apollo-sepolia-integration -m 3 --config-overrides validator_id=0x42 -r - # Update starting from specific node index - %(prog)s -n apollo-sepolia-integration -N 3 -i 5 --config-overrides validator_id=0x42 + # Complex example with multiple config overrides + %(prog)s -n apollo-sepolia-integration -m 3 -c my-cluster -j Core --config-overrides consensus_manager_config.timeout=5000 --config-overrides validator_id=0x42 --config-overrides components.gateway.url=\"localhost\" """ @@ -142,11 +151,9 @@ def main(): update_config_and_restart_nodes( config_overrides, - args.namespace, - args.num_nodes, - args.start_index, + get_namespace_list_from_args(args), args.service, - args.cluster, + get_context_list_from_args(args), not args.no_restart, ) diff --git a/scripts/update_config_and_restart_nodes_lib.py b/scripts/prod/update_config_and_restart_nodes_lib.py similarity index 61% rename from scripts/update_config_and_restart_nodes_lib.py rename to scripts/prod/update_config_and_restart_nodes_lib.py index 8bb05a0f1e7..8ce0484ec7e 100755 --- a/scripts/update_config_and_restart_nodes_lib.py +++ b/scripts/prod/update_config_and_restart_nodes_lib.py @@ -5,7 +5,7 @@ import subprocess import sys from enum import Enum -from typing import Optional +from typing import Any, Optional import tempfile import yaml @@ -47,24 +47,28 @@ def __init__(self, description: str, usage_example: str): epilog=usage_example, ) - # Add all required flags immediately on creation self._add_common_flags() def _add_common_flags(self): """Add all common flags.""" - self.parser.add_argument( + namespace_group = self.parser.add_mutually_exclusive_group(required=True) + namespace_group.add_argument( "-n", - "--namespace", - required=True, + "--namespace-prefix", help="The Kubernetes namespace prefix (e.g., apollo-sepolia-integration)", ) + namespace_group.add_argument( + "-N", + "--namespace-list", + nargs="+", + help="Space separated list of namespaces e.g., '--namespace-list apollo-sepolia-integration-0 apollo-sepolia-integration-2'", + ) self.parser.add_argument( - "-N", + "-m", "--num-nodes", - required=True, type=int, - help="The number of nodes to restart (required)", + help="The number of nodes to restart (required when specifying namespace-prefix)", ) self.parser.add_argument( @@ -75,8 +79,15 @@ def _add_common_flags(self): help="The starting index for node IDs (default: 0)", ) - self.parser.add_argument( - "-c", "--cluster", help="Optional cluster prefix for kubectl context" + cluster_group = self.parser.add_mutually_exclusive_group() + cluster_group.add_argument( + "-c", "--cluster-prefix", help="Optional cluster prefix for kubectl context" + ) + cluster_group.add_argument( + "-C", + "--cluster-list", + nargs="+", + help="Space separated list of cluster names for kubectl contexts", ) restart_group = self.parser.add_mutually_exclusive_group() @@ -136,13 +147,68 @@ def __init__(self, config_map_name: str, pod_name: str) -> None: def validate_arguments(args: argparse.Namespace) -> None: - if args.num_nodes <= 0: - print_error("Error: num-nodes must be a positive integer.") + if (args.namespace_list and args.cluster_prefix) or ( + args.namespace_prefix and args.cluster_list + ): + print_error("Error: Use either list mode or prefix mode. You cannot mix them.") sys.exit(1) - if args.start_index < 0: - print_error("Error: start-index must be a non-negative integer.") - sys.exit(1) + if args.namespace_list: + # List mode. + if args.start_index != 0: + print_error("Error: start-index cannot be set when namespace-list is specified.") + sys.exit(1) + if args.num_nodes: + print_error("Error: num-nodes cannot be set when namespace-list is specified.") + sys.exit(1) + if args.cluster_list: + if len(args.cluster_list) != len(args.namespace_list): + print_error( + "Error: cluster-list and namespace-list must have the same number of values." + ) + sys.exit(1) + else: + # Prefix mode. + if args.num_nodes is None: + print_error("Error: num-nodes is required when not in namespace-list mode.") + sys.exit(1) + + if args.num_nodes <= 0: + print_error("Error: num-nodes must be a positive integer.") + sys.exit(1) + + if args.start_index < 0: + print_error("Error: start-index must be a non-negative integer.") + sys.exit(1) + + +def get_namespace_list_from_args( + args: argparse.Namespace, +) -> list[str]: + """Get a list of namespaces based on the arguments""" + if args.namespace_list: + return args.namespace_list + + return [ + f"{args.namespace_prefix}-{i}" + for i in range(args.start_index, args.start_index + args.num_nodes) + ] + + +def get_context_list_from_args( + args: argparse.Namespace, +) -> list[str]: + """Get a list of contexts based on the arguments""" + if args.cluster_list: + return args.cluster_list + + if args.cluster_prefix is None: + return None + + return [ + f"{args.cluster_prefix}-{i}" + for i in range(args.start_index, args.start_index + args.num_nodes) + ] def run_kubectl_command(args: list, capture_output: bool = True) -> subprocess.CompletedProcess: @@ -156,19 +222,16 @@ def run_kubectl_command(args: list, capture_output: bool = True) -> subprocess.C sys.exit(1) -def get_namespace_args( - namespace: str, node_id: int, cluster_prefix: Optional[str] = None -) -> list[str]: - ret = ["-n", f"{namespace}-{node_id}"] - if cluster_prefix: - ret.extend(["--context", f"{cluster_prefix}-{node_id}"]) +def get_namespace_args(namespace: str, cluster: Optional[str] = None) -> list[str]: + ret = ["-n", f"{namespace}"] + if cluster: + ret.extend(["--context", f"{cluster}"]) return ret def get_configmap( namespace: str, - node_id: int, - cluster_prefix: Optional[str] = None, + cluster: Optional[str] = None, service: Service = Service.Core, ) -> str: """Get configmap YAML for a specific node""" @@ -179,7 +242,7 @@ def get_configmap( "-o", "yaml", ] - kubectl_args.extend(get_namespace_args(namespace, node_id, cluster_prefix)) + kubectl_args.extend(get_namespace_args(namespace, cluster)) result = run_kubectl_command(kubectl_args) return result.stdout @@ -251,8 +314,7 @@ def represent_literal_str(dumper, data): def update_config_values( config_content: str, - node_id: int, - config_overrides: dict[str, any] = None, + config_overrides: dict[str, Any] = None, ) -> str: """Update configuration values in the YAML content and return the updated YAML""" # Parse the configuration @@ -275,9 +337,9 @@ def normalize_config(config_content: str) -> str: return serialize_config_to_yaml(config, config_data) -def show_config_diff(old_content: str, new_content: str, node_id: int) -> None: +def show_config_diff(old_content: str, new_content: str, index: int) -> None: print_colored( - f"--------------------- Config changes to node no. {node_id}'s core service --------------------", + f"--------------------- Config changes {index} --------------------", Colors.YELLOW, ) @@ -287,8 +349,8 @@ def show_config_diff(old_content: str, new_content: str, node_id: int) -> None: diff = unified_diff( old_lines, new_lines, - fromfile=f"config{node_id}.yaml_old", - tofile=f"config{node_id}.yaml", + fromfile=f"config{index}.yaml_old", + tofile=f"config{index}.yaml", lineterm="", ) @@ -312,8 +374,8 @@ def ask_for_confirmation() -> bool: def apply_configmap( config_content: str, namespace: str, - node_id: int, - cluster_prefix: Optional[str] = None, + index: int, + cluster: Optional[str] = None, ) -> None: """Apply updated configmap""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: @@ -322,19 +384,17 @@ def apply_configmap( try: kubectl_args = ["apply", "-f", temp_file] - kubectl_args.extend(get_namespace_args(namespace, node_id, cluster_prefix)) + kubectl_args.extend(get_namespace_args(namespace, cluster)) run_kubectl_command(kubectl_args, capture_output=False) except Exception as e: - print_error(f"Failed applying config for node {node_id}: {e}") + print_error(f"Failed applying config for index {index}: {e}") sys.exit(1) -def restart_pod( - namespace: str, node_id: int, service: Service, cluster_prefix: Optional[str] = None -) -> None: - """Restart pod by deleting it""" +def get_pod_names(namespace: str, service: Service, cluster: Optional[str] = None) -> list[str]: + """Get the list of pods for a specific service""" # Get the list of pods (one string per line). kubectl_args = [ "get", @@ -347,6 +407,14 @@ def restart_pod( # Filter the list of pods to only include the ones that match the service and extract the pod name. pods = [pod.split("/")[1] for pod in pods if pod.startswith(f"pod/{service.pod_name}")] + return pods + + +def restart_pods( + namespace: str, pods: list[str], index: int, cluster: Optional[str] = None +) -> None: + """Restart pod by deleting it""" + if not pods: print_error(f"Could not find pods for service {service.pod_name}.") sys.exit(1) @@ -358,59 +426,77 @@ def restart_pod( "pod", pod, ] - kubectl_args.extend(get_namespace_args(namespace, node_id, cluster_prefix)) + kubectl_args.extend(get_namespace_args(namespace, cluster)) try: run_kubectl_command(kubectl_args, capture_output=False) - print_colored(f"Restarted {pod} for node {node_id}") + print_colored(f"Restarted {pod} for node {index}") except Exception as e: - print_error(f"Failed restarting {pod} for node {node_id}: {e}") + print_error(f"Failed restarting {pod} for node {index}: {e}") sys.exit(1) -def update_config_and_restart_nodes( - config_overrides: dict[str, any], - namespace: str, - num_nodes: int, - start_index: int, +def restart_node( + namespace: str, service: Service, index: int, cluster: Optional[str] = None +) -> None: + """Restart a single node by deleting its pod""" + pods = get_pod_names(namespace, service, cluster) + restart_pods(namespace, pods, index, cluster) + + +def restart_all_nodes( + namespace_list: list[str], + service: Service, + cluster_list: Optional[list[str]] = None, +) -> None: + """Restart nodes by deleting their pods""" + for index, namespace in enumerate(namespace_list): + cluster = cluster_list[index] if cluster_list else None + restart_node(namespace, service, index, cluster) + print_colored("\nAll pods have been successfully restarted!", Colors.GREEN) + + +def update_config( + config_overrides: dict[str, Any], + namespace_list: list[str], service: Service, - cluster_prefix: Optional[str] = None, + cluster_list: Optional[list[str]] = None, restart_nodes: bool = True, ) -> None: assert config_overrides is not None, "config_overrides must be provided" - assert namespace is not None, "namespace must be provided" - assert num_nodes > 0, "num_nodes must be a positive integer" - assert start_index >= 0, "start_index must be a non-negative integer" + assert namespace_list is not None and len(namespace_list) > 0, "namespaces must be provided" - if not cluster_prefix: + if not cluster_list: print_colored( - "CLUSTER_PREFIX not provided. Assuming all nodes are on the current cluster", + "cluster-prefix/cluster-list not provided. Assuming all nodes are on the current cluster", Colors.RED, ) + else: + assert len(cluster_list) == len( + namespace_list + ), f"cluster_list must have the same number of values as namespace_list. cluster_list: {cluster_list}, namespace_list: {namespace_list}" # Store original and updated configs for all nodes - configs = {} - - # Define the range of node IDs to process - node_ids = range(start_index, start_index + num_nodes) + configs = [] # Process each node's configuration - for node_id in node_ids: - print_colored(f"\nProcessing node {node_id}...") + for index, namespace in enumerate(namespace_list): + cluster = cluster_list[index] if cluster_list else None + print_colored( + f"\nProcessing node for namespace {namespace} (cluster: {cluster if cluster else 'current cluster'})..." + ) # Get current config and normalize it (e.g. " vs ') to ensure not showing bogus diffs. - original_config = normalize_config( - get_configmap(namespace, node_id, cluster_prefix, service) - ) + original_config = normalize_config(get_configmap(namespace, cluster, service)) # Update config - updated_config = update_config_values(original_config, node_id, config_overrides) + updated_config = update_config_values(original_config, config_overrides) # Store configs - configs[node_id] = {"original": original_config, "updated": updated_config} + configs.append({"original": original_config, "updated": updated_config}) # Show diff - show_config_diff(original_config, updated_config, node_id) + show_config_diff(original_config, updated_config, index) if not ask_for_confirmation(): print_error("Operation cancelled by user") @@ -418,15 +504,27 @@ def update_config_and_restart_nodes( # Apply all configurations print_colored("\nApplying configurations...") - for node_id in node_ids: - print(f"Applying config for node {node_id}...") - apply_configmap(configs[node_id]["updated"], namespace, node_id, cluster_prefix) + for index, config in enumerate(configs): + print(f"Applying config {index}...") + apply_configmap( + config["updated"], + namespace_list[index], + index, + cluster_list[index] if cluster_list else None, + ) + print("\nUpdate completed successfully!") + + +def update_config_and_restart_nodes( + config_overrides: dict[str, Any], + namespace_list: list[str], + service: Service, + cluster_list: Optional[list[str]] = None, + restart_nodes: bool = True, +) -> None: + update_config(config_overrides, namespace_list, service, cluster_list, restart_nodes) if restart_nodes: - for node_id in node_ids: - restart_pod(namespace, node_id, service, cluster_prefix) - print_colored("\nAll pods have been successfully restarted!", Colors.GREEN) + restart_all_nodes(namespace_list, service, cluster_list) else: print_colored("\nSkipping pod restart (--no-restart was specified)") - - print("\nOperation completed successfully!") diff --git a/scripts/restart_all_nodes_together.py b/scripts/restart_all_nodes_together.py deleted file mode 100755 index a6b327528ef..00000000000 --- a/scripts/restart_all_nodes_together.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 - -import json -import sys - -import urllib.error -import urllib.request -from update_config_and_restart_nodes_lib import ( - ApolloArgsParserBuilder, - Service, - print_colored, - print_error, - update_config_and_restart_nodes, -) - - -def main(): - usage_example = """ -Examples: - # Restart all nodes to at the next block after current feeder block - %(prog)s --namespace apollo-sepolia-integration --num-nodes 3 --feeder.integration-sepolia.starknet.io - %(prog)s -n apollo-sepolia-integration -N 3 -f feeder.integration-sepolia.starknet.io - - # Restart nodes with cluster prefix - %(prog)s -n apollo-sepolia-integration -N 3 -c my-cluster -f feeder.integration-sepolia.starknet.io - - # Update configuration without restarting nodes - %(prog)s -n apollo-sepolia-integration -N 3 -f feeder.integration-sepolia.starknet.io --no-restart - - # Restart nodes starting from specific node index - %(prog)s -n apollo-sepolia-integration -N 3 -s 5 -f feeder.integration-sepolia.starknet.io - - # Use different feeder URL - %(prog)s -n apollo-sepolia-integration -N 3 -f feeder.integration-sepolia.starknet.io - """ - - args_builder = ApolloArgsParserBuilder( - "Restart all nodes using the value from the feeder URL", usage_example - ) - - args_builder.add_argument( - "-f", - "--feeder_url", - required=True, - type=str, - help="The feeder URL to get the current block from", - ) - - args = args_builder.build() - - # Get current block number from feeder URL - try: - url = f"https://{args.feeder_url}/feeder_gateway/get_block" - with urllib.request.urlopen(url) as response: - if response.status != 200: - raise urllib.error.HTTPError( - url, response.status, "HTTP Error", response.headers, None - ) - data = json.loads(response.read().decode("utf-8")) - current_block_number = data["block_number"] - next_block_number = current_block_number + 1 - - print_colored(f"Current block number: {current_block_number}") - print_colored(f"Next block number: {next_block_number}") - - except urllib.error.URLError as e: - print_error(f"Failed to fetch block number from feeder URL: {e}") - sys.exit(1) - except KeyError as e: - print_error(f"Unexpected response format from feeder URL: {e}") - sys.exit(1) - except json.JSONDecodeError as e: - print_error(f"Failed to parse JSON response from feeder URL: {e}") - sys.exit(1) - - config_overrides = { - "consensus_manager_config.immediate_active_height": next_block_number, - "consensus_manager_config.cende_config.skip_write_height": next_block_number, - } - - update_config_and_restart_nodes( - config_overrides, - args.namespace, - args.num_nodes, - args.start_index, - Service.Core, - args.cluster, - not args.no_restart, - ) - - -if __name__ == "__main__": - main()