From a166e45d7b52266b3461392a0f5f013df0bdce57 Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Thu, 28 Aug 2025 04:24:45 +0300 Subject: [PATCH 01/11] test: fix returning logic in TestDownloadGTFSBundles --- internal/integration/gtfs_integration_test.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/integration/gtfs_integration_test.go b/internal/integration/gtfs_integration_test.go index ce2bb7d..4ffaf8a 100644 --- a/internal/integration/gtfs_integration_test.go +++ b/internal/integration/gtfs_integration_test.go @@ -32,17 +32,17 @@ func TestDownloadGTFSBundles(t *testing.T) { srv := server t.Run(fmt.Sprintf("ServerID_%d", srv.ID), func(t *testing.T) { t.Parallel() - staticBundle,err := gtfsService.DownloadGTFSBundle(ctx,srv.GtfsUrl, srv.ID,20) if err != nil { t.Errorf("failed to download GTFS bundle for server %d : %v", srv.ID, err) - err = gtfsService.StoreGTFSBundle(staticBundle,server.ID) - if err != nil { - t.Errorf("failed to store GTFS bundle for server %d : %v", srv.ID, err) - } - }else{ - t.Logf("GTFS bundle downloaded successfully for server %d", srv.ID) + return + } + err = gtfsService.StoreGTFSBundle(staticBundle,server.ID) + if err != nil { + t.Errorf("failed to store GTFS bundle for server %d : %v", srv.ID, err) + return } + t.Logf("GTFS bundle downloaded successfully for server %d", srv.ID) }) } } From 3b2918e047486e6de758d4f7c63e5e5a5012db5b Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Thu, 28 Aug 2025 04:34:35 +0300 Subject: [PATCH 02/11] security: restrict integration config file name to integration_config.json --- internal/integration/integration_test.go | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go index 45c2b8b..a9516be 100644 --- a/internal/integration/integration_test.go +++ b/internal/integration/integration_test.go @@ -12,18 +12,20 @@ import ( "flag" "fmt" "os" + "path/filepath" "testing" "watchdog.onebusaway.org/internal/models" ) + // integrationConfig stores the path to the integration configuration file, // passed using the -integration-config flag at test runtime. -var integrationConfig string +var integrationConfigPath string // init registers the -integration-config flag used to specify the path // to the integration test configuration file. func init() { - flag.StringVar(&integrationConfig, "integration-config", "", "Path to integration configuration file") + flag.StringVar(&integrationConfigPath, "integration-config", "", "Path to integration configuration file") } // integrationServers holds the list of OBA servers loaded from the config file. @@ -33,15 +35,24 @@ var integrationServers []models.ObaServer // TestMain handles setup before running integration tests. // It ensures the -integration-config flag is provided, // reads the config file, and unmarshals it into integrationServers. +// +// For security reasons, only files named `integration_config.json` are allowed to be loaded. +// Without this restriction, a user could supply any file path on the machine +// (e.g., /etc/passwd), and the application would attempt to read it. func TestMain(m *testing.M) { flag.Parse() - if integrationConfig == "" { + if integrationConfigPath == "" { fmt.Fprintln(os.Stderr, "Error: -integration-config flag is required for integration tests") os.Exit(1) } - data, err := os.ReadFile(integrationConfig) + if filepath.Base(integrationConfigPath) != "integration_config.json" { + fmt.Fprintln(os.Stderr,"invalid integration config file name: (only integration_config.json is allowed)", integrationConfigPath) + os.Exit(1) + } + + data, err := os.ReadFile(integrationConfigPath) if err != nil { fmt.Fprintf(os.Stderr, "Failed to read config file: %v\n", err) os.Exit(1) From a0b2b9a79dfe3fb0b1c5f549a2c660ec54441a29 Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Thu, 28 Aug 2025 04:36:05 +0300 Subject: [PATCH 03/11] refactor: rename integration_servers.json to integration_config.json --- .gitignore | 2 +- ...n_servers.json.template => integration_config.json.template} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename internal/integration/{integration_servers.json.template => integration_config.json.template} (100%) diff --git a/.gitignore b/.gitignore index 2ebde31..8664945 100644 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,4 @@ profile.cov config.json # Avoid Committing Integration testing configuration file -integration_servers.json \ No newline at end of file +integration_config.json \ No newline at end of file diff --git a/internal/integration/integration_servers.json.template b/internal/integration/integration_config.json.template similarity index 100% rename from internal/integration/integration_servers.json.template rename to internal/integration/integration_config.json.template From 50fa4d2c27927424c4a881dbeff601a9b1d8ac0b Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Thu, 28 Aug 2025 04:36:44 +0300 Subject: [PATCH 04/11] docs: update README with setup and usage instructions --- README.md | 231 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 120 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index 90a3516..0d225a5 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,51 @@ -# watchdog +# Watchdog [![Coverage Status](https://coveralls.io/repos/github/OneBusAway/watchdog/badge.svg?branch=main)](https://coveralls.io/github/OneBusAway/watchdog?branch=main) -Golang-based Watchdog service for OBA REST API servers, providing deep observability by exposing a rich suite of Prometheus metrics. These metrics enable comprehensive monitoring of API uptime, GTFS Static and GTFS-RT data integrity, vehicle telemetry, agency and stop coverage, and overall operational health. +**Watchdog** is a Go-based service that monitors [OneBusAway (OBA)](https://onebusaway.org/) REST API servers. +It exposes a comprehensive set of **Prometheus metrics** for monitoring: -You can find documentation for the currently exposed metrics along with an interpretation guide [here](./docs/METRICS.md). +- GTFS Static and GTFS-RT data integrity +- Vehicle telemetry +- Agency and stop coverage +- Overall operational health + See the full list of metrics and interpretation guide [here](./docs/METRICS.md) -# Requirements +## Table of Contents -Go 1.23 or higher +- [Requirements](#requirements) +- [Setup](#setup) + - [Configuration](#configuration) + - [Example config.json](#example-configjson) + - [Ways to Provide the Config File](#ways-to-provide-the-config-file) + - [Local Configuration (recommended for development)](#1-local-configuration-recommended-for-development) + - [Remote Configuration (recommended for production)](#2-remote-configuration-recommended-for-production) + - [Application Options](#application-options) + - [Environment Variables](#environment-variables) +- [Running](#running) + - [Docker Compose (recommended)](#1-docker-compose-recommended) + - [Watchdog Only](#2-watchdog-only) + - [Local Config](#local-config) + - [Remote Config (with auth)](#remote-config-with-auth) + - [Docker (single container)](#3-docker-single-container) + - [Build image](#build-image) + - [Run with local config](#run-with-local-config) + - [Run with remote config](#run-with-remote-config) +- [Testing](#testing) + - [Unit Tests](#unit-tests) + - [Integration Tests](#integration-tests) -## Configuration +## Requirements -The watchdog service can be configured using either: +- **Go 1.23+** -- A **local JSON configuration file** (`--config-file`). -- A **remote JSON configuration URL** (`--config-url`). - - Using a remote configuration allows you to dynamically add, remove, or update server configurations at runtime. +## Setup -### JSON Configuration Format +### Configuration -The JSON configuration file should contain an array of `ObaServer` objects, each representing an OBA server to be monitored. Example: +Watchdog requires a configuration file (`config.json`) before running. Even placeholder data is necessary to start the service. + +#### Example `config.json` ```json [ @@ -35,179 +60,163 @@ The JSON configuration file should contain an array of `ObaServer` objects, each "gtfs_rt_api_key": "api-key-1", "gtfs_rt_api_value": "api-value-1", "agency_id": "agency-1" - }, - { - "name": "Test Server 2", - "id": 2, - "oba_base_url": "https://test2.example.com", - "oba_api_key": "test-key-2", - "gtfs_url": "https://gtfs2.example.com", - "trip_update_url": "https://trip2.example.com", - "vehicle_position_url": "https://vehicle2.example.com", - "gtfs_rt_api_key": "api-key-2", - "gtfs_rt_api_value": "api-value-2", - "agency_id": "agency-2" } ] ``` -### Local Configuration Setup +#### Ways to Provide the Config File -1. Either copy or rename `config.json.template` to `config.json` in the same folder. -2. Update `config.json` with your OBA server values. +#### 1. Local Configuration (recommended for development) -Note: The `config.json` file is ignored by Git to prevent accidental commits of sensitive configuration data. +- Copy or rename `config.json.template` → `config.json` +- Fill in your server values +- Run with: + +```bash +go run ./cmd/watchdog/ --config-file path/to/config.json +``` -### Remote Configuration Setup +Note: -1. Create a `config.json` file based on the `config.json.template` format. -2. Fill in your OBA server values in `config.json`. -3. Host it publicly and point the app to its URL. +- ⚠️The file **must** be named `config.json` +- `config.json` is Git-ignored (to protect secrets) -## Sentry Configuration +#### 2. Remote Configuration (recommended for production) -To enable Sentry error tracking, set the `SENTRY_DSN` environment variable with your Sentry DSN. +- Prepare `config.json` as above +- Host it publicly (or on a private server) +- Run with: -```sh -export SENTRY_DSN="your_sentry_dsn" +```bash +go run ./cmd/watchdog/ --config-url http://example.com/config.json ``` -# Running - -#### **Using a Local Configuration File** +If authentication is required, set: ```bash -go run ./cmd/watchdog/ --config-file ./config.json +export CONFIG_AUTH_USER="username" +export CONFIG_AUTH_PASS="password" ``` -## **Using a Remote Configuration URL with Authentication** +### Application Options -To load the configuration from a remote URL that requires basic authentication, follow these steps: +- **Fetch Interval** → default `30s` (`--fetch-interval `) +- **Environment** → `development` (default), `staging`, `production` (`--env `) +- **Port** → default `4000` (`--port `) -### 1. **Set the Required Environment Variables** +⚠️ If running with **Docker Compose**, Prometheus runs on `9090` and Grafana on `3000`. Don’t use those ports. -Before running the application, set the `CONFIG_AUTH_USER` and `CONFIG_AUTH_PASS` environment variables with the username and password for authentication. +### Environment Variables -#### On Linux/macOS: +- **Sentry DSN** ```bash -export CONFIG_AUTH_USER="your_username" -export CONFIG_AUTH_PASS="your_password" + export SENTRY_DSN="your_sentry_dsn" ``` -#### On Windows +- **Config Auth (for remote configs)** ```bash -set CONFIG_AUTH_USER=your_username -set CONFIG_AUTH_PASS=your_password + export CONFIG_AUTH_USER="username" + export CONFIG_AUTH_PASS="password" ``` -#### Run the Application with the Remote URL +## Running -Use the --config-url flag to specify the remote configuration URL. For example: +### 1. Docker Compose (recommended) + +Run Watchdog with **Prometheus + Grafana**: ```bash -go run ./cmd/watchdog/ \ - --config-url http://example.com/config.json +docker compose up --build ``` -## **Running with Docker** +Services: + +- Watchdog → `4000` +- Prometheus → `9090` +- Grafana → `3000` -You can also run the application using Docker. Here’s how: +Stop services: -### 1. **Build the Docker Image** +```bash +docker compose down +``` -First, build the Docker image for the application. Navigate to the root of the project directory and run: +Restart services: ```bash -docker build -t watchdog . +docker compose restart ``` -### 2. **Run the Docker Container** +Grafana auto-loads a Go runtime dashboard. Prometheus is pre-configured to scrape Watchdog. -#### 2.1 **Using Local Config** +### 2. Watchdog Only + +#### Local Config ```bash -docker run -d \ - --name watchdog \ - -e CONFIG_AUTH_USER=admin \ - -e CONFIG_AUTH_PASS=password \ - -v ./config.json:/app/config.json \ - -p 4000:4000 \ - watchdog \ - --config-file /app/config.json +go run ./cmd/watchdog/ --config-file path/to/config.json ``` -#### 2.2 **Using Remote Config** +#### Remote Config (with auth) ```bash -docker run -d \ - --name watchdog \ - -e CONFIG_AUTH_USER=admin \ - -e CONFIG_AUTH_PASS=password \ - -p 4000:4000 \ - watchdog \ +go run ./cmd/watchdog/ \ --config-url http://example.com/config.json ``` -## **Running with Docker Compose** - -You can run the Watchdog service along with Prometheus and Grafana for monitoring using Docker Compose. - -**Important:** - -> Make sure `config.json` exists before running any Docker commands. -> It must be a **file**, not a folder. +### 3. Docker (single container) -### **Start all services** +#### Build image ```bash -docker compose up -d +docker build -t watchdog . ``` -This will start: - -- **Watchdog** on port `4000` -- **Prometheus** on port `9090` -- **Grafana** on port `3000` - -### **Stop all services** +#### Run with local config ```bash -docker compose down +docker run -d \ + --name watchdog \ + -v ./config.json:/app/config.json \ + -p 4000:4000 \ + watchdog \ + --config-file /app/config.json ``` -### **Restart all services** +#### Run with remote config ```bash -docker compose restart +docker run -d \ + --name watchdog \ + -e CONFIG_AUTH_USER=admin \ + -e CONFIG_AUTH_PASS=password \ + -p 4000:4000 \ + watchdog \ + --config-url http://example.com/config.json ``` -Grafana will automatically load the pre-provisioned Go runtime dashboard, and Prometheus will be configured to scrape Watchdog metrics. +## Testing -# Testing - -### To run all unit test cases: +### Unit Tests ```bash go test ./... ``` -### To run integration testing - -Follow these steps: - -1. Open the file `integration_servers.json.template` inside the `internal/integration` package. -2. Rename it to `integration_servers.json`. -3. Fill in your OBA server configuration values. +### Integration Tests -Then run: +- Copy `integration_config.json.template` → `integration_config.json` +- Fill in OBA server values +- Run: ```bash -go test -tags=integration ./internal/integration -integration-config ./integration_servers.json +go test -tags=integration ./internal/integration \ + -integration-config path/to/integration_config.json ``` Note: -- The `integration_servers.json` file is ignored by Git to prevent accidental commits of sensitive data. -- You can point to any config file by passing its path using the -integration-config flag. +- ⚠️ the file **must** be named `integration_config.json` +- It’s Git-ignored for safety From 6778408f6cadb875635d87a9eff1e3600211568c Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Thu, 28 Aug 2025 08:25:05 +0300 Subject: [PATCH 05/11] docs: add contributing guidelines --- CONTRIBUTING.md | 43 +++++++++++++++++++++++++++++++++++++++++++ README.md | 5 +++++ 2 files changed, 48 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..78cc788 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,43 @@ +# Contributing to Watchdog + +Participating in this project means you agree to follow the [OBA Code of Conduct](https://github.com/OneBusAway/onebusaway/blob/master/CODE_OF_CONDUCT.md). + +## How to Contribute + +1. Fork the repository by clicking the fork button in the top-right corner. +2. Clone your fork locally and set the upstream to the official Watchdog repository. +3. Follow the setup instructions [here](./README.md#setup). +4. Keep changes scoped: + - Use a dedicated branch per bug fix or feature. + - This allows you to open and manage multiple pull requests in parallel. +5. Stay up to date: + - Sync your fork regularly with the upstream `main` branch to avoid conflicts. +6. **Open an issue first** for new features or significant changes so we can discuss design and direction. +7. Follow [code guidelines](#code-guidelines) and write conventional commit messages. +8. Submit a Pull Request + - Clearly describe the problem being solved. + - Reference related issues in your commits or PR description (e.g., `Fixes #42: Improve GTFS parsing`). + +## Code Guidelines + +- Run Watchdog using one of the methods described [here](./README.md#running) and ensure there are no build or runtime errors. + +- **Formatting**: Run the Go formatter before committing. + +```bash + go fmt ./... +``` + +- **Linting**: Run `go vet` to catch common mistakes. + +```bash + go vet ./... +``` + +- **Testing**: + - Write unit tests for new functionality. + - Ensure the full test suite passes with: + +```bash + go test ./... +``` diff --git a/README.md b/README.md index 0d225a5..17bc0d3 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ It exposes a comprehensive set of **Prometheus metrics** for monitoring: - [Testing](#testing) - [Unit Tests](#unit-tests) - [Integration Tests](#integration-tests) +- [Contributing](#contributing) ## Requirements @@ -220,3 +221,7 @@ Note: - ⚠️ the file **must** be named `integration_config.json` - It’s Git-ignored for safety + +## Contributing + +Refer to [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed contribution guidelines. From 5952848b47329c3e51c2b3c26b90055c8ba145ff Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Fri, 29 Aug 2025 06:39:34 +0300 Subject: [PATCH 06/11] docs: add project Code of Conduct file --- CODE_OF_CONDUCT.md | 3 +++ CONTRIBUTING.md | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..790761b --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,3 @@ +# Code of Conduct + +By participating in this project, you agree to comply with the [OBA Code of Conduct](https://github.com/OneBusAway/onebusaway/blob/master/CODE_OF_CONDUCT.md). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 78cc788..747371b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ # Contributing to Watchdog -Participating in this project means you agree to follow the [OBA Code of Conduct](https://github.com/OneBusAway/onebusaway/blob/master/CODE_OF_CONDUCT.md). +Please note that this project is released with a [Code of Conduct](./CODE_OF_CONDUCT.md); by participating, you agree to uphold its terms. ## How to Contribute From 6ac97bcdf78c817715ab4409cca85e6b28186f47 Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Fri, 29 Aug 2025 06:50:04 +0300 Subject: [PATCH 07/11] docs: remove Table of Contents from README --- README.md | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/README.md b/README.md index 17bc0d3..c389b19 100644 --- a/README.md +++ b/README.md @@ -11,31 +11,6 @@ It exposes a comprehensive set of **Prometheus metrics** for monitoring: - Overall operational health See the full list of metrics and interpretation guide [here](./docs/METRICS.md) -## Table of Contents - -- [Requirements](#requirements) -- [Setup](#setup) - - [Configuration](#configuration) - - [Example config.json](#example-configjson) - - [Ways to Provide the Config File](#ways-to-provide-the-config-file) - - [Local Configuration (recommended for development)](#1-local-configuration-recommended-for-development) - - [Remote Configuration (recommended for production)](#2-remote-configuration-recommended-for-production) - - [Application Options](#application-options) - - [Environment Variables](#environment-variables) -- [Running](#running) - - [Docker Compose (recommended)](#1-docker-compose-recommended) - - [Watchdog Only](#2-watchdog-only) - - [Local Config](#local-config) - - [Remote Config (with auth)](#remote-config-with-auth) - - [Docker (single container)](#3-docker-single-container) - - [Build image](#build-image) - - [Run with local config](#run-with-local-config) - - [Run with remote config](#run-with-remote-config) -- [Testing](#testing) - - [Unit Tests](#unit-tests) - - [Integration Tests](#integration-tests) -- [Contributing](#contributing) - ## Requirements - **Go 1.23+** From d78a65472cbd94c58bd0d53d7d39301f71dafb79 Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Fri, 29 Aug 2025 07:12:00 +0300 Subject: [PATCH 08/11] docs: add documentation for backoff helper functions --- internal/config/backoff_time_store.go | 37 +++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/internal/config/backoff_time_store.go b/internal/config/backoff_time_store.go index 0994aca..1ecef9c 100644 --- a/internal/config/backoff_time_store.go +++ b/internal/config/backoff_time_store.go @@ -10,28 +10,42 @@ import ( ) const ( - BASE_BACKOFF = 1 * time.Second - MAX_BACKOFF = 2 * time.Minute + // BASE_BACKOFF is the initial backoff delay before the first retry. + BASE_BACKOFF = 1 * time.Second + // MAX_BACKOFF is the upper limit for the backoff delay. + MAX_BACKOFF = 2 * time.Minute + // BACKOFF_FACTOR is the multiplier used to increase the backoff delay after each retry. BACKOFF_FACTOR = 2.0 - JITTER_FACTOR = 0.5 + // JITTER_FACTOR is the proportion of randomness applied to the backoff delay. + // It helps avoid synchronized retries across multiple clients. + JITTER_FACTOR = 0.5 ) +// backoffData holds the backoff delay and the timestamp of the next retry attempt +// for a given server. type backoffData struct { + // BackoffDelay is the current delay duration before retrying. BackoffDelay time.Duration - NextRetryAt time.Time + // NextRetryAt is the absolute timestamp when the next retry can be attempted. + NextRetryAt time.Time } +// BackoffStore manages backoff state for multiple servers. +// It is safe for concurrent use across goroutines. type BackoffStore struct { mu sync.RWMutex backoffs map[int]backoffData } +// NewBackoffStore creates and returns a new BackoffStore instance. func NewBackoffStore() *BackoffStore { return &BackoffStore{ backoffs: make(map[int]backoffData), } } +// NextRetryAt retrieves the next retry time for the given server ID. +// It returns the timestamp in UTC and a boolean indicating whether the server has an active backoff. func (s *BackoffStore) NextRetryAt(serverID int) (time.Time, bool) { s.mu.RLock() defer s.mu.RUnlock() @@ -41,6 +55,8 @@ func (s *BackoffStore) NextRetryAt(serverID int) (time.Time, bool) { return time.Time{}, false } +// UpdateBackoff updates the backoff delay and next retry time for the given server ID. +// If no backoff exists for the server, it initializes one with BASE_BACKOFF. func (s *BackoffStore) UpdateBackoff(serverID int) { s.mu.Lock() defer s.mu.Unlock() @@ -57,6 +73,7 @@ func (s *BackoffStore) UpdateBackoff(serverID int) { } } +// ResetBackoff removes any existing backoff data for the given server ID. func (s *BackoffStore) ResetBackoff(serverID int) { s.mu.Lock() defer s.mu.Unlock() @@ -64,6 +81,10 @@ func (s *BackoffStore) ResetBackoff(serverID int) { delete(s.backoffs, serverID) } +// DoWithBackoff executes an HTTP request with exponential backoff on failure. +// - If maxRetries is zero, it retries indefinitely. +// - If the context is canceled, it returns immediately. +// It applies jitter to avoid synchronized retries across clients. func DoWithBackoff(ctx context.Context, client *http.Client, req *http.Request, maxRetries int) (*http.Response, error) { backoffDelay := BASE_BACKOFF retries := 0 @@ -73,11 +94,13 @@ func DoWithBackoff(ctx context.Context, client *http.Client, req *http.Request, if err == nil { return resp, nil } - // if maxRetries set to zero it will retry forever + + // If maxRetries is greater than zero and reached, stop retrying. if maxRetries > 0 && retries >= maxRetries { return nil, fmt.Errorf("max retries exceeded: %w", err) } + // Wait for either the backoff delay or context cancellation. select { case <-ctx.Done(): return nil, ctx.Err() @@ -89,6 +112,8 @@ func DoWithBackoff(ctx context.Context, client *http.Client, req *http.Request, } } +// calculateNextRetryAt returns the next retry time by adding jitter to the given backoff duration. +// The result is capped at MAX_BACKOFF and returned as a UTC timestamp. func calculateNextRetryAt(backoff time.Duration) time.Time { // Adding jitter for backoff retries. Cryptographic randomness is not required. // #nosec G404 @@ -100,6 +125,8 @@ func calculateNextRetryAt(backoff time.Duration) time.Time { return time.Now().Add(backoff).UTC() } +// calculateNewBackoffDelay increases the given backoff delay by BACKOFF_FACTOR. +// The result is capped at MAX_BACKOFF. func calculateNewBackoffDelay(backoffDelay time.Duration) time.Duration { backoffDelay *= BACKOFF_FACTOR if backoffDelay >= MAX_BACKOFF { From 550c5e5e2f20e5143b0f8f6c9ceb238151fdee88 Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Fri, 29 Aug 2025 08:19:46 +0300 Subject: [PATCH 09/11] docs: add explanation for exponential backoff --- internal/app/metrics_collector.go | 26 ++++++++++++++++++++++++++ internal/config/config_loader.go | 6 +++++- internal/gtfs/gtfs_bundles.go | 9 ++++++--- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/internal/app/metrics_collector.go b/internal/app/metrics_collector.go index a78be1a..7bd995a 100644 --- a/internal/app/metrics_collector.go +++ b/internal/app/metrics_collector.go @@ -69,6 +69,28 @@ func (app *Application) StartMetricsCollection(ctx context.Context) { // but the process continues unless the GTFS-RT feed fails — in which case the function returns early, // as later checks depend on the real-time data. // +// Exponential Backoff: +// +// Unlike typical blocking backoff (e.g., retry loops with time.Sleep), this function uses a +// per-server backoff map (BackoffStore). Each server has a backoff delay and a calculated +// nextRetryAt timestamp. Before attempting a ping, the function checks whether the current time +// is still before nextRetryAt; if so, the entire metrics collection for that server is skipped. +// +// When a server fails, its backoff delay is increased exponentially and nextRetryAt is updated. +// When a server responds successfully, its backoff state is reset. +// +// Why this design? +// +// The StartMetricsCollection scheduler runs every `FetchInterval` (default: 30 seconds) for each server. +// If we used a standard blocking backoff inside CollectMetricsForServer, the backoff delay could exceed +// 30 seconds, causing overlapping goroutines (multiple retries running concurrently for the same +// server). By storing backoff state per server and checking it on each scheduled run, we ensure: +// - No overlapping goroutines per server. +// - Retry intervals still grow exponentially. +// - Retries align with the FetchInterval collection cycle, meaning the effective backoff wait time +// is always a multiple of FetchInterval. +// This keeps retries predictable, efficient, and non-blocking across all monitored servers. +// // Purpose: // - Centralizes all server-level metric gathering for reusability and testability. // - Ensures that all health and performance indicators are collected in one place. @@ -79,8 +101,10 @@ func (app *Application) StartMetricsCollection(ctx context.Context) { // - Sentry reports are tagged for fast debugging and correlation in distributed systems. // - Dependencies are injected (via app fields) to support testability and separation of concerns. func (app *Application) CollectMetricsForServer(server models.ObaServer) { + // Check if server has an active backoff period nextRetryAt, exists := app.ConfigService.BackoffStore.NextRetryAt(server.ID) if exists && time.Now().UTC().Before(nextRetryAt) { + // Still in backoff → skip metrics collection app.Logger.Info("Skipping metrics collection for server due to backoff", "server_id", server.ID, "next_retry_at", nextRetryAt) report.ReportErrorWithSentryOptions(fmt.Errorf("skipping metrics collection for server %s due to backoff", server.ObaBaseURL), report.SentryReportOptions{ Tags: map[string]string{ @@ -97,6 +121,7 @@ func (app *Application) CollectMetricsForServer(server models.ObaServer) { ok := app.MetricsService.ServerPing(server) if !ok { + // On ping failure → increase backoff for this server app.Logger.Error("Server ping failed", "server_id", server.ID, "server_name", server.Name) report.ReportErrorWithSentryOptions(fmt.Errorf("server ping failed for %s", server.ObaBaseURL), report.SentryReportOptions{ Tags: map[string]string{ @@ -113,6 +138,7 @@ func (app *Application) CollectMetricsForServer(server models.ObaServer) { return } + // On successful ping → reset backoff for this server app.Logger.Info("Server ping successful", "server_id", server.ID, "server_name", server.Name) app.ConfigService.BackoffStore.ResetBackoff(server.ID) diff --git a/internal/config/config_loader.go b/internal/config/config_loader.go index 960d23b..daa7566 100644 --- a/internal/config/config_loader.go +++ b/internal/config/config_loader.go @@ -103,12 +103,16 @@ func loadConfigFromFile(filePath string) ([]models.ObaServer, error) { return servers, nil } -// LoadConfigFromURL fetches a JSON configuration from a remote HTTP(S) endpoint, +// loadConfigFromURL fetches a JSON configuration from a remote HTTP(S) endpoint, // using the provided client and optional basic authentication. // // It validates the response status, reads the body, and unmarshals the configuration // into a slice of `models.ObaServer`. // +// Requests are executed with exponential backoff using DoWithBackoff. This ensures +// that transient network errors (e.g., timeouts, connection failures) are retried +// with increasing delays, up to `maxRetries` attempts. +// // Errors are logged and reported to Sentry for observability. func loadConfigFromURL(ctx context.Context, client *http.Client, url, authUser, authPass string, maxRetries int) ([]models.ObaServer, error) { req, err := http.NewRequest("GET", url, nil) diff --git a/internal/gtfs/gtfs_bundles.go b/internal/gtfs/gtfs_bundles.go index f485ddf..6ea1cd3 100644 --- a/internal/gtfs/gtfs_bundles.go +++ b/internal/gtfs/gtfs_bundles.go @@ -106,10 +106,12 @@ func refreshGTFSBundles(ctx context.Context, servers []models.ObaServer, logger } // downloadAndStoreGTFSBundle fetches a GTFS static bundle from the provided URL, -// parses it, and stores the resulting static data in the given StaticStore using the serverID as the key. +// parses it, and stores the resulting static data in the given StaticStore using +// the serverID as the key. Requests are executed with exponential backoff to handle +// transient network errors (e.g., timeouts, connection failures). // // It performs the following steps: -// 1. Makes an HTTP GET request to download the GTFS bundle. +// 1. Makes an HTTP GET request (with exponential backoff) to download the GTFS bundle. // 2. Reads and parses the response body as GTFS static data. // 3. Stores the parsed data in the StaticStore. // @@ -117,10 +119,11 @@ func refreshGTFSBundles(ctx context.Context, servers []models.ObaServer, logger // - url: The URL of the GTFS static bundle (usually a zip file). // - serverID: The identifier used to store and retrieve the static data from the store. // - staticStore: The in-memory store that holds GTFS static data indexed by server ID. +// - maxRetries: The maximum number of retry attempts allowed during exponential backoff +// before giving up on reaching the server // // Returns: // - gtfs static data -// - boolean severs as a sign if the request reached the server or not (server timeout or down) // - error: Describes what went wrong, or nil if the operation was successful. func downloadGTFSBundle(ctx context.Context, url string, serverID int, maxRetries int) (*remoteGtfs.Static, error) { From f36f367149137fa3a11333f08001bfe4203163ca Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Fri, 29 Aug 2025 09:08:13 +0300 Subject: [PATCH 10/11] docs(readme): update function documentation --- internal/app/routes.go | 34 ++++++++++---------- internal/config/config_loader.go | 28 ++++++++++++----- internal/gtfs/gtfs_bundles.go | 53 ++++++++++++++++++++++++-------- 3 files changed, 78 insertions(+), 37 deletions(-) diff --git a/internal/app/routes.go b/internal/app/routes.go index d17d81d..6c7a460 100644 --- a/internal/app/routes.go +++ b/internal/app/routes.go @@ -13,9 +13,11 @@ import ( // Routes sets up the HTTP routing configuration for the application and returns the final http.Handler. // -// This function initializes a new `httprouter.Router`, registers all application routes -// with their corresponding handler functions and HTTP methods, and wraps the entire router -// with Sentry middleware for centralized error tracking and performance monitoring. +// This function initializes a new `httprouter.Router`, registers all routes with their handlers, +// and wraps the router with Sentry and security middleware. +// +// Parameters: +// - ctx: Carries deadlines/cancellation and is passed to the cached Prometheus handler. // // Registered Routes: // - GET /v1/healthcheck: @@ -27,26 +29,22 @@ import ( // reduces collection overhead by caching exposition output for a configurable duration. // // Middleware: -// - `middleware.SentryMiddleware`: -// Wraps the router to automatically capture any panics, errors, or performance issues -// and report them to Sentry with contextual request data. +// - middleware.SentryMiddleware: +// Captures panics/errors and reports them to Sentry with request context. +// - middleware.SecurityHeaders: +// Adds HTTP security-related headers to every response. // // Purpose: -// - Centralize route registration for modularity and testability. -// - Establish a clear entry point for all incoming HTTP traffic. -// - Ensure observability via Prometheus and Sentry integrations. +// - Centralize route registration for modularity, testability, and a clear entry point +// for all incoming HTTP traffic. +// - Wire core middleware to ensure observability (Sentry, Prometheus) and baseline +// HTTP security headers. // - Improve performance and reduce Prometheus scrape overhead through cached metrics. +// - Support graceful cancellation and shutdown via ctx. // // Returns: // - An `http.Handler` instance that the server can use to handle incoming HTTP requests. -// -// Usage: -// -// Typically called during application startup and passed to `http.Server`: -// server := &http.Server{ -// Addr: ":4000", -// Handler: app.Routes(ctx), -// } + func (app *Application) Routes(ctx context.Context) http.Handler { // Initialize a new httprouter router instance. router := httprouter.New() @@ -58,7 +56,7 @@ func (app *Application) Routes(ctx context.Context) http.Handler { router.HandlerFunc(http.MethodGet, "/v1/healthcheck", app.healthcheckHandler) router.Handler(http.MethodGet, "/metrics", middleware.NewCachedPromHandler(ctx, prometheus.DefaultGatherer, 10*time.Second)) - // Wrap router with Sentry and securityHeaders middlewares + // Wrap router with Sentry and SecurityHeaders middlewares // Return wrapped httprouter instance. handler := middleware.SentryMiddleware(router) return middleware.SecurityHeaders(handler) diff --git a/internal/config/config_loader.go b/internal/config/config_loader.go index daa7566..f11db42 100644 --- a/internal/config/config_loader.go +++ b/internal/config/config_loader.go @@ -32,16 +32,30 @@ func ValidateConfigFlags(configFile, configURL *string) error { return nil } -// refreshConfig starts a background goroutine that periodically fetches configuration -// from a remote URL and updates the application's list of OBA servers. +// refreshConfig starts a background goroutine that periodically fetches +// configuration from a remote URL and updates the application's list of OBA servers. // -// It uses the provided HTTP client to make GET requests with optional basic auth, -// and on success, updates the application's configuration via `app.Config.UpdateConfig`. +// The fetch process is resilient: +// - It uses `loadConfigFromURL`, which applies exponential backoff retries +// (up to `maxRetries`) when transient network or parsing errors occur. +// - On success, the application's configuration is updated via `cfg.UpdateConfig`. +// - On failure, errors are logged and reported to Sentry, but the loop continues, +// ensuring that the service keeps running even under repeated failures. // -// Errors during fetch or parse are logged and reported to Sentry, but the loop continues, -// ensuring resiliency in the presence of transient issues. +// The function runs in a loop, sleeping for the specified `interval` between +// refresh attempts, and terminates gracefully when the context is canceled. // -// The routine stops gracefully when the context is canceled. +// Parameters: +// - ctx: Context for graceful cancellation of the refresh routine. +// - client: HTTP client used to fetch the remote config. +// - configURL: Remote URL to load configuration from. +// - configAuthUser: Optional username for basic authentication. +// - configAuthPass: Optional password for basic authentication. +// - cfg: Pointer to the application Config object to update. +// - logger: Logger for structured log output. +// - interval: Time duration between consecutive refresh attempts. +// - maxRetries: Maximum number of exponential backoff retries per fetch attempt. + func refreshConfig(ctx context.Context, client *http.Client, configURL, configAuthUser, configAuthPass string, cfg *Config, logger *slog.Logger, interval time.Duration, maxRetries int) { for { select { diff --git a/internal/gtfs/gtfs_bundles.go b/internal/gtfs/gtfs_bundles.go index 6ea1cd3..0ff43f3 100644 --- a/internal/gtfs/gtfs_bundles.go +++ b/internal/gtfs/gtfs_bundles.go @@ -20,21 +20,29 @@ import ( "watchdog.onebusaway.org/internal/utils" ) -// downloadGTFSBundles fetches and processes GTFS static bundles for a list of OBA servers. +// downloadGTFSBundles fetches and processes GTFS static bundles concurrently for a list of OBA servers. // -// For each server, it: -// 1. Downloads and parses the GTFS static bundle using the server's GTFS URL. -// 2. Stores the parsed data in the provided StaticStore. -// 3. Computes a geographic bounding box based on the stop locations in the static data. +// For each server, it starts a dedicated goroutine that: +// 1. Attempts to download and parse the GTFS static bundle from the server’s GTFS URL, +// using exponential backoff with retries (up to maxRetries). +// 2. Stores the parsed GTFS static data in the provided StaticStore, keyed by server ID. +// 3. Computes a geographic bounding box from the stop locations in the static data. // 4. Stores the bounding box in the provided BoundingBoxStore. // +// Concurrency: +// - A goroutine is launched for each server. +// - sync.WaitGroup is used to ensure all goroutines complete before the function returns. +// - Errors are handled per-server, reported via Sentry and logs, but do not stop processing other servers. +// // Parameters: +// - ctx: Context used to manage cancellation and timeouts across all goroutines. // - servers: A list of OBA servers, each containing a GTFS URL and unique ID. -// - logger: A structured logger used to record success/failure logs for each server. -// - boundingBoxStore: A store for bounding boxes, one per server. +// - logger: A structured logger for recording success/failure logs. +// - boundingBoxStore: A store for computed bounding boxes, one per server. // - staticStore: A store for parsed GTFS static data, keyed by server ID. +// - maxRetries: The maximum number of retries (with exponential backoff) when downloading a bundle. // -// This function does not return an error; failures are handled and reported per-server. +// This function does not return an error; failures are handled and reported individually per server. func downloadGTFSBundles(ctx context.Context, servers []models.ObaServer, logger *slog.Logger, boundingBoxStore *geo.BoundingBoxStore, staticStore *StaticStore, maxRetries int) { var wg sync.WaitGroup @@ -77,9 +85,10 @@ func downloadGTFSBundles(ctx context.Context, servers []models.ObaServer, logger // refreshGTFSBundles periodically refreshes GTFS static bundles for a list of OBA servers. // // It runs in a loop, triggered at the specified interval, and performs the following: -// 1. Logs the refresh operation. -// 2. Calls DownloadGTFSBundles to fetch, parse, and store updated GTFS data. -// 3. Updates geographic bounding boxes based on the downloaded data. +// 1. Logs the refresh operation. +// 2. Calls downloadGTFSBundles to fetch, parse, and store updated GTFS data for all servers. +// - Each server’s bundle download uses exponential backoff with retries, up to maxRetries attempts. +// 3. Updates geographic bounding boxes based on the downloaded data. // // The function listens for context cancellation (`ctx.Done()`) to gracefully stop the refresh routine. // @@ -88,8 +97,10 @@ func downloadGTFSBundles(ctx context.Context, servers []models.ObaServer, logger // - servers: List of OBA servers to fetch GTFS data from. // - logger: Logger for structured logging of refresh activity. // - interval: Time duration between each refresh cycle. -// - boundingBoxstore: Store to keep geographic bounding boxes per server. +// - boundingBoxStore: Store to keep geographic bounding boxes per server. // - staticStore: Store to keep parsed GTFS static data per server. +// - maxRetries: Maximum number of retries (with exponential backoff) for each server’s bundle download. + func refreshGTFSBundles(ctx context.Context, servers []models.ObaServer, logger *slog.Logger, interval time.Duration, boundingBoxstore *geo.BoundingBoxStore, staticStore *StaticStore, maxRetries int) { ticker := time.NewTicker(interval) defer ticker.Stop() @@ -188,6 +199,24 @@ func downloadGTFSBundle(ctx context.Context, url string, serverID int, maxRetrie } +// storeGTFSBundle stores a parsed GTFS static bundle in memory and computes its bounding box. +// +// The function performs the following: +// 1. Wraps the GTFS static bundle into a StaticData object, keeping only the relevant parts +// needed by the application to avoid storing the full bundle in memory. +// 2. Stores the StaticData in the StaticStore, keyed by serverID. +// 3. Computes the bounding box from the stops in the GTFS data. +// 4. Stores the bounding box in the BoundingBoxStore, also keyed by serverID. +// +// Parameters: +// - staticBundle: The parsed GTFS static bundle containing routes, stops, and other transit data. +// - serverID: The identifier used to store and retrieve data for a specific server. +// - staticStore: The in-memory store holding GTFS static data indexed by server ID. +// - boundingBoxStore: The in-memory store holding computed bounding boxes for GTFS data. +// +// Returns: +// - error: If computing the bounding box fails, an error is returned. Otherwise, nil. + func storeGTFSBundle(staticBundle *remoteGtfs.Static, serverID int, staticStore *StaticStore, boundingBoxStore *geo.BoundingBoxStore) error { // StaticData is a wrapper around the GTFS static bundle // that includes only the parts we use in the application. From 0f0c33fdd215acd00d7d58dc66f9e47eb185d4d9 Mon Sep 17 00:00:00 2001 From: Abo-Omar-74 Date: Sat, 30 Aug 2025 21:54:45 +0300 Subject: [PATCH 11/11] docs(readme): clarify localhost vs production endpoints with server IP/domain --- README.md | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c389b19..8842555 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Coverage Status](https://coveralls.io/repos/github/OneBusAway/watchdog/badge.svg?branch=main)](https://coveralls.io/github/OneBusAway/watchdog?branch=main) -**Watchdog** is a Go-based service that monitors [OneBusAway (OBA)](https://onebusaway.org/) REST API servers. +**Watchdog** is a Go-based service that monitors [OneBusAway (OBA)](https://onebusaway.org/) REST API servers. It exposes a comprehensive set of **Prometheus metrics** for monitoring: - GTFS Static and GTFS-RT data integrity @@ -99,6 +99,8 @@ export CONFIG_AUTH_PASS="password" ## Running +It may take a few minutes for Watchdog to start exposing data to Prometheus, since initial setup includes tasks such as downloading the GTFS bundle. + ### 1. Docker Compose (recommended) Run Watchdog with **Prometheus + Grafana**: @@ -127,6 +129,8 @@ docker compose restart Grafana auto-loads a Go runtime dashboard. Prometheus is pre-configured to scrape Watchdog. +See [Endpoints](#endpoints) to access metrics, health checks, Grafana, and Prometheus. + ### 2. Watchdog Only #### Local Config @@ -142,6 +146,8 @@ go run ./cmd/watchdog/ \ --config-url http://example.com/config.json ``` +See [Endpoints](#endpoints) to access metrics and health checks. + ### 3. Docker (single container) #### Build image @@ -173,6 +179,26 @@ docker run -d \ --config-url http://example.com/config.json ``` +See [Endpoints](#endpoints) to access metrics and health checks. + +## Endpoints + +During **development** (using `localhost`): + +- Watchdog Metrics: [http://localhost:4000/metrics](http://localhost:4000/metrics) +- Watchdog Health Check: [http://localhost:4000/v1/healthcheck](http://localhost:4000/v1/healthcheck) +- Grafana: [http://localhost:3000/login](http://localhost:3000/login) → default user/pass: `admin` / `admin` +- Prometheus Targets: [http://localhost:9090/targets](http://localhost:9090/targets) +- Prometheus Query: [http://localhost:9090/query](http://localhost:9090/query) + +During **production** (replace ``): + +- Watchdog Metrics: `http://:4000/metrics` +- Watchdog Health Check: `http://:4000/v1/healthcheck` +- Grafana: `http://:3000/login` +- Prometheus Targets: `http://:9090/targets` +- Prometheus Query: `http://:9090/query` + ## Testing ### Unit Tests