Skip to content

Uptime scoring for nodes #1381

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions grid-proxy/internal/explorer/converters.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func nodeFromDBNode(info db.Node) types.Node {
NumGPU: info.NumGPU,
ExtraFee: info.ExtraFee,
Healthy: info.Healthy,
UptimeScore: info.UptimeScore,
Dmi: types.Dmi{
Processor: info.Processor,
Memory: info.Memory,
Expand Down Expand Up @@ -169,6 +170,7 @@ func nodeWithNestedCapacityFromDBNode(info db.Node) types.NodeWithNestedCapacity
NumGPU: info.NumGPU,
ExtraFee: info.ExtraFee,
Healthy: info.Healthy,
UptimeScore: info.UptimeScore,
Dmi: types.Dmi{
Processor: info.Processor,
Memory: info.Memory,
Expand Down
57 changes: 56 additions & 1 deletion grid-proxy/internal/explorer/db/indexer_calls.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,15 @@ func (p *PostgresDatabase) UpsertNodesGPU(ctx context.Context, gpus []types.Node
}

// UpsertNodeHealth stores the given health reports in the health_report table.
//
// Before writing, it fills in UptimeScore on every element of healthReports
// (the caller's slice is modified in place) by calling calculateUptimeScore,
// which queries the database once or twice per report. The rows are then
// inserted in a single statement; when a row with the same node_twin_id
// already exists, the listed columns are overwritten instead.
//
// It returns the error reported by the insert, if any.
func (p *PostgresDatabase) UpsertNodeHealth(ctx context.Context, healthReports []types.HealthReport) error {
// Compute each report's uptime score up front so it is persisted together
// with the healthy flag in the upsert below.
for i := range healthReports {
uptimeScore := p.calculateUptimeScore(ctx, healthReports[i])
healthReports[i].UptimeScore = uptimeScore
}

// On a node_twin_id conflict, update the existing row rather than failing.
conflictClause := clause.OnConflict{
Columns: []clause.Column{{Name: "node_twin_id"}},
// NOTE(review): the next two lines look like the removed and the added side
// of the same diff line; only the second one (which also updates
// uptime_score) should remain in the final code.
DoUpdates: clause.AssignmentColumns([]string{"healthy", "updated_at"}),
DoUpdates: clause.AssignmentColumns([]string{"healthy", "uptime_score", "updated_at"}),
}
return p.gormDB.WithContext(ctx).Table("health_report").Clauses(conflictClause).Create(&healthReports).Error
}
Expand Down Expand Up @@ -111,3 +117,52 @@ func (p *PostgresDatabase) UpsertNodeLocation(ctx context.Context, locations []t
}
return p.gormDB.WithContext(ctx).Table("node_location").Clauses(conflictClause).Create(&locations).Error
}

// calculateUptimeScore returns the uptime score to store with healthReport:
// an estimate of the fraction of 5-minute intervals in the trailing 30 days
// during which the node reported healthy, always within [0, 1].
//
// The score is maintained incrementally instead of being recomputed from
// scratch: the previously stored score is scaled back to a count of healthy
// intervals, the new report is added, and the report that has just left the
// window (recorded 30 days before healthReport.UpdatedAt, give or take one
// minute) is subtracted.
//
// Lookup failures are not returned to the caller. If the previous report
// cannot be read the node is treated as having no history; if the 30-day-old
// report cannot be read nothing is subtracted.
func (p *PostgresDatabase) calculateUptimeScore(ctx context.Context, healthReport types.HealthReport) float64 {
	const (
		// UpdatedAt holds Unix seconds, so the window length must be expressed
		// in seconds as well (30 days * 24 h * 60 min * 60 s).
		thirtyDaysInSeconds = 30 * 24 * 60 * 60
		// Number of 5-minute reporting intervals in 30 days.
		intervalsInThirtyDays = 30 * 24 * 60 / 5
		// Tolerance, in seconds, when looking for the report that is leaving
		// the window.
		lookupToleranceSeconds = 60
	)

	windowStart := healthReport.UpdatedAt - thirtyDaysInSeconds

	// Contribution of the incoming report: one healthy interval or none.
	newValue := 0.0
	if healthReport.Healthy {
		newValue = 1.0
	}

	var previousReport types.HealthReport
	err := p.gormDB.WithContext(ctx).Table("health_report").
		Where("node_twin_id = ?", healthReport.NodeTwinId).
		Last(&previousReport).Error
	if err != nil {
		// No usable previous report: start the running total from zero.
		// NOTE(review): this also swallows genuine database errors, not only
		// "record not found"; consider propagating those once the signature
		// can return an error.
		return newValue / intervalsInThirtyDays
	}
	previousTotal := previousReport.UptimeScore * intervalsInThirtyDays

	// Value of the report that is sliding out of the 30-day window.
	// NOTE(review): UpsertNodeHealth upserts on node_twin_id, so health_report
	// appears to keep a single row per node. If that is the case this lookup
	// can only match when the stored row is itself about 30 days old, and
	// expired intervals are never subtracted. Confirm whether a history table
	// was intended here.
	oldValue := 0.0
	var oldReport types.HealthReport
	err = p.gormDB.WithContext(ctx).Table("health_report").
		Where("node_twin_id = ? AND updated_at BETWEEN ? AND ?",
			healthReport.NodeTwinId,
			windowStart-lookupToleranceSeconds,
			windowStart+lookupToleranceSeconds).
		Order("updated_at DESC").
		First(&oldReport).Error
	if err == nil && oldReport.Healthy {
		oldValue = 1.0
	}

	totalHealthyIntervals := previousTotal + newValue - oldValue

	// Keep the total inside the window so the score stays within [0, 1] even
	// when an expired report was missed or reports arrive more often than every
	// five minutes.
	if totalHealthyIntervals < 0 {
		totalHealthyIntervals = 0
	}
	if totalHealthyIntervals > intervalsInThirtyDays {
		totalHealthyIntervals = intervalsInThirtyDays
	}

	return totalHealthyIntervals / intervalsInThirtyDays
}
1 change: 1 addition & 0 deletions grid-proxy/internal/explorer/db/postgres.go
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ func (d *PostgresDatabase) nodeTableQuery(ctx context.Context, filter types.Node
"resources_cache.node_gpu_count AS num_gpu",
"resources_cache.gpus",
"health_report.healthy",
"health_report.uptime_score",
"node_ipv6.has_ipv6",
"node_features.features as features",
"resources_cache.bios",
Expand Down
1 change: 1 addition & 0 deletions grid-proxy/internal/explorer/db/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ type Node struct {
ExtraFee uint64
NodeContractsCount uint64 `gorm:"node_contracts_count"`
Healthy bool
UptimeScore float64 `gorm:"uptime_score"`
Bios types.BIOS `gorm:"type:jsonb;serializer:json"`
Baseboard types.Baseboard `gorm:"type:jsonb;serializer:json"`
Memory []types.Memory `gorm:"type:jsonb;serializer:json"`
Expand Down
7 changes: 4 additions & 3 deletions grid-proxy/pkg/types/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ func (NodeGPU) TableName() string {
// HealthReport holds the state of node healthiness.
// It is used as the gorm model for the health_report table, which is keyed by
// the node's twin ID.
//
// NOTE(review): the field list below shows both sides of a diff (the fields
// before the change followed by the fields after it); only the second group,
// which includes UptimeScore, should remain in the final struct.
type HealthReport struct {
NodeTwinId uint32 `gorm:"unique;not null"`
Healthy bool
UpdatedAt int64
NodeTwinId uint32 `gorm:"unique;not null"`
Healthy bool
UptimeScore float64 `gorm:"default:0"` // Uptime score calculated over a 30-day sliding window; defaults to 0
UpdatedAt int64
}

func (HealthReport) TableName() string {
Expand Down
2 changes: 2 additions & 0 deletions grid-proxy/pkg/types/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ type Node struct {
NumGPU int `json:"num_gpu" sort:"num_gpu"`
ExtraFee uint64 `json:"extraFee" sort:"extra_fee"`
Healthy bool `json:"healthy"`
UptimeScore float64 `json:"uptime_score"`
Dmi Dmi `json:"dmi"`
Speed Speed `json:"speed"`
CpuBenchmark CpuBenchmark `json:"cpu_benchmark"`
Expand Down Expand Up @@ -94,6 +95,7 @@ type NodeWithNestedCapacity struct {
NumGPU int `json:"num_gpu"`
ExtraFee uint64 `json:"extraFee"`
Healthy bool `json:"healthy"`
UptimeScore float64 `json:"uptime_score"`
Dmi Dmi `json:"dmi"`
Speed Speed `json:"speed"`
CpuBenchmark CpuBenchmark `json:"cpu_benchmark"`
Expand Down
8 changes: 5 additions & 3 deletions grid-proxy/tests/queries/mock_client/loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ type DBData struct {
GPUs map[uint32][]types.NodeGPU
Regions map[string]string
Locations map[string]Location
HealthReports map[uint32]bool
HealthReports map[uint32]types.HealthReport
NodeIpv6 map[uint32]bool
NodeFeatures map[uint32][]string
DMIs map[uint32]types.Dmi
Expand Down Expand Up @@ -514,6 +514,7 @@ func loadHealthReports(db *sql.DB, data *DBData) error {
SELECT
COALESCE(node_twin_id, 0),
COALESCE(healthy, false),
COALESCE(uptime_score, 0.0),
COALESCE(updated_at, 0)
FROM
health_report;`)
Expand All @@ -525,11 +526,12 @@ func loadHealthReports(db *sql.DB, data *DBData) error {
if err := rows.Scan(
&health.NodeTwinId,
&health.Healthy,
&health.UptimeScore,
&health.UpdatedAt,
); err != nil {
return err
}
data.HealthReports[health.NodeTwinId] = health.Healthy
data.HealthReports[health.NodeTwinId] = health
}

return nil
Expand Down Expand Up @@ -796,7 +798,7 @@ func Load(db *sql.DB, gormDB *gorm.DB) (DBData, error) {
FarmHasRentedNode: make(map[uint64]map[uint64]bool),
Regions: make(map[string]string),
Locations: make(map[string]Location),
HealthReports: make(map[uint32]bool),
HealthReports: make(map[uint32]types.HealthReport),
DMIs: make(map[uint32]types.Dmi),
Speeds: make(map[uint32]types.Speed),
CpuBenchmarks: make(map[uint32]types.CpuBenchmark),
Expand Down
24 changes: 13 additions & 11 deletions grid-proxy/tests/queries/mock_client/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,12 @@ func (g *GridProxyMockClient) Nodes(ctx context.Context, filter types.NodeFilter
State: node.Power.State,
Target: node.Power.Target,
},
NumGPU: numGPU,
GPUs: getGpus(g.data, uint32(node.TwinID)),
ExtraFee: node.ExtraFee,
Healthy: g.data.HealthReports[uint32(node.TwinID)],
Dmi: g.data.DMIs[uint32(node.TwinID)],
NumGPU: numGPU,
GPUs: getGpus(g.data, uint32(node.TwinID)),
ExtraFee: node.ExtraFee,
Healthy: g.data.HealthReports[uint32(node.TwinID)].Healthy,
UptimeScore: g.data.HealthReports[uint32(node.TwinID)].UptimeScore,
Dmi: g.data.DMIs[uint32(node.TwinID)],
Speed: types.Speed{
Upload: g.data.Speeds[uint32(node.TwinID)].Upload,
Download: g.data.Speeds[uint32(node.TwinID)].Download,
Expand Down Expand Up @@ -289,11 +290,12 @@ func (g *GridProxyMockClient) Node(ctx context.Context, nodeID uint32) (res type
State: node.Power.State,
Target: node.Power.Target,
},
NumGPU: numGPU,
GPUs: getGpus(g.data, uint32(node.TwinID)),
ExtraFee: node.ExtraFee,
Healthy: g.data.HealthReports[uint32(node.TwinID)],
Dmi: g.data.DMIs[uint32(node.TwinID)],
NumGPU: numGPU,
GPUs: getGpus(g.data, uint32(node.TwinID)),
ExtraFee: node.ExtraFee,
Healthy: g.data.HealthReports[uint32(node.TwinID)].Healthy,
UptimeScore: g.data.HealthReports[uint32(node.TwinID)].UptimeScore,
Dmi: g.data.DMIs[uint32(node.TwinID)],
Speed: types.Speed{
Upload: g.data.Speeds[uint32(node.TwinID)].Upload,
Download: g.data.Speeds[uint32(node.TwinID)].Download,
Expand Down Expand Up @@ -354,7 +356,7 @@ func (n *Node) satisfies(f types.NodeFilter, data *DBData) bool {
return false
}

if f.Healthy != nil && *f.Healthy != data.HealthReports[uint32(n.TwinID)] {
if f.Healthy != nil && *f.Healthy != data.HealthReports[uint32(n.TwinID)].Healthy {
return false
}

Expand Down
7 changes: 4 additions & 3 deletions grid-proxy/tools/db/crafter/generator.go
Original file line number Diff line number Diff line change
Expand Up @@ -901,9 +901,10 @@ func (c *Crafter) GenerateHealthReports() error {
}

healthReport := types.HealthReport{
NodeTwinId: uint32(nodeTwinsStart + i),
Healthy: health,
UpdatedAt: time.Now().Unix(),
NodeTwinId: uint32(nodeTwinsStart + i),
Healthy: health,
UptimeScore: rand.Float64(), // Random uptime score between 0.0 and 1.0
UpdatedAt: time.Now().Unix(),
}
healthReports = append(healthReports, healthReport)
}
Expand Down
3 changes: 3 additions & 0 deletions grid-proxy/tools/db/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -1024,6 +1024,9 @@ CREATE TABLE IF NOT EXISTS public.health_report (
updated_at bigint
);

-- Add the uptime_score column to health_report. IF NOT EXISTS makes the
-- statement a no-op when the column is already present, so the script can be
-- re-run; rows that predate the column get the default of 0.
ALTER TABLE public.health_report
ADD COLUMN IF NOT EXISTS uptime_score double precision DEFAULT 0;

ALTER TABLE public.health_report
OWNER TO postgres;

Expand Down
Loading