Skip to content

Commit 1d8f2ec

Browse files
extend env relays with zos-config (#21)
* refactor network health check - require at least one instance of each service to be alive * adds set relays urls from zos config Signed-off-by: Ashraf Fouda <ashraf.m.fouda@gmail.com> * modify GetRelaysUrls to not propagate error --------- Signed-off-by: Ashraf Fouda <ashraf.m.fouda@gmail.com> Co-authored-by: Ashraf Fouda <ashraf.m.fouda@gmail.com>
1 parent 2688840 commit 1d8f2ec

File tree

4 files changed

+92
-39
lines changed

4 files changed

+92
-39
lines changed

pkg/environment/config.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ type Config struct {
3131
RolloutUpgrade struct {
3232
TestFarms []uint32 `json:"test_farms"`
3333
} `json:"rollout_upgrade"`
34+
RelaysURLs []string `json:"relays_urls"`
3435
}
3536

3637
// Merge, updates current config with cfg merging and override config

pkg/environment/environment.go

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"sync"
88

99
"github.com/pkg/errors"
10+
"github.com/rs/zerolog/log"
1011
substrate "github.com/threefoldtech/tfchain/clients/tfchain-client-go"
1112
"github.com/threefoldtech/zosbase/pkg"
1213

@@ -47,7 +48,7 @@ type Environment struct {
4748
// IMPORTANT NOTICE:
4849
// SINCE RELAYS FOR A NODE ARE STORED ON THE CHAIN IN A LIMITED SPACE
4950
// PLEASE MAKE SURE THAT ANY ENV HAS NO MORE THAN FOUR RELAYS CONFIGURED
50-
RelayURL []string
51+
relaysURLs []string
5152
ActivationURL []string
5253
GraphQL []string
5354
KycURL string
@@ -112,7 +113,7 @@ var (
112113
"wss://tfchain.dev.grid.tf/",
113114
"wss://tfchain.02.dev.grid.tf",
114115
},
115-
RelayURL: []string{
116+
relaysURLs: []string{
116117
"wss://relay.dev.grid.tf",
117118
"wss://relay.02.dev.grid.tf",
118119
},
@@ -136,7 +137,7 @@ var (
136137
"wss://tfchain.test.grid.tf/",
137138
"wss://tfchain.02.test.grid.tf",
138139
},
139-
RelayURL: []string{
140+
relaysURLs: []string{
140141
"wss://relay.test.grid.tf",
141142
"wss://relay.02.test.grid.tf",
142143
},
@@ -160,7 +161,7 @@ var (
160161
"wss://tfchain.qa.grid.tf/",
161162
"wss://tfchain.02.qa.grid.tf/",
162163
},
163-
RelayURL: []string{
164+
relaysURLs: []string{
164165
"wss://relay.qa.grid.tf",
165166
"wss://relay.02.qa.grid.tf",
166167
},
@@ -187,9 +188,9 @@ var (
187188
"wss://03.tfchain.grid.tf/",
188189
"wss://04.tfchain.grid.tf/",
189190
},
190-
RelayURL: []string{
191+
relaysURLs: []string{
191192
"wss://relay.grid.tf",
192-
"wss://relay.02.grid.tf",
193+
// "wss://relay.02.grid.tf",
193194
},
194195
ActivationURL: []string{
195196
"https://activation.grid.tf/activation/activate",
@@ -224,13 +225,22 @@ func Get() (Environment, error) {
224225
if err != nil {
225226
return Environment{}, err
226227
}
227-
if params.IsV4() {
228-
env.FlistURL = "redis://v4.hub.grid.tf:9940"
229-
}
230228

231229
return env, nil
232230
}
233231

232+
func GetRelaysURLs() []string {
233+
config, err := GetConfig()
234+
if err == nil && len(config.RelaysURLs) > 0 {
235+
log.Debug().Msg("using relays urls from zos-config")
236+
return config.RelaysURLs
237+
}
238+
239+
log.Debug().Msg("using relays urls from environment")
240+
env := MustGet()
241+
return env.relaysURLs
242+
}
243+
234244
// GetSubstrate gets a client to the substrate blockchain
235245
func GetSubstrate() (substrate.Manager, error) {
236246
env, err := Get()
@@ -281,7 +291,7 @@ func getEnvironmentFromParams(params kernel.Params) (Environment, error) {
281291

282292
if relay, ok := params.Get("relay"); ok {
283293
if len(relay) > 0 {
284-
env.RelayURL = relay
294+
env.relaysURLs = relay
285295
}
286296
}
287297

@@ -368,5 +378,10 @@ func getEnvironmentFromParams(params kernel.Params) (Environment, error) {
368378
env.BinRepo = e
369379
}
370380

381+
// if the node is running v4, change the flist URL to use v4.hub.grid.tf
382+
if params.IsV4() {
383+
env.FlistURL = "redis://v4.hub.grid.tf:9940"
384+
}
385+
371386
return env, nil
372387
}

pkg/perf/healthcheck/healthcheck.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ func (h *healthcheckTask) Run(ctx context.Context) (interface{}, error) {
8686
}
8787

8888
if len(errors) != 0 {
89-
return fmt.Errorf("failed health check")
89+
return fmt.Errorf("failed health check %s", errorsToStrings(errors))
9090
}
9191

9292
return nil

pkg/perf/healthcheck/network.go

Lines changed: 65 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"net"
77
"net/url"
8+
"strings"
89
"sync"
910
"time"
1011

@@ -13,64 +14,100 @@ import (
1314
"github.com/threefoldtech/zosbase/pkg/environment"
1415
)
1516

16-
const defaultRequestTimeout = 5 * time.Second
17+
const defaultRequestTimeout = 10 * time.Second
1718

19+
// networkCheck verifies that at least one instance of each service is reachable
20+
// returns errors as a report for perf healthcheck
21+
// a side effect: set/delete the not-reachable flag
1822
func networkCheck(ctx context.Context) []error {
19-
env := environment.MustGet()
20-
servicesUrl := []string{env.FlistURL}
21-
22-
servicesUrl = append(append(servicesUrl, env.SubstrateURL...), env.RelayURL...)
23-
servicesUrl = append(append(servicesUrl, env.ActivationURL...), env.GraphQL...)
23+
var (
24+
wg sync.WaitGroup
25+
errMu sync.Mutex
26+
errors []error
27+
)
2428

25-
var errors []error
29+
env := environment.MustGet()
30+
services := map[string][]string{
31+
"substrate": env.SubstrateURL,
32+
"activation": env.ActivationURL,
33+
"relay": environment.GetRelaysURLs(),
34+
"graphql": env.GraphQL,
35+
"hub": {env.FlistURL},
36+
"kyc": {env.KycURL},
37+
}
2638

27-
var wg sync.WaitGroup
28-
var mut sync.Mutex
29-
for _, serviceUrl := range servicesUrl {
39+
for service, instances := range services {
3040
wg.Add(1)
31-
go func(serviceUrl string) {
41+
go func(service string, instances []string) {
3242
defer wg.Done()
3343

34-
err := checkService(ctx, serviceUrl)
35-
if err != nil {
36-
mut.Lock()
37-
defer mut.Unlock()
38-
44+
if err := verifyAtLeastOneIsReachable(ctx, service, instances); err != nil {
45+
errMu.Lock()
3946
errors = append(errors, err)
47+
errMu.Unlock()
4048
}
41-
}(serviceUrl)
49+
50+
}(service, instances)
4251
}
52+
4353
wg.Wait()
4454

4555
if len(errors) == 0 {
56+
log.Debug().Msg("all network checks passed")
4657
if err := app.DeleteFlag(app.NotReachable); err != nil {
47-
log.Error().Err(err).Msg("failed to delete readonly flag")
58+
log.Error().Err(err).Msg("failed to delete not-reachable flag")
59+
}
60+
} else {
61+
log.Warn().Int("failed_checks", len(errors)).Msg("some network checks failed")
62+
if err := app.SetFlag(app.NotReachable); err != nil {
63+
log.Error().Err(err).Msg("failed to set not-reachable flag")
4864
}
4965
}
5066

5167
return errors
5268
}
5369

70+
func verifyAtLeastOneIsReachable(ctx context.Context, service string, instances []string) error {
71+
if len(instances) == 0 {
72+
return fmt.Errorf("no instances provided for service %s", service)
73+
}
74+
75+
var unreachableErrors []string
76+
for _, instance := range instances {
77+
if err := checkService(ctx, instance); err == nil {
78+
return nil
79+
} else {
80+
unreachableErrors = append(unreachableErrors, err.Error())
81+
}
82+
}
83+
84+
return fmt.Errorf("all %s instances are unreachable: %s", service, strings.Join(unreachableErrors, "; "))
85+
}
86+
5487
func checkService(ctx context.Context, serviceUrl string) error {
55-
ctx, cancel := context.WithTimeout(ctx, defaultRequestTimeout)
88+
timeoutCtx, cancel := context.WithTimeout(ctx, defaultRequestTimeout)
5689
defer cancel()
5790

58-
address := parseUrl(serviceUrl)
59-
err := isReachable(ctx, address)
91+
address, err := parseUrl(serviceUrl)
6092
if err != nil {
61-
if err := app.SetFlag(app.NotReachable); err != nil {
62-
log.Error().Err(err).Msg("failed to set not reachable flag")
63-
}
93+
return fmt.Errorf("invalid URL %s: %w", serviceUrl, err)
94+
}
95+
96+
if err := isReachable(timeoutCtx, address); err != nil {
6497
return fmt.Errorf("%s is not reachable: %w", serviceUrl, err)
6598
}
6699

67100
return nil
68101
}
69102

70-
func parseUrl(serviceUrl string) string {
103+
func parseUrl(serviceUrl string) (string, error) {
71104
u, err := url.Parse(serviceUrl)
72105
if err != nil {
73-
return ""
106+
return "", fmt.Errorf("failed to parse URL: %w", err)
107+
}
108+
109+
if u.Host == "" {
110+
return "", fmt.Errorf("missing hostname in URL")
74111
}
75112

76113
port := ":80"
@@ -82,11 +119,11 @@ func parseUrl(serviceUrl string) string {
82119
u.Host += port
83120
}
84121

85-
return u.Host
122+
return u.Host, nil
86123
}
87124

88125
func isReachable(ctx context.Context, address string) error {
89-
d := net.Dialer{Timeout: defaultRequestTimeout}
126+
var d net.Dialer
90127
conn, err := d.DialContext(ctx, "tcp", address)
91128
if err != nil {
92129
return fmt.Errorf("failed to connect: %w", err)

0 commit comments

Comments
 (0)