Skip to content

Commit

Permalink
services/horizon: Add horizon health check endpoint (#3435)
Browse files Browse the repository at this point in the history
This commit adds a health check endpoint which can be used to check if horizon is operational. Fully operational is defined as being able to submit transactions to stellar core and being able to access the Horizon DB. On success the health check responds with a 200 http status code. On failure the health check responds with a 503.
  • Loading branch information
tamirms authored Mar 4, 2021
1 parent 7731f31 commit df2d6e4
Show file tree
Hide file tree
Showing 8 changed files with 336 additions and 1 deletion.
2 changes: 2 additions & 0 deletions services/horizon/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ file. This project adheres to [Semantic Versioning](http://semver.org/).

## Unreleased

* Add an endpoint which determines if Horizon is healthy enough to receive traffic ([3435](https://github.com/stellar/go/pull/3435)).

## v2.0.0

### Before you upgrade
Expand Down
9 changes: 9 additions & 0 deletions services/horizon/internal/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,15 @@ func (a *App) init() error {
CoreGetter: a,
HorizonVersion: a.horizonVersion,
FriendbotURL: a.config.FriendbotURL,
HealthCheck: healthCheck{
session: a.historyQ.Session,
ctx: a.ctx,
core: &stellarcore.Client{
HTTP: &http.Client{Timeout: infoRequestTimeout},
URL: a.config.StellarCoreURL,
},
cache: newHealthCache(healthCacheTTL),
},
}

var err error
Expand Down
96 changes: 96 additions & 0 deletions services/horizon/internal/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package horizon

import (
"context"
"encoding/json"
"net/http"
"sync"
"time"

"github.com/stellar/go/protocols/stellarcore"
"github.com/stellar/go/support/clock"
"github.com/stellar/go/support/db"
"github.com/stellar/go/support/log"
)

// Timing parameters used by the health check.
const (
// dbPingTimeout is the timeout passed to the database session's Ping call.
dbPingTimeout = 5 * time.Second
// infoRequestTimeout is the HTTP client timeout the app configures on the
// stellar core client handed to the health check.
infoRequestTimeout = 5 * time.Second
// healthCacheTTL is the TTL given to the health cache: a cached response
// older than this is recomputed on the next request.
healthCacheTTL = 500 * time.Millisecond
)

// healthLogger is the logger used by the health check; it is created with
// the field service=healthCheck.
var healthLogger = log.WithField("service", "healthCheck")

// stellarCoreClient is the single stellar core operation the health check
// depends on. The app supplies a *stellarcore.Client and the tests supply a
// mock.
type stellarCoreClient interface {
// Info returns stellar core's info response, or an error.
Info(ctx context.Context) (*stellarcore.InfoResponse, error)
}

// healthCache holds the most recent health check result and reuses it until
// it is older than ttl.
type healthCache struct {
// response is the most recently computed health check result.
response healthResponse
// lastUpdate is the clock reading taken right after response was computed.
lastUpdate time.Time
// ttl is the maximum age of response before get recomputes it.
ttl time.Duration
// clock supplies the current time; the tests swap in a fixed time source.
clock clock.Clock
// lock is held for the whole of get, including while the check is running.
lock sync.Mutex
}

// get returns the cached health response, refreshing it first when it is
// older than the cache's ttl.
//
// runCheck is invoked to compute a fresh response. The cache lock is held
// for the entire call, so concurrent callers wait for an in-flight check
// rather than starting their own.
func (h *healthCache) get(runCheck func() healthResponse) healthResponse {
	h.lock.Lock()
	defer h.lock.Unlock()

	age := h.clock.Now().Sub(h.lastUpdate)
	if age <= h.ttl {
		// Still fresh: serve the stored result untouched.
		return h.response
	}

	h.response = runCheck()
	// Stamp the refresh after the check has finished so the ttl is measured
	// from the moment the result became available.
	h.lastUpdate = h.clock.Now()
	return h.response
}

// newHealthCache builds an empty healthCache whose entries expire after ttl.
// All other fields keep their zero values.
// NOTE(review): this relies on the zero-value clock.Clock reading the real
// system time — confirm in support/clock.
func newHealthCache(ttl time.Duration) *healthCache {
	cache := new(healthCache)
	cache.ttl = ttl
	return cache
}

// healthCheck is an http.Handler which reports whether the database can be
// pinged, whether stellar core answers info requests, and whether stellar
// core is synced.
type healthCheck struct {
// session is the database session that gets pinged.
session db.SessionInterface
// ctx is the context passed to every stellar core info request.
ctx context.Context
// core is the stellar core client queried for its info response.
core stellarCoreClient
// cache limits how often the underlying checks are actually executed.
cache *healthCache
}

// healthResponse is the JSON body written by the health check handler.
type healthResponse struct {
// DatabaseConnected is false when pinging the database returned an error.
DatabaseConnected bool `json:"database_connected"`
// CoreUp is false when the info request to stellar core returned an error.
CoreUp bool `json:"core_up"`
// CoreSynced is false when the info request failed or when the info
// response reports that stellar core is not synced.
CoreSynced bool `json:"core_synced"`
}

// runCheck probes the database and stellar core and reports the outcome.
//
// The database is pinged first, then stellar core is asked for its info
// response. Failures are logged as warnings and reflected in the returned
// healthResponse; runCheck itself never fails.
func (h healthCheck) runCheck() healthResponse {
	pingErr := h.session.Ping(dbPingTimeout)
	if pingErr != nil {
		healthLogger.Warnf("could not ping db: %s", pingErr)
	}
	dbOK := pingErr == nil

	info, infoErr := h.core.Info(h.ctx)
	if infoErr != nil {
		healthLogger.Warnf("request to stellar core failed: %s", infoErr)
		// An unreachable core is reported as neither up nor synced.
		return healthResponse{
			DatabaseConnected: dbOK,
			CoreUp:            false,
			CoreSynced:        false,
		}
	}

	return healthResponse{
		DatabaseConnected: dbOK,
		CoreUp:            true,
		CoreSynced:        info.IsSynced(),
	}
}

// ServeHTTP implements http.Handler for the health check endpoint.
//
// It writes the (possibly cached) health response as JSON. The status code
// is 200 when the database is connected and stellar core is both up and
// synced, and 503 otherwise. The request r is not inspected.
func (h healthCheck) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	response := h.cache.get(h.runCheck)

	// Declare the body as JSON explicitly; without this net/http sniffs the
	// payload and labels it text/plain. Headers must be set before the first
	// call to WriteHeader or Write.
	w.Header().Set("Content-Type", "application/json")

	if !response.DatabaseConnected || !response.CoreSynced || !response.CoreUp {
		w.WriteHeader(http.StatusServiceUnavailable)
	}

	if err := json.NewEncoder(w).Encode(response); err != nil {
		healthLogger.Warnf("could not write response: %s", err)
	}
}
212 changes: 212 additions & 0 deletions services/horizon/internal/health_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
package horizon

import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"sync"
"testing"
"time"

"github.com/stellar/go/protocols/stellarcore"
"github.com/stellar/go/support/clock"
"github.com/stellar/go/support/clock/clocktest"
"github.com/stellar/go/support/db"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
)

// Compile-time assertion that *mockStellarCore satisfies stellarCoreClient.
var _ stellarCoreClient = (*mockStellarCore)(nil)

// mockStellarCore is a testify mock standing in for the stellar core client.
type mockStellarCore struct {
mock.Mock
}

// Info records the call on the mock and returns whatever response and error
// were configured for it. The first configured return value must be a
// *stellarcore.InfoResponse (a typed nil is accepted).
func (m *mockStellarCore) Info(ctx context.Context) (*stellarcore.InfoResponse, error) {
	results := m.Called(ctx)
	info := results.Get(0).(*stellarcore.InfoResponse)
	return info, results.Error(1)
}

// TestHealthCheck runs the handler against every combination of database and
// stellar core outcomes and verifies both the HTTP status code and the JSON
// body. Each case uses a fresh cache so the checks are always executed.
func TestHealthCheck(t *testing.T) {
	var synced, notSynced stellarcore.InfoResponse
	synced.Info.State = "Synced!"
	notSynced.Info.State = "Catching up"

	errDBDown := fmt.Errorf("database is down")
	errCoreDown := fmt.Errorf("stellar core is down")

	type testCase struct {
		name             string
		pingErr          error
		coreErr          error
		coreResponse     *stellarcore.InfoResponse
		expectedStatus   int
		expectedResponse healthResponse
	}

	// Fields left out of a case keep their zero value: no error, or a nil
	// (but still typed) core response.
	cases := []testCase{
		{
			name:           "healthy",
			coreResponse:   &synced,
			expectedStatus: http.StatusOK,
			expectedResponse: healthResponse{
				DatabaseConnected: true,
				CoreUp:            true,
				CoreSynced:        true,
			},
		},
		{
			name:           "db down",
			pingErr:        errDBDown,
			coreResponse:   &synced,
			expectedStatus: http.StatusServiceUnavailable,
			expectedResponse: healthResponse{
				DatabaseConnected: false,
				CoreUp:            true,
				CoreSynced:        true,
			},
		},
		{
			name:           "stellar core not synced",
			coreResponse:   &notSynced,
			expectedStatus: http.StatusServiceUnavailable,
			expectedResponse: healthResponse{
				DatabaseConnected: true,
				CoreUp:            true,
				CoreSynced:        false,
			},
		},
		{
			name:           "stellar core down",
			coreErr:        errCoreDown,
			expectedStatus: http.StatusServiceUnavailable,
			expectedResponse: healthResponse{
				DatabaseConnected: true,
				CoreUp:            false,
				CoreSynced:        false,
			},
		},
		{
			name:           "stellar core and db down",
			pingErr:        errDBDown,
			coreErr:        errCoreDown,
			expectedStatus: http.StatusServiceUnavailable,
			expectedResponse: healthResponse{
				DatabaseConnected: false,
				CoreUp:            false,
				CoreSynced:        false,
			},
		},
		{
			name:           "stellar core not synced and db down",
			pingErr:        errDBDown,
			coreResponse:   &notSynced,
			expectedStatus: http.StatusServiceUnavailable,
			expectedResponse: healthResponse{
				DatabaseConnected: false,
				CoreUp:            true,
				CoreSynced:        false,
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			ctx := context.Background()

			session := &db.MockSession{}
			session.On("Ping", dbPingTimeout).Return(tc.pingErr).Once()

			core := &mockStellarCore{}
			core.On("Info", ctx).Return(tc.coreResponse, tc.coreErr).Once()

			handler := healthCheck{
				session: session,
				ctx:     ctx,
				core:    core,
				cache:   newHealthCache(healthCacheTTL),
			}

			recorder := httptest.NewRecorder()
			handler.ServeHTTP(recorder, nil)
			assert.Equal(t, tc.expectedStatus, recorder.Code)

			var got healthResponse
			err := json.Unmarshal(recorder.Body.Bytes(), &got)
			assert.NoError(t, err)
			assert.Equal(t, tc.expectedResponse, got)

			session.AssertExpectations(t)
			core.AssertExpectations(t)
		})
	}
}

// TestHealthCheckCache verifies that the handler serves the cached response
// while it is younger than the ttl and re-runs the checks exactly once after
// it has expired.
func TestHealthCheckCache(t *testing.T) {
	epoch := time.Unix(0, 0)
	cachedResponse := healthResponse{
		DatabaseConnected: false,
		CoreUp:            true,
		CoreSynced:        false,
	}
	h := healthCheck{
		session: nil,
		ctx:     context.Background(),
		core:    nil,
		cache: &healthCache{
			response:   cachedResponse,
			lastUpdate: epoch,
			ttl:        5 * time.Second,
			lock:       sync.Mutex{},
		},
	}

	// assertServedAt issues one request with the cache clock frozen at now,
	// then checks the 503 status, the decoded body, the cached value and the
	// time of the last cache refresh.
	assertServedAt := func(now time.Time, want healthResponse, wantLastUpdate time.Time) {
		h.cache.clock = clock.Clock{
			Source: clocktest.FixedSource(now),
		}

		recorder := httptest.NewRecorder()
		h.ServeHTTP(recorder, nil)
		assert.Equal(t, http.StatusServiceUnavailable, recorder.Code)

		var got healthResponse
		err := json.Unmarshal(recorder.Body.Bytes(), &got)
		assert.NoError(t, err)
		assert.Equal(t, want, got)
		assert.Equal(t, want, h.cache.response)
		assert.True(t, h.cache.lastUpdate.Equal(wantLastUpdate))
	}

	// Within the 5s ttl. session and core are still nil here, so a cache miss
	// would panic when the check tried to use them.
	assertServedAt(time.Unix(1, 0), cachedResponse, epoch)
	assertServedAt(time.Unix(4, 0), cachedResponse, epoch)

	session := &db.MockSession{}
	session.On("Ping", dbPingTimeout).Return(nil).Once()
	core := &mockStellarCore{}
	core.On("Info", h.ctx).Return(&stellarcore.InfoResponse{}, fmt.Errorf("core err")).Once()
	h.session = session
	h.core = core

	updatedResponse := healthResponse{
		DatabaseConnected: true,
		CoreUp:            false,
		CoreSynced:        false,
	}

	// At t=6s the entry has expired and is refreshed; at t=7s the refreshed
	// entry is served again. The Once() expectations fail on a second check.
	refreshedAt := time.Unix(6, 0)
	assertServedAt(time.Unix(6, 0), updatedResponse, refreshedAt)
	assertServedAt(time.Unix(7, 0), updatedResponse, refreshedAt)

	session.AssertExpectations(t)
	core.AssertExpectations(t)
}
4 changes: 3 additions & 1 deletion services/horizon/internal/httpx/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ type RouterConfig struct {
CoreGetter actions.CoreSettingsGetter
HorizonVersion string
FriendbotURL *url.URL
HealthCheck http.Handler
}

type Router struct {
Expand Down Expand Up @@ -104,6 +105,8 @@ func (r *Router) addRoutes(config *RouterConfig, rateLimiter *throttled.HTTPRate
HorizonSession: config.DBSession,
}

r.Method(http.MethodGet, "/health", config.HealthCheck)

r.Method(http.MethodGet, "/", ObjectActionHandler{Action: actions.GetRootHandler{
LedgerState: ledgerState,
CoreSettingsGetter: config.CoreGetter,
Expand All @@ -118,7 +121,6 @@ func (r *Router) addRoutes(config *RouterConfig, rateLimiter *throttled.HTTPRate
}

historyMiddleware := NewHistoryMiddleware(ledgerState, int32(config.StaleThreshold), config.DBSession)

// State endpoints behind stateMiddleware
r.Group(func(r chi.Router) {
r.Use(stateMiddleware.Wrap)
Expand Down
1 change: 1 addition & 0 deletions support/db/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ type SessionInterface interface {
Exec(query squirrel.Sqlizer) (sql.Result, error)
ExecRaw(query string, args ...interface{}) (sql.Result, error)
NoRows(err error) bool
Ping(timeout time.Duration) error
}

// Table helps to build sql queries against a given table. It logically
Expand Down
5 changes: 5 additions & 0 deletions support/db/mock_session.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package db

import (
"database/sql"
"time"

"github.com/Masterminds/squirrel"
sq "github.com/Masterminds/squirrel"
Expand Down Expand Up @@ -87,3 +88,7 @@ func (m *MockSession) NoRows(err error) bool {
args := m.Called(err)
return args.Get(0).(bool)
}

// Ping records the call on the mock and returns the error configured for it.
func (m *MockSession) Ping(timeout time.Duration) error {
	results := m.Called(timeout)
	return results.Error(0)
}
8 changes: 8 additions & 0 deletions support/db/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,14 @@ func (s *Session) Rollback() error {
return err
}

// Ping verifies a connection to the database is still alive,
// establishing a connection if necessary.
//
// The ping runs under a context derived from the session's Ctx that is
// cancelled once timeout has elapsed; the error reported by the ping is
// returned as is.
func (s *Session) Ping(timeout time.Duration) error {
	pingCtx, release := context.WithTimeout(s.Ctx, timeout)
	// Always release the timer held by the derived context.
	defer release()

	return s.DB.PingContext(pingCtx)
}

// Select runs `query`, setting the results found on `dest`.
func (s *Session) Select(dest interface{}, query sq.Sqlizer) error {
sql, args, err := s.build(query)
Expand Down

0 comments on commit df2d6e4

Please sign in to comment.