diff --git a/healthcheck/healthcheck.go b/healthcheck/healthcheck.go index 1d6d6b280..295f2afe5 100644 --- a/healthcheck/healthcheck.go +++ b/healthcheck/healthcheck.go @@ -12,6 +12,7 @@ import ( "github.com/containous/traefik/log" "github.com/containous/traefik/safe" + "github.com/go-kit/kit/metrics" "github.com/vulcand/oxy/roundrobin" ) @@ -19,9 +20,9 @@ var singleton *HealthCheck var once sync.Once // GetHealthCheck returns the health check which is guaranteed to be a singleton. -func GetHealthCheck() *HealthCheck { +func GetHealthCheck(metrics metricsRegistry) *HealthCheck { once.Do(func() { - singleton = newHealthCheck() + singleton = newHealthCheck(metrics) }) return singleton } @@ -50,6 +51,7 @@ type BackendHealthCheck struct { //HealthCheck struct type HealthCheck struct { Backends map[string]*BackendHealthCheck + metrics metricsRegistry cancel context.CancelFunc } @@ -60,12 +62,19 @@ type LoadBalancer interface { Servers() []*url.URL } -func newHealthCheck() *HealthCheck { +func newHealthCheck(metrics metricsRegistry) *HealthCheck { return &HealthCheck{ Backends: make(map[string]*BackendHealthCheck), + metrics: metrics, } } +// metricsRegistry is a local interface in the healthcheck package, exposing only the required metrics +// necessary for the healthcheck package. This makes it easier for the tests. +type metricsRegistry interface { + BackendServerUpGauge() metrics.Gauge +} + // NewBackendHealthCheck Instantiate a new BackendHealthCheck func NewBackendHealthCheck(options Options, backendName string) *BackendHealthCheck { return &BackendHealthCheck{ @@ -113,22 +122,30 @@ func (hc *HealthCheck) checkBackend(backend *BackendHealthCheck) { enabledURLs := backend.LB.Servers() var newDisabledURLs []*url.URL for _, url := range backend.disabledURLs { + serverUpMetricValue := float64(0) if err := checkHealth(url, backend); err == nil { log.Warnf("Health check up: Returning to server list. Backend: %q URL: %q", backend.name, url.String()) backend.LB.UpsertServer(url, roundrobin.Weight(1)) + serverUpMetricValue = 1 } else { log.Warnf("Health check still failing. Backend: %q URL: %q Reason: %s", backend.name, url.String(), err) newDisabledURLs = append(newDisabledURLs, url) } + labelValues := []string{"backend", backend.name, "url", url.String()} + hc.metrics.BackendServerUpGauge().With(labelValues...).Set(serverUpMetricValue) } backend.disabledURLs = newDisabledURLs for _, url := range enabledURLs { + serverUpMetricValue := float64(1) if err := checkHealth(url, backend); err != nil { log.Warnf("Health check failed: Remove from server list. Backend: %q URL: %q Reason: %s", backend.name, url.String(), err) backend.LB.RemoveServer(url) backend.disabledURLs = append(backend.disabledURLs, url) + serverUpMetricValue = 0 } + labelValues := []string{"backend", backend.name, "url", url.String()} + hc.metrics.BackendServerUpGauge().With(labelValues...).Set(serverUpMetricValue) } } diff --git a/healthcheck/healthcheck_test.go b/healthcheck/healthcheck_test.go index 81089bb13..5a8481db4 100644 --- a/healthcheck/healthcheck_test.go +++ b/healthcheck/healthcheck_test.go @@ -27,6 +27,7 @@ func TestSetBackendsConfiguration(t *testing.T) { healthSequence []bool wantNumRemovedServers int wantNumUpsertedServers int + wantGaugeValue float64 }{ { desc: "healthy server staying healthy", @@ -34,6 +35,7 @@ func TestSetBackendsConfiguration(t *testing.T) { healthSequence: []bool{true}, wantNumRemovedServers: 0, wantNumUpsertedServers: 0, + wantGaugeValue: 1, }, { desc: "healthy server becoming sick", @@ -41,6 +43,7 @@ func TestSetBackendsConfiguration(t *testing.T) { healthSequence: []bool{false}, wantNumRemovedServers: 1, wantNumUpsertedServers: 0, + wantGaugeValue: 0, }, { desc: "sick server becoming healthy", @@ -48,6 +51,7 @@ func TestSetBackendsConfiguration(t *testing.T) { healthSequence: []bool{true}, wantNumRemovedServers: 0, wantNumUpsertedServers: 1, + wantGaugeValue: 1, }, { desc: "sick server staying sick", @@ -55,6 +59,7 @@ func TestSetBackendsConfiguration(t *testing.T) { healthSequence: []bool{false}, wantNumRemovedServers: 0, wantNumUpsertedServers: 0, + wantGaugeValue: 0, }, { desc: "healthy server toggling to sick and back to healthy", @@ -62,6 +67,7 @@ func TestSetBackendsConfiguration(t *testing.T) { healthSequence: []bool{false, true}, wantNumRemovedServers: 1, wantNumUpsertedServers: 1, + wantGaugeValue: 1, }, } @@ -89,8 +95,10 @@ func TestSetBackendsConfiguration(t *testing.T) { backend.disabledURLs = append(backend.disabledURLs, serverURL) } + collectingMetrics := testhelpers.NewCollectingHealthCheckMetrics() check := HealthCheck{ Backends: make(map[string]*BackendHealthCheck), + metrics: collectingMetrics, } wg := sync.WaitGroup{} wg.Add(1) @@ -118,6 +126,10 @@ func TestSetBackendsConfiguration(t *testing.T) { if lb.numUpsertedServers != test.wantNumUpsertedServers { t.Errorf("got %d upserted servers, wanted %d", lb.numUpsertedServers, test.wantNumUpsertedServers) } + + if collectingMetrics.Gauge.GaugeValue != test.wantGaugeValue { + t.Errorf("got %v ServerUp Gauge, want %v", collectingMetrics.Gauge.GaugeValue, test.wantGaugeValue) + } }) } } diff --git a/metrics/datadog.go b/metrics/datadog.go index 9a48f6036..ae099b14b 100644 --- a/metrics/datadog.go +++ b/metrics/datadog.go @@ -31,10 +31,10 @@ func RegisterDatadog(config *types.Datadog) Registry { } registry := &standardRegistry{ - enabled: true, - reqsCounter: datadogClient.NewCounter(ddMetricsReqsName, 1.0), - reqDurationHistogram: datadogClient.NewHistogram(ddMetricsLatencyName, 1.0), - retriesCounter: datadogClient.NewCounter(ddRetriesTotalName, 1.0), + enabled: true, + backendReqsCounter: datadogClient.NewCounter(ddMetricsReqsName, 1.0), + backendReqDurationHistogram: datadogClient.NewHistogram(ddMetricsLatencyName, 1.0), + backendRetriesCounter: datadogClient.NewCounter(ddRetriesTotalName, 1.0), } return registry diff --git a/metrics/datadog_test.go b/metrics/datadog_test.go index 1ec6c0f07..8559f6b3b 100644 --- a/metrics/datadog_test.go +++ b/metrics/datadog_test.go @@ -31,10 +31,10 @@ func TestDatadog(t *testing.T) { } udp.ShouldReceiveAll(t, expected, func() { - datadogRegistry.ReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet).Add(1) - datadogRegistry.ReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusNotFound), "method", http.MethodGet).Add(1) - datadogRegistry.ReqDurationHistogram().With("service", "test", "code", strconv.Itoa(http.StatusOK)).Observe(10000) - datadogRegistry.RetriesCounter().With("service", "test").Add(1) - datadogRegistry.RetriesCounter().With("service", "test").Add(1) + datadogRegistry.BackendReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet).Add(1) + datadogRegistry.BackendReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusNotFound), "method", http.MethodGet).Add(1) + datadogRegistry.BackendReqDurationHistogram().With("service", "test", "code", strconv.Itoa(http.StatusOK)).Observe(10000) + datadogRegistry.BackendRetriesCounter().With("service", "test").Add(1) + datadogRegistry.BackendRetriesCounter().With("service", "test").Add(1) }) } diff --git a/metrics/influxdb.go b/metrics/influxdb.go index 4ab029a6d..5c573fc4d 100644 --- a/metrics/influxdb.go +++ b/metrics/influxdb.go @@ -37,10 +37,10 @@ func RegisterInfluxDB(config *types.InfluxDB) Registry { } return &standardRegistry{ - enabled: true, - reqsCounter: influxDBClient.NewCounter(influxDBMetricsReqsName), - reqDurationHistogram: influxDBClient.NewHistogram(influxDBMetricsLatencyName), - retriesCounter: influxDBClient.NewCounter(influxDBRetriesTotalName), + enabled: true, + backendReqsCounter: influxDBClient.NewCounter(influxDBMetricsReqsName), + backendReqDurationHistogram: influxDBClient.NewHistogram(influxDBMetricsLatencyName), + backendRetriesCounter: influxDBClient.NewCounter(influxDBRetriesTotalName), } } diff --git a/metrics/influxdb_test.go b/metrics/influxdb_test.go index a8b52ad30..0e15f5254 100644 --- a/metrics/influxdb_test.go +++ b/metrics/influxdb_test.go @@ -31,11 +31,11 @@ func TestInfluxDB(t *testing.T) { } msg := udp.ReceiveString(t, func() { - influxDBRegistry.ReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet).Add(1) - influxDBRegistry.ReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusNotFound), "method", http.MethodGet).Add(1) - influxDBRegistry.RetriesCounter().With("service", "test").Add(1) - influxDBRegistry.RetriesCounter().With("service", "test").Add(1) - influxDBRegistry.ReqDurationHistogram().With("service", "test", "code", strconv.Itoa(http.StatusOK)).Observe(10000) + influxDBRegistry.BackendReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet).Add(1) + influxDBRegistry.BackendReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusNotFound), "method", http.MethodGet).Add(1) + influxDBRegistry.BackendRetriesCounter().With("service", "test").Add(1) + influxDBRegistry.BackendRetriesCounter().With("service", "test").Add(1) + influxDBRegistry.BackendReqDurationHistogram().With("service", "test", "code", strconv.Itoa(http.StatusOK)).Observe(10000) }) assertMessage(t, msg, expected) diff --git a/metrics/metrics.go b/metrics/metrics.go index a486de838..62079d16a 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -9,71 +9,168 @@ import ( type Registry interface { // IsEnabled shows whether metrics instrumentation is enabled. IsEnabled() bool - ReqsCounter() metrics.Counter - ReqDurationHistogram() metrics.Histogram - RetriesCounter() metrics.Counter + // server metrics + ConfigReloadsCounter() metrics.Counter + ConfigReloadsFailureCounter() metrics.Counter + LastConfigReloadSuccessGauge() metrics.Gauge + LastConfigReloadFailureGauge() metrics.Gauge + + // entry point metrics + EntrypointReqsCounter() metrics.Counter + EntrypointReqDurationHistogram() metrics.Histogram + EntrypointOpenConnsGauge() metrics.Gauge + + // backend metrics + BackendReqsCounter() metrics.Counter + BackendReqDurationHistogram() metrics.Histogram + BackendOpenConnsGauge() metrics.Gauge + BackendRetriesCounter() metrics.Counter + BackendServerUpGauge() metrics.Gauge } -// NewMultiRegistry creates a new standardRegistry that wraps multiple Registries. +// NewVoidRegistry is a noop implementation of metrics.Registry. +// It is used to avoid nil checking in components that do metric collections. +func NewVoidRegistry() Registry { + return NewMultiRegistry([]Registry{}) +} + +// NewMultiRegistry is an implementation of metrics.Registry that wraps multiple registries. +// It handles the case when a registry hasn't registered some metric and returns nil. +// This allows for feature imparity between the different metric implementations. func NewMultiRegistry(registries []Registry) Registry { - reqsCounters := []metrics.Counter{} - reqDurationHistograms := []metrics.Histogram{} - retriesCounters := []metrics.Counter{} + configReloadsCounter := []metrics.Counter{} + configReloadsFailureCounter := []metrics.Counter{} + lastConfigReloadSuccessGauge := []metrics.Gauge{} + lastConfigReloadFailureGauge := []metrics.Gauge{} + entrypointReqsCounter := []metrics.Counter{} + entrypointReqDurationHistogram := []metrics.Histogram{} + entrypointOpenConnsGauge := []metrics.Gauge{} + backendReqsCounter := []metrics.Counter{} + backendReqDurationHistogram := []metrics.Histogram{} + backendOpenConnsGauge := []metrics.Gauge{} + backendRetriesCounter := []metrics.Counter{} + backendServerUpGauge := []metrics.Gauge{} for _, r := range registries { - reqsCounters = append(reqsCounters, r.ReqsCounter()) - reqDurationHistograms = append(reqDurationHistograms, r.ReqDurationHistogram()) - retriesCounters = append(retriesCounters, r.RetriesCounter()) + if r.ConfigReloadsCounter() != nil { + configReloadsCounter = append(configReloadsCounter, r.ConfigReloadsCounter()) + } + if r.ConfigReloadsFailureCounter() != nil { + configReloadsFailureCounter = append(configReloadsFailureCounter, r.ConfigReloadsFailureCounter()) + } + if r.LastConfigReloadSuccessGauge() != nil { + lastConfigReloadSuccessGauge = append(lastConfigReloadSuccessGauge, r.LastConfigReloadSuccessGauge()) + } + if r.LastConfigReloadFailureGauge() != nil { + lastConfigReloadFailureGauge = append(lastConfigReloadFailureGauge, r.LastConfigReloadFailureGauge()) + } + if r.EntrypointReqsCounter() != nil { + entrypointReqsCounter = append(entrypointReqsCounter, r.EntrypointReqsCounter()) + } + if r.EntrypointReqDurationHistogram() != nil { + entrypointReqDurationHistogram = append(entrypointReqDurationHistogram, r.EntrypointReqDurationHistogram()) + } + if r.EntrypointOpenConnsGauge() != nil { + entrypointOpenConnsGauge = append(entrypointOpenConnsGauge, r.EntrypointOpenConnsGauge()) + } + if r.BackendReqsCounter() != nil { + backendReqsCounter = append(backendReqsCounter, r.BackendReqsCounter()) + } + if r.BackendReqDurationHistogram() != nil { + backendReqDurationHistogram = append(backendReqDurationHistogram, r.BackendReqDurationHistogram()) + } + if r.BackendOpenConnsGauge() != nil { + backendOpenConnsGauge = append(backendOpenConnsGauge, r.BackendOpenConnsGauge()) + } + if r.BackendRetriesCounter() != nil { + backendRetriesCounter = append(backendRetriesCounter, r.BackendRetriesCounter()) + } + if r.BackendServerUpGauge() != nil { + backendServerUpGauge = append(backendServerUpGauge, r.BackendServerUpGauge()) + } } return &standardRegistry{ - enabled: true, - reqsCounter: multi.NewCounter(reqsCounters...), - reqDurationHistogram: multi.NewHistogram(reqDurationHistograms...), - retriesCounter: multi.NewCounter(retriesCounters...), + enabled: len(registries) > 0, + configReloadsCounter: multi.NewCounter(configReloadsCounter...), + configReloadsFailureCounter: multi.NewCounter(configReloadsFailureCounter...), + lastConfigReloadSuccessGauge: multi.NewGauge(lastConfigReloadSuccessGauge...), + lastConfigReloadFailureGauge: multi.NewGauge(lastConfigReloadFailureGauge...), + entrypointReqsCounter: multi.NewCounter(entrypointReqsCounter...), + entrypointReqDurationHistogram: multi.NewHistogram(entrypointReqDurationHistogram...), + entrypointOpenConnsGauge: multi.NewGauge(entrypointOpenConnsGauge...), + backendReqsCounter: multi.NewCounter(backendReqsCounter...), + backendReqDurationHistogram: multi.NewHistogram(backendReqDurationHistogram...), + backendOpenConnsGauge: multi.NewGauge(backendOpenConnsGauge...), + backendRetriesCounter: multi.NewCounter(backendRetriesCounter...), + backendServerUpGauge: multi.NewGauge(backendServerUpGauge...), } } type standardRegistry struct { - enabled bool - reqsCounter metrics.Counter - reqDurationHistogram metrics.Histogram - retriesCounter metrics.Counter + enabled bool + configReloadsCounter metrics.Counter + configReloadsFailureCounter metrics.Counter + lastConfigReloadSuccessGauge metrics.Gauge + lastConfigReloadFailureGauge metrics.Gauge + entrypointReqsCounter metrics.Counter + entrypointReqDurationHistogram metrics.Histogram + entrypointOpenConnsGauge metrics.Gauge + backendReqsCounter metrics.Counter + backendReqDurationHistogram metrics.Histogram + backendOpenConnsGauge metrics.Gauge + backendRetriesCounter metrics.Counter + backendServerUpGauge metrics.Gauge } func (r *standardRegistry) IsEnabled() bool { return r.enabled } -func (r *standardRegistry) ReqsCounter() metrics.Counter { - return r.reqsCounter +func (r *standardRegistry) ConfigReloadsCounter() metrics.Counter { + return r.configReloadsCounter } -func (r *standardRegistry) ReqDurationHistogram() metrics.Histogram { - return r.reqDurationHistogram +func (r *standardRegistry) ConfigReloadsFailureCounter() metrics.Counter { + return r.configReloadsFailureCounter } -func (r *standardRegistry) RetriesCounter() metrics.Counter { - return r.retriesCounter +func (r *standardRegistry) LastConfigReloadSuccessGauge() metrics.Gauge { + return r.lastConfigReloadSuccessGauge } -// NewVoidRegistry is a noop implementation of metrics.Registry. -// It is used to avoid nil checking in components that do metric collections. -func NewVoidRegistry() Registry { - return &standardRegistry{ - enabled: false, - reqsCounter: &voidCounter{}, - reqDurationHistogram: &voidHistogram{}, - retriesCounter: &voidCounter{}, - } +func (r *standardRegistry) LastConfigReloadFailureGauge() metrics.Gauge { + return r.lastConfigReloadFailureGauge } -type voidCounter struct{} +func (r *standardRegistry) EntrypointReqsCounter() metrics.Counter { + return r.entrypointReqsCounter +} -func (v *voidCounter) With(labelValues ...string) metrics.Counter { return v } -func (v *voidCounter) Add(delta float64) {} +func (r *standardRegistry) EntrypointReqDurationHistogram() metrics.Histogram { + return r.entrypointReqDurationHistogram +} -type voidHistogram struct{} +func (r *standardRegistry) EntrypointOpenConnsGauge() metrics.Gauge { + return r.entrypointOpenConnsGauge +} -func (h *voidHistogram) With(labelValues ...string) metrics.Histogram { return h } -func (h *voidHistogram) Observe(value float64) {} +func (r *standardRegistry) BackendReqsCounter() metrics.Counter { + return r.backendReqsCounter +} + +func (r *standardRegistry) BackendReqDurationHistogram() metrics.Histogram { + return r.backendReqDurationHistogram +} + +func (r *standardRegistry) BackendOpenConnsGauge() metrics.Gauge { + return r.backendOpenConnsGauge +} + +func (r *standardRegistry) BackendRetriesCounter() metrics.Counter { + return r.backendRetriesCounter +} + +func (r *standardRegistry) BackendServerUpGauge() metrics.Gauge { + return r.backendServerUpGauge +} diff --git a/metrics/metrics_test.go b/metrics/metrics_test.go index b851d04b3..1e1b159f0 100644 --- a/metrics/metrics_test.go +++ b/metrics/metrics_test.go @@ -7,29 +7,18 @@ import ( "github.com/stretchr/testify/assert" ) -func TestNewVoidRegistry(t *testing.T) { - registry := NewVoidRegistry() - - if registry.IsEnabled() { - t.Errorf("VoidRegistry should not return true for IsEnabled()") - } - registry.ReqsCounter().With("some", "value").Add(1) - registry.ReqDurationHistogram().With("some", "value").Observe(1) - registry.RetriesCounter().With("some", "value").Add(1) -} - func TestNewMultiRegistry(t *testing.T) { registries := []Registry{newCollectingRetryMetrics(), newCollectingRetryMetrics()} registry := NewMultiRegistry(registries) - registry.ReqsCounter().With("key", "requests").Add(1) - registry.ReqDurationHistogram().With("key", "durations").Observe(2) - registry.RetriesCounter().With("key", "retries").Add(3) + registry.BackendReqsCounter().With("key", "requests").Add(1) + registry.BackendReqDurationHistogram().With("key", "durations").Observe(2) + registry.BackendRetriesCounter().With("key", "retries").Add(3) for _, collectingRegistry := range registries { - cReqsCounter := collectingRegistry.ReqsCounter().(*counterMock) - cReqDurationHistogram := collectingRegistry.ReqDurationHistogram().(*histogramMock) - cRetriesCounter := collectingRegistry.RetriesCounter().(*counterMock) + cReqsCounter := collectingRegistry.BackendReqsCounter().(*counterMock) + cReqDurationHistogram := collectingRegistry.BackendReqDurationHistogram().(*histogramMock) + cRetriesCounter := collectingRegistry.BackendRetriesCounter().(*counterMock) wantCounterValue := float64(1) if cReqsCounter.counterValue != wantCounterValue { @@ -52,9 +41,9 @@ func TestNewMultiRegistry(t *testing.T) { func newCollectingRetryMetrics() Registry { return &standardRegistry{ - reqsCounter: &counterMock{}, - reqDurationHistogram: &histogramMock{}, - retriesCounter: &counterMock{}, + backendReqsCounter: &counterMock{}, + backendReqDurationHistogram: &histogramMock{}, + backendRetriesCounter: &counterMock{}, } } diff --git a/metrics/prometheus.go b/metrics/prometheus.go index bdf64d4d7..75b9c9468 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -2,10 +2,14 @@ package metrics import ( "net/http" + "sort" + "strings" + "sync" "github.com/containous/mux" + "github.com/containous/traefik/safe" "github.com/containous/traefik/types" - "github.com/go-kit/kit/metrics/prometheus" + "github.com/go-kit/kit/metrics" stdprometheus "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -13,15 +17,50 @@ import ( const ( metricNamePrefix = "traefik_" - reqsTotalName = metricNamePrefix + "requests_total" - reqDurationName = metricNamePrefix + "request_duration_seconds" - retriesTotalName = metricNamePrefix + "backend_retries_total" + // server meta information + configReloadsTotalName = metricNamePrefix + "config_reloads_total" + configReloadsFailuresTotalName = metricNamePrefix + "config_reloads_failure_total" + configLastReloadSuccessName = metricNamePrefix + "config_last_reload_success" + configLastReloadFailureName = metricNamePrefix + "config_last_reload_failure" + + // entrypoint + entrypointReqsTotalName = metricNamePrefix + "entrypoint_requests_total" + entrypointReqDurationName = metricNamePrefix + "entrypoint_request_duration_seconds" + entrypointOpenConnsName = metricNamePrefix + "entrypoint_open_connections" + + // backend level + backendReqsTotalName = metricNamePrefix + "backend_requests_total" + backendReqDurationName = metricNamePrefix + "backend_request_duration_seconds" + backendOpenConnsName = metricNamePrefix + "backend_open_connections" + backendRetriesTotalName = metricNamePrefix + "backend_retries_total" + backendServerUpName = metricNamePrefix + "backend_server_up" ) -// PrometheusHandler expose Prometheus routes +const ( + // generationAgeForever indicates that a metric never gets outdated. + generationAgeForever = 0 + // generationAgeDefault is the default age of three generations. + generationAgeDefault = 3 +) + +// promState holds all metric state internally and acts as the only Collector we register for Prometheus. +// +// This enables control to remove metrics that belong to outdated configuration. +// As an example why this is required, consider Traefik learns about a new service. +// It populates the 'traefik_server_backend_up' metric for it with a value of 1 (alive). +// When the backend is undeployed now the metric is still there in the client library +// and will be until Traefik would be restarted. +// +// To solve this problem promState keeps track of configuration generations. +// Every time a new configuration is loaded, the generation is increased by one. +// Metrics that "belong" to a dynamic configuration part of Traefik (e.g. backend, entrypoint) +// are removed, given they were tracked more than 3 generations ago. +var promState = newPrometheusState() + +// PrometheusHandler exposes Prometheus routes. type PrometheusHandler struct{} -// AddRoutes add Prometheus routes on a router +// AddRoutes adds Prometheus routes on a router. func (h PrometheusHandler) AddRoutes(router *mux.Router) { router.Methods(http.MethodGet).Path("/metrics").Handler(promhttp.Handler()) } @@ -34,24 +73,332 @@ func RegisterPrometheus(config *types.Prometheus) Registry { buckets = config.Buckets } - reqCounter := prometheus.NewCounterFrom(stdprometheus.CounterOpts{ - Name: reqsTotalName, - Help: "How many HTTP requests processed, partitioned by status code and method.", - }, []string{"service", "code", "method"}) - reqDurationHistogram := prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ - Name: reqDurationName, - Help: "How long it took to process the request.", + safe.Go(func() { + promState.ListenValueUpdates() + }) + + configReloads := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{ + Name: configReloadsTotalName, + Help: "Config reloads", + }, []string{}) + configReloadsFailures := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{ + Name: configReloadsFailuresTotalName, + Help: "Config failure reloads", + }, []string{}) + lastConfigReloadSuccess := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{ + Name: configLastReloadSuccessName, + Help: "Last config reload success", + }, []string{}) + lastConfigReloadFailure := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{ + Name: configLastReloadFailureName, + Help: "Last config reload failure", + }, []string{}) + + entrypointReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{ + Name: entrypointReqsTotalName, + Help: "How many HTTP requests processed on an entrypoint, partitioned by status code, protocol, and method.", + }, []string{"code", "method", "protocol", "entrypoint"}) + entrypointReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{ + Name: entrypointReqDurationName, + Help: "How long it took to process the request on an entrypoint, partitioned by status code, protocol, and method.", Buckets: buckets, - }, []string{"service", "code"}) - retryCounter := prometheus.NewCounterFrom(stdprometheus.CounterOpts{ - Name: retriesTotalName, - Help: "How many request retries happened in total.", - }, []string{"service"}) + }, []string{"code", "method", "protocol", "entrypoint"}) + entrypointOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{ + Name: entrypointOpenConnsName, + Help: "How many open connections exist on an entrypoint, partitioned by method and protocol.", + }, []string{"method", "protocol", "entrypoint"}) + + backendReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{ + Name: backendReqsTotalName, + Help: "How many HTTP requests processed on a backend, partitioned by status code, protocol, and method.", + }, []string{"code", "method", "protocol", "backend"}) + backendReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{ + Name: backendReqDurationName, + Help: "How long it took to process the request on a backend, partitioned by status code, protocol, and method.", + Buckets: buckets, + }, []string{"code", "method", "protocol", "backend"}) + backendOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{ + Name: backendOpenConnsName, + Help: "How many open connections exist on a backend, partitioned by method and protocol.", + }, []string{"method", "protocol", "backend"}) + backendRetries := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{ + Name: backendRetriesTotalName, + Help: "How many request retries happened on a backend.", + }, []string{"backend"}) + backendServerUp := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{ + Name: backendServerUpName, + Help: "Backend server is up, described by gauge value of 0 or 1.", + }, []string{"backend", "url"}) + + promState.describers = []func(chan<- *stdprometheus.Desc){ + configReloads.cv.Describe, + configReloadsFailures.cv.Describe, + lastConfigReloadSuccess.gv.Describe, + lastConfigReloadFailure.gv.Describe, + entrypointReqs.cv.Describe, + entrypointReqDurations.hv.Describe, + entrypointOpenConns.gv.Describe, + backendReqs.cv.Describe, + backendReqDurations.hv.Describe, + backendOpenConns.gv.Describe, + backendRetries.cv.Describe, + backendServerUp.gv.Describe, + } + stdprometheus.MustRegister(promState) return &standardRegistry{ - enabled: true, - reqsCounter: reqCounter, - reqDurationHistogram: reqDurationHistogram, - retriesCounter: retryCounter, + enabled: true, + configReloadsCounter: configReloads, + configReloadsFailureCounter: configReloadsFailures, + lastConfigReloadSuccessGauge: lastConfigReloadSuccess, + lastConfigReloadFailureGauge: lastConfigReloadFailure, + entrypointReqsCounter: entrypointReqs, + entrypointReqDurationHistogram: entrypointReqDurations, + entrypointOpenConnsGauge: entrypointOpenConns, + backendReqsCounter: backendReqs, + backendReqDurationHistogram: backendReqDurations, + backendOpenConnsGauge: backendOpenConns, + backendRetriesCounter: backendRetries, + backendServerUpGauge: backendServerUp, } } + +// OnConfigurationUpdate increases the current generation of the prometheus state. +func OnConfigurationUpdate() { + promState.IncGeneration() +} + +func newPrometheusState() *prometheusState { + collectors := make(chan *collector) + state := make(map[string]*collector) + + return &prometheusState{ + collectors: collectors, + state: state, + } +} + +type prometheusState struct { + currentGeneration int + collectors chan *collector + describers []func(ch chan<- *stdprometheus.Desc) + + mtx sync.Mutex + state map[string]*collector +} + +func (ps *prometheusState) IncGeneration() { + ps.mtx.Lock() + defer ps.mtx.Unlock() + ps.currentGeneration++ +} + +func (ps *prometheusState) ListenValueUpdates() { + for collector := range ps.collectors { + ps.mtx.Lock() + collector.lastTrackedGeneration = ps.currentGeneration + ps.state[collector.id] = collector + ps.mtx.Unlock() + } +} + +// Describe implements prometheus.Collector and simply calls +// the registered describer functions. +func (ps *prometheusState) Describe(ch chan<- *stdprometheus.Desc) { + for _, desc := range ps.describers { + desc(ch) + } +} + +// Collect implements prometheus.Collector. It calls the Collect +// method of all metrics it received on the collectors channel. +// It's also responsible to remove metrics that were tracked +// at least three generations ago. Those metrics are cleaned up +// after the Collect of them were called. +func (ps *prometheusState) Collect(ch chan<- stdprometheus.Metric) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + outdatedKeys := []string{} + for key, cs := range ps.state { + cs.collector.Collect(ch) + + if cs.maxAge == generationAgeForever { + continue + } + if ps.currentGeneration-cs.lastTrackedGeneration >= cs.maxAge { + outdatedKeys = append(outdatedKeys, key) + } + } + + for _, key := range outdatedKeys { + delete(ps.state, key) + } +} + +func newCollector(metricName string, lnvs labelNamesValues, c stdprometheus.Collector) *collector { + maxAge := generationAgeDefault + + // metrics without labels should never become outdated + if len(lnvs) == 0 { + maxAge = generationAgeForever + } + + return &collector{ + id: buildMetricID(metricName, lnvs), + maxAge: maxAge, + collector: c, + } +} + +// collector wraps a Collector object from the Prometheus client library. +// It adds information on how many generations this metric should be present +// in the /metrics output, relatived to the time it was last tracked. +type collector struct { + id string + collector stdprometheus.Collector + lastTrackedGeneration int + maxAge int +} + +func buildMetricID(metricName string, lnvs labelNamesValues) string { + newLnvs := append([]string{}, lnvs...) + sort.Strings(newLnvs) + return metricName + ":" + strings.Join(newLnvs, "|") +} + +func newCounterFrom(collectors chan<- *collector, opts stdprometheus.CounterOpts, labelNames []string) *counter { + cv := stdprometheus.NewCounterVec(opts, labelNames) + c := &counter{ + name: opts.Name, + cv: cv, + collectors: collectors, + } + if len(labelNames) == 0 { + c.Add(0) + } + return c +} + +type counter struct { + name string + cv *stdprometheus.CounterVec + labelNamesValues labelNamesValues + collectors chan<- *collector +} + +func (c *counter) With(labelValues ...string) metrics.Counter { + return &counter{ + name: c.name, + cv: c.cv, + labelNamesValues: c.labelNamesValues.With(labelValues...), + collectors: c.collectors, + } +} + +func (c *counter) Add(delta float64) { + collector := c.cv.With(c.labelNamesValues.ToLabels()) + collector.Add(delta) + c.collectors <- newCollector(c.name, c.labelNamesValues, collector) +} + +func (c *counter) Describe(ch chan<- *stdprometheus.Desc) { + c.cv.Describe(ch) +} + +func newGaugeFrom(collectors chan<- *collector, opts stdprometheus.GaugeOpts, labelNames []string) *gauge { + gv := stdprometheus.NewGaugeVec(opts, labelNames) + g := &gauge{ + name: opts.Name, + gv: gv, + collectors: collectors, + } + if len(labelNames) == 0 { + g.Set(0) + } + return g +} + +type gauge struct { + name string + gv *stdprometheus.GaugeVec + labelNamesValues labelNamesValues + collectors chan<- *collector +} + +func (g *gauge) With(labelValues ...string) metrics.Gauge { + return &gauge{ + name: g.name, + gv: g.gv, + labelNamesValues: g.labelNamesValues.With(labelValues...), + collectors: g.collectors, + } +} + +func (g *gauge) Set(value float64) { + collector := g.gv.With(g.labelNamesValues.ToLabels()) + collector.Set(value) + g.collectors <- newCollector(g.name, g.labelNamesValues, collector) +} + +func (g *gauge) Describe(ch chan<- *stdprometheus.Desc) { + g.gv.Describe(ch) +} + +func newHistogramFrom(collectors chan<- *collector, opts stdprometheus.HistogramOpts, labelNames []string) *histogram { + hv := stdprometheus.NewHistogramVec(opts, labelNames) + return &histogram{ + name: opts.Name, + hv: hv, + collectors: collectors, + } +} + +type histogram struct { + name string + hv *stdprometheus.HistogramVec + labelNamesValues labelNamesValues + collectors chan<- *collector +} + +func (h *histogram) With(labelValues ...string) metrics.Histogram { + return &histogram{ + name: h.name, + hv: h.hv, + labelNamesValues: h.labelNamesValues.With(labelValues...), + collectors: h.collectors, + } +} + +func (h *histogram) Observe(value float64) { + collector := h.hv.With(h.labelNamesValues.ToLabels()) + collector.Observe(value) + h.collectors <- newCollector(h.name, h.labelNamesValues, collector) +} + +func (h *histogram) Describe(ch chan<- *stdprometheus.Desc) { + h.hv.Describe(ch) +} + +// labelNamesValues is a type alias that provides validation on its With method. +// Metrics may include it as a member to help them satisfy With semantics and +// save some code duplication. +type labelNamesValues []string + +// With validates the input, and returns a new aggregate labelNamesValues. +func (lvs labelNamesValues) With(labelValues ...string) labelNamesValues { + if len(labelValues)%2 != 0 { + labelValues = append(labelValues, "unknown") + } + return append(lvs, labelValues...) +} + +// ToLabels is a convenience method to convert a labelNamesValues +// to the native prometheus.Labels. +func (lvs labelNamesValues) ToLabels() stdprometheus.Labels { + labels := stdprometheus.Labels{} + for i := 0; i < len(lvs); i += 2 { + labels[lvs[i]] = lvs[i+1] + } + return labels +} diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index 79e4b4d1f..9c698e045 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -1,9 +1,11 @@ package metrics import ( + "fmt" "net/http" "strconv" "testing" + "time" "github.com/containous/traefik/types" "github.com/prometheus/client_golang/prometheus" @@ -12,20 +14,54 @@ import ( func TestPrometheus(t *testing.T) { prometheusRegistry := RegisterPrometheus(&types.Prometheus{}) + defer prometheus.Unregister(promState) if !prometheusRegistry.IsEnabled() { t.Errorf("PrometheusRegistry should return true for IsEnabled()") } - prometheusRegistry.ReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet).Add(1) - prometheusRegistry.ReqsCounter().With("service", "test", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet).Add(1) - prometheusRegistry.ReqDurationHistogram().With("service", "test", "code", strconv.Itoa(http.StatusOK)).Observe(10000) - prometheusRegistry.ReqDurationHistogram().With("service", "test", "code", strconv.Itoa(http.StatusOK)).Observe(10000) - prometheusRegistry.RetriesCounter().With("service", "test").Add(1) - metricsFamilies, err := prometheus.DefaultGatherer.Gather() - if err != nil { - t.Fatalf("could not gather metrics families: %s", err) - } + prometheusRegistry.ConfigReloadsCounter().Add(1) + prometheusRegistry.ConfigReloadsFailureCounter().Add(1) + prometheusRegistry.LastConfigReloadSuccessGauge().Set(float64(time.Now().Unix())) + prometheusRegistry.LastConfigReloadFailureGauge().Set(float64(time.Now().Unix())) + + prometheusRegistry. + EntrypointReqsCounter(). + With("code", strconv.Itoa(http.StatusOK), "method", http.MethodGet, "protocol", "http", "entrypoint", "http"). + Add(1) + prometheusRegistry. + EntrypointReqDurationHistogram(). + With("code", strconv.Itoa(http.StatusOK), "method", http.MethodGet, "protocol", "http", "entrypoint", "http"). + Observe(1) + prometheusRegistry. + EntrypointOpenConnsGauge(). + With("method", http.MethodGet, "protocol", "http", "entrypoint", "http"). + Set(1) + + prometheusRegistry. + BackendReqsCounter(). + With("backend", "backend1", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet, "protocol", "http"). + Add(1) + prometheusRegistry. + BackendReqDurationHistogram(). + With("backend", "backend1", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet, "protocol", "http"). + Observe(10000) + prometheusRegistry. + BackendOpenConnsGauge(). + With("backend", "backend1", "method", http.MethodGet, "protocol", "http"). + Set(1) + prometheusRegistry. + BackendRetriesCounter(). + With("backend", "backend1"). + Add(1) + prometheusRegistry. + BackendServerUpGauge(). + With("backend", "backend1", "url", "http://127.0.0.10:80"). + Set(1) + + delayForTrackingCompletion() + + metricsFamilies := mustScrape() tests := []struct { name string @@ -33,46 +69,93 @@ func TestPrometheus(t *testing.T) { assert func(*dto.MetricFamily) }{ { - name: reqsTotalName, - labels: map[string]string{ - "code": "200", - "method": http.MethodGet, - "service": "test", - }, - assert: func(family *dto.MetricFamily) { - cv := family.Metric[0].Counter.GetValue() - expectedCv := float64(2) - if cv != expectedCv { - t.Errorf("gathered metrics do not contain correct value for total requests, got %f expected %f", cv, expectedCv) - } - }, + name: configReloadsTotalName, + assert: buildCounterAssert(t, configReloadsTotalName, 1), }, { - name: reqDurationName, - labels: map[string]string{ - "service": "test", - "code": "200", - }, - assert: func(family *dto.MetricFamily) { - sc := family.Metric[0].Histogram.GetSampleCount() - expectedSc := uint64(2) - if sc != expectedSc { - t.Errorf("gathered metrics do not contain correct sample count for request duration, got %d expected %d", sc, expectedSc) - } - }, + name: configReloadsFailuresTotalName, + assert: buildCounterAssert(t, configReloadsFailuresTotalName, 1), }, { - name: retriesTotalName, + name: configLastReloadSuccessName, + assert: buildTimestampAssert(t, configLastReloadSuccessName), + }, + { + name: configLastReloadFailureName, + assert: buildTimestampAssert(t, configLastReloadFailureName), + }, + { + name: entrypointReqsTotalName, labels: map[string]string{ - "service": "test", + "code": "200", + "method": http.MethodGet, + "protocol": "http", + "entrypoint": "http", }, - assert: func(family *dto.MetricFamily) { - cv := family.Metric[0].Counter.GetValue() - expectedCv := float64(1) - if cv != expectedCv { - t.Errorf("gathered metrics do not contain correct value for total retries, got %f expected %f", cv, expectedCv) - } + assert: buildCounterAssert(t, entrypointReqsTotalName, 1), + }, + { + name: entrypointReqDurationName, + labels: map[string]string{ + "code": "200", + "method": http.MethodGet, + "protocol": "http", + "entrypoint": "http", }, + assert: buildHistogramAssert(t, entrypointReqDurationName, 1), + }, + { + name: entrypointOpenConnsName, + labels: map[string]string{ + "method": http.MethodGet, + "protocol": "http", + "entrypoint": "http", + }, + assert: buildGaugeAssert(t, entrypointOpenConnsName, 1), + }, + { + name: backendReqsTotalName, + labels: map[string]string{ + "code": "200", + "method": http.MethodGet, + "protocol": "http", + "backend": "backend1", + }, + assert: buildCounterAssert(t, backendReqsTotalName, 1), + }, + { + name: backendReqDurationName, + labels: map[string]string{ + "code": "200", + "method": http.MethodGet, + "protocol": "http", + "backend": "backend1", + }, + assert: buildHistogramAssert(t, backendReqDurationName, 1), + }, + { + name: backendOpenConnsName, + labels: map[string]string{ + "method": http.MethodGet, + "protocol": "http", + "backend": "backend1", + }, + assert: buildGaugeAssert(t, backendOpenConnsName, 1), + }, + { + name: backendRetriesTotalName, + labels: map[string]string{ + "backend": "backend1", + }, + assert: buildGreaterThanCounterAssert(t, backendRetriesTotalName, 1), + }, + { + name: backendServerUpName, + labels: map[string]string{ + "backend": "backend1", + "url": "http://127.0.0.10:80", + }, + assert: buildGaugeAssert(t, backendServerUpName, 1), }, } @@ -94,6 +177,90 @@ func TestPrometheus(t *testing.T) { } } +func TestPrometheusGenerationLogicForMetricWithLabel(t *testing.T) { + prometheusRegistry := RegisterPrometheus(&types.Prometheus{}) + defer prometheus.Unregister(promState) + + // Metrics with labels belonging to a specific configuration in Traefik + // should be removed when the generationMaxAge is exceeded. As example + // we use the traefik_backend_requests_total metric. + prometheusRegistry. + BackendReqsCounter(). + With("backend", "backend1", "code", strconv.Itoa(http.StatusOK), "method", http.MethodGet, "protocol", "http"). + Add(1) + + delayForTrackingCompletion() + + assertMetricExists(t, backendReqsTotalName, mustScrape()) + + // Increase the config generation one more than the max age of a metric. + for i := 0; i < generationAgeDefault+1; i++ { + OnConfigurationUpdate() + } + + // On the next scrape the metric still exists and will be removed + // after the scrape completed. + assertMetricExists(t, backendReqsTotalName, mustScrape()) + + // Now the metric should be absent. + assertMetricAbsent(t, backendReqsTotalName, mustScrape()) +} + +func TestPrometheusGenerationLogicForMetricWithoutLabel(t *testing.T) { + prometheusRegistry := RegisterPrometheus(&types.Prometheus{}) + defer prometheus.Unregister(promState) + + // Metrics without labels like traefik_config_reloads_total should live forever + // and never get removed. + prometheusRegistry.ConfigReloadsCounter().Add(1) + + delayForTrackingCompletion() + + assertMetricExists(t, configReloadsTotalName, mustScrape()) + + // Increase the config generation one more than the max age of a metric. + for i := 0; i < generationAgeDefault+100; i++ { + OnConfigurationUpdate() + } + + // Scrape two times in order to verify, that it is not removed after the + // first scrape completed. + assertMetricExists(t, configReloadsTotalName, mustScrape()) + assertMetricExists(t, configReloadsTotalName, mustScrape()) +} + +// Tracking and gathering the metrics happens concurrently. +// In practice this is no problem, because in case a tracked metric would miss +// the current scrape, it would just be there in the next one. +// That we can test reliably the tracking of all metrics here, we sleep +// for a short amount of time, to make sure the metric will be present +// in the next scrape. +func delayForTrackingCompletion() { + time.Sleep(250 * time.Millisecond) +} + +func mustScrape() []*dto.MetricFamily { + families, err := prometheus.DefaultGatherer.Gather() + if err != nil { + panic(fmt.Sprintf("could not gather metrics families: %s", err)) + } + return families +} + +func assertMetricExists(t *testing.T, name string, families []*dto.MetricFamily) { + t.Helper() + if findMetricFamily(name, families) == nil { + t.Errorf("gathered metrics do not contain %q", name) + } +} + +func assertMetricAbsent(t *testing.T, name string, families []*dto.MetricFamily) { + t.Helper() + if findMetricFamily(name, families) != nil { + t.Errorf("gathered metrics contain %q, but should not", name) + } +} + func findMetricFamily(name string, families []*dto.MetricFamily) *dto.MetricFamily { for _, family := range families { if family.GetName() == name { @@ -102,3 +269,43 @@ func findMetricFamily(name string, families []*dto.MetricFamily) *dto.MetricFami } return nil } + +func buildCounterAssert(t *testing.T, metricName string, expectedValue int) func(family *dto.MetricFamily) { + return func(family *dto.MetricFamily) { + if cv := int(family.Metric[0].Counter.GetValue()); cv != expectedValue { + t.Errorf("metric %s has value %d, want %d", metricName, cv, expectedValue) + } + } +} + +func buildGreaterThanCounterAssert(t *testing.T, metricName string, expectedMinValue int) func(family *dto.MetricFamily) { + return func(family *dto.MetricFamily) { + if cv := int(family.Metric[0].Counter.GetValue()); cv < expectedMinValue { + t.Errorf("metric %s has value %d, want at least %d", metricName, cv, expectedMinValue) + } + } +} + +func buildHistogramAssert(t *testing.T, metricName string, expectedSampleCount int) func(family *dto.MetricFamily) { + return func(family *dto.MetricFamily) { + if sc := int(family.Metric[0].Histogram.GetSampleCount()); sc != expectedSampleCount { + t.Errorf("metric %s has sample count value %d, want %d", metricName, sc, expectedSampleCount) + } + } +} + +func buildGaugeAssert(t *testing.T, metricName string, expectedValue int) func(family *dto.MetricFamily) { + return func(family *dto.MetricFamily) { + if gv := int(family.Metric[0].Gauge.GetValue()); gv != expectedValue { + t.Errorf("metric %s has value %d, want %d", metricName, gv, expectedValue) + } + } +} + +func buildTimestampAssert(t *testing.T, metricName string) func(family *dto.MetricFamily) { + return func(family *dto.MetricFamily) { + if ts := time.Unix(int64(family.Metric[0].Gauge.GetValue()), 0); time.Since(ts) > time.Minute { + t.Errorf("metric %s has wrong timestamp %v", metricName, ts) + } + } +} diff --git a/metrics/statsd.go b/metrics/statsd.go index 9dd58e6f6..977a35daf 100644 --- a/metrics/statsd.go +++ b/metrics/statsd.go @@ -30,10 +30,10 @@ func RegisterStatsd(config *types.Statsd) Registry { } return &standardRegistry{ - enabled: true, - reqsCounter: statsdClient.NewCounter(statsdMetricsReqsName, 1.0), - reqDurationHistogram: statsdClient.NewTiming(statsdMetricsLatencyName, 1.0), - retriesCounter: statsdClient.NewCounter(statsdRetriesTotalName, 1.0), + enabled: true, + backendReqsCounter: statsdClient.NewCounter(statsdMetricsReqsName, 1.0), + backendReqDurationHistogram: statsdClient.NewTiming(statsdMetricsLatencyName, 1.0), + backendRetriesCounter: statsdClient.NewCounter(statsdRetriesTotalName, 1.0), } } diff --git a/metrics/statsd_test.go b/metrics/statsd_test.go index 8f23a4432..3587dbc56 100644 --- a/metrics/statsd_test.go +++ b/metrics/statsd_test.go @@ -29,10 +29,10 @@ func TestStatsD(t *testing.T) { } udp.ShouldReceiveAll(t, expected, func() { - statsdRegistry.ReqsCounter().With("service", "test", "code", string(http.StatusOK), "method", http.MethodGet).Add(1) - statsdRegistry.ReqsCounter().With("service", "test", "code", string(http.StatusNotFound), "method", http.MethodGet).Add(1) - statsdRegistry.RetriesCounter().With("service", "test").Add(1) - statsdRegistry.RetriesCounter().With("service", "test").Add(1) - statsdRegistry.ReqDurationHistogram().With("service", "test", "code", string(http.StatusOK)).Observe(10000) + statsdRegistry.BackendReqsCounter().With("service", "test", "code", string(http.StatusOK), "method", http.MethodGet).Add(1) + statsdRegistry.BackendReqsCounter().With("service", "test", "code", string(http.StatusNotFound), "method", http.MethodGet).Add(1) + statsdRegistry.BackendRetriesCounter().With("service", "test").Add(1) + statsdRegistry.BackendRetriesCounter().With("service", "test").Add(1) + statsdRegistry.BackendReqDurationHistogram().With("service", "test", "code", string(http.StatusOK)).Observe(10000) }) } diff --git a/middlewares/metrics.go b/middlewares/metrics.go index 084c576b4..b5a362299 100644 --- a/middlewares/metrics.go +++ b/middlewares/metrics.go @@ -3,51 +3,100 @@ package middlewares import ( "net/http" "strconv" + "strings" + "sync/atomic" "time" "unicode/utf8" "github.com/containous/traefik/log" "github.com/containous/traefik/metrics" gokitmetrics "github.com/go-kit/kit/metrics" + "github.com/urfave/negroni" ) -// MetricsWrapper is a Negroni compatible Handler which relies on a -// given Metrics implementation to expose and monitor Traefik Metrics. -type MetricsWrapper struct { - registry metrics.Registry - serviceName string -} +const ( + protoHTTP = "http" + protoSSE = "sse" + protoWebsocket = "websocket" +) -// NewMetricsWrapper return a MetricsWrapper struct with -// a given Metrics implementation -func NewMetricsWrapper(registry metrics.Registry, service string) *MetricsWrapper { - var metricsWrapper = MetricsWrapper{ - registry: registry, - serviceName: service, +// NewEntryPointMetricsMiddleware creates a new metrics middleware for an Entrypoint. +func NewEntryPointMetricsMiddleware(registry metrics.Registry, entryPointName string) negroni.Handler { + return &metricsMiddleware{ + reqsCounter: registry.EntrypointReqsCounter(), + reqDurationHistogram: registry.EntrypointReqDurationHistogram(), + openConnsGauge: registry.EntrypointOpenConnsGauge(), + baseLabels: []string{"entrypoint", entryPointName}, } - - return &metricsWrapper } -func (m *MetricsWrapper) ServeHTTP(rw http.ResponseWriter, r *http.Request, next http.HandlerFunc) { +// NewBackendMetricsMiddleware creates a new metrics middleware for a Backend. +func NewBackendMetricsMiddleware(registry metrics.Registry, backendName string) negroni.Handler { + return &metricsMiddleware{ + reqsCounter: registry.BackendReqsCounter(), + reqDurationHistogram: registry.BackendReqDurationHistogram(), + openConnsGauge: registry.BackendOpenConnsGauge(), + baseLabels: []string{"backend", backendName}, + } +} + +type metricsMiddleware struct { + reqsCounter gokitmetrics.Counter + reqDurationHistogram gokitmetrics.Histogram + openConnsGauge gokitmetrics.Gauge + baseLabels []string + openConns int64 +} + +func (m *metricsMiddleware) ServeHTTP(rw http.ResponseWriter, r *http.Request, next http.HandlerFunc) { + labels := []string{"method", getMethod(r), "protocol", getRequestProtocol(r)} + labels = append(labels, m.baseLabels...) + + openConns := atomic.AddInt64(&m.openConns, 1) + m.openConnsGauge.With(labels...).Set(float64(openConns)) + defer func(labelValues []string) { + openConns := atomic.AddInt64(&m.openConns, -1) + m.openConnsGauge.With(labelValues...).Set(float64(openConns)) + }(labels) + start := time.Now() - prw := &responseRecorder{rw, http.StatusOK} - next(prw, r) + recorder := &responseRecorder{rw, http.StatusOK} + next(recorder, r) - reqLabels := []string{"service", m.serviceName, "code", strconv.Itoa(prw.statusCode), "method", getMethod(r)} - m.registry.ReqsCounter().With(reqLabels...).Add(1) - - reqDurationLabels := []string{"service", m.serviceName, "code", strconv.Itoa(prw.statusCode)} - m.registry.ReqDurationHistogram().With(reqDurationLabels...).Observe(time.Since(start).Seconds()) + labels = append(labels, "code", strconv.Itoa(recorder.statusCode)) + m.reqsCounter.With(labels...).Add(1) + m.reqDurationHistogram.With(labels...).Observe(float64(time.Since(start).Seconds())) } -type retryMetrics interface { - RetriesCounter() gokitmetrics.Counter +func getRequestProtocol(req *http.Request) string { + switch { + case isWebsocketRequest(req): + return protoWebsocket + case isSSERequest(req): + return protoSSE + default: + return protoHTTP + } } -// NewMetricsRetryListener instantiates a MetricsRetryListener with the given retryMetrics. -func NewMetricsRetryListener(retryMetrics retryMetrics, backendName string) RetryListener { - return &MetricsRetryListener{retryMetrics: retryMetrics, backendName: backendName} +// isWebsocketRequest determines if the specified HTTP request is a websocket handshake request. +func isWebsocketRequest(req *http.Request) bool { + return containsHeader(req, "Connection", "upgrade") && containsHeader(req, "Upgrade", "websocket") +} + +// isSSERequest determines if the specified HTTP request is a request for an event subscription. +func isSSERequest(req *http.Request) bool { + return containsHeader(req, "Accept", "text/event-stream") +} + +func containsHeader(req *http.Request, name, value string) bool { + items := strings.Split(req.Header.Get(name), ",") + for _, item := range items { + if value == strings.ToLower(strings.TrimSpace(item)) { + return true + } + } + return false } func getMethod(r *http.Request) string { @@ -58,6 +107,15 @@ func getMethod(r *http.Request) string { return r.Method } +type retryMetrics interface { + BackendRetriesCounter() gokitmetrics.Counter +} + +// NewMetricsRetryListener instantiates a MetricsRetryListener with the given retryMetrics. +func NewMetricsRetryListener(retryMetrics retryMetrics, backendName string) RetryListener { + return &MetricsRetryListener{retryMetrics: retryMetrics, backendName: backendName} +} + // MetricsRetryListener is an implementation of the RetryListener interface to // record RequestMetrics about retry attempts. type MetricsRetryListener struct { @@ -67,5 +125,5 @@ type MetricsRetryListener struct { // Retried tracks the retry in the RequestMetrics implementation. func (m *MetricsRetryListener) Retried(req *http.Request, attempt int) { - m.retryMetrics.RetriesCounter().With("backend", m.backendName).Add(1) + m.retryMetrics.BackendRetriesCounter().With("backend", m.backendName).Add(1) } diff --git a/middlewares/metrics_test.go b/middlewares/metrics_test.go index 21a3e255b..ae0b1b7aa 100644 --- a/middlewares/metrics_test.go +++ b/middlewares/metrics_test.go @@ -6,6 +6,7 @@ import ( "reflect" "testing" + "github.com/containous/traefik/testhelpers" "github.com/go-kit/kit/metrics" ) @@ -17,39 +18,25 @@ func TestMetricsRetryListener(t *testing.T) { retryListener.Retried(req, 2) wantCounterValue := float64(2) - if retryMetrics.retryCounter.counterValue != wantCounterValue { - t.Errorf("got counter value of %d, want %d", retryMetrics.retryCounter.counterValue, wantCounterValue) + if retryMetrics.retriesCounter.CounterValue != wantCounterValue { + t.Errorf("got counter value of %d, want %d", retryMetrics.retriesCounter.CounterValue, wantCounterValue) } wantLabelValues := []string{"backend", "backendName"} - if !reflect.DeepEqual(retryMetrics.retryCounter.lastLabelValues, wantLabelValues) { - t.Errorf("wrong label values %v used, want %v", retryMetrics.retryCounter.lastLabelValues, wantLabelValues) + if !reflect.DeepEqual(retryMetrics.retriesCounter.LastLabelValues, wantLabelValues) { + t.Errorf("wrong label values %v used, want %v", retryMetrics.retriesCounter.LastLabelValues, wantLabelValues) } } // collectingRetryMetrics is an implementation of the retryMetrics interface that can be used inside tests to collect the times Add() was called. type collectingRetryMetrics struct { - retryCounter *collectingCounter + retriesCounter *testhelpers.CollectingCounter } -func newCollectingRetryMetrics() collectingRetryMetrics { - return collectingRetryMetrics{retryCounter: &collectingCounter{}} +func newCollectingRetryMetrics() *collectingRetryMetrics { + return &collectingRetryMetrics{retriesCounter: &testhelpers.CollectingCounter{}} } -func (metrics collectingRetryMetrics) RetriesCounter() metrics.Counter { - return metrics.retryCounter -} - -type collectingCounter struct { - counterValue float64 - lastLabelValues []string -} - -func (c *collectingCounter) With(labelValues ...string) metrics.Counter { - c.lastLabelValues = labelValues - return c -} - -func (c *collectingCounter) Add(delta float64) { - c.counterValue += delta +func (metrics *collectingRetryMetrics) BackendRetriesCounter() metrics.Counter { + return metrics.retriesCounter } diff --git a/server/server.go b/server/server.go index 37b8f8f42..b0be1d462 100644 --- a/server/server.go +++ b/server/server.go @@ -122,10 +122,7 @@ func NewServer(globalConfiguration configuration.GlobalConfiguration) *Server { server.tracingMiddleware.Setup() } - server.metricsRegistry = metrics.NewVoidRegistry() - if globalConfiguration.Metrics != nil { - server.registerMetricClients(globalConfiguration.Metrics) - } + server.metricsRegistry = registerMetricClients(globalConfiguration.Metrics) if globalConfiguration.Cluster != nil { // leadership creation if cluster mode @@ -304,7 +301,7 @@ func (s *Server) setupServerEntryPoint(newServerEntryPointName string, newServer serverMiddlewares = append(serverMiddlewares, s.accessLoggerMiddleware) } if s.metricsRegistry.IsEnabled() { - serverMiddlewares = append(serverMiddlewares, middlewares.NewMetricsWrapper(s.metricsRegistry, newServerEntryPointName)) + serverMiddlewares = append(serverMiddlewares, middlewares.NewEntryPointMetricsMiddleware(s.metricsRegistry, newServerEntryPointName)) } if s.globalConfiguration.API != nil { if s.globalConfiguration.API.Stats == nil { @@ -449,8 +446,10 @@ func (s *Server) loadConfiguration(configMsg types.ConfigMessage) { } newConfigurations[configMsg.ProviderName] = configMsg.Configuration + s.metricsRegistry.ConfigReloadsCounter().Add(1) newServerEntryPoints, err := s.loadConfig(newConfigurations, s.globalConfiguration) if err == nil { + s.metricsRegistry.LastConfigReloadSuccessGauge().Set(float64(time.Now().Unix())) for newServerEntryPointName, newServerEntryPoint := range newServerEntryPoints { s.serverEntryPoints[newServerEntryPointName].httpRouter.UpdateHandler(newServerEntryPoint.httpRouter.GetHandler()) if s.globalConfiguration.EntryPoints[newServerEntryPointName].TLS == nil { @@ -465,6 +464,8 @@ func (s *Server) loadConfiguration(configMsg types.ConfigMessage) { s.currentConfigurations.Set(newConfigurations) s.postLoadConfiguration() } else { + s.metricsRegistry.ConfigReloadsFailureCounter().Add(1) + s.metricsRegistry.LastConfigReloadFailureGauge().Set(float64(time.Now().Unix())) log.Error("Error loading new configuration, aborted ", err) } } @@ -501,6 +502,8 @@ func (s *serverEntryPoint) getCertificate(clientHello *tls.ClientHelloInfo) (*tl } func (s *Server) postLoadConfiguration() { + metrics.OnConfigurationUpdate() + if s.globalConfiguration.ACME == nil { return } @@ -1070,7 +1073,7 @@ func (s *Server) loadConfig(configurations types.Configurations, globalConfigura rebalancer, _ = roundrobin.NewRebalancer(rr, roundrobin.RebalancerStickySession(sticky)) } lb = rebalancer - if err := configureLBServers(rebalancer, config, frontend); err != nil { + if err := s.configureLBServers(rebalancer, config, frontend); err != nil { log.Errorf("Skipping frontend %s...", frontendName) continue frontend } @@ -1092,7 +1095,7 @@ func (s *Server) loadConfig(configurations types.Configurations, globalConfigura } } lb = rr - if err := configureLBServers(rr, config, frontend); err != nil { + if err := s.configureLBServers(rr, config, frontend); err != nil { log.Errorf("Skipping frontend %s...", frontendName) continue frontend } @@ -1154,7 +1157,7 @@ func (s *Server) loadConfig(configurations types.Configurations, globalConfigura } if s.metricsRegistry.IsEnabled() { - n.Use(middlewares.NewMetricsWrapper(s.metricsRegistry, frontend.Backend)) + n.Use(middlewares.NewBackendMetricsMiddleware(s.metricsRegistry, frontend.Backend)) } ipWhitelistMiddleware, err := configureIPWhitelistMiddleware(frontend.WhitelistSourceRange) @@ -1233,7 +1236,7 @@ func (s *Server) loadConfig(configurations types.Configurations, globalConfigura } } } - healthcheck.GetHealthCheck().SetBackendsConfiguration(s.routinesPool.Ctx(), backendsHealthCheck) + healthcheck.GetHealthCheck(s.metricsRegistry).SetBackendsConfiguration(s.routinesPool.Ctx(), backendsHealthCheck) // Get new certificates list sorted per entrypoints // Update certificates entryPointsCertificates, err := s.loadHTTPSConfiguration(configurations, globalConfiguration.DefaultEntryPoints) @@ -1249,18 +1252,19 @@ func (s *Server) loadConfig(configurations types.Configurations, globalConfigura return serverEntryPoints, err } -func configureLBServers(lb healthcheck.LoadBalancer, config *types.Configuration, frontend *types.Frontend) error { - for serverName, server := range config.Backends[frontend.Backend].Servers { - u, err := url.Parse(server.URL) +func (s *Server) configureLBServers(lb healthcheck.LoadBalancer, config *types.Configuration, frontend *types.Frontend) error { + for name, srv := range config.Backends[frontend.Backend].Servers { + u, err := url.Parse(srv.URL) if err != nil { - log.Errorf("Error parsing server URL %s: %v", server.URL, err) + log.Errorf("Error parsing server URL %s: %v", srv.URL, err) return err } - log.Debugf("Creating server %s at %s with weight %d", serverName, u, server.Weight) - if err := lb.UpsertServer(u, roundrobin.Weight(server.Weight)); err != nil { - log.Errorf("Error adding server %s to load balancer: %v", server.URL, err) + log.Debugf("Creating server %s at %s with weight %d", name, u, srv.Weight) + if err := lb.UpsertServer(u, roundrobin.Weight(srv.Weight)); err != nil { + log.Errorf("Error adding server %s to load balancer: %v", srv.URL, err) return err } + s.metricsRegistry.BackendServerUpGauge().With("backend", frontend.Backend, "url", srv.URL).Set(1) } return nil } @@ -1477,9 +1481,12 @@ func configureBackends(backends map[string]*types.Backend) { } } -func (s *Server) registerMetricClients(metricsConfig *types.Metrics) { - var registries []metrics.Registry +func registerMetricClients(metricsConfig *types.Metrics) metrics.Registry { + if metricsConfig == nil { + return metrics.NewVoidRegistry() + } + registries := []metrics.Registry{} if metricsConfig.Prometheus != nil { registries = append(registries, metrics.RegisterPrometheus(metricsConfig.Prometheus)) log.Debug("Configured Prometheus metrics") @@ -1497,9 +1504,7 @@ func (s *Server) registerMetricClients(metricsConfig *types.Metrics) { log.Debugf("Configured InfluxDB metrics pushing to %s once every %s", metricsConfig.InfluxDB.Address, metricsConfig.InfluxDB.PushInterval) } - if len(registries) > 0 { - s.metricsRegistry = metrics.NewMultiRegistry(registries) - } + return metrics.NewMultiRegistry(registries) } func stopMetricsClients() { diff --git a/server/server_test.go b/server/server_test.go index 6e96af6fa..02f19fe23 100644 --- a/server/server_test.go +++ b/server/server_test.go @@ -484,7 +484,7 @@ func TestServerLoadConfigHealthCheckOptions(t *testing.T) { if healthCheck != nil { wantNumHealthCheckBackends = 1 } - gotNumHealthCheckBackends := len(healthcheck.GetHealthCheck().Backends) + gotNumHealthCheckBackends := len(healthcheck.GetHealthCheck(testhelpers.NewCollectingHealthCheckMetrics()).Backends) if gotNumHealthCheckBackends != wantNumHealthCheckBackends { t.Errorf("got %d health check backends, want %d", gotNumHealthCheckBackends, wantNumHealthCheckBackends) } diff --git a/testhelpers/metrics.go b/testhelpers/metrics.go new file mode 100644 index 000000000..e80fb75f3 --- /dev/null +++ b/testhelpers/metrics.go @@ -0,0 +1,52 @@ +package testhelpers + +import "github.com/go-kit/kit/metrics" + +// CollectingCounter is a metrics.Counter implementation that enables access to the CounterValue and LastLabelValues. +type CollectingCounter struct { + CounterValue float64 + LastLabelValues []string +} + +// With is there to satisfy the metrics.Counter interface. +func (c *CollectingCounter) With(labelValues ...string) metrics.Counter { + c.LastLabelValues = labelValues + return c +} + +// Add is there to satisfy the metrics.Counter interface. +func (c *CollectingCounter) Add(delta float64) { + c.CounterValue += delta +} + +// CollectingGauge is a metrics.Gauge implementation that enables access to the GaugeValue and LastLabelValues. +type CollectingGauge struct { + GaugeValue float64 + LastLabelValues []string +} + +// With is there to satisfy the metrics.Gauge interface. +func (g *CollectingGauge) With(labelValues ...string) metrics.Gauge { + g.LastLabelValues = labelValues + return g +} + +// Set is there to satisfy the metrics.Gauge interface. +func (g *CollectingGauge) Set(delta float64) { + g.GaugeValue = delta +} + +// CollectingHealthCheckMetrics can be used for testing the Metrics instrumentation of the HealthCheck package. +type CollectingHealthCheckMetrics struct { + Gauge *CollectingGauge +} + +// NewCollectingHealthCheckMetrics creates a new CollectingHealthCheckMetrics instance. +func NewCollectingHealthCheckMetrics() *CollectingHealthCheckMetrics { + return &CollectingHealthCheckMetrics{&CollectingGauge{}} +} + +// BackendServerUpGauge is there to satisfy the healthcheck.metricsRegistry interface. +func (m *CollectingHealthCheckMetrics) BackendServerUpGauge() metrics.Gauge { + return m.Gauge +}