diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index 3e50461c2..8c8fddbb6 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -11,7 +11,10 @@ This doc contains all the metrics that Dragonfly components currently support. N - dragonfly_supernode_http_response_size_bytes{code, handler, method} - http response size in bytes -- dragonfly_supernode_peers{peer} - dragonfly peers, the label peer consists of the hostname and ip address of one peer. +- dragonfly_supernode_peers{peer} - dragonfly peers, the label peer is the ip address of one peer. - dragonfly_supernode_tasks{cdnstatus} - dragonfly tasks -- dragonfly_supernode_dfgettasks{callsystem} - dragonfly dfget tasks +- dragonfly_supernode_tasks_registered_total{} - total times of registering new tasks. counter type. +- dragonfly_supernode_dfgettasks{callsystem, status} - dragonfly dfget tasks +- dragonfly_supernode_dfgettasks_registered_total{callsystem} - total times of registering new dfgettasks. counter type. +- dragonfly_supernode_dfgettasks_failed_total{callsystem} - total times of failed dfgettasks. counter type. - dragonfly_supernode_schedule_duration_milliseconds{peer} - duration for task scheduling in milliseconds - dragonfly_supernode_trigger_cdn_total{} - total times of triggering cdn. - dragonfly_supernode_trigger_cdn_failed_total{} - total failed times of triggering cdn. 
diff --git a/supernode/daemon/mgr/dfgettask/manager.go b/supernode/daemon/mgr/dfgettask/manager.go index 1173d98d8..4807264ff 100644 --- a/supernode/daemon/mgr/dfgettask/manager.go +++ b/supernode/daemon/mgr/dfgettask/manager.go @@ -36,26 +36,36 @@ import ( var _ mgr.DfgetTaskMgr = &Manager{} type metrics struct { - dfgetTasks *prometheus.GaugeVec + dfgetTasks *prometheus.GaugeVec + dfgetTasksRegisterCount *prometheus.CounterVec + dfgetTasksFailCount *prometheus.CounterVec } func newMetrics(register prometheus.Registerer) *metrics { return &metrics{ dfgetTasks: metricsutils.NewGauge(config.SubsystemSupernode, "dfgettasks", - "The number of dfget tasks", []string{"callsystem"}, register), + "Current status of dfgettasks", []string{"callsystem", "status"}, register), + + dfgetTasksRegisterCount: metricsutils.NewCounter(config.SubsystemSupernode, "dfgettasks_registered_total", + "Total times of registering dfgettasks", []string{"callsystem"}, register), + + dfgetTasksFailCount: metricsutils.NewCounter(config.SubsystemSupernode, "dfgettasks_failed_total", + "Total failure times of dfgettasks", []string{"callsystem"}, register), } } // Manager is an implementation of the interface of DfgetTaskMgr. type Manager struct { + cfg *config.Config dfgetTaskStore *dutil.Store ptoc *syncmap.SyncMap metrics *metrics } // NewManager returns a new Manager. 
-func NewManager(register prometheus.Registerer) (*Manager, error) { +func NewManager(cfg *config.Config, register prometheus.Registerer) (*Manager, error) { return &Manager{ + cfg: cfg, dfgetTaskStore: dutil.NewStore(), ptoc: syncmap.NewSyncMap(), metrics: newMetrics(register), @@ -89,7 +99,12 @@ func (dtm *Manager) Add(ctx context.Context, dfgetTask *types.DfGetTask) error { dtm.ptoc.Add(generatePeerKey(dfgetTask.PeerID, dfgetTask.TaskID), dfgetTask.CID) dtm.dfgetTaskStore.Put(key, dfgetTask) - dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.CallSystem).Inc() + + // If dfget task is created by supernode cdn, don't update metrics. + if !dtm.cfg.IsSuperPID(dfgetTask.PeerID) || !dtm.cfg.IsSuperCID(dfgetTask.CID) { + dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.CallSystem, dfgetTask.Status).Inc() + dtm.metrics.dfgetTasksRegisterCount.WithLabelValues(dfgetTask.CallSystem).Inc() + } return nil } @@ -121,7 +136,7 @@ func (dtm *Manager) Delete(ctx context.Context, clientID, taskID string) error { return err } dtm.ptoc.Delete(generatePeerKey(dfgetTask.PeerID, dfgetTask.TaskID)) - dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.CallSystem).Dec() + dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.CallSystem, dfgetTask.Status).Dec() return dtm.dfgetTaskStore.Delete(key) } @@ -133,9 +148,16 @@ func (dtm *Manager) UpdateStatus(ctx context.Context, clientID, taskID, status s } if dfgetTask.Status != types.DfGetTaskStatusSUCCESS { + dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.CallSystem, dfgetTask.Status).Dec() + dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.CallSystem, status).Inc() dfgetTask.Status = status } + // Add the total failed count. 
+ if dfgetTask.Status == types.DfGetTaskStatusFAILED { + dtm.metrics.dfgetTasksFailCount.WithLabelValues(dfgetTask.CallSystem).Inc() + } + return nil } diff --git a/supernode/daemon/mgr/dfgettask/manager_test.go b/supernode/daemon/mgr/dfgettask/manager_test.go index 15fc822c4..e21ea0a36 100644 --- a/supernode/daemon/mgr/dfgettask/manager_test.go +++ b/supernode/daemon/mgr/dfgettask/manager_test.go @@ -22,6 +22,7 @@ import ( "github.com/dragonflyoss/Dragonfly/apis/types" "github.com/dragonflyoss/Dragonfly/pkg/errortypes" + "github.com/dragonflyoss/Dragonfly/supernode/config" "github.com/go-check/check" "github.com/prometheus/client_golang/prometheus" @@ -37,11 +38,18 @@ func init() { } type DfgetTaskMgrTestSuite struct { + cfg *config.Config +} + +func (s *DfgetTaskMgrTestSuite) SetUpSuite(c *check.C) { + s.cfg = config.NewConfig() + s.cfg.SetCIDPrefix("127.0.0.1") } func (s *DfgetTaskMgrTestSuite) TestDfgetTaskAdd(c *check.C) { - manager, _ := NewManager(prometheus.NewRegistry()) + manager, _ := NewManager(s.cfg, prometheus.NewRegistry()) dfgetTasks := manager.metrics.dfgetTasks + dfgetTasksRegisterCount := manager.metrics.dfgetTasksRegisterCount var testCases = []struct { dfgetTask *types.DfGetTask @@ -96,7 +104,11 @@ func (s *DfgetTaskMgrTestSuite) TestDfgetTaskAdd(c *check.C) { c.Check(err, check.IsNil) c.Assert(1, check.Equals, int(prom_testutil.ToFloat64( - dfgetTasks.WithLabelValues(tc.dfgetTask.CallSystem)))) + dfgetTasks.WithLabelValues(tc.dfgetTask.CallSystem, tc.dfgetTask.Status)))) + + c.Assert(1, check.Equals, + int(prom_testutil.ToFloat64( + dfgetTasksRegisterCount.WithLabelValues(tc.dfgetTask.CallSystem)))) dt, err := manager.Get(context.Background(), tc.dfgetTask.CID, tc.dfgetTask.TaskID) c.Check(err, check.IsNil) c.Check(dt, check.DeepEquals, tc.Expect) @@ -104,7 +116,9 @@ func (s *DfgetTaskMgrTestSuite) TestDfgetTaskAdd(c *check.C) { } func (s *DfgetTaskMgrTestSuite) TestDfgetTaskUpdate(c *check.C) { - manager, _ := 
NewManager(prometheus.NewRegistry()) + manager, _ := NewManager(s.cfg, prometheus.NewRegistry()) + dfgetTasksFailCount := manager.metrics.dfgetTasksFailCount + var testCases = []struct { dfgetTask *types.DfGetTask taskStatus string @@ -163,13 +177,19 @@ func (s *DfgetTaskMgrTestSuite) TestDfgetTaskUpdate(c *check.C) { err = manager.UpdateStatus(context.Background(), tc.dfgetTask.CID, tc.dfgetTask.TaskID, tc.taskStatus) c.Check(err, check.IsNil) + if tc.taskStatus == types.DfGetTaskStatusFAILED { + c.Assert(1, check.Equals, + int(prom_testutil.ToFloat64( + dfgetTasksFailCount.WithLabelValues(tc.dfgetTask.CallSystem)))) + } + dt, err := manager.Get(context.Background(), tc.dfgetTask.CID, tc.dfgetTask.TaskID) c.Check(dt, check.DeepEquals, tc.Expect) } } func (s *DfgetTaskMgrTestSuite) TestDfgetTaskDelete(c *check.C) { - manager, _ := NewManager(prometheus.NewRegistry()) + manager, _ := NewManager(s.cfg, prometheus.NewRegistry()) dfgetTasks := manager.metrics.dfgetTasks var testCases = []struct { @@ -207,7 +227,7 @@ func (s *DfgetTaskMgrTestSuite) TestDfgetTaskDelete(c *check.C) { c.Check(err, check.IsNil) c.Assert(0, check.Equals, int(prom_testutil.ToFloat64( - dfgetTasks.WithLabelValues(tc.dfgetTask.CallSystem)))) + dfgetTasks.WithLabelValues(tc.dfgetTask.CallSystem, tc.dfgetTask.Status)))) _, err = manager.Get(context.Background(), tc.dfgetTask.CID, tc.dfgetTask.TaskID) c.Check(errortypes.IsDataNotFound(err), check.Equals, true) diff --git a/supernode/daemon/mgr/peer/manager.go b/supernode/daemon/mgr/peer/manager.go index 143587850..6dffac9bf 100644 --- a/supernode/daemon/mgr/peer/manager.go +++ b/supernode/daemon/mgr/peer/manager.go @@ -44,7 +44,7 @@ type metrics struct { func newMetrics(register prometheus.Registerer) *metrics { return &metrics{ peers: metricsutils.NewGauge(config.SubsystemSupernode, "peers", - "The number of supernode peers", []string{"peer"}, register), + "Current status of peers", []string{"peer"}, register), } } @@ -83,7 +83,7 @@ func (pm 
*Manager) Register(ctx context.Context, peerCreateRequest *types.PeerCr Created: strfmt.DateTime(time.Now()), } pm.peerStore.Put(id, peerInfo) - pm.metrics.peers.WithLabelValues(GeneratePeerName(peerInfo)).Inc() + pm.metrics.peers.WithLabelValues(peerInfo.IP.String()).Inc() return &types.PeerCreateResponse{ ID: id, @@ -98,7 +98,7 @@ func (pm *Manager) DeRegister(ctx context.Context, peerID string) error { } pm.peerStore.Delete(peerID) - pm.metrics.peers.WithLabelValues(GeneratePeerName(peerInfo)).Dec() + pm.metrics.peers.WithLabelValues(peerInfo.IP.String()).Dec() return nil } @@ -200,8 +200,3 @@ func getLessFunc(listResult []interface{}, desc bool) (less func(i, j int) bool) func generatePeerID(peerInfo *types.PeerCreateRequest) string { return fmt.Sprintf("%s-%s-%d", peerInfo.HostName.String(), peerInfo.IP.String(), time.Now().UnixNano()) } - -// GeneratePeerName extracts the hostname and ip from peerInfo. -func GeneratePeerName(info *types.PeerInfo) string { - return info.HostName.String() + "-" + info.IP.String() -} diff --git a/supernode/daemon/mgr/peer/manager_test.go b/supernode/daemon/mgr/peer/manager_test.go index 6af1f2457..db053e15d 100644 --- a/supernode/daemon/mgr/peer/manager_test.go +++ b/supernode/daemon/mgr/peer/manager_test.go @@ -56,8 +56,7 @@ func (s *PeerMgrTestSuite) TestPeerMgr(c *check.C) { c.Check(err, check.IsNil) c.Assert(1, check.Equals, - int(prom_testutil.ToFloat64(peers.WithLabelValues("foo-192.168.10.11")))) - + int(prom_testutil.ToFloat64(peers.WithLabelValues("192.168.10.11")))) // get id := resp.ID info, err := manager.Get(context.Background(), id) @@ -82,7 +81,7 @@ func (s *PeerMgrTestSuite) TestPeerMgr(c *check.C) { c.Check(err, check.IsNil) c.Assert(0, check.Equals, - int(prom_testutil.ToFloat64(peers.WithLabelValues("foo-192.168.10.11")))) + int(prom_testutil.ToFloat64(peers.WithLabelValues("192.168.10.11")))) // get info, err = manager.Get(context.Background(), id) diff --git a/supernode/daemon/mgr/task/manager.go 
b/supernode/daemon/mgr/task/manager.go index b6d313893..e55dc44be 100644 --- a/supernode/daemon/mgr/task/manager.go +++ b/supernode/daemon/mgr/task/manager.go @@ -45,6 +45,7 @@ var _ mgr.TaskMgr = &Manager{} type metrics struct { tasks *prometheus.GaugeVec + tasksRegisterCount *prometheus.CounterVec triggerCdnCount *prometheus.CounterVec triggerCdnFailCount *prometheus.CounterVec scheduleDurationMilliSeconds *prometheus.HistogramVec @@ -53,13 +54,16 @@ type metrics struct { func newMetrics(register prometheus.Registerer) *metrics { return &metrics{ tasks: metricsutils.NewGauge(config.SubsystemSupernode, "tasks", - "The status of Supernode tasks", []string{"cdnstatus"}, register), + "Current status of Supernode tasks", []string{"cdnstatus"}, register), + + tasksRegisterCount: metricsutils.NewCounter(config.SubsystemSupernode, "tasks_registered_total", + "Total times of registering tasks", []string{}, register), triggerCdnCount: metricsutils.NewCounter(config.SubsystemSupernode, "trigger_cdn_total", - "The number of triggering cdn", []string{}, register), + "Total times of triggering cdn", []string{}, register), triggerCdnFailCount: metricsutils.NewCounter(config.SubsystemSupernode, "trigger_cdn_failed_total", - "The number of triggering cdn failure", []string{}, register), + "Total failure times of triggering cdn", []string{}, register), scheduleDurationMilliSeconds: metricsutils.NewHistogram(config.SubsystemSupernode, "schedule_duration_milliseconds", "Duration for task scheduling in milliseconds", []string{"peer"}, @@ -87,7 +91,8 @@ type Manager struct { // NewManager returns a new Manager Object. 
func NewManager(cfg *config.Config, peerMgr mgr.PeerMgr, dfgetTaskMgr mgr.DfgetTaskMgr, - progressMgr mgr.ProgressMgr, cdnMgr mgr.CDNMgr, schedulerMgr mgr.SchedulerMgr, originClient httpclient.OriginHTTPClient, register prometheus.Registerer) (*Manager, error) { + progressMgr mgr.ProgressMgr, cdnMgr mgr.CDNMgr, schedulerMgr mgr.SchedulerMgr, + originClient httpclient.OriginHTTPClient, register prometheus.Registerer) (*Manager, error) { return &Manager{ cfg: cfg, taskStore: dutil.NewStore(), @@ -118,6 +123,7 @@ func (tm *Manager) Register(ctx context.Context, req *types.TaskCreateRequest) ( logrus.Infof("failed to add or update task with req %+v: %v", req, err) return nil, err } + tm.metrics.tasksRegisterCount.WithLabelValues().Inc() logrus.Debugf("success to get task info: %+v", task) // TODO: defer rollback the task update diff --git a/supernode/daemon/mgr/task/manager_test.go b/supernode/daemon/mgr/task/manager_test.go index eb0dcdccd..59c2b604d 100644 --- a/supernode/daemon/mgr/task/manager_test.go +++ b/supernode/daemon/mgr/task/manager_test.go @@ -30,6 +30,7 @@ import ( "github.com/go-check/check" "github.com/golang/mock/gomock" "github.com/prometheus/client_golang/prometheus" + prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" ) func Test(t *testing.T) { @@ -76,6 +77,7 @@ func (s *TaskMgrTestSuite) TearDownSuite(c *check.C) { } func (s *TaskMgrTestSuite) TestCheckTaskStatus(c *check.C) { + tasksRegisterCount := s.taskManager.metrics.tasksRegisterCount s.taskManager.taskStore = dutil.NewStore() req := &types.TaskCreateRequest{ CID: "cid", @@ -87,6 +89,8 @@ func (s *TaskMgrTestSuite) TestCheckTaskStatus(c *check.C) { } resp, err := s.taskManager.Register(context.Background(), req) c.Check(err, check.IsNil) + c.Assert(1, check.Equals, + int(prom_testutil.ToFloat64(tasksRegisterCount.WithLabelValues()))) isSuccess, err := s.taskManager.CheckTaskStatus(context.Background(), resp.ID) c.Check(err, check.IsNil) diff --git 
a/supernode/daemon/mgr/task/manager_util.go b/supernode/daemon/mgr/task/manager_util.go index c7e7fdc1f..4e1fe414b 100644 --- a/supernode/daemon/mgr/task/manager_util.go +++ b/supernode/daemon/mgr/task/manager_util.go @@ -30,7 +30,6 @@ import ( "github.com/dragonflyoss/Dragonfly/pkg/timeutils" "github.com/dragonflyoss/Dragonfly/supernode/config" "github.com/dragonflyoss/Dragonfly/supernode/daemon/mgr" - "github.com/dragonflyoss/Dragonfly/supernode/daemon/mgr/peer" "github.com/dragonflyoss/Dragonfly/supernode/util" "github.com/pkg/errors" @@ -332,9 +331,7 @@ func (tm *Manager) parseAvailablePeers(ctx context.Context, clientID string, tas } // Get peerName to represent peer in metrics. - p, _ := tm.peerMgr.Get(context.Background(), dfgetTask.PeerID) - peerName := peer.GeneratePeerName(p) - + peer, _ := tm.peerMgr.Get(context.Background(), dfgetTask.PeerID) // get scheduler pieceResult logrus.Debugf("start scheduler for taskID: %s clientID: %s", task.ID, clientID) startTime := time.Now() @@ -342,7 +339,7 @@ func (tm *Manager) parseAvailablePeers(ctx context.Context, clientID string, tas if err != nil { return false, nil, err } - tm.metrics.scheduleDurationMilliSeconds.WithLabelValues(peerName).Observe(timeutils.SinceInMilliseconds(startTime)) + tm.metrics.scheduleDurationMilliSeconds.WithLabelValues(peer.IP.String()).Observe(timeutils.SinceInMilliseconds(startTime)) logrus.Debugf("get scheduler result length(%d) with taskID(%s) and clientID(%s)", len(pieceResult), task.ID, clientID) var pieceInfos []*types.PieceInfo diff --git a/supernode/server/metrics.go b/supernode/server/metrics.go index 2bb4e6ff2..20aa7eb8d 100644 --- a/supernode/server/metrics.go +++ b/supernode/server/metrics.go @@ -26,7 +26,7 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" ) -// metrics defines three prometheus metrics for monitoring http handler status +// metrics defines some prometheus metrics for monitoring supernode type metrics struct { requestCounter 
*prometheus.CounterVec requestDuration *prometheus.HistogramVec diff --git a/supernode/server/server.go b/supernode/server/server.go index c068aceb9..486b71457 100644 --- a/supernode/server/server.go +++ b/supernode/server/server.go @@ -68,7 +68,7 @@ func New(cfg *config.Config, register prometheus.Registerer) (*Server, error) { return nil, err } - dfgetTaskMgr, err := dfgettask.NewManager(register) + dfgetTaskMgr, err := dfgettask.NewManager(cfg, register) if err != nil { return nil, err } @@ -88,7 +88,8 @@ func New(cfg *config.Config, register prometheus.Registerer) (*Server, error) { return nil, err } - taskMgr, err := task.NewManager(cfg, peerMgr, dfgetTaskMgr, progressMgr, cdnMgr, schedulerMgr, originClient, register) + taskMgr, err := task.NewManager(cfg, peerMgr, dfgetTaskMgr, progressMgr, cdnMgr, + schedulerMgr, originClient, register) if err != nil { return nil, err }