From 15b40a286628e7db589a58df106c5c7dd86ea27b Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Sun, 3 Feb 2019 17:10:47 -0800 Subject: [PATCH] Drop precomputed ages. Show stats on front page. --- zfs_replication_exporter.go | 127 ++++++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 29 deletions(-) diff --git a/zfs_replication_exporter.go b/zfs_replication_exporter.go index 05fed21..bdd4ecf 100644 --- a/zfs_replication_exporter.go +++ b/zfs_replication_exporter.go @@ -5,6 +5,7 @@ import ( "bytes" "flag" "fmt" + "html/template" "io/ioutil" "net/http" _ "net/http/pprof" @@ -12,6 +13,7 @@ import ( "path/filepath" "regexp" "strings" + "sync" "time" "github.com/golang/glog" @@ -38,12 +40,6 @@ var ( Name: "ssh_fetch_duration_seconds", Help: "Time to fetch and parse snapshot age over SSH", }) - snapshotAgesMetric = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "zfs_snapshot_age_seconds", - Help: "Duration in seconds for most recent snapshot for `filesystem`", - }, - []string{"host", "filesystem"}, - ) snapshotTimestampMetric = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "zfs_snapshot_timestamp_seconds", Help: "Most recent snapshot timestamp for `filesystem` UNIX epoch seconds", @@ -60,7 +56,6 @@ var ( func init() { prometheus.MustRegister(fetchRequestDurationMetric) - prometheus.MustRegister(snapshotAgesMetric) prometheus.MustRegister(snapshotTimestampMetric) prometheus.MustRegister(snapshotCountsMetrics) } @@ -99,7 +94,19 @@ func newPublicKey() ([]ssh.AuthMethod, error) { return signers, nil } -func updateMetrics(host string, c *ssh.Client) error { +type filesystemName string +type filesystemStat struct { + // Timestamp of most recent snapshot + Timestamp time.Time + // Counts is the number of snapshots + Counts int + // FreenasCounts is the number of snapshots matching freenas' auto-snapshot + // names. + FreenasCounts int +} +type snapshotStats map[filesystemName]*filesystemStat + +func fetchSnapshotStats(host string, c *ssh.Client) (snapshotStats, error) { now := time.Now() defer func() { delta := time.Since(now) @@ -110,19 +117,17 @@ func updateMetrics(host string, c *ssh.Client) error { // represented by a Session. s, err := c.NewSession() if err != nil { - return fmt.Errorf("[%s] error creating new session: %v", host, err) + return nil, fmt.Errorf("[%s] error creating new session: %v", host, err) } defer s.Close() glog.V(2).Infof("[%s] Running %q", host, snapshotListCmd) b, err := s.CombinedOutput(snapshotListCmd) if err != nil { - return fmt.Errorf("[%s] error running %q: %v", host, snapshotListCmd, err) + return nil, fmt.Errorf("[%s] error running %q: %v", host, snapshotListCmd, err) } scanner := bufio.NewScanner(bytes.NewReader(b)) - snapshotAges := make(map[string]time.Time) - snapshotCountsByFilesystem := make(map[string]int) - freenasSnapshotCountsByFilesystem := make(map[string]int) + stats := snapshotStats(make(map[filesystemName]*filesystemStat)) for scanner.Scan() { l := scanner.Text() m := snapshotPattern.FindStringSubmatch(l) @@ -131,35 +136,89 @@ func updateMetrics(host string, c *ssh.Client) error { if idx := strings.Index(l, "@"); idx != -1 { filesystem = l[:idx] } - snapshotCountsByFilesystem[filesystem]++ + name := filesystemName(filesystem) + if _, ok := stats[name]; !ok { + stats[name] = &filesystemStat{} + } + stats[name].Counts++ if len(m) == 2 { t, err := time.Parse(snapshotFormat, m[1]) if err != nil { - glog.Errorf("[%s] Malformed time in snapshot %q: %v", host, m[2], err) + glog.Errorf("[%s] Malformed time in snapshot %q: %v", host, m[1], err) continue } - freenasSnapshotCountsByFilesystem[filesystem]++ - snapshotTime := snapshotAges[filesystem] + glog.V(3).Infof("filesystem: %s timestamp %v", l, t) + stats[name].FreenasCounts++ + snapshotTime := stats[name].Timestamp + glog.V(3).Infof("snapshotTime.Before(t) = %v snapshotTime: %v t: %v", snapshotTime.Before(t), snapshotTime, t) if snapshotTime.Before(t) { - snapshotAges[filesystem] = t + stats[name].Timestamp = t } + } else { + glog.V(3).Infof("[%s] Skipping snapshot with non-conforming timestamp %q", host, l) } } if err := scanner.Err(); err != nil { - return fmt.Errorf("[%s] failed to scan response: %v", host, err) + return nil, fmt.Errorf("[%s] failed to scan response: %v", host, err) } + return stats, nil +} - for filesystem, c := range snapshotCountsByFilesystem { - snapshotCountsMetrics.WithLabelValues(host, filesystem, "all").Set(float64(c)) +func updateMetrics(host string, stats snapshotStats) { + for filesystem, stat := range stats { + snapshotCountsMetrics.WithLabelValues(host, string(filesystem), "all").Set(float64(stat.Counts)) + snapshotCountsMetrics.WithLabelValues(host, string(filesystem), "freenas").Set(float64(stat.FreenasCounts)) + snapshotTimestampMetric.WithLabelValues(host, string(filesystem)).Set(float64(stat.Timestamp.Unix())) } - for filesystem, c := range freenasSnapshotCountsByFilesystem { - snapshotCountsMetrics.WithLabelValues(host, filesystem, "freenas").Set(float64(c)) +} + +type hostsSnapshotStats struct { + sync.Mutex + host2Stats map[string]snapshotStats +} + +var indexTmpl = template.Must(template.New("index").Parse(` + + + + + + + ZFS replication exporter + + +
+

Debugging info for ZFS replication exporter

+{{range $host, $snapStats := .}} +

{{$host}}

+ + + + + + + + {{range $name, $fsStat := .}} + + + + + + + {{end}} +
FilesystemCountsFreeNAS SnapshotsMost Recent
{{$name}}{{$fsStat.Counts}}{{$fsStat.FreenasCounts}}{{if $fsStat.FreenasCounts }}{{$fsStat.Timestamp}}{{end}}
+{{end}} +
+ + +`)) + +func (hss *hostsSnapshotStats) ServeHTTP(w http.ResponseWriter, r *http.Request) { + hss.Lock() + defer hss.Unlock() + if err := indexTmpl.Execute(w, hss.host2Stats); err != nil { + glog.Errorf("Failed to render index: %v", err) } - for filesystem, snapshotTime := range snapshotAges { - snapshotAgesMetric.WithLabelValues(host, filesystem).Set(now.Sub(snapshotTime).Seconds()) - snapshotTimestampMetric.WithLabelValues(host, filesystem).Set(float64(snapshotTime.Unix())) - } - return nil } func main() { @@ -171,6 +230,10 @@ func main() { glog.Exitf("Error fetching public keys: %v", err) } + hss := &hostsSnapshotStats{ + host2Stats: make(map[string]snapshotStats), + } + for _, userHost := range strings.Split(*hosts, ",") { u := os.Getenv("USER") h := userHost @@ -197,11 +260,16 @@ func main() { } } if c != nil { - if err := updateMetrics(host, c); err != nil { + stats, err := fetchSnapshotStats(host, c) + if err != nil { glog.Errorf("Failed to update metrics: %v", err) c.Close() c = nil } + hss.Lock() + hss.host2Stats[host] = stats + hss.Unlock() + updateMetrics(host, stats) } time.Sleep(*refreshInterval) } @@ -209,6 +277,7 @@ func main() { } // Expose the registered metrics via HTTP. + http.Handle("/", hss) http.Handle("/metrics", promhttp.Handler()) glog.Exitf("Failed to ListenAndServe: %v", http.ListenAndServe(*addr, nil)) }