320 lines
9.4 KiB
Go
320 lines
9.4 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"flag"
|
|
"fmt"
|
|
"html/template"
|
|
"io/ioutil"
|
|
"net/http"
|
|
_ "net/http/pprof"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/golang/glog"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
|
|
"golang.org/x/crypto/ssh"
|
|
)
|
|
|
|
// Command-line flags.
var (
	// addr is the listen address of the HTTP server that serves both the
	// debugging index page and the Prometheus /metrics endpoint.
	addr = flag.String("addr", "localhost:9999", "HTTP listen address for prometheus /metrics")
	// hosts is a comma-separated list of scrape targets. main splits it on
	// "," and, for entries without a "user@" prefix, logs in as $USER.
	hosts = flag.String("hosts", os.Getenv("USER")+"@localhost:22", "comma-separated list of [user@]host:port targets to log into and scrape")
	// refreshInterval is how long each per-host scrape loop sleeps between
	// fetches.
	refreshInterval = flag.Duration("refresh", 5*time.Minute, "refresh interval time")
)
|
|
|
|
const snapshotListCmd = "/sbin/zfs list -t snapshot -H -o name -s name"
|
|
|
|
var (
|
|
// FreeNAS example:
|
|
// @auto-20171001.1400-2w
|
|
freenasSnapshotPattern = regexp.MustCompile(`^[^@]+@auto-(\d{8}\.\d{4})-\d+[mwy]$`)
|
|
// Linux examples:
|
|
// @zfs-auto-snap_frequent-2023-01-15-18h00U
|
|
// @zfs-auto-snap_frequent-2023-01-15-18h15U
|
|
// @zfs-auto-snap_frequent-2023-01-15-18h30U
|
|
// @zfs-auto-snap_frequent-2023-01-15-18h45U
|
|
// @zfs-auto-snap_daily-2023-01-14-08h00U
|
|
// @zfs-auto-snap_hourly-2023-01-14-19h00U
|
|
// @zfs-auto-snap_weekly-2022-12-19-08h00U
|
|
// @zfs-auto-snap_monthly-2022-12-01-08h00U
|
|
linuxSnapshotPattern = regexp.MustCompile(`^[^@]+@zfs-auto-snap_(?:frequent|daily|hourly|weekly|monthly)-(\d{4}-\d{2}-\d{2}-\d{2}h\d{2}U)$`)
|
|
freenasSnapshotFormat = "20060102.1504"
|
|
linuxSnapshotFormat = "2006-01-02-15h04U"
|
|
|
|
fetchRequestDurationMetric = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "ssh_fetch_duration_seconds",
|
|
Help: "Time to fetch and parse snapshot age over SSH",
|
|
})
|
|
snapshotTimestampMetric = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "zfs_snapshot_timestamp_seconds",
|
|
Help: "Most recent snapshot timestamp for `filesystem` UNIX epoch seconds",
|
|
},
|
|
[]string{"host", "filesystem"},
|
|
)
|
|
snapshotCountsMetrics = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "zfs_snapshot_count",
|
|
Help: "Count of snapshots per-filesystem",
|
|
},
|
|
[]string{"host", "filesystem", "type"},
|
|
)
|
|
)
|
|
|
|
func init() {
|
|
prometheus.MustRegister(fetchRequestDurationMetric)
|
|
prometheus.MustRegister(snapshotTimestampMetric)
|
|
prometheus.MustRegister(snapshotCountsMetrics)
|
|
}
|
|
|
|
func newPublicKey() ([]ssh.AuthMethod, error) {
|
|
var signers []ssh.AuthMethod
|
|
|
|
for _, path := range []string{
|
|
filepath.Join(os.Getenv("HOME"), ".ssh", "id_dsa"),
|
|
filepath.Join(os.Getenv("HOME"), ".ssh", "id_rsa"),
|
|
} {
|
|
|
|
// A public key may be used to authenticate against the remote
|
|
// server by using an unencrypted PEM-encoded private key file.
|
|
//
|
|
// If you have an encrypted private key, the crypto/x509 package
|
|
// can be used to decrypt it.
|
|
key, err := ioutil.ReadFile(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
return nil, fmt.Errorf("unable to read private key %q: %v", path, err)
|
|
}
|
|
|
|
// Create the Signer for this private key.
|
|
signer, err := ssh.ParsePrivateKey(key)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to parse private key %q: %v", path, err)
|
|
}
|
|
signers = append(signers, ssh.PublicKeys(signer))
|
|
}
|
|
if len(signers) == 0 {
|
|
return nil, fmt.Errorf("no public keys configured")
|
|
}
|
|
return signers, nil
|
|
}
|
|
|
|
type (
	// filesystemName is the dataset part of a snapshot name, i.e. everything
	// before the "@".
	filesystemName string

	// filesystemStat aggregates what was learned about one filesystem from
	// a snapshot listing.
	filesystemStat struct {
		// Timestamp is the newest time parsed from a recognised
		// auto-snapshot name. It stays the zero time when no snapshot
		// name was recognised.
		Timestamp time.Time
		// Counts is the total number of snapshots, recognised or not.
		Counts int
		// FreenasCounts is the number of snapshots whose names follow the
		// FreeNAS auto-snapshot scheme.
		FreenasCounts int
		// LinuxCounts is the number of snapshots whose names follow the
		// Linux zfs-auto-snap scheme.
		LinuxCounts int
	}

	// snapshotStats maps each filesystem to its aggregated statistics.
	snapshotStats map[filesystemName]*filesystemStat
)
|
|
|
|
func fetchSnapshotStats(host string, c *ssh.Client) (snapshotStats, error) {
|
|
now := time.Now()
|
|
defer func() {
|
|
delta := time.Since(now)
|
|
fetchRequestDurationMetric.Set(delta.Seconds())
|
|
glog.V(2).Infof("[%s] Update took %s", host, delta)
|
|
}()
|
|
// Each ClientConn can support multiple interactive sessions,
|
|
// represented by a Session.
|
|
s, err := c.NewSession()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("[%s] error creating new session: %v", host, err)
|
|
}
|
|
defer s.Close()
|
|
|
|
glog.V(2).Infof("[%s] Running %q", host, snapshotListCmd)
|
|
b, err := s.CombinedOutput(snapshotListCmd)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("[%s] error running %q: %v", host, snapshotListCmd, err)
|
|
}
|
|
scanner := bufio.NewScanner(bytes.NewReader(b))
|
|
stats := snapshotStats(make(map[filesystemName]*filesystemStat))
|
|
for scanner.Scan() {
|
|
l := scanner.Text()
|
|
filesystem := l
|
|
if idx := strings.Index(l, "@"); idx != -1 {
|
|
filesystem = l[:idx]
|
|
}
|
|
name := filesystemName(filesystem)
|
|
if _, ok := stats[name]; !ok {
|
|
stats[name] = &filesystemStat{}
|
|
}
|
|
stats[name].Counts++
|
|
|
|
var foundSnapshot bool
|
|
if m := freenasSnapshotPattern.FindStringSubmatch(l); len(m) == 2 {
|
|
foundSnapshot = true
|
|
t, err := time.Parse(freenasSnapshotFormat, m[1])
|
|
if err != nil {
|
|
glog.Errorf("[%s] Malformed time in freenas snapshot %q: %v", host, m[1], err)
|
|
continue
|
|
}
|
|
glog.V(3).Infof("filesystem: %s timestamp %v", l, t)
|
|
stats[name].FreenasCounts++
|
|
snapshotTime := stats[name].Timestamp
|
|
glog.V(3).Infof("snapshotTime.Before(t) = %v snapshotTime: %v t: %v", snapshotTime.Before(t), snapshotTime, t)
|
|
if snapshotTime.Before(t) {
|
|
stats[name].Timestamp = t
|
|
}
|
|
}
|
|
if m := linuxSnapshotPattern.FindStringSubmatch(l); len(m) == 2 {
|
|
foundSnapshot = true
|
|
t, err := time.Parse(linuxSnapshotFormat, m[1])
|
|
if err != nil {
|
|
glog.Errorf("[%s] Malformed time in linux snapshot %q: %v", host, m[1], err)
|
|
continue
|
|
}
|
|
glog.V(3).Infof("filesystem: %s timestamp %v", l, t)
|
|
stats[name].LinuxCounts++
|
|
snapshotTime := stats[name].Timestamp
|
|
glog.V(3).Infof("snapshotTime.Before(t) = %v snapshotTime: %v t: %v", snapshotTime.Before(t), snapshotTime, t)
|
|
if snapshotTime.Before(t) {
|
|
stats[name].Timestamp = t
|
|
}
|
|
}
|
|
if !foundSnapshot {
|
|
glog.V(3).Infof("[%s] Skipping snapshot with non-conforming timestamp %q", host, l)
|
|
}
|
|
}
|
|
if err := scanner.Err(); err != nil {
|
|
return nil, fmt.Errorf("[%s] failed to scan response: %v", host, err)
|
|
}
|
|
return stats, nil
|
|
}
|
|
|
|
func updateMetrics(host string, stats snapshotStats) {
|
|
for filesystem, stat := range stats {
|
|
snapshotCountsMetrics.WithLabelValues(host, string(filesystem), "all").Set(float64(stat.Counts))
|
|
snapshotCountsMetrics.WithLabelValues(host, string(filesystem), "freenas").Set(float64(stat.FreenasCounts))
|
|
snapshotCountsMetrics.WithLabelValues(host, string(filesystem), "linux").Set(float64(stat.LinuxCounts))
|
|
snapshotTimestampMetric.WithLabelValues(host, string(filesystem)).Set(float64(stat.Timestamp.Unix()))
|
|
}
|
|
}
|
|
|
|
type hostsSnapshotStats struct {
|
|
sync.Mutex
|
|
host2Stats map[string]snapshotStats
|
|
}
|
|
|
|
// indexHTML is the source of the debugging index page. It is executed with a
// map of host to per-filesystem stats; an empty map renders the "waiting"
// notice instead of the tables.
const indexHTML = `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
<title>ZFS replication exporter</title>
</head>
<body>
<div class="container">
<h1>Debugging info for ZFS replication exporter</h1>
<p>Metrics at <a href="/metrics">/metrics</a>
{{with .}}
{{range $host, $snapStats := .}}
<h2>{{$host}}</h2>
<table class="table">
<tr>
<th>Filesystem</th>
<th>Counts</th>
<th>FreeNAS Snapshots</th>
<th>Linux Snapshots</th>
<th>Most Recent</th>
</tr>
{{range $name, $fsStat := .}}
<tr>
<td>{{$name}}</td>
<td>{{$fsStat.Counts}}</td>
<td>{{$fsStat.FreenasCounts}}</td>
<td>{{$fsStat.LinuxCounts}}</td>
<td>{{if or $fsStat.LinuxCounts $fsStat.FreenasCounts }}{{$fsStat.Timestamp}}{{end}}</td>
</tr>
{{end}}
</table>
{{end}}
{{else}}
<h2>Waiting for first fetch of metrics</h2>
<p>Please refresh later.</p>
{{end}}
</div>
</body>
</html>
`

// indexTmpl is the parsed form of indexHTML. Parsing happens once at package
// initialisation and panics if the template text is invalid.
var indexTmpl = template.Must(template.New("index").Parse(indexHTML))
|
|
|
|
func (hss *hostsSnapshotStats) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
|
hss.Lock()
|
|
defer hss.Unlock()
|
|
if err := indexTmpl.Execute(w, hss.host2Stats); err != nil {
|
|
glog.Errorf("Failed to render index: %v", err)
|
|
}
|
|
}
|
|
|
|
func main() {
|
|
flag.Parse()
|
|
defer glog.Flush()
|
|
|
|
ams, err := newPublicKey()
|
|
if err != nil {
|
|
glog.Exitf("Error fetching public keys: %v", err)
|
|
}
|
|
|
|
hss := &hostsSnapshotStats{
|
|
host2Stats: make(map[string]snapshotStats),
|
|
}
|
|
|
|
for _, userHost := range strings.Split(*hosts, ",") {
|
|
u := os.Getenv("USER")
|
|
h := userHost
|
|
if idx := strings.Index(userHost, "@"); idx != -1 {
|
|
u = userHost[:idx]
|
|
h = userHost[idx+1:]
|
|
}
|
|
go func(user, host string) {
|
|
config := &ssh.ClientConfig{
|
|
User: user,
|
|
Auth: ams,
|
|
// TODO(wathiede); use FixedHostKey?
|
|
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
|
|
Timeout: 5 * time.Second,
|
|
}
|
|
for {
|
|
glog.Infof("Dialing %s@%s", user, host)
|
|
c, err := ssh.Dial("tcp", host, config)
|
|
if err != nil {
|
|
glog.Errorf("Error dialing %q: %v", host, err)
|
|
} else {
|
|
stats, err := fetchSnapshotStats(host, c)
|
|
if err != nil {
|
|
glog.Errorf("Failed to update metrics: %v", err)
|
|
} else {
|
|
hss.Lock()
|
|
hss.host2Stats[host] = stats
|
|
hss.Unlock()
|
|
updateMetrics(host, stats)
|
|
}
|
|
c.Close()
|
|
}
|
|
time.Sleep(*refreshInterval)
|
|
}
|
|
}(u, h)
|
|
}
|
|
|
|
// Expose the registered metrics via HTTP.
|
|
http.Handle("/", hss)
|
|
http.Handle("/metrics", promhttp.Handler())
|
|
glog.Exitf("Failed to ListenAndServe: %v", http.ListenAndServe(*addr, nil))
|
|
}
|