From 5171c33d3cae4925b092ef854b1061b97bea336d Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Tue, 10 Oct 2017 10:08:00 -0700 Subject: [PATCH] Set timeout and re-dial on any failures. --- zfs_replication_exporter.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/zfs_replication_exporter.go b/zfs_replication_exporter.go index 7ec835a..3558961 100644 --- a/zfs_replication_exporter.go +++ b/zfs_replication_exporter.go @@ -151,16 +151,23 @@ func main() { Auth: ams, // TODO(wathiede); use FixedHostKey? HostKeyCallback: ssh.InsecureIgnoreHostKey(), - } - c, err := ssh.Dial("tcp", *host, config) - if err != nil { - glog.Exitf("Error dialing %q: %v", *host, err) + Timeout: 5 * time.Second, } go func() { for { + var c *ssh.Client + if c == nil { + var err error + c, err = ssh.Dial("tcp", *host, config) + if err != nil { + glog.Errorf("Error dialing %q: %v", *host, err) + } + } if err := updateMetrics(c); err != nil { glog.Errorf("Failed to update metrics: %v", err) + c.Close() + c = nil } time.Sleep(*refreshInterval) }