diff --git a/cmd/mailhash/mailhash.go b/cmd/mailhash/mailhash.go index a600f65..c53cf46 100644 --- a/cmd/mailhash/mailhash.go +++ b/cmd/mailhash/mailhash.go @@ -7,7 +7,6 @@ import ( "os" "path/filepath" "strings" - "xinu.tv/email" "github.com/golang/glog" @@ -47,14 +46,22 @@ func (m *Messages) hashMail(path string, info os.FileInfo, err error) error { return nil } + base := filepath.Base(path) + if _, ok := m.Skip[base]; ok { + glog.Infoln("Skipping", path) + return nil + } + r, err := os.Open(path) if err != nil { glog.Fatal(err) } + defer r.Close() + h, err := email.Hash(r, headers) if err != nil { glog.Errorf("%s not an mail file", path) - glog.Info("%q", err.Error()) + glog.Infof("%q", err.Error()) return nil } md := email.NewInfo(path) @@ -130,6 +137,7 @@ func (m Messages) Reconcile(maildir string) error { if err != nil { glog.Fatal(err) } + defer r.Close() h, err := email.Hash(r, headers) if err != nil { diff --git a/cmd/md2pq/md2pq.go b/cmd/md2pq/md2pq.go new file mode 100644 index 0000000..1035114 --- /dev/null +++ b/cmd/md2pq/md2pq.go @@ -0,0 +1,148 @@ +// md2pg imports a Maildir format file into a PostgreSQL DB +package main + +import ( + "bytes" + "database/sql" + "flag" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "time" + + "github.com/golang/glog" + "github.com/lib/pq" + + "xinu.tv/email" + "xinu.tv/set" +) + +var ( + maildir = flag.String("maildir", "", "Maildir root") + skipFiles = flag.String("skip", "maildirfolder,log,msgid.cache,razor-agent.log", + "comma separated files to skip") + + // Hashed over fields from each message. + headers = []string{"to", "from", "cc", "date", "subject"} +) + +var CRCR = []byte("\n\n") + +func Load(db *sql.DB, root string, skip *set.StringSet) error { + dup := set.NewStrings() + start := time.Now() + cnt := 0 + dupCnt := 0 + defer func() { + glog.Infof("%d messages processed in %s", cnt, time.Since(start)) + glog.Infof("%d dups found", dupCnt) + }() + + txn, err := db.Begin() + if err != nil { + return err + } + + stmt, err := txn.Prepare(pq.CopyIn("original", "hash", "header_size", + "total_size", "blob")) + if err != nil { + if err := txn.Rollback(); err != nil { + glog.Errorln("txn.Prepare error rolling back", err) + } + return err + } + + b := new(bytes.Buffer) + err = filepath.Walk(root, + func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + if info.IsDir() { + return nil + } + base := filepath.Base(path) + if skip.Contains(base) { + return nil + } + + b.Reset() + r, err := os.Open(path) + if err != nil { + return err + } + defer r.Close() + + n, err := io.Copy(b, r) + if err != nil { + return err + } + + blob := b.Bytes() + hdr_size := bytes.Index(blob, CRCR) + h, err := email.Hash(bytes.NewReader(blob), headers) + if err != nil { + glog.Errorf("%s not an mail file", path) + glog.Infof("%q", err.Error()) + return nil + } + + chksum := fmt.Sprintf("%x", h.Sum(nil)) + if dup.Contains(chksum) { + glog.Warningln("Dup email found", chksum, path, len(blob)) + dupCnt++ + return nil + } + dup.Add(chksum) + + glog.Infoln(chksum, hdr_size, n) + if _, err := stmt.Exec(chksum, hdr_size, n, blob); err != nil { + return err + } + + cnt++ + return nil + }) + + if _, err := stmt.Exec(); err != nil { + if err := txn.Rollback(); err != nil { + glog.Errorln("stmt.Exec error rolling back", err) + } + return err + } + + if err := stmt.Close(); err != nil { + if err := txn.Rollback(); err != nil { + glog.Errorln("stmt.Close error rolling back", err) + } + return err + } + + return txn.Commit() +} + +func main() { + defer glog.Flush() + flag.Parse() + + // TODO(wathiede): make a set of flags. + db, err := sql.Open("postgres", "user=gomail dbname=gomail sslmode=disable") + if err != nil { + glog.Fatal(err) + } + + if *maildir == "" { + fmt.Println("Must specify Maildir with -maildir") + os.Exit(1) + } + + skip := set.NewStrings(strings.Split(*skipFiles, ",")...) + glog.Infoln("Skip files", skip) + + if err := Load(db, *maildir, skip); err != nil { + glog.Fatal(err) + } +} diff --git a/pq/init.sql b/pq/init.sql new file mode 100644 index 0000000..f1cd3db --- /dev/null +++ b/pq/init.sql @@ -0,0 +1,7 @@ +DROP TABLE original; +CREATE TABLE original ( + hash char(40) PRIMARY KEY, + header_size integer, + total_size integer, + blob bytea +)