149 lines
2.9 KiB
Go
149 lines
2.9 KiB
Go
// md2pg imports the message files found under a Maildir directory tree into a PostgreSQL DB.
|
|
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"database/sql"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/golang/glog"
|
|
"github.com/lib/pq"
|
|
|
|
"xinu.tv/email"
|
|
"xinu.tv/set"
|
|
)
|
|
|
|
// maildir is the -maildir flag: the root directory that gets walked for
// message files. It defaults to the empty string.
var maildir = flag.String("maildir", "", "Maildir root")

// skipFiles is the -skip flag: a comma separated list of file base names
// that are never loaded.
var skipFiles = flag.String(
	"skip",
	"maildirfolder,log,msgid.cache,razor-agent.log",
	"comma separated files to skip",
)

// headers names the message fields that are hashed for each message.
var headers = []string{
	"to",
	"from",
	"cc",
	"date",
	"subject",
}
|
|
|
|
// CRCR is the byte sequence searched for to find the end of a message's
// header section: two consecutive line feeds. Despite the name, the bytes
// are LF LF, not carriage returns.
var CRCR = []byte{'\n', '\n'}
|
|
|
|
func Load(db *sql.DB, root string, skip *set.StringSet) error {
|
|
dup := set.NewStrings()
|
|
start := time.Now()
|
|
cnt := 0
|
|
dupCnt := 0
|
|
defer func() {
|
|
glog.Infof("%d messages processed in %s", cnt, time.Since(start))
|
|
glog.Infof("%d dups found", dupCnt)
|
|
}()
|
|
|
|
txn, err := db.Begin()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
stmt, err := txn.Prepare(pq.CopyIn("original", "hash", "header_size",
|
|
"total_size", "blob"))
|
|
if err != nil {
|
|
if err := txn.Rollback(); err != nil {
|
|
glog.Errorln("txn.Prepare error rolling back", err)
|
|
}
|
|
return err
|
|
}
|
|
|
|
b := new(bytes.Buffer)
|
|
err = filepath.Walk(root,
|
|
func(path string, info os.FileInfo, err error) error {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if info.IsDir() {
|
|
return nil
|
|
}
|
|
base := filepath.Base(path)
|
|
if skip.Contains(base) {
|
|
return nil
|
|
}
|
|
|
|
b.Reset()
|
|
r, err := os.Open(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer r.Close()
|
|
|
|
n, err := io.Copy(b, r)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
blob := b.Bytes()
|
|
hdr_size := bytes.Index(blob, CRCR)
|
|
h, err := email.Hash(bytes.NewReader(blob), headers)
|
|
if err != nil {
|
|
glog.Errorf("%s not an mail file", path)
|
|
glog.Infof("%q", err.Error())
|
|
return nil
|
|
}
|
|
|
|
chksum := fmt.Sprintf("%x", h.Sum(nil))
|
|
if dup.Contains(chksum) {
|
|
glog.Warningln("Dup email found", chksum, path, len(blob))
|
|
dupCnt++
|
|
return nil
|
|
}
|
|
dup.Add(chksum)
|
|
|
|
glog.Infoln(chksum, hdr_size, n)
|
|
if _, err := stmt.Exec(chksum, hdr_size, n, blob); err != nil {
|
|
return err
|
|
}
|
|
|
|
cnt++
|
|
return nil
|
|
})
|
|
|
|
if _, err := stmt.Exec(); err != nil {
|
|
if err := txn.Rollback(); err != nil {
|
|
glog.Errorln("stmt.Exec error rolling back", err)
|
|
}
|
|
return err
|
|
}
|
|
|
|
if err := stmt.Close(); err != nil {
|
|
if err := txn.Rollback(); err != nil {
|
|
glog.Errorln("stmt.Close error rolling back", err)
|
|
}
|
|
return err
|
|
}
|
|
|
|
return txn.Commit()
|
|
}
|
|
|
|
func main() {
|
|
defer glog.Flush()
|
|
flag.Parse()
|
|
|
|
// TODO(wathiede): make a set of flags.
|
|
db, err := sql.Open("postgres", "user=gomail dbname=gomail sslmode=disable")
|
|
if err != nil {
|
|
glog.Fatal(err)
|
|
}
|
|
|
|
if *maildir == "" {
|
|
fmt.Println("Must specify Maildir with -maildir")
|
|
os.Exit(1)
|
|
}
|
|
|
|
skip := set.NewStrings(strings.Split(*skipFiles, ",")...)
|
|
glog.Infoln("Skip files", skip)
|
|
|
|
if err := Load(db, *maildir, skip); err != nil {
|
|
glog.Fatal(err)
|
|
}
|
|
}
|