diff --git a/cmd/headers/headers.go b/cmd/headers/headers.go index d2af4aa..1be1baf 100644 --- a/cmd/headers/headers.go +++ b/cmd/headers/headers.go @@ -20,10 +20,19 @@ func insertHeaders(c *db.Conn, hash string, r io.Reader) error { if err != nil { return err } - glog.Infoln("Insert", hash) + glog.Infoln("Insert headers", hash) return c.InsertHeaders(hash, msg.Header) } +func insertSearchHeaders(c *db.Conn, hash string, r io.Reader) error { + msg, err := mail.ReadMessage(r) + if err != nil { + return err + } + glog.Infoln("Insert search headers", hash) + return c.InsertSearchHeaders(hash, msg.Header) +} + func insertAllHeaders(c *db.Conn, all bool) error { oCh := make(chan db.Original) errc := make(chan error) @@ -39,6 +48,31 @@ func insertAllHeaders(c *db.Conn, all bool) error { case o := <-oCh: r := bytes.NewReader(o.Blob) if err := insertHeaders(c, o.Hash, r); err != nil { + glog.Errorln(o.Hash, "insertHeaders", err) // log only the hash, not the whole Original (which carries the message blob) + return err + } + case err := <-errc: + return err + } + } +} + +func insertAllSearchHeaders(c *db.Conn, all bool) error { + oCh := make(chan db.Original) + errc := make(chan error) + donec := make(chan struct{}) + defer close(donec) + if all { + go c.Originals(oCh, errc, donec) + } else { + go c.OriginalsNotInTable("search_header", oCh, errc, donec) + } + for { + select { + case o := <-oCh: + r := bytes.NewReader(o.Blob) + if err := insertSearchHeaders(c, o.Hash, r); err != nil { + glog.Errorln(o.Hash, "insertSearchHeaders", err) return err } case err := <-errc: @@ -60,4 +94,8 @@ func main() { glog.Fatal(err) } + if err := insertAllSearchHeaders(c, *all); err != nil { + glog.Fatal(err) + } + } diff --git a/db/util.go b/db/util.go index a0bc2e6..465e3a6 100644 --- a/db/util.go +++ b/db/util.go @@ -5,6 +5,7 @@ import ( "flag" "fmt" "net/mail" + "time" "unicode/utf8" "github.com/golang/glog" @@ -140,7 +141,7 @@ VALUES for k, vs := range hdrs { for _, v := range vs { if !utf8.ValidString(v) { - glog.Infof("%s: value for %q invalid UTF-8 %q", hash, k, v) +
glog.Warningf("%s: value for %q invalid UTF-8 %q", hash, k, v) continue } if _, err := stmt.Exec(hash, k, v); err != nil { @@ -150,3 +151,75 @@ VALUES } return nil } + +func (c *Conn) InsertSearchHeaders(hash string, hdrs mail.Header) (err error) { + var tx *sql.Tx + tx, err = c.Begin() + if err != nil { + return err + } + defer func() { + if err != nil { + if err := tx.Rollback(); err != nil { + glog.Error(err) + } + return + } + err = tx.Commit() + }() + + stmt, err := tx.Prepare(` +INSERT INTO + search_header (hash, name, value) +VALUES + ($1, $2, $3) +`) + if err != nil { + return err + } + + // Save the following headers to the database untouched; a value that is not valid UTF-8 is logged and skipped. + otherHdrs := []string{"Subject", "Message-Id", "In-Reply-To", "References"} + for _, k := range otherHdrs { + for _, v := range hdrs[k] { + if !utf8.ValidString(v) { + glog.Warningf("%s: value for %q invalid UTF-8 %q", hash, k, v) + continue + } + if _, err := stmt.Exec(hash, k, v); err != nil { + return err + } + } + } + + // Parse the following headers as address lists, and save each email address + // found as its own row; a header value that fails to parse is logged and skipped. + addrHdrs := []string{"To", "From", "Cc"} + for _, k := range addrHdrs { + for _, value := range hdrs[k] { + addrs, err := mail.ParseAddressList(value) + if err != nil { + glog.Warningf("%s: error parsing address list for %q: %v", hash, k, err) + continue + } + + for _, v := range addrs { + if _, err := stmt.Exec(hash, k, v.Address); err != nil { + return err + } + } + } + } + + // Normalize the Date header to a UTC RFC 3339 (nanosecond) timestamp.
+ t, err := hdrs.Date() + if err != nil { + glog.Warningf("%s: failed to parse date header: %v", hash, err) + return nil + } + v := t.UTC().Format(time.RFC3339Nano) + if _, err := stmt.Exec(hash, "Date", v); err != nil { + return err + } + return nil +} diff --git a/hash.go b/hash.go index bc3e2d8..46927a1 100644 --- a/hash.go +++ b/hash.go @@ -43,7 +43,7 @@ func (h Hasher) HashReader(r io.Reader) (hash.Hash, error) { var std = Hasher([]string{"to", "from", "cc", "date", "subject", "message-id"}) -// Hash will parse r as an email, and return the hash as a hexidecimal string +// HashReader will parse r as an email, and return the hash as a hexadecimal string // using a default set of headers. func HashReader(r io.Reader) (string, error) { h, err := std.HashReader(r) diff --git a/pg/init-search-header.sql b/pg/init-search-header.sql new file mode 100644 index 0000000..706f622 --- /dev/null +++ b/pg/init-search-header.sql @@ -0,0 +1,7 @@ +DROP TABLE IF EXISTS search_header; -- IF EXISTS lets the script run on a database that has no search_header table yet + +CREATE TABLE search_header ( + hash CHAR(40) REFERENCES original (hash), + name TEXT, + value TEXT +);