From 12199604c1acc5b17052f483074699e48f960219 Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Fri, 28 Mar 2014 21:46:34 -0700 Subject: [PATCH] Add helper to print message with hash as a header. Broke out email.go's Hash() in to multiple functions to make this easier, and update tools that used it. --- cmd/addhashheader/addhashheader.go | 36 ++++++++++ cmd/addhashheader/testdata/msg.1 | 109 +++++++++++++++++++++++++++++ cmd/mailhash/mailhash.go | 9 +-- cmd/md2pq/md2pq.go | 6 +- hash.go | 64 +++++++++++------ 5 files changed, 193 insertions(+), 31 deletions(-) create mode 100644 cmd/addhashheader/addhashheader.go create mode 100644 cmd/addhashheader/testdata/msg.1 diff --git a/cmd/addhashheader/addhashheader.go b/cmd/addhashheader/addhashheader.go new file mode 100644 index 0000000..feb20b6 --- /dev/null +++ b/cmd/addhashheader/addhashheader.go @@ -0,0 +1,36 @@ +package main + +import ( + "flag" + "fmt" + "io" + "net/mail" + "os" + + "github.com/golang/glog" + + "xinu.tv/email" +) + +func main() { + flag.Parse() + defer glog.Flush() + + msg, err := mail.ReadMessage(os.Stdin) + if err != nil { + glog.Fatal(err) + } + + h, err := email.HashMessage(msg) + if err != nil { + glog.Fatal(err) + } + for k, vs := range msg.Header { + for _, v := range vs { + fmt.Printf("%s: %s\n", k, v) + } + } + fmt.Println("X-Xinu-Hash:", h) + fmt.Println() + io.Copy(os.Stdout, msg.Body) +} diff --git a/cmd/addhashheader/testdata/msg.1 b/cmd/addhashheader/testdata/msg.1 new file mode 100644 index 0000000..9ebb022 --- /dev/null +++ b/cmd/addhashheader/testdata/msg.1 @@ -0,0 +1,109 @@ +Subject: [go-nuts] Re: Runtime code generation for scientific computing? +From: Raul Mera +To: golang-nuts@googlegroups.com +Date: Fri, 28 Mar 2014 16:14:50 -0700 (PDT) + + + +On Friday, March 28, 2014 10:31:54 PM UTC+1, egon wrote: +> +> I've been thinking how to properly approach scientific computing with Go. +> +> It seems that there are many cases which can be improved at the level of +> compiler i.e. 
multi-dimensional arrays, matrices, sets. There are also +> SIMD, GPGPUs, OpenCL, OpenMP, FPUs etc. basically hardware features and +> libraries that can improve the speed of calculations. So the question is +> how to properly target all of that? +> +> +Adding support for each of those special cases would make the compiler +> slower, more complex and make it harder to maintain. So adding all of that +> wouldn't probably be a good idea. +> +> +Please let's not mix hardware facilities or especial libraries, which are +indeed special cases, with multi-dimensional slices (we already have +multidimensional arrays) which are a natural way to represent matrices, +which, in turn, are not quite an "special case" but pretty much the main +thing you use for any code that deals with mathematics in some way (which +is a lot of code, not only scientific). Multidimensional slices are not +linked to a particular library, hardware or even algorithm. The proposal +currently being discussed in another thread here is a natural extension of +what exists, and is careful to keep complexity to a minimun. + +I very much agree with you about GPUGPU, OpenCL, etc. Those things should +not be in the language. Also, noone in the scientific community has asked +for sets, as far as I know (sorry if I am wrong here), and 2-dimensional +slices are the same as matrices for our purposes. This means that while in +your mail it appears that a long list of language additions is needed/asked +for in order for Go to succeed in scientific programming, the community +is actually asking for only one thing, + +Also, saying "multi-dimensional slices, matrices" gives a misleading +impression of several issues, when both things are actually the same for +our purposes. Since I have not seen anyone from the numerical community +asking for sets, the whole list you present as language changes wanted by +the scientific computing people is reduced to one item. + + + +> So what would be the alternative? 
One approach that would be viable is +> runtime code generation, it's complex in it's internals, but idea is +> simple. Take a string and convert it to a Go function at runtime. So +> convert a string to an AST, run tons of optimizations and convert it to +> specialized byte-code and finally run it. +> +> When we are talking about scientific computing, this means we should do +> aggressive inlining, optimizations that may take a lot of time... e.g. if +> an optimization strategy takes 4min and it improves the computation by 20% +> then with calculations running longer than 20min it's worthwhile to run +> that optimization... but it may not be a good idea for general programming. +> +> I do not like that separation between "scientific" and "general" +computing. We both want our code clean, and have to deal with large +programs. Still, I agree that if you want to use special hardware and the +like, or want the even the *last bit* of performance, no matter what, it is +reasonable that you pay the price. I suspect the current approach of +delegating those things to C is good enough, but I do not have a strong +opinion (well, my strong opinion, like yours, is "leave them out of the +language"). + + + +> So, maybe, the better approach for scientific computation is to provide +> the packages for DSLs, AST conversions, transformations, optimizations, +> targets and nice runtime code generation support. I know the details will +> get complicated, but maybe it's a better than adding every single thing to +> the compiler. With Go compiler eventually being written in Go, it would +> mean that we can use the same code in runtime code generation. +> +> What are your thoughts on this? +> +> +I will not speak for them, but I don't see that the numerical/scientific Go +comunity (mostly, the gonum people) has proposed or even thinks that we +should add every single thing to the compiler, but are actually against +that. That part sounds bit like a straw man to me. 
We want a natural way +to represent a very common data structure. We do not want fotran-like +matrix multiplication in the language, and we most certainly have not asked +for language support for the other things you mention. For several of them, +we can probably just work them out at library level, for some, like the GPU +thing, we probably will always need to delegate to C (or, actually, CUDA or +similar). + +I am not particularly against your proposal, but I don't like that it is +presented as an alternative to what the scientific/numerical Go community +are asking for. + + + ++ egon +> +> (PS. I'm probably not the best person for implementing it, but maybe +> someone else gets inspired and runs with the idea :) ) +> + +-- +You received this message because you are subscribed to the Google Groups "golang-nuts" group. +To unsubscribe from this group and stop receiving emails from it, send an email to golang-nuts+unsubscribe@googlegroups.com. +For more options, visit https://groups.google.com/d/optout. 
diff --git a/cmd/mailhash/mailhash.go b/cmd/mailhash/mailhash.go index c53cf46..1800791 100644 --- a/cmd/mailhash/mailhash.go +++ b/cmd/mailhash/mailhash.go @@ -34,8 +34,6 @@ type Messages struct { Statuses []Status } -var headers = []string{"to", "from", "cc", "date", "subject"} - func (m *Messages) hashMail(path string, info os.FileInfo, err error) error { glog.Infoln("Processing", path) if err != nil { @@ -58,7 +56,7 @@ func (m *Messages) hashMail(path string, info os.FileInfo, err error) error { } defer r.Close() - h, err := email.Hash(r, headers) + h, err := email.HashReader(r) if err != nil { glog.Errorf("%s not an mail file", path) glog.Infof("%q", err.Error()) @@ -67,7 +65,7 @@ func (m *Messages) hashMail(path string, info os.FileInfo, err error) error { md := email.NewInfo(path) m.Statuses = append(m.Statuses, Status{ Path: path, - Hash: fmt.Sprintf("%x", h.Sum(nil)), + Hash: h, Read: md.Seen, }) return nil @@ -139,14 +137,13 @@ func (m Messages) Reconcile(maildir string) error { } defer r.Close() - h, err := email.Hash(r, headers) + chksum, err := email.HashReader(r) if err != nil { glog.Errorf("%s not an mail file", path) glog.Infof("%q", err.Error()) return nil } - chksum := fmt.Sprintf("%x", h.Sum(nil)) md := email.NewInfo(path) s, ok := hashMap[chksum] if !ok { diff --git a/cmd/md2pq/md2pq.go b/cmd/md2pq/md2pq.go index 3396966..6883dbc 100644 --- a/cmd/md2pq/md2pq.go +++ b/cmd/md2pq/md2pq.go @@ -29,9 +29,6 @@ var ( skipFiles = flag.String("skip", "maildirfolder,log,msgid.cache,razor-agent.log", "comma separated files to skip") - // Hashed over fields from each message. 
- headers = []string{"to", "from", "cc", "date", "subject", "message-id"} - total = expvar.NewInt("bytes-parsed") cnt = expvar.NewInt("messages-parsed") dupCnt = expvar.NewInt("duplicates-found") @@ -101,14 +98,13 @@ func Load(db *sql.DB, uid int, root string, skip *set.StringSet) error { } hdr_size := bytes.Index(b, CRCR) - h, err := email.Hash(bytes.NewReader(b), headers) + chksum, err := email.HashReader(bytes.NewReader(b)) if err != nil { glog.Errorf("%s not an mail file", path) glog.Infof("%q", err.Error()) return nil } - chksum := fmt.Sprintf("%x", h.Sum(nil)) if _, err := hstmt.Exec(chksum, path); err != nil { return err } diff --git a/hash.go b/hash.go index b91ed46..bc3e2d8 100644 --- a/hash.go +++ b/hash.go @@ -2,37 +2,61 @@ package email import ( "crypto/sha1" + "fmt" "hash" "io" "net/mail" - "os" "sort" "github.com/golang/glog" ) -// Hash will parse r as an email, and return a hash.Hash that has been applied -// to the values of the specified headers. -func Hash(r io.Reader, headers []string) (hash.Hash, error) { - // Add deterministic behavior regardless of the order the users specified. - sort.Strings(headers) - var name string - if f, ok := r.(*os.File); ok { - name = f.Name() - } +// Hasher is a list of headers that should be considered when hashing an email +// message. +type Hasher []string - h := sha1.New() +// HashMessage returns a hash.Hash that has been applied to the values of the +// headers in h, read from the already-parsed msg. +func (h Hasher) HashMessage(msg *mail.Message) (hash.Hash, error) { + // Add deterministic behavior regardless of the order the users specified. 
+ if !sort.IsSorted(sort.StringSlice(h)) { + sort.Strings(h) + } + hsh := sha1.New() + for _, header := range h { + v := msg.Header.Get(header) + if v == "" { + glog.V(2).Infoln("Empty", header, "header") + } + io.WriteString(hsh, v) + } + return hsh, nil +} + +func (h Hasher) HashReader(r io.Reader) (hash.Hash, error) { msg, err := mail.ReadMessage(r) if err != nil { return nil, err } - - for _, header := range headers { - v := msg.Header.Get(header) - if v == "" { - glog.V(2).Infoln(name, "Empty", header, "header") - } - io.WriteString(h, v) - } - return h, nil + return h.HashMessage(msg) +} + +var std = Hasher([]string{"to", "from", "cc", "date", "subject", "message-id"}) + +// HashReader will parse r as an email, and return the hash as a hexadecimal +// string using the default set of headers. +func HashReader(r io.Reader) (string, error) { + h, err := std.HashReader(r) + if err != nil { + return "", err + } + return fmt.Sprintf("%x", h.Sum(nil)), nil +} + +func HashMessage(msg *mail.Message) (string, error) { + h, err := std.HashMessage(msg) + if err != nil { + return "", err + } + return fmt.Sprintf("%x", h.Sum(nil)), nil }