diff --git a/cmd/ldbstats/ldbstats.go b/cmd/ldbstats/ldbstats.go
new file mode 100644
index 0000000..572c156
--- /dev/null
+++ b/cmd/ldbstats/ldbstats.go
@@ -0,0 +1,170 @@
+// Command ldbstats prints per-address statistics for the email messages
+// stored in a LevelDB database.
+package main
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"net/mail"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/golang/glog"
+	"github.com/syndtr/goleveldb/leveldb"
+	"github.com/syndtr/goleveldb/leveldb/util"
+)
+
+var levelDBPath = flag.String("db", "", "level DB path for storing email")
+
+// sortedIntMap implements sort.Interface over s, the keys of m, ordering them
+// by descending value.
+type sortedIntMap struct {
+	m map[string]int
+	s []string
+}
+
+func (sm *sortedIntMap) Len() int {
+	return len(sm.s)
+}
+
+// Less puts larger counts first; equal counts fall back to key order so the
+// result does not depend on map iteration order.
+func (sm *sortedIntMap) Less(i, j int) bool {
+	ci, cj := sm.m[sm.s[i]], sm.m[sm.s[j]]
+	if ci != cj {
+		return ci > cj
+	}
+	return sm.s[i] < sm.s[j]
+}
+
+func (sm *sortedIntMap) Swap(i, j int) {
+	sm.s[i], sm.s[j] = sm.s[j], sm.s[i]
+}
+
+// sortIntMap returns the keys of m sorted by descending value, then by key.
+func sortIntMap(m map[string]int) []string {
+	sm := &sortedIntMap{m: m, s: make([]string, 0, len(m))}
+	for key := range m {
+		sm.s = append(sm.s, key)
+	}
+	sort.Sort(sm)
+	return sm.s
+}
+
+// sortedStringSliceMap implements sort.Interface over s, the keys of m,
+// ordering them by descending length of the associated slice.
+type sortedStringSliceMap struct {
+	m map[string][]string
+	s []string
+}
+
+func (sm *sortedStringSliceMap) Len() int {
+	return len(sm.s)
+}
+
+// Less puts longer slices first; equal lengths fall back to key order so the
+// result does not depend on map iteration order.
+func (sm *sortedStringSliceMap) Less(i, j int) bool {
+	li, lj := len(sm.m[sm.s[i]]), len(sm.m[sm.s[j]])
+	if li != lj {
+		return li > lj
+	}
+	return sm.s[i] < sm.s[j]
+}
+
+func (sm *sortedStringSliceMap) Swap(i, j int) {
+	sm.s[i], sm.s[j] = sm.s[j], sm.s[i]
+}
+
+// sortStringSliceMap returns the keys of m sorted by descending slice length,
+// then by key.
+func sortStringSliceMap(m map[string][]string) []string {
+	sm := &sortedStringSliceMap{m: m, s: make([]string, 0, len(m))}
+	for key := range m {
+		sm.s = append(sm.s, key)
+	}
+	sort.Sort(sm)
+	return sm.s
+}
+
+// count tallies the distinct strings in ss and renders them as "(N) value"
+// entries joined by spaces, most frequent first.
+func count(ss []string) string {
+	c := map[string]int{}
+	for _, s := range ss {
+		c[s]++
+	}
+	keys := sortIntMap(c)
+	out := make([]string, 0, len(keys))
+	for _, s := range keys {
+		out = append(out, fmt.Sprintf("(%d) %s", c[s], s))
+	}
+	return strings.Join(out, " ")
+}
+
+// stats parses every value in db with net/mail, gathers the addresses found in
+// the To, Cc and From headers, and prints each lower-cased address together
+// with a tally of the display names seen for it. It stops at the first value
+// that cannot be parsed as a message and otherwise returns the iterator's
+// error, if any.
+func stats(db *leveldb.DB) error {
+	start := time.Now()
+	it := db.NewIterator(&util.Range{}, nil)
+	defer it.Release()
+	cnt := 0
+	allAddrs := map[string][]string{}
+	for it.Next() {
+		cnt++
+		if cnt%1000 == 0 {
+			glog.Infof("Processed %d messages in %v", cnt, time.Since(start))
+		}
+		m, err := mail.ReadMessage(bytes.NewReader(it.Value()))
+		if err != nil {
+			// Name the offending record so it can be found in the database.
+			return fmt.Errorf("key %q: %v", it.Key(), err)
+		}
+		for _, h := range []string{"To", "Cc", "From"} {
+			addrs, err := m.Header.AddressList(h)
+			if err != nil {
+				// Header absent or unparseable: best effort, skip it.
+				continue
+			}
+			for _, a := range addrs {
+				k := strings.ToLower(a.Address)
+				allAddrs[k] = append(allAddrs[k], a.Name)
+			}
+		}
+	}
+	fmt.Printf("Processed %d messages in %v\n", cnt, time.Since(start))
+	fmt.Printf("Found %d addresses\n", len(allAddrs))
+
+	for _, addr := range sortStringSliceMap(allAddrs) {
+		fmt.Printf(" %s: %s\n", addr, count(allAddrs[addr]))
+	}
+
+	return it.Error()
+}
+
+// main opens the database named by -db, prints its statistics and closes it.
+func main() {
+	flag.Parse()
+	defer glog.Flush()
+	if *levelDBPath == "" {
+		glog.Exitf("-db required")
+	}
+	db, err := leveldb.OpenFile(*levelDBPath, nil)
+	if err != nil {
+		glog.Exitf("Error opening leveldb: %v", err)
+	}
+	err = stats(db)
+	// Close explicitly instead of deferring: glog.Exitf ends the process
+	// without running deferred calls, which would leave the DB unclosed.
+	if cerr := db.Close(); cerr != nil {
+		glog.Errorf("Error closing %q: %v", *levelDBPath, cerr)
+	}
+	if err != nil {
+		glog.Exitf("Failed to compute stats: %v", err)
+	}
+}
diff --git a/cmd/md2leveldb/md2leveldb.go b/cmd/md2leveldb/md2leveldb.go
new file mode 100644
index 0000000..c928687
--- /dev/null
+++ b/cmd/md2leveldb/md2leveldb.go
@@ -0,0 +1,148 @@
+// Command md2leveldb imports the files of a Maildir tree into a LevelDB
+// database, keyed by the checksum email.HashReader computes for each file.
+package main
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"xinu.tv/email"
+
+	"github.com/golang/glog"
+	"github.com/syndtr/goleveldb/leveldb"
+)
+
+var (
+	levelDBPath = flag.String("db", "", "level DB path for storing email")
+	maildirPath = flag.String("maildir", filepath.Join(os.Getenv("HOME"), "Maildir"), "Maildir path")
+	skipFiles   = flag.String("skip", ".notmuch,maildirfolder,log,msgid.cache,razor-agent.log",
+		"comma separated files to skip")
+)
+
+// NOTE(review): maxMessageSize is not referenced anywhere in this file;
+// presumably it was meant to cap the size of imported messages. Confirm.
+const maxMessageSize int64 = 50 << 20
+
+// matchesAny reports whether name matches any of the filepath.Match patterns
+// in pats. A malformed pattern is reported as an error.
+func matchesAny(pats []string, name string) (bool, error) {
+	for _, pat := range pats {
+		ok, err := filepath.Match(pat, name)
+		if err != nil {
+			return false, err
+		}
+		if ok {
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
+// load walks the tree rooted at mdPath and stores the contents of every
+// regular file in the LevelDB database at dbPath, keyed by its checksum.
+// Directories and files whose base name matches a -skip pattern are ignored,
+// and keys already present in the database are not rewritten.
+func load(dbPath, mdPath string) error {
+	db, err := leveldb.OpenFile(dbPath, nil)
+	if err != nil {
+		return fmt.Errorf("error opening leveldb: %v", err)
+	}
+	defer func() {
+		if err := db.Close(); err != nil {
+			glog.Errorf("Error closing %q: %v", dbPath, err)
+		}
+	}()
+
+	skipPats := strings.Split(*skipFiles, ",")
+	glog.Infoln("Skip files", skipPats)
+	imported, count := 0, 0
+	err = filepath.Walk(mdPath, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			// Walk may pass a nil info together with the error, so stop
+			// before dereferencing it.
+			return err
+		}
+		base := filepath.Base(path)
+		if info.IsDir() {
+			glog.Infoln(path)
+			skip, err := matchesAny(skipPats, base)
+			if err != nil {
+				return err
+			}
+			if skip {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+		if !info.Mode().IsRegular() {
+			return nil
+		}
+		skip, err := matchesAny(skipPats, base)
+		if err != nil {
+			return err
+		}
+		if skip {
+			return nil
+		}
+
+		r, err := os.Open(path)
+		if err != nil {
+			return fmt.Errorf("%q open: %v", path, err)
+		}
+		// The callback runs once per file, so the descriptor is released
+		// before the walk moves on to the next entry.
+		defer r.Close()
+		b, err := ioutil.ReadAll(r)
+		if err != nil {
+			return fmt.Errorf("%q read: %v", path, err)
+		}
+
+		chksum, err := email.HashReader(bytes.NewReader(b))
+		if err != nil {
+			return fmt.Errorf("%q checksum: %v", path, err)
+		}
+
+		count++
+		if count%1000 == 0 {
+			glog.Infof("Processed %d files, %d imported", count, imported)
+		}
+		key := []byte(chksum)
+		ok, err := db.Has(key, nil)
+		if err != nil {
+			return fmt.Errorf("%q error Has(%q): %v", path, key, err)
+		}
+		if ok {
+			return nil
+		}
+		if err := db.Put(key, b, nil); err != nil {
+			return fmt.Errorf("%q put: %v", path, err)
+		}
+		imported++
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("walk error: %v", err)
+	}
+	glog.Infof("Processed %d files, %d imported", count, imported)
+	return nil
+}
+
+// main validates the flags and runs the import.
+func main() {
+	flag.Parse()
+	defer glog.Flush()
+	if *levelDBPath == "" {
+		glog.Exitf("-db required")
+	}
+	if *maildirPath == "" {
+		glog.Exitf("-maildir required")
+	}
+	if err := load(*levelDBPath, *maildirPath); err != nil {
+		glog.Exitf("Failed to load: %v", err)
+	}
+}