LevelDB test programs.

This commit is contained in:
Bill Thiede 2016-12-21 21:05:10 -08:00
parent f2ef02a033
commit a4f102ad99
2 changed files with 265 additions and 0 deletions

144
cmd/ldbstats/ldbstats.go Normal file
View File

@ -0,0 +1,144 @@
package main
import (
"bytes"
"flag"
"fmt"
"net/mail"
"sort"
"strings"
"time"
"github.com/golang/glog"
"github.com/syndtr/goleveldb/leveldb"
"github.com/syndtr/goleveldb/leveldb/util"
)
// levelDBPath holds the value of the -db flag. main passes it to
// leveldb.OpenFile to locate the database whose records are analyzed.
var levelDBPath = flag.String("db", "", "level DB path for storing email")
// sortedIntMap adapts a map[string]int to sort.Interface so that the map's
// keys can be ordered by their associated counts.
type sortedIntMap struct {
	// m holds the count for each key.
	m map[string]int
	// s holds the keys being ordered; it is the slice that gets permuted.
	s []string
}

// Len reports the number of keys being sorted.
func (sm *sortedIntMap) Len() int {
	return len(sm.s)
}

// Less orders keys by descending count. Keys with equal counts are ordered
// lexically so that the result does not depend on map iteration order.
func (sm *sortedIntMap) Less(i, j int) bool {
	ki, kj := sm.s[i], sm.s[j]
	if ci, cj := sm.m[ki], sm.m[kj]; ci != cj {
		return ci > cj
	}
	return ki < kj
}

// Swap exchanges the keys at positions i and j.
func (sm *sortedIntMap) Swap(i, j int) {
	sm.s[i], sm.s[j] = sm.s[j], sm.s[i]
}
// sortIntMap returns the keys of m in the order defined by sortedIntMap,
// which ranks keys by their count, largest first.
func sortIntMap(m map[string]int) []string {
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sorter := &sortedIntMap{m: m, s: keys}
	sort.Sort(sorter)
	return sorter.s
}
// sortedStringSliceMap adapts a map[string][]string to sort.Interface so that
// the map's keys can be ordered by the length of their value slices.
type sortedStringSliceMap struct {
	// m holds the value slice for each key.
	m map[string][]string
	// s holds the keys being ordered; it is the slice that gets permuted.
	s []string
}

// Len reports the number of keys being sorted.
func (sm *sortedStringSliceMap) Len() int {
	return len(sm.s)
}

// Less orders keys by descending value-slice length. Keys whose slices have
// equal length are ordered lexically so that the result does not depend on
// map iteration order.
func (sm *sortedStringSliceMap) Less(i, j int) bool {
	ki, kj := sm.s[i], sm.s[j]
	if li, lj := len(sm.m[ki]), len(sm.m[kj]); li != lj {
		return li > lj
	}
	return ki < kj
}

// Swap exchanges the keys at positions i and j.
func (sm *sortedStringSliceMap) Swap(i, j int) {
	sm.s[i], sm.s[j] = sm.s[j], sm.s[i]
}
// sortStringSliceMap returns the keys of m in the order defined by
// sortedStringSliceMap, which ranks keys by the length of their value slice,
// longest first.
func sortStringSliceMap(m map[string][]string) []string {
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sorter := &sortedStringSliceMap{m: m, s: keys}
	sort.Sort(sorter)
	return sorter.s
}
// count tallies how often each distinct string occurs in ss and renders the
// tallies as space-separated "(N) value" entries, in the order produced by
// sortIntMap.
func count(ss []string) string {
	tally := make(map[string]int)
	for _, name := range ss {
		tally[name]++
	}

	ordered := sortIntMap(tally)
	parts := make([]string, len(ordered))
	for i, name := range ordered {
		parts[i] = fmt.Sprintf("(%d) %s", tally[name], name)
	}
	return strings.Join(parts, " ")
}
// stats iterates over every record in db, parses each value with
// mail.ReadMessage, and prints a report of the addresses found in the To, Cc
// and From headers.
//
// Addresses are lower-cased before grouping. For each address the display
// names it appeared with are collected and summarized by count. Addresses are
// printed in the order produced by sortStringSliceMap.
//
// It returns an error if a value cannot be parsed as a message (the error
// names the offending key) or if the iterator reports an error.
func stats(db *leveldb.DB) error {
	start := time.Now()
	it := db.NewIterator(&util.Range{}, nil)
	defer it.Release()

	cnt := 0
	// allAddrs maps a lower-cased address to every display name seen with it,
	// duplicates included, so that count can tally them later.
	allAddrs := map[string][]string{}
	for it.Next() {
		cnt++
		if cnt%1000 == 0 {
			glog.Infof("Processed %d messages in %v", cnt, time.Since(start))
		}
		m, err := mail.ReadMessage(bytes.NewReader(it.Value()))
		if err != nil {
			// Include the key so the bad record can be located in the DB.
			return fmt.Errorf("parsing message with key %q: %v", it.Key(), err)
		}
		for _, h := range []string{"To", "Cc", "From"} {
			addrs, err := m.Header.AddressList(h)
			if err != nil {
				// Best effort: a header that yields no address list is
				// skipped rather than failing the whole run.
				continue
			}
			for _, a := range addrs {
				k := strings.ToLower(a.Address)
				allAddrs[k] = append(allAddrs[k], a.Name)
			}
		}
	}

	fmt.Printf("Processed %d messages in %v\n", cnt, time.Since(start))
	fmt.Printf("Found %d addresses\n", len(allAddrs))
	for _, addr := range sortStringSliceMap(allAddrs) {
		fmt.Printf("  %s: %s\n", addr, count(allAddrs[addr]))
	}
	return it.Error()
}
// main opens the LevelDB database named by -db, prints address statistics
// for the messages stored in it, and exits non-zero on failure.
func main() {
	flag.Parse()
	defer glog.Flush()

	if *levelDBPath == "" {
		glog.Exitf("-db required")
	}

	db, err := leveldb.OpenFile(*levelDBPath, nil)
	if err != nil {
		glog.Exitf("Error opening leveldb: %v", err)
	}

	// glog.Exitf terminates the process without running deferred calls, so
	// close the database explicitly before reporting a stats failure.
	statsErr := stats(db)
	if err := db.Close(); err != nil {
		glog.Errorf("Error closing %q: %v", *levelDBPath, err)
	}
	if statsErr != nil {
		glog.Exitf("Failed to compute stats: %v", statsErr)
	}
}

View File

@ -0,0 +1,121 @@
package main
import (
"bytes"
"flag"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"xinu.tv/email"
"github.com/golang/glog"
"github.com/syndtr/goleveldb/leveldb"
)
// Command-line flags.
var (
// levelDBPath is the -db flag: the LevelDB directory messages are written to.
// main rejects an empty value.
levelDBPath = flag.String("db", "", "level DB path for storing email")
// maildirPath is the -maildir flag: the root directory that is walked for
// message files. It defaults to "Maildir" under $HOME.
maildirPath = flag.String("maildir", filepath.Join(os.Getenv("HOME"), "Maildir"), "Maildir path")
// skipFiles is the -skip flag: a comma-separated list of filepath.Match
// patterns. load compares them against the base name of every directory and
// file it visits and skips the ones that match.
skipFiles = flag.String("skip", ".notmuch,maildirfolder,log,msgid.cache,razor-agent.log",
"comma separated files to skip")
)
// maxMessageSize is 50 MiB (50 << 20 bytes).
// NOTE(review): nothing in the visible code references this constant;
// presumably it was meant to cap the size of imported messages. Confirm the
// intent, then either enforce it in load or remove it.
const maxMessageSize int64 = 50 << 20
// load walks the directory tree rooted at mdPath and stores the contents of
// every regular file in the LevelDB database at dbPath. Each record is keyed
// by the value email.HashReader returns for the file's contents.
//
// Directories and files whose base name matches one of the comma-separated
// -skip patterns are ignored; a matching directory is not descended into.
// A file whose key is already present in the database is counted as processed
// but is not written again. Progress is logged every 1000 processed files.
//
// The first error encountered aborts the walk and is returned.
func load(dbPath, mdPath string) error {
	db, err := leveldb.OpenFile(dbPath, nil)
	if err != nil {
		return fmt.Errorf("error opening leveldb: %v", err)
	}
	defer func() {
		if err := db.Close(); err != nil {
			glog.Errorf("Error closing %q: %v", dbPath, err)
		}
	}()

	skipPats := strings.Split(*skipFiles, ",")
	glog.Infoln("Skip files", skipPats)

	// skipped reports whether base matches any of the skip patterns.
	skipped := func(base string) (bool, error) {
		for _, pat := range skipPats {
			ok, err := filepath.Match(pat, base)
			if err != nil {
				return false, err
			}
			if ok {
				return true, nil
			}
		}
		return false, nil
	}

	imported, count := 0, 0
	walkErr := filepath.Walk(mdPath, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			// Walk failed to stat or list path; info may be nil, so it must
			// not be touched.
			return fmt.Errorf("%q walk: %v", path, err)
		}

		base := filepath.Base(path)
		if info.IsDir() {
			glog.Infoln(path)
			skip, err := skipped(base)
			if err != nil {
				return err
			}
			if skip {
				return filepath.SkipDir
			}
			return nil
		}
		if !info.Mode().IsRegular() {
			return nil
		}
		skip, err := skipped(base)
		if err != nil {
			return err
		}
		if skip {
			return nil
		}

		// ReadFile opens, reads and closes the file, so no descriptor is
		// left open across the walk.
		b, err := ioutil.ReadFile(path)
		if err != nil {
			return fmt.Errorf("%q read: %v", path, err)
		}
		chksum, err := email.HashReader(bytes.NewReader(b))
		if err != nil {
			return fmt.Errorf("%q checksum: %v", path, err)
		}

		count++
		if count%1000 == 0 {
			glog.Infof("Processed %d files, %d imported", count, imported)
		}

		key := []byte(chksum)
		ok, err := db.Has(key, nil)
		if err != nil {
			return fmt.Errorf("%q error Has(%q): %v", path, key, err)
		}
		if ok {
			// Already stored under this key; nothing to do.
			return nil
		}

		imported++
		if err := db.Put(key, b, nil); err != nil {
			return fmt.Errorf("%q put: %v", path, err)
		}
		return nil
	})
	if walkErr != nil {
		return fmt.Errorf("walk error: %v", walkErr)
	}
	return nil
}
// main validates the -db and -maildir flags and then imports the Maildir
// into the LevelDB database, exiting non-zero if a flag is missing or the
// load fails.
func main() {
	flag.Parse()
	defer glog.Flush()

	switch {
	case *levelDBPath == "":
		glog.Exitf("-db required")
	case *maildirPath == "":
		glog.Exitf("-maildir required")
	}

	err := load(*levelDBPath, *maildirPath)
	if err != nil {
		glog.Exitf("Failed to load: %v", err)
	}
}