email/dedup/dedup.go
2013-08-13 20:54:10 -07:00

157 lines
2.7 KiB
Go

package main
import (
"crypto/sha1"
"flag"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
)
// Package-level state: the size of the work list and the command-line flag
// that selects it.
var (
	// totalFiles is the number of paths matched by the glob. main assigns it
	// once, before starting any goroutine; computeDups reads it for its
	// "Processed x/y" progress messages.
	totalFiles int

	// filePat is the -filepat flag: the glob pattern naming the files to
	// dedup. main passes its value through os.ExpandEnv before globbing.
	filePat = flag.String("filepat", "*", "Glob for list of files to dedup")
)
// debugT is a boolean switch for diagnostic logging. Its methods forward to
// the standard logger when the value is true and do nothing when it is false.
type debugT bool

// debug gates this program's diagnostic output. Its zero value is false, so
// debug.Print and debug.Printf are silent.
var debug debugT

// Print passes args to log.Print when d is true; otherwise it is a no-op.
func (d debugT) Print(args ...interface{}) {
	if !d {
		return
	}
	log.Print(args...)
}

// Printf passes format and args to log.Printf when d is true; otherwise it is
// a no-op.
func (d debugT) Printf(format string, args ...interface{}) {
	if !d {
		return
	}
	log.Printf(format, args...)
}
// Checksum pairs a file's digest with the path it was computed from. Values
// travel from the checksumer workers to the consumer over a channel.
type Checksum struct {
	// Checksum is the hex-encoded SHA-1 digest of the file's contents.
	Checksum string
	// Filename is the path of the hashed file, as received by the worker.
	Filename string
}
// main expands the -filepat glob, fans the matching paths out to a fixed pool
// of checksumer goroutines, and blocks until computeDups has consumed every
// checksum they produce.
func main() {
	flag.Parse()

	pattern := os.ExpandEnv(*filePat)
	files, err := filepath.Glob(pattern)
	if err != nil {
		log.Fatalf("Failed to glob %q: %s", *filePat, err)
	}
	totalFiles = len(files)
	debug.Printf("Found %d files", totalFiles)

	// Size of the worker pool; also used as the buffer size of both channels.
	const workers = 10

	paths := make(chan string, workers)
	results := make(chan Checksum, workers)
	// finished is shared: every worker sends on it once, and so does the
	// consumer after results has been closed and drained.
	finished := make(chan bool)

	for w := 0; w < workers; w++ {
		go checksumer(paths, results, finished)
	}
	// printer has the same signature and can be started here instead to dump
	// the checksums rather than dedup.
	go computeDups(results, finished)

	// Feed the work queue, then close it so the workers' range loops end.
	for _, path := range files {
		paths <- path
	}
	close(paths)

	// Wait for all workers. Only then is it safe to close results, because
	// no sender remains.
	for w := workers; w > 0; w-- {
		<-finished
	}
	close(results)

	// The last signal comes from the consumer once it has finished.
	<-finished
}
// removeDups deletes the redundant copies recorded in dups, a map from
// checksum to the filenames that share it. For every group holding more than
// one name, the first name is kept and all the others are removed from disk.
// Groups with zero or one name are left alone.
//
// The first failed removal terminates the program via log.Fatalf.
func removeDups(dups map[string][]string) {
	log.Print("Removing dups")
	for _, group := range dups {
		if len(group) < 2 {
			continue // nothing redundant in this group
		}
		keep, extras := group[0], group[1:]
		log.Print("Skipping: ", keep)
		log.Print("Deleting")
		for _, victim := range extras {
			log.Print("\t", victim)
			if err := os.Remove(victim); err != nil {
				log.Fatalf("Failed to remove file: %s", err)
			}
		}
	}
}
// computeDups is the consumer side of the pipeline. It drains sums until the
// channel is closed, grouping filenames by checksum, and logs progress after
// every 1000 files. It then counts the checksums shared by more than one
// file. If there are any, it asks on standard input whether to delete the
// duplicates and calls removeDups when the answer starts with "y" or "Y".
//
// Exactly one value is sent on done just before the function returns.
func computeDups(sums <-chan Checksum, done chan bool) {
	defer func() { done <- true }()

	dups := map[string][]string{}
	processed := 0
	for sum := range sums {
		dups[sum.Checksum] = append(dups[sum.Checksum], sum.Filename)
		// Count first, so the message reports the number of files actually
		// recorded so far.
		processed++
		if processed%1000 == 0 {
			log.Printf("Processed %d/%d files", processed, totalFiles)
		}
	}

	// Number of checksums that more than one file maps to.
	groups := 0
	for _, fns := range dups {
		if len(fns) > 1 {
			groups++
		}
	}
	if groups == 0 {
		log.Print("No dups found")
		return
	}

	log.Printf("Found %d files with dups, delete them? [y/n]", groups)
	var answer string
	if _, err := fmt.Scan(&answer); err != nil {
		// No usable answer (e.g. stdin closed): never delete by default.
		log.Print("Could not read answer, not deleting anything: ", err)
		return
	}
	if strings.HasPrefix(strings.ToLower(answer), "y") {
		removeDups(dups)
	}
}
// printer is a consumer that writes each received checksum and its filename,
// separated by a space, as one line on standard output. When sums is closed
// and drained it sends a single value on done.
func printer(sums <-chan Checksum, done chan bool) {
	for entry := range sums {
		fmt.Printf("%s %s\n", entry.Checksum, entry.Filename)
	}
	done <- true
}
// checksumer is a pipeline worker. It receives filenames from fns until the
// channel is closed and, for every regular file it can read completely, sends
// the file's hex-encoded SHA-1 on sums. Names that cannot be opened, stat'ed
// or fully read, and names that are not regular files, are logged and skipped
// so they can never be reported as duplicates of one another.
//
// After fns is drained, a single value is sent on done.
func checksumer(fns <-chan string, sums chan<- Checksum, done chan bool) {
	for fn := range fns {
		sum, ok := hashRegularFile(fn)
		if !ok {
			continue
		}
		sums <- Checksum{
			Checksum: sum,
			Filename: fn,
		}
	}
	done <- true
}

// hashRegularFile returns the hex-encoded SHA-1 of the contents of fn. ok is
// false, and the reason is logged, when fn cannot be opened, stat'ed or read
// to the end, or when it is not a regular file.
func hashRegularFile(fn string) (sum string, ok bool) {
	f, err := os.Open(fn)
	if err != nil {
		log.Print("Error opening file: ", err)
		return "", false
	}
	// Runs on every return path, so no handle outlives this call.
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		log.Print("Error stating file: ", err)
		return "", false
	}
	debug.Printf("Mode %s for %q", fi.Mode(), fn)
	if !fi.Mode().IsRegular() {
		// Directories, devices, etc. have no content to compare.
		debug.Print("Skipping non-file ", fn)
		return "", false
	}

	h := sha1.New()
	if _, err := io.Copy(h, f); err != nil {
		// A digest of partially read data could falsely match another file.
		log.Print("Error reading file: ", err)
		return "", false
	}
	return fmt.Sprintf("%x", h.Sum(nil)), true
}