diff --git a/Cargo.lock b/Cargo.lock
index 4774abe..ea49c6f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -29,6 +29,12 @@ version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
 
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
 [[package]]
 name = "block-buffer"
 version = "0.10.3"
@@ -38,6 +44,12 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "cc"
+version = "1.0.79"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@@ -54,6 +66,43 @@ dependencies = [
  "encoding_rs",
 ]
 
+[[package]]
+name = "clap"
+version = "4.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3d7ae14b20b94cb02149ed21a86c423859cbe18dc7ed69845cace50e52b40a5"
+dependencies = [
+ "bitflags",
+ "clap_derive",
+ "clap_lex",
+ "is-terminal",
+ "once_cell",
+ "strsim",
+ "termcolor",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44bec8e5c9d09e439c4335b1af0abaab56dcf3b94999a936e1bb47b9134288f0"
+dependencies = [
+ "heck",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09"
+dependencies = [
+ "os_str_bytes",
+]
+
 [[package]]
 name = "cpufeatures"
 version = "0.2.5"
@@ -143,6 +192,7 @@ name = "email"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "clap",
  "mailparse",
  "memmap",
  "rayon",
@@ -161,6 +211,27 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "errno"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "errno-dragonfly"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+dependencies = [
+ "cc",
+ "libc",
+]
+
 [[package]]
 name = "generic-array"
 version = "0.14.6"
@@ -171,6 +242,12 @@ dependencies = [
  "version_check",
 ]
 
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
 [[package]]
 name = "hermit-abi"
 version = "0.2.6"
@@ -180,12 +257,46 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "hermit-abi"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+
+[[package]]
+name = "io-lifetimes"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
+dependencies = [
+ "libc",
+ "windows-sys",
+]
+
+[[package]]
+name = "is-terminal"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
+dependencies = [
+ "hermit-abi 0.3.1",
+ "io-lifetimes",
+ "rustix",
+ "windows-sys",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.139"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
 
+[[package]]
+name = "linux-raw-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+
 [[package]]
 name = "mailparse"
 version = "0.14.0"
@@ -228,10 +339,46 @@ version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.2.6",
  "libc",
 ]
 
+[[package]]
+name = "once_cell"
+version = "1.17.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+
+[[package]]
+name = "os_str_bytes"
+version = "6.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.51"
@@ -295,6 +442,20 @@ version = "0.6.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
 
+[[package]]
+name = "rustix"
+version = "0.36.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc"
+dependencies = [
+ "bitflags",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys",
+]
+
 [[package]]
 name = "same-file"
 version = "1.0.6"
@@ -321,6 +482,12 @@ dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "strsim"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+
 [[package]]
 name = "syn"
 version = "1.0.108"
@@ -332,6 +499,15 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "termcolor"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+dependencies = [
+ "winapi-util",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.38"
@@ -411,3 +587,69 @@ name = "winapi-x86_64-pc-windows-gnu"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
diff --git a/Cargo.toml b/Cargo.toml
index 3798d74..3d9e62f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ edition = "2018"
 
 [dependencies]
 anyhow = "1.0.69"
+clap = { version = "4.1.8", features = ["derive"] }
 mailparse = "0.14.0"
 memmap = "0.7.0"
 rayon = "1.6.1"
diff --git a/src/bin/fingerprint.rs b/src/bin/fingerprint.rs
new file mode 100644
index 0000000..9102a55
--- /dev/null
+++ b/src/bin/fingerprint.rs
@@ -0,0 +1,131 @@
+use std::{
+    collections::HashMap,
+    convert::TryFrom,
+    fs::File,
+    path::Path,
+    time::{SystemTime, UNIX_EPOCH},
+};
+
+use clap::Parser;
+use email::{fingerprint, should_skip, EmailError};
+use mailparse::{dateparse, parse_mail, MailHeaderMap};
+use memmap::MmapOptions;
+use rayon::{iter::ParallelBridge, prelude::ParallelIterator};
+use walkdir::WalkDir;
+
+/// Seconds in a 365-day year; converts `--years` into a cutoff timestamp.
+const SECS_PER_YEAR: i64 = 60 * 60 * 24 * 365;
+
+/// Group recent mail by MIME-structure fingerprint and list the most common ones
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Number of years to search back through
+    #[arg(short, long, default_value_t = 1)]
+    years: usize,
+
+    /// Enable verbose logging
+    #[arg(short, long, default_value_t = false)]
+    verbose: bool,
+
+    /// Number of example filenames to print
+    #[arg(short, long, default_value_t = 1)]
+    examples: usize,
+
+    /// Show top N fingerprints
+    #[arg(short, long, default_value_t = 10)]
+    top_n: usize,
+
+    /// Input directory to recursively search
+    input_dir: String,
+}
+
+fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+    // Only consider messages dated within the last `--years` years.
+    let max_age = i64::try_from(args.years)?.saturating_mul(SECS_PER_YEAR);
+    let start = std::time::Instant::now();
+    let unix_secs = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .expect("couldn't get unix time");
+    let youngest = unix_secs.as_secs() as i64 - max_age;
+    let dir = &args.input_dir;
+    let map = WalkDir::new(dir)
+        .into_iter()
+        .par_bridge()
+        .filter_map(|entry| entry.ok())
+        .filter_map(|entry| {
+            if entry.file_type().is_dir() {
+                if args.verbose {
+                    println!(
+                        "{} ",
+                        entry
+                            .path()
+                            .strip_prefix(dir)
+                            .expect("failed to strip dir")
+                            .display()
+                    );
+                }
+                return None;
+            }
+            let arg = entry.path().display().to_string();
+            if should_skip(&arg) {
+                return None;
+            }
+            match parse(&arg, youngest) {
+                Ok(Some(h)) => Some((h, arg)),
+                // Skip old emails
+                Ok(None) => None,
+                Err(e) => {
+                    eprintln!("{}: failed {}", arg, e);
+                    None
+                }
+            }
+        })
+        .fold(HashMap::new, |mut m, (h, arg)| {
+            m.entry(h).or_insert_with(Vec::new).push(arg);
+            m
+        })
+        // Merge maps created by parallel iteration.
+        .reduce(HashMap::new, |mut acc, m| {
+            for (k, v) in m {
+                acc.entry(k).or_insert_with(Vec::new).extend(v);
+            }
+            acc
+        });
+
+    let mut res: Vec<_> = map
+        .into_iter()
+        .map(|(hash, paths)| {
+            (
+                paths.len(),
+                hash,
+                paths.into_iter().take(args.examples).collect::<Vec<_>>(),
+            )
+        })
+        .collect();
+    // Most frequent fingerprint first.
+    res.sort_unstable_by(|a, b| b.cmp(a));
+    for (cnt, hash, ex) in res.into_iter().take(args.top_n) {
+        println!("{cnt} {}\n{hash}\n", ex.join("\n"));
+    }
+    println!("Runtime: {:.2}s", start.elapsed().as_secs_f32());
+    Ok(())
+}
+
+/// Parse the message at `path` and return its fingerprint as one newline-joined string.
+///
+/// Returns `Ok(None)` when the message's `Date` header is older than `youngest` (Unix seconds).
+/// A missing or unparsable `Date` counts as timestamp 0, i.e. as an old message.
+fn parse<P: AsRef<Path>>(path: P, youngest: i64) -> Result<Option<String>, EmailError> {
+    let file = File::open(&path)?;
+    // SAFETY: the mapping is read-only and is dropped before this function returns. Like any
+    // file mapping, it assumes no other process truncates the file while it is being parsed.
+    let mmap = unsafe { MmapOptions::new().map(&file)? };
+    let m = parse_mail(&mmap)?;
+    let d = dateparse(&m.headers.get_first_value("Date").unwrap_or_default()).unwrap_or(0);
+    if d < youngest {
+        return Ok(None);
+    }
+    Ok(Some(fingerprint(&m).join("\n")))
+}
diff --git a/src/bin/summarize.rs b/src/bin/summarize.rs
new file mode 100644
index 0000000..fc3e3ca
--- /dev/null
+++ b/src/bin/summarize.rs
@@ -0,0 +1,26 @@
+use std::fs::File;
+
+use clap::Parser;
+use email::fingerprint;
+use mailparse::parse_mail;
+use memmap::MmapOptions;
+
+/// Use library to summarize information about given mail files
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// List of files to summarize
+    paths: Vec<String>,
+}
+
+fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+    for path in args.paths {
+        let file = File::open(&path)?;
+        // SAFETY: read-only mapping; assumes the file is not truncated while it is parsed.
+        let mmap = unsafe { MmapOptions::new().map(&file)? };
+        let m = parse_mail(&mmap)?;
+        println!("{path}\n{}", fingerprint(&m).join("\n"));
+    }
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index 7a67274..96eea49 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,6 @@
 use std::{fs::File, path::Path};
 
-use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError};
+use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError, ParsedMail};
 use memmap::MmapOptions;
 use sha1::{Digest, Sha1};
 use thiserror::Error;
@@ -44,6 +44,40 @@ pub fn hash_headers(hdrs: &[MailHeader]) -> String {
     format!("{:x}", hasher.finalize())
 }
 
+/// File names commonly found under Maildir/ that are known not to be mail.
+const SKIP_FILES: &[&str] = &[
+    "docdata.glass",
+    "flintlock",
+    "iamglass",
+    ".mbsyncstate",
+    "position.glass",
+    "postlist.glass",
+    "termlist.glass",
+    ".uidvalidity",
+];
+
+/// Returns true if the last `/`-separated component of `path` is a known non-mail file.
+pub fn should_skip(path: &str) -> bool {
+    // `rsplit` always yields at least one item, so the fallback is never used.
+    let filename = path.rsplit('/').next().unwrap_or(path);
+    SKIP_FILES.contains(&filename)
+}
+
+/// Describes the MIME structure of `pm`: one entry per part, in depth-first order, holding the
+/// part's mimetype indented according to its nesting depth.
+pub fn fingerprint(pm: &ParsedMail<'_>) -> Vec<String> {
+    fingerprint_rec(pm, 0)
+}
+
+/// Recursive helper for `fingerprint`; `depth` is the nesting level of `pm`.
+fn fingerprint_rec(pm: &ParsedMail<'_>, depth: usize) -> Vec<String> {
+    let mut v = vec![format!("{}{}", " ".repeat(depth * 2), pm.ctype.mimetype)];
+    for c in &pm.subparts {
+        v.extend(fingerprint_rec(c, depth + 1));
+    }
+    v
+}
+
 #[cfg(test)]
 mod tests {
     use mailparse::{parse_headers, MailHeader, MailHeaderMap};