Add helpers for parsing emails and understanding which structures are common.

This commit is contained in:
Bill Thiede 2023-03-05 17:27:31 -08:00
parent 093cc0cb56
commit 5a04c7cc28
5 changed files with 438 additions and 2 deletions

244
Cargo.lock generated
View File

@ -29,6 +29,12 @@ version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]] [[package]]
name = "block-buffer" name = "block-buffer"
version = "0.10.3" version = "0.10.3"
@ -38,6 +44,12 @@ dependencies = [
"generic-array", "generic-array",
] ]
[[package]]
name = "cc"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
@ -54,6 +66,43 @@ dependencies = [
"encoding_rs", "encoding_rs",
] ]
[[package]]
name = "clap"
version = "4.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d7ae14b20b94cb02149ed21a86c423859cbe18dc7ed69845cace50e52b40a5"
dependencies = [
"bitflags",
"clap_derive",
"clap_lex",
"is-terminal",
"once_cell",
"strsim",
"termcolor",
]
[[package]]
name = "clap_derive"
version = "4.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44bec8e5c9d09e439c4335b1af0abaab56dcf3b94999a936e1bb47b9134288f0"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09"
dependencies = [
"os_str_bytes",
]
[[package]] [[package]]
name = "cpufeatures" name = "cpufeatures"
version = "0.2.5" version = "0.2.5"
@ -143,6 +192,7 @@ name = "email"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"clap",
"mailparse", "mailparse",
"memmap", "memmap",
"rayon", "rayon",
@ -161,6 +211,27 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "errno"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
dependencies = [
"errno-dragonfly",
"libc",
"winapi",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
]
[[package]] [[package]]
name = "generic-array" name = "generic-array"
version = "0.14.6" version = "0.14.6"
@ -171,6 +242,12 @@ dependencies = [
"version_check", "version_check",
] ]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]] [[package]]
name = "hermit-abi" name = "hermit-abi"
version = "0.2.6" version = "0.2.6"
@ -180,12 +257,46 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "hermit-abi"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "io-lifetimes"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
dependencies = [
"libc",
"windows-sys",
]
[[package]]
name = "is-terminal"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
dependencies = [
"hermit-abi 0.3.1",
"io-lifetimes",
"rustix",
"windows-sys",
]
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.139" version = "0.2.139"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
[[package]] [[package]]
name = "mailparse" name = "mailparse"
version = "0.14.0" version = "0.14.0"
@ -228,10 +339,46 @@ version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
dependencies = [ dependencies = [
"hermit-abi", "hermit-abi 0.2.6",
"libc", "libc",
] ]
[[package]]
name = "once_cell"
version = "1.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
[[package]]
name = "os_str_bytes"
version = "6.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2",
"quote",
"version_check",
]
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.51" version = "1.0.51"
@ -295,6 +442,20 @@ version = "0.6.28"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
[[package]]
name = "rustix"
version = "0.36.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc"
dependencies = [
"bitflags",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]] [[package]]
name = "same-file" name = "same-file"
version = "1.0.6" version = "1.0.6"
@ -321,6 +482,12 @@ dependencies = [
"digest", "digest",
] ]
[[package]]
name = "strsim"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]] [[package]]
name = "syn" name = "syn"
version = "1.0.108" version = "1.0.108"
@ -332,6 +499,15 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "termcolor"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
dependencies = [
"winapi-util",
]
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.38" version = "1.0.38"
@ -411,3 +587,69 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0" version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
[[package]]
name = "windows_i686_gnu"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
[[package]]
name = "windows_i686_msvc"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"

View File

@ -8,6 +8,7 @@ edition = "2018"
[dependencies] [dependencies]
anyhow = "1.0.69" anyhow = "1.0.69"
clap = { version = "4.1.8", features = ["derive"] }
mailparse = "0.14.0" mailparse = "0.14.0"
memmap = "0.7.0" memmap = "0.7.0"
rayon = "1.6.1" rayon = "1.6.1"

136
src/bin/fingerprint.rs Normal file
View File

@ -0,0 +1,136 @@
use std::{
collections::HashMap,
fs::File,
path::Path,
time::{SystemTime, UNIX_EPOCH},
};
use clap::Parser;
use email::{fingerprint, should_skip, EmailError};
use mailparse::{dateparse, parse_mail, MailHeaderMap};
use memmap::MmapOptions;
use rayon::{iter::ParallelBridge, prelude::ParallelIterator};
use walkdir::WalkDir;
/// Rank the most common MIME-structure fingerprints found in a mail directory
// NOTE: clap's derive macro surfaces the `///` comments on the fields below as `--help` text,
// so they are written for end users rather than for maintainers.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Only consider messages dated within the last N years
    #[arg(short, long, default_value_t = 1)]
    years: usize,
    /// Enable verbose logging
    #[arg(short, long, default_value_t = false)]
    verbose: bool,
    /// Number of example filenames to print
    #[arg(short, long, default_value_t = 1)]
    examples: usize,
    /// Show top N fingerprints
    #[arg(short, long, default_value_t = 10)]
    top_n: usize,
    /// Input directory to recursively search
    input_dir: String,
}
/// Walks `input_dir` in parallel, fingerprints every message that is recent enough, and prints
/// the `top_n` most frequent fingerprints together with up to `examples` sample paths each.
///
/// Directory entries that cannot be read are skipped silently; files that fail to parse are
/// reported on stderr and otherwise ignored.
fn main() -> anyhow::Result<()> {
    // Length of a (non-leap) year in seconds.
    const SECS_PER_YEAR: i64 = 60 * 60 * 24 * 365;

    let args = Args::parse();
    // Honor `--years` instead of a hard-coded one-year window. The conversion is spelled out
    // in full because `TryFrom` is not in the 2018-edition prelude; absurdly large values
    // saturate rather than overflow.
    let years = <i64 as std::convert::TryFrom<usize>>::try_from(args.years).unwrap_or(i64::MAX);
    let max_age = years.saturating_mul(SECS_PER_YEAR);
    let start = std::time::Instant::now();
    let unix_secs = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("couldn't get unix time");
    // Oldest timestamp (seconds since the Unix epoch) that is still accepted.
    let youngest = (unix_secs.as_secs() as i64).saturating_sub(max_age);

    let dir = args.input_dir.as_str();
    let map = WalkDir::new(dir)
        .into_iter()
        .par_bridge()
        .filter_map(Result::ok)
        .filter_map(|entry| {
            if entry.file_type().is_dir() {
                if args.verbose {
                    println!(
                        "{} ",
                        entry
                            .path()
                            .strip_prefix(dir)
                            .expect("failed to strip dir")
                            .display()
                    );
                }
                return None;
            }
            let path = entry.path().display().to_string();
            if should_skip(&path) {
                return None;
            }
            match parse(&path, youngest) {
                // `None` here means the message was older than the cutoff.
                Ok(print) => print.map(|p| (p, path)),
                Err(e) => {
                    eprintln!("{}: failed {}", path, e);
                    None
                }
            }
        })
        // Each rayon worker builds its own fingerprint -> paths map...
        .fold(
            HashMap::new,
            |mut acc: HashMap<String, Vec<String>>, (print, path)| {
                acc.entry(print).or_insert_with(Vec::new).push(path);
                acc
            },
        )
        // ...and the per-worker maps are then merged into one.
        .reduce(HashMap::new, |mut merged, partial| {
            for (print, paths) in partial {
                merged.entry(print).or_insert_with(Vec::new).extend(paths);
            }
            merged
        });

    let mut res: Vec<_> = map
        .into_iter()
        .map(|(hash, paths)| {
            (
                paths.len(),
                hash,
                paths.into_iter().take(args.examples).collect::<Vec<_>>(),
            )
        })
        .collect();
    // Most common first; ties fall back to the fingerprint text, which is unique per entry.
    res.sort_unstable_by(|a, b| b.cmp(a));

    for (cnt, hash, ex) in res.into_iter().take(args.top_n) {
        println!("{cnt} {}\n{hash}\n", ex.join("\n"));
    }
    println!("Runtime: {:.2}s", start.elapsed().as_secs_f32());
    Ok(())
}
/// Memory-maps the file at `path`, parses it as a mail message and returns its fingerprint
/// lines joined with `\n`.
///
/// Returns `Ok(None)` when the timestamp parsed from the message's `Date` header is smaller
/// than `youngest`. A missing or unparseable `Date` header is treated as timestamp `0`.
///
/// # Errors
///
/// Propagates failures from opening or mapping the file and from parsing the mail.
fn parse<P: AsRef<Path>>(path: P, youngest: i64) -> Result<Option<String>, EmailError> {
    let file = File::open(path.as_ref())?;
    // SAFETY: NOTE(review) - soundness relies on the file not being truncated or rewritten
    // while it is mapped; nothing in this function enforces that.
    let mmap = unsafe { MmapOptions::new().map(&file)? };
    let mail = parse_mail(&mmap)?;

    let date_header = mail.headers.get_first_value("Date").unwrap_or_default();
    let sent_at = dateparse(&date_header).unwrap_or(0);

    if sent_at >= youngest {
        Ok(Some(fingerprint(&mail).join("\n")))
    } else {
        Ok(None)
    }
}

25
src/bin/summarize.rs Normal file
View File

@ -0,0 +1,25 @@
use std::fs::File;
use clap::Parser;
use email::fingerprint;
use mailparse::parse_mail;
use memmap::MmapOptions;
/// Use library to summarize information about given mail files
// The `Parser` derive generates the command-line parser for this struct; the `///` comments
// are left untouched because clap shows them to users as help text.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// List of files to summarize
    // No `short`/`long` attribute is given, so clap treats these as positional values and
    // collects every one of them into the Vec.
    paths: Vec<String>,
}
fn main() -> anyhow::Result<()> {
let args = Args::parse();
for path in args.paths {
let file = File::open(&path)?;
let mmap = unsafe { MmapOptions::new().map(&file)? };
let m = parse_mail(&mmap)?;
println!("{path}\n{}", fingerprint(&m).join("\n"));
}
Ok(())
}

View File

@ -1,6 +1,6 @@
use std::{fs::File, path::Path}; use std::{fs::File, path::Path};
use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError}; use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError, ParsedMail};
use memmap::MmapOptions; use memmap::MmapOptions;
use sha1::{Digest, Sha1}; use sha1::{Digest, Sha1};
use thiserror::Error; use thiserror::Error;
@ -44,6 +44,38 @@ pub fn hash_headers(hdrs: &[MailHeader]) -> String {
format!("{:x}", hasher.finalize()) format!("{:x}", hasher.finalize())
} }
/// File names known to be non-mail files commonly found under Maildir/.
const SKIP_FILES: &[&str] = &[
    "docdata.glass",
    "flintlock",
    "iamglass",
    ".mbsyncstate",
    "position.glass",
    "postlist.glass",
    "termlist.glass",
    ".uidvalidity",
];

/// Returns true if the last `/`-separated component of `path` is one of [`SKIP_FILES`].
///
/// A path without any `/` is compared as a whole; a path ending in `/` has an empty last
/// component and is therefore never skipped.
pub fn should_skip(path: &str) -> bool {
    // `rsplit` always yields at least one piece, so the fallback is never actually taken.
    let leaf = path.rsplit('/').next().unwrap_or(path);
    SKIP_FILES.iter().any(|&known| known == leaf)
}
/// Describes the MIME structure of `pm` as a list of lines.
///
/// The first line is the mimetype of `pm` itself; the lines for its subparts follow, each
/// nesting level indented by two more spaces (the walk starts at depth 0).
pub fn fingerprint(pm: &ParsedMail<'_>) -> Vec<String> {
    fingerprint_rec(pm, 0)
}
/// Builds the fingerprint lines for `pm` and everything nested below it.
///
/// The part's own mimetype comes first, prefixed by `depth * 2` spaces, followed by the lines
/// of each subpart (in order) computed at `depth + 1`.
fn fingerprint_rec(pm: &ParsedMail<'_>, depth: usize) -> Vec<String> {
    let indent = " ".repeat(depth * 2);
    let own_line = format!("{}{}", indent, pm.ctype.mimetype);
    let nested = pm
        .subparts
        .iter()
        .flat_map(|child| fingerprint_rec(child, depth + 1));
    std::iter::once(own_line).chain(nested).collect()
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use mailparse::{parse_headers, MailHeader, MailHeaderMap}; use mailparse::{parse_headers, MailHeader, MailHeaderMap};