From 04585e8d24310521e547a3d53d447645fcd90e4c Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Sat, 23 Nov 2019 08:30:07 -0800 Subject: [PATCH] Add is_multidisc, move big tests to separate module. Started work on a method movie() that groups multipart movies together. --- src/lib.rs | 146 +++++++++++++++----------- src/main.rs | 89 ++++++++++++++-- src/movielibrary_test.rs | 45 ++++++++ testdata/simple/metadata.compact.json | 26 +++++ 4 files changed, 232 insertions(+), 74 deletions(-) create mode 100644 src/movielibrary_test.rs create mode 100644 testdata/simple/metadata.compact.json diff --git a/src/lib.rs b/src/lib.rs index 29e734c..641de47 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::collections::HashSet; use std::env; use std::ffi::OsStr; use std::fmt; @@ -21,6 +22,7 @@ use log::error; use log::info; use rayon::iter::ParallelBridge; use rayon::prelude::ParallelIterator; +use regex::Regex; use serde::de; use serde::de::Deserializer; use serde::Deserialize; @@ -59,7 +61,23 @@ where T::from_str(&s).map_err(de::Error::custom) } -#[derive(Clone, Deserialize, Debug, Serialize)] +pub fn is_multidisc(names: &Vec) -> bool { + // TODO(wathiede): smarter version that helps with: + // The Hudsucker Proxy: + // 1920x1080 4.78Gi 1h 50m 45s 54151c3b9a2a4773958f848efecefc3b.mkv + // 720x416 736.51Mi 50m 40s The Hudsucker Proxy CD1.avi + // 720x416 736.49Mi 1h 3s The Hudsucker Proxy CD2.avi + lazy_static! { + static ref DIGIT: Regex = Regex::new("[0-9]").unwrap(); + } + let mut set = HashSet::new(); + for name in names { + set.insert(DIGIT.replace_all(&name, "#").to_string()); + } + set.len() == 1 +} + +#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)] struct Format { #[serde(deserialize_with = "from_str")] bit_rate: usize, @@ -71,7 +89,7 @@ struct Format { size: usize, } -#[derive(Clone, Deserialize, Debug, Serialize)] +#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)] pub struct Tags(HashMap); impl Tags { @@ -83,7 +101,7 @@ impl Tags { } } -#[derive(Clone, Deserialize, Debug, Serialize)] +#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)] #[serde(tag = "codec_type")] enum Stream { #[serde(rename = "video")] @@ -122,13 +140,13 @@ enum Stream { Data {}, } -#[derive(Clone, Deserialize, Debug, Serialize)] +#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)] struct Metadata { format: Format, streams: Vec, } -#[derive(Clone, Deserialize, Debug, Serialize)] +#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)] pub struct VideoFormat { short_name: String, long_name: String, @@ -154,7 +172,7 @@ impl Default for VideoFormat { } } -#[derive(Clone, Deserialize, Debug, Serialize)] +#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)] pub struct AudioFormat { short_name: String, long_name: String, @@ -166,7 +184,7 @@ pub struct AudioFormat { language: Option, } -#[derive(Clone, Deserialize, Debug, Serialize)] +#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)] pub struct SubtitleFormat { short_name: String, long_name: String, @@ -176,7 +194,7 @@ pub struct SubtitleFormat { language: Option, } -#[derive(Clone, Deserialize, Debug, Serialize)] +#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)] pub struct CompactMetadata { pub bit_rate: usize, pub duration: f32, @@ -222,12 +240,13 @@ impl Default for CompactMetadata { } } -#[derive(Deserialize, Debug, Serialize)] +#[derive(Deserialize, Debug, PartialEq, Serialize)] pub struct MetadataFile { #[serde(flatten)] metadata: HashMap, } +#[derive(PartialEq, Debug)] pub struct MovieLibrary { root: String, } @@ -263,6 +282,34 @@ lazy_static! { static ref MOVIE_EXTS: Vec<&'static str> = vec!["avi", "m4v", "mkv", "mov", "mp4"]; } +#[derive(Debug, PartialEq)] +struct Movie { + files: Vec<(String, CompactMetadata)>, +} + +#[derive(Debug, PartialEq)] +pub struct Movies { + movies: Vec, +} + +fn movies_from_paths_compact_metadata(p_cmd: HashMap) -> Movies { + // file path + let files_to_movies: HashMap> = HashMap::new(); + // TODO(wathiede): + // - walk over every item, use something based on is_multidisc to pack multifile movies + // together. + // - then walk over `files` and create a Movie for each + // - then store those Movie structs in Movies + + let movies = p_cmd + .into_iter() + .map(|(p, cmd)| Movie { + files: vec![(p, cmd)], + }) + .collect(); + Movies { movies } +} + impl MovieLibrary { pub fn new>(root: S) -> MovieLibrary { MovieLibrary { root: root.into() } @@ -455,11 +502,18 @@ impl MovieLibrary { }) } - pub fn videos( - &self, - _include_stale: bool, - ) -> Result<(HashMap), Error> { - // TODO(wathiede): implement include_stale. + pub fn movies(&self) -> Result { + let path = Path::new(&self.root).join(COMPACT_METADATA_FILENAME); + // Open the file in read-only mode with buffer. + let f = File::open(&path).context(format!("open {}", path.display()))?; + let r = BufReader::new(f); + + let p_cmd: HashMap = serde_json::from_reader(r) + .context(format!("serde_json::from_reader {}", path.display()))?; + Ok(movies_from_paths_compact_metadata(p_cmd)) + } + + pub fn videos(&self) -> Result<(HashMap), Error> { let path = Path::new(&self.root).join(COMPACT_METADATA_FILENAME); // Open the file in read-only mode with buffer. let f = File::open(&path).context(format!("open {}", path.display()))?; @@ -470,15 +524,13 @@ impl MovieLibrary { } } +#[cfg(test)] +mod movielibrary_test; + #[cfg(test)] mod tests { use super::*; - #[allow(dead_code)] - fn testdata_dir() -> String { - format!("{}/testdata", env::var("CARGO_MANIFEST_DIR").unwrap()) - } - #[test] fn largest_dimension() { let md = CompactMetadata { @@ -521,50 +573,18 @@ mod tests { assert_eq!(md.largest_dimension(), Some(Resolution(640, 480))); } - /* #[test] - fn test_movies() { - let lib = MovieLibrary::new(format!("{}/Movies", testdata_dir())); - let movies = lib.movies(true).expect("failed to get movies"); - let mut got = movies.keys().collect::>(); - got.sort(); - let want = [ - "Aladdin (1992)/Aladdin.1992.720p.BRrip.x264.GAZ.YIFY.mp4", - "Aladdin (2019)/4fe12adfdf4b4e9daa4f1366452d3431.mkv", - "Higher Learning/Higher Learning CD1.avi", - "Higher Learning/Higher Learning CD2.avi", - "J0hn W1ck (2014)/J0hn W1ck (2014) m720p x264 aac.m4v", - "J0hn W1ck (2014)/J0hn W1ck (2014) m720p x264 aac.sample.m4v", - "Stale Sample (2019)/Stale Sample (2019) m720p x264 aac.sample.m4v", - "The Hudsucker Proxy (1994)/54151c3b9a2a4773958f848efecefc3b.mkv", - "The Hudsucker Proxy (1994)/The Hudsucker Proxy CD1.avi", - "The Hudsucker Proxy (1994)/The Hudsucker Proxy CD2.avi", - ]; - - assert_eq!(got, want); + fn test_multidisc() { + // Empty set is not a multidisc set. + assert!(!is_multidisc(&vec![])); + assert!(is_multidisc(&vec![ + "Unbearable.Lightness.Of.Being Part 1.avi".to_string(), + "Unbearable.Lightness.Of.Being Part 2.avi".to_string(), + "Unbearable.Lightness.Of.Being Part 3.avi".to_string(), + ])); + assert!(!is_multidisc(&vec![ + "Scent Of A Woman 1992 DvDrip[Eng]-greenbud1969.avi".to_string(), + "Scent.Of.A.Woman.1992.1080p.BluRay.x264.AC3.mp4".to_string(), + ])); } - */ - - /* - #[test] - fn test_filter_stale() { - let lib = MovieLibrary::new(format!("{}/Movies", testdata_dir())); - let movies = lib.movies(false).expect("failed to get movies"); - let mut got = movies.keys().collect::>(); - got.sort(); - let want = [ - "Aladdin (1992)/Aladdin.1992.720p.BRrip.x264.GAZ.YIFY.mp4", - "Aladdin (2019)/4fe12adfdf4b4e9daa4f1366452d3431.mkv", - "Higher Learning/Higher Learning CD1.avi", - "Higher Learning/Higher Learning CD2.avi", - "J0hn W1ck (2014)/J0hn W1ck (2014) m720p x264 aac.m4v", - "J0hn W1ck (2014)/J0hn W1ck (2014) m720p x264 aac.sample.m4v", - "The Hudsucker Proxy (1994)/54151c3b9a2a4773958f848efecefc3b.mkv", - "The Hudsucker Proxy (1994)/The Hudsucker Proxy CD1.avi", - "The Hudsucker Proxy (1994)/The Hudsucker Proxy CD2.avi", - ]; - - assert_eq!(got, want); - } - */ } diff --git a/src/main.rs b/src/main.rs index 0244567..cd4a54a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,11 +8,13 @@ use std::time::Duration; use human_format::Formatter; use human_format::Scales; use humantime; +use lazy_static::lazy_static; use log::info; use regex::Regex; use structopt::StructOpt; use tabwriter::TabWriter; +use superdeduper::is_multidisc; use superdeduper::CompactMetadata; use superdeduper::MovieLibrary; @@ -22,14 +24,80 @@ const TO_BE_REMOVED_DIR: &str = "/storage/media/to-be-deleted/"; fn clean_path_parent>(path: P) -> PathBuf { let path = path.as_ref(); let path = path.parent().unwrap(); - let mut path = path.to_str().unwrap(); + let path = path.to_str().unwrap(); + /* + // Strip year if path.ends_with(')') { path = &path[..path.len() - 7]; } + */ PathBuf::from(path) } -fn print_dupes(videos: HashMap) {} +lazy_static! { + static ref CLEAN_TITLE_CHARS: Regex = Regex::new("[^ 0-9[:alpha:]]").unwrap(); +} + +fn normalize(path: &str) -> String { + CLEAN_TITLE_CHARS + .replace_all(&path, "") + .to_ascii_lowercase() +} + +lazy_static! { + static ref YEAR_SUFFIX: Regex = Regex::new(r" \d{4}$").unwrap(); +} +fn print_dupes(videos: HashMap) { + let mut video_groups: HashMap> = HashMap::new(); + for (name, md) in videos.into_iter() { + let clean_name = normalize(clean_path_parent(&name).to_str().unwrap()); + let paths = video_groups.entry(clean_name).or_insert(Vec::new()); + paths.push((name.to_string(), md)); + } + + let mut names = video_groups.keys().collect::>(); + names.sort(); + + for name in &names { + if YEAR_SUFFIX.is_match(&name) { + let yearless = &name[..&name.len() - 5]; + info!("is '{}' in map", yearless); + if let Some(yearless_vg) = video_groups.get(yearless) { + println!("Possible dupe between movie with year and no year:"); + println!(" {:?}", video_groups.get(name.as_str()).unwrap()); + println!(" {:?}", yearless_vg); + } + } + } + let mut fmtr = Formatter::new(); + fmtr.with_separator(""); + fmtr.with_scales(Scales::Binary()); + for name in names { + let possible_dupes = &video_groups[name]; + if possible_dupes.len() < 2 { + continue; + } + let paths: Vec = possible_dupes + .iter() + .map(|(name, _)| name.to_string()) + .collect(); + if is_multidisc(&paths) { + continue; + } + let mut file: Vec<_> = video_groups[name].iter().collect(); + file.sort_by(|(n1, _), (n2, _)| n1.partial_cmp(n2).unwrap()); + println!("{}:", name); + for (p, md) in file { + println!( + " {:>9} {:>9} {} {}", + md.largest_dimension().unwrap(), + fmtr.format(md.size as f64), + humantime::Duration::from(Duration::from_secs(md.duration as u64)), + &p[p.rfind("/").unwrap() + 1..] + ); + } + } +} fn print_all(videos: HashMap) { let mut names = videos.keys().collect::>(); @@ -40,7 +108,6 @@ fn print_all(videos: HashMap) { fmtr.with_scales(Scales::Binary()); let mut tw = TabWriter::new(vec![]); for name in names { - let clean_name = clean_path_parent(&name); let md = &videos[name]; write!( &mut tw, @@ -57,7 +124,7 @@ fn print_all(videos: HashMap) { println!("{}", String::from_utf8(tw.into_inner().unwrap()).unwrap()); } -fn print_video_groups(video_groups: &HashMap>) { +fn print_video_groups(video_groups: &HashMap>) { let mut names = video_groups.keys().collect::>(); names.sort(); @@ -71,7 +138,7 @@ fn print_video_groups(video_groups: &HashMap = video_groups[name].iter().collect(); file.sort_by(|(n1, _), (n2, _)| n1.partial_cmp(n2).unwrap()); - println!("{}:", name.display()); + println!("{}:", name); for (p, md) in file { println!( " {:>9} {:>9} {} {}", @@ -166,18 +233,18 @@ fn main() -> Result<(), Box> { match app.cmd { Command::Samples => { let lib = MovieLibrary::new(MOVIE_DIR); - let videos = lib.videos(false)?; + let videos = lib.videos()?; let samples_re = Regex::new(r"(?i).*sample.*").unwrap(); print_videos(&videos, Some(&samples_re)); } Command::Groups => { let lib = MovieLibrary::new(MOVIE_DIR); - let videos = lib.videos(false)?; + let videos = lib.videos()?; - let mut video_groups: HashMap> = HashMap::new(); + let mut video_groups: HashMap> = HashMap::new(); for (name, md) in videos.into_iter() { - let clean_name = clean_path_parent(&name); + let clean_name = normalize(clean_path_parent(&name).to_str().unwrap()); let paths = video_groups.entry(clean_name).or_insert(Vec::new()); paths.push((name.to_string(), md)); } @@ -190,13 +257,13 @@ fn main() -> Result<(), Box> { } Command::PrintDupes => { let lib = MovieLibrary::new(MOVIE_DIR); - let videos = lib.videos(false)?; + let videos = lib.videos()?; print_dupes(videos); } Command::PrintAll => { let lib = MovieLibrary::new(MOVIE_DIR); - let videos = lib.videos(false)?; + let videos = lib.videos()?; print_all(videos); } diff --git a/src/movielibrary_test.rs b/src/movielibrary_test.rs new file mode 100644 index 0000000..9bb80cb --- /dev/null +++ b/src/movielibrary_test.rs @@ -0,0 +1,45 @@ +use super::*; + +#[allow(dead_code)] +fn testdata_dir() -> PathBuf { + format!("{}/testdata", env::var("CARGO_MANIFEST_DIR").unwrap()).into() +} + +#[test] +fn test_library() { + let ml = MovieLibrary::new(testdata_dir().join("simple").to_str().unwrap()); + assert_eq!( + ml.movies().expect("failed to build movies"), + Movies { + movies: vec![Movie { + files: vec![( + "One Movie With Year (2018)/abcdef123456789.mkv".to_string(), + CompactMetadata { + bit_rate: 100000, + duration: 3600.0, + filename: "./One Movie With Year (2018)/abcdef123456789.mkv".to_string(), + format_name: "mkv".to_string(), + size: 2000000, + video: vec![VideoFormat { + short_name: "mpeg4".to_string(), + long_name: "MPEG-4 part 2".to_string(), + height: 362, + width: 660, + title: None, + language: None, + }], + audio: vec![AudioFormat { + short_name: "mp3".to_string(), + long_name: "MP3 (MPEG audio layer 3)".to_string(), + channels: 2, + channel_layout: "stereo".to_string(), + title: None, + language: None, + }], + subtitle: Vec::new(), + } + )] + }] + } + ); +} diff --git a/testdata/simple/metadata.compact.json b/testdata/simple/metadata.compact.json new file mode 100644 index 0000000..2757c99 --- /dev/null +++ b/testdata/simple/metadata.compact.json @@ -0,0 +1,26 @@ +{ + "One Movie With Year (2018)/abcdef123456789.mkv": { + "bit_rate": 100000, + "duration": 3600.0, + "filename": "./One Movie With Year (2018)/abcdef123456789.mkv", + "format_name": "mkv", + "size": 2000000, + "video": [ + { + "short_name": "mpeg4", + "long_name": "MPEG-4 part 2", + "height": 362, + "width": 660 + } + ], + "audio": [ + { + "short_name": "mp3", + "long_name": "MP3 (MPEG audio layer 3)", + "channels": 2, + "channel_layout": "stereo" + } + ], + "subtitle": [] + } +}