// superdeduper/src/lib.rs
use std::cmp::Ordering;
use std::collections::HashMap;
use std::env;
use std::ffi::OsStr;
use std::fmt;
use std::fmt::Display;
use std::fmt::Formatter;
use std::fs::File;
use std::io::BufReader;
use std::io::BufWriter;
use std::path::Path;
use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use failure::bail;
use failure::Error;
use failure::ResultExt;
use glob::glob;
use lazy_static::lazy_static;
use log::error;
use log::info;
use rayon::iter::ParallelBridge;
use rayon::prelude::ParallelIterator;
use regex::Regex;
use serde::de;
use serde::de::Deserializer;
use serde::Deserialize;
use serde::Serialize;
use serde_json::Value;
/// On-disk cache of raw `ffprobe` JSON for every video, keyed by path relative to the library root.
const FULL_METADATA_FILENAME: &str = "metadata.json";
/// Distilled per-video metadata (see `CompactMetadata`), derived from the full metadata file.
const COMPACT_METADATA_FILENAME: &str = "metadata.compact.json";
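/// Returns the parent directory of `path`; panics if the path has no parent.
///
/// A minimal usage sketch with a hypothetical path (the doctest assumes this crate is named
/// `superdeduper`, as the file path suggests):
///
/// ```
/// use std::path::PathBuf;
/// use superdeduper::clean_path_parent;
///
/// assert_eq!(
///     clean_path_parent("Heat (1995)/Heat.mkv"),
///     PathBuf::from("Heat (1995)")
/// );
/// ```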
pub fn clean_path_parent<P: AsRef<Path>>(path: P) -> PathBuf {
// `Path::parent` already returns the directory; no lossy round-trip through `&str`
// (which would panic on non-UTF-8 paths) is needed.
path.as_ref().parent().unwrap().to_path_buf()
}
/// A video resolution as `(width, height)`, ordered by total pixel count.
#[derive(Clone, Deserialize, Debug, Eq, PartialEq, Serialize)]
pub struct Resolution(usize, usize);
impl From<(usize, usize)> for Resolution {
fn from(res: (usize, usize)) -> Self {
Resolution(res.0, res.1)
}
}
impl Display for Resolution {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
let v = format!("{}x{}", self.0, self.1);
f.pad(&v)
}
}
impl Ord for Resolution {
// Order by total pixel count, saturating on overflow. Two different shapes with the same
// pixel count compare as equal here, even though the derived (field-wise) `PartialEq`
// treats them as distinct.
fn cmp(&self, other: &Resolution) -> Ordering {
let pixels = self.0.checked_mul(self.1).unwrap_or(usize::max_value());
let other_pixels = other.0.checked_mul(other.1).unwrap_or(usize::max_value());
pixels.cmp(&other_pixels)
}
}
impl PartialOrd for Resolution {
fn partial_cmp(&self, other: &Resolution) -> Option<Ordering> {
Some(self.cmp(other))
}
}
// ffprobe emits most numeric fields as JSON strings; these helpers parse them during
// deserialization.
fn option_from_str<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
where
T: FromStr,
T::Err: Display,
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
T::from_str(&s).map(Some).map_err(de::Error::custom)
}
fn from_str<'de, T, D>(deserializer: D) -> Result<T, D::Error>
where
T: FromStr,
T::Err: Display,
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
T::from_str(&s).map_err(de::Error::custom)
}
// Group file names by their "digit-blind" form: every digit becomes `#`, so a multi-disc set
// like "Part 1" / "Part 2" collapses to a single key.
fn collapse_multidisc(names: &Vec<String>) -> HashMap<String, Vec<String>> {
lazy_static! {
static ref DIGIT: Regex = Regex::new("[0-9]").unwrap();
}
let mut set = HashMap::new();
for name in names {
let clean = DIGIT.replace_all(&name, "#").to_string();
set.entry(clean)
.or_insert(Vec::new())
.push(name.to_string());
}
set
}
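/// Returns true when every name in `names` differs only by digits, i.e. the files look like one
/// multi-disc set ("Part 1", "Part 2", ...). An empty list is not considered multi-disc.
///
/// A small sketch of the behaviour (doctest assumes the crate name `superdeduper`):
///
/// ```
/// use superdeduper::is_multidisc;
///
/// assert!(is_multidisc(&vec![
///     "Movie Part 1.avi".to_string(),
///     "Movie Part 2.avi".to_string(),
/// ]));
/// assert!(!is_multidisc(&vec![
///     "Movie.avi".to_string(),
///     "Other Movie.avi".to_string(),
/// ]));
/// ```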
pub fn is_multidisc(names: &Vec<String>) -> bool {
let set = collapse_multidisc(names);
set.len() == 1
}
#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)]
struct Format {
#[serde(deserialize_with = "from_str")]
bit_rate: usize,
#[serde(deserialize_with = "from_str")]
duration: f32,
filename: String,
format_name: String,
#[serde(deserialize_with = "from_str")]
size: usize,
}
#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)]
pub struct Tags(HashMap<String, String>);
impl Tags {
fn title(&self) -> Option<String> {
self.0.get("title").map(|s| s.to_string())
}
fn language(&self) -> Option<String> {
self.0.get("language").map(|s| s.to_string())
}
}
// One ffprobe stream entry; serde uses the JSON "codec_type" field to select the variant.
#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)]
#[serde(tag = "codec_type")]
enum Stream {
#[serde(rename = "video")]
Video {
#[serde(default, deserialize_with = "option_from_str")]
#[serde(skip_serializing_if = "Option::is_none")]
bit_rate: Option<usize>,
codec_name: String,
codec_long_name: String,
coded_height: usize,
coded_width: usize,
display_aspect_ratio: Option<String>,
#[serde(default, deserialize_with = "from_str")]
duration: f32,
height: usize,
width: usize,
tags: Option<Tags>,
},
#[serde(rename = "audio")]
Audio {
codec_name: String,
codec_long_name: String,
channels: usize,
channel_layout: Option<String>,
tags: Option<Tags>,
},
#[serde(rename = "subtitle")]
Subtitle {
codec_name: String,
codec_long_name: String,
tags: Option<Tags>,
},
#[serde(rename = "attachment")]
Attachment {},
#[serde(rename = "data")]
Data {},
}
#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)]
struct Metadata {
format: Format,
streams: Vec<Stream>,
}
#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)]
pub struct VideoFormat {
short_name: String,
long_name: String,
height: usize,
width: usize,
#[serde(skip_serializing_if = "Option::is_none")]
title: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
language: Option<String>,
}
#[cfg(test)]
impl Default for VideoFormat {
fn default() -> Self {
VideoFormat {
short_name: "UNNAMED_SHORT".to_string(),
long_name: "UNNAMED_LONG".to_string(),
height: 0,
width: 0,
title: None,
language: None,
}
}
}
#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)]
pub struct AudioFormat {
short_name: String,
long_name: String,
channels: usize,
#[serde(skip_serializing_if = "Option::is_none")]
channel_layout: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
title: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
language: Option<String>,
}
#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)]
pub struct SubtitleFormat {
short_name: String,
long_name: String,
#[serde(skip_serializing_if = "Option::is_none")]
title: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
language: Option<String>,
}
/// A distilled view of one file's `ffprobe` output: container-level fields plus per-stream summaries.
#[derive(Clone, Deserialize, Debug, PartialEq, Serialize)]
pub struct CompactMetadata {
pub bit_rate: usize,
pub duration: f32,
filename: String,
format_name: String,
pub size: usize,
video: Vec<VideoFormat>,
audio: Vec<AudioFormat>,
subtitle: Vec<SubtitleFormat>,
}
impl CompactMetadata {
/// The resolution of the video stream with the most pixels, or `None` if the file has no
/// video streams.
pub fn largest_dimension(&self) -> Option<Resolution> {
if self.video.is_empty() {
return None;
}
Some(self.video.iter().fold(
Resolution(0, 0),
|acc, VideoFormat { width, height, .. }| {
if acc.0 * acc.1 < width * height {
Resolution(*width, *height)
} else {
acc
}
},
))
}
}
#[cfg(test)]
impl Default for CompactMetadata {
fn default() -> Self {
CompactMetadata {
bit_rate: 0,
duration: 0.,
filename: "UNSET".to_string(),
format_name: "UNKNOWN".to_string(),
size: 0,
video: Vec::new(),
audio: Vec::new(),
subtitle: Vec::new(),
}
}
}
#[derive(Deserialize, Debug, PartialEq, Serialize)]
pub struct MetadataFile {
#[serde(flatten)]
metadata: HashMap<String, Metadata>,
}
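/// A movie collection rooted at a directory on disk.
///
/// A rough end-to-end sketch with a hypothetical root path; marked `no_run` because
/// `update_metadata` shells out to `ffprobe` and touches the filesystem:
///
/// ```no_run
/// use superdeduper::MovieLibrary;
///
/// let lib = MovieLibrary::new("/srv/movies");
/// lib.update_metadata().unwrap(); // probe new files into metadata.json
/// lib.compact_metadata().unwrap(); // distill that into metadata.compact.json
/// for (keep, remove) in lib.movies().unwrap().duplicate_candidates() {
///     println!("keep {} (would drop {} copies)", keep, remove.len());
/// }
/// ```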
#[derive(PartialEq, Debug)]
pub struct MovieLibrary {
root: String,
}
/// Runs `ffprobe` on `path` and returns its JSON output as a string.
fn json_metadata_for_path<P: AsRef<Path> + AsRef<OsStr>>(path: P) -> Result<String, Error> {
let mut cmd = Command::new("ffprobe");
cmd.args(&[
"-v",
"quiet",
"-print_format",
"json",
"-show_format",
"-show_error",
"-show_streams",
"-i",
])
.arg(Path::new("./").join(path));
info!(target: "json", "cmd {:?}", cmd);
let output = cmd.output()?;
if output.status.success() {
return Ok(String::from_utf8(output.stdout)?);
}
bail!(
"{:?} exit status {}:\nSTDOUT: {}\nSTDERR: {}",
cmd,
output.status,
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
)
}
lazy_static! {
static ref MOVIE_EXTS: Vec<&'static str> = vec!["avi", "m4v", "mkv", "mov", "mp4"];
}
/// One logical movie: a single file, or several files when a multi-disc set was collapsed,
/// each paired with its compact metadata.
#[derive(Default, Debug, PartialEq)]
pub struct Movie {
pub files: Vec<(String, CompactMetadata)>,
}
impl Movie {
/// The lowest bit rate across this movie's files, or `None` if there are no files.
fn min_bit_rate(&self) -> Option<usize> {
self.files.iter().map(|(_, cmd)| cmd.bit_rate).min()
}
/// The smallest (by pixel count) of each file's largest video resolution, or `None` if
/// there are no files. Panics if any file has no video stream.
fn min_resolution(&self) -> Option<Resolution> {
self.files
.iter()
.map(|(_, cmd)| cmd.largest_dimension().unwrap())
.min()
}
}
impl Display for Movie {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
let p = &self.files.first().unwrap().0;
write!(f, "{}", &p[..p.find("/").unwrap()])?;
for (path, cmd) in &self.files {
write!(
f,
" {} {}",
&path[path.rfind("/").unwrap()..],
cmd.largest_dimension().unwrap()
)?;
}
Ok(())
}
}
#[derive(Debug, PartialEq)]
pub struct Movies {
movies: Vec<Movie>,
}
impl Movies {
/// Finds all movies that exist in multiple copies. The returned vec contains tuples of
/// (movie to keep, one or more movies to remove); the copy with the highest bit rate is kept.
/// Movies whose directory names carry different years are treated as distinct movies.
/// If a yearless copy exists alongside one or more dated copies of the same title, the
/// yearless copy is marked for removal. See `duplicate_candidates_keeps_highest_bit_rate`
/// in the tests module below for a small sketch.
pub fn duplicate_candidates(&self) -> Vec<(&Movie, Vec<&Movie>)> {
let date_re = Regex::new(r"\(\d{4}\)$").unwrap();
let mut movie_counter = HashMap::new();
let mut movies_without_date_counter = HashMap::new();
for m in &self.movies {
let (path, _cmd) = m.files.first().unwrap();
let parent = clean_path_parent(path)
.to_string_lossy()
.to_ascii_lowercase()
.to_string();
if date_re.is_match(&parent) {
movie_counter.entry(parent).or_insert(Vec::new()).push(m);
} else {
movies_without_date_counter
.entry(parent)
.or_insert(Vec::new())
.push(m);
}
}
let mut dupes: Vec<(&Movie, Vec<&Movie>)> = Vec::new();
for (parent, movies) in movie_counter.iter() {
// Strip the trailing year, assuming a " (YYYY)" suffix: seven characters including the leading space.
let dateless_parent = parent[..parent.len() - 7].to_string();
if let Some(movie) = movies_without_date_counter.remove(&dateless_parent) {
let tuple = (movies[0], movie);
dupes.push(tuple);
}
}
for (_parent, mut movies) in movie_counter.into_iter() {
if movies.len() > 1 {
// Sort, lowest bit_rate movie first
movies.sort_by(|a, b| a.min_bit_rate().cmp(&b.min_bit_rate()));
// Flip order, we care about the largest.
movies.reverse();
// Take the largest image, return the rest for removal.
let tuple = (movies.remove(0), movies);
dupes.push(tuple);
}
}
// Sort to make this function deterministic.
dupes.sort_by(|(a_keep, _), (b_keep, _)| {
a_keep
.files
.first()
.unwrap()
.0
.partial_cmp(&b_keep.files.first().unwrap().0)
.unwrap()
});
dupes
}
}
// Group per-file compact metadata into `Movie`s, collapsing multi-disc file sets into one entry.
fn movies_from_paths_compact_metadata(mut p_cmd: HashMap<String, CompactMetadata>) -> Movies {
let multidisc = collapse_multidisc(&p_cmd.keys().map(|s| s.to_string()).collect());
let movies = multidisc
.into_iter()
.map(|(_hash, names)| {
let mut files: Vec<(String, CompactMetadata)> = names
.iter()
.map(|name| (name.to_string(), p_cmd.remove(name).unwrap()))
.collect();
files.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
Movie { files }
})
.collect();
let mut m = Movies { movies };
m.movies.sort_by(|a, b| {
a.files
.first()
.unwrap()
.0
.partial_cmp(&b.files.first().unwrap().0)
.unwrap()
});
m
}
impl MovieLibrary {
pub fn new<S: Into<String>>(root: S) -> MovieLibrary {
MovieLibrary { root: root.into() }
}
/// Distills `metadata.json` into `metadata.compact.json`, keeping only the fields this tool needs.
pub fn compact_metadata(&self) -> Result<(), Error> {
let path = Path::new(&self.root).join(FULL_METADATA_FILENAME);
// Open the file in read-only mode with buffer.
let f = File::open(&path).context(format!("open {}", path.display()))?;
let r = BufReader::new(f);
let mdf: MetadataFile = serde_json::from_reader(r)
.context(format!("serde_json::from_reader {}", path.display()))?;
info!("Read metadata, {} videos found", mdf.metadata.len());
let metadata: HashMap<String, CompactMetadata> = mdf
.metadata
.into_iter()
.map(|(path, md)| {
let video = md
.streams
.iter()
.filter_map(|s| {
if let Stream::Video {
codec_name,
codec_long_name,
height,
width,
tags,
..
} = s
{
Some(VideoFormat {
short_name: codec_name.to_string(),
long_name: codec_long_name.to_string(),
height: *height,
width: *width,
title: tags.as_ref().and_then(|t| t.title()),
language: tags.as_ref().and_then(|t| t.language()),
})
} else {
None
}
})
.collect();
let audio = md
.streams
.iter()
.filter_map(|s| {
if let Stream::Audio {
codec_name,
codec_long_name,
channels,
channel_layout,
tags,
..
} = s
{
Some(AudioFormat {
short_name: codec_name.to_string(),
long_name: codec_long_name.to_string(),
channels: *channels,
channel_layout: channel_layout.clone(),
title: tags.as_ref().and_then(|t| t.title()),
language: tags.as_ref().and_then(|t| t.language()),
})
} else {
None
}
})
.collect();
let subtitle = md
.streams
.iter()
.filter_map(|s| {
if let Stream::Subtitle {
codec_name,
codec_long_name,
tags,
..
} = s
{
Some(SubtitleFormat {
short_name: codec_name.to_string(),
long_name: codec_long_name.to_string(),
title: tags.as_ref().and_then(|t| t.title()),
language: tags.as_ref().and_then(|t| t.language()),
})
} else {
None
}
})
.collect();
(
path,
CompactMetadata {
bit_rate: md.format.bit_rate,
duration: md.format.duration,
filename: md.format.filename,
format_name: md.format.format_name,
size: md.format.size,
video,
audio,
subtitle,
},
)
})
.collect();
let f = File::create(Path::new(&self.root).join(COMPACT_METADATA_FILENAME))?;
let f = BufWriter::new(f);
Ok(serde_json::ser::to_writer_pretty(f, &metadata)?)
}
/// Probes new video files with `ffprobe`, drops entries for files that no longer exist,
/// rewrites `metadata.json`, and returns the paths of the newly added videos.
pub fn update_metadata(&self) -> Result<Vec<String>, Error> {
let path = Path::new(&self.root).join(FULL_METADATA_FILENAME);
let mut old_metadata: HashMap<String, Value> = match File::open(&path) {
Ok(f) => {
let r = BufReader::new(f);
serde_json::from_reader(r)?
}
Err(e) => {
error!("Failed to open {}: {}", path.display(), e);
HashMap::new()
}
};
info!("Read metadata, {} videos found", old_metadata.len());
// Filter out stale metadata (where the file no longer exists).
let old_metadata: HashMap<String, Value> = self
.iter_video_files()
.filter(|r| r.is_ok())
.filter_map(|r| {
let path = r
.as_ref()
.unwrap()
.strip_prefix(&self.root)
.unwrap()
.to_str()
.unwrap()
.to_owned();
match old_metadata.remove(&path) {
Some(v) => Some((path, v)),
None => None,
}
})
.collect();
info!(
"After removing stale metadata, {} videos found",
old_metadata.len()
);
let mut metadata: HashMap<_, _> = self
.iter_video_files()
.filter(|r| r.is_ok())
.filter(|r| {
let path = r
.as_ref()
.unwrap()
.strip_prefix(&self.root)
.unwrap()
.to_str()
.unwrap()
.to_owned();
!old_metadata.contains_key(&path)
})
.par_bridge()
.filter_map(move |path| {
env::set_current_dir(&self.root).unwrap();
let path: PathBuf = path.unwrap().into();
let path = path.strip_prefix(&self.root).unwrap();
match json_metadata_for_path(&path) {
Ok(json) => {
info!("{}", path.display());
Some((path.to_string_lossy().into_owned(), json))
}
Err(e) => {
error!("{}", e);
None
}
}
})
.map(|(path, json)| (path, serde_json::from_str::<Value>(&json).unwrap()))
.collect();
let new_videos = metadata.keys().cloned().collect();
info!("Adding {} new videos", metadata.len());
metadata.extend(old_metadata);
let f = File::create(Path::new(&self.root).join(FULL_METADATA_FILENAME))?;
let f = BufWriter::new(f);
serde_json::ser::to_writer_pretty(f, &metadata)?;
Ok(new_videos)
}
/// Globs `<root>/*/*` and yields only paths whose extension (case-insensitively) is a known
/// movie type.
fn iter_video_files(&self) -> impl Send + Iterator<Item = Result<PathBuf, glob::GlobError>> {
glob(&format!("{}/*/*", self.root)).unwrap().filter(|path| {
let path = path.as_ref().unwrap();
match path.extension().and_then(OsStr::to_str) {
Some(ext) => MOVIE_EXTS.contains(&ext.to_lowercase().as_str()),
None => false,
}
})
}
/// Loads `metadata.compact.json` and groups the entries into `Movies`.
pub fn movies(&self) -> Result<Movies, Error> {
let path = Path::new(&self.root).join(COMPACT_METADATA_FILENAME);
let f = File::open(&path).context(format!("open {}", path.display()))?;
let r = BufReader::new(f);
let p_cmd: HashMap<String, CompactMetadata> = serde_json::from_reader(r)
.context(format!("serde_json::from_reader {}", path.display()))?;
Ok(movies_from_paths_compact_metadata(p_cmd))
}
/// Loads `metadata.compact.json` as a flat path-to-`CompactMetadata` map.
pub fn videos(&self) -> Result<HashMap<String, CompactMetadata>, Error> {
let path = Path::new(&self.root).join(COMPACT_METADATA_FILENAME);
let f = File::open(&path).context(format!("open {}", path.display()))?;
let r = BufReader::new(f);
Ok(serde_json::from_reader(r)
.context(format!("serde_json::from_reader {}", path.display()))?)
}
}
#[cfg(test)]
mod movielibrary_test;
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn largest_dimension() {
let md = CompactMetadata {
..Default::default()
};
assert_eq!(md.largest_dimension(), None);
let md = CompactMetadata {
video: vec![
VideoFormat {
height: 3,
width: 4,
..Default::default()
},
VideoFormat {
width: 640,
height: 480,
..Default::default()
},
],
..Default::default()
};
assert_eq!(md.largest_dimension(), Some(Resolution(640, 480)));
let md = CompactMetadata {
video: vec![
VideoFormat {
width: 640,
height: 480,
..Default::default()
},
VideoFormat {
height: 3,
width: 4,
..Default::default()
},
],
..Default::default()
};
assert_eq!(md.largest_dimension(), Some(Resolution(640, 480)));
}
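// A small sketch of how `Resolution` ordering behaves: comparisons go by total pixel count,
// so 640x480 outranks 720x400 even though 720 > 640, and two different shapes with the same
// pixel count compare as equal under `Ord` (unlike the derived, field-wise `PartialEq`).
#[test]
fn resolution_ordering() {
// 307,200 pixels vs 288,000 pixels.
assert!(Resolution(640, 480) > Resolution(720, 400));
assert_eq!(
Resolution(100, 200).cmp(&Resolution(200, 100)),
std::cmp::Ordering::Equal
);
assert_eq!(format!("{}", Resolution(1920, 1080)), "1920x1080");
}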
#[test]
fn test_multidisc() {
// Empty set is not a multidisc set.
assert!(!is_multidisc(&vec![]));
assert!(is_multidisc(&vec![
"Unbearable.Lightness.Of.Being Part 1.avi".to_string(),
"Unbearable.Lightness.Of.Being Part 2.avi".to_string(),
"Unbearable.Lightness.Of.Being Part 3.avi".to_string(),
]));
assert!(!is_multidisc(&vec![
"Scent Of A Woman 1992 DvDrip[Eng]-greenbud1969.avi".to_string(),
"Scent.Of.A.Woman.1992.1080p.BluRay.x264.AC3.mp4".to_string(),
]));
}
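// A sketch of `duplicate_candidates` on hypothetical data: two copies of the same dated title,
// where the higher bit-rate copy is kept and the lower one is returned for removal.
#[test]
fn duplicate_candidates_keeps_highest_bit_rate() {
let low = Movie {
files: vec![(
"Heat (1995)/Heat.avi".to_string(),
CompactMetadata {
bit_rate: 1_000,
..Default::default()
},
)],
};
let high = Movie {
files: vec![(
"Heat (1995)/Heat.mkv".to_string(),
CompactMetadata {
bit_rate: 5_000,
..Default::default()
},
)],
};
let movies = Movies {
movies: vec![low, high],
};
let dupes = movies.duplicate_candidates();
assert_eq!(dupes.len(), 1);
assert_eq!(dupes[0].0.files[0].0, "Heat (1995)/Heat.mkv");
assert_eq!(dupes[0].1.len(), 1);
assert_eq!(dupes[0].1[0].files[0].0, "Heat (1995)/Heat.avi");
}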
}