From 8c47f017583264e4ab8964cdd3c8be3bd70fa132 Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Sat, 20 Jan 2024 08:14:10 -0800 Subject: [PATCH] Improve server side html sanitization. --- server/Cargo.toml | 5 +- server/src/bin/cleanhtml.rs | 16 +++ server/src/graphql.rs | 22 +--- server/src/lib.rs | 212 ++++++++++++++++++++++++++++++++++++ 4 files changed, 234 insertions(+), 21 deletions(-) create mode 100644 server/src/bin/cleanhtml.rs diff --git a/server/Cargo.toml b/server/Cargo.toml index 016ef3c..da88b79 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -2,7 +2,7 @@ name = "server" version = "0.1.0" edition = "2021" -default-bin = "server" +default-run = "server" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -24,6 +24,9 @@ memmap = "0.7.0" mailparse = "0.14.0" ammonia = "3.3.0" lol_html = "1.2.0" +css-inline = "0.13.0" +anyhow = "1.0.79" +maplit = "1.0.2" [dependencies.rocket_contrib] version = "0.4.11" diff --git a/server/src/bin/cleanhtml.rs b/server/src/bin/cleanhtml.rs new file mode 100644 index 0000000..6cbc89a --- /dev/null +++ b/server/src/bin/cleanhtml.rs @@ -0,0 +1,16 @@ +use std::fs; + +use server::sanitize_html; + +fn main() -> anyhow::Result<()> { + let mut args = std::env::args().skip(1); + let src = args.next().expect("source not specified"); + let dst = args.next().expect("destination not specified"); + println!("Sanitizing {src} into {dst}"); + let bytes = fs::read(src)?; + let html = String::from_utf8_lossy(&bytes); + let html = sanitize_html(&html)?; + fs::write(dst, html)?; + + Ok(()) +} diff --git a/server/src/graphql.rs b/server/src/graphql.rs index 8a33a72..55a4e14 100644 --- a/server/src/graphql.rs +++ b/server/src/graphql.rs @@ -11,12 +11,13 @@ use async_graphql::{ SimpleObject, Union, }; use log::{error, info, warn}; -use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings}; use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail}; use 
memmap::MmapOptions; use notmuch::Notmuch; use rocket::time::Instant; +use crate::sanitize_html; + pub struct QueryRoot; /// # Number of seconds since the Epoch @@ -190,25 +191,6 @@ struct Tag { bg_color: String, unread: usize, } -fn sanitize_html(html: &str) -> Result<String, RewritingError> { - let element_content_handlers = vec![ - // Open links in new tab - element!("a[href]", |el| { - el.set_attribute("target", "_blank").unwrap(); - - Ok(()) - }), - ]; - - Ok(rewrite_str( - // TODO(wathiede): replace ammonia with more lol-html rules. - &ammonia::clean(&html), - RewriteStrSettings { - element_content_handlers, - ..RewriteStrSettings::default() - }, - )?) -} #[Object] impl QueryRoot { diff --git a/server/src/lib.rs b/server/src/lib.rs index c2c4236..9ec4903 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -1,3 +1,215 @@ pub mod error; pub mod graphql; pub mod nm; + +use css_inline::{CSSInliner, InlineError, InlineOptions}; +use log::error; +use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings}; +use maplit::{hashmap, hashset}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum SanitizeError { + #[error("lol-html rewrite error")] + RewritingError(#[from] RewritingError), + #[error("css inline error")] + InlineError(#[from] InlineError), +} + +pub fn sanitize_html(html: &str) -> Result<String, SanitizeError> { + let element_content_handlers = vec![ + // Open links in new tab + element!("a[href]", |el| { + el.set_attribute("target", "_blank").unwrap(); + + Ok(()) + }), + // Only secure image URLs + element!("img[src]", |el| { + let src = el + .get_attribute("src") + .expect("src was required") + .replace("http:", "https:"); + + el.set_attribute("src", &src)?; + + Ok(()) + }), + ]; + + let inline_opts = InlineOptions { + inline_style_tags: true, + keep_style_tags: false, + keep_link_tags: false, + base_url: None, + load_remote_stylesheets: false, + extra_css: None, + preallocate_node_capacity: 32, + ..InlineOptions::default() + }; + + let inlined_html = match 
CSSInliner::new(inline_opts).inline(&html) { + Ok(inlined_html) => inlined_html, + Err(err) => { + error!("failed to inline CSS: {err}"); + html.to_string() + } + }; + // Ammonia's defaults don't allow the style attribute, but we want to preserve it. + let attributes = + hashset!["align", "bgcolor", "color", "height", "lang", "title", "width", "style",]; + + let tags = hashset![ + "a", + "abbr", + "acronym", + "area", + "article", + "aside", + "b", + "bdi", + "bdo", + "blockquote", + "br", + "caption", + "center", + "cite", + "code", + "col", + "colgroup", + "data", + "dd", + "del", + "details", + "dfn", + "div", + "dl", + "dt", + "em", + "figcaption", + "figure", + "footer", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "header", + "hgroup", + "hr", + "i", + "img", + "ins", + "kbd", + "kbd", + "li", + "map", + "mark", + "nav", + "ol", + "p", + "pre", + "q", + "rp", + "rt", + "rtc", + "ruby", + "s", + "samp", + "small", + "span", + "strike", + "strong", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "th", + "thead", + "time", + "title", // wathiede + "tr", + "tt", + "u", + "ul", + "var", + "wbr", + ]; + let tag_attributes = hashmap![ + "a" => hashset![ + "href", "hreflang" + ], + "bdo" => hashset![ + "dir" + ], + "blockquote" => hashset![ + "cite" + ], + "col" => hashset![ + "align", "char", "charoff", "span" + ], + "colgroup" => hashset![ + "align", "char", "charoff", "span" + ], + "del" => hashset![ + "cite", "datetime" + ], + "hr" => hashset![ + "align", "size", "width" + ], + "img" => hashset![ + "align", "alt", "height", "src", "width" + ], + "ins" => hashset![ + "cite", "datetime" + ], + "ol" => hashset![ + "start" + ], + "q" => hashset![ + "cite" + ], + "table" => hashset![ + "align", "border", "cellpadding", "cellspacing", "char", "charoff", "summary", + ], + "tbody" => hashset![ + "align", "char", "charoff" + ], + "td" => hashset![ + "align", "char", "charoff", "colspan", "headers", "rowspan" + ], + "tfoot" => hashset![ + "align", "char", "charoff" + ], + 
"th" => hashset![ + "align", "char", "charoff", "colspan", "headers", "rowspan", "scope" + ], + "thead" => hashset![ + "align", "char", "charoff" + ], + "tr" => hashset![ + "align", "char", "charoff" + ], + ]; + + let clean_html = ammonia::Builder::default() + .tags(tags) + .tag_attributes(tag_attributes) + .generic_attributes(attributes) + .clean(&inlined_html) + .to_string(); + //let clean_html = inlined_html; + + Ok(rewrite_str( + // TODO(wathiede): replace ammonia with more lol-html rules. + // &ammonia::clean(&html), + &clean_html, + RewriteStrSettings { + element_content_handlers, + ..RewriteStrSettings::default() + }, + )?) +}