Improve server-side HTML sanitization.

This commit is contained in:
Bill Thiede 2024-01-20 08:14:10 -08:00
parent 304819275d
commit 8c47f01758
4 changed files with 234 additions and 21 deletions

View File

@ -2,7 +2,7 @@
name = "server"
version = "0.1.0"
edition = "2021"
default-bin = "server"
default-run = "server"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@ -24,6 +24,9 @@ memmap = "0.7.0"
mailparse = "0.14.0"
ammonia = "3.3.0"
lol_html = "1.2.0"
css-inline = "0.13.0"
anyhow = "1.0.79"
maplit = "1.0.2"
[dependencies.rocket_contrib]
version = "0.4.11"

View File

@ -0,0 +1,16 @@
use std::fs;
use server::sanitize_html;
/// Command-line entry point: reads the file named by the first argument,
/// runs it through `sanitize_html`, and writes the result to the path given
/// as the second argument.
///
/// Panics if either argument is missing; I/O and sanitization failures are
/// returned as the process error.
fn main() -> anyhow::Result<()> {
    let mut cli = std::env::args().skip(1);
    let src = cli.next().expect("source not specified");
    let dst = cli.next().expect("destination not specified");
    println!("Sanitizing {src} into {dst}");

    // Read raw bytes so that invalid UTF-8 is replaced lossily instead of
    // aborting the run.
    let raw = fs::read(src)?;
    let sanitized = sanitize_html(&String::from_utf8_lossy(&raw))?;
    fs::write(dst, sanitized)?;
    Ok(())
}

View File

@ -11,12 +11,13 @@ use async_graphql::{
SimpleObject, Union,
};
use log::{error, info, warn};
use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail};
use memmap::MmapOptions;
use notmuch::Notmuch;
use rocket::time::Instant;
use crate::sanitize_html;
pub struct QueryRoot;
/// # Number of seconds since the Epoch
@ -190,25 +191,6 @@ struct Tag {
bg_color: String,
unread: usize,
}
/// Cleans `html` with ammonia's default policy and then rewrites every
/// `a[href]` element so the link opens in a new tab.
///
/// # Errors
///
/// Returns a [`RewritingError`] if lol_html fails while rewriting the cleaned
/// markup, including a failure reported by the element handler.
fn sanitize_html(html: &str) -> Result<String, RewritingError> {
    let element_content_handlers = vec![
        // Open links in new tab
        element!("a[href]", |el| {
            // Propagate attribute errors through the handler result rather
            // than panicking inside the rewriter callback.
            el.set_attribute("target", "_blank")?;
            Ok(())
        }),
    ];

    // TODO(wathiede): replace ammonia with more lol-html rules.
    let cleaned = ammonia::clean(html);
    let rewritten = rewrite_str(
        &cleaned,
        RewriteStrSettings {
            element_content_handlers,
            ..RewriteStrSettings::default()
        },
    )?;
    Ok(rewritten)
}
#[Object]
impl QueryRoot {

View File

@ -1,3 +1,215 @@
pub mod error;
pub mod graphql;
pub mod nm;
use css_inline::{CSSInliner, InlineError, InlineOptions};
use log::error;
use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
use maplit::{hashmap, hashset};
use thiserror::Error;
/// Error type returned by `sanitize_html`.
///
/// Each variant wraps the error of one underlying library; the `#[from]`
/// attributes let `?` convert those errors automatically.
#[derive(Error, Debug)]
pub enum SanitizeError {
    /// lol_html failed while rewriting the HTML.
    #[error("lol-html rewrite error")]
    RewritingError(#[from] RewritingError),
    /// css_inline failed while inlining stylesheets.
    // NOTE(review): the `sanitize_html` visible in this file logs inlining
    // failures and falls back to the original input, so it does not appear
    // to produce this variant itself — confirm whether other callers do.
    #[error("css inline error")]
    InlineError(#[from] InlineError),
}
/// Upgrades a leading `http:` scheme in `url` to `https:`.
///
/// Only the scheme at the very start of the URL (after optional leading
/// whitespace) is rewritten, and the comparison ignores ASCII case. Any other
/// input — including URLs that merely *contain* `http:` later on, such as in
/// a query string — is returned unchanged.
fn upgrade_to_https(url: &str) -> String {
    const INSECURE_SCHEME: &str = "http:";
    let trimmed = url.trim_start();
    match trimmed.get(..INSECURE_SCHEME.len()) {
        Some(scheme) if scheme.eq_ignore_ascii_case(INSECURE_SCHEME) => {
            // `get` succeeded, so the split point is a valid char boundary.
            format!("https:{}", &trimmed[INSECURE_SCHEME.len()..])
        }
        _ => url.to_string(),
    }
}

/// Sanitizes an HTML document for display.
///
/// The input goes through three stages:
///
/// 1. CSS from `<style>` tags is inlined into `style` attributes. This is
///    best effort: on failure the error is logged and the original markup is
///    used as-is.
/// 2. ammonia restricts the markup to the tag and attribute allow-lists
///    defined below.
/// 3. lol_html rewrites `a[href]` elements to open in a new tab and upgrades
///    `img[src]` URLs from `http:` to `https:`.
///
/// # Errors
///
/// Returns [`SanitizeError::RewritingError`] if the final lol_html rewrite
/// fails.
pub fn sanitize_html(html: &str) -> Result<String, SanitizeError> {
    let element_content_handlers = vec![
        // Open links in new tab
        element!("a[href]", |el| {
            el.set_attribute("target", "_blank")?;
            Ok(())
        }),
        // Only secure image URLs
        element!("img[src]", |el| {
            // The `img[src]` selector only matches elements that have `src`.
            let src = el.get_attribute("src").expect("src was required");
            el.set_attribute("src", &upgrade_to_https(&src))?;
            Ok(())
        }),
    ];

    let inline_opts = InlineOptions {
        inline_style_tags: true,
        keep_style_tags: false,
        keep_link_tags: false,
        base_url: None,
        load_remote_stylesheets: false,
        extra_css: None,
        preallocate_node_capacity: 32,
        ..InlineOptions::default()
    };
    // Inlining is best effort: fall back to the untouched input on failure.
    let inlined_html = match CSSInliner::new(inline_opts).inline(html) {
        Ok(inlined_html) => inlined_html,
        Err(err) => {
            error!("failed to inline CSS: {err}");
            html.to_string()
        }
    };

    // The defaults don't allow `style`, but we want to preserve it so the
    // inlined CSS survives cleaning.
    let attributes = hashset![
        "align", "bgcolor", "color", "height", "lang", "title", "width", "style",
    ];
    let tags = hashset![
        "a",
        "abbr",
        "acronym",
        "area",
        "article",
        "aside",
        "b",
        "bdi",
        "bdo",
        "blockquote",
        "br",
        "caption",
        "center",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "dd",
        "del",
        "details",
        "dfn",
        "div",
        "dl",
        "dt",
        "em",
        "figcaption",
        "figure",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "i",
        "img",
        "ins",
        "kbd",
        "li",
        "map",
        "mark",
        "nav",
        "ol",
        "p",
        "pre",
        "q",
        "rp",
        "rt",
        "rtc",
        "ruby",
        "s",
        "samp",
        "small",
        "span",
        "strike",
        "strong",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "th",
        "thead",
        "time",
        "title", // wathiede: added locally
        "tr",
        "tt",
        "u",
        "ul",
        "var",
        "wbr",
    ];
    let tag_attributes = hashmap![
        "a" => hashset!["href", "hreflang"],
        "bdo" => hashset!["dir"],
        "blockquote" => hashset!["cite"],
        "col" => hashset!["align", "char", "charoff", "span"],
        "colgroup" => hashset!["align", "char", "charoff", "span"],
        "del" => hashset!["cite", "datetime"],
        "hr" => hashset!["align", "size", "width"],
        "img" => hashset!["align", "alt", "height", "src", "width"],
        "ins" => hashset!["cite", "datetime"],
        "ol" => hashset!["start"],
        "q" => hashset!["cite"],
        "table" => hashset![
            "align", "border", "cellpadding", "cellspacing", "char", "charoff", "summary",
        ],
        "tbody" => hashset!["align", "char", "charoff"],
        "td" => hashset!["align", "char", "charoff", "colspan", "headers", "rowspan"],
        // NOTE(review): `tfoot` has attribute rules here but is absent from
        // `tags` above — confirm whether the tag itself should be allowed.
        "tfoot" => hashset!["align", "char", "charoff"],
        "th" => hashset![
            "align", "char", "charoff", "colspan", "headers", "rowspan", "scope",
        ],
        "thead" => hashset!["align", "char", "charoff"],
        "tr" => hashset!["align", "char", "charoff"],
    ];

    let clean_html = ammonia::Builder::default()
        .tags(tags)
        .tag_attributes(tag_attributes)
        .generic_attributes(attributes)
        .clean(&inlined_html)
        .to_string();

    // TODO(wathiede): replace ammonia with more lol-html rules.
    Ok(rewrite_str(
        &clean_html,
        RewriteStrSettings {
            element_content_handlers,
            ..RewriteStrSettings::default()
        },
    )?)
}