server: unescape RSS feeds that are HTML escaped

Bill Thiede 2024-08-03 11:29:20 -07:00
parent e0863ac085
commit 56bc1cf7ed
5 changed files with 73 additions and 14 deletions

Cargo.lock (generated, 16 changed lines)

@@ -1427,6 +1427,15 @@ dependencies = [
"windows-sys 0.52.0", "windows-sys 0.52.0",
] ]
+[[package]]
+name = "html-escape"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
+dependencies = [
+ "utf8-width",
+]
 [[package]]
 name = "html5ever"
 version = "0.26.0"
@@ -3168,6 +3177,7 @@ dependencies = [
"async-graphql-rocket", "async-graphql-rocket",
"css-inline", "css-inline",
"glog", "glog",
"html-escape",
"linkify", "linkify",
"log", "log",
"lol_html", "lol_html",
@@ -4139,6 +4149,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
[[package]] [[package]]
name = "uuid" name = "uuid"
version = "1.10.0" version = "1.10.0"

Cargo.toml

@@ -30,4 +30,5 @@ maplit = "1.0.2"
linkify = "0.10.0" linkify = "0.10.0"
sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] } sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
url = "2.5.2" url = "2.5.2"
html-escape = "0.2.13"


@@ -3,7 +3,7 @@ use std::{convert::Infallible, str::Utf8Error, string::FromUtf8Error};
 use mailparse::MailParseError;
 use thiserror::Error;
-use crate::SanitizeError;
+use crate::TransformError;
 #[derive(Error, Debug)]
 pub enum ServerError {
@@ -19,8 +19,8 @@ pub enum ServerError {
     PartNotFound,
     #[error("sqlx error: {0}")]
     SQLXError(#[from] sqlx::Error),
-    #[error("html sanitize error: {0}")]
-    SanitizeError(#[from] SanitizeError),
+    #[error("html transform error: {0}")]
+    TransformError(#[from] TransformError),
     #[error("UTF8 error: {0}")]
     Utf8Error(#[from] Utf8Error),
     #[error("FromUTF8 error: {0}")]


@@ -11,14 +11,49 @@ use maplit::{hashmap, hashset};
 use thiserror::Error;
 use url::Url;
+// TODO: figure out how to use Cow
+trait Transformer {
+    fn should_run(&self, input: &str) -> bool;
+    // TODO: should input be something like `html_escape` uses:
+    // <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
+    fn transform(&self, input: &str) -> Result<String, TransformError>;
+}
+// TODO: how would we make this more generic to allow good implementations of Transformer outside
+// of this module?
 #[derive(Error, Debug)]
-pub enum SanitizeError {
+pub enum TransformError {
     #[error("lol-html rewrite error")]
     RewritingError(#[from] RewritingError),
     #[error("css inline error")]
     InlineError(#[from] InlineError),
 }
+struct SanitizeHtml<'a> {
+    cid_prefix: &'a str,
+    base_url: &'a Option<Url>,
+}
+impl<'a> Transformer for SanitizeHtml<'a> {
+    fn should_run(&self, _input: &str) -> bool {
+        true
+    }
+    fn transform(&self, input: &str) -> Result<String, TransformError> {
+        Ok(sanitize_html(input, self.cid_prefix, self.base_url)?)
+    }
+}
+struct EscapeHtml;
+impl Transformer for EscapeHtml {
+    fn should_run(&self, input: &str) -> bool {
+        input.starts_with("&lt")
+    }
+    fn transform(&self, input: &str) -> Result<String, TransformError> {
+        Ok(html_escape::decode_html_entities(input).to_string())
+    }
+}
 pub fn linkify_html(text: &str) -> String {
     let mut finder = LinkFinder::new();
     let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);
@@ -51,7 +86,7 @@ pub fn sanitize_html(
     html: &str,
     cid_prefix: &str,
     base_url: &Option<Url>,
-) -> Result<String, SanitizeError> {
+) -> Result<String, TransformError> {
     let mut element_content_handlers = vec![
         // Open links in new tab
         element!("a[href]", |el| {
@@ -86,10 +121,7 @@ pub fn sanitize_html(
     element_content_handlers.extend(vec![
         // Make links with relative URLs absolute
         element!("a[href]", |el| {
-            if let Some(Ok(href)) = el.get_attribute("href").map(|href| {
-                info!("href {href:?}");
-                base_url.join(&href)
-            }) {
+            if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
                 el.set_attribute("href", &href.as_str()).unwrap();
             }
@@ -98,7 +130,6 @@ pub fn sanitize_html(
         // Make images with relative srcs absolute
         element!("img[src]", |el| {
             if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
-                info!("src {src:?}");
                 el.set_attribute("src", &src.as_str()).unwrap();
             }


@@ -14,7 +14,7 @@ const THREAD_PREFIX: &'static str = "news:";
 use crate::{
     error::ServerError,
     graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
+    sanitize_html, EscapeHtml, SanitizeHtml, Transformer,
 };
 pub fn is_newsreader_search(query: &str) -> bool {
@@ -207,13 +207,24 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
             }
         }
     });
-    let html = r.summary.unwrap_or("NO SUMMARY".to_string());
+    let mut html = r.summary.unwrap_or("NO SUMMARY".to_string());
     // TODO: add site specific cleanups. For example:
     // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
     // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let html = sanitize_html(&html, "", &link)?;
+    let tranformers: Vec<Box<dyn Transformer>> = vec![
+        Box::new(EscapeHtml),
+        Box::new(SanitizeHtml {
+            cid_prefix: "",
+            base_url: &link,
+        }),
+    ];
+    for t in tranformers.iter() {
+        if t.should_run(&html) {
+            html = t.transform(&html)?;
+        }
+    }
     let body = Body::Html(Html {
-        html,
+        html: html.to_string(),
         content_tree: "".to_string(),
     });
     let title = r.title.unwrap_or("NO TITLE".to_string());