Normalize links and images based on post's URL

2024-07-22 11:27:15 -07:00 · 2024-07-22 11:27:15 -07:00 · 1106377550
commit 1106377550
parent b5468bced2
7 changed files with 89 additions and 15 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3183,6 +3183,7 @@ dependencies = [
 "sqlx",
 "thiserror",
 "tokio",
+ "url",
 "urlencoding",
 ]

--- a/server/Cargo.toml
+++ b/server/Cargo.toml
@ -29,4 +29,5 @@ anyhow = "1.0.79"
 maplit = "1.0.2"
 linkify = "0.10.0"
 sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
+url = "2.5.2"

--- a/server/sql/thread.sql
+++ b/server/sql/thread.sql
@ -5,7 +5,8 @@ SELECT
    site,
    summary,
    title,
-    name
+    name,
+    homepage
 FROM
    post p
    JOIN feed f ON p.site = f.slug
--- a/server/src/error.rs
+++ b/server/src/error.rs
@ -7,26 +7,28 @@ use crate::SanitizeError;

 #[derive(Error, Debug)]
 pub enum ServerError {
-    #[error("notmuch")]
+    #[error("notmuch: {0}")]
    NotmuchError(#[from] notmuch::NotmuchError),
    #[error("flatten")]
    FlattenError,
-    #[error("mail parse error")]
+    #[error("mail parse error: {0}")]
    MailParseError(#[from] MailParseError),
-    #[error("IO error")]
+    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
    #[error("attachement not found")]
    PartNotFound,
-    #[error("sqlx error")]
+    #[error("sqlx error: {0}")]
    SQLXError(#[from] sqlx::Error),
-    #[error("html sanitize error")]
+    #[error("html sanitize error: {0}")]
    SanitizeError(#[from] SanitizeError),
-    #[error("UTF8 error")]
+    #[error("UTF8 error: {0}")]
    Utf8Error(#[from] Utf8Error),
-    #[error("FromUTF8 error")]
+    #[error("FromUTF8 error: {0}")]
    FromUtf8Error(#[from] FromUtf8Error),
-    #[error("error")]
+    #[error("error: {0}")]
    StringError(String),
-    #[error("impossible")]
+    #[error("invalid url: {0}")]
+    UrlParseError(#[from] url::ParseError),
+    #[error("impossible: {0}")]
    InfaillibleError(#[from] Infallible),
 }
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@ -9,6 +9,7 @@ use log::error;
 use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
 use maplit::{hashmap, hashset};
 use thiserror::Error;
+use url::Url;

 #[derive(Error, Debug)]
 pub enum SanitizeError {
@ -46,7 +47,11 @@ pub fn linkify_html(text: &str) -> String {

 // html contains the content to be cleaned, and cid_prefix is used to resolve mixed part image
 // references
-pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeError> {
+pub fn sanitize_html(
+    html: &str,
+    cid_prefix: &str,
+    base_url: &Url,
+) -> Result<String, SanitizeError> {
    let element_content_handlers = vec![
        // Open links in new tab
        element!("a[href]", |el| {
@ -54,6 +59,22 @@ pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeErr

            Ok(())
        }),
+        // Make links with relative URLs absolute
+        element!("a[href]", |el| {
+            if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
+                el.set_attribute("href", &href.as_str()).unwrap();
+            }
+
+            Ok(())
+        }),
+        // Make images with relative srcs absolute
+        element!("img[src]", |el| {
+            if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
+                el.set_attribute("src", &src.as_str()).unwrap();
+            }
+
+            Ok(())
+        }),
        // Replace mixed part CID images with URL
        element!("img[src]", |el| {
            let src = el
--- a/server/src/newsreader.rs
+++ b/server/src/newsreader.rs
@ -7,6 +7,7 @@ use std::{
 use async_graphql::connection::{self, Connection, Edge};
 use log::info;
 use sqlx::postgres::PgPool;
+use url::Url;

 const TAG_PREFIX: &'static str = "News/";
 const THREAD_PREFIX: &'static str = "news:";
@ -14,6 +15,7 @@ const THREAD_PREFIX: &'static str = "news:";
 use crate::{
    error::ServerError,
    graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
+    sanitize_html,
 };

 pub fn is_newsreader_search(query: &str) -> bool {
@ -174,14 +176,54 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
    } else {
        vec!["unread".to_string(), site.clone()]
    };
+    let default_homepage = "http://no-homepage";
+    let homepage = Url::parse(
+        &r.homepage
+            .map(|h| {
+                if h.is_empty() {
+                    default_homepage.to_string()
+                } else {
+                    h
+                }
+            })
+            .unwrap_or(default_homepage.to_string()),
+    )?;
+    let link = Url::parse(
+        &r.link
+            .as_ref()
+            .map(|h| {
+                if h.is_empty() {
+                    default_homepage.to_string()
+                } else {
+                    h.to_string()
+                }
+            })
+            .unwrap_or(default_homepage.to_string()),
+    )?;
+    let addr = r.link.as_ref().map(|link| {
+        if link.contains('@') {
+            link.clone()
+        } else {
+            if let Ok(url) = homepage.join(&link) {
+                url.to_string()
+            } else {
+                link.clone()
+            }
+        }
+    });
+    let html = r.summary.unwrap_or("NO SUMMARY".to_string());
+    // TODO: add site specific cleanups. For example:
+    // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
+    // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolet
+    let html = sanitize_html(&html, "", &link)?;
    let body = Body::Html(Html {
-        html: r.summary.unwrap_or("NO SUMMARY".to_string()),
+        html,
        content_tree: "".to_string(),
    });
    let title = r.title.unwrap_or("NO TITLE".to_string());
    let from = Some(Email {
        name: r.name,
-        addr: r.link,
+        addr: addr.map(|a| a.to_string()),
    });
    Ok(Thread {
        thread_id,
--- a/server/src/nm.rs
+++ b/server/src/nm.rs
@ -10,6 +10,7 @@ use log::{error, info, warn};
 use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail};
 use memmap::MmapOptions;
 use notmuch::Notmuch;
+use url::Url;

 use crate::{
    error::ServerError,
@ -178,6 +179,7 @@ pub async fn thread(
            .get_first_value("date")
            .and_then(|d| mailparse::dateparse(&d).ok());
        let cid_prefix = shared::urls::cid_prefix(None, &id);
+        let base_url = Url::parse("https://there-should-be-no-relative-urls-in-email").unwrap();
        let body = match extract_body(&m, &id)? {
            Body::PlainText(PlainText { text, content_tree }) => {
                let text = if text.len() > MAX_RAW_MESSAGE_SIZE {
@ -196,7 +198,11 @@ pub async fn thread(
                        // Trim newlines to prevent excessive white space at the beginning/end of
                        // presentation. Leave tabs and spaces in case plain text attempts to center a
                        // header on the first line.
-                        sanitize_html(&linkify_html(&text.trim_matches('\n')), &cid_prefix)?
+                        sanitize_html(
+                            &linkify_html(&text.trim_matches('\n')),
+                            &cid_prefix,
+                            &base_url
+                        )?
                    ),
                    content_tree: if debug_content_tree {
                        render_content_type_tree(&m)
@ -206,7 +212,7 @@ pub async fn thread(
                })
            }
            Body::Html(Html { html, content_tree }) => Body::Html(Html {
-                html: sanitize_html(&html, &cid_prefix)?,
+                html: sanitize_html(&html, &cid_prefix, &base_url)?,
                content_tree: if debug_content_tree {
                    render_content_type_tree(&m)
                } else {