Normalize links and images based on post's URL

This commit is contained in:
Bill Thiede 2024-07-22 11:27:15 -07:00
parent b5468bced2
commit 1106377550
7 changed files with 89 additions and 15 deletions

1
Cargo.lock generated
View File

@ -3183,6 +3183,7 @@ dependencies = [
"sqlx", "sqlx",
"thiserror", "thiserror",
"tokio", "tokio",
"url",
"urlencoding", "urlencoding",
] ]

View File

@ -29,4 +29,5 @@ anyhow = "1.0.79"
maplit = "1.0.2" maplit = "1.0.2"
linkify = "0.10.0" linkify = "0.10.0"
sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] } sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
url = "2.5.2"

View File

@ -5,7 +5,8 @@ SELECT
site, site,
summary, summary,
title, title,
name name,
homepage
FROM FROM
post p post p
JOIN feed f ON p.site = f.slug JOIN feed f ON p.site = f.slug

View File

@ -7,26 +7,28 @@ use crate::SanitizeError;
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum ServerError { pub enum ServerError {
#[error("notmuch")] #[error("notmuch: {0}")]
NotmuchError(#[from] notmuch::NotmuchError), NotmuchError(#[from] notmuch::NotmuchError),
#[error("flatten")] #[error("flatten")]
FlattenError, FlattenError,
#[error("mail parse error")] #[error("mail parse error: {0}")]
MailParseError(#[from] MailParseError), MailParseError(#[from] MailParseError),
#[error("IO error")] #[error("IO error: {0}")]
IoError(#[from] std::io::Error), IoError(#[from] std::io::Error),
#[error("attachement not found")] #[error("attachement not found")]
PartNotFound, PartNotFound,
#[error("sqlx error")] #[error("sqlx error: {0}")]
SQLXError(#[from] sqlx::Error), SQLXError(#[from] sqlx::Error),
#[error("html sanitize error")] #[error("html sanitize error: {0}")]
SanitizeError(#[from] SanitizeError), SanitizeError(#[from] SanitizeError),
#[error("UTF8 error")] #[error("UTF8 error: {0}")]
Utf8Error(#[from] Utf8Error), Utf8Error(#[from] Utf8Error),
#[error("FromUTF8 error")] #[error("FromUTF8 error: {0}")]
FromUtf8Error(#[from] FromUtf8Error), FromUtf8Error(#[from] FromUtf8Error),
#[error("error")] #[error("error: {0}")]
StringError(String), StringError(String),
#[error("impossible")] #[error("invalid url: {0}")]
UrlParseError(#[from] url::ParseError),
#[error("impossible: {0}")]
InfaillibleError(#[from] Infallible), InfaillibleError(#[from] Infallible),
} }

View File

@ -9,6 +9,7 @@ use log::error;
use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings}; use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
use maplit::{hashmap, hashset}; use maplit::{hashmap, hashset};
use thiserror::Error; use thiserror::Error;
use url::Url;
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum SanitizeError { pub enum SanitizeError {
@ -46,7 +47,11 @@ pub fn linkify_html(text: &str) -> String {
// html contains the content to be cleaned, and cid_prefix is used to resolve mixed part image // html contains the content to be cleaned, and cid_prefix is used to resolve mixed part image
// references // references
pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeError> { pub fn sanitize_html(
html: &str,
cid_prefix: &str,
base_url: &Url,
) -> Result<String, SanitizeError> {
let element_content_handlers = vec![ let element_content_handlers = vec![
// Open links in new tab // Open links in new tab
element!("a[href]", |el| { element!("a[href]", |el| {
@ -54,6 +59,22 @@ pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeErr
Ok(()) Ok(())
}), }),
// Make links with relative URLs absolute
element!("a[href]", |el| {
if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
el.set_attribute("href", &href.as_str()).unwrap();
}
Ok(())
}),
// Make images with relative srcs absolute
element!("img[src]", |el| {
if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
el.set_attribute("src", &src.as_str()).unwrap();
}
Ok(())
}),
// Replace mixed part CID images with URL // Replace mixed part CID images with URL
element!("img[src]", |el| { element!("img[src]", |el| {
let src = el let src = el

View File

@ -7,6 +7,7 @@ use std::{
use async_graphql::connection::{self, Connection, Edge}; use async_graphql::connection::{self, Connection, Edge};
use log::info; use log::info;
use sqlx::postgres::PgPool; use sqlx::postgres::PgPool;
use url::Url;
const TAG_PREFIX: &'static str = "News/"; const TAG_PREFIX: &'static str = "News/";
const THREAD_PREFIX: &'static str = "news:"; const THREAD_PREFIX: &'static str = "news:";
@ -14,6 +15,7 @@ const THREAD_PREFIX: &'static str = "news:";
use crate::{ use crate::{
error::ServerError, error::ServerError,
graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary}, graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
sanitize_html,
}; };
pub fn is_newsreader_search(query: &str) -> bool { pub fn is_newsreader_search(query: &str) -> bool {
@ -174,14 +176,54 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
} else { } else {
vec!["unread".to_string(), site.clone()] vec!["unread".to_string(), site.clone()]
}; };
let default_homepage = "http://no-homepage";
let homepage = Url::parse(
&r.homepage
.map(|h| {
if h.is_empty() {
default_homepage.to_string()
} else {
h
}
})
.unwrap_or(default_homepage.to_string()),
)?;
let link = Url::parse(
&r.link
.as_ref()
.map(|h| {
if h.is_empty() {
default_homepage.to_string()
} else {
h.to_string()
}
})
.unwrap_or(default_homepage.to_string()),
)?;
let addr = r.link.as_ref().map(|link| {
if link.contains('@') {
link.clone()
} else {
if let Ok(url) = homepage.join(&link) {
url.to_string()
} else {
link.clone()
}
}
});
let html = r.summary.unwrap_or("NO SUMMARY".to_string());
// TODO: add site specific cleanups. For example:
// * Grafana does <div class="image-wrapp"><img class="lazyload"><img src="/media/..."></img></div>
// * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolet
let html = sanitize_html(&html, "", &link)?;
let body = Body::Html(Html { let body = Body::Html(Html {
html: r.summary.unwrap_or("NO SUMMARY".to_string()), html,
content_tree: "".to_string(), content_tree: "".to_string(),
}); });
let title = r.title.unwrap_or("NO TITLE".to_string()); let title = r.title.unwrap_or("NO TITLE".to_string());
let from = Some(Email { let from = Some(Email {
name: r.name, name: r.name,
addr: r.link, addr: addr.map(|a| a.to_string()),
}); });
Ok(Thread { Ok(Thread {
thread_id, thread_id,

View File

@ -10,6 +10,7 @@ use log::{error, info, warn};
use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail}; use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail};
use memmap::MmapOptions; use memmap::MmapOptions;
use notmuch::Notmuch; use notmuch::Notmuch;
use url::Url;
use crate::{ use crate::{
error::ServerError, error::ServerError,
@ -178,6 +179,7 @@ pub async fn thread(
.get_first_value("date") .get_first_value("date")
.and_then(|d| mailparse::dateparse(&d).ok()); .and_then(|d| mailparse::dateparse(&d).ok());
let cid_prefix = shared::urls::cid_prefix(None, &id); let cid_prefix = shared::urls::cid_prefix(None, &id);
let base_url = Url::parse("https://there-should-be-no-relative-urls-in-email").unwrap();
let body = match extract_body(&m, &id)? { let body = match extract_body(&m, &id)? {
Body::PlainText(PlainText { text, content_tree }) => { Body::PlainText(PlainText { text, content_tree }) => {
let text = if text.len() > MAX_RAW_MESSAGE_SIZE { let text = if text.len() > MAX_RAW_MESSAGE_SIZE {
@ -196,7 +198,11 @@ pub async fn thread(
// Trim newlines to prevent excessive white space at the beginning/end of // Trim newlines to prevent excessive white space at the beginning/end of
// presentation. Leave tabs and spaces in case plain text attempts to center a // presentation. Leave tabs and spaces in case plain text attempts to center a
// header on the first line. // header on the first line.
sanitize_html(&linkify_html(&text.trim_matches('\n')), &cid_prefix)? sanitize_html(
&linkify_html(&text.trim_matches('\n')),
&cid_prefix,
&base_url
)?
), ),
content_tree: if debug_content_tree { content_tree: if debug_content_tree {
render_content_type_tree(&m) render_content_type_tree(&m)
@ -206,7 +212,7 @@ pub async fn thread(
}) })
} }
Body::Html(Html { html, content_tree }) => Body::Html(Html { Body::Html(Html { html, content_tree }) => Body::Html(Html {
html: sanitize_html(&html, &cid_prefix)?, html: sanitize_html(&html, &cid_prefix, &base_url)?,
content_tree: if debug_content_tree { content_tree: if debug_content_tree {
render_content_type_tree(&m) render_content_type_tree(&m)
} else { } else {