Normalize links and images based on post's URL
parent b5468bced2, commit 1106377550

Cargo.lock (generated)
@@ -3183,6 +3183,7 @@ dependencies = [
  "sqlx",
  "thiserror",
  "tokio",
+ "url",
  "urlencoding",
 ]
@@ -29,4 +29,5 @@ anyhow = "1.0.79"
 maplit = "1.0.2"
 linkify = "0.10.0"
 sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
+url = "2.5.2"
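The new `url` dependency is what the rest of this commit builds on: `Url::join` resolves a possibly-relative reference against a base URL. A minimal sketch of that behavior (the URLs below are illustrative, not taken from the repository):

```rust
use url::Url;

fn main() -> Result<(), url::ParseError> {
    // Base URL of a post, e.g. the feed item's link.
    let base = Url::parse("https://blog.example.com/posts/42/")?;

    // A relative reference resolves against the base...
    assert_eq!(
        base.join("images/diagram.png")?.as_str(),
        "https://blog.example.com/posts/42/images/diagram.png"
    );

    // ...while an already-absolute reference comes back unchanged.
    assert_eq!(
        base.join("https://cdn.example.com/x.png")?.as_str(),
        "https://cdn.example.com/x.png"
    );
    Ok(())
}
```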
@@ -5,7 +5,8 @@ SELECT
     site,
     summary,
     title,
-    name
+    name,
+    homepage
 FROM
     post p
 JOIN feed f ON p.site = f.slug
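For orientation, a rough sketch of how the widened SELECT might be consumed on the Rust side. The struct name, the WHERE clause, and the column types are assumptions inferred from how the diff later uses `r.summary`, `r.title`, `r.name`, and `r.homepage`; the real code may well use the `sqlx::query!` macro instead:

```rust
use sqlx::postgres::PgPool;

// Hypothetical row type; Option<String> is assumed because the diff later
// calls r.summary.unwrap_or(...), r.homepage.map(...), and so on.
#[derive(sqlx::FromRow)]
struct PostRow {
    site: String,
    summary: Option<String>,
    title: Option<String>,
    name: Option<String>,
    homepage: Option<String>,
}

async fn load_post(pool: &PgPool, slug: &str) -> Result<PostRow, sqlx::Error> {
    sqlx::query_as::<_, PostRow>(
        "SELECT site, summary, title, name, homepage \
         FROM post p JOIN feed f ON p.site = f.slug \
         WHERE p.site = $1", // WHERE clause is illustrative; the diff does not show it
    )
    .bind(slug)
    .fetch_one(pool)
    .await
}
```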
@@ -7,26 +7,28 @@ use crate::SanitizeError;
 
 #[derive(Error, Debug)]
 pub enum ServerError {
-    #[error("notmuch")]
+    #[error("notmuch: {0}")]
     NotmuchError(#[from] notmuch::NotmuchError),
     #[error("flatten")]
     FlattenError,
-    #[error("mail parse error")]
+    #[error("mail parse error: {0}")]
     MailParseError(#[from] MailParseError),
-    #[error("IO error")]
+    #[error("IO error: {0}")]
     IoError(#[from] std::io::Error),
     #[error("attachement not found")]
     PartNotFound,
-    #[error("sqlx error")]
+    #[error("sqlx error: {0}")]
     SQLXError(#[from] sqlx::Error),
-    #[error("html sanitize error")]
+    #[error("html sanitize error: {0}")]
     SanitizeError(#[from] SanitizeError),
-    #[error("UTF8 error")]
+    #[error("UTF8 error: {0}")]
     Utf8Error(#[from] Utf8Error),
-    #[error("FromUTF8 error")]
+    #[error("FromUTF8 error: {0}")]
     FromUtf8Error(#[from] FromUtf8Error),
-    #[error("error")]
+    #[error("error: {0}")]
     StringError(String),
-    #[error("impossible")]
+    #[error("invalid url: {0}")]
+    UrlParseError(#[from] url::ParseError),
+    #[error("impossible: {0}")]
     InfaillibleError(#[from] Infallible),
 }
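The `{0}` added to these messages interpolates the Display output of the wrapped source error into the variant's own message, so the underlying cause shows up when the error is printed. A minimal, self-contained sketch of the pattern (a hypothetical enum, not the project's `ServerError`):

```rust
use thiserror::Error;

#[derive(Error, Debug)]
enum DemoError {
    // `{0}` formats field 0, i.e. the wrapped error's Display text.
    #[error("invalid url: {0}")]
    UrlParseError(#[from] url::ParseError),
}

fn parse(input: &str) -> Result<url::Url, DemoError> {
    Ok(url::Url::parse(input)?) // url::ParseError converts via #[from]
}

fn main() {
    // Prints something like: invalid url: relative URL without a base
    if let Err(e) = parse("not-a-url") {
        println!("{e}");
    }
}
```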
@@ -9,6 +9,7 @@ use log::error;
 use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
 use maplit::{hashmap, hashset};
 use thiserror::Error;
+use url::Url;
 
 #[derive(Error, Debug)]
 pub enum SanitizeError {
@@ -46,7 +47,11 @@ pub fn linkify_html(text: &str) -> String {
 
 // html contains the content to be cleaned, and cid_prefix is used to resolve mixed part image
 // referrences
-pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeError> {
+pub fn sanitize_html(
+    html: &str,
+    cid_prefix: &str,
+    base_url: &Url,
+) -> Result<String, SanitizeError> {
     let element_content_handlers = vec![
         // Open links in new tab
         element!("a[href]", |el| {
@@ -54,6 +59,22 @@ pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeErr
 
             Ok(())
         }),
+        // Make links with relative URLs absolute
+        element!("a[href]", |el| {
+            if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
+                el.set_attribute("href", &href.as_str()).unwrap();
+            }
+
+            Ok(())
+        }),
+        // Make images with relative srcs absolute
+        element!("img[src]", |el| {
+            if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
+                el.set_attribute("src", &src.as_str()).unwrap();
+            }
+
+            Ok(())
+        }),
         // Replace mixed part CID images with URL
         element!("img[src]", |el| {
             let src = el
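A self-contained sketch of what the two new handlers accomplish, using the same `lol_html` rewriting pattern as the diff but stripped down: no CID handling, no target rewriting, and a generic boxed error instead of the crate's SanitizeError. The function name and sample HTML are inventions for the sketch:

```rust
use lol_html::{element, rewrite_str, RewriteStrSettings};
use url::Url;

fn absolutize(html: &str, base_url: &Url) -> Result<String, Box<dyn std::error::Error>> {
    let output = rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: vec![
                // Rewrite relative hrefs against the post's URL; absolute ones pass through.
                element!("a[href]", |el| {
                    if let Some(Ok(href)) = el.get_attribute("href").map(|h| base_url.join(&h)) {
                        el.set_attribute("href", href.as_str())?;
                    }
                    Ok(())
                }),
                // Same treatment for image sources.
                element!("img[src]", |el| {
                    if let Some(Ok(src)) = el.get_attribute("src").map(|s| base_url.join(&s)) {
                        el.set_attribute("src", src.as_str())?;
                    }
                    Ok(())
                }),
            ],
            ..RewriteStrSettings::default()
        },
    )?;
    Ok(output)
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let base = Url::parse("https://blog.example.com/posts/42/")?;
    let html = r#"<a href="/about">about</a><img src="pic.png">"#;
    println!("{}", absolutize(html, &base)?);
    // <a href="https://blog.example.com/about">about</a><img src="https://blog.example.com/posts/42/pic.png">
    Ok(())
}
```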
@@ -7,6 +7,7 @@ use std::{
 use async_graphql::connection::{self, Connection, Edge};
 use log::info;
 use sqlx::postgres::PgPool;
+use url::Url;
 
 const TAG_PREFIX: &'static str = "News/";
 const THREAD_PREFIX: &'static str = "news:";
@@ -14,6 +15,7 @@ const THREAD_PREFIX: &'static str = "news:";
 use crate::{
     error::ServerError,
     graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
+    sanitize_html,
 };
 
 pub fn is_newsreader_search(query: &str) -> bool {
@@ -174,14 +176,54 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
     } else {
         vec!["unread".to_string(), site.clone()]
     };
+    let default_homepage = "http://no-homepage";
+    let homepage = Url::parse(
+        &r.homepage
+            .map(|h| {
+                if h.is_empty() {
+                    default_homepage.to_string()
+                } else {
+                    h
+                }
+            })
+            .unwrap_or(default_homepage.to_string()),
+    )?;
+    let link = Url::parse(
+        &r.link
+            .as_ref()
+            .map(|h| {
+                if h.is_empty() {
+                    default_homepage.to_string()
+                } else {
+                    h.to_string()
+                }
+            })
+            .unwrap_or(default_homepage.to_string()),
+    )?;
+    let addr = r.link.as_ref().map(|link| {
+        if link.contains('@') {
+            link.clone()
+        } else {
+            if let Ok(url) = homepage.join(&link) {
+                url.to_string()
+            } else {
+                link.clone()
+            }
+        }
+    });
+    let html = r.summary.unwrap_or("NO SUMMARY".to_string());
+    // TODO: add site specific cleanups. For example:
+    // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
+    // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolet
+    let html = sanitize_html(&html, "", &link)?;
     let body = Body::Html(Html {
-        html: r.summary.unwrap_or("NO SUMMARY".to_string()),
+        html,
         content_tree: "".to_string(),
     });
     let title = r.title.unwrap_or("NO TITLE".to_string());
     let from = Some(Email {
         name: r.name,
-        addr: r.link,
+        addr: addr.map(|a| a.to_string()),
     });
     Ok(Thread {
         thread_id,
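For readability, here is the new homepage/link/address logic restated as a plain function with the sqlx row plumbing removed. The function name and the sample URLs are inventions for the sketch; the behavior (keep mailing-list style addresses verbatim, otherwise resolve against the feed's homepage, falling back to the raw string if resolution fails) mirrors the diff above:

```rust
use url::Url;

// Simplified stand-in for the addr computation; `homepage` is assumed to be the
// already-parsed feed homepage (with the "http://no-homepage" placeholder applied).
fn display_addr(link: Option<&str>, homepage: &Url) -> Option<String> {
    link.map(|link| {
        if link.contains('@') {
            // Looks like an email address: leave it untouched.
            link.to_string()
        } else {
            // Resolve a (possibly relative) post URL against the feed homepage.
            homepage
                .join(link)
                .map(|u| u.to_string())
                .unwrap_or_else(|_| link.to_string())
        }
    })
}

fn main() {
    let homepage = Url::parse("https://example.org/").unwrap();
    assert_eq!(
        display_addr(Some("/2024/07/post.html"), &homepage).as_deref(),
        Some("https://example.org/2024/07/post.html")
    );
    assert_eq!(
        display_addr(Some("author@example.org"), &homepage).as_deref(),
        Some("author@example.org")
    );
}
```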
@@ -10,6 +10,7 @@ use log::{error, info, warn};
 use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail};
 use memmap::MmapOptions;
 use notmuch::Notmuch;
+use url::Url;
 
 use crate::{
     error::ServerError,
@@ -178,6 +179,7 @@ pub async fn thread(
         .get_first_value("date")
         .and_then(|d| mailparse::dateparse(&d).ok());
     let cid_prefix = shared::urls::cid_prefix(None, &id);
+    let base_url = Url::parse("https://there-should-be-no-relative-urls-in-email").unwrap();
     let body = match extract_body(&m, &id)? {
         Body::PlainText(PlainText { text, content_tree }) => {
             let text = if text.len() > MAX_RAW_MESSAGE_SIZE {
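Email bodies are expected to carry absolute URLs already, so the base passed to `sanitize_html` on this path is a deliberately bogus sentinel host rather than a real site. A small check of what that means in practice (the joined values are illustrative; the resolution rules are the `url` crate's):

```rust
use url::Url;

fn main() {
    let base = Url::parse("https://there-should-be-no-relative-urls-in-email").unwrap();

    // Absolute URLs found in an email pass through the join untouched...
    assert_eq!(
        base.join("https://example.com/logo.png").unwrap().as_str(),
        "https://example.com/logo.png"
    );

    // ...while a stray relative reference resolves to an obviously bogus
    // sentinel URL instead of silently pointing somewhere unintended.
    assert_eq!(
        base.join("img/logo.png").unwrap().as_str(),
        "https://there-should-be-no-relative-urls-in-email/img/logo.png"
    );
}
```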
@@ -196,7 +198,11 @@ pub async fn thread(
                 // Trim newlines to prevent excessive white space at the beginning/end of
                 // presenation. Leave tabs and spaces incase plain text attempts to center a
                 // header on the first line.
-                sanitize_html(&linkify_html(&text.trim_matches('\n')), &cid_prefix)?
+                sanitize_html(
+                    &linkify_html(&text.trim_matches('\n')),
+                    &cid_prefix,
+                    &base_url
+                )?
             ),
             content_tree: if debug_content_tree {
                 render_content_type_tree(&m)
@@ -206,7 +212,7 @@ pub async fn thread(
             })
         }
         Body::Html(Html { html, content_tree }) => Body::Html(Html {
-            html: sanitize_html(&html, &cid_prefix)?,
+            html: sanitize_html(&html, &cid_prefix, &base_url)?,
             content_tree: if debug_content_tree {
                 render_content_type_tree(&m)
             } else {