server: add ability to slurp contents from site

2024-08-25 19:37:53 -07:00
parent d98d429b5c
commit 71de3ef8ae
4 changed files with 661 additions and 82 deletions

@@ -11,6 +11,7 @@ ammonia = "3.3.0"
anyhow = "1.0.79"
async-graphql = { version = "6.0.11", features = ["log"] }
async-graphql-rocket = "6.0.11"
async-trait = "0.1.81"
css-inline = "0.13.0"
glog = "0.1.0"
html-escape = "0.2.13"
@@ -21,8 +22,10 @@ mailparse = "0.15.0"
maplit = "1.0.2"
memmap = "0.7.0"
notmuch = { path = "../notmuch" }
reqwest = { version = "0.12.7", features = ["blocking"] }
rocket = { version = "0.5.0-rc.2", features = [ "json" ] }
rocket_cors = "0.6.0"
scraper = "0.20.0"
serde = { version = "1.0.147", features = ["derive"] }
serde_json = "1.0.87"
shared = { path = "../shared" }

@@ -3,13 +3,15 @@ pub mod graphql;
pub mod newsreader;
pub mod nm;
use std::{convert::Infallible, str::FromStr};
use std::{collections::HashMap, convert::Infallible, str::FromStr};
use async_trait::async_trait;
use css_inline::{CSSInliner, InlineError, InlineOptions};
use linkify::{LinkFinder, LinkKind};
use log::{error, info};
use log::{error, info, warn};
use lol_html::{element, errors::RewritingError, rewrite_str, text, RewriteStrSettings};
use maplit::{hashmap, hashset};
use scraper::{error::SelectorErrorKind, Html, Selector};
use thiserror::Error;
use url::Url;
@@ -19,23 +21,28 @@ use crate::newsreader::{
const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE";
// TODO: figure out how to use Cow
trait Transformer {
fn should_run(&self, _html: &str) -> bool {
#[async_trait]
trait Transformer: Send + Sync {
fn should_run(&self, addr: &Option<Url>, _html: &str) -> bool {
true
}
// TODO: should html be something like `html_escape` uses (a Cow sketch follows this trait):
// <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
fn transform(&self, html: &str) -> Result<String, TransformError>;
async fn transform(&self, addr: &Option<Url>, html: &str) -> Result<String, TransformError>;
}
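
One direction the Cow TODOs above could take, sketched with hypothetical names (not part of this commit): returning Cow<str> lets a pass that changes nothing hand back a borrow instead of allocating a fresh String.

use std::borrow::Cow;

// Hypothetical synchronous cousin of `transform`; the html_escape-style
// signature from the TODO would behave the same way.
trait CowTransform {
    fn transform<'a>(&self, html: &'a str) -> Result<Cow<'a, str>, TransformError>;
}

struct Identity;

impl CowTransform for Identity {
    fn transform<'a>(&self, html: &'a str) -> Result<Cow<'a, str>, TransformError> {
        // Nothing to rewrite, so borrow the input rather than copying it.
        Ok(Cow::Borrowed(html))
    }
}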
// TODO: how would we make this more generic to allow good implementations of Transformer outside
// of this module?
#[derive(Error, Debug)]
pub enum TransformError {
#[error("lol-html rewrite error")]
#[error("lol-html rewrite error: {0}")]
RewritingError(#[from] RewritingError),
#[error("css inline error")]
#[error("css inline error: {0}")]
InlineError(#[from] InlineError),
#[error("failed to fetch url error: {0}")]
ReqwestError(#[from] reqwest::Error),
#[error("failed to parse HTML: {0}")]
HtmlParsingError(String),
}
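
The `{0}` payloads and the two new variants mean `?` now works directly on reqwest and HTML-parsing failures inside a transform, since #[from] derives the matching From impls. An illustrative helper (not in this commit):

// reqwest::Error converts into TransformError::ReqwestError via `?`.
async fn fetch_page(url: &Url) -> Result<String, TransformError> {
    Ok(reqwest::get(url.as_str()).await?.text().await?)
}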
struct SanitizeHtml<'a> {
@@ -43,31 +50,34 @@ struct SanitizeHtml<'a> {
base_url: &'a Option<Url>,
}
#[async_trait]
impl<'a> Transformer for SanitizeHtml<'a> {
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
Ok(sanitize_html(html, self.cid_prefix, self.base_url)?)
}
}
struct EscapeHtml;
#[async_trait]
impl Transformer for EscapeHtml {
fn should_run(&self, html: &str) -> bool {
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
html.contains("&")
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
Ok(html_escape::decode_html_entities(html).to_string())
}
}
struct StripHtml;
#[async_trait]
impl Transformer for StripHtml {
fn should_run(&self, html: &str) -> bool {
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
// Lame test
html.contains("<")
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
let mut text = String::new();
let element_content_handlers = vec![text!("*", |t| {
text += t.as_str();
@@ -87,8 +97,9 @@ impl Transformer for StripHtml {
struct InlineStyle;
#[async_trait]
impl Transformer for InlineStyle {
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
let css = concat!(
"/* chrome-default.css */\n",
include_str!("chrome-default.css"),
@@ -118,29 +129,78 @@ impl Transformer for InlineStyle {
}
}
struct AddOutlink(Option<url::Url>);
struct AddOutlink;
#[async_trait]
impl Transformer for AddOutlink {
fn should_run(&self, html: &str) -> bool {
if let Some(link) = &self.0 {
return link.scheme().starts_with("http") && !html.contains(link.as_str());
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
if let Some(link) = link {
link.scheme().starts_with("http") && !html.contains(link.as_str())
} else {
false
}
false
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
if let Some(url) = &self.0 {
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
if let Some(link) = link {
Ok(format!(
r#"
{html}
<div><a href="{}">View on site</a></div>
"#,
url
link
))
} else {
Ok(html.to_string())
}
}
}
struct SlurpContents {
site_selectors: HashMap<String, Vec<Selector>>,
}
impl SlurpContents {
fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
for (host, selectors) in self.site_selectors.iter() {
if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
return Some(selectors);
}
}
None
}
}
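
Note that get_selectors matches hosts with a substring `contains`, which is loose on purpose ("hackaday.com" also catches "www.hackaday.com") but would equally catch an unrelated host that merely embeds the string. If stricter matching is ever wanted, a suffix check along these lines would do (hypothetical helper, not in this commit):

// Accept the configured host itself or any of its subdomains, nothing else.
fn host_matches(host: &str, configured: &str) -> bool {
    host == configured || host.ends_with(&format!(".{configured}"))
}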
#[async_trait]
impl Transformer for SlurpContents {
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
if let Some(link) = link {
return self.get_selectors(link).is_some();
}
false
}
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
let Some(link) = link else {
return Ok(html.to_string());
};
let Some(selectors) = self.get_selectors(&link) else {
return Ok(html.to_string());
};
let body = reqwest::get(link.as_str()).await?.text().await?;
let doc = Html::parse_document(&body);
let mut results = Vec::new();
for selector in selectors {
if let Some(frag) = doc.select(&selector).next() {
results.push(frag.html())
} else {
warn!("couldn't find '{:?}' in {}", selector, link);
return Ok(html.to_string());
}
}
Ok(results.join("<br><br>"))
}
}
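
A rough usage sketch of the new transformer, assuming a tokio runtime; the site, selector, and URL below are illustrative only:

use maplit::hashmap;
use scraper::Selector;
use url::Url;

#[tokio::main]
async fn main() -> Result<(), TransformError> {
    let slurp = SlurpContents {
        site_selectors: hashmap![
            "example.com".to_string() => vec![Selector::parse("article").unwrap()],
        ],
    };
    let link = Some(Url::parse("https://example.com/post").unwrap());
    // should_run is cheap: it only checks whether selectors exist for the host.
    if slurp.should_run(&link, "") {
        // transform fetches the page and replaces the feed snippet with the
        // selected fragments, joined by <br><br>.
        let full = slurp.transform(&link, "<p>feed summary</p>").await?;
        println!("{full}");
    }
    Ok(())
}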
pub fn linkify_html(text: &str) -> String {
let mut finder = LinkFinder::new();
let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);

@@ -1,6 +1,8 @@
use std::hash::{DefaultHasher, Hash, Hasher};
use log::info;
use maplit::hashmap;
use scraper::Selector;
use sqlx::postgres::PgPool;
use url::Url;
@@ -13,7 +15,7 @@ use crate::{
compute_offset_limit,
error::ServerError,
graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer,
AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, SlurpContents, StripHtml, Transformer,
};
pub fn is_newsreader_search(query: &str) -> bool {
@@ -89,36 +91,34 @@ pub async fn search(
.fetch_all(pool)
.await?;
Ok(rows
.into_iter()
.enumerate()
.map(|(i, r)| {
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
if !r.is_read.unwrap_or(true) {
tags.push("unread".to_string());
};
let mut title = r.title.unwrap_or("NO TITLE".to_string());
title = clean_title(&title).expect("failed to clean title");
(
i as i32 + offset,
ThreadSummary {
thread: format!("{THREAD_PREFIX}{}", r.uid),
timestamp: r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
},
)
})
.collect())
let mut res = Vec::new();
for (i, r) in rows.into_iter().enumerate() {
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
if !r.is_read.unwrap_or(true) {
tags.push("unread".to_string());
};
let mut title = r.title.unwrap_or("NO TITLE".to_string());
title = clean_title(&title).await.expect("failed to clean title");
res.push((
i as i32 + offset,
ThreadSummary {
thread: format!("{THREAD_PREFIX}{}", r.uid),
timestamp: r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
},
));
}
Ok(res)
}
pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
@@ -197,8 +197,25 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
// TODO: add site-specific cleanups. For example:
// * Grafana does <div class="image-wrapp"><img class="lazyload"><img src="/media/..."></img></div>
// * Some sites appear to be HTML-encoded; decode them (e.g. imperialviolet)
let mut body_transformers: Vec<Box<dyn Transformer>> = vec![
Box::new(AddOutlink(link.clone())),
let body_transformers: Vec<Box<dyn Transformer>> = vec![
// TODO: load this site → selector map from config instead of hard-coding it (sketch below)
Box::new(SlurpContents {
site_selectors: hashmap![
"hackaday.com".to_string() => vec![
Selector::parse("div.entry-featured-image").unwrap(),
Selector::parse("div.entry-content").unwrap()
],
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
"natwelch.com".to_string() => vec![
Selector::parse("article div.prose").unwrap(),
],
"slashdot.org".to_string() => vec![
Selector::parse("span.story-byline").unwrap(),
Selector::parse("div.p").unwrap(),
],
],
}),
Box::new(AddOutlink),
Box::new(EscapeHtml),
Box::new(InlineStyle),
Box::new(SanitizeHtml {
@@ -207,15 +224,15 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
}),
];
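
For the config TODO above, the hard-coded map could later be read from JSON through the existing serde_json dependency; a hypothetical loader (file format and function name are assumptions):

use std::collections::HashMap;
use scraper::Selector;

// Turns {"hackaday.com": ["div.entry-content", ...], ...} into parsed Selectors.
fn selectors_from_config(raw: &str) -> HashMap<String, Vec<Selector>> {
    let parsed: HashMap<String, Vec<String>> =
        serde_json::from_str(raw).expect("invalid selector config");
    parsed
        .into_iter()
        .map(|(host, sels)| {
            let sels = sels
                .iter()
                .map(|s| Selector::parse(s).expect("bad selector in config"))
                .collect();
            (host, sels)
        })
        .collect()
}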
for t in body_transformers.iter() {
if t.should_run(&body) {
body = t.transform(&body)?;
if t.should_run(&link, &body) {
body = t.transform(&link, &body).await?;
}
}
let body = Body::Html(Html {
html: body,
content_tree: "".to_string(),
});
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string()))?;
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string())).await?;
let from = Some(Email {
name: r.name,
addr: addr.map(|a| a.to_string()),
@@ -254,7 +271,7 @@ pub async fn set_read_status<'ctx>(
.await?;
Ok(true)
}
fn clean_title(title: &str) -> Result<String, ServerError> {
async fn clean_title(title: &str) -> Result<String, ServerError> {
// Make title HTML so html parsers work
let mut title = format!("<html>{title}</html>");
let title_transformers: Vec<Box<dyn Transformer>> =
@@ -262,8 +279,8 @@ fn clean_title(title: &str) -> Result<String, ServerError> {
// Make title HTML so html parsers work
title = format!("<html>{title}</html>");
for t in title_transformers.iter() {
if t.should_run(&title) {
title = t.transform(&title)?;
if t.should_run(&None, &title) {
title = t.transform(&None, &title).await?;
}
}
Ok(title)
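
Assuming the (elided) title pipeline decodes entities and strips tags, consistent with EscapeHtml and StripHtml above, a hypothetical test of the now-async clean_title:

// Not part of this commit; illustrates the expected behavior.
#[tokio::test]
async fn clean_title_drops_markup() {
    let cleaned = clean_title("What&#8217;s <em>new</em>?").await.unwrap();
    assert!(!cleaned.contains('<'), "tags should be stripped: {cleaned}");
}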