server: add ability to slurp contents from site

This commit is contained in:
2024-08-25 19:37:53 -07:00
parent d98d429b5c
commit 71de3ef8ae
4 changed files with 661 additions and 82 deletions

View File

@@ -3,13 +3,15 @@ pub mod graphql;
pub mod newsreader;
pub mod nm;
use std::{convert::Infallible, str::FromStr};
use std::{collections::HashMap, convert::Infallible, str::FromStr};
use async_trait::async_trait;
use css_inline::{CSSInliner, InlineError, InlineOptions};
use linkify::{LinkFinder, LinkKind};
use log::{error, info};
use log::{error, info, warn};
use lol_html::{element, errors::RewritingError, rewrite_str, text, RewriteStrSettings};
use maplit::{hashmap, hashset};
use scraper::{error::SelectorErrorKind, Html, Selector};
use thiserror::Error;
use url::Url;
@@ -19,23 +21,28 @@ use crate::newsreader::{
/// Placeholder site name.
/// NOTE(review): presumably substituted where no real site applies; confirm against callers.
const NON_EXISTENT_SITE_NAME: &str = "NO-SUCH-SITE";
// TODO: figure out how to use Cow
// A rewriting step over an HTML string. `should_run` reports whether the step applies to the
// given input (the default body always answers `true`; presumably callers consult it before
// calling `transform`, confirm at the call site), and `transform` returns the rewritten markup
// or a `TransformError`.
// NOTE(review): the +/- diff markers are missing from this view, so the previous and the current
// declaration of the trait header and of each method appear back to back below.
trait Transformer {
fn should_run(&self, _html: &str) -> bool {
#[async_trait]
trait Transformer: Send + Sync {
// NOTE(review): `addr` is never read by this default body; naming it `_addr` would match
// `_html`.
fn should_run(&self, addr: &Option<Url>, _html: &str) -> bool {
true
}
// TODO: should html be something like `html_escape` uses:
// <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
fn transform(&self, html: &str) -> Result<String, TransformError>;
// `addr` is an optional URL handed over together with the markup; `html` is the input text and
// the `Ok` value is the transformed text.
async fn transform(&self, addr: &Option<Url>, html: &str) -> Result<String, TransformError>;
}
// TODO: how would we make this more generic to allow good implementations of Transformer outside
// of this module?
// Error type returned by `Transformer::transform`. Variants marked `#[from]` wrap the underlying
// library error, so `?` converts into them; `HtmlParsingError` carries only a message string.
// NOTE(review): the +/- diff markers are missing from this view, so the previous `#[error(...)]`
// attribute of `RewritingError` and `InlineError` sits directly above its replacement, which
// additionally prints the wrapped error via `{0}`.
#[derive(Error, Debug)]
pub enum TransformError {
#[error("lol-html rewrite error")]
#[error("lol-html rewrite error: {0}")]
RewritingError(#[from] RewritingError),
#[error("css inline error")]
#[error("css inline error: {0}")]
InlineError(#[from] InlineError),
// Any `reqwest::Error`, converted via `#[from]`.
#[error("failed to fetch url error: {0}")]
ReqwestError(#[from] reqwest::Error),
// Free-form parse failure message; there is no `#[from]` conversion for this variant.
#[error("failed to parse HTML: {0}")]
HtmlParsingError(String),
}
struct SanitizeHtml<'a> {
@@ -43,31 +50,34 @@ struct SanitizeHtml<'a> {
base_url: &'a Option<Url>,
}
// `Transformer` adapter around `sanitize_html`.
// NOTE(review): the +/- diff markers are missing from this view; the first `transform` line is
// the previous synchronous signature, the second is the current async one.
#[async_trait]
impl<'a> Transformer for SanitizeHtml<'a> {
fn transform(&self, html: &str) -> Result<String, TransformError> {
// The URL argument is ignored; the base URL comes from `self.base_url` instead.
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
// `?` converts the error returned by `sanitize_html` into `TransformError`.
Ok(sanitize_html(html, self.cid_prefix, self.base_url)?)
}
}
// Despite its name, this transformer *decodes* HTML entities (it calls
// `html_escape::decode_html_entities`); it does not escape anything.
// NOTE(review): the +/- diff markers are missing from this view, so each method's previous
// signature appears directly above its current one.
struct EscapeHtml;
#[async_trait]
impl Transformer for EscapeHtml {
fn should_run(&self, html: &str) -> bool {
// Only inputs containing an ampersand are considered; the URL argument is ignored.
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
html.contains("&")
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
// Returns `html` with its entities decoded, as an owned `String`; never returns `Err`.
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
Ok(html_escape::decode_html_entities(html).to_string())
}
}
struct StripHtml;
#[async_trait]
impl Transformer for StripHtml {
fn should_run(&self, html: &str) -> bool {
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
// Lame test
html.contains("<")
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
let mut text = String::new();
let element_content_handlers = vec![text!("*", |t| {
text += t.as_str();
@@ -87,8 +97,9 @@ impl Transformer for StripHtml {
struct InlineStyle;
#[async_trait]
impl Transformer for InlineStyle {
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
let css = concat!(
"/* chrome-default.css */\n",
include_str!("chrome-default.css"),
@@ -118,29 +129,78 @@ impl Transformer for InlineStyle {
}
}
// Appends a "View on site" anchor that points at the link passed to the transformer.
// NOTE(review): the +/- diff markers are missing from this view. The tuple-struct form and every
// line reading `self.0` or `url` belong to the previous version; the unit struct and the lines
// using the `link` parameter are the current one.
struct AddOutlink(Option<url::Url>);
struct AddOutlink;
#[async_trait]
impl Transformer for AddOutlink {
fn should_run(&self, html: &str) -> bool {
if let Some(link) = &self.0 {
return link.scheme().starts_with("http") && !html.contains(link.as_str());
// Applies only when a link is present, its scheme starts with "http", and the markup does not
// already contain the link's text.
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
if let Some(link) = link {
link.scheme().starts_with("http") && !html.contains(link.as_str())
} else {
false
}
false
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
if let Some(url) = &self.0 {
// With a link: returns the input followed by a `<div>` holding the anchor. Without a link:
// returns the input unchanged. Never returns `Err`.
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
if let Some(link) = link {
Ok(format!(
r#"
{html}
<div><a href="{}">View on site</a></div>
"#,
url
link
))
} else {
Ok(html.to_string())
}
}
}
/// Transformer that replaces the supplied markup with content fetched from the linked page.
struct SlurpContents {
/// Key: text compared against the link's host (a substring match, see `get_selectors`).
/// Value: selectors applied to the fetched page; `transform` keeps the first match of each.
site_selectors: HashMap<String, Vec<Selector>>,
}
impl SlurpContents {
    /// Returns the selectors configured for `link`'s host, or `None` when the URL has no host
    /// or no configured site matches it.
    ///
    /// A configured site matches when its key is a substring of the host. If several keys
    /// match, the longest (most specific) one wins and equal lengths are broken
    /// lexicographically, so the result no longer depends on `HashMap` iteration order.
    ///
    /// NOTE(review): substring matching also accepts hosts such as `example.com.other.net` for
    /// the key `example.com`; consider matching on a domain boundary if keys are full domains.
    fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
        // Resolve the host once instead of on every iteration.
        let host = link.host_str()?;
        self.site_selectors
            .iter()
            .filter(|(site, _)| host.contains(site.as_str()))
            // Keys are unique, so this ordering is total and the winner is deterministic.
            .max_by(|(a, _), (b, _)| {
                a.len()
                    .cmp(&b.len())
                    .then_with(|| b.as_str().cmp(a.as_str()))
            })
            .map(|(_, selectors)| selectors.as_slice())
    }
}
#[async_trait]
impl Transformer for SlurpContents {
    /// Applies only when a link is present and selectors are configured for its host.
    fn should_run(&self, link: &Option<Url>, _html: &str) -> bool {
        link.as_ref()
            .and_then(|link| self.get_selectors(link))
            .is_some()
    }

    /// Fetches the page behind `link` and returns the HTML of the first element matched by
    /// each configured selector, joined with `<br><br>`.
    ///
    /// Falls back to returning `html` unchanged when there is no link, no selectors are
    /// configured for the host, the server answers with a non-success status, or any selector
    /// matches nothing in the fetched page.
    ///
    /// # Errors
    /// Returns `TransformError::ReqwestError` when the request or reading the body fails.
    async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
        let Some(link) = link else {
            return Ok(html.to_string());
        };
        let Some(selectors) = self.get_selectors(link) else {
            return Ok(html.to_string());
        };

        let response = reqwest::get(link.as_str()).await?;
        // An error page (404, 500, ...) must not be mistaken for article content.
        let status = response.status();
        if !status.is_success() {
            warn!("fetching {} returned HTTP status {}", link, status);
            return Ok(html.to_string());
        }
        let body = response.text().await?;

        // The document is parsed after the last `.await`, so it is never held across one.
        let doc = Html::parse_document(&body);
        let mut results = Vec::with_capacity(selectors.len());
        for selector in selectors {
            // All-or-nothing: one missing fragment abandons the slurp entirely.
            let Some(frag) = doc.select(selector).next() else {
                warn!("couldn't find '{:?}' in {}", selector, link);
                return Ok(html.to_string());
            };
            results.push(frag.html());
        }
        Ok(results.join("<br><br>"))
    }
}
pub fn linkify_html(text: &str) -> String {
let mut finder = LinkFinder::new();
let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);