server: add ability to slurp contents from site
This commit is contained in:
@@ -3,13 +3,15 @@ pub mod graphql;
|
||||
pub mod newsreader;
|
||||
pub mod nm;
|
||||
|
||||
use std::{convert::Infallible, str::FromStr};
|
||||
use std::{collections::HashMap, convert::Infallible, str::FromStr};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use css_inline::{CSSInliner, InlineError, InlineOptions};
|
||||
use linkify::{LinkFinder, LinkKind};
|
||||
use log::{error, info};
|
||||
use log::{error, info, warn};
|
||||
use lol_html::{element, errors::RewritingError, rewrite_str, text, RewriteStrSettings};
|
||||
use maplit::{hashmap, hashset};
|
||||
use scraper::{error::SelectorErrorKind, Html, Selector};
|
||||
use thiserror::Error;
|
||||
use url::Url;
|
||||
|
||||
@@ -19,23 +21,28 @@ use crate::newsreader::{
|
||||
/// Site name that is not expected to match any real site.
// NOTE(review): where this sentinel is consumed is not visible in this chunk — confirm usage.
const NON_EXISTENT_SITE_NAME: &str = "NO-SUCH-SITE";
|
||||
|
||||
// TODO: figure out how to use Cow
|
||||
// One rewriting step over an HTML string: `should_run` is a yes/no gate and `transform` returns
// the rewritten HTML. This is a diff rendering, so where a line was changed the earlier and the
// later version appear one after the other.
// NOTE(review): presumably callers skip `transform` when `should_run` is false — confirm at the
// call site, which is not visible here.
trait Transformer {
|
||||
fn should_run(&self, _html: &str) -> bool {
|
||||
// Later declaration: methods may be async (via `async_trait`) and implementors must be
// `Send + Sync`.
#[async_trait]
|
||||
trait Transformer: Send + Sync {
|
||||
// Default implementation returns true. `addr` is an optional URL passed alongside the HTML;
// the default body reads neither argument.
fn should_run(&self, addr: &Option<Url>, _html: &str) -> bool {
|
||||
true
|
||||
}
|
||||
// TODO: should html be something like `html_escape` uses:
|
||||
// <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
|
||||
fn transform(&self, html: &str) -> Result<String, TransformError>;
|
||||
// Returns the rewritten HTML as an owned `String`, or a `TransformError` on failure.
async fn transform(&self, addr: &Option<Url>, html: &str) -> Result<String, TransformError>;
|
||||
}
|
||||
|
||||
// TODO: how would we make this more generic to allow good implementations of Transformer outside
|
||||
// of this module?
|
||||
// Error type returned by `Transformer::transform`. Variants marked `#[from]` wrap the underlying
// library error so `?` can convert into this enum. Where two `#[error(...)]` lines appear in a
// row, they are the earlier and later versions of the same message in this diff rendering; the
// later one includes the wrapped error via `{0}`.
#[derive(Error, Debug)]
|
||||
pub enum TransformError {
|
||||
#[error("lol-html rewrite error")]
|
||||
#[error("lol-html rewrite error: {0}")]
|
||||
// Wraps `lol_html::errors::RewritingError`.
RewritingError(#[from] RewritingError),
|
||||
#[error("css inline error")]
|
||||
#[error("css inline error: {0}")]
|
||||
// Wraps `css_inline::InlineError`.
InlineError(#[from] InlineError),
|
||||
#[error("failed to fetch url error: {0}")]
|
||||
// Wraps `reqwest::Error`.
ReqwestError(#[from] reqwest::Error),
|
||||
#[error("failed to parse HTML: {0}")]
|
||||
// Carries a plain message string; no visible code in this chunk constructs this variant.
HtmlParsingError(String),
|
||||
}
|
||||
|
||||
struct SanitizeHtml<'a> {
|
||||
@@ -43,31 +50,34 @@ struct SanitizeHtml<'a> {
|
||||
base_url: &'a Option<Url>,
|
||||
}
|
||||
|
||||
// Transformer that delegates to `sanitize_html`, passing the struct's `cid_prefix` and
// `base_url`. The two `transform` signatures are the earlier and later versions in this diff
// rendering; the later one ignores the URL argument (`_`).
#[async_trait]
|
||||
impl<'a> Transformer for SanitizeHtml<'a> {
|
||||
fn transform(&self, html: &str) -> Result<String, TransformError> {
|
||||
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
||||
// `?` converts the error from `sanitize_html` into `TransformError`.
Ok(sanitize_html(html, self.cid_prefix, self.base_url)?)
|
||||
}
|
||||
}
|
||||
|
||||
// Transformer that decodes HTML entities in the input. Despite the name, `transform` calls
// `html_escape::decode_html_entities`, i.e. it un-escapes rather than escapes.
struct EscapeHtml;
|
||||
|
||||
#[async_trait]
|
||||
impl Transformer for EscapeHtml {
|
||||
// Earlier and later signatures of `should_run` follow; the later one ignores the URL argument.
fn should_run(&self, html: &str) -> bool {
|
||||
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
|
||||
// Only true when the input contains an ampersand.
html.contains("&")
|
||||
}
|
||||
// Earlier and later signatures of `transform` follow; the later one ignores the URL argument.
fn transform(&self, html: &str) -> Result<String, TransformError> {
|
||||
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
||||
// Always returns Ok with the decoded text as an owned String.
Ok(html_escape::decode_html_entities(html).to_string())
|
||||
}
|
||||
}
|
||||
|
||||
struct StripHtml;
|
||||
|
||||
#[async_trait]
|
||||
impl Transformer for StripHtml {
|
||||
fn should_run(&self, html: &str) -> bool {
|
||||
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
|
||||
// Lame test
|
||||
html.contains("<")
|
||||
}
|
||||
fn transform(&self, html: &str) -> Result<String, TransformError> {
|
||||
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
||||
let mut text = String::new();
|
||||
let element_content_handlers = vec![text!("*", |t| {
|
||||
text += t.as_str();
|
||||
@@ -87,8 +97,9 @@ impl Transformer for StripHtml {
|
||||
|
||||
struct InlineStyle;
|
||||
|
||||
#[async_trait]
|
||||
impl Transformer for InlineStyle {
|
||||
fn transform(&self, html: &str) -> Result<String, TransformError> {
|
||||
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
||||
let css = concat!(
|
||||
"/* chrome-default.css */\n",
|
||||
include_str!("chrome-default.css"),
|
||||
@@ -118,29 +129,78 @@ impl Transformer for InlineStyle {
|
||||
}
|
||||
}
|
||||
|
||||
// Transformer that appends a "View on site" anchor after the HTML. This is a diff rendering: the
// earlier version stored the optional URL in the struct (`self.0`), the later version is a unit
// struct that receives the URL through the `link` argument. Both versions' lines are interleaved
// below.
struct AddOutlink(Option<url::Url>);
|
||||
struct AddOutlink;
|
||||
|
||||
#[async_trait]
|
||||
impl Transformer for AddOutlink {
|
||||
// Earlier `should_run`: early-returns from inside the `if let`, with a trailing `false`.
fn should_run(&self, html: &str) -> bool {
|
||||
if let Some(link) = &self.0 {
|
||||
return link.scheme().starts_with("http") && !html.contains(link.as_str());
|
||||
// Later `should_run`: true only when a link is present, its scheme starts with "http", and the
// link text does not already occur in the HTML.
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
|
||||
if let Some(link) = link {
|
||||
link.scheme().starts_with("http") && !html.contains(link.as_str())
|
||||
} else {
|
||||
false
|
||||
}
|
||||
false
|
||||
}
|
||||
// Earlier and later `transform` signatures follow. With a link present, the result is the
// original HTML followed by a <div> holding an anchor to the link; otherwise the HTML is
// returned unchanged.
fn transform(&self, html: &str) -> Result<String, TransformError> {
|
||||
if let Some(url) = &self.0 {
|
||||
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
||||
if let Some(link) = link {
|
||||
Ok(format!(
|
||||
r#"
|
||||
{html}
|
||||
<div><a href="{}">View on site</a></div>
|
||||
"#,
|
||||
url
|
||||
link
|
||||
))
|
||||
} else {
|
||||
Ok(html.to_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct SlurpContents {
|
||||
site_selectors: HashMap<String, Vec<Selector>>,
|
||||
}
|
||||
|
||||
impl SlurpContents {
|
||||
fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
|
||||
for (host, selector) in self.site_selectors.iter() {
|
||||
if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
|
||||
return Some(&selector);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Transformer for SlurpContents {
|
||||
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
|
||||
if let Some(link) = link {
|
||||
return self.get_selectors(link).is_some();
|
||||
}
|
||||
false
|
||||
}
|
||||
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
||||
let Some(link) = link else {
|
||||
return Ok(html.to_string());
|
||||
};
|
||||
let Some(selectors) = self.get_selectors(&link) else {
|
||||
return Ok(html.to_string());
|
||||
};
|
||||
let body = reqwest::get(link.as_str()).await?.text().await?;
|
||||
let doc = Html::parse_document(&body);
|
||||
|
||||
let mut results = Vec::new();
|
||||
for selector in selectors {
|
||||
if let Some(frag) = doc.select(&selector).next() {
|
||||
results.push(frag.html())
|
||||
} else {
|
||||
warn!("couldn't find '{:?}' in {}", selector, link);
|
||||
return Ok(html.to_string());
|
||||
}
|
||||
}
|
||||
Ok(results.join("<br><br>"))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn linkify_html(text: &str) -> String {
|
||||
let mut finder = LinkFinder::new();
|
||||
let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);
|
||||
|
||||
Reference in New Issue
Block a user