server: add ability to slurp contents from site
@@ -3,13 +3,15 @@ pub mod graphql;
 pub mod newsreader;
 pub mod nm;
 
-use std::{convert::Infallible, str::FromStr};
+use std::{collections::HashMap, convert::Infallible, str::FromStr};
 
+use async_trait::async_trait;
 use css_inline::{CSSInliner, InlineError, InlineOptions};
 use linkify::{LinkFinder, LinkKind};
-use log::{error, info};
+use log::{error, info, warn};
 use lol_html::{element, errors::RewritingError, rewrite_str, text, RewriteStrSettings};
 use maplit::{hashmap, hashset};
+use scraper::{error::SelectorErrorKind, Html, Selector};
 use thiserror::Error;
 use url::Url;
 
@@ -19,23 +21,28 @@ use crate::newsreader::{
 const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE";
 
 // TODO: figure out how to use Cow
-trait Transformer {
-    fn should_run(&self, _html: &str) -> bool {
+#[async_trait]
+trait Transformer: Send + Sync {
+    fn should_run(&self, addr: &Option<Url>, _html: &str) -> bool {
         true
     }
     // TODO: should html be something like `html_escape` uses:
     // <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
-    fn transform(&self, html: &str) -> Result<String, TransformError>;
+    async fn transform(&self, addr: &Option<Url>, html: &str) -> Result<String, TransformError>;
 }
 
 // TODO: how would we make this more generic to allow good implementations of Transformer outside
 // of this module?
 #[derive(Error, Debug)]
 pub enum TransformError {
-    #[error("lol-html rewrite error")]
+    #[error("lol-html rewrite error: {0}")]
     RewritingError(#[from] RewritingError),
-    #[error("css inline error")]
+    #[error("css inline error: {0}")]
     InlineError(#[from] InlineError),
+    #[error("failed to fetch url error: {0}")]
+    ReqwestError(#[from] reqwest::Error),
+    #[error("failed to parse HTML: {0}")]
+    HtmlParsingError(String),
 }
 
 struct SanitizeHtml<'a> {
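For orientation, here is what an implementor looks like against the new trait shape. This is a minimal sketch, not part of the commit; `Uppercase` is hypothetical:

```rust
use async_trait::async_trait;
use url::Url;

// Hypothetical transformer, for illustration only.
struct Uppercase;

#[async_trait]
impl Transformer for Uppercase {
    // The per-post URL now arrives as an argument instead of living in each struct.
    fn should_run(&self, _addr: &Option<Url>, _html: &str) -> bool {
        true
    }

    // #[async_trait] is what makes `async fn` legal in the trait and its impls.
    async fn transform(&self, _addr: &Option<Url>, html: &str) -> Result<String, TransformError> {
        Ok(html.to_uppercase())
    }
}
```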
@@ -43,31 +50,34 @@ struct SanitizeHtml<'a> {
     base_url: &'a Option<Url>,
 }
 
+#[async_trait]
 impl<'a> Transformer for SanitizeHtml<'a> {
-    fn transform(&self, html: &str) -> Result<String, TransformError> {
+    async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
         Ok(sanitize_html(html, self.cid_prefix, self.base_url)?)
     }
 }
 
 struct EscapeHtml;
 
+#[async_trait]
 impl Transformer for EscapeHtml {
-    fn should_run(&self, html: &str) -> bool {
+    fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
         html.contains("&")
     }
-    fn transform(&self, html: &str) -> Result<String, TransformError> {
+    async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
         Ok(html_escape::decode_html_entities(html).to_string())
     }
 }
 
 struct StripHtml;
 
+#[async_trait]
 impl Transformer for StripHtml {
-    fn should_run(&self, html: &str) -> bool {
+    fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
         // Lame test
         html.contains("<")
     }
-    fn transform(&self, html: &str) -> Result<String, TransformError> {
+    async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
         let mut text = String::new();
         let element_content_handlers = vec![text!("*", |t| {
             text += t.as_str();
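StripHtml's extraction idiom, shown standalone: a sketch of the same `lol_html` pattern, which collects text chunks and deliberately ignores the rewritten output.

```rust
use lol_html::{rewrite_str, text, RewriteStrSettings};

// Sketch: gather the text content of every element; the rewritten HTML that
// rewrite_str returns is discarded on purpose.
fn strip_tags(html: &str) -> String {
    let mut out = String::new();
    rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: vec![text!("*", |t| {
                out += t.as_str();
                Ok(())
            })],
            ..RewriteStrSettings::default()
        },
    )
    .unwrap();
    out
}
```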
@@ -87,8 +97,9 @@ impl Transformer for StripHtml {
 
 struct InlineStyle;
 
+#[async_trait]
 impl Transformer for InlineStyle {
-    fn transform(&self, html: &str) -> Result<String, TransformError> {
+    async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
         let css = concat!(
             "/* chrome-default.css */\n",
             include_str!("chrome-default.css"),
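InlineStyle presumably hands that concatenated CSS to css-inline. A rough sketch of the call, assuming the `extra_css` option; exact option names vary across css-inline versions:

```rust
use css_inline::{CSSInliner, InlineError, InlineOptions};

// Sketch: inline a fixed stylesheet into an HTML document.
fn inline_css(html: &str, css: &str) -> Result<String, InlineError> {
    let inliner = CSSInliner::new(InlineOptions {
        extra_css: Some(css.into()), // e.g. the bundled chrome-default.css
        ..Default::default()
    });
    inliner.inline(html)
}
```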
@@ -118,29 +129,78 @@ impl Transformer for InlineStyle {
     }
 }
 
-struct AddOutlink(Option<url::Url>);
+struct AddOutlink;
 
+#[async_trait]
 impl Transformer for AddOutlink {
-    fn should_run(&self, html: &str) -> bool {
-        if let Some(link) = &self.0 {
-            return link.scheme().starts_with("http") && !html.contains(link.as_str());
+    fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
+        if let Some(link) = link {
+            link.scheme().starts_with("http") && !html.contains(link.as_str())
+        } else {
+            false
         }
-        false
     }
-    fn transform(&self, html: &str) -> Result<String, TransformError> {
-        if let Some(url) = &self.0 {
+    async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
+        if let Some(link) = link {
             Ok(format!(
                 r#"
 {html}
 <div><a href="{}">View on site</a></div>
 "#,
-                url
+                link
             ))
         } else {
             Ok(html.to_string())
         }
     }
 }
 
+struct SlurpContents {
+    site_selectors: HashMap<String, Vec<Selector>>,
+}
+
+impl SlurpContents {
+    fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
+        for (host, selector) in self.site_selectors.iter() {
+            if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
+                return Some(&selector);
+            }
+        }
+        None
+    }
+}
+
+#[async_trait]
+impl Transformer for SlurpContents {
+    fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
+        if let Some(link) = link {
+            return self.get_selectors(link).is_some();
+        }
+        false
+    }
+    async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
+        let Some(link) = link else {
+            return Ok(html.to_string());
+        };
+        let Some(selectors) = self.get_selectors(&link) else {
+            return Ok(html.to_string());
+        };
+        let body = reqwest::get(link.as_str()).await?.text().await?;
+        let doc = Html::parse_document(&body);
+
+        let mut results = Vec::new();
+        for selector in selectors {
+            if let Some(frag) = doc.select(&selector).next() {
+                results.push(frag.html())
+            } else {
+                warn!("couldn't find '{:?}' in {}", selector, link);
+                return Ok(html.to_string());
+            }
+        }
+        Ok(results.join("<br><br>"))
+    }
+}
+
 pub fn linkify_html(text: &str) -> String {
     let mut finder = LinkFinder::new();
     let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);
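The slurping step in isolation: fetch a page, parse it, take the first match per selector. A sketch of the same `scraper` calls, with inline HTML standing in for a fetched body:

```rust
use scraper::{Html, Selector};

// Stand-in for a fetched page body.
const EXAMPLE_HTML: &str = r#"<html><body>
    <div class="entry-content"><p>Hello</p></div>
</body></html>"#;

fn main() {
    let doc = Html::parse_document(EXAMPLE_HTML);
    let selector = Selector::parse("div.entry-content").unwrap();
    // `select` walks the parsed document; `next()` takes the first match,
    // exactly as SlurpContents::transform does per selector.
    if let Some(frag) = doc.select(&selector).next() {
        println!("{}", frag.html()); // outer HTML of the matched element
    }
}
```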
@@ -1,6 +1,8 @@
 use std::hash::{DefaultHasher, Hash, Hasher};
 
 use log::info;
+use maplit::hashmap;
+use scraper::Selector;
 use sqlx::postgres::PgPool;
 use url::Url;
 
@@ -13,7 +15,7 @@ use crate::{
     compute_offset_limit,
     error::ServerError,
     graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
-    AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer,
+    AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, SlurpContents, StripHtml, Transformer,
 };
 
 pub fn is_newsreader_search(query: &str) -> bool {
@@ -89,36 +91,34 @@ pub async fn search(
         .fetch_all(pool)
         .await?;
 
-    Ok(rows
-        .into_iter()
-        .enumerate()
-        .map(|(i, r)| {
-            let site = r.site.unwrap_or("UNKOWN TAG".to_string());
-            let mut tags = vec![format!("{TAG_PREFIX}{site}")];
-            if !r.is_read.unwrap_or(true) {
-                tags.push("unread".to_string());
-            };
-            let mut title = r.title.unwrap_or("NO TITLE".to_string());
-            title = clean_title(&title).expect("failed to clean title");
-            (
-                i as i32 + offset,
-                ThreadSummary {
-                    thread: format!("{THREAD_PREFIX}{}", r.uid),
-                    timestamp: r
-                        .date
-                        .expect("post missing date")
-                        .assume_utc()
-                        .unix_timestamp() as isize,
-                    date_relative: "TODO date_relative".to_string(),
-                    matched: 0,
-                    total: 1,
-                    authors: r.name.unwrap_or_else(|| site.clone()),
-                    subject: title,
-                    tags,
-                },
-            )
-        })
-        .collect())
+    let mut res = Vec::new();
+    for (i, r) in rows.into_iter().enumerate() {
+        let site = r.site.unwrap_or("UNKOWN TAG".to_string());
+        let mut tags = vec![format!("{TAG_PREFIX}{site}")];
+        if !r.is_read.unwrap_or(true) {
+            tags.push("unread".to_string());
+        };
+        let mut title = r.title.unwrap_or("NO TITLE".to_string());
+        title = clean_title(&title).await.expect("failed to clean title");
+        res.push((
+            i as i32 + offset,
+            ThreadSummary {
+                thread: format!("{THREAD_PREFIX}{}", r.uid),
+                timestamp: r
+                    .date
+                    .expect("post missing date")
+                    .assume_utc()
+                    .unix_timestamp() as isize,
+                date_relative: "TODO date_relative".to_string(),
+                matched: 0,
+                total: 1,
+                authors: r.name.unwrap_or_else(|| site.clone()),
+                subject: title,
+                tags,
+            },
+        ));
+    }
+    Ok(res)
 }
 
 pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
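The map-and-collect chain becomes a plain loop because `clean_title` is now async, and a closure handed to `.map()` cannot `.await`. The shape of the change, sketched with a hypothetical async `clean`:

```rust
// Hypothetical stand-in for the now-async clean_title.
async fn clean(t: &str) -> String {
    t.trim().to_string()
}

// Awaiting inside a plain `for` loop works where `.map(|x| ...)` cannot;
// each item is awaited sequentially, matching the commit's approach.
async fn clean_all(raw: Vec<String>) -> Vec<String> {
    let mut out = Vec::new();
    for t in raw {
        out.push(clean(&t).await);
    }
    out
}
```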
@@ -197,8 +197,25 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
     // TODO: add site specific cleanups. For example:
     // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
     // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let mut body_tranformers: Vec<Box<dyn Transformer>> = vec![
-        Box::new(AddOutlink(link.clone())),
+    let body_tranformers: Vec<Box<dyn Transformer>> = vec![
+        // TODO: add a map of urls and selectors
+        Box::new(SlurpContents {
+            site_selectors: hashmap![
+                "hackaday.com".to_string() => vec![
+                    Selector::parse("div.entry-featured-image").unwrap(),
+                    Selector::parse("div.entry-content").unwrap()
+                ],
+                "mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
+                "natwelch.com".to_string() => vec![
+                    Selector::parse("article div.prose").unwrap(),
+                ],
+                "slashdot.org".to_string() => vec![
+                    Selector::parse("span.story-byline").unwrap(),
+                    Selector::parse("div.p").unwrap(),
+                ],
+            ],
+        }),
+        Box::new(AddOutlink),
         Box::new(EscapeHtml),
         Box::new(InlineStyle),
         Box::new(SanitizeHtml {
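Wiring in another site is one more entry in the map. A hypothetical example entry with the same shape as the ones above:

```rust
use std::collections::HashMap;

use maplit::hashmap;
use scraper::Selector;

// Hypothetical: slurp example.org posts via their main content div.
fn extra_sites() -> HashMap<String, Vec<Selector>> {
    hashmap![
        "example.org".to_string() => vec![
            Selector::parse("main div.article-body").unwrap(),
        ],
    ]
}
```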
@@ -207,15 +224,15 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
         }),
     ];
     for t in body_tranformers.iter() {
-        if t.should_run(&body) {
-            body = t.transform(&body)?;
+        if t.should_run(&link, &body) {
+            body = t.transform(&link, &body).await?;
         }
     }
     let body = Body::Html(Html {
         html: body,
         content_tree: "".to_string(),
     });
-    let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string()))?;
+    let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string())).await?;
     let from = Some(Email {
         name: r.name,
         addr: addr.map(|a| a.to_string()),
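The driver loop, extracted for clarity: each transformer gets a cheap synchronous veto via `should_run` before its async `transform` is awaited. A sketch over the same types:

```rust
// Sketch: the same dispatch that thread() performs inline.
async fn run_pipeline(
    transformers: &[Box<dyn Transformer>],
    link: &Option<Url>,
    mut body: String,
) -> Result<String, TransformError> {
    for t in transformers {
        if t.should_run(link, &body) {
            body = t.transform(link, &body).await?;
        }
    }
    Ok(body)
}
```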
@@ -254,7 +271,7 @@ pub async fn set_read_status<'ctx>(
         .await?;
     Ok(true)
 }
-fn clean_title(title: &str) -> Result<String, ServerError> {
+async fn clean_title(title: &str) -> Result<String, ServerError> {
     // Make title HTML so html parsers work
     let mut title = format!("<html>{title}</html>");
     let title_tranformers: Vec<Box<dyn Transformer>> =
@@ -262,8 +279,8 @@ fn clean_title(title: &str) -> Result<String, ServerError> {
     // Make title HTML so html parsers work
     title = format!("<html>{title}</html>");
     for t in title_tranformers.iter() {
-        if t.should_run(&title) {
-            title = t.transform(&title)?;
+        if t.should_run(&None, &title) {
+            title = t.transform(&None, &title).await?;
         }
     }
     Ok(title)