server: add ability to slurp contents from site

2024-08-25 19:37:53 -07:00
parent d98d429b5c
commit 71de3ef8ae
4 changed files with 661 additions and 82 deletions

@@ -11,6 +11,7 @@ ammonia = "3.3.0"
anyhow = "1.0.79"
async-graphql = { version = "6.0.11", features = ["log"] }
async-graphql-rocket = "6.0.11"
async-trait = "0.1.81"
css-inline = "0.13.0"
glog = "0.1.0"
html-escape = "0.2.13"
@@ -21,8 +22,10 @@ mailparse = "0.15.0"
maplit = "1.0.2"
memmap = "0.7.0"
notmuch = { path = "../notmuch" }
reqwest = { version = "0.12.7", features = ["blocking"] }
rocket = { version = "0.5.0-rc.2", features = [ "json" ] }
rocket_cors = "0.6.0"
scraper = "0.20.0"
serde = { version = "1.0.147", features = ["derive"] }
serde_json = "1.0.87"
shared = { path = "../shared" }

@@ -3,13 +3,15 @@ pub mod graphql;
pub mod newsreader;
pub mod nm;
use std::{convert::Infallible, str::FromStr};
use std::{collections::HashMap, convert::Infallible, str::FromStr};
use async_trait::async_trait;
use css_inline::{CSSInliner, InlineError, InlineOptions};
use linkify::{LinkFinder, LinkKind};
use log::{error, info};
use log::{error, info, warn};
use lol_html::{element, errors::RewritingError, rewrite_str, text, RewriteStrSettings};
use maplit::{hashmap, hashset};
use scraper::{error::SelectorErrorKind, Html, Selector};
use thiserror::Error;
use url::Url;
@@ -19,23 +21,28 @@ use crate::newsreader::{
const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE";
// TODO: figure out how to use Cow
trait Transformer {
fn should_run(&self, _html: &str) -> bool {
#[async_trait]
trait Transformer: Send + Sync {
fn should_run(&self, addr: &Option<Url>, _html: &str) -> bool {
true
}
// TODO: should html be something like `html_escape` uses (a Cow sketch follows this trait):
// <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
fn transform(&self, html: &str) -> Result<String, TransformError>;
async fn transform(&self, addr: &Option<Url>, html: &str) -> Result<String, TransformError>;
}
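
One direction the Cow TODOs above could take, sketched with hypothetical names (not part of this commit): returning Cow<str> lets a pass that changes nothing hand back a borrow instead of allocating a fresh String.

use std::borrow::Cow;

// Hypothetical synchronous cousin of `transform`; the html_escape-style
// signature from the TODO would behave the same way.
trait CowTransform {
    fn transform<'a>(&self, html: &'a str) -> Result<Cow<'a, str>, TransformError>;
}

struct Identity;

impl CowTransform for Identity {
    fn transform<'a>(&self, html: &'a str) -> Result<Cow<'a, str>, TransformError> {
        // Nothing to rewrite, so borrow the input rather than copying it.
        Ok(Cow::Borrowed(html))
    }
}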
// TODO: how would we make this more generic to allow good implementations of Transformer outside
// of this module?
#[derive(Error, Debug)]
pub enum TransformError {
#[error("lol-html rewrite error")]
#[error("lol-html rewrite error: {0}")]
RewritingError(#[from] RewritingError),
#[error("css inline error")]
#[error("css inline error: {0}")]
InlineError(#[from] InlineError),
#[error("failed to fetch url error: {0}")]
ReqwestError(#[from] reqwest::Error),
#[error("failed to parse HTML: {0}")]
HtmlParsingError(String),
}
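
The `{0}` payloads and the two new variants mean `?` now works directly on reqwest and HTML-parsing failures inside a transform, since #[from] derives the matching From impls. An illustrative helper (not in this commit):

// reqwest::Error converts into TransformError::ReqwestError via `?`.
async fn fetch_page(url: &Url) -> Result<String, TransformError> {
    Ok(reqwest::get(url.as_str()).await?.text().await?)
}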
struct SanitizeHtml<'a> {
@@ -43,31 +50,34 @@ struct SanitizeHtml<'a> {
base_url: &'a Option<Url>,
}
#[async_trait]
impl<'a> Transformer for SanitizeHtml<'a> {
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
Ok(sanitize_html(html, self.cid_prefix, self.base_url)?)
}
}
struct EscapeHtml;
#[async_trait]
impl Transformer for EscapeHtml {
fn should_run(&self, html: &str) -> bool {
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
html.contains("&")
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
Ok(html_escape::decode_html_entities(html).to_string())
}
}
struct StripHtml;
#[async_trait]
impl Transformer for StripHtml {
fn should_run(&self, html: &str) -> bool {
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
// Lame test
html.contains("<")
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
let mut text = String::new();
let element_content_handlers = vec![text!("*", |t| {
text += t.as_str();
@@ -87,8 +97,9 @@ impl Transformer for StripHtml {
struct InlineStyle;
#[async_trait]
impl Transformer for InlineStyle {
fn transform(&self, html: &str) -> Result<String, TransformError> {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
let css = concat!(
"/* chrome-default.css */\n",
include_str!("chrome-default.css"),
@@ -118,29 +129,78 @@ impl Transformer for InlineStyle {
}
}
struct AddOutlink(Option<url::Url>);
struct AddOutlink;
#[async_trait]
impl Transformer for AddOutlink {
fn should_run(&self, html: &str) -> bool {
if let Some(link) = &self.0 {
return link.scheme().starts_with("http") && !html.contains(link.as_str());
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
if let Some(link) = link {
link.scheme().starts_with("http") && !html.contains(link.as_str())
} else {
false
}
false
}
fn transform(&self, html: &str) -> Result<String, TransformError> {
if let Some(url) = &self.0 {
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
if let Some(link) = link {
Ok(format!(
r#"
{html}
<div><a href="{}">View on site</a></div>
"#,
url
link
))
} else {
Ok(html.to_string())
}
}
}
struct SlurpContents {
site_selectors: HashMap<String, Vec<Selector>>,
}
impl SlurpContents {
fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
for (host, selectors) in self.site_selectors.iter() {
if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
return Some(selectors);
}
}
None
}
}
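
Note that get_selectors matches hosts with a substring `contains`, which is loose on purpose ("hackaday.com" also catches "www.hackaday.com") but would equally catch an unrelated host that merely embeds the string. If stricter matching is ever wanted, a suffix check along these lines would do (hypothetical helper, not in this commit):

// Accept the configured host itself or any of its subdomains, nothing else.
fn host_matches(host: &str, configured: &str) -> bool {
    host == configured || host.ends_with(&format!(".{configured}"))
}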
#[async_trait]
impl Transformer for SlurpContents {
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
if let Some(link) = link {
return self.get_selectors(link).is_some();
}
false
}
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
let Some(link) = link else {
return Ok(html.to_string());
};
let Some(selectors) = self.get_selectors(&link) else {
return Ok(html.to_string());
};
let body = reqwest::get(link.as_str()).await?.text().await?;
let doc = Html::parse_document(&body);
let mut results = Vec::new();
for selector in selectors {
if let Some(frag) = doc.select(&selector).next() {
results.push(frag.html())
} else {
warn!("couldn't find '{:?}' in {}", selector, link);
return Ok(html.to_string());
}
}
Ok(results.join("<br><br>"))
}
}
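
A rough usage sketch of the new transformer, assuming a tokio runtime; the site, selector, and URL below are illustrative only:

use maplit::hashmap;
use scraper::Selector;
use url::Url;

#[tokio::main]
async fn main() -> Result<(), TransformError> {
    let slurp = SlurpContents {
        site_selectors: hashmap![
            "example.com".to_string() => vec![Selector::parse("article").unwrap()],
        ],
    };
    let link = Some(Url::parse("https://example.com/post").unwrap());
    // should_run is cheap: it only checks whether selectors exist for the host.
    if slurp.should_run(&link, "") {
        // transform fetches the page and replaces the feed snippet with the
        // selected fragments, joined by <br><br>.
        let full = slurp.transform(&link, "<p>feed summary</p>").await?;
        println!("{full}");
    }
    Ok(())
}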
pub fn linkify_html(text: &str) -> String {
let mut finder = LinkFinder::new();
let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);

@@ -1,6 +1,8 @@
use std::hash::{DefaultHasher, Hash, Hasher};
use log::info;
use maplit::hashmap;
use scraper::Selector;
use sqlx::postgres::PgPool;
use url::Url;
@@ -13,7 +15,7 @@ use crate::{
compute_offset_limit,
error::ServerError,
graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer,
AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, SlurpContents, StripHtml, Transformer,
};
pub fn is_newsreader_search(query: &str) -> bool {
@@ -89,36 +91,34 @@ pub async fn search(
.fetch_all(pool)
.await?;
Ok(rows
.into_iter()
.enumerate()
.map(|(i, r)| {
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
if !r.is_read.unwrap_or(true) {
tags.push("unread".to_string());
};
let mut title = r.title.unwrap_or("NO TITLE".to_string());
title = clean_title(&title).expect("failed to clean title");
(
i as i32 + offset,
ThreadSummary {
thread: format!("{THREAD_PREFIX}{}", r.uid),
timestamp: r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
},
)
})
.collect())
let mut res = Vec::new();
for (i, r) in rows.into_iter().enumerate() {
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
if !r.is_read.unwrap_or(true) {
tags.push("unread".to_string());
};
let mut title = r.title.unwrap_or("NO TITLE".to_string());
title = clean_title(&title).await.expect("failed to clean title");
res.push((
i as i32 + offset,
ThreadSummary {
thread: format!("{THREAD_PREFIX}{}", r.uid),
timestamp: r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
},
));
}
Ok(res)
}
pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
@@ -197,8 +197,25 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
// TODO: add site-specific cleanups. For example:
// * Grafana does <div class="image-wrapp"><img class="lazyload"><img src="/media/..."></img></div>
// * Some sites appear to be HTML-encoded; decode them (e.g. imperialviolet)
let mut body_transformers: Vec<Box<dyn Transformer>> = vec![
Box::new(AddOutlink(link.clone())),
let body_transformers: Vec<Box<dyn Transformer>> = vec![
// TODO: load this site → selector map from config instead of hard-coding it (sketch below)
Box::new(SlurpContents {
site_selectors: hashmap![
"hackaday.com".to_string() => vec![
Selector::parse("div.entry-featured-image").unwrap(),
Selector::parse("div.entry-content").unwrap()
],
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
"natwelch.com".to_string() => vec![
Selector::parse("article div.prose").unwrap(),
],
"slashdot.org".to_string() => vec![
Selector::parse("span.story-byline").unwrap(),
Selector::parse("div.p").unwrap(),
],
],
}),
Box::new(AddOutlink),
Box::new(EscapeHtml),
Box::new(InlineStyle),
Box::new(SanitizeHtml {
@@ -207,15 +224,15 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
}),
];
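
For the config TODO above, the hard-coded map could later be read from JSON through the existing serde_json dependency; a hypothetical loader (file format and function name are assumptions):

use std::collections::HashMap;
use scraper::Selector;

// Turns {"hackaday.com": ["div.entry-content", ...], ...} into parsed Selectors.
fn selectors_from_config(raw: &str) -> HashMap<String, Vec<Selector>> {
    let parsed: HashMap<String, Vec<String>> =
        serde_json::from_str(raw).expect("invalid selector config");
    parsed
        .into_iter()
        .map(|(host, sels)| {
            let sels = sels
                .iter()
                .map(|s| Selector::parse(s).expect("bad selector in config"))
                .collect();
            (host, sels)
        })
        .collect()
}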
for t in body_transformers.iter() {
if t.should_run(&body) {
body = t.transform(&body)?;
if t.should_run(&link, &body) {
body = t.transform(&link, &body).await?;
}
}
let body = Body::Html(Html {
html: body,
content_tree: "".to_string(),
});
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string()))?;
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string())).await?;
let from = Some(Email {
name: r.name,
addr: addr.map(|a| a.to_string()),
@@ -254,7 +271,7 @@ pub async fn set_read_status<'ctx>(
.await?;
Ok(true)
}
fn clean_title(title: &str) -> Result<String, ServerError> {
async fn clean_title(title: &str) -> Result<String, ServerError> {
// Make title HTML so html parsers work
let mut title = format!("<html>{title}</html>");
let title_transformers: Vec<Box<dyn Transformer>> =
@@ -262,8 +279,8 @@ fn clean_title(title: &str) -> Result<String, ServerError> {
// Make title HTML so html parsers work
title = format!("<html>{title}</html>");
for t in title_transformers.iter() {
if t.should_run(&title) {
title = t.transform(&title)?;
if t.should_run(&None, &title) {
title = t.transform(&None, &title).await?;
}
}
Ok(title)
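
Assuming the (elided) title pipeline decodes entities and strips tags, consistent with EscapeHtml and StripHtml above, a hypothetical test of the now-async clean_title:

// Not part of this commit; illustrates the expected behavior.
#[tokio::test]
async fn clean_title_drops_markup() {
    let cleaned = clean_title("What&#8217;s <em>new</em>?").await.unwrap();
    assert!(!cleaned.contains('<'), "tags should be stripped: {cleaned}");
}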