server: add ability to slurp contents from site

2024-08-25 19:37:53 -07:00
parent d98d429b5c
commit 71de3ef8ae
4 changed files with 661 additions and 82 deletions
--- a/server/src/newsreader.rs
+++ b/server/src/newsreader.rs
@@ -1,6 +1,8 @@
 use std::hash::{DefaultHasher, Hash, Hasher};

 use log::info;
+use maplit::hashmap;
+use scraper::Selector;
 use sqlx::postgres::PgPool;
 use url::Url;

@@ -13,7 +15,7 @@ use crate::{
    compute_offset_limit,
    error::ServerError,
    graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
-    AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer,
+    AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, SlurpContents, StripHtml, Transformer,
 };

 pub fn is_newsreader_search(query: &str) -> bool {
@@ -89,36 +91,34 @@ pub async fn search(
    .fetch_all(pool)
    .await?;

-    Ok(rows
-        .into_iter()
-        .enumerate()
-        .map(|(i, r)| {
-            let site = r.site.unwrap_or("UNKOWN TAG".to_string());
-            let mut tags = vec![format!("{TAG_PREFIX}{site}")];
-            if !r.is_read.unwrap_or(true) {
-                tags.push("unread".to_string());
-            };
-            let mut title = r.title.unwrap_or("NO TITLE".to_string());
-            title = clean_title(&title).expect("failed to clean title");
-            (
-                i as i32 + offset,
-                ThreadSummary {
-                    thread: format!("{THREAD_PREFIX}{}", r.uid),
-                    timestamp: r
-                        .date
-                        .expect("post missing date")
-                        .assume_utc()
-                        .unix_timestamp() as isize,
-                    date_relative: "TODO date_relative".to_string(),
-                    matched: 0,
-                    total: 1,
-                    authors: r.name.unwrap_or_else(|| site.clone()),
-                    subject: title,
-                    tags,
-                },
-            )
-        })
-        .collect())
+    let mut res = Vec::new();
+    for (i, r) in rows.into_iter().enumerate() {
+        let site = r.site.unwrap_or("UNKOWN TAG".to_string());
+        let mut tags = vec![format!("{TAG_PREFIX}{site}")];
+        if !r.is_read.unwrap_or(true) {
+            tags.push("unread".to_string());
+        };
+        let mut title = r.title.unwrap_or("NO TITLE".to_string());
+        title = clean_title(&title).await.expect("failed to clean title");
+        res.push((
+            i as i32 + offset,
+            ThreadSummary {
+                thread: format!("{THREAD_PREFIX}{}", r.uid),
+                timestamp: r
+                    .date
+                    .expect("post missing date")
+                    .assume_utc()
+                    .unix_timestamp() as isize,
+                date_relative: "TODO date_relative".to_string(),
+                matched: 0,
+                total: 1,
+                authors: r.name.unwrap_or_else(|| site.clone()),
+                subject: title,
+                tags,
+            },
+        ));
+    }
+    Ok(res)
 }

 pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
@@ -197,8 +197,25 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
    // TODO: add site specific cleanups. For example:
    // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
    // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let mut body_tranformers: Vec<Box<dyn Transformer>> = vec![
-        Box::new(AddOutlink(link.clone())),
+    let body_tranformers: Vec<Box<dyn Transformer>> = vec![
+        // TODO: add a map of urls and selectors
+        Box::new(SlurpContents {
+            site_selectors: hashmap![
+                "hackaday.com".to_string() => vec![
+                    Selector::parse("div.entry-featured-image").unwrap(),
+                    Selector::parse("div.entry-content").unwrap()
+                ],
+                "mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
+                "natwelch.com".to_string() => vec![
+                    Selector::parse("article div.prose").unwrap(),
+                ],
+                "slashdot.org".to_string() => vec![
+                    Selector::parse("span.story-byline").unwrap(),
+                    Selector::parse("div.p").unwrap(),
+                ],
+            ],
+        }),
+        Box::new(AddOutlink),
        Box::new(EscapeHtml),
        Box::new(InlineStyle),
        Box::new(SanitizeHtml {
@@ -207,15 +224,15 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
        }),
    ];
    for t in body_tranformers.iter() {
-        if t.should_run(&body) {
-            body = t.transform(&body)?;
+        if t.should_run(&link, &body) {
+            body = t.transform(&link, &body).await?;
        }
    }
    let body = Body::Html(Html {
        html: body,
        content_tree: "".to_string(),
    });
-    let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string()))?;
+    let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string())).await?;
    let from = Some(Email {
        name: r.name,
        addr: addr.map(|a| a.to_string()),
@@ -254,7 +271,7 @@ pub async fn set_read_status<'ctx>(
        .await?;
    Ok(true)
 }
-fn clean_title(title: &str) -> Result<String, ServerError> {
+async fn clean_title(title: &str) -> Result<String, ServerError> {
    // Make title HTML so html parsers work
    let mut title = format!("<html>{title}</html>");
    let title_tranformers: Vec<Box<dyn Transformer>> =
@@ -262,8 +279,8 @@ fn clean_title(title: &str) -> Result<String, ServerError> {
    // Make title HTML so html parsers work
    title = format!("<html>{title}</html>");
    for t in title_tranformers.iter() {
-        if t.should_run(&title) {
-            title = t.transform(&title)?;
+        if t.should_run(&None, &title) {
+            title = t.transform(&None, &title).await?;
        }
    }
    Ok(title)