server: add ability to slurp contents from site

This commit is contained in:
2024-08-25 19:37:53 -07:00
parent d98d429b5c
commit 71de3ef8ae
4 changed files with 661 additions and 82 deletions

View File

@@ -1,6 +1,8 @@
use std::hash::{DefaultHasher, Hash, Hasher};
use log::info;
use maplit::hashmap;
use scraper::Selector;
use sqlx::postgres::PgPool;
use url::Url;
@@ -13,7 +15,7 @@ use crate::{
compute_offset_limit,
error::ServerError,
graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer,
AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, SlurpContents, StripHtml, Transformer,
};
pub fn is_newsreader_search(query: &str) -> bool {
@@ -89,36 +91,34 @@ pub async fn search(
.fetch_all(pool)
.await?;
Ok(rows
.into_iter()
.enumerate()
.map(|(i, r)| {
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
if !r.is_read.unwrap_or(true) {
tags.push("unread".to_string());
};
let mut title = r.title.unwrap_or("NO TITLE".to_string());
title = clean_title(&title).expect("failed to clean title");
(
i as i32 + offset,
ThreadSummary {
thread: format!("{THREAD_PREFIX}{}", r.uid),
timestamp: r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
},
)
})
.collect())
let mut res = Vec::new();
for (i, r) in rows.into_iter().enumerate() {
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
if !r.is_read.unwrap_or(true) {
tags.push("unread".to_string());
};
let mut title = r.title.unwrap_or("NO TITLE".to_string());
title = clean_title(&title).await.expect("failed to clean title");
res.push((
i as i32 + offset,
ThreadSummary {
thread: format!("{THREAD_PREFIX}{}", r.uid),
timestamp: r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
},
));
}
Ok(res)
}
pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
@@ -197,8 +197,25 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
// TODO: add site specific cleanups. For example:
// * Grafana does <div class="image-wrapp"><img class="lazyload"><img src="/media/..."></img></div>
// * Some sites appear to be HTML-encoded; decode them, e.g. imperialviolent
let mut body_tranformers: Vec<Box<dyn Transformer>> = vec![
Box::new(AddOutlink(link.clone())),
let body_tranformers: Vec<Box<dyn Transformer>> = vec![
// TODO: add a map of urls and selectors
Box::new(SlurpContents {
site_selectors: hashmap![
"hackaday.com".to_string() => vec![
Selector::parse("div.entry-featured-image").unwrap(),
Selector::parse("div.entry-content").unwrap()
],
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
"natwelch.com".to_string() => vec![
Selector::parse("article div.prose").unwrap(),
],
"slashdot.org".to_string() => vec![
Selector::parse("span.story-byline").unwrap(),
Selector::parse("div.p").unwrap(),
],
],
}),
Box::new(AddOutlink),
Box::new(EscapeHtml),
Box::new(InlineStyle),
Box::new(SanitizeHtml {
@@ -207,15 +224,15 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
}),
];
for t in body_tranformers.iter() {
if t.should_run(&body) {
body = t.transform(&body)?;
if t.should_run(&link, &body) {
body = t.transform(&link, &body).await?;
}
}
let body = Body::Html(Html {
html: body,
content_tree: "".to_string(),
});
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string()))?;
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string())).await?;
let from = Some(Email {
name: r.name,
addr: addr.map(|a| a.to_string()),
@@ -254,7 +271,7 @@ pub async fn set_read_status<'ctx>(
.await?;
Ok(true)
}
fn clean_title(title: &str) -> Result<String, ServerError> {
async fn clean_title(title: &str) -> Result<String, ServerError> {
// Make title HTML so html parsers work
let mut title = format!("<html>{title}</html>");
let title_tranformers: Vec<Box<dyn Transformer>> =
@@ -262,8 +279,8 @@ fn clean_title(title: &str) -> Result<String, ServerError> {
// Make title HTML so html parsers work
title = format!("<html>{title}</html>");
for t in title_tranformers.iter() {
if t.should_run(&title) {
title = t.transform(&title)?;
if t.should_run(&None, &title) {
title = t.transform(&None, &title).await?;
}
}
Ok(title)