server: add ability to slurp contents from site
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
use std::hash::{DefaultHasher, Hash, Hasher};
|
||||
|
||||
use log::info;
|
||||
use maplit::hashmap;
|
||||
use scraper::Selector;
|
||||
use sqlx::postgres::PgPool;
|
||||
use url::Url;
|
||||
|
||||
@@ -13,7 +15,7 @@ use crate::{
|
||||
compute_offset_limit,
|
||||
error::ServerError,
|
||||
graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
|
||||
AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer,
|
||||
AddOutlink, EscapeHtml, InlineStyle, SanitizeHtml, SlurpContents, StripHtml, Transformer,
|
||||
};
|
||||
|
||||
pub fn is_newsreader_search(query: &str) -> bool {
|
||||
@@ -89,36 +91,34 @@ pub async fn search(
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
|
||||
Ok(rows
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(i, r)| {
|
||||
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
|
||||
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
|
||||
if !r.is_read.unwrap_or(true) {
|
||||
tags.push("unread".to_string());
|
||||
};
|
||||
let mut title = r.title.unwrap_or("NO TITLE".to_string());
|
||||
title = clean_title(&title).expect("failed to clean title");
|
||||
(
|
||||
i as i32 + offset,
|
||||
ThreadSummary {
|
||||
thread: format!("{THREAD_PREFIX}{}", r.uid),
|
||||
timestamp: r
|
||||
.date
|
||||
.expect("post missing date")
|
||||
.assume_utc()
|
||||
.unix_timestamp() as isize,
|
||||
date_relative: "TODO date_relative".to_string(),
|
||||
matched: 0,
|
||||
total: 1,
|
||||
authors: r.name.unwrap_or_else(|| site.clone()),
|
||||
subject: title,
|
||||
tags,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect())
|
||||
let mut res = Vec::new();
|
||||
for (i, r) in rows.into_iter().enumerate() {
|
||||
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
|
||||
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
|
||||
if !r.is_read.unwrap_or(true) {
|
||||
tags.push("unread".to_string());
|
||||
};
|
||||
let mut title = r.title.unwrap_or("NO TITLE".to_string());
|
||||
title = clean_title(&title).await.expect("failed to clean title");
|
||||
res.push((
|
||||
i as i32 + offset,
|
||||
ThreadSummary {
|
||||
thread: format!("{THREAD_PREFIX}{}", r.uid),
|
||||
timestamp: r
|
||||
.date
|
||||
.expect("post missing date")
|
||||
.assume_utc()
|
||||
.unix_timestamp() as isize,
|
||||
date_relative: "TODO date_relative".to_string(),
|
||||
matched: 0,
|
||||
total: 1,
|
||||
authors: r.name.unwrap_or_else(|| site.clone()),
|
||||
subject: title,
|
||||
tags,
|
||||
},
|
||||
));
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
|
||||
@@ -197,8 +197,25 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
|
||||
// TODO: add site specific cleanups. For example:
|
||||
// * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
|
||||
// * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
|
||||
let mut body_tranformers: Vec<Box<dyn Transformer>> = vec![
|
||||
Box::new(AddOutlink(link.clone())),
|
||||
let body_tranformers: Vec<Box<dyn Transformer>> = vec![
|
||||
// TODO: add a map of urls and selectors
|
||||
Box::new(SlurpContents {
|
||||
site_selectors: hashmap![
|
||||
"hackaday.com".to_string() => vec![
|
||||
Selector::parse("div.entry-featured-image").unwrap(),
|
||||
Selector::parse("div.entry-content").unwrap()
|
||||
],
|
||||
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
|
||||
"natwelch.com".to_string() => vec![
|
||||
Selector::parse("article div.prose").unwrap(),
|
||||
],
|
||||
"slashdot.org".to_string() => vec![
|
||||
Selector::parse("span.story-byline").unwrap(),
|
||||
Selector::parse("div.p").unwrap(),
|
||||
],
|
||||
],
|
||||
}),
|
||||
Box::new(AddOutlink),
|
||||
Box::new(EscapeHtml),
|
||||
Box::new(InlineStyle),
|
||||
Box::new(SanitizeHtml {
|
||||
@@ -207,15 +224,15 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
|
||||
}),
|
||||
];
|
||||
for t in body_tranformers.iter() {
|
||||
if t.should_run(&body) {
|
||||
body = t.transform(&body)?;
|
||||
if t.should_run(&link, &body) {
|
||||
body = t.transform(&link, &body).await?;
|
||||
}
|
||||
}
|
||||
let body = Body::Html(Html {
|
||||
html: body,
|
||||
content_tree: "".to_string(),
|
||||
});
|
||||
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string()))?;
|
||||
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string())).await?;
|
||||
let from = Some(Email {
|
||||
name: r.name,
|
||||
addr: addr.map(|a| a.to_string()),
|
||||
@@ -254,7 +271,7 @@ pub async fn set_read_status<'ctx>(
|
||||
.await?;
|
||||
Ok(true)
|
||||
}
|
||||
fn clean_title(title: &str) -> Result<String, ServerError> {
|
||||
async fn clean_title(title: &str) -> Result<String, ServerError> {
|
||||
// Make title HTML so html parsers work
|
||||
let mut title = format!("<html>{title}</html>");
|
||||
let title_tranformers: Vec<Box<dyn Transformer>> =
|
||||
@@ -262,8 +279,8 @@ fn clean_title(title: &str) -> Result<String, ServerError> {
|
||||
// Make title HTML so html parsers work
|
||||
title = format!("<html>{title}</html>");
|
||||
for t in title_tranformers.iter() {
|
||||
if t.should_run(&title) {
|
||||
title = t.transform(&title)?;
|
||||
if t.should_run(&None, &title) {
|
||||
title = t.transform(&None, &title).await?;
|
||||
}
|
||||
}
|
||||
Ok(title)
|
||||
|
||||
Reference in New Issue
Block a user