diff --git a/server/Rocket.toml b/server/Rocket.toml index 3178201..c0a48d1 100644 --- a/server/Rocket.toml +++ b/server/Rocket.toml @@ -12,3 +12,43 @@ port = 9345 newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader" newsreader_tantivy_db_path = "../target/database/newsreader" slurp_cache_path = "/net/nasx/x/letterbox/slurp" + +[debug.slurp_site_selectors] +"atmeta.com" = [ + "div.entry-content" +] +"blog.prusa3d.com" = [ + "article.content .post-block" +] +"blog.cloudflare.com" = [ + ".author-lists .author-name-tooltip", + ".post-full-content" +] +"blog.zsa.io" = [ + "section.blog-article" +] +"engineering.fb.com" = [ + "article" +] +"hackaday.com" = [ + "div.entry-featured-image", + "div.entry-content" +] +"mitchellh.com" = [ + "div.w-full" +] +"natwelch.com" = [ + "article div.prose" +] +"slashdot.org" = [ + "span.story-byline", + "div.p" +] +"www.redox-os.org" = [ + "div.content" +] +"www.smbc-comics.com" = [ + "img#cc-comic", + "div#aftercomic img" +] + diff --git a/server/src/bin/server.rs b/server/src/bin/server.rs index 80f1cbf..434cf2f 100644 --- a/server/src/bin/server.rs +++ b/server/src/bin/server.rs @@ -347,6 +347,7 @@ async fn main() -> Result<(), Box> { .attach(AdHoc::config::()); let config: Config = rkt.figment().extract()?; + info!("Config:\n{config:#?}"); if !std::fs::exists(&config.slurp_cache_path)? { info!("Creating slurp cache @ '{}'", &config.slurp_cache_path); std::fs::create_dir_all(&config.slurp_cache_path)?; diff --git a/server/src/config.rs b/server/src/config.rs index 15fe116..7d2b5f8 100644 --- a/server/src/config.rs +++ b/server/src/config.rs @@ -1,7 +1,23 @@ -use serde::Deserialize; -#[derive(Deserialize)] +use std::{collections::HashMap, fmt::Display, str::FromStr}; + +use scraper::Selector; +use serde::{de, Deserialize, Deserializer}; + +#[derive(Debug)] +pub struct DeSelector(pub Selector); +impl<'de> Deserialize<'de> for DeSelector { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + Ok(DeSelector(Selector::parse(&s).map_err(de::Error::custom)?)) + } +} +#[derive(Debug, Deserialize)] pub struct Config { pub newsreader_database_url: String, pub newsreader_tantivy_db_path: String, pub slurp_cache_path: String, + pub slurp_site_selectors: HashMap>, } diff --git a/server/src/lib.rs b/server/src/lib.rs index 04d8b19..e7075ab 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -21,7 +21,10 @@ use thiserror::Error; use tokio::sync::Mutex; use url::Url; -use crate::newsreader::{extract_thread_id, is_newsreader_thread}; +use crate::{ + config::DeSelector, + newsreader::{extract_thread_id, is_newsreader_thread}, +}; // TODO: figure out how to use Cow #[async_trait] @@ -215,13 +218,13 @@ impl Transformer for AddOutlink { } } -struct SlurpContents { +struct SlurpContents<'h> { cacher: Arc>, - site_selectors: HashMap>, + site_selectors: &'h HashMap>, } -impl SlurpContents { - fn get_selectors(&self, link: &Url) -> Option<&[Selector]> { +impl<'h> SlurpContents<'h> { + fn get_selectors(&self, link: &Url) -> Option<&[DeSelector]> { for (host, selector) in self.site_selectors.iter() { if link.host_str().map(|h| h.contains(host)).unwrap_or(false) { return Some(&selector); @@ -232,7 +235,7 @@ impl SlurpContents { } #[async_trait] -impl Transformer for SlurpContents { +impl<'h> Transformer for SlurpContents<'h> { fn should_run(&self, link: &Option, _: &str) -> bool { if let Some(link) = link { return self.get_selectors(link).is_some(); @@ -259,7 +262,7 @@ impl Transformer for SlurpContents { let mut results = Vec::new(); for selector in selectors { - for frag in doc.select(&selector) { + for frag in doc.select(&selector.0) { results.push(frag.html()) // TODO: figure out how to warn if there were no hits //warn!("couldn't find '{:?}' in {}", selector, link); diff --git a/server/src/newsreader.rs b/server/src/newsreader.rs index 266c4b4..b4c90e7 100644 --- a/server/src/newsreader.rs +++ b/server/src/newsreader.rs @@ -181,43 +181,7 @@ pub async fn thread( let body_tranformers: Vec> = vec![ Box::new(SlurpContents { cacher, - site_selectors: hashmap![ - "atmeta.com".to_string() => vec![ - Selector::parse("div.entry-content").unwrap(), - ], - "blog.prusa3d.com".to_string() => vec![ - Selector::parse("article.content .post-block").unwrap(), - ], - "blog.cloudflare.com".to_string() => vec![ - Selector::parse(".author-lists .author-name-tooltip").unwrap(), - Selector::parse(".post-full-content").unwrap() - ], - "blog.zsa.io".to_string() => vec![ - Selector::parse("section.blog-article").unwrap(), - ], - "engineering.fb.com".to_string() => vec![ - Selector::parse("article").unwrap(), - ], - "hackaday.com".to_string() => vec![ - Selector::parse("div.entry-featured-image").unwrap(), - Selector::parse("div.entry-content").unwrap() - ], - "mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()], - "natwelch.com".to_string() => vec![ - Selector::parse("article div.prose").unwrap(), - ], - "slashdot.org".to_string() => vec![ - Selector::parse("span.story-byline").unwrap(), - Selector::parse("div.p").unwrap(), - ], - "www.redox-os.org".to_string() => vec![ - Selector::parse("div.content").unwrap(), - ], - "www.smbc-comics.com".to_string() => vec![ - Selector::parse("img#cc-comic").unwrap(), - Selector::parse("div#aftercomic img").unwrap(), - ], - ], + site_selectors: &config.slurp_site_selectors, }), Box::new(FrameImages), Box::new(AddOutlink),