Load slurp site selectors from the Rocket.toml config file

2024-09-21 12:52:08 -07:00 · 2024-09-21 12:52:08 -07:00 · 86805f38e3
commit 86805f38e3
parent 62b17bd6a6
5 changed files with 70 additions and 46 deletions
--- a/server/Rocket.toml
+++ b/server/Rocket.toml
@ -12,3 +12,43 @@ port = 9345
 newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
 newsreader_tantivy_db_path = "../target/database/newsreader"
 slurp_cache_path = "/net/nasx/x/letterbox/slurp"
+
+[debug.slurp_site_selectors] # host substring => CSS selectors to extract; NOTE(review): only under `debug` here, confirm other profiles
+"atmeta.com" = [
+  "div.entry-content"
+]
+"blog.prusa3d.com" = [
+  "article.content .post-block"
+]
+"blog.cloudflare.com" = [
+  ".author-lists .author-name-tooltip",
+  ".post-full-content"
+]
+"blog.zsa.io" = [
+  "section.blog-article"
+]
+"engineering.fb.com" = [
+  "article"
+]
+"hackaday.com" = [
+  "div.entry-featured-image",
+  "div.entry-content"
+]
+"mitchellh.com" = [
+  "div.w-full"
+]
+"natwelch.com" = [
+  "article div.prose"
+]
+"slashdot.org" = [
+  "span.story-byline",
+  "div.p"
+]
+"www.redox-os.org" = [
+  "div.content"
+]
+"www.smbc-comics.com" = [
+  "img#cc-comic",
+  "div#aftercomic img"
+]
+
--- a/server/src/bin/server.rs
+++ b/server/src/bin/server.rs
@ -347,6 +347,7 @@ async fn main() -> Result<(), Box<dyn Error>> {
        .attach(AdHoc::config::<Config>());

    let config: Config = rkt.figment().extract()?;
+    info!("Config:\n{config:#?}");
    if !std::fs::exists(&config.slurp_cache_path)? {
        info!("Creating slurp cache @ '{}'", &config.slurp_cache_path);
        std::fs::create_dir_all(&config.slurp_cache_path)?;
--- a/server/src/config.rs
+++ b/server/src/config.rs
@ -1,7 +1,23 @@
-use serde::Deserialize;
-#[derive(Deserialize)]
+use std::{collections::HashMap, fmt::Display, str::FromStr};
+
+use scraper::Selector;
+use serde::{de, Deserialize, Deserializer};
+
+/// Newtype over [`Selector`] so CSS selectors can be deserialized from plain config strings.
+#[derive(Debug)]
+pub struct DeSelector(pub Selector);
+impl<'de> Deserialize<'de> for DeSelector {
+    /// Reads a string and parses it as a CSS selector; a parse failure becomes a `D::Error`.
+    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        let raw = String::deserialize(deserializer)?;
+        // Route the selector parse error through serde's custom-error hook.
+        Selector::parse(&raw).map(DeSelector).map_err(de::Error::custom)
+    }
+}
+#[derive(Debug, Deserialize)] // Debug lets server startup log the extracted config via {config:#?}
 pub struct Config {
    pub newsreader_database_url: String,
    pub newsreader_tantivy_db_path: String,
    pub slurp_cache_path: String,
+    pub slurp_site_selectors: HashMap<String, Vec<DeSelector>>, // host substring -> selectors, parsed while deserializing
 }
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@ -21,7 +21,10 @@ use thiserror::Error;
 use tokio::sync::Mutex;
 use url::Url;

-use crate::newsreader::{extract_thread_id, is_newsreader_thread};
+use crate::{
+    config::DeSelector,
+    newsreader::{extract_thread_id, is_newsreader_thread},
+};

 // TODO: figure out how to use Cow
 #[async_trait]
@ -215,13 +218,13 @@ impl Transformer for AddOutlink {
    }
 }

-struct SlurpContents {
+struct SlurpContents<'h> { // 'h is the lifetime of the borrowed selector map below
    cacher: Arc<Mutex<FilesystemCacher>>,
-    site_selectors: HashMap<String, Vec<Selector>>,
+    site_selectors: &'h HashMap<String, Vec<DeSelector>>, // borrowed from the loaded Config instead of an owned inline map
 }

-impl SlurpContents {
-    fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
+impl<'h> SlurpContents<'h> {
+    fn get_selectors(&self, link: &Url) -> Option<&[DeSelector]> {
        for (host, selector) in self.site_selectors.iter() {
            if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
                return Some(&selector);
@ -232,7 +235,7 @@ impl SlurpContents {
 }

 #[async_trait]
-impl Transformer for SlurpContents {
+impl<'h> Transformer for SlurpContents<'h> {
    fn should_run(&self, link: &Option<Url>, _: &str) -> bool {
        if let Some(link) = link {
            return self.get_selectors(link).is_some();
@ -259,7 +262,7 @@ impl Transformer for SlurpContents {

        let mut results = Vec::new();
        for selector in selectors {
-            for frag in doc.select(&selector) {
+            for frag in doc.select(&selector.0) {
                results.push(frag.html())
                // TODO: figure out how to warn if there were no hits
                //warn!("couldn't find '{:?}' in {}", selector, link);
--- a/server/src/newsreader.rs
+++ b/server/src/newsreader.rs
@ -181,43 +181,7 @@ pub async fn thread(
    let body_tranformers: Vec<Box<dyn Transformer>> = vec![
        Box::new(SlurpContents {
            cacher,
-            site_selectors: hashmap![
-                "atmeta.com".to_string() => vec![
-                    Selector::parse("div.entry-content").unwrap(),
-                ],
-                "blog.prusa3d.com".to_string() => vec![
-                    Selector::parse("article.content .post-block").unwrap(),
-                ],
-                "blog.cloudflare.com".to_string() => vec![
-                    Selector::parse(".author-lists .author-name-tooltip").unwrap(),
-                    Selector::parse(".post-full-content").unwrap()
-                ],
-                "blog.zsa.io".to_string() => vec![
-                    Selector::parse("section.blog-article").unwrap(),
-                ],
-                "engineering.fb.com".to_string() => vec![
-                    Selector::parse("article").unwrap(),
-                ],
-                "hackaday.com".to_string() => vec![
-                    Selector::parse("div.entry-featured-image").unwrap(),
-                    Selector::parse("div.entry-content").unwrap()
-                ],
-                "mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
-                "natwelch.com".to_string() => vec![
-                    Selector::parse("article div.prose").unwrap(),
-                ],
-                "slashdot.org".to_string() => vec![
-                    Selector::parse("span.story-byline").unwrap(),
-                    Selector::parse("div.p").unwrap(),
-                ],
-                "www.redox-os.org".to_string() => vec![
-                    Selector::parse("div.content").unwrap(),
-                ],
-                "www.smbc-comics.com".to_string() => vec![
-                    Selector::parse("img#cc-comic").unwrap(),
-                    Selector::parse("div#aftercomic img").unwrap(),
-                ],
-            ],
+            site_selectors: &config.slurp_site_selectors,
        }),
        Box::new(FrameImages),
        Box::new(AddOutlink),