Load slurp config from toml file

This commit is contained in:
Bill Thiede 2024-09-21 12:52:08 -07:00
parent 62b17bd6a6
commit 86805f38e3
5 changed files with 70 additions and 46 deletions

View File

@ -12,3 +12,43 @@ port = 9345
newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
newsreader_tantivy_db_path = "../target/database/newsreader"
slurp_cache_path = "/net/nasx/x/letterbox/slurp"
# Per-site CSS selectors used by the slurp transformer.
#
# Each key is a host pattern: a link is handled when its host *contains* the
# key as a substring (not an exact match). Each value is a list of CSS
# selectors; every one must parse as a valid selector or config loading fails.
# The HTML of every fragment matching each selector is collected, walking the
# selectors in the order listed here.
#
# NOTE(review): the `debug` prefix looks like a configuration profile name, so
# these selectors presumably only apply under that profile -- confirm whether
# other profiles need their own table.
[debug.slurp_site_selectors]
"atmeta.com" = [
"div.entry-content"
]
"blog.prusa3d.com" = [
"article.content .post-block"
]
"blog.cloudflare.com" = [
".author-lists .author-name-tooltip",
".post-full-content"
]
"blog.zsa.io" = [
"section.blog-article"
]
"engineering.fb.com" = [
"article"
]
"hackaday.com" = [
"div.entry-featured-image",
"div.entry-content"
]
"mitchellh.com" = [
"div.w-full"
]
"natwelch.com" = [
"article div.prose"
]
"slashdot.org" = [
"span.story-byline",
"div.p"
]
"www.redox-os.org" = [
"div.content"
]
"www.smbc-comics.com" = [
"img#cc-comic",
"div#aftercomic img"
]

View File

@ -347,6 +347,7 @@ async fn main() -> Result<(), Box<dyn Error>> {
.attach(AdHoc::config::<Config>());
let config: Config = rkt.figment().extract()?;
info!("Config:\n{config:#?}");
if !std::fs::exists(&config.slurp_cache_path)? {
info!("Creating slurp cache @ '{}'", &config.slurp_cache_path);
std::fs::create_dir_all(&config.slurp_cache_path)?;

View File

@ -1,7 +1,23 @@
use serde::Deserialize;
#[derive(Deserialize)]
use std::{collections::HashMap, fmt::Display, str::FromStr};
use scraper::Selector;
use serde::{de, Deserialize, Deserializer};
/// Newtype wrapper around [`scraper::Selector`] that can be deserialized
/// from a plain string holding a CSS selector.
///
/// The wrapped selector is public; access it through `.0`.
#[derive(Debug)]
pub struct DeSelector(pub Selector);
impl<'de> Deserialize<'de> for DeSelector {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
Ok(DeSelector(Selector::parse(&s).map_err(de::Error::custom)?))
}
}
/// Application settings deserialized from the server configuration.
///
/// Field names correspond one-to-one to keys in the configuration file.
#[derive(Debug, Deserialize)]
pub struct Config {
/// Connection URL of the newsreader database
/// (e.g. a `postgres://user@host/db` URL).
pub newsreader_database_url: String,
/// Filesystem path of the newsreader Tantivy database.
pub newsreader_tantivy_db_path: String,
/// Directory used as the slurp cache; it is created at startup when it
/// does not already exist.
pub slurp_cache_path: String,
/// Maps a host pattern to the CSS selectors to extract for links whose
/// host contains that pattern. Selectors are validated while the
/// configuration is being deserialized.
pub slurp_site_selectors: HashMap<String, Vec<DeSelector>>,
}

View File

@ -21,7 +21,10 @@ use thiserror::Error;
use tokio::sync::Mutex;
use url::Url;
use crate::newsreader::{extract_thread_id, is_newsreader_thread};
use crate::{
config::DeSelector,
newsreader::{extract_thread_id, is_newsreader_thread},
};
// TODO: figure out how to use Cow
#[async_trait]
@ -215,13 +218,13 @@ impl Transformer for AddOutlink {
}
}
struct SlurpContents {
struct SlurpContents<'h> {
cacher: Arc<Mutex<FilesystemCacher>>,
site_selectors: HashMap<String, Vec<Selector>>,
site_selectors: &'h HashMap<String, Vec<DeSelector>>,
}
impl SlurpContents {
fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
impl<'h> SlurpContents<'h> {
fn get_selectors(&self, link: &Url) -> Option<&[DeSelector]> {
for (host, selector) in self.site_selectors.iter() {
if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
return Some(&selector);
@ -232,7 +235,7 @@ impl SlurpContents {
}
#[async_trait]
impl Transformer for SlurpContents {
impl<'h> Transformer for SlurpContents<'h> {
fn should_run(&self, link: &Option<Url>, _: &str) -> bool {
if let Some(link) = link {
return self.get_selectors(link).is_some();
@ -259,7 +262,7 @@ impl Transformer for SlurpContents {
let mut results = Vec::new();
for selector in selectors {
for frag in doc.select(&selector) {
for frag in doc.select(&selector.0) {
results.push(frag.html())
// TODO: figure out how to warn if there were no hits
//warn!("couldn't find '{:?}' in {}", selector, link);

View File

@ -181,43 +181,7 @@ pub async fn thread(
let body_tranformers: Vec<Box<dyn Transformer>> = vec![
Box::new(SlurpContents {
cacher,
site_selectors: hashmap![
"atmeta.com".to_string() => vec![
Selector::parse("div.entry-content").unwrap(),
],
"blog.prusa3d.com".to_string() => vec![
Selector::parse("article.content .post-block").unwrap(),
],
"blog.cloudflare.com".to_string() => vec![
Selector::parse(".author-lists .author-name-tooltip").unwrap(),
Selector::parse(".post-full-content").unwrap()
],
"blog.zsa.io".to_string() => vec![
Selector::parse("section.blog-article").unwrap(),
],
"engineering.fb.com".to_string() => vec![
Selector::parse("article").unwrap(),
],
"hackaday.com".to_string() => vec![
Selector::parse("div.entry-featured-image").unwrap(),
Selector::parse("div.entry-content").unwrap()
],
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
"natwelch.com".to_string() => vec![
Selector::parse("article div.prose").unwrap(),
],
"slashdot.org".to_string() => vec![
Selector::parse("span.story-byline").unwrap(),
Selector::parse("div.p").unwrap(),
],
"www.redox-os.org".to_string() => vec![
Selector::parse("div.content").unwrap(),
],
"www.smbc-comics.com".to_string() => vec![
Selector::parse("img#cc-comic").unwrap(),
Selector::parse("div#aftercomic img").unwrap(),
],
],
site_selectors: &config.slurp_site_selectors,
}),
Box::new(FrameImages),
Box::new(AddOutlink),