Compare commits
1 Commit
master
...
slurp-toml
| Author | SHA1 | Date | |
|---|---|---|---|
| 86805f38e3 |
@ -12,3 +12,43 @@ port = 9345
|
||||
newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
|
||||
newsreader_tantivy_db_path = "../target/database/newsreader"
|
||||
slurp_cache_path = "/net/nasx/x/letterbox/slurp"
|
||||
|
||||
# Per-site selectors for the slurp transformer, deserialized into
# `Config::slurp_site_selectors`.
# Each key is compared against the host of a link with a substring match, and
# each value is the list of selector strings whose matching fragments are
# collected from the fetched page.
[debug.slurp_site_selectors]
|
||||
"atmeta.com" = [
|
||||
"div.entry-content"
|
||||
]
|
||||
"blog.prusa3d.com" = [
|
||||
"article.content .post-block"
|
||||
]
|
||||
"blog.cloudflare.com" = [
|
||||
".author-lists .author-name-tooltip",
|
||||
".post-full-content"
|
||||
]
|
||||
"blog.zsa.io" = [
|
||||
"section.blog-article"
|
||||
]
|
||||
"engineering.fb.com" = [
|
||||
"article"
|
||||
]
|
||||
"hackaday.com" = [
|
||||
"div.entry-featured-image",
|
||||
"div.entry-content"
|
||||
]
|
||||
"mitchellh.com" = [
|
||||
"div.w-full"
|
||||
]
|
||||
"natwelch.com" = [
|
||||
"article div.prose"
|
||||
]
|
||||
"slashdot.org" = [
|
||||
"span.story-byline",
|
||||
"div.p"
|
||||
]
|
||||
"www.redox-os.org" = [
|
||||
"div.content"
|
||||
]
|
||||
"www.smbc-comics.com" = [
|
||||
"img#cc-comic",
|
||||
"div#aftercomic img"
|
||||
]
|
||||
|
||||
|
||||
@ -347,6 +347,7 @@ async fn main() -> Result<(), Box<dyn Error>> {
|
||||
.attach(AdHoc::config::<Config>());
|
||||
|
||||
let config: Config = rkt.figment().extract()?;
|
||||
info!("Config:\n{config:#?}");
|
||||
if !std::fs::exists(&config.slurp_cache_path)? {
|
||||
info!("Creating slurp cache @ '{}'", &config.slurp_cache_path);
|
||||
std::fs::create_dir_all(&config.slurp_cache_path)?;
|
||||
|
||||
@ -1,7 +1,23 @@
|
||||
use serde::Deserialize;
|
||||
#[derive(Deserialize)]
|
||||
use std::{collections::HashMap, fmt::Display, str::FromStr};
|
||||
|
||||
use scraper::Selector;
|
||||
use serde::{de, Deserialize, Deserializer};
|
||||
|
||||
/// Newtype around [`scraper::Selector`] that can be deserialized from a
/// configuration string; the wrapped selector is exposed as the public field `.0`.
#[derive(Debug)]
|
||||
pub struct DeSelector(pub Selector);
|
||||
/// Deserializes a [`DeSelector`] from a string value by parsing it with
/// [`Selector::parse`].
///
/// # Errors
///
/// Returns the deserializer's error if the value is not a string, or a custom
/// deserialization error carrying the parse failure if the string is not a
/// valid selector.
impl<'de> Deserialize<'de> for DeSelector {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
// Pull the raw selector text out of the input first.
let s = String::deserialize(deserializer)?;
|
||||
// Parse the text and surface any parse failure as a serde error, so an
// invalid selector is reported during deserialization.
Ok(DeSelector(Selector::parse(&s).map_err(de::Error::custom)?))
|
||||
}
|
||||
}
|
||||
/// Application configuration, extracted from the Rocket figment at startup.
#[derive(Debug, Deserialize)]
|
||||
pub struct Config {
|
||||
// Connection URL for the newsreader database (a `postgres://` URL in the
// sample config).
pub newsreader_database_url: String,
|
||||
// Filesystem path of the newsreader Tantivy database.
pub newsreader_tantivy_db_path: String,
|
||||
// Directory for the slurp cache; created at startup when it does not exist.
pub slurp_cache_path: String,
|
||||
// Host pattern -> selectors. A link matches an entry when its host contains
// the key as a substring.
pub slurp_site_selectors: HashMap<String, Vec<DeSelector>>,
|
||||
}
|
||||
|
||||
@ -21,7 +21,10 @@ use thiserror::Error;
|
||||
use tokio::sync::Mutex;
|
||||
use url::Url;
|
||||
|
||||
use crate::newsreader::{extract_thread_id, is_newsreader_thread};
|
||||
use crate::{
|
||||
config::DeSelector,
|
||||
newsreader::{extract_thread_id, is_newsreader_thread},
|
||||
};
|
||||
|
||||
// TODO: figure out how to use Cow
|
||||
#[async_trait]
|
||||
@ -215,13 +218,13 @@ impl Transformer for AddOutlink {
|
||||
}
|
||||
}
|
||||
|
||||
struct SlurpContents {
|
||||
struct SlurpContents<'h> {
|
||||
cacher: Arc<Mutex<FilesystemCacher>>,
|
||||
site_selectors: HashMap<String, Vec<Selector>>,
|
||||
site_selectors: &'h HashMap<String, Vec<DeSelector>>,
|
||||
}
|
||||
|
||||
impl SlurpContents {
|
||||
fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
|
||||
impl<'h> SlurpContents<'h> {
|
||||
fn get_selectors(&self, link: &Url) -> Option<&[DeSelector]> {
|
||||
for (host, selector) in self.site_selectors.iter() {
|
||||
if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
|
||||
return Some(&selector);
|
||||
@ -232,7 +235,7 @@ impl SlurpContents {
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Transformer for SlurpContents {
|
||||
impl<'h> Transformer for SlurpContents<'h> {
|
||||
fn should_run(&self, link: &Option<Url>, _: &str) -> bool {
|
||||
if let Some(link) = link {
|
||||
return self.get_selectors(link).is_some();
|
||||
@ -259,7 +262,7 @@ impl Transformer for SlurpContents {
|
||||
|
||||
let mut results = Vec::new();
|
||||
for selector in selectors {
|
||||
for frag in doc.select(&selector) {
|
||||
for frag in doc.select(&selector.0) {
|
||||
results.push(frag.html())
|
||||
// TODO: figure out how to warn if there were no hits
|
||||
//warn!("couldn't find '{:?}' in {}", selector, link);
|
||||
|
||||
@ -181,43 +181,7 @@ pub async fn thread(
|
||||
let body_tranformers: Vec<Box<dyn Transformer>> = vec![
|
||||
Box::new(SlurpContents {
|
||||
cacher,
|
||||
site_selectors: hashmap![
|
||||
"atmeta.com".to_string() => vec![
|
||||
Selector::parse("div.entry-content").unwrap(),
|
||||
],
|
||||
"blog.prusa3d.com".to_string() => vec![
|
||||
Selector::parse("article.content .post-block").unwrap(),
|
||||
],
|
||||
"blog.cloudflare.com".to_string() => vec![
|
||||
Selector::parse(".author-lists .author-name-tooltip").unwrap(),
|
||||
Selector::parse(".post-full-content").unwrap()
|
||||
],
|
||||
"blog.zsa.io".to_string() => vec![
|
||||
Selector::parse("section.blog-article").unwrap(),
|
||||
],
|
||||
"engineering.fb.com".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"hackaday.com".to_string() => vec![
|
||||
Selector::parse("div.entry-featured-image").unwrap(),
|
||||
Selector::parse("div.entry-content").unwrap()
|
||||
],
|
||||
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
|
||||
"natwelch.com".to_string() => vec![
|
||||
Selector::parse("article div.prose").unwrap(),
|
||||
],
|
||||
"slashdot.org".to_string() => vec![
|
||||
Selector::parse("span.story-byline").unwrap(),
|
||||
Selector::parse("div.p").unwrap(),
|
||||
],
|
||||
"www.redox-os.org".to_string() => vec![
|
||||
Selector::parse("div.content").unwrap(),
|
||||
],
|
||||
"www.smbc-comics.com".to_string() => vec![
|
||||
Selector::parse("img#cc-comic").unwrap(),
|
||||
Selector::parse("div#aftercomic img").unwrap(),
|
||||
],
|
||||
],
|
||||
site_selectors: &config.slurp_site_selectors,
|
||||
}),
|
||||
Box::new(FrameImages),
|
||||
Box::new(AddOutlink),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user