Compare commits


37 Commits

SHA1 Message Date
86805f38e3 Load slurp config from toml file 2024-09-21 12:52:08 -07:00
62b17bd6a6 Bumping version to 0.0.29 2024-09-20 08:56:58 -07:00
c0bac99d5a server: add slurp config for zsa blog 2024-09-20 08:56:45 -07:00
3b69c5e74b Bumping version to 0.0.28 2024-09-19 17:06:03 -07:00
539fd469cc server: create index when missing 2024-09-19 17:05:47 -07:00
442688c35c web: lint 2024-09-19 16:54:18 -07:00
da27f02237 Bumping version to 0.0.27 2024-09-19 16:52:35 -07:00
9460e354b7 server: cargo sqlx prepare 2024-09-19 16:52:26 -07:00
6bab128ed9 Bumping version to 0.0.26 2024-09-19 16:33:50 -07:00
3856b4ca5a server: try different cacher url 2024-09-19 16:33:40 -07:00
bef39eefa5 Bumping version to 0.0.25 2024-09-19 16:08:20 -07:00
b0366c7b4d server: try non-https to see if that works 2024-09-19 16:07:59 -07:00
ca02d84d63 Bumping version to 0.0.24 2024-09-19 16:01:55 -07:00
461d5de886 server: change internal git url 2024-09-19 16:01:41 -07:00
f8134dad7a Bumping version to 0.0.23 2024-09-19 15:53:56 -07:00
30f510bb03 server: WIP tantivy, cache slurps, use shared::compute_color, 2024-09-19 15:53:09 -07:00
e7cbf9cc45 shared: remove debug logging 2024-09-19 13:54:47 -07:00
5108213af5 web: use shared compute_color 2024-09-19 13:49:24 -07:00
d148f625ac shared: add compute_color 2024-09-19 13:48:56 -07:00
a9b8f5a88f Bumping version to 0.0.22 2024-09-16 20:00:16 -07:00
539b584d9b web: fix broken build 2024-09-16 20:00:06 -07:00
2f8d83fc4b Bumping version to 0.0.21 2024-09-16 19:52:28 -07:00
86ee1257fa web: better progress bar 2024-09-16 19:52:20 -07:00
03f1035e0e Bumping version to 0.0.20 2024-09-12 22:38:18 -07:00
bd578191a8 web: add scroll to top button and squelch some debug logging 2024-09-12 22:37:58 -07:00
d4fc2e2ef1 Bumping version to 0.0.19 2024-09-12 15:41:01 -07:00
cde30de81c web: explicitly set progress to zero when not in thread/news view 2024-09-12 15:40:42 -07:00
96be74e3ee Bumping version to 0.0.18 2024-09-12 15:32:30 -07:00
b78d34b27e web: disable bulma styling for .number 2024-09-12 15:32:18 -07:00
b4b64c33a6 Bumping version to 0.0.17 2024-09-12 10:07:00 -07:00
47b1875022 server: tweak cloudflare and prusa slurp config 2024-09-12 10:06:46 -07:00
b06cbd1381 Bumping version to 0.0.16 2024-09-12 10:03:26 -07:00
9e35f8ca6c web: fix <em> looking like a button 2024-09-12 10:01:58 -07:00
8eaefde67d Bumping version to 0.0.15 2024-09-12 09:28:14 -07:00
d5a3324837 server: slurp config for prusa blog and squelch some info logging 2024-09-12 09:27:57 -07:00
f5c90d8770 Bumping version to 0.0.14 2024-09-11 11:46:04 -07:00
825a125a62 web: redox specific styling 2024-09-11 11:45:53 -07:00
19 changed files with 1572 additions and 258 deletions

Cargo.lock (generated, 1263 lines changed)

File diff suppressed because it is too large.

View File

@@ -1,6 +1,6 @@
[package]
name = "notmuch"
version = "0.0.13"
version = "0.0.29"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -1,6 +1,6 @@
[package]
name = "procmail2notmuch"
version = "0.0.13"
version = "0.0.29"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -0,0 +1,62 @@
{
"db_name": "PostgreSQL",
"query": "SELECT\n site,\n title,\n summary,\n link,\n date,\n is_read,\n uid,\n id\nFROM post\n",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "site",
"type_info": "Text"
},
{
"ordinal": 1,
"name": "title",
"type_info": "Text"
},
{
"ordinal": 2,
"name": "summary",
"type_info": "Text"
},
{
"ordinal": 3,
"name": "link",
"type_info": "Text"
},
{
"ordinal": 4,
"name": "date",
"type_info": "Timestamp"
},
{
"ordinal": 5,
"name": "is_read",
"type_info": "Bool"
},
{
"ordinal": 6,
"name": "uid",
"type_info": "Text"
},
{
"ordinal": 7,
"name": "id",
"type_info": "Int4"
}
],
"parameters": {
"Left": []
},
"nullable": [
true,
true,
true,
true,
true,
true,
false,
false
]
},
"hash": "1b2244c9b9b64a1395d8d266f5df5352242bbe5efe481b0852e1c1d4b40584a7"
}
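
The JSON above is the offline query metadata that `cargo sqlx prepare` writes (the "server: cargo sqlx prepare" commit in the list); it lets the sqlx macros type-check `sql/all-posts.sql` (added below) without a live database. A minimal sketch, assuming `SQLX_OFFLINE=true` and the server crate's `PgPool`:

async fn all_posts(pool: &sqlx::PgPool) -> Result<(), sqlx::Error> {
    // query_file! compiles against the checked-in .sqlx metadata above, so
    // column names, types, and nullability are verified at build time.
    let rows = sqlx::query_file!("sql/all-posts.sql").fetch_all(pool).await?;
    println!("loaded {} posts", rows.len());
    Ok(())
}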

View File

@@ -1,6 +1,6 @@
[package]
name = "server"
version = "0.0.13"
version = "0.0.29"
edition = "2021"
default-run = "server"
@@ -13,6 +13,7 @@ async-graphql = { version = "6.0.11", features = ["log"] }
async-graphql-rocket = "6.0.11"
async-trait = "0.1.81"
build-info = "0.0.38"
cacher = {git = "http://git-private.h.xinu.tv/wathiede/cacher.git"}
css-inline = "0.13.0"
glog = "0.1.0"
html-escape = "0.2.13"

View File

@@ -1,6 +1,8 @@
[release]
address = "0.0.0.0"
port = 9345
newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
newsreader_tantivy_db_path = "../target/database/newsreader"
[debug]
address = "0.0.0.0"
@@ -8,3 +10,45 @@ port = 9345
# Uncomment to make it production like.
#log_level = "critical"
newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
newsreader_tantivy_db_path = "../target/database/newsreader"
slurp_cache_path = "/net/nasx/x/letterbox/slurp"
[debug.slurp_site_selectors]
"atmeta.com" = [
"div.entry-content"
]
"blog.prusa3d.com" = [
"article.content .post-block"
]
"blog.cloudflare.com" = [
".author-lists .author-name-tooltip",
".post-full-content"
]
"blog.zsa.io" = [
"section.blog-article"
]
"engineering.fb.com" = [
"article"
]
"hackaday.com" = [
"div.entry-featured-image",
"div.entry-content"
]
"mitchellh.com" = [
"div.w-full"
]
"natwelch.com" = [
"article div.prose"
]
"slashdot.org" = [
"span.story-byline",
"div.p"
]
"www.redox-os.org" = [
"div.content"
]
"www.smbc-comics.com" = [
"img#cc-comic",
"div#aftercomic img"
]
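
Every entry in these tables is a raw CSS selector string, validated at config-load time by `scraper` (see the `DeSelector` wrapper in `server/src/config.rs` below). A minimal sketch, assuming the `scraper` crate, of what a single entry does to a fetched page:

use scraper::{Html, Selector};

fn slurp_demo() {
    // Hypothetical page body; the "hackaday.com" entry keeps only the post content.
    let sel = Selector::parse("div.entry-content").expect("valid CSS selector");
    let doc = Html::parse_document(r#"<div class="entry-content"><p>hi</p></div>"#);
    assert!(doc.select(&sel).next().is_some());
}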

server/sql/all-posts.sql (new file, 10 lines)
View File

@@ -0,0 +1,10 @@
SELECT
site,
title,
summary,
link,
date,
is_read,
uid,
id
FROM post

View File

@@ -18,18 +18,14 @@ use rocket::{
Response, State,
};
use rocket_cors::{AllowedHeaders, AllowedOrigins};
use serde::Deserialize;
use server::{
config::Config,
error::ServerError,
graphql::{Attachment, GraphqlSchema, Mutation, QueryRoot},
nm::{attachment_bytes, cid_attachment_bytes},
};
use sqlx::postgres::PgPool;
#[derive(Deserialize)]
struct Config {
newsreader_database_url: String,
}
use tantivy::{Index, IndexWriter};
#[get("/refresh")]
async fn refresh(nm: &State<Notmuch>) -> Result<Json<String>, Debug<NotmuchError>> {
@@ -170,6 +166,126 @@ fn graphiql() -> content::RawHtml<String> {
content::RawHtml(GraphiQLSource::build().endpoint("/api/graphql").finish())
}
#[rocket::post("/create-news-db")]
fn create_news_db(config: &State<Config>) -> Result<String, Debug<ServerError>> {
create_news_db_impl(config)?;
Ok(format!(
"DB created in {}\n",
config.newsreader_tantivy_db_path
))
}
fn create_news_db_impl(config: &Config) -> Result<(), ServerError> {
std::fs::remove_dir_all(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
std::fs::create_dir_all(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
use tantivy::schema::*;
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("site", STRING | STORED);
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("summary", TEXT);
schema_builder.add_text_field("link", STRING | STORED);
schema_builder.add_date_field("date", FAST);
schema_builder.add_bool_field("is_read", FAST);
schema_builder.add_text_field("uid", STRING | STORED);
schema_builder.add_i64_field("id", FAST);
let schema = schema_builder.build();
Index::create_in_dir(&config.newsreader_tantivy_db_path, schema).map_err(ServerError::from)?;
Ok(())
}
#[rocket::post("/reindex-news-db")]
async fn reindex_news_db(
pool: &State<PgPool>,
config: &State<Config>,
) -> Result<String, Debug<ServerError>> {
use tantivy::{doc, Term};
let start_time = std::time::Instant::now();
let pool: &PgPool = pool;
let index =
Index::open_in_dir(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
let mut index_writer = index.writer(50_000_000).map_err(ServerError::from)?;
let schema = index.schema();
let site = schema.get_field("site").map_err(ServerError::from)?;
let title = schema.get_field("title").map_err(ServerError::from)?;
let summary = schema.get_field("summary").map_err(ServerError::from)?;
let link = schema.get_field("link").map_err(ServerError::from)?;
let date = schema.get_field("date").map_err(ServerError::from)?;
let is_read = schema.get_field("is_read").map_err(ServerError::from)?;
let uid = schema.get_field("uid").map_err(ServerError::from)?;
let id = schema.get_field("id").map_err(ServerError::from)?;
let rows = sqlx::query_file!("sql/all-posts.sql")
.fetch_all(pool)
.await
.map_err(ServerError::from)?;
let total = rows.len();
for (i, r) in rows.into_iter().enumerate() {
if i % 10_000 == 0 {
info!(
"{i}/{total} processed, elapsed {:.2}s",
start_time.elapsed().as_secs_f32()
);
}
let id_term = Term::from_field_text(uid, &r.uid);
index_writer.delete_term(id_term);
index_writer
.add_document(doc!(
site => r.site.expect("UNKNOWN_SITE"),
title => r.title.expect("UNKNOWN_TITLE"),
// TODO: clean and extract text from HTML
summary => r.summary.expect("UNKNOWN_SUMMARY"),
link => r.link.expect("link"),
date => tantivy::DateTime::from_primitive(r.date.expect("date")),
is_read => r.is_read.expect("is_read"),
uid => r.uid,
id => r.id as i64,
))
.map_err(ServerError::from)?;
}
index_writer.commit().map_err(ServerError::from)?;
info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
Ok(format!(
"DB openned in {}\n",
config.newsreader_tantivy_db_path
))
}
#[rocket::get("/search-news-db")]
fn search_news_db(
index: &State<tantivy::Index>,
reader: &State<tantivy::IndexReader>,
) -> Result<String, Debug<ServerError>> {
use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
let searcher = reader.searcher();
let schema = index.schema();
let site = schema.get_field("site").map_err(ServerError::from)?;
let title = schema.get_field("title").map_err(ServerError::from)?;
let summary = schema.get_field("summary").map_err(ServerError::from)?;
let query_parser = QueryParser::for_index(&index, vec![site, title, summary]);
let query = query_parser
.parse_query("grapheme")
.map_err(ServerError::from)?;
let top_docs = searcher
.search(&query, &TopDocs::with_limit(10))
.map_err(ServerError::from)?;
let mut results = vec![];
info!("search found {} docs", top_docs.len());
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument =
searcher.doc(doc_address).map_err(ServerError::from)?;
results.push(format!("{}", retrieved_doc.to_json(&schema)));
}
Ok(format!("{}", results.join(" ")))
}
#[rocket::get("/graphql?<query..>")]
async fn graphql_query(schema: &State<GraphqlSchema>, query: GraphQLQuery) -> GraphQLResponse {
query.execute(schema.inner()).await
@@ -183,7 +299,6 @@ async fn graphql_request(
request.execute(schema.inner()).await
}
#[rocket::main]
async fn main() -> Result<(), Box<dyn Error>> {
glog::new()
@@ -213,6 +328,9 @@ async fn main() -> Result<(), Box<dyn Error>> {
.mount(
shared::urls::MOUNT_POINT,
routes![
create_news_db,
reindex_news_db,
search_news_db,
original,
refresh,
show_pretty,
@@ -229,14 +347,33 @@ async fn main() -> Result<(), Box<dyn Error>> {
.attach(AdHoc::config::<Config>());
let config: Config = rkt.figment().extract()?;
info!("Config:\n{config:#?}");
if !std::fs::exists(&config.slurp_cache_path)? {
info!("Creating slurp cache @ '{}'", &config.slurp_cache_path);
std::fs::create_dir_all(&config.slurp_cache_path)?;
}
let pool = PgPool::connect(&config.newsreader_database_url).await?;
let tantivy_newsreader_index = match Index::open_in_dir(&config.newsreader_tantivy_db_path) {
Ok(idx) => idx,
Err(_) => {
create_news_db_impl(&config)?;
Index::open_in_dir(&config.newsreader_tantivy_db_path)?
}
};
let tantivy_newsreader_reader = tantivy_newsreader_index.reader()?;
let schema = Schema::build(QueryRoot, Mutation, EmptySubscription)
.data(Notmuch::default())
.data(config)
.data(pool.clone())
.extension(async_graphql::extensions::Logger)
.finish();
let rkt = rkt.manage(schema).manage(pool).manage(Notmuch::default());
let rkt = rkt
.manage(schema)
.manage(pool)
.manage(Notmuch::default())
.manage(tantivy_newsreader_index)
.manage(tantivy_newsreader_reader);
//.manage(Notmuch::with_config("../notmuch/testdata/notmuch.config"))
rkt.launch().await?;
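
The field flags in create_news_db_impl above control matching: STRING indexes a value as a single exact term (good for the uid/site lookups), TEXT tokenizes for full-text search, STORED keeps the value retrievable, and FAST builds a columnar field for filtering. A minimal in-RAM sketch, assuming tantivy's 0.22-era API:

use tantivy::{
    collector::Count,
    doc,
    query::TermQuery,
    schema::{IndexRecordOption, Schema, STORED, STRING, TEXT},
    Index, Term,
};

fn flags_demo() -> tantivy::Result<()> {
    let mut b = Schema::builder();
    let site = b.add_text_field("site", STRING | STORED);
    let title = b.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(b.build());
    let mut w = index.writer(50_000_000)?;
    w.add_document(doc!(site => "hackaday.com", title => "a grapheme deep dive"))?;
    w.commit()?;
    let searcher = index.reader()?.searcher();
    // STRING: the whole value is one term, so an exact match is required.
    let q = TermQuery::new(
        Term::from_field_text(site, "hackaday.com"),
        IndexRecordOption::Basic,
    );
    assert_eq!(searcher.search(&q, &Count)?, 1);
    Ok(())
}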

server/src/config.rs (new file, 23 lines)
View File

@@ -0,0 +1,23 @@
use std::{collections::HashMap, fmt::Display, str::FromStr};
use scraper::Selector;
use serde::{de, Deserialize, Deserializer};
#[derive(Debug)]
pub struct DeSelector(pub Selector);
impl<'de> Deserialize<'de> for DeSelector {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
Ok(DeSelector(Selector::parse(&s).map_err(de::Error::custom)?))
}
}
#[derive(Debug, Deserialize)]
pub struct Config {
pub newsreader_database_url: String,
pub newsreader_tantivy_db_path: String,
pub slurp_cache_path: String,
pub slurp_site_selectors: HashMap<String, Vec<DeSelector>>,
}
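
At runtime this struct is populated through Rocket's figment (`AdHoc::config::<Config>()` in main above), but the same serde path works for any TOML source. A minimal sketch, assuming the `toml` crate, of how `DeSelector` turns bad selectors into load-time errors:

use std::collections::HashMap;

use server::config::DeSelector;

fn selectors_demo() {
    // A valid selector list deserializes into ready-to-use scraper::Selectors.
    let ok: Result<HashMap<String, Vec<DeSelector>>, _> =
        toml::from_str(r#""slashdot.org" = ["span.story-byline", "div.p"]"#);
    assert!(ok.is_ok());
    // An invalid selector fails here instead of panicking at slurp time.
    let bad: Result<HashMap<String, Vec<DeSelector>>, _> =
        toml::from_str(r#""example.com" = ["<<not-css>>"]"#);
    assert!(bad.is_err());
}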

View File

@@ -1,6 +1,8 @@
use std::{convert::Infallible, str::Utf8Error, string::FromUtf8Error};
use mailparse::MailParseError;
use tantivy::TantivyError;
use tantivy::query::QueryParserError;
use thiserror::Error;
use crate::TransformError;
@@ -29,6 +31,10 @@ pub enum ServerError {
StringError(String),
#[error("invalid url: {0}")]
UrlParseError(#[from] url::ParseError),
#[error("tantivy error: {0}")]
TantivyError(#[from] TantivyError),
#[error("tantivy query parse error: {0}")]
QueryParseError(#[from] QueryParserError),
#[error("impossible: {0}")]
InfaillibleError(#[from] Infallible),
}

View File

@@ -8,7 +8,7 @@ use notmuch::Notmuch;
use serde::{Deserialize, Serialize};
use sqlx::postgres::PgPool;
use crate::{newsreader, nm, Query};
use crate::{config::Config, newsreader, nm, Query};
/// # Number of seconds since the Epoch
pub type UnixTime = isize;
@@ -384,6 +384,7 @@ impl QueryRoot {
async fn thread<'ctx>(&self, ctx: &Context<'ctx>, thread_id: String) -> Result<Thread, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
let config = ctx.data_unchecked::<Config>();
let debug_content_tree = ctx
.look_ahead()
.field("messages")
@@ -392,7 +393,7 @@ impl QueryRoot {
.exists();
// TODO: look at thread_id and conditionally load newsreader
if newsreader::is_newsreader_thread(&thread_id) {
Ok(newsreader::thread(pool, thread_id).await?)
Ok(newsreader::thread(config, pool, thread_id).await?)
} else {
Ok(nm::thread(nm, thread_id, debug_content_tree).await?)
}

View File

@@ -1,11 +1,13 @@
pub mod config;
pub mod error;
pub mod graphql;
pub mod newsreader;
pub mod nm;
use std::{collections::HashMap, convert::Infallible, str::FromStr};
use std::{collections::HashMap, convert::Infallible, str::FromStr, sync::Arc};
use async_trait::async_trait;
use cacher::{Cacher, FilesystemCacher};
use css_inline::{CSSInliner, InlineError, InlineOptions};
use linkify::{LinkFinder, LinkKind};
use log::{error, info, warn};
@@ -16,9 +18,13 @@ use lol_html::{
use maplit::{hashmap, hashset};
use scraper::{Html, Selector};
use thiserror::Error;
use tokio::sync::Mutex;
use url::Url;
use crate::newsreader::{extract_thread_id, is_newsreader_thread};
use crate::{
config::DeSelector,
newsreader::{extract_thread_id, is_newsreader_thread},
};
// TODO: figure out how to use Cow
#[async_trait]
@@ -109,16 +115,17 @@ impl Transformer for InlineStyle {
include_str!("custom.css"),
);
let inline_opts = InlineOptions {
inline_style_tags: false,
inline_style_tags: true,
keep_style_tags: false,
keep_link_tags: false,
keep_link_tags: true,
base_url: None,
load_remote_stylesheets: false,
load_remote_stylesheets: true,
extra_css: Some(css.into()),
preallocate_node_capacity: 32,
..InlineOptions::default()
};
//info!("HTML:\n{html}");
Ok(match CSSInliner::new(inline_opts).inline(&html) {
Ok(inlined_html) => inlined_html,
Err(err) => {
@@ -141,7 +148,6 @@ impl Transformer for FrameImages {
RewriteStrSettings {
element_content_handlers: vec![
element!("img[data-src]", |el| {
info!("found image with data-src {el:?}");
let src = el
.get_attribute("data-src")
.unwrap_or("https://placehold.co/600x400".to_string());
@@ -150,7 +156,6 @@ impl Transformer for FrameImages {
Ok(())
}),
element!("img[data-cfsrc]", |el| {
info!("found image with data-cfsrc {el:?}");
let src = el
.get_attribute("data-cfsrc")
.unwrap_or("https://placehold.co/600x400".to_string());
@@ -159,7 +164,6 @@ impl Transformer for FrameImages {
Ok(())
}),
element!("img[alt], img[title]", |el| {
info!("found image with alt or title {el:?}");
let src = el
.get_attribute("src")
.unwrap_or("https://placehold.co/600x400".to_string());
@@ -214,12 +218,13 @@ impl Transformer for AddOutlink {
}
}
struct SlurpContents {
site_selectors: HashMap<String, Vec<Selector>>,
struct SlurpContents<'h> {
cacher: Arc<Mutex<FilesystemCacher>>,
site_selectors: &'h HashMap<String, Vec<DeSelector>>,
}
impl SlurpContents {
fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
impl<'h> SlurpContents<'h> {
fn get_selectors(&self, link: &Url) -> Option<&[DeSelector]> {
for (host, selector) in self.site_selectors.iter() {
if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
return Some(&selector);
@@ -230,7 +235,7 @@ impl SlurpContents {
}
#[async_trait]
impl Transformer for SlurpContents {
impl<'h> Transformer for SlurpContents<'h> {
fn should_run(&self, link: &Option<Url>, _: &str) -> bool {
if let Some(link) = link {
return self.get_selectors(link).is_some();
@@ -244,19 +249,26 @@ impl Transformer for SlurpContents {
let Some(selectors) = self.get_selectors(&link) else {
return Ok(html.to_string());
};
let body = reqwest::get(link.as_str()).await?.text().await?;
let mut cacher = self.cacher.lock().await;
let body = if let Some(body) = cacher.get(link.as_str()) {
info!("cache hit for {link}");
String::from_utf8_lossy(&body).to_string()
} else {
let body = reqwest::get(link.as_str()).await?.text().await?;
cacher.set(link.as_str(), body.as_bytes());
body
};
let doc = Html::parse_document(&body);
let mut results = Vec::new();
for selector in selectors {
if let Some(frag) = doc.select(&selector).next() {
for frag in doc.select(&selector.0) {
results.push(frag.html())
} else {
warn!("couldn't find '{:?}' in {}", selector, link);
return Ok(html.to_string());
// TODO: figure out how to warn if there were no hits
//warn!("couldn't find '{:?}' in {}", selector, link);
}
}
Ok(results.join("<br><br>"))
Ok(results.join(""))
}
}
@@ -295,7 +307,7 @@ pub fn sanitize_html(
) -> Result<String, TransformError> {
let inline_opts = InlineOptions {
inline_style_tags: true,
keep_style_tags: false,
keep_style_tags: true,
keep_link_tags: false,
base_url: None,
load_remote_stylesheets: false,
@@ -338,6 +350,30 @@ pub fn sanitize_html(
el.set_attribute("src", &src)?;
Ok(())
}),
// Add https to href with //<domain name>
element!("link[href]", |el| {
info!("found link[href] {el:?}");
let mut href = el.get_attribute("href").expect("href was required");
if href.starts_with("//") {
warn!("adding https to {href}");
href.insert_str(0, "https:");
}
el.set_attribute("href", &href)?;
Ok(())
}),
// Add https to src with //<domain name>
element!("style[src]", |el| {
let mut src = el.get_attribute("src").expect("src was required");
if src.starts_with("//") {
src.insert_str(0, "https:");
}
el.set_attribute("src", &src)?;
Ok(())
}),
];
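
get_selectors above does a substring match on the link's host, so one config entry covers subdomains as well. A small illustration, assuming the `url` crate already in use here:

use url::Url;

fn host_match_demo() {
    let link = Url::parse("https://www.hackaday.com/2024/09/some-post/").unwrap();
    // "www.hackaday.com" contains "hackaday.com", so its selectors apply.
    let hit = link
        .host_str()
        .map(|h| h.contains("hackaday.com"))
        .unwrap_or(false);
    assert!(hit);
}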

View File

@@ -1,24 +1,26 @@
use std::hash::{DefaultHasher, Hash, Hasher};
use std::sync::Arc;
use cacher::FilesystemCacher;
use log::info;
use maplit::hashmap;
use scraper::Selector;
use shared::compute_color;
use sqlx::postgres::PgPool;
use tokio::sync::Mutex;
use url::Url;
use crate::Query;
const TAG_PREFIX: &'static str = "News/";
const THREAD_PREFIX: &'static str = "news:";
use crate::{
compute_offset_limit,
config::Config,
error::ServerError,
graphql::{NewsPost, Tag, Thread, ThreadSummary},
AddOutlink, EscapeHtml, FrameImages, InlineStyle, SanitizeHtml, SlurpContents, StripHtml,
Transformer,
AddOutlink, EscapeHtml, FrameImages, InlineStyle, Query, SanitizeHtml, SlurpContents,
StripHtml, Transformer,
};
const TAG_PREFIX: &'static str = "News/";
const THREAD_PREFIX: &'static str = "news:";
pub fn is_newsreader_search(query: &str) -> bool {
query.contains(TAG_PREFIX)
}
@@ -128,11 +130,9 @@ pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, Server
let tags = tags
.into_iter()
.map(|tag| {
let mut hasher = DefaultHasher::new();
tag.site.hash(&mut hasher);
let hex = format!("#{:06x}", hasher.finish() % (1 << 24));
let unread = tag.unread.unwrap_or(0).try_into().unwrap_or(0);
let name = format!("{TAG_PREFIX}{}", tag.site.expect("tag must have site"));
let hex = compute_color(&name);
Tag {
name,
fg_color: "white".to_string(),
@@ -144,7 +144,11 @@ pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, Server
Ok(tags)
}
pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerError> {
pub async fn thread(
config: &Config,
pool: &PgPool,
thread_id: String,
) -> Result<Thread, ServerError> {
let id = thread_id
.strip_prefix(THREAD_PREFIX)
.expect("news thread doesn't start with '{THREAD_PREFIX}'")
@@ -173,48 +177,20 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
// TODO: add site specific cleanups. For example:
// * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
// * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
let cacher = Arc::new(Mutex::new(FilesystemCacher::new(&config.slurp_cache_path)?));
let body_transformers: Vec<Box<dyn Transformer>> = vec![
Box::new(SlurpContents {
site_selectors: hashmap![
"atmeta.com".to_string() => vec![
Selector::parse("div.entry-content").unwrap(),
],
"blog.cloudflare.com".to_string() => vec![
Selector::parse(".author-lists").unwrap(),
Selector::parse(".post-full-content").unwrap()
],
"engineering.fb.com".to_string() => vec![
Selector::parse("article").unwrap(),
],
"hackaday.com".to_string() => vec![
Selector::parse("div.entry-featured-image").unwrap(),
Selector::parse("div.entry-content").unwrap()
],
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
"natwelch.com".to_string() => vec![
Selector::parse("article div.prose").unwrap(),
],
"slashdot.org".to_string() => vec![
Selector::parse("span.story-byline").unwrap(),
Selector::parse("div.p").unwrap(),
],
"www.redox-os.org".to_string() => vec![
Selector::parse("div.content").unwrap(),
],
"www.smbc-comics.com".to_string() => vec![
Selector::parse("img#cc-comic").unwrap(),
Selector::parse("div#aftercomic img").unwrap(),
],
],
cacher,
site_selectors: &config.slurp_site_selectors,
}),
Box::new(FrameImages),
Box::new(AddOutlink),
Box::new(EscapeHtml),
Box::new(InlineStyle),
Box::new(SanitizeHtml {
cid_prefix: "",
base_url: &link,
}),
Box::new(InlineStyle),
];
for t in body_transformers.iter() {
if t.should_run(&link, &body) {

View File

@@ -1,6 +1,6 @@
[package]
name = "shared"
version = "0.0.13"
version = "0.0.29"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -1,3 +1,5 @@
use std::hash::{DefaultHasher, Hash, Hasher};
use build_info::{BuildInfo, VersionControl};
use notmuch::SearchSummary;
use serde::{Deserialize, Serialize};
@@ -49,3 +51,8 @@ pub fn build_version(bi: fn() -> &'static BuildInfo) -> String {
format!("v{}{}", bi.crate_info.version, commit(&bi.version_control)).to_string()
}
pub fn compute_color(data: &str) -> String {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
format!("#{:06x}", hasher.finish() % (1 << 24))
}
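
compute_color is the piece now shared by the server's tag colors and the web chiclets: it hashes a name down to a 24-bit hex color. A small usage sketch; note that std's DefaultHasher is deterministic within one build, but its algorithm is not guaranteed across Rust releases, so colors can shift after a toolchain upgrade:

fn color_demo() {
    let a = compute_color("News/hackaday.com");
    // Same input, same color, for the life of the binary.
    assert_eq!(a, compute_color("News/hackaday.com"));
    // Always "#" plus six hex digits, since the hash is reduced mod 2^24.
    assert_eq!(a.len(), 7);
    assert!(a.starts_with('#'));
}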

View File

@@ -1,5 +1,5 @@
[package]
version = "0.0.13"
version = "0.0.29"
name = "letterbox"
repository = "https://github.com/seed-rs/seed-quickstart"
authors = ["Bill Thiede <git@xinu.tv>"]

View File

@@ -1,7 +1,7 @@
use std::collections::HashSet;
use graphql_client::GraphQLQuery;
use log::{error, info, warn};
use log::{debug, error, info, warn};
use seed::{prelude::*, *};
use thiserror::Error;
use web_sys::HtmlElement;
@@ -332,6 +332,8 @@ pub fn update(msg: Msg, model: &mut Model, orders: &mut impl Orders<Msg>) {
selected_threads,
};
orders.send_msg(Msg::UpdateServerVersion(data.version));
// Generate signal so progress bar is reset
orders.send_msg(Msg::WindowScrolled);
}
Msg::ShowThreadRequest { thread_id } => {
@@ -382,6 +384,8 @@ pub fn update(msg: Msg, model: &mut Model, orders: &mut impl Orders<Msg>) {
}
}
orders.send_msg(Msg::UpdateServerVersion(data.version));
// Generate signal so progress bar is reset
orders.send_msg(Msg::WindowScrolled);
}
Msg::ShowThreadResult(bad) => {
error!("show_thread_query error: {bad:#?}");
@@ -511,16 +515,31 @@ pub fn update(msg: Msg, model: &mut Model, orders: &mut impl Orders<Msg>) {
.value_of();
let r = el.get_bounding_client_rect();
info!(
"window scrolled {}x{}@{},{}",
if r.height() < ih {
// The whole content fits in the window, no scrollbar
orders.send_msg(Msg::SetProgress(0.));
return;
}
let end: f64 = r.height() - ih;
if end < 0. {
orders.send_msg(Msg::SetProgress(0.));
return;
}
// Flip Y, normally it's 0-point when the top of the content hits the top of the
// screen and goes negative from there.
let y = -r.y();
let ratio: f64 = (y / end).max(0.);
debug!(
"WindowScrolled ih {ih} end {end} ratio {ratio:.02} {}x{} @ {},{}",
r.width(),
r.height(),
r.x(),
r.y(),
r.y()
);
let end = r.height() - ih;
let y = -r.y();
orders.send_msg(Msg::SetProgress((y / end).max(0.)));
orders.send_msg(Msg::SetProgress(ratio));
} else {
orders.send_msg(Msg::SetProgress(0.));
}
}
Msg::SetProgress(ratio) => {
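
Worked example of the progress math above: 3000px-tall content in a 1000px viewport, scrolled so the content's top sits at y = -1000, gives ratio = 1000 / (3000 - 1000) = 0.5, i.e. halfway through the scrollable range. The same arithmetic as a standalone sketch:

fn progress_demo() {
    let (height, ih, y) = (3000.0_f64, 1000.0_f64, 1000.0_f64); // y is -r.y()
    let end = height - ih; // total scrollable distance
    let ratio: f64 = (y / end).max(0.0);
    assert!((ratio - 0.5).abs() < f64::EPSILON);
}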

View File

@@ -1,7 +1,4 @@
use std::{
collections::{hash_map::DefaultHasher, HashSet},
hash::{Hash, Hasher},
};
use std::collections::HashSet;
use chrono::{DateTime, Datelike, Duration, Local, Utc};
use human_format::{Formatter, Scales};
@@ -9,6 +6,7 @@ use itertools::Itertools;
use log::{debug, error, info};
use seed::{prelude::*, *};
use seed_hooks::{state_access::CloneState, topo, use_state};
use shared::compute_color;
use web_sys::HtmlElement;
use crate::{
@@ -29,12 +27,6 @@ fn set_title(title: &str) {
seed::document().set_title(&format!("lb: {}", title));
}
fn compute_color(data: &str) -> String {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
format!("#{:06x}", hasher.finish() % (1 << 24))
}
fn tags_chiclet(tags: &[String], is_mobile: bool) -> impl Iterator<Item = Node<Msg>> + '_ {
tags.iter().map(move |tag| {
let hex = compute_color(tag);
@@ -869,12 +861,13 @@ fn thread(
],
],
],
div![el_ref(content_el), messages] /* TODO(wathiede): plumb in original id
a![
attrs! {At::Href=>api::original(&thread_node.0.as_ref().expect("message missing").id)},
"Original"
],
*/
div![el_ref(content_el), messages, click_to_top()],
/* TODO(wathiede): plumb in original id
a![
attrs! {At::Href=>api::original(&thread_node.0.as_ref().expect("message missing").id)},
"Original"
],
*/
]
}
@@ -1129,7 +1122,7 @@ fn news_post(
"Original"
],
*/
ev(Ev::Scroll, |e| info!("scroll event {e:?}"))
click_to_top(),
]
}
fn render_news_post_header(post: &ShowThreadQueryThreadOnNewsPost) -> Node<Msg> {
@@ -1222,3 +1215,14 @@ pub fn versions(versions: &crate::state::Version) -> Node<Msg> {
])
]
}
fn click_to_top() -> Node<Msg> {
button![
C!["button", "is-danger", "is-small"],
span!["Top"],
span![C!["icon"], i![C!["fas", "fa-arrow-turn-up"]]],
ev(Ev::Click, move |_| web_sys::window()
.unwrap()
.scroll_to_with_x_and_y(0., 0.))
]
}

View File

@@ -2,6 +2,28 @@
color: var(--color-text) !important;
}
.body.news-post em {
border: 0 !important;
font-style: italic;
margin: inherit !important;
padding: inherit !important;
}
.body.news-post .number {
align-items: inherit;
background-color: inherit;
border-radius: inherit;
display: inherit;
font-size: inherit;
height: inherit;
justify-content: inherit;
margin-right: inherit;
min-width: inherit;
padding: inherit;
text-align: inherit;
vertical-align: inherit;
}
.body.news-post.site-saturday-morning-breakfast-cereal {
display: flex;
align-items: center;
@@ -18,13 +40,12 @@
padding-left: 1em;
}
.body.news-post em {
margin: inherit !important;
padding: inherit !important;
font-weight: inherit !important;
border: inherit !important;
display: inline !important;
color: inherit !important;
.body.news-post.site-news-on-redox-your-next-gen-os h1,
.body.news-post.site-news-on-redox-your-next-gen-os h2,
.body.news-post.site-news-on-redox-your-next-gen-os h3,
.body.news-post.site-news-on-redox-your-next-gen-os h4,
.body.news-post.site-news-on-redox-your-next-gen-os h5 {
color: var(--color-text) !important;
}
.body.mail code,