From d1cfc77148381476fa5d10bd2871223ee889403e Mon Sep 17 00:00:00 2001
From: Bill Thiede <git@xinu.tv>
Date: Mon, 12 Aug 2024 20:53:48 -0700
Subject: [PATCH] server: more news title/body cleanup, and don't search news
 so much

---
 server/src/graphql.rs    |  66 ++++++++++++--------
 server/src/lib.rs        | 118 +++++++++++++++++++++++++++++++++-
 server/src/newsreader.rs | 132 +++++++++++++++++++--------------------
 3 files changed, 218 insertions(+), 98 deletions(-)
diff --git a/server/src/graphql.rs b/server/src/graphql.rs
index ea7b0d9..8f89093 100644
--- a/server/src/graphql.rs
+++ b/server/src/graphql.rs
@@ -8,7 +8,7 @@ use notmuch::Notmuch;
 use serde::{Deserialize, Serialize};
 use sqlx::postgres::PgPool;
 
-use crate::{newsreader, nm};
+use crate::{newsreader, nm, Query};
 
 /// # Number of seconds since the Epoch
 pub type UnixTime = isize;
@@ -215,7 +215,7 @@ impl QueryRoot {
         let nm = ctx.data_unchecked::<Notmuch>();
         let pool = ctx.data_unchecked::<PgPool>();
 
-        let newsreader_query: newsreader::Query = query.parse()?;
+        let newsreader_query: Query = query.parse()?;
 
         Ok(newsreader::count(pool, &newsreader_query).await? + nm::count(nm, &query).await?)
     }
@@ -257,32 +257,46 @@ impl QueryRoot {
                 let newsreader_before = before.as_ref().map(|sc| sc.newsreader_offset);
                 let notmuch_before = before.as_ref().map(|sc| sc.notmuch_offset);
 
-                let newsreader_query: newsreader::Query = query.parse()?;
-                let newsreader_results = newsreader::search(
-                    pool,
-                    newsreader_after,
-                    newsreader_before,
-                    first.map(|v| v as i32),
-                    last.map(|v| v as i32),
-                    &newsreader_query,
-                )
-                .await?
-                .into_iter()
-                .map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts));
+                let newsreader_query: Query = query.parse()?;
+                info!("newsreader_query {newsreader_query:?}");
+                let newsreader_results = if newsreader_query.is_newsreader {
+                    newsreader::search(
+                        pool,
+                        newsreader_after,
+                        newsreader_before,
+                        first.map(|v| v as i32),
+                        last.map(|v| v as i32),
+                        &newsreader_query,
+                    )
+                    .await?
+                    .into_iter()
+                    .map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts))
+                    .collect()
+                } else {
+                    Vec::new()
+                };
 
-                let notmuch_results = nm::search(
-                    nm,
-                    notmuch_after,
-                    notmuch_before,
-                    first.map(|v| v as i32),
-                    last.map(|v| v as i32),
-                    query,
-                )
-                .await?
-                .into_iter()
-                .map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts));
+                let notmuch_results = if newsreader_query.is_notmuch {
+                    nm::search(
+                        nm,
+                        notmuch_after,
+                        notmuch_before,
+                        first.map(|v| v as i32),
+                        last.map(|v| v as i32),
+                        newsreader_query.to_notmuch(),
+                    )
+                    .await?
+                    .into_iter()
+                    .map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts))
+                    .collect()
+                } else {
+                    Vec::new()
+                };
 
-                let mut results: Vec<_> = newsreader_results.chain(notmuch_results).collect();
+                let mut results: Vec<_> = newsreader_results
+                    .into_iter()
+                    .chain(notmuch_results)
+                    .collect();
 
                 // The leading '-' is to reverse sort
                 results.sort_by_key(|item| match item {
diff --git a/server/src/lib.rs b/server/src/lib.rs
index d63c149..11c5d30 100644
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@@ -3,14 +3,21 @@ pub mod graphql;
 pub mod newsreader;
 pub mod nm;
 
+use std::{convert::Infallible, str::FromStr};
+
 use css_inline::{CSSInliner, InlineError, InlineOptions};
 use linkify::{LinkFinder, LinkKind};
-use log::error;
-use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
+use log::{error, info};
+use lol_html::{element, errors::RewritingError, rewrite_str, text, RewriteStrSettings};
 use maplit::{hashmap, hashset};
 use thiserror::Error;
 use url::Url;
 
+use crate::newsreader::{
+    extract_thread_id, is_newsreader_search, is_newsreader_thread, make_news_tag,
+};
+const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE";
+
 // TODO: figure out how to use Cow
 trait Transformer {
     fn should_run(&self, _html: &str) -> bool {
@@ -46,13 +53,38 @@ struct EscapeHtml;
 
 impl Transformer for EscapeHtml {
     fn should_run(&self, html: &str) -> bool {
-        html.starts_with("&lt")
+        html.contains("&")
     }
     fn transform(&self, html: &str) -> Result<String, TransformError> {
         Ok(html_escape::decode_html_entities(html).to_string())
     }
 }
 
+/// Transformer that keeps only the text content of an HTML string.
+struct StripHtml;
+
+impl Transformer for StripHtml {
+    fn should_run(&self, html: &str) -> bool {
+        // Crude heuristic: input without a `<` cannot contain a tag.
+        html.contains("<")
+    }
+    fn transform(&self, html: &str) -> Result<String, TransformError> {
+        let mut stripped = String::new();
+        let collect_text = text!("*", |chunk| {
+            stripped.push_str(chunk.as_str());
+            Ok(())
+        });
+        let _ = rewrite_str(
+            html,
+            RewriteStrSettings {
+                element_content_handlers: vec![collect_text],
+                ..RewriteStrSettings::default()
+            },
+        )?;
+        Ok(stripped)
+    }
+}
+
 struct InlineStyle;
 
 impl Transformer for InlineStyle {
@@ -381,3 +413,83 @@ fn compute_offset_limit(
         }
     }
 }
+
+#[derive(Debug)]
+pub struct Query { // parsed form of a search string; built by the `FromStr` impl
+    pub unread_only: bool, // true when an `is:unread` word was present
+    pub tag: Option<String>, // text after `tag:` in the last `tag:` word seen
+    pub uid: Option<String>, // thread id extracted from a newsreader-thread word
+    pub remainder: Vec<String>, // words matching none of the recognised forms, in order
+    pub is_notmuch: bool, // `is:mail`/`is:email`/`is:notmuch` seen, or no corpus selector
+    pub is_newsreader: bool, // `is:news`/`is:newsreader` seen, or no corpus selector
+}
+
+impl Query {
+    /// Renders this query as a notmuch search string: `is:unread`, `tag:<tag>` and the
+    /// thread id (when set) followed by the free-text words. Corpus selectors are dropped.
+    fn to_notmuch(&self) -> String {
+        // A query that does not target notmuch renders as the empty string.
+        if !self.is_notmuch {
+            return String::new();
+        }
+        let mut terms: Vec<String> = Vec::new();
+        if self.unread_only {
+            terms.push(String::from("is:unread"));
+        }
+        if let Some(tag) = self.tag.as_deref() {
+            terms.push(format!("tag:{tag}"));
+        }
+        if let Some(thread_id) = self.uid.as_deref() {
+            terms.push(thread_id.to_owned());
+        }
+        terms.extend(self.remainder.iter().cloned());
+        terms.join(" ")
+    }
+}
+
+impl FromStr for Query {
+    type Err = Infallible;
+
+    /// Parses a whitespace-separated search string; parsing never fails.
+    ///
+    /// Recognised words: `is:unread`, `tag:<tag>`, a newsreader thread id, and the
+    /// corpus selectors `is:mail`/`is:email`/`is:notmuch` and `is:news`/`is:newsreader`.
+    /// Every other word is kept, in order, in `remainder`.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut unread_only = false;
+        let mut tag = None;
+        let mut uid = None;
+        let mut remainder = Vec::new();
+        let mut is_notmuch = false;
+        let mut is_newsreader = false;
+        for word in s.split_whitespace() {
+            if word == "is:unread" {
+                unread_only = true;
+            } else if let Some(name) = word.strip_prefix("tag:") {
+                // Keep everything after `tag:`; a later `tag:` word overwrites an earlier one.
+                tag = Some(name.to_string());
+            } else if is_newsreader_thread(word) {
+                uid = Some(extract_thread_id(word).to_string());
+            } else if matches!(word, "is:mail" | "is:email" | "is:notmuch") {
+                is_notmuch = true;
+            } else if matches!(word, "is:news" | "is:newsreader") {
+                is_newsreader = true;
+            } else {
+                remainder.push(word.to_string());
+            }
+        }
+        // If we don't see any explicit filters for a corpus, flip them all on
+        if !(is_notmuch || is_newsreader) {
+            is_newsreader = true;
+            is_notmuch = true;
+        }
+        Ok(Query {
+            unread_only,
+            tag,
+            uid,
+            remainder,
+            is_notmuch,
+            is_newsreader,
+        })
+    }
+}
diff --git a/server/src/newsreader.rs b/server/src/newsreader.rs
index 4de5f5d..0bae333 100644
--- a/server/src/newsreader.rs
+++ b/server/src/newsreader.rs
@@ -1,22 +1,19 @@
-use std::{
-    convert::Infallible,
-    hash::{DefaultHasher, Hash, Hasher},
-    str::FromStr,
-};
+use std::hash::{DefaultHasher, Hash, Hasher};
 
 use log::info;
 use sqlx::postgres::PgPool;
 use url::Url;
 
+use crate::Query;
+
 const TAG_PREFIX: &'static str = "News/";
 const THREAD_PREFIX: &'static str = "news:";
-const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE";
 
 use crate::{
     compute_offset_limit,
     error::ServerError,
     graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
-    EscapeHtml, InlineStyle, SanitizeHtml, Transformer,
+    EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer,
 };
 
 pub fn is_newsreader_search(query: &str) -> bool {
@@ -27,8 +24,20 @@ pub fn is_newsreader_thread(query: &str) -> bool {
     query.starts_with(THREAD_PREFIX)
 }
 
+pub fn extract_thread_id(query: &str) -> &str {
+    query.strip_prefix(THREAD_PREFIX).unwrap_or(query) // un-prefixed input passes through
+}
+
+pub fn extract_site(tag: &str) -> &str {
+    tag.strip_prefix(TAG_PREFIX).unwrap_or(tag) // non-news tags pass through, never panic
+}
+
+pub fn make_news_tag(tag: &str) -> String {
+    ["tag:", TAG_PREFIX, tag].concat() // search term of the form tag:<TAG_PREFIX><tag>
+}
+
 pub async fn count(pool: &PgPool, query: &Query) -> Result<usize, ServerError> {
-    let row = sqlx::query_file!("sql/count.sql", query.site, query.unread_only)
+    let row = sqlx::query_file!("sql/count.sql", query.tag, query.unread_only)
         .fetch_one(pool)
         .await?;
     Ok(row.count.unwrap_or(0).try_into().unwrap_or(0))
@@ -43,6 +52,12 @@ pub async fn search(
     query: &Query,
 ) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
     info!("search({after:?} {before:?} {first:?} {last:?} {query:?}");
+    if !query.remainder.is_empty() {
+        // TODO: handle full text search against all sites, for now, early return if search words
+        // are specified.
+        return Ok(Vec::new());
+    }
+
     let (offset, mut limit) = compute_offset_limit(after, before, first, last);
     if before.is_none() {
         // When searching forward, the +1 is to see if there are more pages of data available.
@@ -50,11 +65,17 @@ pub async fn search(
         // `before` is on the next page.
         limit = limit + 1;
     }
-    info!("search offset {offset} limit {limit}");
 
+    let site = query.tag.as_ref().map(|t| extract_site(&t).to_string());
+    info!(
+        "search offset {offset} limit {limit} site {site:?} unread_only {}",
+        query.unread_only
+    );
+
+    // TODO: further limit results to include query.remainder if set
     let rows = sqlx::query_file!(
         "sql/threads.sql",
-        query.site,
+        site,
         query.unread_only,
         offset as i64,
         limit as i64
@@ -66,12 +87,13 @@ pub async fn search(
         .into_iter()
         .enumerate()
         .map(|(i, r)| {
-            let site = r.site.unwrap_or("UNKOWN SITE".to_string());
-            let tags = if r.is_read.unwrap_or(false) {
-                vec![site.clone()]
-            } else {
-                vec!["unread".to_string(), site.clone()]
+            let site = r.site.unwrap_or("UNKOWN TAG".to_string());
+            let mut tags = vec![format!("{TAG_PREFIX}{site}")];
+            if !r.is_read.unwrap_or(true) {
+                tags.push("unread".to_string());
             };
+            let mut title = r.title.unwrap_or("NO TITLE".to_string());
+            title = clean_title(&title).expect("failed to clean title");
             (
                 i as i32 + offset,
                 ThreadSummary {
@@ -85,7 +107,7 @@ pub async fn search(
                     matched: 0,
                     total: 1,
                     authors: r.name.unwrap_or_else(|| site.clone()),
-                    subject: r.title.unwrap_or("NO TITLE".to_string()),
+                    subject: title,
                     tags,
                 },
             )
@@ -125,11 +147,10 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
         .fetch_one(pool)
         .await?;
 
-    let site = r.site.unwrap_or("NO SITE".to_string());
-    let tags = if r.is_read.unwrap_or(false) {
-        vec![site.clone()]
-    } else {
-        vec!["unread".to_string(), site.clone()]
+    let site = r.site.unwrap_or("NO TAG".to_string());
+    let mut tags = vec![format!("{TAG_PREFIX}{site}")];
+    if !r.is_read.unwrap_or(true) {
+        tags.push("unread".to_string()); // only when `is_read` is false; NULL counts as read
     };
     let default_homepage = "http://no-homepage";
     let homepage = Url::parse(
@@ -166,11 +187,11 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
             }
         }
     });
-    let mut html = r.summary.unwrap_or("NO SUMMARY".to_string());
+    let mut body = r.summary.unwrap_or("NO SUMMARY".to_string());
     // TODO: add site specific cleanups. For example:
     // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
     // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let tranformers: Vec<Box<dyn Transformer>> = vec![
+    let body_tranformers: Vec<Box<dyn Transformer>> = vec![
         Box::new(EscapeHtml),
         Box::new(InlineStyle),
         Box::new(SanitizeHtml {
@@ -178,16 +199,16 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
             base_url: &link,
         }),
     ];
-    for t in tranformers.iter() {
-        if t.should_run(&html) {
-            html = t.transform(&html)?;
+    for t in body_tranformers.iter() {
+        if t.should_run(&body) {
+            body = t.transform(&body)?;
         }
     }
     let body = Body::Html(Html {
-        html,
+        html: body,
         content_tree: "".to_string(),
     });
-    let title = r.title.unwrap_or("NO TITLE".to_string());
+    let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string()))?;
     let from = Some(Email {
         name: r.name,
         addr: addr.map(|a| a.to_string()),
@@ -215,47 +236,6 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
         }],
     })
 }
-
-#[derive(Debug)]
-pub struct Query {
-    pub unread_only: bool,
-    pub site: Option<String>,
-    pub uid: Option<String>,
-    pub remainder: Vec<String>,
-}
-
-impl FromStr for Query {
-    type Err = Infallible;
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut unread_only = false;
-        let mut site = None;
-        let mut uid = None;
-        let mut remainder = Vec::new();
-        let site_prefix = format!("tag:{TAG_PREFIX}");
-        for word in s.split_whitespace() {
-            if word == "is:unread" {
-                unread_only = true
-            } else if word.starts_with(&site_prefix) {
-                site = Some(word[site_prefix.len()..].to_string())
-            } else if word.starts_with("tag:") {
-                // Any tag that doesn't match site_prefix should explicitly set the site to something not in the
-                // database
-                site = Some(NON_EXISTENT_SITE_NAME.to_string());
-            } else if word.starts_with(THREAD_PREFIX) {
-                uid = Some(word[THREAD_PREFIX.len()..].to_string())
-            } else {
-                remainder.push(word.to_string());
-            }
-        }
-        Ok(Query {
-            unread_only,
-            site,
-            uid,
-            remainder,
-        })
-    }
-}
-
 pub async fn set_read_status<'ctx>(
     pool: &PgPool,
     query: &str,
@@ -267,3 +247,17 @@ pub async fn set_read_status<'ctx>(
         .await?;
     Ok(true)
 }
+/// Reduces a feed title to plain text by running `EscapeHtml` and then `StripHtml`,
+/// each only if its `should_run` check passes; a transformer failure is returned as `Err`.
+fn clean_title(title: &str) -> Result<String, ServerError> {
+    // Wrap the title in a root element, once, so the HTML rewriter has markup to walk.
+    let mut title = format!("<html>{title}</html>");
+    let title_tranformers: Vec<Box<dyn Transformer>> =
+        vec![Box::new(EscapeHtml), Box::new(StripHtml)];
+    for t in title_tranformers.iter() {
+        if t.should_run(&title) {
+            title = t.transform(&title)?;
+        }
+    }
+    Ok(title)
+}