From d1cfc77148381476fa5d10bd2871223ee889403e Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Mon, 12 Aug 2024 20:53:48 -0700 Subject: [PATCH] server: more news title/body cleanup, and don't search news so much --- server/src/graphql.rs | 66 ++++++++++++-------- server/src/lib.rs | 118 +++++++++++++++++++++++++++++++++- server/src/newsreader.rs | 132 +++++++++++++++++++-------------------- 3 files changed, 218 insertions(+), 98 deletions(-) diff --git a/server/src/graphql.rs b/server/src/graphql.rs index ea7b0d9..8f89093 100644 --- a/server/src/graphql.rs +++ b/server/src/graphql.rs @@ -8,7 +8,7 @@ use notmuch::Notmuch; use serde::{Deserialize, Serialize}; use sqlx::postgres::PgPool; -use crate::{newsreader, nm}; +use crate::{newsreader, nm, Query}; /// # Number of seconds since the Epoch pub type UnixTime = isize; @@ -215,7 +215,7 @@ impl QueryRoot { let nm = ctx.data_unchecked::(); let pool = ctx.data_unchecked::(); - let newsreader_query: newsreader::Query = query.parse()?; + let newsreader_query: Query = query.parse()?; Ok(newsreader::count(pool, &newsreader_query).await? + nm::count(nm, &query).await?) } @@ -257,32 +257,46 @@ impl QueryRoot { let newsreader_before = before.as_ref().map(|sc| sc.newsreader_offset); let notmuch_before = before.as_ref().map(|sc| sc.notmuch_offset); - let newsreader_query: newsreader::Query = query.parse()?; - let newsreader_results = newsreader::search( - pool, - newsreader_after, - newsreader_before, - first.map(|v| v as i32), - last.map(|v| v as i32), - &newsreader_query, - ) - .await? - .into_iter() - .map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts)); + let newsreader_query: Query = query.parse()?; + info!("newsreader_query {newsreader_query:?}"); + let newsreader_results = if newsreader_query.is_newsreader { + newsreader::search( + pool, + newsreader_after, + newsreader_before, + first.map(|v| v as i32), + last.map(|v| v as i32), + &newsreader_query, + ) + .await? + .into_iter() + .map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts)) + .collect() + } else { + Vec::new() + }; - let notmuch_results = nm::search( - nm, - notmuch_after, - notmuch_before, - first.map(|v| v as i32), - last.map(|v| v as i32), - query, - ) - .await? - .into_iter() - .map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts)); + let notmuch_results = if newsreader_query.is_notmuch { + nm::search( + nm, + notmuch_after, + notmuch_before, + first.map(|v| v as i32), + last.map(|v| v as i32), + newsreader_query.to_notmuch(), + ) + .await? + .into_iter() + .map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts)) + .collect() + } else { + Vec::new() + }; - let mut results: Vec<_> = newsreader_results.chain(notmuch_results).collect(); + let mut results: Vec<_> = newsreader_results + .into_iter() + .chain(notmuch_results) + .collect(); // The leading '-' is to reverse sort results.sort_by_key(|item| match item { diff --git a/server/src/lib.rs b/server/src/lib.rs index d63c149..11c5d30 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -3,14 +3,21 @@ pub mod graphql; pub mod newsreader; pub mod nm; +use std::{convert::Infallible, str::FromStr}; + use css_inline::{CSSInliner, InlineError, InlineOptions}; use linkify::{LinkFinder, LinkKind}; -use log::error; -use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings}; +use log::{error, info}; +use lol_html::{element, errors::RewritingError, rewrite_str, text, RewriteStrSettings}; use maplit::{hashmap, hashset}; use thiserror::Error; use url::Url; +use crate::newsreader::{ + extract_thread_id, is_newsreader_search, is_newsreader_thread, make_news_tag, +}; +const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE"; + // TODO: figure out how to use Cow trait Transformer { fn should_run(&self, _html: &str) -> bool { @@ -46,13 +53,38 @@ struct EscapeHtml; impl Transformer for EscapeHtml { fn should_run(&self, html: &str) -> bool { - html.starts_with("<") + html.contains("&") } fn transform(&self, html: &str) -> Result { Ok(html_escape::decode_html_entities(html).to_string()) } } +struct StripHtml; + +impl Transformer for StripHtml { + fn should_run(&self, html: &str) -> bool { + // Lame test + html.contains("<") + } + fn transform(&self, html: &str) -> Result { + let mut text = String::new(); + let element_content_handlers = vec![text!("*", |t| { + text += t.as_str(); + Ok(()) + })]; + let _ = rewrite_str( + html, + RewriteStrSettings { + element_content_handlers, + ..RewriteStrSettings::default() + }, + )?; + + Ok(text) + } +} + struct InlineStyle; impl Transformer for InlineStyle { @@ -381,3 +413,83 @@ fn compute_offset_limit( } } } + +#[derive(Debug)] +pub struct Query { + pub unread_only: bool, + pub tag: Option, + pub uid: Option, + pub remainder: Vec, + pub is_notmuch: bool, + pub is_newsreader: bool, +} + +impl Query { + // Converts the internal state of Query to something suitable for notmuch queries. Removes and + // letterbox specific ': String { + let mut parts = Vec::new(); + if !self.is_notmuch { + return String::new(); + } + + if self.unread_only { + parts.push("is:unread".to_string()); + } + if let Some(site) = &self.tag { + parts.push(format!("tag:{site}")); + } + if let Some(uid) = &self.uid { + parts.push(uid.clone()); + } + parts.extend(self.remainder.clone()); + parts.join(" ") + } +} + +impl FromStr for Query { + type Err = Infallible; + fn from_str(s: &str) -> Result { + let mut unread_only = false; + let mut tag = None; + let mut uid = None; + let mut remainder = Vec::new(); + let site_prefix = make_news_tag(""); + let mut is_notmuch = false; + let mut is_newsreader = false; + for word in s.split_whitespace() { + if word == "is:unread" { + unread_only = true + } else if word.starts_with("tag:") { + tag = Some(word["tag:".len()..].to_string()) + /* + } else if word.starts_with("tag:") { + // Any tag that doesn't match site_prefix should explicitly set the site to something not in the + // database + site = Some(NON_EXISTENT_SITE_NAME.to_string()); + */ + } else if is_newsreader_thread(word) { + uid = Some(extract_thread_id(word).to_string()) + } else if word == "is:mail" || word == "is:email" || word == "is:notmuch" { + is_notmuch = true; + } else if word == "is:news" || word == "is:newsreader" { + is_newsreader = true; + } else { + remainder.push(word.to_string()); + } + } + // If we don't see any explicit filters for a corpus, flip them all on + if !(is_notmuch || is_newsreader) { + is_newsreader = true; + is_notmuch = true; + } + Ok(Query { + unread_only, + tag, + uid, + remainder, + is_notmuch, + is_newsreader, + }) + } +} diff --git a/server/src/newsreader.rs b/server/src/newsreader.rs index 4de5f5d..0bae333 100644 --- a/server/src/newsreader.rs +++ b/server/src/newsreader.rs @@ -1,22 +1,19 @@ -use std::{ - convert::Infallible, - hash::{DefaultHasher, Hash, Hasher}, - str::FromStr, -}; +use std::hash::{DefaultHasher, Hash, Hasher}; use log::info; use sqlx::postgres::PgPool; use url::Url; +use crate::Query; + const TAG_PREFIX: &'static str = "News/"; const THREAD_PREFIX: &'static str = "news:"; -const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE"; use crate::{ compute_offset_limit, error::ServerError, graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary}, - EscapeHtml, InlineStyle, SanitizeHtml, Transformer, + EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer, }; pub fn is_newsreader_search(query: &str) -> bool { @@ -27,8 +24,20 @@ pub fn is_newsreader_thread(query: &str) -> bool { query.starts_with(THREAD_PREFIX) } +pub fn extract_thread_id(query: &str) -> &str { + &query[THREAD_PREFIX.len()..] +} + +pub fn extract_site(tag: &str) -> &str { + &tag[TAG_PREFIX.len()..] +} + +pub fn make_news_tag(tag: &str) -> String { + format!("tag:{TAG_PREFIX}{tag}") +} + pub async fn count(pool: &PgPool, query: &Query) -> Result { - let row = sqlx::query_file!("sql/count.sql", query.site, query.unread_only) + let row = sqlx::query_file!("sql/count.sql", query.tag, query.unread_only) .fetch_one(pool) .await?; Ok(row.count.unwrap_or(0).try_into().unwrap_or(0)) @@ -43,6 +52,12 @@ pub async fn search( query: &Query, ) -> Result, async_graphql::Error> { info!("search({after:?} {before:?} {first:?} {last:?} {query:?}"); + if !query.remainder.is_empty() { + // TODO: handle full text search against all sites, for now, early return if search words + // are specified. + return Ok(Vec::new()); + } + let (offset, mut limit) = compute_offset_limit(after, before, first, last); if before.is_none() { // When searching forward, the +1 is to see if there are more pages of data available. @@ -50,11 +65,17 @@ pub async fn search( // `before` is on the next page. limit = limit + 1; } - info!("search offset {offset} limit {limit}"); + let site = query.tag.as_ref().map(|t| extract_site(&t).to_string()); + info!( + "search offset {offset} limit {limit} site {site:?} unread_only {}", + query.unread_only + ); + + // TODO: further limit results to include query.remainder if set let rows = sqlx::query_file!( "sql/threads.sql", - query.site, + site, query.unread_only, offset as i64, limit as i64 @@ -66,12 +87,13 @@ pub async fn search( .into_iter() .enumerate() .map(|(i, r)| { - let site = r.site.unwrap_or("UNKOWN SITE".to_string()); - let tags = if r.is_read.unwrap_or(false) { - vec![site.clone()] - } else { - vec!["unread".to_string(), site.clone()] + let site = r.site.unwrap_or("UNKOWN TAG".to_string()); + let mut tags = vec![format!("{TAG_PREFIX}{site}")]; + if !r.is_read.unwrap_or(true) { + tags.push("unread".to_string()); }; + let mut title = r.title.unwrap_or("NO TITLE".to_string()); + title = clean_title(&title).expect("failed to clean title"); ( i as i32 + offset, ThreadSummary { @@ -85,7 +107,7 @@ pub async fn search( matched: 0, total: 1, authors: r.name.unwrap_or_else(|| site.clone()), - subject: r.title.unwrap_or("NO TITLE".to_string()), + subject: title, tags, }, ) @@ -125,11 +147,10 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result Result // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent - let tranformers: Vec> = vec![ + let body_tranformers: Vec> = vec![ Box::new(EscapeHtml), Box::new(InlineStyle), Box::new(SanitizeHtml { @@ -178,16 +199,16 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result Result, - pub uid: Option, - pub remainder: Vec, -} - -impl FromStr for Query { - type Err = Infallible; - fn from_str(s: &str) -> Result { - let mut unread_only = false; - let mut site = None; - let mut uid = None; - let mut remainder = Vec::new(); - let site_prefix = format!("tag:{TAG_PREFIX}"); - for word in s.split_whitespace() { - if word == "is:unread" { - unread_only = true - } else if word.starts_with(&site_prefix) { - site = Some(word[site_prefix.len()..].to_string()) - } else if word.starts_with("tag:") { - // Any tag that doesn't match site_prefix should explicitly set the site to something not in the - // database - site = Some(NON_EXISTENT_SITE_NAME.to_string()); - } else if word.starts_with(THREAD_PREFIX) { - uid = Some(word[THREAD_PREFIX.len()..].to_string()) - } else { - remainder.push(word.to_string()); - } - } - Ok(Query { - unread_only, - site, - uid, - remainder, - }) - } -} - pub async fn set_read_status<'ctx>( pool: &PgPool, query: &str, @@ -267,3 +247,17 @@ pub async fn set_read_status<'ctx>( .await?; Ok(true) } +fn clean_title(title: &str) -> Result { + // Make title HTML so html parsers work + let mut title = format!("{title}"); + let title_tranformers: Vec> = + vec![Box::new(EscapeHtml), Box::new(StripHtml)]; + // Make title HTML so html parsers work + title = format!("{title}"); + for t in title_tranformers.iter() { + if t.should_run(&title) { + title = t.transform(&title)?; + } + } + Ok(title) +}