use std::collections::HashMap;

use cacher::FilesystemCacher;
use futures::{stream::FuturesUnordered, StreamExt};
use log::{error, info};
use maplit::hashmap;
use scraper::Selector;
use shared::compute_color;
use sqlx::postgres::PgPool;
use tracing::instrument;
use url::Url;

use crate::{
    clean_title, compute_offset_limit,
    error::ServerError,
    graphql::{Corpus, NewsPost, Tag, Thread, ThreadSummary},
    thread_summary_from_row, AddOutlink, FrameImages, Query, SanitizeHtml, SlurpContents,
    StripHtml, ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX, NEWSREADER_THREAD_PREFIX,
};

pub fn is_newsreader_query(query: &Query) -> bool {
    query.is_newsreader || query.corpus == Some(Corpus::Newsreader)
}

pub fn is_newsreader_thread(query: &str) -> bool {
    query.starts_with(NEWSREADER_THREAD_PREFIX)
}

pub fn extract_thread_id(query: &str) -> &str {
    if query.starts_with(NEWSREADER_THREAD_PREFIX) {
        &query[NEWSREADER_THREAD_PREFIX.len()..]
    } else {
        query
    }
}

pub fn extract_site(tag: &str) -> &str {
    &tag[NEWSREADER_TAG_PREFIX.len()..]
}

pub fn make_news_tag(tag: &str) -> String {
    format!("tag:{NEWSREADER_TAG_PREFIX}{tag}")
}

fn site_from_tags(tags: &[String]) -> Option<String> {
    for t in tags {
        if t.starts_with(NEWSREADER_TAG_PREFIX) {
            return Some(extract_site(t).to_string());
        }
    }
    None
}

#[instrument(name = "newsreader::count", skip_all, fields(query=%query))]
pub async fn count(pool: &PgPool, query: &Query) -> Result<usize, async_graphql::Error> {
    if !is_newsreader_query(query) {
        return Ok(0);
    }
    let site = site_from_tags(&query.tags);
    if !query.tags.is_empty() && site.is_none() {
        // Newsreader only supports all-sites read/unread queries; anything with a non-site tag
        // isn't supported.
        return Ok(0);
    }

    let search_term = query.remainder.join(" ");
    let search_term = search_term.trim();
    let search_term = if search_term.is_empty() {
        None
    } else {
        Some(search_term)
    };

    // TODO: add support for looking for search_term in title and site
    let row = sqlx::query_file!("sql/count.sql", site, query.unread_only, search_term)
        .fetch_one(pool)
        .await?;
    Ok(row.count.unwrap_or(0).try_into().unwrap_or(0))
}

#[instrument(name = "newsreader::search", skip_all, fields(query=%query))]
pub async fn search(
    pool: &PgPool,
    after: Option<i32>,
    before: Option<i32>,
    first: Option<i32>,
    last: Option<i32>,
    query: &Query,
) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
    info!("search({after:?} {before:?} {first:?} {last:?} {query:?})");
    if !is_newsreader_query(query) {
        return Ok(Vec::new());
    }
    let site = site_from_tags(&query.tags);
    if !query.tags.is_empty() && site.is_none() {
        // Newsreader only supports all-sites read/unread queries; anything with a non-site tag
        // isn't supported.
        return Ok(Vec::new());
    }

    let (offset, mut limit) = compute_offset_limit(after, before, first, last);
    if before.is_none() {
        // When searching forward, the +1 is to see if there are more pages of data available.
        // Searching backwards implies there are more pages forward, because the value
        // represented by `before` is on the next page.
        limit += 1;
    }
    info!(
        "search offset {offset} limit {limit} site {site:?} unread_only {}",
        query.unread_only
    );

    let search_term = query.remainder.join(" ");
    let search_term = search_term.trim();
    let search_term = if search_term.is_empty() {
        None
    } else {
        Some(search_term)
    };

    // TODO: add support for looking for search_term in title and site
    let rows = sqlx::query_file!(
        "sql/threads.sql",
        site,
        query.unread_only,
        offset as i64,
        limit as i64,
        search_term
    )
    .fetch_all(pool)
    .await?;

    let mut res = Vec::new();
    for (i, r) in rows.into_iter().enumerate() {
        res.push((
            i as i32 + offset,
            thread_summary_from_row(ThreadSummaryRecord {
                site: r.site,
                date: r.date,
                is_read: r.is_read,
                title: r.title,
                uid: r.uid,
                name: r.name,
                corpus: Corpus::Newsreader,
            })
            .await,
        ));
    }
    Ok(res)
}

#[instrument(name = "newsreader::tags", skip_all, fields(needs_unread=%_needs_unread))]
pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
    // TODO: optimize query by using needs_unread
    let tags = sqlx::query_file!("sql/tags.sql").fetch_all(pool).await?;
    let tags = tags
        .into_iter()
        .map(|tag| {
            let unread = tag.unread.unwrap_or(0).try_into().unwrap_or(0);
            let name = format!(
                "{NEWSREADER_TAG_PREFIX}{}",
                tag.site.expect("tag must have site")
            );
            let hex = compute_color(&name);
            Tag {
                name,
                fg_color: "white".to_string(),
                bg_color: hex,
                unread,
            }
        })
        .collect();
    Ok(tags)
}

#[instrument(name = "newsreader::thread", skip_all, fields(thread_id=%thread_id))]
pub async fn thread(
    cacher: &FilesystemCacher,
    pool: &PgPool,
    thread_id: String,
) -> Result<Thread, async_graphql::Error> {
    let id = thread_id
        .strip_prefix(NEWSREADER_THREAD_PREFIX)
        .expect("news thread doesn't start with '{NEWSREADER_THREAD_PREFIX}'")
        .to_string();
    let r = sqlx::query_file!("sql/thread.sql", id)
        .fetch_one(pool)
        .await?;

    let slug = r.site.unwrap_or("no-slug".to_string());
    let site = r.name.unwrap_or("NO SITE".to_string());
    // TODO: remove the various places that have this as an Option
    let link = Some(Url::parse(&r.link)?);
    let mut body = r.clean_summary.unwrap_or("NO SUMMARY".to_string());
    let body_transformers: Vec<Box<dyn Transformer + '_>> = vec![
        Box::new(SlurpContents {
            cacher,
            inline_css: true,
            site_selectors: slurp_contents_selectors(),
        }),
        Box::new(FrameImages),
        Box::new(AddOutlink),
        // TODO: causes doubling of images in cloudflare blogs
        //Box::new(EscapeHtml),
        Box::new(SanitizeHtml {
            cid_prefix: "",
            base_url: &link,
        }),
    ];
    for t in body_transformers.iter() {
        if t.should_run(&link, &body) {
            body = t.transform(&link, &body).await?;
        }
    }
    let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string())).await?;
    let is_read = r.is_read.unwrap_or(false);
    let timestamp = r
        .date
        .expect("post missing date")
        .assume_utc()
        .unix_timestamp();

    Ok(Thread::News(NewsPost {
        thread_id,
        is_read,
        slug,
        site,
        title,
        body,
        url: link
            .as_ref()
            .map(|url| url.to_string())
            .unwrap_or("NO URL".to_string()),
        timestamp,
    }))
}

#[instrument(name = "newsreader::set_read_status", skip_all, fields(query=%query,unread=%unread))]
pub async fn set_read_status<'ctx>(
    pool: &PgPool,
    query: &Query,
    unread: bool,
) -> Result<bool, async_graphql::Error> {
    // TODO: make single query when query.uids.len() > 1
    let uids: Vec<_> = query
        .uids
        .iter()
        .filter(|uid| is_newsreader_thread(uid))
        .map(
            |uid| extract_thread_id(uid), // TODO strip prefix
        )
        .collect();
    for uid in uids {
        sqlx::query_file!("sql/set_unread.sql", !unread, uid)
            .execute(pool)
            .await?;
    }
    Ok(true)
}

#[instrument(name = "newsreader::refresh", skip_all)]
pub async fn refresh<'ctx>(
    pool: &PgPool,
    cacher: &FilesystemCacher,
) -> Result<bool, async_graphql::Error> {
    async fn update_search_summary(
        pool: &PgPool,
        cacher: &FilesystemCacher,
        link: String,
        body: String,
        id: i32,
    ) -> Result<(), ServerError> {
        let slurp_contents = SlurpContents {
            cacher,
            inline_css: true,
            site_selectors: slurp_contents_selectors(),
        };
        let strip_html = StripHtml;
        info!("adding {link} to search index");
        let mut body = body;
        if let Ok(link) = Url::parse(&link) {
            let link = Some(link);
            if slurp_contents.should_run(&link, &body) {
                body = slurp_contents.transform(&link, &body).await?;
            }
        } else {
            error!("failed to parse link: {}", link);
        }
        body = strip_html.transform(&None, &body).await?;
        sqlx::query!(
            "UPDATE post SET search_summary = $1 WHERE id = $2",
            body,
            id
        )
        .execute(pool)
        .await?;
        Ok(())
    }

    let mut unordered: FuturesUnordered<_> = sqlx::query_file!("sql/need-search-summary.sql")
        .fetch_all(pool)
        .await?
        .into_iter()
        .filter_map(|r| {
            let Some(body) = r.clean_summary else {
                error!("clean_summary missing for {}", r.link);
                return None;
            };
            let id = r.id;
            Some(update_search_summary(pool, cacher, r.link, body, id))
        })
        .collect();
    while let Some(res) = unordered.next().await {
        match res {
            Ok(()) => {}
            Err(err) => {
                info!("failed refresh {err:?}");
                // TODO:
                //fd.error = Some(err);
            }
        };
    }

    Ok(true)
}

fn slurp_contents_selectors() -> HashMap<String, Vec<Selector>> {
    hashmap![
        "atmeta.com".to_string() => vec![
            Selector::parse("div.entry-content").unwrap(),
        ],
        "blog.prusa3d.com".to_string() => vec![
            Selector::parse("article.content .post-block").unwrap(),
        ],
        "blog.cloudflare.com".to_string() => vec![
            Selector::parse(".author-lists .author-name-tooltip").unwrap(),
            Selector::parse(".post-full-content").unwrap(),
        ],
        "blog.zsa.io".to_string() => vec![
            Selector::parse("section.blog-article").unwrap(),
        ],
        "engineering.fb.com".to_string() => vec![
            Selector::parse("article").unwrap(),
        ],
        "grafana.com".to_string() => vec![
            Selector::parse(".blog-content").unwrap(),
        ],
        "hackaday.com".to_string() => vec![
            Selector::parse("div.entry-featured-image").unwrap(),
            Selector::parse("div.entry-content").unwrap(),
        ],
        "ingowald.blog".to_string() => vec![
            Selector::parse("article").unwrap(),
        ],
        "jvns.ca".to_string() => vec![
            Selector::parse("article").unwrap(),
        ],
        "mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
        "natwelch.com".to_string() => vec![
            Selector::parse("article div.prose").unwrap(),
        ],
        "rustacean-station.org".to_string() => vec![
            Selector::parse("article").unwrap(),
        ],
        "slashdot.org".to_string() => vec![
            Selector::parse("span.story-byline").unwrap(),
            Selector::parse("div.p").unwrap(),
        ],
        "theonion.com".to_string() => vec![
            // Single image joke w/ title
            Selector::parse("article > section > div > figure").unwrap(),
            // Single cartoon
            Selector::parse("article > div > div > figure").unwrap(),
            // Image at top of article
            Selector::parse("article > header > div > div > figure").unwrap(),
            // Article body
            Selector::parse("article .entry-content > *").unwrap(),
        ],
        "trofi.github.io".to_string() => vec![
            Selector::parse("#content").unwrap(),
        ],
        "www.redox-os.org".to_string() => vec![
            Selector::parse("div.content").unwrap(),
        ],
        "www.smbc-comics.com".to_string() => vec![
            Selector::parse("img#cc-comic").unwrap(),
            Selector::parse("div#aftercomic img").unwrap(),
        ],
    ]
}
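
// A minimal round-trip sketch of the prefix helpers above. It assumes
// NEWSREADER_THREAD_PREFIX and NEWSREADER_TAG_PREFIX are ordinary &str
// constants; the ids and site names used here are illustrative values only,
// not real data.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn thread_id_round_trips_through_prefix() {
        let prefixed = format!("{NEWSREADER_THREAD_PREFIX}abc123");
        assert!(is_newsreader_thread(&prefixed));
        assert_eq!(extract_thread_id(&prefixed), "abc123");
        // Ids without the prefix pass through unchanged.
        assert_eq!(extract_thread_id("abc123"), "abc123");
    }

    #[test]
    fn site_tag_round_trips_through_prefix() {
        let tag = format!("{NEWSREADER_TAG_PREFIX}example.com");
        assert_eq!(extract_site(&tag), "example.com");
        assert_eq!(site_from_tags(&[tag]), Some("example.com".to_string()));
        // make_news_tag layers the graphql "tag:" prefix on top of the site prefix.
        assert!(make_news_tag("example.com").starts_with("tag:"));
    }
}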