// letterbox/server/src/newsreader.rs

use std::collections::HashMap;
use cacher::FilesystemCacher;
use futures::{stream::FuturesUnordered, StreamExt};
use letterbox_shared::compute_color;
use maplit::hashmap;
use scraper::Selector;
use sqlx::postgres::PgPool;
use tracing::{error, info, instrument};
use url::Url;
use crate::{
clean_title, compute_offset_limit,
error::ServerError,
graphql::{Corpus, NewsPost, Tag, Thread, ThreadSummary},
thread_summary_from_row, AddOutlink, FrameImages, Query, SanitizeHtml, SlurpContents,
StripHtml, ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX, NEWSREADER_THREAD_PREFIX,
};
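
/// Returns true when `query` explicitly targets the newsreader corpus,
/// either via the `is_newsreader` flag or `corpus == Corpus::Newsreader`.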
pub fn is_newsreader_query(query: &Query) -> bool {
query.is_newsreader || query.corpus == Some(Corpus::Newsreader)
}
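
/// Returns true when the uid carries the newsreader thread prefix.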
pub fn is_newsreader_thread(query: &str) -> bool {
query.starts_with(NEWSREADER_THREAD_PREFIX)
}
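
/// Strips the newsreader thread prefix from `query`; uids without the
/// prefix are returned unchanged.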
pub fn extract_thread_id(query: &str) -> &str {
    query
        .strip_prefix(NEWSREADER_THREAD_PREFIX)
        .unwrap_or(query)
}
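
/// Returns the site portion of a newsreader tag. Callers must pass a tag
/// that starts with `NEWSREADER_TAG_PREFIX`.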
pub fn extract_site(tag: &str) -> &str {
&tag[NEWSREADER_TAG_PREFIX.len()..]
}
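
/// Builds the `tag:`-qualified search token for a newsreader site.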
pub fn make_news_tag(tag: &str) -> String {
format!("tag:{NEWSREADER_TAG_PREFIX}{tag}")
}
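
/// Returns the site named by the first newsreader tag, or `None` when no
/// tag carries the newsreader prefix.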
fn site_from_tags(tags: &[String]) -> Option<String> {
for t in tags {
if t.starts_with(NEWSREADER_TAG_PREFIX) {
return Some(extract_site(t).to_string());
}
}
None
}
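
/// Counts the newsreader threads matching `query`. Non-newsreader queries
/// and queries with a non-site tag always count as zero.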
#[instrument(name = "newsreader::count", skip_all, fields(query=%query))]
pub async fn count(pool: &PgPool, query: &Query) -> Result<usize, ServerError> {
if !is_newsreader_query(query) {
return Ok(0);
}
let site = site_from_tags(&query.tags);
    if !query.tags.is_empty() && site.is_none() {
        // Newsreader only understands site tags (or no tags at all, for all-site read/unread
        // queries); any other tag isn't supported.
        return Ok(0);
    }
let search_term = query.remainder.join(" ");
let search_term = search_term.trim();
let search_term = if search_term.is_empty() {
None
} else {
Some(search_term)
};
// TODO: add support for looking for search_term in title and site
let row = sqlx::query_file!("sql/count.sql", site, query.unread_only, search_term)
.fetch_one(pool)
.await?;
Ok(row.count.unwrap_or(0).try_into().unwrap_or(0))
}
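
/// Returns `(offset, summary)` pairs for one page of matching threads,
/// paginated by the `after`/`before`/`first`/`last` cursor arguments.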
#[instrument(name = "newsreader::search", skip_all, fields(query=%query))]
pub async fn search(
pool: &PgPool,
after: Option<i32>,
before: Option<i32>,
first: Option<i32>,
last: Option<i32>,
query: &Query,
) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
info!("search({after:?} {before:?} {first:?} {last:?} {query:?}");
if !is_newsreader_query(query) {
return Ok(Vec::new());
}
let site = site_from_tags(&query.tags);
    if !query.tags.is_empty() && site.is_none() {
        // Newsreader only understands site tags (or no tags at all, for all-site read/unread
        // queries); any other tag isn't supported.
        return Ok(Vec::new());
    }
let (offset, mut limit) = compute_offset_limit(after, before, first, last);
    if before.is_none() {
        // When searching forward, the +1 is to see if there are more pages of data available.
        // Searching backwards implies there are more pages forward, because the value
        // represented by `before` is on the next page.
        limit += 1;
    }
info!(
"search offset {offset} limit {limit} site {site:?} unread_only {}",
query.unread_only
);
let search_term = query.remainder.join(" ");
let search_term = search_term.trim();
let search_term = if search_term.is_empty() {
None
} else {
Some(search_term)
};
// TODO: add support for looking for search_term in title and site
let rows = sqlx::query_file!(
"sql/threads.sql",
site,
query.unread_only,
offset as i64,
limit as i64,
search_term
)
.fetch_all(pool)
.await?;
let mut res = Vec::new();
for (i, r) in rows.into_iter().enumerate() {
res.push((
i as i32 + offset,
thread_summary_from_row(ThreadSummaryRecord {
site: r.site,
date: r.date,
is_read: r.is_read,
title: r.title,
uid: r.uid,
name: r.name,
corpus: Corpus::Newsreader,
})
.await,
));
}
Ok(res)
}
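
/// Returns one `Tag` per site with its unread count; the background color
/// is derived from the tag name.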
#[instrument(name = "newsreader::tags", skip_all, fields(needs_unread=%_needs_unread))]
pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
// TODO: optimize query by using needs_unread
let tags = sqlx::query_file!("sql/tags.sql").fetch_all(pool).await?;
let tags = tags
.into_iter()
.map(|tag| {
let unread = tag.unread.unwrap_or(0).try_into().unwrap_or(0);
let name = format!(
"{NEWSREADER_TAG_PREFIX}{}",
tag.site.expect("tag must have site")
);
let hex = compute_color(&name);
Tag {
name,
fg_color: "white".to_string(),
bg_color: hex,
unread,
}
})
.collect();
Ok(tags)
}
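
/// Loads a single post, runs its body through the HTML transformer
/// pipeline, and returns it as a `Thread::News`. Panics if `thread_id`
/// doesn't start with the newsreader thread prefix.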
#[instrument(name = "newsreader::thread", skip_all, fields(thread_id=%thread_id))]
pub async fn thread(
cacher: &FilesystemCacher,
pool: &PgPool,
thread_id: String,
) -> Result<Thread, ServerError> {
    let id = thread_id
        .strip_prefix(NEWSREADER_THREAD_PREFIX)
        .unwrap_or_else(|| panic!("news thread doesn't start with '{NEWSREADER_THREAD_PREFIX}'"))
        .to_string();
let r = sqlx::query_file!("sql/thread.sql", id)
.fetch_one(pool)
.await?;
let slug = r.site.unwrap_or("no-slug".to_string());
let site = r.name.unwrap_or("NO SITE".to_string());
// TODO: remove the various places that have this as an Option
let link = Some(Url::parse(&r.link)?);
let mut body = r.clean_summary.unwrap_or("NO SUMMARY".to_string());
let body_transformers: Vec<Box<dyn Transformer>> = vec![
Box::new(SlurpContents {
cacher,
inline_css: true,
site_selectors: slurp_contents_selectors(),
}),
Box::new(FrameImages),
Box::new(AddOutlink),
// TODO: causes doubling of images in cloudflare blogs
//Box::new(EscapeHtml),
Box::new(SanitizeHtml {
cid_prefix: "",
base_url: &link,
}),
];
for t in body_transformers.iter() {
if t.should_run(&link, &body) {
body = t.transform(&link, &body).await?;
}
}
let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string())).await?;
let is_read = r.is_read.unwrap_or(false);
let timestamp = r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp();
Ok(Thread::News(NewsPost {
thread_id,
is_read,
slug,
site,
title,
body,
url: link
.as_ref()
.map(|url| url.to_string())
.unwrap_or("NO URL".to_string()),
timestamp,
}))
}
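
/// Marks every newsreader uid in `query.uids` as read or unread, issuing
/// one UPDATE per uid; non-newsreader uids are ignored.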
#[instrument(name = "newsreader::set_read_status", skip_all, fields(query=%query,unread=%unread))]
pub async fn set_read_status(
pool: &PgPool,
query: &Query,
unread: bool,
) -> Result<bool, ServerError> {
// TODO: make single query when query.uids.len() > 1
let uids: Vec<_> = query
.uids
.iter()
.filter(|uid| is_newsreader_thread(uid))
        .map(|uid| extract_thread_id(uid))
.collect();
for uid in uids {
sqlx::query_file!("sql/set_unread.sql", !unread, uid)
.execute(pool)
.await?;
}
Ok(true)
}
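
/// Backfills `post.search_summary` for posts that are missing one,
/// slurping and stripping each post body concurrently.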
#[instrument(name = "newsreader::refresh", skip_all)]
pub async fn refresh(pool: &PgPool, cacher: &FilesystemCacher) -> Result<bool, ServerError> {
async fn update_search_summary(
pool: &PgPool,
cacher: &FilesystemCacher,
link: String,
body: String,
id: i32,
) -> Result<(), ServerError> {
let slurp_contents = SlurpContents {
cacher,
inline_css: true,
site_selectors: slurp_contents_selectors(),
};
let strip_html = StripHtml;
info!("adding {link} to search index");
let mut body = body;
if let Ok(link) = Url::parse(&link) {
let link = Some(link);
if slurp_contents.should_run(&link, &body) {
body = slurp_contents.transform(&link, &body).await?;
}
} else {
error!("failed to parse link: {}", link);
}
body = strip_html.transform(&None, &body).await?;
sqlx::query!(
"UPDATE post SET search_summary = $1 WHERE id = $2",
body,
id
)
.execute(pool)
.await?;
Ok(())
}
let mut unordered: FuturesUnordered<_> = sqlx::query_file!("sql/need-search-summary.sql",)
.fetch_all(pool)
.await?
.into_iter()
.filter_map(|r| {
let Some(body) = r.clean_summary else {
error!("clean_summary missing for {}", r.link);
return None;
};
let id = r.id;
Some(update_search_summary(pool, cacher, r.link, body, id))
})
.collect();
    while let Some(res) = unordered.next().await {
        if let Err(err) = res {
            info!("failed refresh {err:?}");
            // TODO:
            //fd.error = Some(err);
        }
    }
Ok(true)
}
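
/// Per-site CSS selectors used by `SlurpContents` to pick the article
/// content out of each site's markup.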
fn slurp_contents_selectors() -> HashMap<String, Vec<Selector>> {
hashmap![
"atmeta.com".to_string() => vec![
Selector::parse("div.entry-content").unwrap(),
],
"blog.prusa3d.com".to_string() => vec![
Selector::parse("article.content .post-block").unwrap(),
],
"blog.cloudflare.com".to_string() => vec![
Selector::parse(".author-lists .author-name-tooltip").unwrap(),
Selector::parse(".post-full-content").unwrap()
],
"blog.zsa.io".to_string() => vec![
Selector::parse("section.blog-article").unwrap(),
],
"engineering.fb.com".to_string() => vec![
Selector::parse("article").unwrap(),
],
"grafana.com".to_string() => vec![
Selector::parse(".blog-content").unwrap(),
],
"hackaday.com".to_string() => vec![
Selector::parse("div.entry-featured-image").unwrap(),
Selector::parse("div.entry-content").unwrap()
],
"ingowald.blog".to_string() => vec![
Selector::parse("article").unwrap(),
],
"jvns.ca".to_string() => vec![
Selector::parse("article").unwrap(),
],
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
"natwelch.com".to_string() => vec![
Selector::parse("article div.prose").unwrap(),
],
"seiya.me".to_string() => vec![
Selector::parse("header + div").unwrap(),
],
"rustacean-station.org".to_string() => vec![
Selector::parse("article").unwrap(),
],
"slashdot.org".to_string() => vec![
Selector::parse("span.story-byline").unwrap(),
Selector::parse("div.p").unwrap(),
],
"theonion.com".to_string() => vec![
// Single image joke w/ title
Selector::parse("article > section > div > figure").unwrap(),
// Single cartoon
Selector::parse("article > div > div > figure").unwrap(),
// Image at top of article
Selector::parse("article > header > div > div > figure").unwrap(),
// Article body
Selector::parse("article .entry-content > *").unwrap(),
],
"trofi.github.io".to_string() => vec![
Selector::parse("#content").unwrap(),
],
"www.redox-os.org".to_string() => vec![
Selector::parse("div.content").unwrap(),
],
"www.smbc-comics.com".to_string() => vec![
Selector::parse("img#cc-comic").unwrap(),
Selector::parse("div#aftercomic img").unwrap(),
],
]
}
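
// A minimal test sketch for the pure helpers above. It only assumes that
// NEWSREADER_THREAD_PREFIX and NEWSREADER_TAG_PREFIX are ordinary string
// prefixes; the ids and sites below are made up for illustration.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn thread_id_round_trip() {
        let uid = format!("{NEWSREADER_THREAD_PREFIX}abc123");
        assert!(is_newsreader_thread(&uid));
        assert_eq!(extract_thread_id(&uid), "abc123");
        // Ids without the prefix pass through unchanged.
        assert_eq!(extract_thread_id("abc123"), "abc123");
    }

    #[test]
    fn site_tags() {
        let tag = format!("{NEWSREADER_TAG_PREFIX}example.com");
        assert_eq!(extract_site(&tag), "example.com");
        assert_eq!(site_from_tags(&[tag]), Some("example.com".to_string()));
        assert_eq!(site_from_tags(&[]), None);
        assert!(make_news_tag("example.com").starts_with("tag:"));
    }
}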