server: use fetched contents of news for search index

This commit is contained in:
2025-01-29 14:08:20 -08:00
parent c7aa32b922
commit 12c8e0e33b
12 changed files with 168 additions and 87 deletions

View File

@@ -1,4 +1,4 @@
use std::sync::Arc;
use std::collections::HashMap;
use cacher::FilesystemCacher;
use log::info;
@@ -6,17 +6,15 @@ use maplit::hashmap;
use scraper::Selector;
use shared::compute_color;
use sqlx::postgres::PgPool;
use tokio::sync::Mutex;
use tracing::instrument;
use url::Url;
use crate::{
clean_title, compute_offset_limit,
config::Config,
error::ServerError,
graphql::{Corpus, NewsPost, Tag, Thread, ThreadSummary},
thread_summary_from_row, AddOutlink, FrameImages, Query, SanitizeHtml, SlurpContents,
ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX, NEWSREADER_THREAD_PREFIX,
StripHtml, ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX, NEWSREADER_THREAD_PREFIX,
};
pub fn is_newsreader_query(query: &Query) -> bool {
@@ -173,7 +171,7 @@ pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, Server
#[instrument(name = "newsreader::thread", skip_all, fields(thread_id=%thread_id))]
pub async fn thread(
config: &Config,
cacher: &FilesystemCacher,
pool: &PgPool,
thread_id: String,
) -> Result<Thread, ServerError> {
@@ -191,73 +189,11 @@ pub async fn thread(
// TODO: remove the various places that have this as an Option
let link = Some(Url::parse(&r.link)?);
let mut body = r.clean_summary.unwrap_or("NO SUMMARY".to_string());
let cacher = Arc::new(Mutex::new(FilesystemCacher::new(&config.slurp_cache_path)?));
let body_tranformers: Vec<Box<dyn Transformer>> = vec![
let body_transformers: Vec<Box<dyn Transformer>> = vec![
Box::new(SlurpContents {
cacher,
inline_css: true,
site_selectors: hashmap![
"atmeta.com".to_string() => vec![
Selector::parse("div.entry-content").unwrap(),
],
"blog.prusa3d.com".to_string() => vec![
Selector::parse("article.content .post-block").unwrap(),
],
"blog.cloudflare.com".to_string() => vec![
Selector::parse(".author-lists .author-name-tooltip").unwrap(),
Selector::parse(".post-full-content").unwrap()
],
"blog.zsa.io".to_string() => vec![
Selector::parse("section.blog-article").unwrap(),
],
"engineering.fb.com".to_string() => vec![
Selector::parse("article").unwrap(),
],
"grafana.com".to_string() => vec![
Selector::parse(".blog-content").unwrap(),
],
"hackaday.com".to_string() => vec![
Selector::parse("div.entry-featured-image").unwrap(),
Selector::parse("div.entry-content").unwrap()
],
"ingowald.blog".to_string() => vec![
Selector::parse("article").unwrap(),
],
"jvns.ca".to_string() => vec![
Selector::parse("article").unwrap(),
],
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
"natwelch.com".to_string() => vec![
Selector::parse("article div.prose").unwrap(),
],
"rustacean-station.org".to_string() => vec![
Selector::parse("article").unwrap(),
],
"slashdot.org".to_string() => vec![
Selector::parse("span.story-byline").unwrap(),
Selector::parse("div.p").unwrap(),
],
"theonion.com".to_string() => vec![
// Single image joke w/ title
Selector::parse("article > section > div > figure").unwrap(),
// Single cartoon
Selector::parse("article > div > div > figure").unwrap(),
// Image at top of article
Selector::parse("article > header > div > div > figure").unwrap(),
// Article body
Selector::parse("article .entry-content > *").unwrap(),
],
"trofi.github.io".to_string() => vec![
Selector::parse("#content").unwrap(),
],
"www.redox-os.org".to_string() => vec![
Selector::parse("div.content").unwrap(),
],
"www.smbc-comics.com".to_string() => vec![
Selector::parse("img#cc-comic").unwrap(),
Selector::parse("div#aftercomic img").unwrap(),
],
],
site_selectors: slurp_contents_selectors(),
}),
Box::new(FrameImages),
Box::new(AddOutlink),
@@ -268,7 +204,7 @@ pub async fn thread(
base_url: &link,
}),
];
for t in body_tranformers.iter() {
for t in body_transformers.iter() {
if t.should_run(&link, &body) {
body = t.transform(&link, &body).await?;
}
@@ -316,3 +252,102 @@ pub async fn set_read_status<'ctx>(
}
Ok(true)
}
#[instrument(name = "newsreader::refresh", skip_all)]
pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<bool, ServerError> {
let body_transformers: Vec<Box<dyn Transformer>> = vec![
Box::new(SlurpContents {
cacher,
inline_css: true,
site_selectors: slurp_contents_selectors(),
}),
Box::new(StripHtml),
];
let rows = sqlx::query_file!("sql/need-search-summary.sql",)
.fetch_all(pool)
.await?;
for r in rows {
let link = Url::parse(&r.link)?;
info!("adding {link} to search index");
let link = Some(link);
let mut body = r.clean_summary.unwrap_or("NO SUMMARY".to_string());
for t in body_transformers.iter() {
if t.should_run(&link, &body) {
body = t.transform(&link, &body).await?;
}
}
sqlx::query!(
"UPDATE post SET search_summary = $1 WHERE id = $2",
body,
r.id
)
.execute(pool)
.await?;
}
Ok(true)
}
fn slurp_contents_selectors() -> HashMap<String, Vec<Selector>> {
hashmap![
"atmeta.com".to_string() => vec![
Selector::parse("div.entry-content").unwrap(),
],
"blog.prusa3d.com".to_string() => vec![
Selector::parse("article.content .post-block").unwrap(),
],
"blog.cloudflare.com".to_string() => vec![
Selector::parse(".author-lists .author-name-tooltip").unwrap(),
Selector::parse(".post-full-content").unwrap()
],
"blog.zsa.io".to_string() => vec![
Selector::parse("section.blog-article").unwrap(),
],
"engineering.fb.com".to_string() => vec![
Selector::parse("article").unwrap(),
],
"grafana.com".to_string() => vec![
Selector::parse(".blog-content").unwrap(),
],
"hackaday.com".to_string() => vec![
Selector::parse("div.entry-featured-image").unwrap(),
Selector::parse("div.entry-content").unwrap()
],
"ingowald.blog".to_string() => vec![
Selector::parse("article").unwrap(),
],
"jvns.ca".to_string() => vec![
Selector::parse("article").unwrap(),
],
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
"natwelch.com".to_string() => vec![
Selector::parse("article div.prose").unwrap(),
],
"rustacean-station.org".to_string() => vec![
Selector::parse("article").unwrap(),
],
"slashdot.org".to_string() => vec![
Selector::parse("span.story-byline").unwrap(),
Selector::parse("div.p").unwrap(),
],
"theonion.com".to_string() => vec![
// Single image joke w/ title
Selector::parse("article > section > div > figure").unwrap(),
// Single cartoon
Selector::parse("article > div > div > figure").unwrap(),
// Image at top of article
Selector::parse("article > header > div > div > figure").unwrap(),
// Article body
Selector::parse("article .entry-content > *").unwrap(),
],
"trofi.github.io".to_string() => vec![
Selector::parse("#content").unwrap(),
],
"www.redox-os.org".to_string() => vec![
Selector::parse("div.content").unwrap(),
],
"www.smbc-comics.com".to_string() => vec![
Selector::parse("img#cc-comic").unwrap(),
Selector::parse("div#aftercomic img").unwrap(),
],
]
}