server: use fetched contents of news for search index
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
use std::sync::Arc;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use cacher::FilesystemCacher;
|
||||
use log::info;
|
||||
@@ -6,17 +6,15 @@ use maplit::hashmap;
|
||||
use scraper::Selector;
|
||||
use shared::compute_color;
|
||||
use sqlx::postgres::PgPool;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
|
||||
use crate::{
|
||||
clean_title, compute_offset_limit,
|
||||
config::Config,
|
||||
error::ServerError,
|
||||
graphql::{Corpus, NewsPost, Tag, Thread, ThreadSummary},
|
||||
thread_summary_from_row, AddOutlink, FrameImages, Query, SanitizeHtml, SlurpContents,
|
||||
ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX, NEWSREADER_THREAD_PREFIX,
|
||||
StripHtml, ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX, NEWSREADER_THREAD_PREFIX,
|
||||
};
|
||||
|
||||
pub fn is_newsreader_query(query: &Query) -> bool {
|
||||
@@ -173,7 +171,7 @@ pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, Server
|
||||
|
||||
#[instrument(name = "newsreader::thread", skip_all, fields(thread_id=%thread_id))]
|
||||
pub async fn thread(
|
||||
config: &Config,
|
||||
cacher: &FilesystemCacher,
|
||||
pool: &PgPool,
|
||||
thread_id: String,
|
||||
) -> Result<Thread, ServerError> {
|
||||
@@ -191,73 +189,11 @@ pub async fn thread(
|
||||
// TODO: remove the various places that have this as an Option
|
||||
let link = Some(Url::parse(&r.link)?);
|
||||
let mut body = r.clean_summary.unwrap_or("NO SUMMARY".to_string());
|
||||
let cacher = Arc::new(Mutex::new(FilesystemCacher::new(&config.slurp_cache_path)?));
|
||||
let body_tranformers: Vec<Box<dyn Transformer>> = vec![
|
||||
let body_transformers: Vec<Box<dyn Transformer>> = vec![
|
||||
Box::new(SlurpContents {
|
||||
cacher,
|
||||
inline_css: true,
|
||||
site_selectors: hashmap![
|
||||
"atmeta.com".to_string() => vec![
|
||||
Selector::parse("div.entry-content").unwrap(),
|
||||
],
|
||||
"blog.prusa3d.com".to_string() => vec![
|
||||
Selector::parse("article.content .post-block").unwrap(),
|
||||
],
|
||||
"blog.cloudflare.com".to_string() => vec![
|
||||
Selector::parse(".author-lists .author-name-tooltip").unwrap(),
|
||||
Selector::parse(".post-full-content").unwrap()
|
||||
],
|
||||
"blog.zsa.io".to_string() => vec![
|
||||
Selector::parse("section.blog-article").unwrap(),
|
||||
],
|
||||
"engineering.fb.com".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"grafana.com".to_string() => vec![
|
||||
Selector::parse(".blog-content").unwrap(),
|
||||
],
|
||||
"hackaday.com".to_string() => vec![
|
||||
Selector::parse("div.entry-featured-image").unwrap(),
|
||||
Selector::parse("div.entry-content").unwrap()
|
||||
],
|
||||
"ingowald.blog".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"jvns.ca".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
|
||||
"natwelch.com".to_string() => vec![
|
||||
Selector::parse("article div.prose").unwrap(),
|
||||
],
|
||||
"rustacean-station.org".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"slashdot.org".to_string() => vec![
|
||||
Selector::parse("span.story-byline").unwrap(),
|
||||
Selector::parse("div.p").unwrap(),
|
||||
],
|
||||
"theonion.com".to_string() => vec![
|
||||
// Single image joke w/ title
|
||||
Selector::parse("article > section > div > figure").unwrap(),
|
||||
// Single cartoon
|
||||
Selector::parse("article > div > div > figure").unwrap(),
|
||||
// Image at top of article
|
||||
Selector::parse("article > header > div > div > figure").unwrap(),
|
||||
// Article body
|
||||
Selector::parse("article .entry-content > *").unwrap(),
|
||||
],
|
||||
"trofi.github.io".to_string() => vec![
|
||||
Selector::parse("#content").unwrap(),
|
||||
],
|
||||
"www.redox-os.org".to_string() => vec![
|
||||
Selector::parse("div.content").unwrap(),
|
||||
],
|
||||
"www.smbc-comics.com".to_string() => vec![
|
||||
Selector::parse("img#cc-comic").unwrap(),
|
||||
Selector::parse("div#aftercomic img").unwrap(),
|
||||
],
|
||||
],
|
||||
site_selectors: slurp_contents_selectors(),
|
||||
}),
|
||||
Box::new(FrameImages),
|
||||
Box::new(AddOutlink),
|
||||
@@ -268,7 +204,7 @@ pub async fn thread(
|
||||
base_url: &link,
|
||||
}),
|
||||
];
|
||||
for t in body_tranformers.iter() {
|
||||
for t in body_transformers.iter() {
|
||||
if t.should_run(&link, &body) {
|
||||
body = t.transform(&link, &body).await?;
|
||||
}
|
||||
@@ -316,3 +252,102 @@ pub async fn set_read_status<'ctx>(
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
#[instrument(name = "newsreader::refresh", skip_all)]
|
||||
pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<bool, ServerError> {
|
||||
let body_transformers: Vec<Box<dyn Transformer>> = vec![
|
||||
Box::new(SlurpContents {
|
||||
cacher,
|
||||
inline_css: true,
|
||||
site_selectors: slurp_contents_selectors(),
|
||||
}),
|
||||
Box::new(StripHtml),
|
||||
];
|
||||
|
||||
let rows = sqlx::query_file!("sql/need-search-summary.sql",)
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
for r in rows {
|
||||
let link = Url::parse(&r.link)?;
|
||||
info!("adding {link} to search index");
|
||||
let link = Some(link);
|
||||
let mut body = r.clean_summary.unwrap_or("NO SUMMARY".to_string());
|
||||
for t in body_transformers.iter() {
|
||||
if t.should_run(&link, &body) {
|
||||
body = t.transform(&link, &body).await?;
|
||||
}
|
||||
}
|
||||
sqlx::query!(
|
||||
"UPDATE post SET search_summary = $1 WHERE id = $2",
|
||||
body,
|
||||
r.id
|
||||
)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn slurp_contents_selectors() -> HashMap<String, Vec<Selector>> {
|
||||
hashmap![
|
||||
"atmeta.com".to_string() => vec![
|
||||
Selector::parse("div.entry-content").unwrap(),
|
||||
],
|
||||
"blog.prusa3d.com".to_string() => vec![
|
||||
Selector::parse("article.content .post-block").unwrap(),
|
||||
],
|
||||
"blog.cloudflare.com".to_string() => vec![
|
||||
Selector::parse(".author-lists .author-name-tooltip").unwrap(),
|
||||
Selector::parse(".post-full-content").unwrap()
|
||||
],
|
||||
"blog.zsa.io".to_string() => vec![
|
||||
Selector::parse("section.blog-article").unwrap(),
|
||||
],
|
||||
"engineering.fb.com".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"grafana.com".to_string() => vec![
|
||||
Selector::parse(".blog-content").unwrap(),
|
||||
],
|
||||
"hackaday.com".to_string() => vec![
|
||||
Selector::parse("div.entry-featured-image").unwrap(),
|
||||
Selector::parse("div.entry-content").unwrap()
|
||||
],
|
||||
"ingowald.blog".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"jvns.ca".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"mitchellh.com".to_string() => vec![Selector::parse("div.w-full").unwrap()],
|
||||
"natwelch.com".to_string() => vec![
|
||||
Selector::parse("article div.prose").unwrap(),
|
||||
],
|
||||
"rustacean-station.org".to_string() => vec![
|
||||
Selector::parse("article").unwrap(),
|
||||
],
|
||||
"slashdot.org".to_string() => vec![
|
||||
Selector::parse("span.story-byline").unwrap(),
|
||||
Selector::parse("div.p").unwrap(),
|
||||
],
|
||||
"theonion.com".to_string() => vec![
|
||||
// Single image joke w/ title
|
||||
Selector::parse("article > section > div > figure").unwrap(),
|
||||
// Single cartoon
|
||||
Selector::parse("article > div > div > figure").unwrap(),
|
||||
// Image at top of article
|
||||
Selector::parse("article > header > div > div > figure").unwrap(),
|
||||
// Article body
|
||||
Selector::parse("article .entry-content > *").unwrap(),
|
||||
],
|
||||
"trofi.github.io".to_string() => vec![
|
||||
Selector::parse("#content").unwrap(),
|
||||
],
|
||||
"www.redox-os.org".to_string() => vec![
|
||||
Selector::parse("div.content").unwrap(),
|
||||
],
|
||||
"www.smbc-comics.com".to_string() => vec![
|
||||
Selector::parse("img#cc-comic").unwrap(),
|
||||
Selector::parse("div#aftercomic img").unwrap(),
|
||||
],
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user