Compare commits


No commits in common. "53093f4cce0210efa501f227edbdc816e8683216" and "795029cb065ba7cf41a92714f9692fda39f5208a" have entirely different histories.

9 changed files with 39 additions and 82 deletions

Cargo.lock generated
View File

@@ -2910,7 +2910,7 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "letterbox"
version = "0.0.128"
version = "0.0.127"
dependencies = [
"build-info",
"build-info-build",
@@ -2936,7 +2936,7 @@ dependencies = [
[[package]]
name = "letterbox-server"
version = "0.0.128"
version = "0.0.127"
dependencies = [
"ammonia",
"anyhow",
@@ -3457,7 +3457,7 @@ dependencies = [
[[package]]
name = "notmuch"
version = "0.0.128"
version = "0.0.127"
dependencies = [
"itertools 0.10.5",
"log",
@@ -4252,7 +4252,7 @@ dependencies = [
[[package]]
name = "procmail2notmuch"
version = "0.0.128"
version = "0.0.127"
dependencies = [
"anyhow",
]
@@ -5331,7 +5331,7 @@ dependencies = [
[[package]]
name = "shared"
version = "0.0.128"
version = "0.0.127"
dependencies = [
"build-info",
"notmuch",

View File

@@ -1,6 +1,6 @@
[package]
name = "notmuch"
version = "0.0.128"
version = "0.0.127"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -1,6 +1,6 @@
[package]
name = "procmail2notmuch"
version = "0.0.128"
version = "0.0.127"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -1,6 +1,6 @@
{
"db_name": "PostgreSQL",
"query": "SELECT\n p.id,\n link,\n clean_summary\nFROM\n post AS p\nINNER JOIN feed AS f ON p.site = f.slug -- necessary to weed out nzb posts\nWHERE\n search_summary IS NULL\n -- TODO remove\n AND link ~ '^<'\nORDER BY date DESC\nLIMIT 100;\n",
"query": "SELECT\n p.id,\n link,\n clean_summary\nFROM\n post AS p\nINNER JOIN feed AS f ON p.site = f.slug -- necessary to weed out nzb posts\nWHERE search_summary IS NULL\nORDER BY date DESC\nLIMIT 100;\n",
"describe": {
"columns": [
{
@@ -28,5 +28,5 @@
true
]
},
"hash": "118e453e59594487fd01873c07d3dc9c8069187d1bcc6d30f6b4940fe694f0eb"
"hash": "e90019c1e96c20318b23ef5671cf04e48b42477178d068ced4dba7bb6d5896d5"
}

View File

@@ -1,6 +1,6 @@
[package]
name = "letterbox-server"
version = "0.0.128"
version = "0.0.127"
edition = "2021"
default-run = "letterbox-server"

View File

@@ -7,13 +7,7 @@ pub mod nm;
#[cfg(feature = "tantivy")]
pub mod tantivy;
-use std::{
-collections::{HashMap, HashSet},
-convert::Infallible,
-fmt,
-str::FromStr,
-sync::Arc,
-};
+use std::{collections::HashMap, convert::Infallible, fmt, str::FromStr, sync::Arc};
use async_trait::async_trait;
use cacher::{Cacher, FilesystemCacher};
@@ -26,8 +20,6 @@ use lol_html::{
};
use maplit::{hashmap, hashset};
use regex::Regex;
-use reqwest::StatusCode;
-use rocket::response::status;
use scraper::{Html, Selector};
use sqlx::types::time::PrimitiveDateTime;
use thiserror::Error;
@@ -66,8 +58,6 @@ pub enum TransformError {
ReqwestError(#[from] reqwest::Error),
#[error("failed to parse HTML: {0}")]
HtmlParsingError(String),
#[error("got a retryable error code {0} for {1}")]
RetryableHttpStatusError(StatusCode, String),
}
struct SanitizeHtml<'a> {
@@ -98,37 +88,18 @@ struct StripHtml;
#[async_trait]
impl Transformer for StripHtml {
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
debug!("StripHtml should_run {link:?} {}", html.contains("<"));
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
// Lame test
html.contains("<")
}
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
debug!("StripHtml {link:?}");
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
let mut text = String::new();
-let element_content_handlers = vec![
-element!("style", |el| {
-el.remove();
-Ok(())
-}),
-element!("script", |el| {
-el.remove();
-Ok(())
-}),
-];
-let html = rewrite_str(
-html,
-RewriteStrSettings {
-element_content_handlers,
-..RewriteStrSettings::default()
-},
-)?;
let element_content_handlers = vec![text!("*", |t| {
text += t.as_str();
Ok(())
})];
let _ = rewrite_str(
&html,
html,
RewriteStrSettings {
element_content_handlers,
..RewriteStrSettings::default()
@@ -301,7 +272,6 @@ impl<'c> SlurpContents<'c> {
#[async_trait]
impl<'c> Transformer for SlurpContents<'c> {
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
debug!("SlurpContents should_run {link:?}");
let mut will_slurp = false;
if let Some(link) = link {
will_slurp = self.get_selectors(link).is_some();
@@ -312,15 +282,6 @@ impl<'c> Transformer for SlurpContents<'c> {
will_slurp
}
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
debug!("SlurpContents {link:?}");
let retryable_status: HashSet<StatusCode> = vec![
StatusCode::UNAUTHORIZED,
StatusCode::FORBIDDEN,
StatusCode::REQUEST_TIMEOUT,
StatusCode::TOO_MANY_REQUESTS,
]
.into_iter()
.collect();
if let Some(test_link) = link {
// If SlurpContents is configured for inline CSS, but no
// configuration found for this site, use the local InlineStyle
@@ -340,18 +301,11 @@ impl<'c> Transformer for SlurpContents<'c> {
let body = if let Some(body) = cacher.get(link.as_str()) {
String::from_utf8_lossy(&body).to_string()
} else {
-let resp = reqwest::get(link.as_str()).await?;
-let status = resp.status();
-if status.is_server_error() || retryable_status.contains(&status) {
-return Err(TransformError::RetryableHttpStatusError(
-status,
-link.to_string(),
-));
-}
-if !status.is_success() {
-return Ok(html.to_string());
-}
-let body = resp.text().await?;
+let body = reqwest::get(link.as_str())
+.await?
+.error_for_status()?
+.text()
+.await?;
cacher.set(link.as_str(), body.as_bytes());
body
};

View File

@@ -258,28 +258,27 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<b
async fn update_search_summary(
pool: &PgPool,
cacher: &FilesystemCacher,
-link: String,
+link: Url,
body: String,
id: i32,
) -> Result<(), ServerError> {
-let slurp_contents = SlurpContents {
-cacher,
-inline_css: true,
-site_selectors: slurp_contents_selectors(),
-};
-let strip_html = StripHtml;
+let body_transformers: Vec<Box<dyn Transformer>> = vec![
+Box::new(SlurpContents {
+cacher,
+inline_css: true,
+site_selectors: slurp_contents_selectors(),
+}),
+Box::new(StripHtml),
+];
info!("adding {link} to search index");
let mut body = body;
-if let Ok(link) = Url::parse(&link) {
-let link = Some(link);
-if slurp_contents.should_run(&link, &body) {
-body = slurp_contents.transform(&link, &body).await?;
+let link = Some(link);
+for t in body_transformers.iter() {
+if t.should_run(&link, &body) {
+body = t.transform(&link, &body).await?;
}
-} else {
-error!("failed to parse link: {}", link);
}
-body = strip_html.transform(&None, &body).await?;
sqlx::query!(
"UPDATE post SET search_summary = $1 WHERE id = $2",
body,
@@ -295,12 +294,16 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<b
.await?
.into_iter()
.filter_map(|r| {
let Ok(link) = Url::parse(&r.link) else {
error!("failed to parse link: {}", r.link);
return None;
};
let Some(body) = r.clean_summary else {
error!("clean_summary missing for {}", r.link);
return None;
};
let id = r.id;
-Some(update_search_summary(pool, cacher, r.link, body, id))
+Some(update_search_summary(pool, cacher, link, body, id))
})
.collect();

View File

@@ -1,6 +1,6 @@
[package]
name = "shared"
version = "0.0.128"
version = "0.0.127"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -1,5 +1,5 @@
[package]
version = "0.0.128"
version = "0.0.127"
name = "letterbox"
repository = "https://github.com/seed-rs/seed-quickstart"
authors = ["Bill Thiede <git@xinu.tv>"]