Skip to content

Commit

Permalink
Sitemap (#3808)
Browse files Browse the repository at this point in the history
* generate sitemap.xml file

* set up endpoint for sitemap

* Update sitemap generation

- remove sitemap generation from scheduled tasks
- add posts query for sitemap
- create sitemap module in API crate

* remove `priority` and `changefreq` from sitemap

* add configuration option for number of posts for sitemap

* fix default config

* rate limit sitemap endpoint

* update sitemap query

* update sitemap generation

- remove config value for query limit
- adjust sitemap generation to query changes
- tidy up error handling

* refactor sitemap generation loop

* remove `limit` argument

* refactor `generate_urlset` and add unit test

* change query to only fetch local posts of past 24h

* fix outdated comment and log

* cargo fmt
  • Loading branch information
Trombach authored Aug 22, 2023
1 parent ab828b8 commit 28324ad
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 0 deletions.
28 changes: 28 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions crates/api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,11 @@ captcha = { workspace = true }
anyhow = { workspace = true }
tracing = { workspace = true }
chrono = { workspace = true }
url = { workspace = true }
wav = "1.0.0"
sitemap-rs = "0.2.0"

[dev-dependencies]
serial_test = { workspace = true }
tokio = { workspace = true }
elementtree = "1.2.3"
1 change: 1 addition & 0 deletions crates/api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub mod post_report;
pub mod private_message;
pub mod private_message_report;
pub mod site;
pub mod sitemap;

#[async_trait::async_trait(?Send)]
pub trait Perform {
Expand Down
142 changes: 142 additions & 0 deletions crates/api/src/sitemap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
use actix_web::{
http::header::{self, CacheDirective},
web::Data,
HttpResponse,
};
use chrono::{DateTime, FixedOffset};
use lemmy_api_common::context::LemmyContext;
use lemmy_db_schema::{newtypes::DbUrl, source::post::Post};
use lemmy_utils::error::LemmyResult;
use sitemap_rs::{url::Url, url_set::UrlSet};
use tracing::info;

async fn generate_urlset(posts: Vec<(DbUrl, chrono::NaiveDateTime)>) -> LemmyResult<UrlSet> {
let urls = posts
.into_iter()
.map_while(|post| {
Url::builder(post.0.to_string())
.last_modified(DateTime::from_utc(
post.1,
FixedOffset::east_opt(0).expect("Error setting timezone offset"), // TODO what is the proper timezone offset here?
))
.build()
.ok()
})
.collect();

Ok(UrlSet::new(urls)?)
}

/// HTTP handler that renders the sitemap as an XML document.
///
/// Loads the posts returned by `Post::list_for_sitemap`, converts them with
/// `generate_urlset`, serialises the result into an in-memory buffer and returns it with
/// an `application/xml` content type and a `Cache-Control: max-age=86400` header.
///
/// # Errors
///
/// Propagates failures from the post query, from building the URL set and from writing
/// the XML.
pub async fn get_sitemap(context: Data<LemmyContext>) -> LemmyResult<HttpResponse> {
  info!("Generating sitemap with posts from last {} hours...", 24);
  let sitemap_posts = Post::list_for_sitemap(&mut context.pool()).await?;
  info!("Loaded latest {} posts", sitemap_posts.len());

  // Serialise the whole URL set into memory before building the response.
  let url_set = generate_urlset(sitemap_posts).await?;
  let mut xml: Vec<u8> = Vec::new();
  url_set.write(&mut xml)?;

  // Let clients cache the sitemap for 24 h (86_400 seconds).
  let cache_control = header::CacheControl(vec![CacheDirective::MaxAge(86_400)]);
  let response = HttpResponse::Ok()
    .content_type("application/xml")
    .insert_header(cache_control)
    .body(xml);

  Ok(response)
}

#[cfg(test)]
pub(crate) mod tests {
  #![allow(clippy::unwrap_used)]

  use crate::sitemap::generate_urlset;
  use chrono::{NaiveDate, NaiveDateTime};
  use elementtree::Element;
  use lemmy_db_schema::newtypes::DbUrl;
  use url::Url;

  /// Builds a timestamp from `(year, month, day)` and `(hour, minute, second)`,
  /// panicking if either part is not a valid date or time.
  fn timestamp(ymd: (i32, u32, u32), hms: (u32, u32, u32)) -> NaiveDateTime {
    NaiveDate::from_ymd_opt(ymd.0, ymd.1, ymd.2)
      .unwrap()
      .and_hms_opt(hms.0, hms.1, hms.2)
      .unwrap()
  }

  /// Returns the text of the first child named `tag` inside the `index`-th child of `root`.
  fn entry_text<'a>(root: &'a Element, index: usize, tag: &str) -> &'a str {
    root
      .children()
      .nth(index)
      .unwrap()
      .children()
      .find(|element| element.tag().name() == tag)
      .unwrap()
      .text()
  }

  /// Generates a URL set for two posts, parses the written XML back and checks both the
  /// document structure and the exact `loc` / `lastmod` values of each entry.
  #[tokio::test]
  async fn test_generate_urlset() {
    let posts: Vec<(DbUrl, NaiveDateTime)> = vec![
      (
        Url::parse("https://example.com").unwrap().into(),
        timestamp((2022, 12, 1), (9, 10, 11)),
      ),
      (
        Url::parse("https://lemmy.ml").unwrap().into(),
        timestamp((2023, 1, 1), (1, 2, 3)),
      ),
    ];

    let url_set = generate_urlset(posts).await.unwrap();
    let mut xml = Vec::<u8>::new();
    url_set.write(&mut xml).unwrap();
    let root = Element::from_reader(xml.as_slice()).unwrap();

    // Document shape: <urlset> holding exactly two <url> entries.
    assert_eq!(root.tag().name(), "urlset");
    assert_eq!(root.child_count(), 2);

    // Every entry is a <url> with exactly <loc> followed by <lastmod>.
    for url in root.children() {
      assert_eq!(url.tag().name(), "url");
      assert_eq!(url.child_count(), 2);

      let mut fields = url.children();
      assert!(fields
        .next()
        .is_some_and(|element| element.tag().name() == "loc"));
      assert!(fields
        .next()
        .is_some_and(|element| element.tag().name() == "lastmod"));
    }

    // Exact values, in input order.
    assert_eq!(entry_text(&root, 0, "loc"), "https://example.com/");
    assert_eq!(entry_text(&root, 0, "lastmod"), "2022-12-01T09:10:11+00:00");
    assert_eq!(entry_text(&root, 1, "loc"), "https://lemmy.ml/");
    assert_eq!(entry_text(&root, 1, "lastmod"), "2023-01-01T01:02:03+00:00");
  }
}
18 changes: 18 additions & 0 deletions crates/db_schema/src/impls/post.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use super::instance::coalesce;
use crate::{
newtypes::{CommunityId, DbUrl, PersonId, PostId},
schema::post::dsl::{
Expand All @@ -7,6 +8,7 @@ use crate::{
creator_id,
deleted,
featured_community,
local,
name,
post,
published,
Expand All @@ -30,6 +32,7 @@ use crate::{
utils::{get_conn, naive_now, DbPool, DELETED_REPLACEMENT_TEXT, FETCH_LIMIT_MAX},
};
use ::url::Url;
use chrono::{Duration, Utc};
use diesel::{dsl::insert_into, result::Error, ExpressionMethods, QueryDsl, TextExpressionMethods};
use diesel_async::RunQueryDsl;

Expand Down Expand Up @@ -96,6 +99,21 @@ impl Post {
.await
}

/// Lists the `(ap_id, timestamp)` pairs used to build the sitemap.
///
/// Selects posts that are local, neither deleted nor removed, and published within the
/// last day, ordered by publication time descending. The returned timestamp is
/// `coalesce(updated, published)`.
///
/// # Errors
///
/// Returns the diesel error if acquiring a connection or running the query fails.
pub async fn list_for_sitemap(
  pool: &mut DbPool<'_>,
) -> Result<Vec<(DbUrl, chrono::NaiveDateTime)>, Error> {
  let conn = &mut get_conn(pool).await?;

  // Only posts published at or after this instant are listed.
  let one_day_ago = Utc::now().naive_utc() - Duration::days(1);

  let sitemap_query = post
    .select((ap_id, coalesce(updated, published)))
    .filter(local)
    .filter(deleted.eq(false))
    .filter(removed.eq(false))
    .filter(published.ge(one_day_ago))
    .order(published.desc());

  sitemap_query
    .load::<(DbUrl, chrono::NaiveDateTime)>(conn)
    .await
}

pub async fn permadelete_for_creator(
pool: &mut DbPool<'_>,
for_creator_id: PersonId,
Expand Down
6 changes: 6 additions & 0 deletions src/api_routes_http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use lemmy_api::{
local_user::{ban_person::ban_from_site, notifications::mark_reply_read::mark_reply_as_read},
post::{feature::feature_post, like::like_post, lock::lock_post},
post_report::create::create_post_report,
sitemap::get_sitemap,
Perform,
};
use lemmy_api_common::{
Expand Down Expand Up @@ -340,6 +341,11 @@ pub fn config(cfg: &mut web::ServiceConfig, rate_limit: &RateLimitCell) {
.route("/delete", web::post().to(delete_custom_emoji)),
),
);
cfg.service(
web::scope("/sitemap.xml")
.wrap(rate_limit.message())
.route("", web::get().to(get_sitemap)),
);
}

async fn perform<'a, Data>(
Expand Down

0 comments on commit 28324ad

Please sign in to comment.