Skip to content

Commit c452113

Browse files
committed
Fix: Ensure that two consecutive jobs are spaced exactly by the specified interval.
1 parent a233590 commit c452113

File tree

1 file changed

+20
-2
lines changed
  • chloria-backend/chloria-job/src/infrastructure/news_fetcher

1 file changed

+20
-2
lines changed

chloria-backend/chloria-job/src/infrastructure/news_fetcher/yahoo.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use std::{sync::Arc, usize::MAX};
22

33
use anyhow::{bail, Error, Result};
44
use async_trait::async_trait;
5-
use chrono::{DateTime, Local};
5+
use chrono::{offset::LocalResult, DateTime, Datelike, Local, TimeZone, Timelike};
66
use html2text::render::RichDecorator;
77
use log::{error, info};
88
use regex::Regex;
@@ -759,12 +759,30 @@ impl YahooClient {
759759
let link_regex = Regex::new(r"https://news.yahoo.co.jp/articles/(?<id>[^/?]+)")?;
760760
let content_selector = Selector::parse("article#uamods .article_body").unwrap();
761761
let now = Local::now();
762+
// We prevent fetching news multiple times by checking if the published time is older than `fetch_time`
763+
// adjusted by `self.interval` (see the `for` loop below for details).
764+
// However, in some cases, two consecutive jobs may run less than `self.interval` apart,
765+
// resulting in news being fetched multiple times.
766+
// For example:
767+
// - The news was published today at 00:10
768+
// - The interval is set to 12 hours:
769+
// - The first job starts at 00:00 and fetches the news at 00:12
770+
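// - The second job starts at 12:00 and fetches the news at 12:08 (only 11h58m after publication, which truncates to 11 hours and is still below the interval, so the same news is fetched again)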
771+
// To avoid this, we round `fetch_time` down to the start of the current hour (minutes and seconds set to zero),
772+
// ensuring that the `fetch_time` values of two consecutive jobs are exactly `self.interval` apart (as long as each job starts within its scheduled hour).
773+
let fetch_time = match now
774+
.timezone()
775+
.with_ymd_and_hms(now.year(), now.month(), now.day(), now.hour(), 0, 0)
776+
{
777+
LocalResult::Single(rounded_now) => rounded_now,
778+
_ => now, // Fallback in case of an error when rounding down
779+
};
762780
for item in response.channel.items.into_iter() {
763781
let published_time = match item.pub_date {
764782
Some(pub_date) => match DateTime::parse_from_rfc2822(&pub_date) {
765783
Ok(published_time) => {
766784
let published_time: DateTime<Local> = published_time.into();
767-
if (now - published_time).num_hours() >= self.interval {
785+
if fetch_time < published_time || (fetch_time - published_time).num_hours() >= self.interval {
768786
continue;
769787
}
770788
Some(published_time)

0 commit comments

Comments
 (0)