@@ -2,7 +2,7 @@ use std::{sync::Arc, usize::MAX};
2
2
3
3
use anyhow:: { bail, Error , Result } ;
4
4
use async_trait:: async_trait;
5
- use chrono:: { DateTime , Local } ;
5
+ use chrono:: { offset :: LocalResult , DateTime , Datelike , Local , TimeZone , Timelike } ;
6
6
use html2text:: render:: RichDecorator ;
7
7
use log:: { error, info} ;
8
8
use regex:: Regex ;
@@ -759,12 +759,30 @@ impl YahooClient {
759
759
let link_regex = Regex :: new ( r"https://news.yahoo.co.jp/articles/(?<id>[^/?]+)" ) ?;
760
760
let content_selector = Selector :: parse ( "article#uamods .article_body" ) . unwrap ( ) ;
761
761
let now = Local :: now ( ) ;
762
+ // We prevent fetching news multiple times by checking if the published time is older than `fetch_time`
763
+ // adjusted by `self.interval` (see the `for` loop below for details).
764
+ // However, in some cases, two consecutive jobs may run less than `self.interval` apart,
765
+ // resulting in news being fetched multiple times.
766
+ // For example:
767
+ // - The news was published today at 00:10
768
+ // - The interval is set to 12 hours:
769
+ // - The first job starts at 00:00 and fetches the news at 00:12
770
+ // - The second job starts at 12:00 and fetches the news at 12:08
771
+ // To avoid this, we truncate `fetch_time` to the start of the current hour (minutes and seconds zeroed),
772
+ // so that the reference times which two consecutive jobs compare a published time against are exactly `self.interval` apart.
773
+ let fetch_time = match now
774
+ . timezone ( )
775
+ . with_ymd_and_hms ( now. year ( ) , now. month ( ) , now. day ( ) , now. hour ( ) , 0 , 0 )
776
+ {
777
+ LocalResult :: Single ( rounded_now) => rounded_now,
778
+ _ => now, // Fallback in case of an error when rounding down
779
+ } ;
762
780
for item in response. channel . items . into_iter ( ) {
763
781
let published_time = match item. pub_date {
764
782
Some ( pub_date) => match DateTime :: parse_from_rfc2822 ( & pub_date) {
765
783
Ok ( published_time) => {
766
784
let published_time: DateTime < Local > = published_time. into ( ) ;
767
- if ( now - published_time) . num_hours ( ) >= self . interval {
785
+ if fetch_time < published_time || ( fetch_time - published_time) . num_hours ( ) >= self . interval {
768
786
continue ;
769
787
}
770
788
Some ( published_time)
0 commit comments