From 1bea64da068c18993d4f4d852a26cdcb2d7f1ba2 Mon Sep 17 00:00:00 2001 From: Adnan Awan Date: Tue, 30 Jan 2024 12:53:12 +0500 Subject: [PATCH 1/2] [dateparser] Fix parsing very short weekday names - Update en.yaml file to support two letters days of the week - https://github.com/scrapinghub/dateparser/issues/1170 --- dateparser/data/date_translation_data/en.py | 19 +++++++++++------ .../date_translation_data/en.yaml | 21 +++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/dateparser/data/date_translation_data/en.py b/dateparser/data/date_translation_data/en.py index b70975e1b..74fd0c66f 100644 --- a/dateparser/data/date_translation_data/en.py +++ b/dateparser/data/date_translation_data/en.py @@ -51,32 +51,39 @@ ], "monday": [ "mon", - "monday" + "monday", + "mo" ], "tuesday": [ "tue", "tuesday", + "tu", "Tues" ], "wednesday": [ "wed", - "wednesday" + "wednesday", + "we" ], "thursday": [ "thu", - "thursday" + "thursday", + "th" ], "friday": [ "fri", - "friday" + "friday", + "fr" ], "saturday": [ "sat", - "saturday" + "saturday", + "sa" ], "sunday": [ "sun", - "sunday" + "sunday", + "su" ], "am": [ "am" diff --git a/dateparser_data/supplementary_language_data/date_translation_data/en.yaml b/dateparser_data/supplementary_language_data/date_translation_data/en.yaml index d103bc577..172961545 100644 --- a/dateparser_data/supplementary_language_data/date_translation_data/en.yaml +++ b/dateparser_data/supplementary_language_data/date_translation_data/en.yaml @@ -3,9 +3,30 @@ pertain: ["of"] sentence_splitter_group : 1 +# two letters days of week + +monday: + - mo + tuesday: + - tu - Tues +wednesday: + - we + +thursday: + - th + +friday: + - fr + +saturday: + - sa + +sunday: + - su + september: - sept From 45bd628b5f15c338742b22aa043a6d200dd4efa1 Mon Sep 17 00:00:00 2001 From: Adnan Awan Date: Tue, 30 Jan 2024 23:24:07 +0500 Subject: [PATCH 2/2] [dateparser] Fix parsing very short weekday names - Develop method to remove_multiple_occurrences 
of the day(s) of the week
- https://github.com/scrapinghub/dateparser/issues/1170
---
Review note: remove_multiple_occurrences now rebuilds the token list instead
of calling pop() while iterating over it. Hunk sizes and the diffstat were
updated to match; the post-image blob id on the index line still refers to the
pre-review revision. Leading whitespace of the hunk lines was restored by
hand -- confirm with `git apply --check` before applying.

 dateparser/languages/locale.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/dateparser/languages/locale.py b/dateparser/languages/locale.py
index bd0df2c66..7ef489974 100644
--- a/dateparser/languages/locale.py
+++ b/dateparser/languages/locale.py
@@ -109,6 +109,45 @@ def clean_dictionary(dictionary, threshold=2):
             del dictionary[del_key]
         return dictionary
 
+    @property
+    def weekdays(self):
+        """English weekday names in calendar order, Monday first."""
+        weekdays = [
+            "monday",
+            "tuesday",
+            "wednesday",
+            "thursday",
+            "friday",
+            "saturday",
+            "sunday",
+        ]
+        return weekdays
+
+    def remove_multiple_occurrences(self, date_str_tokens: list):
+        """
+        Keep only the first weekday token in ``date_str_tokens``.
+
+        The list is modified in place: the first token that names a day of
+        the week is kept, every later weekday token is removed, and all
+        other tokens keep their relative order.
+
+        :param date_str_tokens: translated tokens of the date string.
+        :type date_str_tokens: list
+        """
+        weekdays = self.weekdays
+        weekday_seen = False
+        kept_tokens = []
+        for token in date_str_tokens:
+            if token in weekdays:
+                if weekday_seen:
+                    continue
+                weekday_seen = True
+            kept_tokens.append(token)
+        # Rebuild through slice assignment instead of calling pop() inside the
+        # loop: shrinking a list while iterating over it skips the token that
+        # follows each removed one, and the caller relies on in-place mutation.
+        date_str_tokens[:] = kept_tokens
+
     def translate(self, date_string, keep_formatting=False, settings=None):
         """
         Translate the date string to its English equivalent.
@@ -145,6 +184,7 @@ def translate(self, date_string, keep_formatting=False, settings=None):
         if "in" in date_string_tokens:
             date_string_tokens = self._clear_future_words(date_string_tokens)
 
+        self.remove_multiple_occurrences(date_string_tokens)
         return self._join(
             list(filter(bool, date_string_tokens)),
             separator="" if keep_formatting else " ",