from datetime import datetime
import logging
import re
from typing import List, Optional

from dateparser.conf import apply_settings
from dateparser.date import DateDataParser
from dateparser.languages.loader import LocaleDataLoader
from dateparser.search.search import DateSearchWithDetection


# Keys of the locale map exposed by LocaleDataLoader; language codes passed
# by callers that are not in this set are ignored by find_date.
LANGUAGES = set(LocaleDataLoader().get_locale_map())
_bad_date_re = re.compile(
    # whole dates we black-list (can still be parts of valid dates)
    '^(' + '|'.join([
        r'\d{1,3}',  # less than 4 digits
        r'#\d+',  # this is a sequence number
        # some common false positives
        # (https://github.com/scrapinghub/dateparser/issues/568)
        r'[-/.]+',  # bare separators parsed as current date
        r'\w\.?',  # one letter (with optional dot)
        'an',
    ]) + ')$')
_date_separator = re.compile(r'[ ,|\(\)@]')  # never part of the date
_drop_words = {'on', 'at', 'of', 'a'}  # cause annoying false positives
_date_search = DateSearchWithDetection()


@apply_settings
def find_date(
        text: str, *,
        languages: Optional[List[str]] = None,
        settings=None,
        max_join: int = 7,
        ) -> Optional[datetime]:
    """ Look for a date in the string, return the first date that is parsed.

    This is used instead of search_dates from dateparser, because it has more
    predictable performance and gets more dates correct, although it's still
    not perfect.

    Approach:

    - split the text into tokens using ``_date_separator`` and discard the
      tokens listed in ``_drop_words``
    - move over the tokens and try to parse several consecutive tokens joined
      with a single space, returning the first date found. At each position
      start with the longest n-gram, to parse the most complete date.

    :param text: string that may contain a date; ``None`` or an empty string
        yields ``None``.
    :param languages: language codes to try. Codes missing from ``LANGUAGES``
        are ignored; when none remain, the language is detected from ``text``.
        ``'en'`` is always appended as a fallback.
    :param settings: passed through to ``DateDataParser``. The function is
        wrapped in ``apply_settings``, which is expected to normalise this
        value before the body runs (see dateparser.conf to confirm).
    :param max_join: maximum number of tokens joined into one candidate.
    :return: the ``date_obj`` of the first candidate that parses, or ``None``
        when no candidate does.
    """
    tokens = [
        token for token in _date_separator.split(text or '')
        if token and token not in _drop_words
    ]
    if not tokens:
        # Nothing to parse: skip language detection and parser construction.
        return None

    known = [lang for lang in (languages or []) if lang in LANGUAGES]
    if not known:
        # No usable hint from the caller, fall back to detection on the text.
        detected = _date_search.detect_language(text=text, languages=known)
        if detected:
            known = [detected]
    if 'en' not in known:
        known.append('en')
    parser = DateDataParser(languages=known, settings=settings)

    total = len(tokens)
    for start in range(total):
        longest = min(max_join, total - start)
        # Longest window first, so the most complete date wins.
        for size in range(longest, 0, -1):
            candidate = ' '.join(tokens[start:start + size])
            if _bad_date_re.match(candidate):
                continue
            try:
                date_obj = parser.get_date_data(candidate)['date_obj']
            except Exception:
                # Best effort: a parser failure on one candidate must not
                # abort the search, but record which candidate caused it.
                logging.exception(
                    'find_date: failed to parse candidate %r', candidate)
                continue
            if date_obj is not None:
                return date_obj
    return None
['en'], 'Apr 25th 2018 at 7.57pm', '2018-04-25T19:57:00'), + ('Año: 2011', ['es'], '2011', '2011-01-01T00:00:00'), + ('Mis à jour : 28 août 2015', ['fr'], '28 août 2015', '2015-09-28T00:00:00'), + ('Published 12:05 p.m. UTC Apr 26, 2018', ['en'], 'Published 12:05 p.m. UTC Apr 26, 2018', '2018-04-26T12:05:00'), + # ('SUMMER 2014', ['en', 'sw'], ???), + ('Published: 02:09 BST, 26 April 2018', ['en'], '02:09 BST, 26 April 2018', '2018-04-26T02:09:00'), + ('Updated on 2017년 2월 13일', ['ko', 'en', 'pt'], 'Updated on 2017년 2월 13일', '2018-02-13T00:00:00'), + ('Year composed: 2017', ['en'], '2017', '2017-01-01T00:00:00'), + ('Создано: 06.02.2017 10:49', ['ru'], '06.02.2017 10:49', '2017-02-06T10:49:00'), + ('نشر بتاريخ: 01 أيلول/سبتمبر 2016', ['ar'], '01 أيلول/سبتمبر 2016', '2016-09-01T00:00:00'), + ('نشر بتاريخ: 16 أيار 2016', ['ar'], '16 أيار 2016', '2016-05-16T00:00:00'), + ('發佈日期:2015-12-21', ['zh-Hant', 'da'], '發佈日期:2015-12-21', '2015-12-21T00:00:00'), + ('April 26, 2018 02:00 AM Eastern Daylight Time', ['en'], + 'April 26, 2018 02:00 AM Eastern Daylight Time', '2018-04-26T02:00:00-04:00'), + ('1 month ago (25, Apr 2018 7:18:54 PM)', ['en'], '1 month ago (25, Apr 2018 7:18:54 PM)', '2018-04-25T19:18:54'), + ('Publication : 9 novembre 2017', ['en'], '9 novembre 2017', '2017-11-09T00:00:00'), + ('11/12', ['es'], '11/12', '2018-12-11T00:00:00'), + ('05 49 27 01 11', ['fr'], '05 49 27 01 11', None), + ('09 65 35 95 16', ['fr'], '09 65 35 95 16', None), + ('4 days ago / May 3, 2019 / 5:46 AM', ['en'], 'May 3, 2019 / 5:46 AM / 4 days ago', '2019-05-03T05:46:00'), +] +DATE_CASES_SUCCESS = [ + ('- 24/04/2018', ['en'], '- 24/04/2018', '2018-04-24T00:00:00'), + ('- Jun 17, 2018 10:58 pm', ['en'], '- Jun 17, 2018 10:58 pm', '2018-06-17T22:58:00'), + ('/ 30 June, 2017', ['en'], '/ 30 June, 2017', '2017-06-30T00:00:00'), + ('01 Aprile 2015', ['it'], '01 Aprile 2015', '2015-04-01T00:00:00'), + ('01 June 2016', ['en'], '01 June 2016', '2016-06-01T00:00:00'), + ('01 Июня, 2015', 
['ru'], '01 Июня, 2015', '2015-06-01T00:00:00'), + ('01/18/2018', ['en'], '01/18/2018', '2018-01-18T00:00:00'), + ('04.25.18 10:02 PM ET', ['en'], '04.25.18 10:02 PM ET', '2018-04-25T22:02:00-05:00'), + ('04.25.2018', ['en'], '04.25.2018', '2018-04-25T00:00:00'), + ('04/25/18 09:45 PM EDT', ['en'], '04/25/18 09:45 PM EDT', '2018-04-25T21:45:00-04:00'), + ('05 Junio 2017', ['es'], '05 Junio 2017', '2017-06-05T00:00:00'), + ('05 июля 2016', ['ru'], '05 июля 2016', '2016-07-05T00:00:00'), + ('06 lipiec 2015', ['pl', 'en'], '06 lipiec 2015', '2015-07-06T00:00:00'), + ('07 Jun, 2018', ['en'], '07 Jun, 2018', '2018-06-07T00:00:00'), + ('07 Maggio 2014', ['it'], '07 Maggio 2014', '2014-05-07T00:00:00'), + ('07 Mayo 2017', ['es', 'en'], '07 Mayo 2017', '2017-05-07T00:00:00'), + ('07 November 2017', ['en'], '07 November 2017', '2017-11-07T00:00:00'), + ('07 de noviembre de 2017', ['es'], '07 de noviembre de 2017', '2017-11-07T00:00:00'), + ('1 abril 2013 13:03:00', ['es'], '1 abril 2013 13:03:00', '2013-04-01T13:03:00'), + ('1 año hace', ['es'], '1 año hace', '2017-06-28T00:00:00'), + ('1 mayo, 2017', ['es'], '1 mayo, 2017', '2017-05-01T00:00:00'), + ('1 oktober 2015', [], '1 oktober 2015', '2015-10-01T00:00:00'), + ('1 year ago', ['ar'], '1 year ago', '2017-06-28T00:00:00'), + ('1-15', ['en'], '1-15', '2018-01-15T00:00:00'), + ('1. Mai 2005', ['de'], '1. Mai 2005', '2005-05-01T00:00:00'), + ('10 Styczeń 2017', ['pl'], '10 Styczeń 2017', '2017-01-10T00:00:00'), + ('10. Juli 2010', ['de'], '10. Juli 2010', '2010-07-10T00:00:00'), + ('10. marec, 2017', ['sl'], '10. marec, 2017', '2017-03-10T00:00:00'), + ('10/05/2018', ['en'], '10/05/2018', '2018-10-05T00:00:00'), + ('11 lutego 2018', ['pl'], '11 lutego 2018', '2018-02-11T00:00:00'), + ('11 marzo, 2017', ['es'], '11 marzo, 2017', '2017-03-11T00:00:00'), + ('11 noviembre, 2017', ['es'], '11 noviembre, 2017', '2017-11-11T00:00:00'), + ('11 septembre 2017', ['fr'], '11 septembre 2017', '2017-09-11T00:00:00'), + ('11. 
März 2017', ['en', 'de'], '11. März 2017', '2017-03-11T00:00:00'), + ('11. srpen 2016', ['cs'], '11. srpen 2016', '2016-08-11T00:00:00'), + ('12 Julio, 2017', ['es'], '12 Julio, 2017', '2017-07-12T00:00:00'), + ('12 luglio 2017', ['it'], '12 luglio 2017', '2017-07-12T00:00:00'), + ('12 maja 2017', ['pl'], '12 maja 2017', '2017-05-12T00:00:00'), + ('12 mois', ['fr'], '12 mois', '2017-06-28T00:00:00'), # ago? + ('12. Juni 2014', ['de'], '12. Juni 2014', '2014-06-12T00:00:00'), + ('12. Mai 2011', ['de'], '12. Mai 2011', '2011-05-12T00:00:00'), + ('12th September 2017', ['en'], '12th September 2017', '2017-09-12T00:00:00'), + ('13 - Sep - 2017', ['en'], '13 - Sep - 2017', '2017-09-13T00:00:00'), + ('13 Mar 2014', ['en'], '13 Mar 2014', '2014-03-13T00:00:00'), + ('13 enero, 2011', ['es'], '13 enero, 2011', '2011-01-13T00:00:00'), + ('13 novembre 2017', ['fr'], '13 novembre 2017', '2017-11-13T00:00:00'), + ('13 noviembre, 2017', ['es'], '13 noviembre, 2017', '2017-11-13T00:00:00'), + ('13 septembre', ['fr'], '13 septembre', '2018-09-13T00:00:00'), + ('13 мая 2017', ['ru', 'sr'], '13 мая 2017', '2017-05-13T00:00:00'), + ('13F Report Date: 9/30/2016', ['en'], '13F Report Date: 9/30/2016', '2016-09-30T00:00:00'), + ('14 Novembre 2015', ['it', 'en'], '14 Novembre 2015', '2015-11-14T00:00:00'), + ('15 марта 2017, в 08:30', ['ru'], '15 марта 2017, в 08:30', '2017-03-15T08:30:00'), + ('16 gennaio 2014', ['it'], '16 gennaio 2014', '2014-01-16T00:00:00'), + ('16 juny 2015', ['ca'], '16 juny 2015', '2015-06-16T00:00:00'), + ('18 ianuarie 2013', ['ro', 'en'], '18 ianuarie 2013', '2013-01-18T00:00:00'), + ('18. říjen 2017', ['cs'], '18. říjen 2017', '2017-10-18T00:00:00'), + ('1:55 AM 04/26/2018', ['en'], '1:55 AM 04/26/2018', '2018-04-26T01:55:00'), + ('2 months ago', ['en'], '2 months ago', '2018-04-28T00:00:00'), + ('20 Apr 2018 at 22:06', ['en'], '20 Apr 2018 at 22:06', '2018-04-20T22:06:00'), + ('2009. július 13. hétfő', ['hu', 'en'], '2009. július 13. 
hétfő', '2009-07-13T00:00:00'), + ('2014. július 31. 10:00', ['hu'], '2014. július 31. 10:00', '2014-07-31T10:00:00'), + ('2015. november 25. (szerda) 22:53', ['hu', 'en'], '2015. november 25. (szerda) 22:53', '2015-11-25T22:53:00'), + ('2017-06-22 08:54:58', ['en'], '2017-06-22 08:54:58', '2017-06-22T08:54:58'), + ('2018 06 18', ['en'], '2018 06 18', '2018-06-18T00:00:00'), + ('2018-04-24T21:07:58Z', [], '2018-04-24T21:07:58Z', '2018-04-24T21:07:58+00:00'), + ('22 augustus 2016', ['nl'], '22 augustus 2016', '2016-08-22T00:00:00'), + ('230 days ago', ['en'], '230 days ago', '2017-11-10T00:00:00'), + ('24 abril 2018', ['es', 'en'], '24 abril 2018', '2018-04-24T00:00:00'), + ('24 mayo, 2011', ['es'], '24 mayo, 2011', '2011-05-24T00:00:00'), + ('25 Apr 2018', ['en'], '25 Apr 2018', '2018-04-25T00:00:00'), + ('25 April 2018', ['en'], '25 April 2018', '2018-04-25T00:00:00'), + ('25 Οκτ,2017', ['el'], '25 Οκτ,2017', '2017-10-25T00:00:00'), + ('25.04.2018 | 10:47', ['en'], '25.04.2018 | 10:47', '2018-04-25T10:47:00'), + ('26 Apr 2018 07:00 GMT', ['en'], '26 Apr 2018 07:00 GMT', '2018-04-26T07:00:00+00:00'), + ('26 april 2018 04:00', ['sv', 'no', 'en'], '26 april 2018 04:00', '2018-04-26T04:00:00'), + ('26.04.2018 - 09:33 Uhr', ['de', 'en'], '26.04.2018 - 09:33 Uhr', '2018-04-26T09:33:00'), + ('26.09.2017', ['en'], '26.09.2017', '2017-09-26T00:00:00'), + ('26/04 - 00:14', ['en'], '26/04 - 00:14', '2018-04-26T00:14:00'), + ('26/04/2018 01:40 | Actualizado a 26/04/2018 03:36', ['es'], '26/04/2018 01:40 | Actualizado a 26/04/2018 03:36', + '2018-04-26T01:40:00'), + ('26th April 2018', ['en'], '26th April 2018', '2018-04-26T00:00:00'), + ('29th April 2015', ['en'], '29th April 2015', '2015-04-29T00:00:00'), + ('2nd November 2017 12:26 pm', ['en'], '2nd November 2017 12:26 pm', '2017-11-02T12:26:00'), + ('30 September 2017, 11:37 pm', ['en'], '30 September 2017, 11:37 pm', '2017-09-30T23:37:00'), + ('3月 06, 2011', ['ja', 'en', 'da'], '3月 06, 2011', '2011-03-06T00:00:00'), + 
('4/21/17 11:41am', ['en'], '4/21/17 11:41am', '2017-04-21T11:41:00'), + ('4/25/2018 08:17:00 PM', ['en'], '4/25/2018 08:17:00 PM', '2018-04-25T20:17:00'), + ('7 tweets', ['en'], '7 tweets', None), + ('Apr 18, 2018', ['fr'], 'Apr 18, 2018', '2018-04-18T00:00:00'), + ('Apr 25', ['en'], 'Apr 25', '2018-04-25T00:00:00'), + ('Apr 25, 2018', ['en'], 'Apr 25, 2018', '2018-04-25T00:00:00'), + ('Apr 25, 2018 11:53 p.m. ET', ['en'], 'Apr 25, 2018 11:53 p.m. ET', '2018-04-25T23:53:00-05:00'), + ('Apr 26, 2018 00:00 IST', ['en'], 'Apr 26, 2018 00:00 IST', '2018-04-26T00:00:00+02:00'), + ('Apr 27, 2018 - 6:32 PM', ['en'], 'Apr 27, 2018 - 6:32 PM', '2018-04-27T18:32:00'), + ('Apr. 26, 2018', ['en'], 'Apr. 26, 2018', '2018-04-26T00:00:00'), + ('April 07, 2017', ['en'], 'April 07, 2017', '2017-04-07T00:00:00'), + ('April 24, 2018 05:29 PM', ['en'], 'April 24, 2018 05:29 PM', '2018-04-24T17:29:00'), + ('April 25 2018, 10:00pm,', ['en'], 'April 25 2018, 10:00pm,', '2018-04-25T22:00:00'), + ('April 25, 2018 9:56 PM EDT', ['en'], 'April 25, 2018 9:56 PM EDT', '2018-04-25T21:56:00-04:00'), + ('April 25, 2018 @10:11 PM', ['en'], 'April 25, 2018 @10:11 PM', '2018-04-25T22:11:00'), + ('April 25, 2018 at 11:31 pm', ['en'], 'April 25, 2018 at 11:31 pm', '2018-04-25T23:31:00'), + ('April 25, 2018 | 10:26pm', ['en'], 'April 25, 2018 | 10:26pm', '2018-04-25T22:26:00'), + ('April 25, 2018 | 4:27 PM', ['en'], 'April 25, 2018 | 4:27 PM', '2018-04-25T16:27:00'), + ('April 25, 2018, 6:08 PM', ['en'], 'April 25, 2018, 6:08 PM', '2018-04-25T18:08:00'), + ('April 26th, 2018 | Author: Gerry', ['en'], 'April 26th, 2018 | Author: Gerry', '2018-04-26T00:00:00'), + ('April 27, 2018', ['en'], 'April 27, 2018', '2018-04-27T00:00:00'), + ('August 22, 1939', ['en'], 'August 22, 1939', '1939-08-22T00:00:00'), + ('Breaking News April 25, 2018 11:14', ['en'], 'Breaking News April 25, 2018 11:14', '2018-04-25T11:14:00'), + ('By Francis Arinze Iloani | Publish Date: Apr 26 2018 4:00AM', ['en'], + 'Apr 26 2018 
4:00AM', '2018-04-26T04:00:00'), + ('Creado: Lunes, 04 Noviembre 2013 17:16', ['es'], 'Lunes, 04 Noviembre 2013 17:16', '2013-11-04T17:16:00'), + ('Created: 06 November 2017', ['en'], '06 November 2017', '2017-11-06T00:00:00'), + ('Danny Bird | Published 26 April 2017', ['en'], 'Danny Bird | Published 26 April 2017', '2017-04-26T00:00:00'), + ('Ditayangkan: 02 Desember 2010', ['id'], '02 Desember 2010', '2010-12-02T00:00:00'), + ('Euan Andrews , April 26th, 2018 06:40', ['en'], 'Euan Andrews , April 26th, 2018 06:40', '2018-04-26T06:40:00'), + ('Kreirano: 04 Veljača 2016', ['hr'], '04 Veljača 2016', '2016-02-04T00:00:00'), + ('Laatst bijgewerkt: 17 oktober 2016', ['nl'], '17 oktober 2016', '2016-10-17T00:00:00'), + ('Last Updated: 02 January 2018', ['en'], '02 January 2018', '2018-01-02T00:00:00'), + ('Last modified: 18 Oct 2017', ['en'], '18 Oct 2017', '2017-10-18T00:00:00'), + ('Last updated 26 Apr 2018, 12:12 pm', ['en'], 'Last updated 26 Apr 2018, 12:12 pm', '2018-04-26T12:12:00'), + ('Latest update : 2018-04-26', ['en'], '2018-04-26', '2018-04-26T00:00:00'), + ('Lundi 05 septembre 2016', ['fr'], 'Lundi 05 septembre 2016', '2016-09-05T00:00:00'), + ('Megjelent: 2017. május 12.', ['hu'], '2017. 
május 12.', '2017-05-12T00:00:00'), + ('Monday, April 17, 2017', ['en'], 'Monday, April 17, 2017', '2017-04-17T00:00:00'), + ('Neil Macdonald , April 15th, 2013 05:04', ['en'], 'Neil Macdonald , April 15th, 2013 05:04', '2013-04-15T05:04:00'), + ('Objavljeno: 13 Srpanj 2017', ['hr'], '13 Srpanj 2017', '2017-07-13T00:00:00'), + ('On April 25, 2018', ['en'], 'On April 25, 2018', '2018-04-25T00:00:00'), + ('Opublikowano: 02 marzec 2017', ['pl'], '02 marzec 2017', '2017-03-02T00:00:00'), + ('POSTED April 26, 2018', ['en'], 'POSTED April 26, 2018', '2018-04-26T00:00:00'), + ('Paskelbta: 2017 spalio 09', ['lt'], '2017 spalio 09', '2017-10-09T00:00:00'), + ('Posted 2018-04-16', ['en'], 'Posted 2018-04-16', '2018-04-16T00:00:00'), + ('Posted: 04/25/2018 06:44:37 PM MDT', ['en'], '04/25/2018 06:44:37 PM MDT', '2018-04-25T18:44:37-06:00'), + ('Pubblicato: 30 Ottobre 2017', ['it'], '30 Ottobre 2017', '2017-10-30T00:00:00'), + ('Publicado: 05 Noviembre 2015', ['es'], '05 Noviembre 2015', '2015-11-05T00:00:00'), + ('Publicat: 06 Decembrie 2016', ['ro'], '06 Decembrie 2016', '2016-12-06T00:00:00'), + ('Publicerad 16 mars 2015', ['sv'], 'Publicerad 16 mars 2015', '2015-03-16T00:00:00'), + ('Published 29 August, 2015', [], 'Published 29 August, 2015', '2015-08-29T00:00:00'), + ('Published April 26, 2018 by Lisa Campbell', ['en'], + 'Published April 26, 2018 by Lisa Campbell', '2018-04-26T00:00:00'), + ('Sabato 1 Aprile 2017', ['it'], 'Sabato 1 Aprile 2017', '2017-04-01T00:00:00'), + ('Scris pe 16 februarie 2017 17 Comments', ['ro'], 'Scris pe 16 februarie 2017 17 Comments', '2017-02-16T00:00:00'), + ('Sunday, 23 July 2017', ['en'], 'Sunday, 23 July 2017', '2017-07-23T00:00:00'), + ('Thu, 04/26/2018 - 05:22', ['en'], 'Thu, 04/26/2018 - 05:22', '2018-04-26T05:22:00'), + ('Thursday 26 April 2018 10:00 UTC', ['en'], 'Thursday 26 April 2018 10:00 UTC', '2018-04-26T10:00:00+00:00'), + ('Thursday, April 26, 2018', ['en'], 'Thursday, April 26, 2018', '2018-04-26T00:00:00'), + ('Ultima 
modifica: 18 Ottobre 2010', ['it'], '18 Ottobre 2010', '2010-10-18T00:00:00'), + ('Updated: Apr 25, 2018 09:44 PM PDT', ['en'], 'Apr 25, 2018 09:44 PM PDT', '2018-04-25T21:44:00-07:00'), + ('Utworzono: 08 listopad 2017', [], '08 listopad 2017', '2017-11-08T00:00:00'), + ('Wed 5:18 PM, Apr 25, 2018', ['en'], 'Wed 5:18 PM, Apr 25, 2018', '2018-04-25T17:18:00'), + ('Wednesday 25 April 2018 - 5:01pm', ['en'], 'Wednesday 25 April 2018 - 5:01pm', '2018-04-25T17:01:00'), + ('Zuletzt aktualisiert: 12. Februar 2016', ['de'], '12. Februar 2016', '2016-02-12T00:00:00'), + ('Zveřejněno: 12. duben 2017', ['cs'], '12. duben 2017', '2017-04-12T00:00:00'), + ('by Joseph A. Wulfsohn | 11:06 pm, April 25th, 2018', ['en'], + 'by Joseph A. Wulfsohn | 11:06 pm, April 25th, 2018', '2018-04-25T23:06:00'), + ('on August 11, 2015', ['en'], 'on August 11, 2015', '2015-08-11T00:00:00'), + ('| 7:45 pm', ['en'], '| 7:45 pm', '2018-06-28T19:45:00'), + ('| April 25, 2018 08:15 PM', ['en'], '| April 25, 2018 08:15 PM', '2018-04-25T20:15:00'), + ('| Updated July 27, 2017', ['en'], '| Updated July 27, 2017', '2017-07-27T00:00:00'), + ('Última actualización: Lunes, 06 Julio 2015 18:00', ['es'], + 'Lunes, 06 Julio 2015 18:00', '2015-07-06T18:00:00'), + ('Được đăng: 14 Tháng 7 2017', ['vi'], '14 Tháng 7 2017', '2017-07-14T00:00:00'), + ('Được đăng: 17 Tháng 10 2016', ['vi'], '17 Tháng 10 2016', '2016-10-17T00:00:00'), + ('Πρώτη καταχώρηση: Τρίτη, 16 Αυγούστου 2016, 14:26', ['el'], + 'Τρίτη, 16 Αυγούστου 2016, 14:26', '2016-08-16T14:26:00'), + ('Τελευταία ενημέρωση : 08 Αύγουστος 2017', ['el', 'es'], + '08 Αύγουστος 2017', '2017-08-08T00:00:00'), + ('Дата: 01 серпня 2006', ['uk', 'en'], '01 серпня 2006', '2006-08-01T00:00:00'), + ('Опубликовано: 07 января 2017', ['ru'], '07 января 2017', '2017-01-07T00:00:00'), + ('Публикувана на 07 Юни 2018', ['bg'], 'Публикувана на 07 Юни 2018', '2018-06-07T00:00:00'), + ('от Administrator · Сентябрь 25, 2017', ['ru', 'en'], + 'от Administrator · Сентябрь 25, 2017', 
# Successful cases run as ordinary parameters; the known failures are kept in
# the suite but marked xfail.
DATE_CASES = [pytest.param(*row) for row in DATE_CASES_SUCCESS] + \
    [pytest.param(*row, marks=pytest.mark.xfail) for row in DATE_CASES_FAILURES]


@pytest.mark.parametrize(['value', 'languages', 'expected_raw', 'expected'], DATE_CASES)
def test_find_date(value, languages, expected_raw, expected):
    """ Check that find_date parses ``expected_raw`` into ``expected``.

    ``value`` is the first column of each case row. It was used to test a
    pre-cleanup step that is not exercised here, so ``expected_raw`` is fed
    to ``find_date`` directly and ``value`` is unused.

    ``expected`` is either ``None`` (no date must be found) or the ISO 8601
    string of the expected datetime. Relative dates are resolved against
    ``DATE_TEST_BASE`` through the ``RELATIVE_BASE`` setting.
    """
    dt = find_date(
        expected_raw, languages=languages, settings={'RELATIVE_BASE': DATE_TEST_BASE})
    if expected is None:
        assert dt is None
    else:
        # Fail with a readable assertion instead of an AttributeError on
        # ``None.isoformat()`` when nothing was parsed.
        assert dt is not None, 'no date found in {!r}'.format(expected_raw)
        assert dt.isoformat() == expected