From c7948d563b9c0d1e18ea8cf70c22c17466f17922 Mon Sep 17 00:00:00 2001 From: Bulat Date: Sun, 18 Aug 2019 19:43:11 +0300 Subject: [PATCH 01/10] Do not parse dates as prices. Sort imports. --- price_parser/parser.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index b4141e4..24cfe19 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -1,10 +1,13 @@ # -*- coding: utf-8 -*- + import re import string -from typing import Callable, Optional, Pattern, List, Tuple +from datetime import datetime from decimal import Decimal, InvalidOperation +from typing import Callable, List, Optional, Pattern, Tuple import attr + from ._currencies import (CURRENCY_CODES, CURRENCY_NATIONAL_SYMBOLS, CURRENCY_SYMBOLS) @@ -69,7 +72,7 @@ def or_regex(symbols: List[str]) -> Pattern: # unique currency symbols '$', '€', '£', 'zł', 'Zł', 'Kč', '₽', '¥', '¥', - '฿', 'դր.', 'դր', '₦', '₴', '₱', '৳', '₭', '₪', '﷼', '៛', '₩', '₫', '₡', + '฿', 'դր.', 'դր', '₦', '₴', '₱', '৳', '₭', '₪', '﷼', '៛', '₩', '₫', '₡', 'টকা', 'ƒ', '₲', '؋', '₮', 'नेरू', '₨', '₶', '₾', '֏', 'ރ', '৲', '૱', '௹', '₠', '₢', '₣', '₤', '₧', '₯', '₰', '₳', '₷', '₸', '₹', '₺', '₼', '₾', '₿', 'ℳ', @@ -82,7 +85,7 @@ def or_regex(symbols: List[str]) -> Pattern: # other common symbols, which we consider unambiguous 'EUR', 'euro', 'eur', 'CHF', 'DKK', 'Rp', 'lei', - 'руб.', 'руб', 'грн.', 'грн', 'дин.', 'Dinara', 'динар', 'лв.', 'лв', + 'руб.', 'руб', 'грн.', 'грн', 'дин.', 'Dinara', 'динар', 'лв.', 'лв', 'р.', 'тңг', 'тңг.', 'ман.', ] @@ -139,8 +142,8 @@ def extract_currency_symbol(price: Optional[str], if price and '$' in price: methods.insert(0, (_search_dollar_code, price)) - for meth, attr in methods: - m = meth(attr) if attr else None + for meth, attrib in methods: + m = meth(attrib) if attrib else None if m: return m.group(0) @@ -180,6 +183,11 @@ def extract_price_text(price: str) -> Optional[str]: >>> extract_price_text("50") '50' """ + + date_ftm = date_format(price) + if date_ftm: + return + if price.count('€') == 1: m = re.search(r""" [\d\s.,]*?\d # number, probably with thousand separators @@ -283,3 +291,15 @@ def parse_number(num: str) -> Optional[Decimal]: return Decimal(num) except InvalidOperation: return None + + +def date_format(price): + for fmt in ['%d.%m.%Y', '%B, %Y']: + try: + date = datetime.strptime(price, fmt) + if isinstance(date, datetime): + return date + except (ValueError, TypeError): + continue + + return None From 1aadabe7c2b67fda244d206d79e027b3c341420b Mon Sep 17 00:00:00 2001 From: Bulat Date: Sun, 18 Aug 2019 20:07:11 +0300 Subject: [PATCH 02/10] Fix Py36 compatibility. --- price_parser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index 24cfe19..c9de07f 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -186,7 +186,7 @@ def extract_price_text(price: str) -> Optional[str]: date_ftm = date_format(price) if date_ftm: - return + return None if price.count('€') == 1: m = re.search(r""" From 4a81a99bfe07bbebc3b40d0167e104235060af71 Mon Sep 17 00:00:00 2001 From: Bulat Date: Mon, 19 Aug 2019 11:50:28 +0300 Subject: [PATCH 03/10] Add tests --- tests/test_price_parsing.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/test_price_parsing.py b/tests/test_price_parsing.py index eb9f597..f4b9d1d 100644 --- a/tests/test_price_parsing.py +++ b/tests/test_price_parsing.py @@ -11,12 +11,14 @@ we've found in a wild; PRICE_PARSING_EXAMPLES_NEW is a list of tests for new features. New tests should probably go these two lists. """ -from typing import Optional, Union +from datetime import datetime from decimal import Decimal +from typing import Optional, Union import pytest from price_parser import Price +from price_parser.parser import date_format class Example(Price): @@ -1986,3 +1988,18 @@ def test_parsing(example: Example): ) def test_price_amount_float(amount, amount_float): assert Price(amount, None, None).amount_float == amount_float + + +@pytest.mark.parametrize( + "price, result", + [ + ('10.04.2004', datetime(2004, 4, 10, 0, 0)), + ('July, 2004', datetime(2004, 7, 1, 0, 0)), + ('200', None), + ('2004', None), + (2004, None), + (10.2014, None), + ] +) +def test_date_format(price, result): + assert date_format(price) == result From 20504330b1874d787cf9367e1eea7476cc6c7c81 Mon Sep 17 00:00:00 2001 From: Bulat Date: Mon, 19 Aug 2019 12:04:08 +0300 Subject: [PATCH 04/10] Fix return statement --- price_parser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index c9de07f..7197ef8 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -302,4 +302,4 @@ def date_format(price): except (ValueError, TypeError): continue - return None + return None From d759b346fafd25ff8311d7bec58cc76b308bbdd1 Mon Sep 17 00:00:00 2001 From: Bulat Date: Mon, 19 Aug 2019 12:56:19 +0300 Subject: [PATCH 05/10] Refactoring. --- price_parser/parser.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index 7197ef8..be15f23 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -184,8 +184,7 @@ def extract_price_text(price: str) -> Optional[str]: '50' """ - date_ftm = date_format(price) - if date_ftm: + if date_format(price): return None if price.count('€') == 1: @@ -301,5 +300,3 @@ def date_format(price): return date except (ValueError, TypeError): continue - - return None From 4a512b2e7d0a37afd19ec9e557e31c1f91f227df Mon Sep 17 00:00:00 2001 From: Bulat Date: Mon, 19 Aug 2019 13:17:35 +0300 Subject: [PATCH 06/10] Fix coverage issue --- price_parser/parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/price_parser/parser.py b/price_parser/parser.py index be15f23..d6ee4ef 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -298,5 +298,7 @@ def date_format(price): date = datetime.strptime(price, fmt) if isinstance(date, datetime): return date + else: + continue except (ValueError, TypeError): continue From 1ede7b5dc4d236fe689e504c3f4dc31f4d8f0922 Mon Sep 17 00:00:00 2001 From: Bulat Date: Mon, 19 Aug 2019 14:12:13 +0300 Subject: [PATCH 07/10] Add one more format --- price_parser/parser.py | 5 ++--- tests/test_price_parsing.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index d6ee4ef..a36b510 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -293,12 +293,11 @@ def parse_number(num: str) -> Optional[Decimal]: def date_format(price): - for fmt in ['%d.%m.%Y', '%B, %Y']: + for fmt in ['%d.%m.%Y', '%B, %Y', '%b, %Y']: try: date = datetime.strptime(price, fmt) if isinstance(date, datetime): return date - else: - continue except (ValueError, TypeError): continue + return None diff --git a/tests/test_price_parsing.py b/tests/test_price_parsing.py index f4b9d1d..7b771e2 100644 --- a/tests/test_price_parsing.py +++ b/tests/test_price_parsing.py @@ -1995,6 +1995,7 @@ def test_price_amount_float(amount, amount_float): [ ('10.04.2004', datetime(2004, 4, 10, 0, 0)), ('July, 2004', datetime(2004, 7, 1, 0, 0)), + ('Jul, 2004', datetime(2004, 7, 1, 0, 0)), ('200', None), ('2004', None), (2004, None), From 32ae1296c946c1e53bb6cf6ff68ecdb2b3f649c7 Mon Sep 17 00:00:00 2001 From: Bulat Date: Sun, 15 Sep 2019 18:43:27 +0300 Subject: [PATCH 08/10] Delete datetime if a string contains a text. --- price_parser/parser.py | 21 ++++++++++++++++++++- tests/test_price_parsing.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index a36b510..a83c5bc 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -187,6 +187,8 @@ def extract_price_text(price: str) -> Optional[str]: if date_format(price): return None + price = strip_date(price) + if price.count('€') == 1: m = re.search(r""" [\d\s.,]*?\d # number, probably with thousand separators @@ -293,11 +295,28 @@ def parse_number(num: str) -> Optional[Decimal]: def date_format(price): - for fmt in ['%d.%m.%Y', '%B, %Y', '%b, %Y']: + for fmt in ['%d.%m.%Y', '%B, %Y', '%b, %Y', '%Y-%m-%d']: try: date = datetime.strptime(price, fmt) if isinstance(date, datetime): return date except (ValueError, TypeError): continue + return None + + +def strip_date(text): + # normalize whitspace + text = re.sub(r'\s+', ' ', text) + all_date_regexp = [ + r'\d{1,4}-\d{1,2}-\d{2,4}', + r' \S{3,8},\s\d{4}', + ] + text_processed = text + for regexp in all_date_regexp: + for match in re.finditer(regexp, text): + if match and date_format(match.group(0).strip()): + text_processed = text_processed.replace(match.group(0), '') + + return text_processed diff --git a/tests/test_price_parsing.py b/tests/test_price_parsing.py index 7b771e2..d7a37de 100644 --- a/tests/test_price_parsing.py +++ b/tests/test_price_parsing.py @@ -18,7 +18,7 @@ import pytest from price_parser import Price -from price_parser.parser import date_format +from price_parser.parser import date_format, strip_date class Example(Price): @@ -1946,6 +1946,21 @@ def __eq__(self, other): Example(None, '15.08.2017', None, None, None), + Example(None, '0€ until May, 2005, 35€ afterwards', + '€', '0', 0), + + Example(None, '2019-08-19: 22 USD', + 'USD', '22', 22), + + Example(None, '2105 EUR at July, 2004', + 'EUR', '2105', 2105), + + Example(None, '$10 EUR during March, 2016', + '$', '10', 10), + + Example(None, '$10 EUR at March, 2016 or 2019-08-19', + '$', '10', 10), + # other incorrectly extracted prices Example('8.5', '25-09', None, None, None), @@ -2004,3 +2019,18 @@ def test_price_amount_float(amount, amount_float): ) def test_date_format(price, result): assert date_format(price) == result + + +@pytest.mark.parametrize( + "price, result", + [ + ('0€ until May, 2005, 35€ afterwards', '0€ until, 35€ afterwards'), + ('2019-08-19: 22 USD', ': 22 USD'), + ('105 EUR at July, 2004', '105 EUR at'), + ('$10 EUR during March, 2016', '$10 EUR during'), + ('$10 EUR during March, 2016 -- March, 2020', '$10 EUR during --'), + ('$10', '$10'), + ] +) +def test_strip_date(price, result): + assert strip_date(price) == result From 20ac018ad2b57a9316da239d38e1feda69661735 Mon Sep 17 00:00:00 2001 From: Bulat Date: Sun, 22 Sep 2019 12:18:52 +0300 Subject: [PATCH 09/10] Add more tests. Delete unused code. --- price_parser/parser.py | 10 ++++------ tests/test_price_parsing.py | 2 ++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index a83c5bc..7337319 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -297,26 +297,24 @@ def parse_number(num: str) -> Optional[Decimal]: def date_format(price): for fmt in ['%d.%m.%Y', '%B, %Y', '%b, %Y', '%Y-%m-%d']: try: - date = datetime.strptime(price, fmt) - if isinstance(date, datetime): - return date + return datetime.strptime(price, fmt) except (ValueError, TypeError): continue - return None - def strip_date(text): # normalize whitspace text = re.sub(r'\s+', ' ', text) + all_date_regexp = [ r'\d{1,4}-\d{1,2}-\d{2,4}', r' \S{3,8},\s\d{4}', ] + text_processed = text for regexp in all_date_regexp: for match in re.finditer(regexp, text): - if match and date_format(match.group(0).strip()): + if date_format(match.group(0).strip()): text_processed = text_processed.replace(match.group(0), '') return text_processed diff --git a/tests/test_price_parsing.py b/tests/test_price_parsing.py index d7a37de..31638be 100644 --- a/tests/test_price_parsing.py +++ b/tests/test_price_parsing.py @@ -2030,6 +2030,8 @@ def test_date_format(price, result): ('$10 EUR during March, 2016', '$10 EUR during'), ('$10 EUR during March, 2016 -- March, 2020', '$10 EUR during --'), ('$10', '$10'), + ('sample text', 'sample text'), + ('$10 - 1-08-19', '$10 - 1-08-19'), ] ) def test_strip_date(price, result): From 737194c92f74f8fed3c0d365f97096500b654bc0 Mon Sep 17 00:00:00 2001 From: Bulat Date: Sat, 19 Oct 2019 09:43:24 +0300 Subject: [PATCH 10/10] Move date tests to BUGS_CAUGHT. --- tests/test_price_parsing.py | 38 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/tests/test_price_parsing.py b/tests/test_price_parsing.py index 31638be..ef768d7 100644 --- a/tests/test_price_parsing.py +++ b/tests/test_price_parsing.py @@ -63,6 +63,22 @@ def __eq__(self, other): 'GBP', '29.1583', 29.1583), Example(None, '1.11000000000000009770', None, '1.11000000000000009770', Decimal('1.11000000000000009770')), + + # dates + Example(None, 'July, 2004', + None, None, None), + Example(None, '15.08.2017', + None, None, None), + Example(None, '0€ until May, 2005, 35€ afterwards', + '€', '0', 0), + Example(None, '2019-08-19: 22 USD', + 'USD', '22', 22), + Example(None, '2105 EUR at July, 2004', + 'EUR', '2105', 2105), + Example(None, '$10 EUR during March, 2016', + '$', '10', 10), + Example(None, '$10 EUR at March, 2016 or 2019-08-19', + '$', '10', 10), ] @@ -1939,28 +1955,6 @@ def __eq__(self, other): Example('Купить', 'Печная труба', None, None, None), - # dates - Example(None, 'July, 2004', - None, None, None), - - Example(None, '15.08.2017', - None, None, None), - - Example(None, '0€ until May, 2005, 35€ afterwards', - '€', '0', 0), - - Example(None, '2019-08-19: 22 USD', - 'USD', '22', 22), - - Example(None, '2105 EUR at July, 2004', - 'EUR', '2105', 2105), - - Example(None, '$10 EUR during March, 2016', - '$', '10', 10), - - Example(None, '$10 EUR at March, 2016 or 2019-08-19', - '$', '10', 10), - # other incorrectly extracted prices Example('8.5', '25-09', None, None, None),