diff --git a/price_parser/parser.py b/price_parser/parser.py index f9959b7..87b5e1d 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -1,10 +1,13 @@ # -*- coding: utf-8 -*- + import re import string -from typing import Callable, Optional, Pattern, List, Tuple +from datetime import datetime from decimal import Decimal, InvalidOperation +from typing import Callable, List, Optional, Pattern, Tuple import attr + from ._currencies import (CURRENCY_CODES, CURRENCY_NATIONAL_SYMBOLS, CURRENCY_SYMBOLS) @@ -73,7 +76,7 @@ def or_regex(symbols: List[str]) -> Pattern: # unique currency symbols '$', '€', '£', 'zł', 'Zł', 'Kč', '₽', '¥', '¥', - '฿', 'դր.', 'դր', '₦', '₴', '₱', '৳', '₭', '₪', '﷼', '៛', '₩', '₫', '₡', + '฿', 'դր.', 'դր', '₦', '₴', '₱', '৳', '₭', '₪', '﷼', '៛', '₩', '₫', '₡', 'টকা', 'ƒ', '₲', '؋', '₮', 'नेरू', '₨', '₶', '₾', '֏', 'ރ', '৲', '૱', '௹', '₠', '₢', '₣', '₤', '₧', '₯', '₰', '₳', '₷', '₸', '₹', '₺', '₼', '₾', '₿', 'ℳ', @@ -86,7 +89,7 @@ def or_regex(symbols: List[str]) -> Pattern: # other common symbols, which we consider unambiguous 'EUR', 'euro', 'eur', 'CHF', 'DKK', 'Rp', 'lei', - 'руб.', 'руб', 'грн.', 'грн', 'дин.', 'Dinara', 'динар', 'лв.', 'лв', + 'руб.', 'руб', 'грн.', 'грн', 'дин.', 'Dinara', 'динар', 'лв.', 'лв', 'р.', 'тңг', 'тңг.', 'ман.', ] @@ -143,8 +146,8 @@ def extract_currency_symbol(price: Optional[str], if price and '$' in price: methods.insert(0, (_search_dollar_code, price)) - for meth, attr in methods: - m = meth(attr) if attr else None + for meth, attrib in methods: + m = meth(attrib) if attrib else None if m: return m.group(0) @@ -184,6 +187,12 @@ def extract_price_text(price: str) -> Optional[str]: >>> extract_price_text("50") '50' """ + + if date_format(price): + return None + + price = strip_date(price) + if price.count('€') == 1: m = re.search(r""" [\d\s.,]*?\d # number, probably with thousand separators @@ -292,3 +301,29 @@ def parse_number(num: str, return Decimal(num) except InvalidOperation: return None + + 
# strptime layouts that mark a string as a date rather than a price.
_DATE_FORMATS = ('%d.%m.%Y', '%B, %Y', '%b, %Y', '%Y-%m-%d')

# Shapes of date-like fragments inside free text. A hit is only a candidate:
# it is removed solely when date_format() confirms it is a real date.
_DATE_CANDIDATE_PATTERNS = (
    # Dash-separated numeric dates such as "2019-08-19". The lookarounds stop
    # a match from starting or ending in the middle of a longer digit run,
    # which would otherwise leave stray digits behind (e.g. "12019-08-19").
    re.compile(r'(?<!\d)\d{1,4}-\d{1,2}-\d{2,4}(?!\d)'),
    # "<month>, <year>" such as " July, 2004". The separating space in front
    # of the month is consumed too; the start of the text is accepted so a
    # leading date is also stripped.
    re.compile(r'(?:^| )\S{3,8},\s\d{4}(?!\d)'),
)


def date_format(price: Optional[str]) -> Optional[datetime]:
    """Parse ``price`` as a date if the whole string is one.

    The string is tried against each layout in ``_DATE_FORMATS``
    (``15.08.2017``, ``July, 2004``, ``Jul, 2004``, ``2019-08-19``).

    :param price: candidate text; non-string values are tolerated.
    :return: the parsed ``datetime``, or ``None`` when no layout matches
        or ``price`` is not a string.
    """
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(price, fmt)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: price is not a str.
            continue
    return None


def strip_date(text: str) -> str:
    """Remove date fragments from ``text`` so they are not read as prices.

    Runs of whitespace are first collapsed to a single space. Every
    fragment that looks like a date and is confirmed by ``date_format``
    is then deleted; fragments that merely resemble a date (for example
    ``1-08-19``) are left untouched.

    :param text: raw price text.
    :return: whitespace-normalized text without the recognized dates.
    """
    # normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    def _drop_if_date(match):
        fragment = match.group(0)
        # Only the validated match itself is removed, never look-alike
        # substrings elsewhere in the text.
        if date_format(fragment.strip()) is not None:
            return ''
        return fragment

    for pattern in _DATE_CANDIDATE_PATTERNS:
        text = pattern.sub(_drop_if_date, text)

    return text
@pytest.mark.parametrize(
    "price, result",
    [
        ('10.04.2004', datetime(2004, 4, 10, 0, 0)),
        ('July, 2004', datetime(2004, 7, 1, 0, 0)),
        ('Jul, 2004', datetime(2004, 7, 1, 0, 0)),
        # ISO layout is accepted by date_format as well
        ('2019-08-19', datetime(2019, 8, 19, 0, 0)),
        ('200', None),
        ('2004', None),
        # non-string input must yield None instead of raising
        (2004, None),
        (10.2014, None),
        (None, None),
    ]
)
def test_date_format(price, result):
    """date_format returns a datetime for whole-string dates, else None."""
    assert date_format(price) == result


@pytest.mark.parametrize(
    "price, result",
    [
        ('0€ until May, 2005, 35€ afterwards', '0€ until, 35€ afterwards'),
        ('2019-08-19: 22 USD', ': 22 USD'),
        ('105 EUR at July, 2004', '105 EUR at'),
        ('$10 EUR during March, 2016', '$10 EUR during'),
        ('$10 EUR during March, 2016 -- March, 2020', '$10 EUR during --'),
        # both date shapes in one string
        ('$10 EUR at March, 2016 or 2019-08-19', '$10 EUR at or '),
        ('$10', '$10'),
        ('sample text', 'sample text'),
        # looks like a date but is not a valid one: left untouched
        ('$10 - 1-08-19', '$10 - 1-08-19'),
    ]
)
def test_strip_date(price, result):
    """strip_date removes confirmed dates and keeps everything else."""
    assert strip_date(price) == result