From cc9ffbd417908d63801ed02f015d8ffb00f70628 Mon Sep 17 00:00:00 2001 From: Marten Lienen Date: Mon, 28 Mar 2022 17:02:59 +0200 Subject: [PATCH] Sanitize the extensions of URLs with query parameters correctly With this fix, the extension is taken from the path of the URL instead of the complete URL. The previous implementation would erroneously search for an extension in the query and params part of the URL. Additionally, a mistake is fixed where very long "extensions" could have the returned path break the max_length. --- tensorflow_datasets/core/download/resource.py | 14 ++++++++++---- tensorflow_datasets/core/download/resource_test.py | 8 ++++++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/tensorflow_datasets/core/download/resource.py b/tensorflow_datasets/core/download/resource.py index a3c35f225fa..58f779be489 100644 --- a/tensorflow_datasets/core/download/resource.py +++ b/tensorflow_datasets/core/download/resource.py @@ -139,15 +139,21 @@ def _sanitize_url(url, max_length): for suffix in _NETLOC_COMMON_SUFFIXES: if netloc.endswith(suffix): netloc = netloc[:-len(suffix)] - url = '%s%s%s%s' % (netloc, url.path, url.params, url.query) + path = url.path # Get the extension: for ext in _KNOWN_EXTENSIONS: - if url.endswith(ext): + if path.endswith(ext): extension = ext - url = url[:-len(extension)] + path = path[:-len(extension)] break else: - url, extension = os.path.splitext(url) + path, extension = os.path.splitext(path) + if len(extension) >= max_length: + # If the extension is this long, the remaining url would be empty and the extension + # is most likely not actually an extension but the final part of a filename without + # an extension but with dot separators, so we clear the extension + path, extension = url.path, "" + url = '%s%s%s%s' % (netloc, path, url.params, url.query) max_length -= len(extension) # Replace non authorized chars (including '/') by '_': url = re.sub(r'[^a-zA-Z0-9\.\-_]+', '_', url) diff --git a/tensorflow_datasets/core/download/resource_test.py b/tensorflow_datasets/core/download/resource_test.py index 56414688288..4b493d22b81 100644 --- a/tensorflow_datasets/core/download/resource_test.py +++ b/tensorflow_datasets/core/download/resource_test.py @@ -57,7 +57,9 @@ class DlDirNameTest(testing.TestCase): https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json https://storage.googleapis.com/scv_dataset/data/Brawl_64x64_png/valid-00000-of-00001.tfrecords https://storage.googleapis.com/scv_dataset/data/CollectMineralShards_128x128_png/train-00005-of-00010.tfrecords -https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz\ +https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz +https://somehost.example.com/path/to/file.tar.gz?query-parameters=value¶meter-with-a-dot=size-12.5&and-more-parameters=with-their-values&is-this-the-extension=no +https://somehost.example.com/path/to/file.with-no-extension-but-a-dot-somewhere-and-the-whole-thing-is-longer-than-the-maximum-allowed-characters\ """.split('\n') expected = """\ data.statmt.org_wmt17_translation-task_devDjZ11PU9sKPPvF2sZTAzTsV7Pi3IYHaPDMOoeEuby2E.tgz @@ -73,7 +75,9 @@ class DlDirNameTest(testing.TestCase): rajpurkar_SQuAD-explorer_train-v1.1uLsZc14btZFRCgHMAy9Mn5abwO6wga4bMozTBvOyQAg.json scv_Brawl_64x64_png_valid-0_1Ez3yPwN0QDCxBd0xHeLb2DfUERJjkqFd2dyL5Z7-ULg.tfrecords scv_CollectMi_128x128_png_train-5_10kiunW_2RTDhXuPrxCVkUZKCoWpADYBUWE8DpraC8zAA.tfrecords -cs.toronto.edu_kriz_cifar-100-pythonJDFhDchdt5UW8GUAkvf_-H_r_LnFs6sHlOrqTidrpSI.tar.gz\ +cs.toronto.edu_kriz_cifar-100-pythonJDFhDchdt5UW8GUAkvf_-H_r_LnFs6sHlOrqTidrpSI.tar.gz +some.exam.com_path_to_file-para_valu_pa6KFrwWxom3oLGuRv2mHErUlx3XXd-jwJWlgvHMXh2Yc.tar.gz +some.exam.com_path_to_file.with-no-exte-but-a-gSqjzNXcNOyu4vaF84g8NRJtgyCH2Lt6bn0PCyvvlMk\ """.split('\n') def test_(self):