diff --git a/tensorflow_datasets/core/download/resource.py b/tensorflow_datasets/core/download/resource.py index a3c35f225fa..58f779be489 100644 --- a/tensorflow_datasets/core/download/resource.py +++ b/tensorflow_datasets/core/download/resource.py @@ -139,15 +139,21 @@ def _sanitize_url(url, max_length): for suffix in _NETLOC_COMMON_SUFFIXES: if netloc.endswith(suffix): netloc = netloc[:-len(suffix)] - url = '%s%s%s%s' % (netloc, url.path, url.params, url.query) + path = url.path # Get the extension: for ext in _KNOWN_EXTENSIONS: - if url.endswith(ext): + if path.endswith(ext): extension = ext - url = url[:-len(extension)] + path = path[:-len(extension)] break else: - url, extension = os.path.splitext(url) + path, extension = os.path.splitext(path) + if len(extension) >= max_length: + # If the extension is this long, the remaining url would be empty and the extension + # is most likely not actually an extension but the final part of a filename without + # an extension but with dot separators, so we clear the extension + path, extension = url.path, "" + url = '%s%s%s%s' % (netloc, path, url.params, url.query) max_length -= len(extension) # Replace non authorized chars (including '/') by '_': url = re.sub(r'[^a-zA-Z0-9\.\-_]+', '_', url) diff --git a/tensorflow_datasets/core/download/resource_test.py b/tensorflow_datasets/core/download/resource_test.py index 56414688288..4b493d22b81 100644 --- a/tensorflow_datasets/core/download/resource_test.py +++ b/tensorflow_datasets/core/download/resource_test.py @@ -57,7 +57,9 @@ class DlDirNameTest(testing.TestCase): https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json https://storage.googleapis.com/scv_dataset/data/Brawl_64x64_png/valid-00000-of-00001.tfrecords https://storage.googleapis.com/scv_dataset/data/CollectMineralShards_128x128_png/train-00005-of-00010.tfrecords -https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz\ +https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz 
+https://somehost.example.com/path/to/file.tar.gz?query-parameters=value&parameter-with-a-dot=size-12.5&and-more-parameters=with-their-values&is-this-the-extension=no +https://somehost.example.com/path/to/file.with-no-extension-but-a-dot-somewhere-and-the-whole-thing-is-longer-than-the-maximum-allowed-characters\ """.split('\n') expected = """\ data.statmt.org_wmt17_translation-task_devDjZ11PU9sKPPvF2sZTAzTsV7Pi3IYHaPDMOoeEuby2E.tgz @@ -73,7 +75,9 @@ class DlDirNameTest(testing.TestCase): rajpurkar_SQuAD-explorer_train-v1.1uLsZc14btZFRCgHMAy9Mn5abwO6wga4bMozTBvOyQAg.json scv_Brawl_64x64_png_valid-0_1Ez3yPwN0QDCxBd0xHeLb2DfUERJjkqFd2dyL5Z7-ULg.tfrecords scv_CollectMi_128x128_png_train-5_10kiunW_2RTDhXuPrxCVkUZKCoWpADYBUWE8DpraC8zAA.tfrecords -cs.toronto.edu_kriz_cifar-100-pythonJDFhDchdt5UW8GUAkvf_-H_r_LnFs6sHlOrqTidrpSI.tar.gz\ +cs.toronto.edu_kriz_cifar-100-pythonJDFhDchdt5UW8GUAkvf_-H_r_LnFs6sHlOrqTidrpSI.tar.gz +some.exam.com_path_to_file-para_valu_pa6KFrwWxom3oLGuRv2mHErUlx3XXd-jwJWlgvHMXh2Yc.tar.gz +some.exam.com_path_to_file.with-no-exte-but-a-gSqjzNXcNOyu4vaF84g8NRJtgyCH2Lt6bn0PCyvvlMk\ """.split('\n') def test_(self):