From cc9ffbd417908d63801ed02f015d8ffb00f70628 Mon Sep 17 00:00:00 2001
From: Marten Lienen <marten.lienen@gmail.com>
Date: Mon, 28 Mar 2022 17:02:59 +0200
Subject: [PATCH] Sanitize the extensions of URLs with query parameters
 correctly

With this fix, the extension is taken from the path of the URL instead of the complete
URL. The previous implementation would erroneously search for an extension in the query
and params part of the URL. Additionally, a mistake is fixed where very long "extensions"
could have the returned path break the max_length.
---
 tensorflow_datasets/core/download/resource.py      | 14 ++++++++++----
 tensorflow_datasets/core/download/resource_test.py |  8 ++++++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/tensorflow_datasets/core/download/resource.py b/tensorflow_datasets/core/download/resource.py
index a3c35f225fa..58f779be489 100644
--- a/tensorflow_datasets/core/download/resource.py
+++ b/tensorflow_datasets/core/download/resource.py
@@ -139,15 +139,21 @@ def _sanitize_url(url, max_length):
   for suffix in _NETLOC_COMMON_SUFFIXES:
     if netloc.endswith(suffix):
       netloc = netloc[:-len(suffix)]
-  url = '%s%s%s%s' % (netloc, url.path, url.params, url.query)
+  path = url.path
   # Get the extension:
   for ext in _KNOWN_EXTENSIONS:
-    if url.endswith(ext):
+    if path.endswith(ext):
       extension = ext
-      url = url[:-len(extension)]
+      path = path[:-len(extension)]
       break
   else:
-    url, extension = os.path.splitext(url)
+    path, extension = os.path.splitext(path)
+    if len(extension) >= max_length:
+      # If the extension is this long, the remaining url would be empty and the extension
+      # is most likely not actually an extension but the final part of a filename without
+      # an extension but with dot separators, so we clear the extension
+      path, extension = url.path, ""
+  url = '%s%s%s%s' % (netloc, path, url.params, url.query)
   max_length -= len(extension)
   # Replace non authorized chars (including '/') by '_':
   url = re.sub(r'[^a-zA-Z0-9\.\-_]+', '_', url)
diff --git a/tensorflow_datasets/core/download/resource_test.py b/tensorflow_datasets/core/download/resource_test.py
index 56414688288..4b493d22b81 100644
--- a/tensorflow_datasets/core/download/resource_test.py
+++ b/tensorflow_datasets/core/download/resource_test.py
@@ -57,7 +57,9 @@ class DlDirNameTest(testing.TestCase):
 https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
 https://storage.googleapis.com/scv_dataset/data/Brawl_64x64_png/valid-00000-of-00001.tfrecords
 https://storage.googleapis.com/scv_dataset/data/CollectMineralShards_128x128_png/train-00005-of-00010.tfrecords
-https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz\
+https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz
+https://somehost.example.com/path/to/file.tar.gz?query-parameters=value&parameter-with-a-dot=size-12.5&and-more-parameters=with-their-values&is-this-the-extension=no
+https://somehost.example.com/path/to/file.with-no-extension-but-a-dot-somewhere-and-the-whole-thing-is-longer-than-the-maximum-allowed-characters\
 """.split('\n')
   expected = """\
 data.statmt.org_wmt17_translation-task_devDjZ11PU9sKPPvF2sZTAzTsV7Pi3IYHaPDMOoeEuby2E.tgz
@@ -73,7 +75,9 @@ class DlDirNameTest(testing.TestCase):
 rajpurkar_SQuAD-explorer_train-v1.1uLsZc14btZFRCgHMAy9Mn5abwO6wga4bMozTBvOyQAg.json
 scv_Brawl_64x64_png_valid-0_1Ez3yPwN0QDCxBd0xHeLb2DfUERJjkqFd2dyL5Z7-ULg.tfrecords
 scv_CollectMi_128x128_png_train-5_10kiunW_2RTDhXuPrxCVkUZKCoWpADYBUWE8DpraC8zAA.tfrecords
-cs.toronto.edu_kriz_cifar-100-pythonJDFhDchdt5UW8GUAkvf_-H_r_LnFs6sHlOrqTidrpSI.tar.gz\
+cs.toronto.edu_kriz_cifar-100-pythonJDFhDchdt5UW8GUAkvf_-H_r_LnFs6sHlOrqTidrpSI.tar.gz
+some.exam.com_path_to_file-para_valu_pa6KFrwWxom3oLGuRv2mHErUlx3XXd-jwJWlgvHMXh2Yc.tar.gz
+some.exam.com_path_to_file.with-no-exte-but-a-gSqjzNXcNOyu4vaF84g8NRJtgyCH2Lt6bn0PCyvvlMk\
 """.split('\n')
 
   def test_(self):