Merge branch 'r/1.0.2'

bitdruid · bitdruid · commit e5129b681cdf · 2024-05-31T09:34:19.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,164 @@
-.venv/
-.test/
-pywaybackup/__pycache__/
 waybackup_snapshots/
-dist/
-pywaybackup.egg-info/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
 build/
-```
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ This script allows you to download content from the Wayback Machine (archive.org
 #### Optional Arguments
 
 - `-l`, `--list`: Only print the snapshots available within the specified range. Does not download the snapshots.
-- `-e`, `--explicit`: Only download the explicit given url. No wildcard subdomains or paths.
+- `-e`, `--explicit`: Only download the explicitly given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots.
 - `-o`, `--output`: The folder where downloaded files will be saved.
 
 - **Range Selection:**<br>
@@ -71,8 +71,8 @@ Specify the range in years or a specific timestamp either start, end or both. If
 Download latest snapshot of all files:<br>
 `waybackup -u http://example.com -c`
 
-Download latest snapshot of all files with retries:<br>
-`waybackup -u http://example.com -c --retry 3`
+Download latest snapshot of a specific file:<br>
+`waybackup -u http://example.com/subdir/file.html -c`
 
 Download all snapshots sorted per timestamp with a specified range and do not follow redirects:<br>
 `waybackup -u http://example.com -f -r 5 --no-redirect`
@@ -89,6 +89,44 @@ Download all snapshots and output a json response:<br>
 List available snapshots per timestamp without downloading and save a csv file to home folder:<br>
 `waybackup -u http://example.com -f -l --csv /home/user/Downloads`
 
+## Output path structure
+
+The output path is currently structured as follows, shown here for the example query:<br>
+`http://example.com/subdir1/subdir2/assets/`:
+<br><br>
+For the current version (`-c`):
+- The output only includes the files/folders starting from your query path.
+```
+your/path/waybackup_snapshots/
+└── the_root_of_your_query/ (example.com/)
+    └── subdir1/
+        └── subdir2/
+            └── assets/
+                ├── image.jpg
+                ├── style.css
+                ...
+```
+For all versions (`-f`):
+- Currently creates a folder named after the root of your query. Inside this folder, you will find one subfolder per timestamp, each containing the path you requested.
+```
+your/path/waybackup_snapshots/
+└── the_root_of_your_query/ (example.com/)
+    ├── yyyymmddhhmmss/
+    │   ├── subdir1/
+    │   │   └── subdir2/
+    │   │       └── assets/
+    │   │           ├── image.jpg
+    │   │           └── style.css
+    ├── yyyymmddhhmmss/
+    │   ├── subdir1/
+    │   │   └── subdir2/
+    │   │       └── assets/
+    │   │           ├── image.jpg
+    │   │           └── style.css
+    ...
+```
+
+
 ### Json Response
 
 For download queries:
diff --git a/pywaybackup/SnapshotCollection.py b/pywaybackup/SnapshotCollection.py
@@ -36,11 +36,11 @@ def count_list(cls):
     @classmethod
     def create_collection(cls):
         new_collection = []
-        for cdx_entry in cls.SNAPSHOT_COLLECTION:
+        for idx, cdx_entry in enumerate(cls.SNAPSHOT_COLLECTION):
             timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
             url_archive = f"http://web.archive.org/web/{timestamp}{cls._url_get_filetype(url)}/{url}"
             collection_entry = {
-                "id": cls.SNAPSHOT_COLLECTION.index(cdx_entry),
+                "id": idx,
                 "timestamp": timestamp,
                 "url_archive": url_archive,
                 "url_origin": url,
@@ -65,7 +65,7 @@ def snapshot_entry_create_output(cls, collection_entry: dict, output: str) -> st
         - download_file: The output path for the snapshot entry (str) with filename.
         """
         timestamp, url = collection_entry["timestamp"], collection_entry["url_origin"]
-        domain, subdir, filename = cls._url_split(url)
+        domain, subdir, filename = cls.url_split(url, index=True)
         if cls.MODE_CURRENT:
             download_dir = os.path.join(output, domain, subdir)
         else:
@@ -109,12 +109,17 @@ def _url_get_filetype(cls, url):
         return urltype
 
     @classmethod
-    def _url_split(cls, url):
+    def url_split(cls, url, index=False):
         """
         Split a URL into domain, subdir and filename.
         """
+        if not urlparse(url).scheme:
+            url = "http://" + url
         parsed_url = urlparse(url)
-        domain = parsed_url.netloc
-        subdir = parsed_url.path.strip("/").rsplit("/", 1)[0]
-        filename = parsed_url.path.split("/")[-1] or "index.html"
+        domain = parsed_url.netloc.split("@")[-1].split(":")[0] # strip userinfo ("user@") and port
+        filename = parsed_url.path.split("/")[-1]
+        if index is True and filename == "":
+            filename = "index.html"
+        subdir = parsed_url.path.strip("/").replace(filename, "").strip("/")
+        filename = filename.replace("%20", " ") # replace url encoded spaces
         return domain, subdir, filename
diff --git a/pywaybackup/__version__.py b/pywaybackup/__version__.py
@@ -1 +1 @@
-__version__ = "1.0.1"
+__version__ = "1.0.2"
diff --git a/pywaybackup/archive.py b/pywaybackup/archive.py
@@ -89,12 +89,25 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
     try:
         v.write("\nQuerying snapshots...")
         query_range = ""
+        
         if not range:
             if start: query_range = query_range + f"&from={start}"
             if end: query_range = query_range + f"&to={end}"
         else: 
             query_range = "&from=" + str(datetime.now().year - range)
-        cdx_url = f"*.{url}/*" if not explicit else f"{url}"
+
+        # parse the user-supplied url and build the corresponding cdx url
+        domain, subdir, filename = sc.url_split(url)
+        if domain and not subdir and not filename:
+            cdx_url = f"*.{domain}/*" if not explicit else f"{domain}"
+        if domain and subdir and not filename:
+            cdx_url = f"{domain}/{subdir}/*"
+        if domain and subdir and filename:
+            cdx_url = f"{domain}/{subdir}/{filename}/*"
+        if domain and not subdir and filename:
+            cdx_url = f"{domain}/{filename}/*"
+
+        print(f"---> {cdx_url}")
         cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,digest,mimetype,statuscode,original&filter!=statuscode:200"
         cdxResult = requests.get(cdxQuery)
         sc.create_list(cdxResult, mode)
@@ -121,6 +134,7 @@ def download_list(output, retry, no_redirect, workers):
     else:
         batch_size = sc.count_list()
     sc.create_collection()
+    v.write("\n-----> Snapshots prepared")
     batch_list = [sc.SNAPSHOT_COLLECTION[i:i + batch_size] for i in range(0, len(sc.SNAPSHOT_COLLECTION), batch_size)]    
     threads = []
     worker = 0

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "1.0.1"`
	`1`	`+__version__ = "1.0.2"`