Skip to content

Commit e5129b6

Browse files
committed
Merge branch 'r/1.0.2'
1 parent f2fbf48 commit e5129b6

File tree

5 files changed

+231
-18
lines changed

5 files changed

+231
-18
lines changed

.gitignore

Lines changed: 162 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,164 @@
1-
.venv/
2-
.test/
3-
pywaybackup/__pycache__/
41
waybackup_snapshots/
5-
dist/
6-
pywaybackup.egg-info/
2+
3+
# Byte-compiled / optimized / DLL files
4+
__pycache__/
5+
*.py[cod]
6+
*$py.class
7+
8+
# C extensions
9+
*.so
10+
11+
# Distribution / packaging
12+
.Python
713
build/
8-
```
14+
develop-eggs/
15+
dist/
16+
downloads/
17+
eggs/
18+
.eggs/
19+
lib/
20+
lib64/
21+
parts/
22+
sdist/
23+
var/
24+
wheels/
25+
share/python-wheels/
26+
*.egg-info/
27+
.installed.cfg
28+
*.egg
29+
MANIFEST
30+
31+
# PyInstaller
32+
# Usually these files are written by a python script from a template
33+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
34+
*.manifest
35+
*.spec
36+
37+
# Installer logs
38+
pip-log.txt
39+
pip-delete-this-directory.txt
40+
41+
# Unit test / coverage reports
42+
htmlcov/
43+
.tox/
44+
.nox/
45+
.coverage
46+
.coverage.*
47+
.cache
48+
nosetests.xml
49+
coverage.xml
50+
*.cover
51+
*.py,cover
52+
.hypothesis/
53+
.pytest_cache/
54+
cover/
55+
56+
# Translations
57+
*.mo
58+
*.pot
59+
60+
# Django stuff:
61+
*.log
62+
local_settings.py
63+
db.sqlite3
64+
db.sqlite3-journal
65+
66+
# Flask stuff:
67+
instance/
68+
.webassets-cache
69+
70+
# Scrapy stuff:
71+
.scrapy
72+
73+
# Sphinx documentation
74+
docs/_build/
75+
76+
# PyBuilder
77+
.pybuilder/
78+
target/
79+
80+
# Jupyter Notebook
81+
.ipynb_checkpoints
82+
83+
# IPython
84+
profile_default/
85+
ipython_config.py
86+
87+
# pyenv
88+
# For a library or package, you might want to ignore these files since the code is
89+
# intended to run in multiple environments; otherwise, check them in:
90+
# .python-version
91+
92+
# pipenv
93+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
95+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
96+
# install all needed dependencies.
97+
#Pipfile.lock
98+
99+
# poetry
100+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101+
# This is especially recommended for binary packages to ensure reproducibility, and is more
102+
# commonly ignored for libraries.
103+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104+
#poetry.lock
105+
106+
# pdm
107+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108+
#pdm.lock
109+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110+
# in version control.
111+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112+
.pdm.toml
113+
.pdm-python
114+
.pdm-build/
115+
116+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117+
__pypackages__/
118+
119+
# Celery stuff
120+
celerybeat-schedule
121+
celerybeat.pid
122+
123+
# SageMath parsed files
124+
*.sage.py
125+
126+
# Environments
127+
.env
128+
.venv
129+
env/
130+
venv/
131+
ENV/
132+
env.bak/
133+
venv.bak/
134+
135+
# Spyder project settings
136+
.spyderproject
137+
.spyproject
138+
139+
# Rope project settings
140+
.ropeproject
141+
142+
# mkdocs documentation
143+
/site
144+
145+
# mypy
146+
.mypy_cache/
147+
.dmypy.json
148+
dmypy.json
149+
150+
# Pyre type checker
151+
.pyre/
152+
153+
# pytype static type analyzer
154+
.pytype/
155+
156+
# Cython debug symbols
157+
cython_debug/
158+
159+
# PyCharm
160+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162+
# and can be added to the global gitignore or merged into this file. For a more nuclear
163+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
164+
#.idea/

README.md

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ This script allows you to download content from the Wayback Machine (archive.org
4848
#### Optional Arguments
4949

5050
- `-l`, `--list`: Only print the snapshots available within the specified range. Does not download the snapshots.
51-
- `-e`, `--explicit`: Only download the explicit given url. No wildcard subdomains or paths.
51+
- `-e`, `--explicit`: Only download the explicitly given URL. No wildcard subdomains or paths. Useful, for example, to get root-only snapshots.
5252
- `-o`, `--output`: The folder where downloaded files will be saved.
5353

5454
- **Range Selection:**<br>
@@ -71,8 +71,8 @@ Specify the range in years or a specific timestamp either start, end or both. If
7171
Download latest snapshot of all files:<br>
7272
`waybackup -u http://example.com -c`
7373

74-
Download latest snapshot of all files with retries:<br>
75-
`waybackup -u http://example.com -c --retry 3`
74+
Download latest snapshot of a specific file:<br>
75+
`waybackup -u http://example.com/subdir/file.html -c`
7676

7777
Download all snapshots sorted per timestamp with a specified range and do not follow redirects:<br>
7878
`waybackup -u http://example.com -f -r 5 --no-redirect`
@@ -89,6 +89,44 @@ Download all snapshots and output a json response:<br>
8989
List available snapshots per timestamp without downloading and save a csv file to home folder:<br>
9090
`waybackup -u http://example.com -f -l --csv /home/user/Downloads`
9191

92+
## Output path structure
93+
94+
The output path is currently structured as follows, shown here by example for the query:<br>
95+
`http://example.com/subdir1/subdir2/assets/`:
96+
<br><br>
97+
For the current version (`-c`):
98+
- The output will only include the files/folders located at or below your query path.
99+
```
100+
your/path/waybackup_snapshots/
101+
└── the_root_of_your_query/ (example.com/)
102+
└── subdir1/
103+
└── subdir2/
104+
└── assets/
105+
├── image.jpg
106+
├── style.css
107+
...
108+
```
109+
For all versions (`-f`):
110+
- Currently creates a folder named after the root of your query. Inside this folder you will find one folder per timestamp, each containing the path you requested.
111+
```
112+
your/path/waybackup_snapshots/
113+
└── the_root_of_your_query/ (example.com/)
114+
├── yyyymmddhhmmss/
115+
│ ├── subdir1/
116+
│ │ └── subdir2/
117+
│ │ └── assets/
118+
│ │ ├── image.jpg
119+
│ │ └── style.css
120+
├── yyyymmddhhmmss/
121+
│ ├── subdir1/
122+
│ │ └── subdir2/
123+
│ │ └── assets/
124+
│ │ ├── image.jpg
125+
│ │ └── style.css
126+
...
127+
```
128+
129+
92130
### Json Response
93131

94132
For download queries:

pywaybackup/SnapshotCollection.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,11 @@ def count_list(cls):
3636
@classmethod
3737
def create_collection(cls):
3838
new_collection = []
39-
for cdx_entry in cls.SNAPSHOT_COLLECTION:
39+
for idx, cdx_entry in enumerate(cls.SNAPSHOT_COLLECTION):
4040
timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
4141
url_archive = f"http://web.archive.org/web/{timestamp}{cls._url_get_filetype(url)}/{url}"
4242
collection_entry = {
43-
"id": cls.SNAPSHOT_COLLECTION.index(cdx_entry),
43+
"id": idx,
4444
"timestamp": timestamp,
4545
"url_archive": url_archive,
4646
"url_origin": url,
@@ -65,7 +65,7 @@ def snapshot_entry_create_output(cls, collection_entry: dict, output: str) -> st
6565
- download_file: The output path for the snapshot entry (str) with filename.
6666
"""
6767
timestamp, url = collection_entry["timestamp"], collection_entry["url_origin"]
68-
domain, subdir, filename = cls._url_split(url)
68+
domain, subdir, filename = cls.url_split(url, index=True)
6969
if cls.MODE_CURRENT:
7070
download_dir = os.path.join(output, domain, subdir)
7171
else:
@@ -109,12 +109,17 @@ def _url_get_filetype(cls, url):
109109
return urltype
110110

111111
@classmethod
112-
def _url_split(cls, url):
112+
def url_split(cls, url, index=False):
113113
"""
114114
Split a URL into domain, subdir and filename.
115115
"""
116+
if not urlparse(url).scheme:
117+
url = "http://" + url
116118
parsed_url = urlparse(url)
117-
domain = parsed_url.netloc
118-
subdir = parsed_url.path.strip("/").rsplit("/", 1)[0]
119-
filename = parsed_url.path.split("/")[-1] or "index.html"
119+
domain = parsed_url.netloc.split("@")[-1].split(":")[0] # strip userinfo (user@) and port
120+
filename = parsed_url.path.split("/")[-1]
121+
if index is True and filename == "":
122+
filename = "index.html"
123+
subdir = parsed_url.path.strip("/").replace(filename, "").strip("/")
124+
filename = filename.replace("%20", " ") # replace url encoded spaces
120125
return domain, subdir, filename

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.1"
1+
__version__ = "1.0.2"

pywaybackup/archive.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,25 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
8989
try:
9090
v.write("\nQuerying snapshots...")
9191
query_range = ""
92+
9293
if not range:
9394
if start: query_range = query_range + f"&from={start}"
9495
if end: query_range = query_range + f"&to={end}"
9596
else:
9697
query_range = "&from=" + str(datetime.now().year - range)
97-
cdx_url = f"*.{url}/*" if not explicit else f"{url}"
98+
99+
# parse user input url and create according cdx url
100+
domain, subdir, filename = sc.url_split(url)
101+
if domain and not subdir and not filename:
102+
cdx_url = f"*.{domain}/*" if not explicit else f"{domain}"
103+
if domain and subdir and not filename:
104+
cdx_url = f"{domain}/{subdir}/*"
105+
if domain and subdir and filename:
106+
cdx_url = f"{domain}/{subdir}/{filename}/*"
107+
if domain and not subdir and filename:
108+
cdx_url = f"{domain}/{filename}/*"
109+
110+
print(f"---> {cdx_url}")
98111
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,digest,mimetype,statuscode,original&filter!=statuscode:200"
99112
cdxResult = requests.get(cdxQuery)
100113
sc.create_list(cdxResult, mode)
@@ -121,6 +134,7 @@ def download_list(output, retry, no_redirect, workers):
121134
else:
122135
batch_size = sc.count_list()
123136
sc.create_collection()
137+
v.write("\n-----> Snapshots prepared")
124138
batch_list = [sc.SNAPSHOT_COLLECTION[i:i + batch_size] for i in range(0, len(sc.SNAPSHOT_COLLECTION), batch_size)]
125139
threads = []
126140
worker = 0

0 commit comments

Comments
 (0)