Skip to content

Commit f5dc789

Browse files
committed
Merge branch 'r/3.2.0'
1 parent f5ec109 commit f5dc789

15 files changed

+566
-388
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
waybackup_snapshots/
2+
PKGBUILD
23

34
# Byte-compiled / optimized / DLL files
45
__pycache__/
@@ -162,3 +163,4 @@ cython_debug/
162163
# and can be added to the global gitignore or merged into this file. For a more nuclear
163164
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
164165
#.idea/
166+
.qodo

README.md

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@ This tool allows you to download content from the Wayback Machine (archive.org).
2828
```pip install .```
2929
- in a virtual env or use `--break-system-packages`
3030

31-
## Important notes
31+
## notes / issues / hints
3232

33-
- Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
34-
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
33+
- Linux recommended: On Windows machines, the path length is limited. Files that exceed the path length will not be downloaded.
3534
- The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
35+
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
36+
- Downloading directly into a network share is not recommended. The sqlite locking mechanism may cause issues. If you need to download into a network share, set the `--metadata` argument to a local path.
3637

3738
<br>
3839
<br>
@@ -85,6 +86,9 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
8586
- **`-o`**, **`--output`**:<br>
8687
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
8788

89+
- **`-m`**, **`--metadata`**<br>
90+
Changes the folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). Defaults to the output folder. If you are downloading into a network share, you SHOULD set this to a local path, because the sqlite locking mechanism may cause issues on network shares.
91+
8892
<!-- - **`--verbosity`** `<level>`:<br>
8993
Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
9094

@@ -99,13 +103,27 @@ Sets the number of simultaneous download workers. Default is 1, safe range is ab
99103

100104
- **`--no-redirect`**:<br>
101105
Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
102-
106+
103107
- **`--retry`** `<attempts>`:<br>
104108
Specifies number of retry attempts for failed downloads.
105109

106110
- **`--delay`** `<seconds>`:<br>
107111
Specifies delay between download requests in seconds. Default is no delay (0).
108112

113+
- **`--verbose`**:<br>
114+
Increase output verbosity.
115+
- verbose:
116+
```
117+
-----> Worker: 2 - Attempt: [1/1] Snapshot ID: [23/81]
118+
SUCCESS -> 200 OK
119+
-> URL: https://web.archive.org/web/20240225193302id_/https://example.com/assets/css/custom-styles.css
120+
-> FILE: /home/manjaro/Stuff/python-wayback-machine-downloader/waybackup_snapshots/example.com/20240225193302id_/assets/css/custom-styles.css
121+
```
122+
- non-verbose:
123+
```
124+
55/81 - W:2 - SUCCESS - 20240225193302 - https://example.com/assets/css/custom-styles.css
125+
```
126+
109127
<!-- - **`--convert-links`**:<br>
110128
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
111129

dev/pip_build.sh

Lines changed: 0 additions & 12 deletions
This file was deleted.

dev/venv_create.sh

Lines changed: 0 additions & 17 deletions
This file was deleted.

pyproject.toml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
77

88
[project]
99
name = "pywaybackup"
10-
version = "3.1.0"
10+
version = "3.2.0"
1111
description = "Query and download archive.org as simple as possible."
1212
authors = [
1313
{ name = "bitdruid", email = "bitdruid@outlook.com" }
@@ -16,9 +16,10 @@ license = { file = "LICENSE" }
1616
readme = "README.md"
1717
requires-python = ">=3.8"
1818
dependencies = [
19-
"pysqlite3-binary==0.5.4",
20-
"requests==2.31.0",
21-
"tqdm==4.66.2",
19+
"pysqlite3-binary==0.5.4; sys_platform == 'linux'",
20+
"pysqlite-binary; sys_platform == 'win32'",
21+
"requests==2.32.3",
22+
"tqdm==4.67.1",
2223
"python-magic==0.4.27; sys_platform == 'linux'",
2324
"python-magic-bin==0.4.14; sys_platform == 'win32'",
2425
]

pywaybackup/Arguments.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,14 @@
66
from importlib.metadata import version
77

88
from pywaybackup.helper import url_split, sanitize_filename
9-
from pywaybackup.Exception import Exception as ex
109

1110
class Arguments:
12-
11+
1312
def __init__(self):
14-
13+
1514
parser = argparse.ArgumentParser(description='Download from wayback machine (archive.org)')
1615
parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version("pywaybackup") + ' by @bitdruid -> https://github.com/bitdruid')
17-
16+
1817
required = parser.add_argument_group('required (one exclusive)')
1918
required.add_argument('-u', '--url', type=str, metavar="", help='url (with subdir/subdomain) to download')
2019
exclusive_required = required.add_mutually_exclusive_group(required=True)
@@ -32,16 +31,17 @@ def __init__(self):
3231
optional.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')
3332

3433
behavior = parser.add_argument_group('manipulate behavior')
35-
behavior.add_argument('-o', '--output', type=str, metavar="", help='output folder - defaults to current directory')
34+
behavior.add_argument('-o', '--output', type=str, metavar="", help='output for all files - defaults to current directory')
35+
behavior.add_argument('-m', '--metadata', type=str, metavar="", help='change directory for db/cdx/csv/log files')
3636
behavior.add_argument('--log', action='store_true', help='save a log file into the output folder')
3737
behavior.add_argument('--progress', action='store_true', help='show a progress bar')
3838
behavior.add_argument('--no-redirect', action='store_true', help='do not follow redirects by archive.org')
39-
#behavior.add_argument('--verbosity', type=str, default="info", metavar="", help='verbosity level (info, trace)')
4039
behavior.add_argument('--retry', type=int, default=0, metavar="", help='retry failed downloads (opt tries as int, else infinite)')
4140
behavior.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
4241
# behavior.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
4342
behavior.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')
44-
43+
behavior.add_argument('--verbose', action='store_true', help='overwritten by progress - gives detailed output')
44+
4545
special = parser.add_argument_group('special')
4646
special.add_argument('--reset', action='store_true', help='reset the job and ignore existing cdx/db/csv files')
4747
special.add_argument('--keep', action='store_true', help='keep all files after the job finished')
@@ -75,11 +75,11 @@ def init(cls):
7575

7676
if cls.output is None:
7777
cls.output = os.path.join(os.getcwd(), "waybackup_snapshots")
78+
if cls.metadata is None:
79+
cls.metadata = cls.output
7880
os.makedirs(cls.output, exist_ok=True) if not cls.save else None
79-
80-
if cls.log is True:
81-
cls.log = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.log")
82-
81+
os.makedirs(cls.metadata, exist_ok=True) if not cls.save else None
82+
8383
if cls.all:
8484
cls.mode = "all"
8585
if cls.last:
@@ -91,10 +91,13 @@ def init(cls):
9191

9292
if cls.filetype:
9393
cls.filetype = [ft.lower().strip() for ft in cls.filetype.split(",")]
94-
95-
cls.cdxfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.cdx")
96-
cls.dbfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.db")
97-
cls.csvfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.csv")
94+
95+
base_path = cls.metadata
96+
base_name = f"waybackup_{sanitize_filename(cls.url)}"
97+
cls.cdxfile = os.path.join(base_path, f"{base_name}.cdx")
98+
cls.dbfile = os.path.join(base_path, f"{base_name}.db")
99+
cls.csvfile = os.path.join(base_path, f"{base_name}.csv")
100+
cls.log = os.path.join(base_path, f"{base_name}.log") if cls.log else None
98101

99102
if cls.reset:
100103
os.remove(cls.cdxfile) if os.path.isfile(cls.cdxfile) else None

pywaybackup/Converter.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def define_root_steps(cls, filepath) -> str:
2929

3030

3131
@classmethod
32-
def links(cls, filepath, status_message=None):
32+
def links(cls, filepath, status_content=None):
3333
"""
3434
Convert all links in a HTML / CSS / JS file to local paths.
3535
"""
@@ -72,7 +72,7 @@ def local_url(original_url, domain, count) -> str:
7272
if original_url.startswith("//"):
7373
external = True
7474
if external:
75-
status_message.trace(status="", type=f"{count}/{len(links)}", message="External url")
75+
status_message.trace(status="", info=f"{count}/{len(links)}", content="External url")
7676
return original_url
7777

7878
# convert the url to a relative path to the local root (download dir) if it's a valid path, else return the original url
@@ -87,7 +87,7 @@ def local_url(original_url, domain, count) -> str:
8787
if original_url.startswith("../"): # if file is already ../ check if it's not too many steps up
8888
original_url = f"{cls.define_root_steps(filepath)}{original_url.split('../')[-1].lstrip('/')}"
8989
else:
90-
status_message.trace(status="", type="", message=f"{count}/{len(links)}: URL is not a valid path")
90+
status_message.trace(status="", info="", content=f"{count}/{len(links)}: URL is not a valid path")
9191

9292
return original_url
9393

@@ -158,24 +158,24 @@ def validate_path(filepath: str) -> bool:
158158

159159
if os.path.isfile(filepath):
160160
if magic.from_file(filepath, mime=True).split("/")[1] == "javascript":
161-
status_message.trace(status="Error", type="", message="JS-file is not supported")
161+
status_message.trace(status="Error", info="", content="JS-file is not supported")
162162
return
163163
try:
164-
with open(filepath, "r") as file:
164+
with open(filepath, "r", encoding="utf-8") as file:
165165
domain = config.domain
166166
content = file.read()
167167
links = extract_urls(content)
168-
status_message.store(message=f"\n-----> Convert: [{len(links)}] links in file")
168+
status_message.store(verbose=True, content=f"\n-----> Convert: [{len(links)}] links in file")
169169
count = 1
170170
for original_link in links:
171-
status_message.trace(status="ORIG", type=f"{count}/{len(links)}", message=original_link)
171+
status_message.trace(status="ORIG", info=f"{count}/{len(links)}", content=original_link)
172172
new_link = local_url(original_link, domain, count)
173173
if new_link != original_link:
174-
status_message.trace(status="CONV", type=f"{count}/{len(links)}", message=new_link)
174+
status_message.trace(status="CONV", info=f"{count}/{len(links)}", content=new_link)
175175
content = content.replace(original_link, new_link)
176176
count += 1
177-
file = open(filepath, "w")
177+
file = open(filepath, "w", encoding="utf-8")
178178
file.write(content)
179179
file.close()
180180
except UnicodeDecodeError:
181-
status_message.trace(status="Error", type="", message="Could not decode file to convert links")
181+
status_message.trace(status="Error", info="", content="Could not decode file to convert links")

pywaybackup/Exception.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,16 @@ class Exception:
1515

1616
@classmethod
1717
def init(cls, output=None, command=None):
18-
sys.excepthook = (
19-
cls.exception_handler
20-
) # set custom exception handler (uncaught exceptions)
18+
sys.excepthook = cls.exception_handler # set custom exception handler (uncaught exceptions)
2119
cls.output = output
2220
cls.command = command
2321

2422
@classmethod
2523
def exception(cls, message: str, e: Exception, tb=None):
2624
custom_tb = sys.exc_info()[-1] if tb is None else tb
27-
original_tb = cls.relativate_path(
28-
"".join(traceback.format_exception(type(e), e, e.__traceback__))
29-
)
25+
original_tb = cls.relativate_path("".join(traceback.format_exception(type(e), e, e.__traceback__)))
3026
exception_message = f"-------------------------\n!-- Exception: {message}\n"
27+
local_vars = {}
3128
if custom_tb is not None:
3229
while custom_tb.tb_next: # loop to last traceback frame
3330
custom_tb = custom_tb.tb_next
@@ -53,14 +50,14 @@ def exception(cls, message: str, e: Exception, tb=None):
5350
# print(f"Full traceback:\n{original_tb}")
5451
if cls.new_debug: # new run, overwrite file
5552
cls.new_debug = False
56-
f = open(debug_file, "w")
53+
f = open(debug_file, "w", encoding="utf-8")
5754
f.write("-------------------------\n")
5855
f.write(f"Version: {version('pywaybackup')}\n")
5956
f.write("-------------------------\n")
6057
f.write(f"Command: {cls.command}\n")
6158
f.write("-------------------------\n\n")
6259
else: # current run, append to file
63-
f = open(debug_file, "a")
60+
f = open(debug_file, "a", encoding="utf-8")
6461
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
6562
f.write(exception_message + "\n")
6663
f.write("!-- Local Variables:\n")
@@ -75,16 +72,16 @@ def exception(cls, message: str, e: Exception, tb=None):
7572
f.close()
7673

7774
@classmethod
78-
def relativate_path(cls, input: str) -> str:
75+
def relativate_path(cls, input_str: str) -> str:
7976
try:
8077
path_pattern = re.compile(r'File "([^"]+)"')
81-
if os.path.isfile(input): # case single path
82-
return os.path.relpath(input, os.getcwd())
78+
if os.path.isfile(input_str): # case single path
79+
return os.path.relpath(input_str, os.getcwd())
8380
input_modified = ""
84-
input_lines = input.split("\n")
81+
input_lines = input_str.split("\n")
8582
if len(input_lines) == 1: # case single line
86-
return input
87-
for line in input.split("\n"): # case multiple lines
83+
return input_str
84+
for line in input_str.split("\n"): # case multiple lines
8885
match = path_pattern.search(line)
8986
if match:
9087
original_path = match.group(1)
@@ -93,13 +90,11 @@ def relativate_path(cls, input: str) -> str:
9390
input_modified += line + "\n"
9491
return input_modified
9592
except ValueError:
96-
return input
93+
return input_str
9794

9895
@staticmethod
9996
def exception_handler(exception_type, exception, traceback):
10097
if issubclass(exception_type, KeyboardInterrupt):
10198
sys.__excepthook__(exception_type, exception, traceback)
10299
return
103-
Exception.exception(
104-
"UNCAUGHT EXCEPTION", exception, traceback
105-
) # uncaught exceptions also with custom scheme
100+
Exception.exception("UNCAUGHT EXCEPTION", exception, traceback) # uncaught exceptions also with custom scheme

0 commit comments

Comments
 (0)