Skip to content

Commit 78c6535

Browse files
committed
Merge branch 'r/1.5.0'
1 parent 388cef2 commit 78c6535

File tree

9 files changed

+150
-94
lines changed

9 files changed

+150
-94
lines changed

README.md

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,11 @@ This tool allows you to download content from the Wayback Machine (archive.org).
2828
```pip install .```
2929
- in a virtual env or use `--break-system-packages`
3030

31-
## Usage infos
31+
## Usage infos - important notes
3232

3333
- Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
3434
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
35+
- The tool will inform you if your query has an immense number of snapshots, which could consume your system memory and lead to a crash. Consider splitting your query into smaller jobs by specifying a range, e.g. `--start 2023 --end 2024` or `--range 1`.
3536

3637
## Arguments
3738

@@ -97,10 +98,13 @@ Specifies number of retry attempts for failed downloads.
9798
- **`--delay`** `<seconds>`:<br>
9899
Specifies delay between download requests in seconds. Default is no delay (0).
99100

101+
- **`--limit`** `<count>`:<br>
102+
Limits the number of snapshots to query from the CDX server. If an existing CDX file is injected (with `--cdxinject` or `--auto`), the limit will have no effect.
103+
100104
<!-- - **`--convert-links`**:<br>
101105
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
102106

103-
**CDX Query Handling:**
107+
**CDX Query Result Handling:**
104108
- **`--cdxbackup`** `<path>`:<br>
105109
Path defaults to output-dir. Saves the result of CDX query as a file. Useful for later downloading snapshots and overcoming refused connections by CDX server due to too many queries. Named as `waybackup_<sanitized_url>.cdx`.
106110

@@ -111,10 +115,6 @@ Injects a CDX query file to download snapshots. Ensure the query matches the pre
111115
- **`--auto`**:<br>
112116
If set, csv, skip and cdxbackup/cdxinject are handled automatically. Keep the files and folders as they are. Otherwise they will not be recognized when restarting a download.
113117

114-
### Debug
115-
116-
- `--debug`: If set, full traceback will be printed in case of an error. The full exception will be written into `waybackup_error.log`.
117-
118118
### Examples
119119

120120
Download latest snapshot of all files:<br>
@@ -216,6 +216,10 @@ For list queries:
216216

217217
The csv contains the json response in a table format.
218218

219+
### Debugging
220+
221+
Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
222+
219223
## Contributing
220224

221225
I'm always happy to receive feature requests that improve the usability of this tool.

pywaybackup/Arguments.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ def __init__(self):
1313

1414
parser = argparse.ArgumentParser(description='Download from wayback machine (archive.org)')
1515
parser.add_argument('-a', '--about', action='version', version='%(prog)s ' + __version__ + ' by @bitdruid -> https://github.com/bitdruid')
16-
parser.add_argument('-d', '--debug', action='store_true', help='Debug mode (Always full traceback and creates an error.log')
1716

1817
required = parser.add_argument_group('required (one exclusive)')
1918
required.add_argument('-u', '--url', type=str, metavar="", help='url (with subdir/subdomain) to download')
@@ -40,6 +39,7 @@ def __init__(self):
4039
special.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
4140
# special.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
4241
special.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')
42+
special.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')
4343

4444
cdx = parser.add_argument_group('cdx (one exclusive)')
4545
exclusive_cdx = cdx.add_mutually_exclusive_group()
@@ -84,6 +84,8 @@ def init(cls):
8484
if cls.current:
8585
cls.mode = "current"
8686

87+
cls.cdxbackup = cls.output if cls.cdxbackup is None else cls.cdxbackup
88+
8789
if cls.auto:
8890
cls.skip = cls.output
8991
cls.csv = cls.output

pywaybackup/Exception.py

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,14 @@
1212
class Exception:
1313

1414
new_debug = True
15-
debug = False
1615
output = None
1716
command = None
1817

1918
@classmethod
20-
def init(cls, debug=False, output=None, command=None):
19+
def init(cls, output=None, command=None):
2120
sys.excepthook = cls.exception_handler # set custom exception handler (uncaught exceptions)
2221
cls.output = output
2322
cls.command = command
24-
cls.debug = True if debug else False
2523

2624
@classmethod
2725
def exception(cls, message: str, e: Exception, tb=None):
@@ -53,33 +51,32 @@ def exception(cls, message: str, e: Exception, tb=None):
5351
"-------------------------"
5452
)
5553
print(exception_message)
56-
if cls.debug:
57-
debug_file = os.path.join(cls.output, "waybackup_error.log")
58-
print(f"Exception log: {debug_file}")
59-
print("-------------------------")
60-
print(f"Full traceback:\n{original_tb}")
61-
if cls.new_debug: # new run, overwrite file
62-
cls.new_debug = False
63-
f = open(debug_file, "w")
64-
f.write("-------------------------\n")
65-
f.write(f"Version: {__version__}\n")
66-
f.write("-------------------------\n")
67-
f.write(f"Command: {cls.command}\n")
68-
f.write("-------------------------\n\n")
69-
else: # current run, append to file
70-
f = open(debug_file, "a")
71-
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
72-
f.write(exception_message + "\n")
73-
f.write("!-- Local Variables:\n")
74-
for var_name, value in local_vars.items():
75-
if var_name in ["status_message", "headers"]:
76-
continue
77-
value = cls.relativate_path(str(value))
78-
value = value[:666] + " ... " if len(value) > 666 else value
79-
f.write(f" -- {var_name} = {value}\n")
54+
debug_file = os.path.join(cls.output, "waybackup_error.log")
55+
print(f"Exception log: {debug_file}")
56+
# print("-------------------------")
57+
# print(f"Full traceback:\n{original_tb}")
58+
if cls.new_debug: # new run, overwrite file
59+
cls.new_debug = False
60+
f = open(debug_file, "w")
8061
f.write("-------------------------\n")
81-
f.write(original_tb + "\n")
82-
f.close()
62+
f.write(f"Version: {__version__}\n")
63+
f.write("-------------------------\n")
64+
f.write(f"Command: {cls.command}\n")
65+
f.write("-------------------------\n\n")
66+
else: # current run, append to file
67+
f = open(debug_file, "a")
68+
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
69+
f.write(exception_message + "\n")
70+
f.write("!-- Local Variables:\n")
71+
for var_name, value in local_vars.items():
72+
if var_name in ["status_message", "headers"]:
73+
continue
74+
value = cls.relativate_path(str(value))
75+
value = value[:666] + " ... " if len(value) > 666 else value
76+
f.write(f" -- {var_name} = {value}\n")
77+
f.write("-------------------------\n")
78+
f.write(original_tb + "\n")
79+
f.close()
8380

8481
@classmethod
8582
def relativate_path(cls, input: str) -> str:

pywaybackup/SnapshotCollection.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from pywaybackup.helper import url_split
2+
import json
23
import os
34

45
class SnapshotCollection:
@@ -7,15 +8,27 @@ class SnapshotCollection:
78
MODE_CURRENT = 0
89

910
@classmethod
10-
def create_list(cls, cdxResult, mode):
11+
def create_list(cls, cdxfile, mode):
1112
"""
1213
Create the snapshot collection list from a cdx result.
1314
1415
- mode `full`: All snapshots are included.
1516
- mode `current`: Only the latest snapshot of each file is included.
1617
"""
17-
# creates a list of dictionaries for each snapshot entry
18-
cls.SNAPSHOT_COLLECTION = sorted([{"timestamp": snapshot[0], "digest": snapshot[1], "mimetype": snapshot[2], "status": snapshot[3], "url": snapshot[4]} for snapshot in cdxResult[1:]], key=lambda k: k['timestamp'], reverse=True)
18+
with open(cdxfile, "r") as f:
19+
first_line = True
20+
for line in f:
21+
if first_line:
22+
first_line = False
23+
continue
24+
line = line.strip()
25+
if line.endswith("]]"): line = line.rsplit("]", 1)[0]
26+
if line.endswith(","): line = line.rsplit(",", 1)[0]
27+
else: continue # drop incomplete line, maybe cdx response was cut off
28+
line = json.loads(line)
29+
line = {"timestamp": line[0], "digest": line[1], "mimetype": line[2], "status": line[3], "url": line[4]}
30+
cls.SNAPSHOT_COLLECTION.append(line)
31+
cls.SNAPSHOT_COLLECTION = sorted(cls.SNAPSHOT_COLLECTION, key=lambda k: k['timestamp'], reverse=True)
1932
if mode == "current":
2033
cls.MODE_CURRENT = 1
2134
cdxResult_list_filtered = []

pywaybackup/Verbosity.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,13 @@ def __init__(self):
8888
self.message = {}
8989

9090
def __str__(self):
91-
return self.message
91+
return str(self.message)
9292

9393
def store(self, status: str = "", type: str = "", message: str = "", level: str = "info"):
9494
if level not in self.message:
9595
self.message[level] = []
9696
self.message[level].append(super().generate_logline(status, type, message))
97+
#super().write(message=f"Stored message: {status} -> {type}: {message}")
9798

9899
def clear(self):
99100
self.message = {}

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.4.0"
1+
__version__ = "1.5.0"

0 commit comments

Comments
 (0)