Skip to content

Commit 4cc1893

Browse files
authored
Use jldopen in collect_results! when scanning JLD2 files. (#293)
* Use `jldopen` in `collect_results!` when scanning JLD2 files. Employ lazy loading via Requires.jl to allow special load functions for particular file types in `collect_results!`, while falling back to `wload` by default. This potentially makes scanning collections of large files much faster when using the `black_list` argument of `collect_results!`. An example use case is a set of simulations that generate large amounts of data, where the parameters are stored in a Dict together with the data under the key `data`, all saved in a JLD2 file. If we only want to enumerate all the parameters that have been run, we would use `collect_results(datadir("simulations"), black_list = ("data",))`. With `wload`, this still loads the whole file into memory before ignoring the `data` key. With `jldopen`, only the parameters are loaded into memory. In my tests, this leads to a speedup of around 400x when scanning large collections of files. For now, only `jldopen` for JLD2 files is implemented, but this could potentially be extended to other data storage formats that allow memory-mapped access to storage keys. * Fix typo. * Use `Base.require` to try to load JLD2 instead of relying on Requires.jl. * Add JLD2 as dependency, simplify code. * Update minor version number and changelog.
1 parent e767799 commit 4cc1893

File tree

5 files changed

+43
-3
lines changed

5 files changed

+43
-3
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# 2.6.0
2+
* Use `JLD2`'s `jldopen` in `collect_results!` to speed up loading of metadata.
13
# 2.5.0
24
* Add an `update` option of `collect_results!` allowing the updating of an existing results collection if data files were modified or deleted.
35
# 2.4.1

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
name = "DrWatson"
22
uuid = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1"
33
repo = "https://github.com/JuliaDynamics/DrWatson.jl.git"
4-
version = "2.5.0"
4+
version = "2.6.0"
55

66
[deps]
77
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
88
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
9+
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
910
LibGit2 = "76f85450-5226-5b5a-8eaa-529ad045b433"
1011
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
1112
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -15,6 +16,7 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
1516

1617
[compat]
1718
FileIO = "1.0.6"
19+
JLD2 = "0.4.15"
1820
MacroTools = "0.5"
1921
Requires = "0.5.2, 0.6, 1"
2022
UnPack = "1.0.1"

src/DrWatson.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ export @pack!, @unpack
1717

1818
# Functionality that saves/loads
1919
using FileIO
20+
using JLD2
2021
export save, load
2122
export wsave, wload
2223

src/result_collection.jl

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,8 @@ function collect_results!(filename, folder;
148148
# Now update the mtime of the new or modified file
149149
mtimes[file] = mtime_file
150150

151-
data = rpath === nothing ? wload(file) : wload(joinpath(rpath, file))
152-
df_new = to_data_row(data, file; kwargs...)
151+
fpath = rpath === nothing ? file : joinpath(rpath, file)
152+
df_new = to_data_row(FileIO.query(fpath); kwargs...)
153153
#add filename
154154
df_new[!, :path] .= file
155155
if replace_entry
@@ -209,6 +209,20 @@ end
209209
is_valid_file(file, valid_filetypes) =
210210
any(endswith(file, v) for v in valid_filetypes)
211211

212+
# Generic fallback for any FileIO `File` that has no more specific method:
# load the file's contents with `wload` and build the data row from them.
# The path obtained from `filename(file)` is forwarded as the `file` argument
# of `to_data_row(data, file; kwargs...)`, together with all keyword arguments.
function to_data_row(file::File; kwargs...)
    path = filename(file)
    @debug "Opening $(filename(file)) with fallback wload."
    loaded = wload(path)
    return to_data_row(loaded, path; kwargs...)
end
218+
# Specialization for JLD2 files: instead of loading the whole file, open it
# read-only with `JLD2.jldopen` and pass the open file handle on to
# `to_data_row(data, file; kwargs...)`. The result of that call is returned
# once the `do` block finishes.
# Per the original author's note, this permits much faster mmapped access.
function to_data_row(file::File{format"JLD2"}; kwargs...)
    jldpath = filename(file)
    @debug "Opening $(filename(file)) with jldopen."
    return JLD2.jldopen(jldpath, "r") do handle
        to_data_row(handle, jldpath; kwargs...)
    end
end
212226
function to_data_row(data, file;
213227
white_list = collect(keys(data)),
214228
black_list = keytype(data).((:gitcommit, :gitpatch, :script)),

test/update_results_tests.jl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,27 @@ subfolders = true, special_list=special_list, black_list = black_list)
195195
end
196196
end
197197

198+
###############################################################################
199+
# test jldopen #
200+
###############################################################################
201+
202+
mktempdir(datadir()) do folder
    # Save one data file into the temporary folder. The file name is derived
    # from `d` with the "value" entry excluded via `ignores`.
    d = Dict("idx" => 1, "value" => rand(100000))
    fname = joinpath(folder, savename(d, ending, ignores = ("value",)))
    DrWatson.wsave(fname, d)

    # Choose the debug message expected for the current file ending.
    msg_re = ending == "jld2" ?
        r"Opening .* with jldopen." :
        r"Opening .* with fallback wload."
    # Keep this macro call in space-separated form on a single line.
    @test_logs (:debug, msg_re) min_level=Base.CoreLogging.Debug match_mode=:any cres = collect_results(folder, black_list = ("value",))

    @test cres.idx[1] == 1 # It's what we've saved above.
    @test size(cres, 1) == 1 # only one file
    @test size(cres, 2) == 2 # idx and path
end
198219

199220
###############################################################################
200221
# Quickactivate macro #

0 commit comments

Comments
 (0)