implement load_function kwarg for collect_results! (#424)

NuclearPowerNerd · web-flow · commit ac0fe7f3c2c8 · 2024-08-23T10:51:17.000+01:00
The changes in this branch are a follow-up to a previous pull request based on commit 6e6ff07 in PR #421. In that PR there were issues with whitespace changes inadvertently introduced by the autoformatter in VS Code. Reverting the whitespace-only changes proved to be more difficult than anticipated. To resolve this, this branch was created and a new PR will be created from it. The whitespace issues are gone, but all the feedback and changes from the original PR are retained. The commit makes the following changes. - add the `load_function` kwarg to `collect_results`. This allows customizing how data is loaded from file before being processed into a dataframe by `collect_results`. - add a test to `update_results_tests.jl` - update docstring of `collect_results` - increase package version to 2.16.0 - update `CHANGELOG.md` All tests passed, 589 of 589.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 2.16.0
+
+ - Add `load_function` keyword argument to `collect_results` to customize how data is loaded from file before being converted to a dataframe by `collect_results`
+
 # 2.15.0
 
  - Add `wload_kwargs` to `produce_or_load` to allow passing kwargs to `wload`
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "DrWatson"
 uuid = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1"
 repo = "https://github.com/JuliaDynamics/DrWatson.jl.git"
-version = "2.15.0"
+version = "2.16.0"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
diff --git a/src/result_collection.jl b/src/result_collection.jl
@@ -50,6 +50,7 @@ See also [`collect_results`](@ref).
 * `black_list = [:gitcommit, :gitpatch, :script]`: List of keys not to include from result-file.
 * `special_list = []`: List of additional (derived) key-value pairs
   to put in `df` as explained below.
+* `load_function = wload`: Function used to load each result file; it is called with the file name. Defaults to `wload`. A custom load function is useful, for example, if you store results as a struct and want the fields of the struct to form the columns of the dataframe. The struct is saved to file as a one-element dictionary, so the dataframe will only have a single column. To work around this, convert the struct to a dictionary by specifying `load_function = (filename) -> struct2dict(wload(filename)["mykey"])`. This way `collect_results` will receive a `Dict` whose keys are the fields of the struct.
 
 `special_list` is a `Vector` where each entry
 is a derived quantity to be included in `df`. There are two types of entries.
@@ -90,6 +91,7 @@ function collect_results!(filename, folder;
     newfile = false, # keyword only for defining collect_results without !
     rinclude = [r""],
     rexclude = [r"^\b$"],
+    load_function = wload,
     kwargs...)
 
     @assert all(eltype(r) <: Regex for r in (rinclude, rexclude)) "Elements of `rinclude` and `rexclude` must be Regex expressions."
@@ -100,7 +102,7 @@ function collect_results!(filename, folder;
         mtimes = Dict{String,Float64}()
     else
         verbose && @info "Loading existing result collection..."
-        data = wload(filename)
+        data = load_function(filename)
         df = data["df"]
         # Check if we have pre-recorded mtimes (if not this could be because of an old results database).
         if "mtime" ∈ keys(data)
@@ -170,7 +172,7 @@ function collect_results!(filename, folder;
         mtimes[file] = mtime_file
 
         fpath = rpath === nothing ? file : joinpath(rpath, file)
-        df_new = to_data_row(FileIO.query(fpath); kwargs...)
+        df_new = to_data_row(FileIO.query(fpath); load_function=load_function, kwargs...)
         #add filename
         df_new[!, :path] .= file
         if replace_entry
@@ -231,18 +233,17 @@ is_valid_file(file, valid_filetypes) =
     any(endswith(file, v) for v in valid_filetypes)
 
 # Use wload per default when nothing else is available
-function to_data_row(file::File; kwargs...)
+function to_data_row(file::File; load_function=wload, kwargs...)
     fpath = filename(file)
     @debug "Opening $(filename(file)) with fallback wload."
-    return to_data_row(wload(fpath), fpath; kwargs...)
+    return to_data_row(load_function(fpath), fpath; kwargs...)
 end
 # Specialize for JLD2 files, can do much faster mmapped access
-function to_data_row(file::File{format"JLD2"}; kwargs...)
+function to_data_row(file::File{format"JLD2"}; load_function=(filename) -> JLD2.jldopen(filename, "r"), kwargs...)
     fpath = filename(file)
     @debug "Opening $(filename(file)) with jldopen."
-    JLD2.jldopen(filename(file), "r") do data
-        return to_data_row(data, fpath; kwargs...)
-    end
+    data = load_function(fpath)
+    return to_data_row(data, fpath; kwargs...)
 end
 function to_data_row(data, file;
         white_list = collect(keys(data)),
diff --git a/test/update_results_tests.jl b/test/update_results_tests.jl
@@ -64,6 +64,22 @@ cres_relpath = collect_results!(relpathname, folder;
     rpath = projectdir())
 @info all(startswith.(cres[!,"path"], "data"))
 
+struct dummy
+    a::Float64
+    b::Int64
+    c::Matrix{Float64}
+end
+_dummy_matrix = rand(3,3)
+_dummy = dummy(1.0, 1, _dummy_matrix)
+wsave(datadir("dummy.jld2"), "dummy", _dummy)
+
+actual_dataframe = collect_results(datadir(), rinclude=[r"dummy.jld2"], load_function=(filename) -> struct2dict(wload(filename)["dummy"]))
+_dataframe_vector = Vector{Union{Missing, Matrix{Float64}}}(undef, 1)
+_dataframe_vector[1] = _dummy_matrix
+expected_dataframe = DataFrame(a = 1.0, b = 1, c = _dataframe_vector, path = datadir("dummy.jld2"))
+
+@test actual_dataframe == expected_dataframe
+
 ###############################################################################
 #                           Trailing slash in foldername                      #
 ###############################################################################