From a88bbf362b4dbc3888d6511e8de454d2eacb44ef Mon Sep 17 00:00:00 2001 From: meggart Date: Mon, 7 Apr 2025 13:47:58 +0200 Subject: [PATCH 1/9] make cat almost type-stable --- src/cat.jl | 63 ++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/src/cat.jl b/src/cat.jl index 0f52321..5282e67 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -21,46 +21,17 @@ end function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray{<:Any,N},M}) where {N,M} T = mapreduce(eltype, promote_type, init=eltype(first(arrays)), arrays) - function othersize(x, id) - return (x[1:(id-1)]..., x[(id+1):end]...) - end if N > M - newshape = (size(arrays)..., ntuple(_ -> 1, N - M)...) + newshape = extenddims(size(arrays), size(first(arrays))) + @show newshape arrays1 = reshape(arrays, newshape) D = N - elseif N < M - arrays1 = map(arrays) do a - newshape = (size(a)..., ntuple(_ -> 1, M - N)...) - reshape(a, newshape) - end - D = M else arrays1 = arrays D = M end - arraysizes = map(size, arrays1) - si = ntuple(D) do id - a = reduce(arraysizes; dims=id, init=ntuple(zero, D)) do i, j - if all(iszero, i) - j - elseif othersize(i, id) == othersize(j, id) - j - else - error("Dimension sizes don't match") - end - end - I = ntuple(D) do i - i == id ? Colon() : 1 - end - ari = map(i -> i[id], arraysizes[I...]) - sl = sum(ari) - r = cumsum(ari) - pop!(pushfirst!(r, 0)) - r .+ 1, sl - end - - startinds = map(first, si) - sizes = map(last, si) + startinds, sizes = arraysize_and_startinds(arrays1) + @show startinds, sizes chunks = concat_chunksize(D, arrays1) hc = Chunked(batchstrategy(chunks)) @@ -73,9 +44,35 @@ function ConcatDiskArray(arrays::AbstractArray) error("Arrays don't have the same dimensions") return error("Should not be reached") end +extenddims(a::NTuple{N,Int},b::NTuple{M,Int}) where {N,M} = extenddims((a...,1), b) +extenddims(a::NTuple{N,Int},b::NTuple{N,Int}) where {N} = a Base.size(a::ConcatDiskArray) = a.size +function arraysize_and_startinds(arrays1) + sizes = map(i->zeros(Int,i),size(arrays1)) + for i in CartesianIndices(arrays1) + ai = arrays1[i] + sizecur = size(ai) + foreach(sizecur,i.I,sizes) do si, ind, sizeall + if sizeall[ind] == 0 + #init the size + sizeall[ind] = si + elseif sizeall[ind] != si + throw(ArgumentError("Array sizes don't form a grid")) + end + end + end + r = map(sizes) do sizeall + pushfirst!(sizeall, 1) + for i in 2:length(sizeall) + sizeall[i] = sizeall[i-1]+sizeall[i] + end + pop!(sizeall)-1,sizeall + end + map(last, r), map(first, r) +end + # DiskArrays interface eachchunk(a::ConcatDiskArray) = a.chunks From 5d913ffef432c6bea09e4c7a504ed13b1ade7cc6 Mon Sep 17 00:00:00 2001 From: meggart Date: Mon, 7 Apr 2025 14:34:02 +0200 Subject: [PATCH 2/9] faster chunk estimation --- src/cat.jl | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/cat.jl b/src/cat.jl index 5282e67..5992703 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -22,8 +22,7 @@ function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray{<:Any,N},M}) wher T = mapreduce(eltype, promote_type, init=eltype(first(arrays)), arrays) if N > M - newshape = extenddims(size(arrays), size(first(arrays))) - @show newshape + newshape = extenddims(size(arrays), size(first(arrays)),1) arrays1 = reshape(arrays, newshape) D = N else @@ -31,9 +30,8 @@ function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray{<:Any,N},M}) wher D = M end startinds, sizes = arraysize_and_startinds(arrays1) - @show startinds, sizes - chunks = concat_chunksize(D, arrays1) + chunks = concat_chunksize(arrays1) hc = Chunked(batchstrategy(chunks)) return ConcatDiskArray{T,D,typeof(arrays1),typeof(chunks),typeof(hc)}(arrays1, startinds, sizes, chunks, hc) @@ -44,8 +42,8 @@ function ConcatDiskArray(arrays::AbstractArray) error("Arrays don't have the same dimensions") return error("Should not be reached") end -extenddims(a::NTuple{N,Int},b::NTuple{M,Int}) where {N,M} = extenddims((a...,1), b) -extenddims(a::NTuple{N,Int},b::NTuple{N,Int}) where {N} = a +extenddims(a::NTuple{N},b::NTuple{M},fillval) where {N,M} = extenddims((a...,fillval), b, fillval) +extenddims(a::NTuple{N},_::NTuple{N},_) where {N} = a Base.size(a::ConcatDiskArray) = a.size @@ -113,15 +111,24 @@ function _concat_diskarray_block_io(f, a::ConcatDiskArray, inds...) end end -function concat_chunksize(N, parents) - oldchunks = map(eachchunk, parents) - newchunks = ntuple(N) do i - sliceinds = Base.setindex(ntuple(_ -> 1, N), :, i) - v = map(c -> c.chunks[i], oldchunks[sliceinds...]) +function concat_chunksize(parents) + newchunks = map(s->Vector{Union{RegularChunks, IrregularChunks}}(undef, s) ,size(parents)) + for i in CartesianIndices(parents) + array = parents[i] + chunks = eachchunk(array) + foreach(chunks.chunks,i.I,newchunks) do c, ind, newc + if !isassigned(newc, ind) + newc[ind] = c + elseif c != newc[ind] + throw(ArgumentError("Chunk sizes don't forma grid")) + end + end + end + newchunks = map(newchunks) do v init = RegularChunks(approx_chunksize(first(v)), 0, 0) reduce(mergechunks, v; init=init) end - + extenddims(newchunks, size(parents), RegularChunks(1,0,1)) return GridChunks(newchunks...) end From 0f0407a4b8d6c61bd1f467e9161347447893e4f0 Mon Sep 17 00:00:00 2001 From: meggart Date: Mon, 7 Apr 2025 16:30:58 +0200 Subject: [PATCH 3/9] some bugfixes --- src/cat.jl | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/src/cat.jl b/src/cat.jl index 5992703..b45e397 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -42,8 +42,8 @@ function ConcatDiskArray(arrays::AbstractArray) error("Arrays don't have the same dimensions") return error("Should not be reached") end -extenddims(a::NTuple{N},b::NTuple{M},fillval) where {N,M} = extenddims((a...,fillval), b, fillval) -extenddims(a::NTuple{N},_::NTuple{N},_) where {N} = a +extenddims(a::Tuple{Vararg{<:Any,N}}, b::Tuple{Vararg{<:Any,M}}, fillval) where {N,M} = extenddims((a..., fillval), b, fillval) +extenddims(a::Tuple{Vararg{<:Any,N}}, _::Tuple{Vararg{<:Any,N}}, _) where {N} = a Base.size(a::ConcatDiskArray) = a.size @@ -51,7 +51,7 @@ function arraysize_and_startinds(arrays1) sizes = map(i->zeros(Int,i),size(arrays1)) for i in CartesianIndices(arrays1) ai = arrays1[i] - sizecur = size(ai) + sizecur = extenddims(size(ai), size(arrays1), 1) foreach(sizecur,i.I,sizes) do si, ind, sizeall if sizeall[ind] == 0 #init the size @@ -79,12 +79,14 @@ haschunks(c::ConcatDiskArray) = c.haschunks function readblock!(a::ConcatDiskArray, aout, inds::AbstractUnitRange...) # Find affected blocks and indices in blocks _concat_diskarray_block_io(a, inds...) do outer_range, array_range, I - aout[outer_range...] = a.parents[I][array_range...] + vout = view(aout, outer_range...) + readblock!(a.parents[I], vout, array_range...) end end function writeblock!(a::ConcatDiskArray, aout, inds::AbstractUnitRange...) _concat_diskarray_block_io(a, inds...) do outer_range, array_range, I - a.parents[I][array_range...] = aout[outer_range...] + data = view(aout, outer_range...) + writeblock!(a.parents[I], data, array_range) end end @@ -99,17 +101,26 @@ function _concat_diskarray_block_io(f, a::ConcatDiskArray, inds...) end map(CartesianIndices(blockinds)) do cI myar = a.parents[cI] - mysize = size(myar) + mysize = extenddims(size(myar), cI.I, 1) array_range = map(cI.I, a.startinds, mysize, inds) do ii, si, ms, indstoread max(first(indstoread) - si[ii] + 1, 1):min(last(indstoread) - si[ii] + 1, ms) end outer_range = map(cI.I, a.startinds, array_range, inds) do ii, si, ar, indstoread (first(ar)+si[ii]-first(indstoread)):(last(ar)+si[ii]-first(indstoread)) end - # aout[outer_range...] = a.parents[cI][array_range...] + #Shorten array range to shape of actual array + array_range = map((i, j) -> j, size(myar), array_range) + outer_range = fix_outerrangeshape(outer_range, array_range) f(outer_range, array_range, cI) end end +fix_outerrangeshape(outer_range, array_range) = fix_outerrangeshape((), outer_range, array_range) +fix_outerrangeshape(res, outer_range, array_range) = + fix_outerrangeshape((res..., first(outer_range)), Base.tail(outer_range), Base.tail(array_range)) +fix_outerrangeshape(res, outer_range, ::Tuple{}) = + fix_outerrangeshape((res..., only(first(outer_range))), Base.tail(outer_range), ()) +fix_outerrangeshape(res, ::Tuple{}, ::Tuple{}) = res + function concat_chunksize(parents) newchunks = map(s->Vector{Union{RegularChunks, IrregularChunks}}(undef, s) ,size(parents)) @@ -125,6 +136,13 @@ function concat_chunksize(parents) end end newchunks = map(newchunks) do v + #Chunks that have not been set are from additional dimensions in the parent array shape + for i in eachindex(v) + if !isassigned(v, i) + v[i] = RegularChunks(1, 0, 1) + end + end + # Merge the chunks init = RegularChunks(approx_chunksize(first(v)), 0, 0) reduce(mergechunks, v; init=init) end From c0aac9278678ae3b5a2c8269b26353e62d5bde98 Mon Sep 17 00:00:00 2001 From: meggart Date: Mon, 7 Apr 2025 16:56:57 +0200 Subject: [PATCH 4/9] Add a readblock fallback --- src/cat.jl | 22 +++++++++++----------- src/diskarray.jl | 8 ++++++++ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/cat.jl b/src/cat.jl index b45e397..1a1e330 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -22,7 +22,7 @@ function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray{<:Any,N},M}) wher T = mapreduce(eltype, promote_type, init=eltype(first(arrays)), arrays) if N > M - newshape = extenddims(size(arrays), size(first(arrays)),1) + newshape = extenddims(size(arrays), size(first(arrays)), 1) arrays1 = reshape(arrays, newshape) D = N else @@ -42,17 +42,17 @@ function ConcatDiskArray(arrays::AbstractArray) error("Arrays don't have the same dimensions") return error("Should not be reached") end -extenddims(a::Tuple{Vararg{<:Any,N}}, b::Tuple{Vararg{<:Any,M}}, fillval) where {N,M} = extenddims((a..., fillval), b, fillval) -extenddims(a::Tuple{Vararg{<:Any,N}}, _::Tuple{Vararg{<:Any,N}}, _) where {N} = a +extenddims(a::Tuple{Vararg{Any,N}}, b::Tuple{Vararg{Any,M}}, fillval) where {N,M} = extenddims((a..., fillval), b, fillval) +extenddims(a::Tuple{Vararg{Any,N}}, _::Tuple{Vararg{Any,N}}, _) where {N} = a Base.size(a::ConcatDiskArray) = a.size function arraysize_and_startinds(arrays1) - sizes = map(i->zeros(Int,i),size(arrays1)) + sizes = map(i -> zeros(Int, i), size(arrays1)) for i in CartesianIndices(arrays1) ai = arrays1[i] sizecur = extenddims(size(ai), size(arrays1), 1) - foreach(sizecur,i.I,sizes) do si, ind, sizeall + foreach(sizecur, i.I, sizes) do si, ind, sizeall if sizeall[ind] == 0 #init the size sizeall[ind] = si @@ -64,9 +64,9 @@ function arraysize_and_startinds(arrays1) r = map(sizes) do sizeall pushfirst!(sizeall, 1) for i in 2:length(sizeall) - sizeall[i] = sizeall[i-1]+sizeall[i] + sizeall[i] = sizeall[i-1] + sizeall[i] end - pop!(sizeall)-1,sizeall + pop!(sizeall) - 1, sizeall end map(last, r), map(first, r) end @@ -86,7 +86,7 @@ end function writeblock!(a::ConcatDiskArray, aout, inds::AbstractUnitRange...) _concat_diskarray_block_io(a, inds...) do outer_range, array_range, I data = view(aout, outer_range...) - writeblock!(a.parents[I], data, array_range) + writeblock!(a.parents[I], data, array_range...) end end @@ -123,11 +123,11 @@ fix_outerrangeshape(res, ::Tuple{}, ::Tuple{}) = res function concat_chunksize(parents) - newchunks = map(s->Vector{Union{RegularChunks, IrregularChunks}}(undef, s) ,size(parents)) + newchunks = map(s -> Vector{Union{RegularChunks,IrregularChunks}}(undef, s), size(parents)) for i in CartesianIndices(parents) array = parents[i] chunks = eachchunk(array) - foreach(chunks.chunks,i.I,newchunks) do c, ind, newc + foreach(chunks.chunks, i.I, newchunks) do c, ind, newc if !isassigned(newc, ind) newc[ind] = c elseif c != newc[ind] @@ -146,7 +146,7 @@ function concat_chunksize(parents) init = RegularChunks(approx_chunksize(first(v)), 0, 0) reduce(mergechunks, v; init=init) end - extenddims(newchunks, size(parents), RegularChunks(1,0,1)) + extenddims(newchunks, size(parents), RegularChunks(1, 0, 1)) return GridChunks(newchunks...) end diff --git a/src/diskarray.jl b/src/diskarray.jl index 7306ea2..98d9f91 100644 --- a/src/diskarray.jl +++ b/src/diskarray.jl @@ -23,6 +23,14 @@ The only function that should be implemented by a `AbstractDiskArray`. This func """ function readblock! end +function readblock!(a::AbstractArray, aout, r...) + # Fallback implementation for arrays that are not DiskArrays + if isdisk(a) + @warn "Using fallback readblock! for array $(typeof(a)). This should not happen but there should be a custom implementation." + end + copyto!(aout, CartesianIndices(aout), a, CartesianIndices(r)) +end + """ writeblock!(A::AbstractDiskArray, A_in, r::AbstractUnitRange...) From 240b404c2d79f968513dd2e1f8c45163972f4b9c Mon Sep 17 00:00:00 2001 From: meggart Date: Tue, 8 Apr 2025 15:47:04 +0200 Subject: [PATCH 5/9] last fixes --- src/cat.jl | 75 +++++++++++++++++++++++++++++++++++++++--------- src/diskarray.jl | 2 +- test/runtests.jl | 41 +++++++++++++++++++++----- 3 files changed, 96 insertions(+), 22 deletions(-) diff --git a/src/cat.jl b/src/cat.jl index 1a1e330..9f54a90 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -18,17 +18,46 @@ struct ConcatDiskArray{T,N,P,C,HC} <: AbstractDiskArray{T,N} chunks::C haschunks::HC end -function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray{<:Any,N},M}) where {N,M} - T = mapreduce(eltype, promote_type, init=eltype(first(arrays)), arrays) +function ConcatDiskArray(arrays::AbstractArray{Union{<:AbstractArray,Missing}}) + et = Base.nonmissingtype(eltype(arrays)) + T = Union{Missing,eltype(et)} + N = ndims(arrays) + M = ndims(et) + _ConcatDiskArray(arrays, T, Val(N), Val(M)) +end +function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray}) + T = eltype(eltype(arrays)) + N = ndims(arrays) + M = ndims(eltype(arrays)) + _ConcatDiskArray(arrays, T, Val(N), Val(M)) +end +function ConcatDiskArray(arrays::AbstractArray) + N = ndims(arrays) + M, T = foldl(arrays, init=(-1, Union{})) do (M, T), a + if ismissing(a) + (M, promote_type(Missing, T)) + else + M == -1 || ndims(a) == M || throw(ArgumentError("All arrays to concatenate must have equal ndims")) + (ndims(a), promote_type(eltype(a), T)) + end + end + _ConcatDiskArray(arrays, T, Val(N), Val(M)) +end + + +function _ConcatDiskArray(arrays, T, ::Val{N}, ::Val{M}) where {N,M} if N > M - newshape = extenddims(size(arrays), size(first(arrays)), 1) + newshape = extenddims(size(arrays), ntuple(_ -> 1, N), 1) arrays1 = reshape(arrays, newshape) D = N else arrays1 = arrays D = M end + _ConcatDiskArray(arrays1::AbstractArray, T, Val(D)) +end +function _ConcatDiskArray(arrays1::AbstractArray, T, ::Val{D}) where {D} startinds, sizes = arraysize_and_startinds(arrays1) chunks = concat_chunksize(arrays1) @@ -36,12 +65,7 @@ function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray{<:Any,N},M}) wher return ConcatDiskArray{T,D,typeof(arrays1),typeof(chunks),typeof(hc)}(arrays1, startinds, sizes, chunks, hc) end -function ConcatDiskArray(arrays::AbstractArray) - # Validate array eltype and dimensionality - all(a -> ndims(a) == ndims(first(arrays)), arrays) || - error("Arrays don't have the same dimensions") - return error("Should not be reached") -end + extenddims(a::Tuple{Vararg{Any,N}}, b::Tuple{Vararg{Any,M}}, fillval) where {N,M} = extenddims((a..., fillval), b, fillval) extenddims(a::Tuple{Vararg{Any,N}}, _::Tuple{Vararg{Any,N}}, _) where {N} = a @@ -51,6 +75,7 @@ function arraysize_and_startinds(arrays1) sizes = map(i -> zeros(Int, i), size(arrays1)) for i in CartesianIndices(arrays1) ai = arrays1[i] + ismissing(ai) && continue sizecur = extenddims(size(ai), size(arrays1), 1) foreach(sizecur, i.I, sizes) do si, ind, sizeall if sizeall[ind] == 0 @@ -62,6 +87,9 @@ function arraysize_and_startinds(arrays1) end end r = map(sizes) do sizeall + #Replace missing sizes with size 1 + replace!(sizeall, 0 => 1) + #Add starting 1 pushfirst!(sizeall, 1) for i in 2:length(sizeall) sizeall[i] = sizeall[i-1] + sizeall[i] @@ -80,13 +108,24 @@ function readblock!(a::ConcatDiskArray, aout, inds::AbstractUnitRange...) # Find affected blocks and indices in blocks _concat_diskarray_block_io(a, inds...) do outer_range, array_range, I vout = view(aout, outer_range...) - readblock!(a.parents[I], vout, array_range...) + if ismissing(I) + vout .= missing + else + readblock!(a.parents[I], vout, array_range...) + end end end function writeblock!(a::ConcatDiskArray, aout, inds::AbstractUnitRange...) _concat_diskarray_block_io(a, inds...) do outer_range, array_range, I data = view(aout, outer_range...) - writeblock!(a.parents[I], data, array_range...) + if ismissing(I) + if !all(ismissing, data) + @warn "Trying to write data to missing array tile, skipping write" + end + return + else + writeblock!(a.parents[I], data, array_range...) + end end end @@ -101,7 +140,10 @@ function _concat_diskarray_block_io(f, a::ConcatDiskArray, inds...) end map(CartesianIndices(blockinds)) do cI myar = a.parents[cI] - mysize = extenddims(size(myar), cI.I, 1) + size_inferred = map(a.startinds, size(a), cI.I) do si, sa, ii + ii == length(si) ? sa - si[ii] + 1 : si[ii+1] - si[ii] + end + mysize = extenddims(size_inferred, cI.I, 1) array_range = map(cI.I, a.startinds, mysize, inds) do ii, si, ms, indstoread max(first(indstoread) - si[ii] + 1, 1):min(last(indstoread) - si[ii] + 1, ms) end @@ -109,9 +151,13 @@ function _concat_diskarray_block_io(f, a::ConcatDiskArray, inds...) (first(ar)+si[ii]-first(indstoread)):(last(ar)+si[ii]-first(indstoread)) end #Shorten array range to shape of actual array - array_range = map((i, j) -> j, size(myar), array_range) + array_range = map((i, j) -> j, size_inferred, array_range) outer_range = fix_outerrangeshape(outer_range, array_range) - f(outer_range, array_range, cI) + if ismissing(myar) + f(outer_range, array_range, missing) + else + f(outer_range, array_range, cI) + end end end fix_outerrangeshape(outer_range, array_range) = fix_outerrangeshape((), outer_range, array_range) @@ -126,6 +172,7 @@ function concat_chunksize(parents) newchunks = map(s -> Vector{Union{RegularChunks,IrregularChunks}}(undef, s), size(parents)) for i in CartesianIndices(parents) array = parents[i] + ismissing(array) && continue chunks = eachchunk(array) foreach(chunks.chunks, i.I, newchunks) do c, ind, newc if !isassigned(newc, ind) diff --git a/src/diskarray.jl b/src/diskarray.jl index 98d9f91..c16b2c3 100644 --- a/src/diskarray.jl +++ b/src/diskarray.jl @@ -28,7 +28,7 @@ function readblock!(a::AbstractArray, aout, r...) if isdisk(a) @warn "Using fallback readblock! for array $(typeof(a)). This should not happen but there should be a custom implementation." end - copyto!(aout, CartesianIndices(aout), a, CartesianIndices(r)) + aout .= view(a, CartesianIndices(r)) end """ diff --git a/test/runtests.jl b/test/runtests.jl index a39d6bf..2218cc8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -491,6 +491,33 @@ end @test slic isa Vector{Float64} @test slic == Float64[1, 2, 3, 4, 1, 2, 3, 4] end + + @testset "Concat DiskArray with missing tiles" begin + a = zeros(Int, 3, 4) + b = ones(Int, 2, 4) + c = fill(2, 3, 5) + d = fill(missing, 2, 5) + aconc = DiskArrays.ConcatDiskArray(reshape([a, b, c, missing], 2, 2)) + abase = [a c; b d] + @test all(isequal.(aconc[:, :], abase)) + @test all(isequal.(aconc[3:4, 4:6], abase[3:4, 4:6])) + ch = DiskArrays.eachchunk(aconc) + @test ch.chunks[1] == [1:3, 4:5] + @test ch.chunks[2] == [1:4, 5:9] + + a = ones(100, 50) + b = [rem(i.I[3], 5) == 0 ? missing : a for i in CartesianIndices((1, 1, 100))] + b[1] = missing + a_conc = DiskArrays.ConcatDiskArray(b) + ch = eachchunk(a_conc) + @test ch.chunks[1] == [1:100] + @test ch.chunks[2] == [1:50] + @test ch.chunks[3] === DiskArrays.RegularChunks(1, 0, 100) + + @test all(isequal.(a_conc[2, 2, 1:5], [missing, 1.0, 1.0, 1.0, missing])) + @test all(isequal.(a_conc[end, end, 95:100], [missing, 1.0, 1.0, 1.0, 1.0, missing])) + + end end @testset "Broadcast with length 1 and 0 final dim" begin @@ -979,7 +1006,7 @@ end @testset "Padded disk arrays" begin M = (1:100) * (1:120)' A = cat(M, 2M, 3M, 4M; dims=3) - ch = ChunkedDiskArray(A, (128, 128, 2)) + ch = ChunkedDiskArray(A, (128, 128, 2)) pa = DiskArrays.pad(ch, ((10, 20), (30, 40), (1, 2)); fill=999) @test size(pa) == (130, 190, 7) # All outside @@ -1009,9 +1036,9 @@ end @test DiskArrays._pad_offset(c1, (10, 10)) == DiskArrays.RegularChunks(10, 0, 120) @test DiskArrays._pad_offset(c1, (0, 0)) == c1 - c2 = DiskArrays.IrregularChunks(chunksizes = [10, 10, 20, 30, 40]) + c2 = DiskArrays.IrregularChunks(chunksizes=[10, 10, 20, 30, 40]) #The following test would assume padding ends up in a separate chunk: - @test DiskArrays._pad_offset(c2, (5, 5)) == DiskArrays.IrregularChunks(chunksizes = [5, 10, 10, 20, 30, 40, 5]) + @test DiskArrays._pad_offset(c2, (5, 5)) == DiskArrays.IrregularChunks(chunksizes=[5, 10, 10, 20, 30, 40, 5]) @test DiskArrays._pad_offset(c2, (0, 0)) == c2 end end @@ -1054,10 +1081,10 @@ end end @testset "identity function" begin - a = ChunkedDiskArray(1:10 .> 3; chunksize=(3, )) - for fname in [:sum, :prod, :all, :any, :minimum, :maximum, :count] - @eval out = @capture_out @trace $fname($a) DiskArrays - @test occursin("DiskGenerator", out) == false + a = ChunkedDiskArray(1:10 .> 3; chunksize=(3,)) + for fname in [:sum, :prod, :all, :any, :minimum, :maximum, :count] + @eval out = @capture_out @trace $fname($a) DiskArrays + @test occursin("DiskGenerator", out) == false end @test count(a) + count(!, a) == length(a) end From 6732210946cc91b05761287dfcf75fbbec373033 Mon Sep 17 00:00:00 2001 From: meggart Date: Tue, 8 Apr 2025 15:53:21 +0200 Subject: [PATCH 6/9] Update docstring --- src/cat.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cat.jl b/src/cat.jl index 9f54a90..ea0870d 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -5,7 +5,11 @@ ConcatDiskArray(arrays) Joins multiple `AbstractArray`s or `AbstractDiskArray`s into -a single disk array, using lazy concatination. +a single disk array, using lazy concatination. Note that if some elements +of `arrays` are `missing`, this array will be interpreted as a block containing +only missing elements. This can be useful when concatenating mosaics of tiles +where some tiles in are missing or when stacking arrays along a new dimension +and some layers are missing. Returned from `cat` on disk arrays. From 2c4f0dad30add55cfd656a62c46651dd3d25347d Mon Sep 17 00:00:00 2001 From: meggart Date: Wed, 9 Apr 2025 15:09:57 +0200 Subject: [PATCH 7/9] fix last tests --- src/cat.jl | 55 ++++++++++++++++++++++++++++++------------------ src/diskarray.jl | 1 + 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/cat.jl b/src/cat.jl index ea0870d..9d700b0 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -15,12 +15,13 @@ Returned from `cat` on disk arrays. It is also useful on its own as it can easily concatenate an array of disk arrays. """ -struct ConcatDiskArray{T,N,P,C,HC} <: AbstractDiskArray{T,N} +struct ConcatDiskArray{T,N,P,C,HC,ID} <: AbstractDiskArray{T,N} parents::P startinds::NTuple{N,Vector{Int}} size::NTuple{N,Int} chunks::C haschunks::HC + innerdims::Val{ID} end function ConcatDiskArray(arrays::AbstractArray{Union{<:AbstractArray,Missing}}) @@ -30,15 +31,8 @@ function ConcatDiskArray(arrays::AbstractArray{Union{<:AbstractArray,Missing}}) M = ndims(et) _ConcatDiskArray(arrays, T, Val(N), Val(M)) end -function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray}) - T = eltype(eltype(arrays)) - N = ndims(arrays) - M = ndims(eltype(arrays)) - _ConcatDiskArray(arrays, T, Val(N), Val(M)) -end -function ConcatDiskArray(arrays::AbstractArray) - N = ndims(arrays) - M, T = foldl(arrays, init=(-1, Union{})) do (M, T), a +function infer_eltypes(arrays) + foldl(arrays, init=(-1, Union{})) do (M, T), a if ismissing(a) (M, promote_type(Missing, T)) else @@ -46,31 +40,48 @@ function ConcatDiskArray(arrays::AbstractArray) (ndims(a), promote_type(eltype(a), T)) end end +end +function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray}) + N = ndims(arrays) + T = eltype(eltype(arrays)) + if !isconcretetype(T) + M,T = infer_eltypes(arrays) + else + M = ndims(eltype(arrays)) + end + _ConcatDiskArray(arrays, T, Val(N), Val(M)) +end +function ConcatDiskArray(arrays::AbstractArray) + N = ndims(arrays) + M,T = infer_eltypes(arrays) _ConcatDiskArray(arrays, T, Val(N), Val(M)) end function _ConcatDiskArray(arrays, T, ::Val{N}, ::Val{M}) where {N,M} - if N > M - newshape = extenddims(size(arrays), ntuple(_ -> 1, N), 1) + if N < M + newshape = extenddims(size(arrays), ntuple(_ -> 1, M), 1) arrays1 = reshape(arrays, newshape) - D = N + D = M else arrays1 = arrays - D = M + D = N end - _ConcatDiskArray(arrays1::AbstractArray, T, Val(D)) + ConcatDiskArray(arrays1::AbstractArray, T, Val(D), Val(M)) end -function _ConcatDiskArray(arrays1::AbstractArray, T, ::Val{D}) where {D} +function ConcatDiskArray(arrays1::AbstractArray, T, ::Val{D},::Val{ID}) where {D,ID} startinds, sizes = arraysize_and_startinds(arrays1) chunks = concat_chunksize(arrays1) hc = Chunked(batchstrategy(chunks)) - return ConcatDiskArray{T,D,typeof(arrays1),typeof(chunks),typeof(hc)}(arrays1, startinds, sizes, chunks, hc) + return ConcatDiskArray{T,D,typeof(arrays1),typeof(chunks),typeof(hc),ID}(arrays1, startinds, sizes, chunks, hc,Val(ID)) end -extenddims(a::Tuple{Vararg{Any,N}}, b::Tuple{Vararg{Any,M}}, fillval) where {N,M} = extenddims((a..., fillval), b, fillval) +function extenddims(a::Tuple{Vararg{Any,N}}, b::Tuple{Vararg{Any,M}}, fillval) where {N,M} + length(a) > length(b) && error("Wrong") + extenddims((a..., fillval), b, fillval) +end extenddims(a::Tuple{Vararg{Any,N}}, _::Tuple{Vararg{Any,N}}, _) where {N} = a Base.size(a::ConcatDiskArray) = a.size @@ -134,9 +145,12 @@ function writeblock!(a::ConcatDiskArray, aout, inds::AbstractUnitRange...) end # Utils +ninnerdims(a::ConcatDiskArray) = ninnerdims(a.innerdims) +ninnerdims(::Val{ID}) where ID = ID function _concat_diskarray_block_io(f, a::ConcatDiskArray, inds...) # Find affected blocks and indices in blocks + ID = ninnerdims(a) blockinds = map(inds, a.startinds, size(a.parents)) do i, si, s bi1 = max(searchsortedlast(si, first(i)), 1) bi2 = min(searchsortedfirst(si, last(i) + 1) - 1, s) @@ -147,15 +161,14 @@ function _concat_diskarray_block_io(f, a::ConcatDiskArray, inds...) size_inferred = map(a.startinds, size(a), cI.I) do si, sa, ii ii == length(si) ? sa - si[ii] + 1 : si[ii+1] - si[ii] end - mysize = extenddims(size_inferred, cI.I, 1) - array_range = map(cI.I, a.startinds, mysize, inds) do ii, si, ms, indstoread + array_range = map(cI.I, a.startinds, size_inferred, inds) do ii, si, ms, indstoread max(first(indstoread) - si[ii] + 1, 1):min(last(indstoread) - si[ii] + 1, ms) end outer_range = map(cI.I, a.startinds, array_range, inds) do ii, si, ar, indstoread (first(ar)+si[ii]-first(indstoread)):(last(ar)+si[ii]-first(indstoread)) end #Shorten array range to shape of actual array - array_range = map((i, j) -> j, size_inferred, array_range) + array_range = ntuple(j -> array_range[j], ID) outer_range = fix_outerrangeshape(outer_range, array_range) if ismissing(myar) f(outer_range, array_range, missing) diff --git a/src/diskarray.jl b/src/diskarray.jl index c16b2c3..8e58d9f 100644 --- a/src/diskarray.jl +++ b/src/diskarray.jl @@ -29,6 +29,7 @@ function readblock!(a::AbstractArray, aout, r...) @warn "Using fallback readblock! for array $(typeof(a)). This should not happen but there should be a custom implementation." end aout .= view(a, CartesianIndices(r)) + nothing end """ From f3ba15c89e1d59f03065a71eba0723670dbbba5c Mon Sep 17 00:00:00 2001 From: Felix Cremer Date: Tue, 13 May 2025 16:12:19 +0200 Subject: [PATCH 8/9] Add functionality to use arbitrary singular values in ConcatDiskArrays (#256) * Add test case for zero fill concatdiskarray * Enable arbitrary single values in concatDiskArray * Update src/cat.jl Co-authored-by: Rafael Schouten * Update src/cat.jl Co-authored-by: Rafael Schouten * Apply review Co-authored-by: Rafael Schouten * Wrap fill values in MissingTile for ConcatDiskArray This introduces a MissingTile type to wrap the values for ConcatDiskArray. This allows to use vectors as elements in the fillvalue. --------- Co-authored-by: Felix Cremer Co-authored-by: Rafael Schouten --- src/cat.jl | 33 +++++++++++++---------- src/subarray.jl | 2 +- test/runtests.jl | 69 +++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 85 insertions(+), 19 deletions(-) diff --git a/src/cat.jl b/src/cat.jl index 9d700b0..682751e 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -1,4 +1,3 @@ - """ ConcatDiskArray <: AbstractDiskArray @@ -31,16 +30,17 @@ function ConcatDiskArray(arrays::AbstractArray{Union{<:AbstractArray,Missing}}) M = ndims(et) _ConcatDiskArray(arrays, T, Val(N), Val(M)) end + function infer_eltypes(arrays) foldl(arrays, init=(-1, Union{})) do (M, T), a - if ismissing(a) - (M, promote_type(Missing, T)) - else + if a isa AbstractArray M == -1 || ndims(a) == M || throw(ArgumentError("All arrays to concatenate must have equal ndims")) - (ndims(a), promote_type(eltype(a), T)) + M = ndims(a) + end + (M, promote_type(eltype(a), T)) end end -end + function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray}) N = ndims(arrays) T = eltype(eltype(arrays)) @@ -78,6 +78,11 @@ function ConcatDiskArray(arrays1::AbstractArray, T, ::Val{D},::Val{ID}) where {D return ConcatDiskArray{T,D,typeof(arrays1),typeof(chunks),typeof(hc),ID}(arrays1, startinds, sizes, chunks, hc,Val(ID)) end +struct MissingTile{F} + fillvalue::F +end +Base.eltype(::Type{MissingTile{F}}) where F = F + function extenddims(a::Tuple{Vararg{Any,N}}, b::Tuple{Vararg{Any,M}}, fillval) where {N,M} length(a) > length(b) && error("Wrong") extenddims((a..., fillval), b, fillval) @@ -90,7 +95,7 @@ function arraysize_and_startinds(arrays1) sizes = map(i -> zeros(Int, i), size(arrays1)) for i in CartesianIndices(arrays1) ai = arrays1[i] - ismissing(ai) && continue + ai isa MissingTile && continue sizecur = extenddims(size(ai), size(arrays1), 1) foreach(sizecur, i.I, sizes) do si, ind, sizeall if sizeall[ind] == 0 @@ -123,10 +128,10 @@ function readblock!(a::ConcatDiskArray, aout, inds::AbstractUnitRange...) # Find affected blocks and indices in blocks _concat_diskarray_block_io(a, inds...) do outer_range, array_range, I vout = view(aout, outer_range...) - if ismissing(I) - vout .= missing - else + if I isa CartesianIndex readblock!(a.parents[I], vout, array_range...) + else + vout .= (I.fillvalue,) end end end @@ -170,8 +175,8 @@ function _concat_diskarray_block_io(f, a::ConcatDiskArray, inds...) #Shorten array range to shape of actual array array_range = ntuple(j -> array_range[j], ID) outer_range = fix_outerrangeshape(outer_range, array_range) - if ismissing(myar) - f(outer_range, array_range, missing) + if myar isa MissingTile + f(outer_range, array_range, myar) else f(outer_range, array_range, cI) end @@ -189,13 +194,13 @@ function concat_chunksize(parents) newchunks = map(s -> Vector{Union{RegularChunks,IrregularChunks}}(undef, s), size(parents)) for i in CartesianIndices(parents) array = parents[i] - ismissing(array) && continue + array isa MissingTile && continue chunks = eachchunk(array) foreach(chunks.chunks, i.I, newchunks) do c, ind, newc if !isassigned(newc, ind) newc[ind] = c elseif c != newc[ind] - throw(ArgumentError("Chunk sizes don't forma grid")) + throw(ArgumentError("Chunk sizes don't form a grid")) end end end diff --git a/src/subarray.jl b/src/subarray.jl index 847752a..76b282d 100644 --- a/src/subarray.jl +++ b/src/subarray.jl @@ -44,7 +44,7 @@ function eachchunk_view(::Chunked, vv) end eachchunk_view(::Unchunked, a) = estimate_chunksize(a) -# Implementaion macro +# Implementation macro macro implement_subarray(t) t = esc(t) diff --git a/test/runtests.jl b/test/runtests.jl index 2218cc8..3cb9e11 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -492,22 +492,52 @@ end @test slic == Float64[1, 2, 3, 4, 1, 2, 3, 4] end + @testset "Concat DiskArray with fill zero tiles" begin + a = zeros(Int, 3, 4) + b = ones(Int, 2, 4) + c = fill(2, 3, 5) + d = fill(0, 2, 5) + aconc = DiskArrays.ConcatDiskArray(reshape([a, b, c, DiskArrays.MissingTile(0)], 2, 2)) + abase = [a c; b d] + @test all(isequal.(aconc[:, :], abase)) + @test all(isequal.(aconc[3:4, 4:6], abase[3:4, 4:6])) + ch = DiskArrays.eachchunk(aconc) + @test ch.chunks[1] == [1:3, 4:5] + @test ch.chunks[2] == [1:4, 5:9] + @test eltype(aconc) == Int + + a = ones(100, 50) + b = [rem(i.I[3], 5) == 0 ? DiskArrays.MissingTile(0) : a for i in CartesianIndices((1, 1, 100))] + b[1] = DiskArrays.MissingTile(0) + a_conc = DiskArrays.ConcatDiskArray(b) + ch = eachchunk(a_conc) + @test ch.chunks[1] == [1:100] + @test ch.chunks[2] == [1:50] + @test ch.chunks[3] === DiskArrays.RegularChunks(1, 0, 100) + + @test all(isequal.(a_conc[2, 2, 1:5], [0, 1.0, 1.0, 1.0, 0])) + @test all(isequal.(a_conc[end, end, 95:100], [0, 1.0, 1.0, 1.0, 1.0, 0])) + + end + + @testset "Concat DiskArray with missing tiles" begin a = zeros(Int, 3, 4) b = ones(Int, 2, 4) c = fill(2, 3, 5) d = fill(missing, 2, 5) - aconc = DiskArrays.ConcatDiskArray(reshape([a, b, c, missing], 2, 2)) + aconc = DiskArrays.ConcatDiskArray(reshape([a, b, c, DiskArrays.MissingTile(missing)], 2, 2)) abase = [a c; b d] @test all(isequal.(aconc[:, :], abase)) @test all(isequal.(aconc[3:4, 4:6], abase[3:4, 4:6])) ch = DiskArrays.eachchunk(aconc) @test ch.chunks[1] == [1:3, 4:5] @test ch.chunks[2] == [1:4, 5:9] + @test eltype(aconc) == Union{Int, Missing} a = ones(100, 50) - b = [rem(i.I[3], 5) == 0 ? missing : a for i in CartesianIndices((1, 1, 100))] - b[1] = missing + b = [rem(i.I[3], 5) == 0 ? DiskArrays.MissingTile(missing) : a for i in CartesianIndices((1, 1, 100))] + b[1] = DiskArrays.MissingTile(missing) a_conc = DiskArrays.ConcatDiskArray(b) ch = eachchunk(a_conc) @test ch.chunks[1] == [1:100] @@ -518,6 +548,35 @@ end @test all(isequal.(a_conc[end, end, 95:100], [missing, 1.0, 1.0, 1.0, 1.0, missing])) end + + @testset "Concat DiskArray with fill zero vector tiles" begin + a = fill([1,1], 3, 4) + b = fill([1,2], 2, 4) + c = fill([2,1], 3, 5) + d = fill([2,2], 2, 5) + aconc = DiskArrays.ConcatDiskArray(reshape([a, b, c, DiskArrays.MissingTile([2,2])], 2, 2)) + abase = [a c; b d] + @test all(isequal.(aconc[:, :], abase)) + @test all(isequal.(aconc[3:4, 4:6], abase[3:4, 4:6])) + ch = DiskArrays.eachchunk(aconc) + @test ch.chunks[1] == [1:3, 4:5] + @test ch.chunks[2] == [1:4, 5:9] + @test eltype(aconc) == Vector{Int} + + a = fill([1,1], 100, 50) + b = [rem(i.I[3], 5) == 0 ? DiskArrays.MissingTile([0,0]) : a for i in CartesianIndices((1, 1, 100))] + b[1] = DiskArrays.MissingTile([0,0]) + a_conc = DiskArrays.ConcatDiskArray(b) + ch = eachchunk(a_conc) + @test ch.chunks[1] == [1:100] + @test ch.chunks[2] == [1:50] + @test ch.chunks[3] === DiskArrays.RegularChunks(1, 0, 100) + + @test all(isequal.(a_conc[2, 2, 1:5], [[0,0], [1,1],[1,1] , [1,1], [0,0]])) + @test all(isequal.(a_conc[end, end, 95:100], [[0,0], [1,1], [1,1], [1,1],[1,1], [0,0]])) + + end + end @testset "Broadcast with length 1 and 0 final dim" begin @@ -929,8 +988,10 @@ struct TestArray{T,N} <: AbstractArray{T,N} end DiskArrays.@implement_array_methods TestArray DiskArrays.@implement_permutedims TestArray DiskArrays.@implement_subarray TestArray - DiskArrays.@implement_diskarray TestArray @test DiskArrays.isdisk(TestArray) == true + DiskArrays.@implement_diskarray TestArray2 + @test DiskArrays.isdisk(TestArray2) == true + end # issue #123 From b480c4f098194341fd567ae81e876f5216df1532 Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Tue, 13 May 2025 16:29:14 +0200 Subject: [PATCH 9/9] Update src/cat.jl --- src/cat.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cat.jl b/src/cat.jl index 682751e..1c8830c 100644 --- a/src/cat.jl +++ b/src/cat.jl @@ -8,7 +8,7 @@ a single disk array, using lazy concatination. Note that if some elements of `arrays` are `missing`, this array will be interpreted as a block containing only missing elements. This can be useful when concatenating mosaics of tiles where some tiles in are missing or when stacking arrays along a new dimension -and some layers are missing. +and some layers are missing. Returned from `cat` on disk arrays.