From 3c46beda588554619b68f8bf4cbad10901eac4c4 Mon Sep 17 00:00:00 2001 From: nhz2 Date: Mon, 15 May 2023 12:43:51 -0400 Subject: [PATCH 1/4] updates for 64bit writes --- src/ZipFile.jl | 13 ++------ src/Zlib.jl | 85 ++++++++++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 51 deletions(-) diff --git a/src/ZipFile.jl b/src/ZipFile.jl index eb1b5df..afd440a 100644 --- a/src/ZipFile.jl +++ b/src/ZipFile.jl @@ -516,13 +516,7 @@ function update_reader!(f::ReadableFile, data::Array{UInt8}) f._zpos = position(f._io) - f._datapos datalen = length(data) f._pos += datalen - chunk_size = if Sys.WORD_SIZE > 32 2^31 else datalen end - start = 1 - while datalen > 0 - f._currentcrc32 = Zlib.crc32(view(data, start:start-1+min(datalen, chunk_size)), f._currentcrc32) - datalen -= chunk_size - start += chunk_size - end + f._currentcrc32 = Zlib.crc32(data, f._currentcrc32) if eof(f) if f.method == Deflate @@ -669,9 +663,8 @@ Base.readavailable(io::ZipFile.ReadableFile) = read(io) # Write nb elements located at p into f. function unsafe_write(f::WritableFile, p::Ptr{UInt8}, nb::UInt) - # zlib doesn't like 0 length writes if nb == 0 - return 0 + return UInt(0) end n = unsafe_write(f._zio, p, nb) @@ -679,7 +672,7 @@ function unsafe_write(f::WritableFile, p::Ptr{UInt8}, nb::UInt) error("short write") end - f.crc32 = Zlib.crc32(unsafe_wrap(Array, p, nb), f.crc32) + f.crc32 = Zlib.unsafe_crc32(p, nb, f.crc32) f.uncompressedsize += n n end diff --git a/src/Zlib.jl b/src/Zlib.jl index e7d1ca1..a9600df 100644 --- a/src/Zlib.jl +++ b/src/Zlib.jl @@ -122,15 +122,20 @@ end Writer(io::IO, raw::Bool=false) = Writer(io, 9, raw) -function write(w::Writer, p::Ptr, nb::Integer) +function Base.unsafe_write(w::Writer, p::Ptr{UInt8}, nb::UInt) + if nb == 0 + return zero(nb) + end + max_chunk_size::UInt = UInt(typemax(Cuint))>>1 + chunk_offset = UInt(0) + num_bytes_left = nb + chunk_size = min(max_chunk_size, num_bytes_left) + w.strm.avail_in = chunk_size w.strm.next_in = p - w.strm.avail_in = nb outbuf = Vector{UInt8}(undef, 1024) - GC.@preserve outbuf while true w.strm.avail_out = length(outbuf) w.strm.next_out = pointer(outbuf) - ret = ccall((:deflate, libz), Int32, (Ptr{z_stream}, Int32), Ref(w.strm), Z_NO_FLUSH) @@ -139,10 +144,21 @@ function write(w::Writer, p::Ptr, nb::Integer) end n = length(outbuf) - w.strm.avail_out - if n > 0 && write(w.io, outbuf[1:n]) != n + if n > 0 && write(w.io, view(outbuf,1:n)) != n error("short write") end - if w.strm.avail_out != 0 + # Update w.strm.avail_in if needed + if w.strm.avail_in == 0 + # mark that previous chunk was written + chunk_offset += chunk_size + num_bytes_left -= chunk_size + # new chunk size, will be zero at the end. + chunk_size = min(max_chunk_size, num_bytes_left) + @assert chunk_offset + chunk_size ≤ nb + w.strm.next_in = p + chunk_offset + w.strm.avail_in = chunk_size + end + if (w.strm.avail_out != 0) && (w.strm.avail_in == 0) break end end @@ -151,38 +167,8 @@ function write(w::Writer, p::Ptr, nb::Integer) nb end -function write(w::Writer, a::Array{UInt8}) - GC.@preserve a write(w, pointer(a), length(a)) -end - -# If this is not provided, Base.IO write methods will write -# arrays one element at a time. -function write(w::Writer, a::Array{T}) where T - if isbits(T) - GC.@preserve a write(w, pointer(a), length(a)*sizeof(T)) - else - invoke(write, Tuple{IO,Array}, w, a) - end -end - -# Copied from Julia base/io.jl -function write(w::Writer, a::SubArray{T,N,A}) where {T,N,A<:Array} - if !isbits(T) || stride(a,1)!=1 - return invoke(write, Tuple{Any,AbstractArray}, s, a) - end - colsz = size(a,1)*sizeof(T) - if N<=1 - return GC.@preserve a write(s, pointer(a, 1), colsz) - else - for idx in CartesianRange(tuple(1, size(a)[2:end]...)) - GC.@preserve a write(w, pointer(a, idx.I), colsz) - end - return colsz*Base.trailingsize(a,2) - end -end - function write(w::Writer, b::UInt8) - write(w, UInt8[b]) + write(w, Ref(b)) end function close(w::Writer) @@ -365,10 +351,27 @@ function eof(r::Reader) bytesavailable(r.buf) == 0 && eof(r.io) end -function crc32(data::AbstractArray{UInt8}, crc::Integer=0) - convert(UInt32, (ccall((:crc32, libz), - Culong, (Culong, Ptr{UInt8}, Cuint), - crc, data, length(data)))) +function unsafe_crc32(p::Ptr{UInt8}, nb::UInt, crc::UInt32) + max_chunk_size::UInt = UInt(typemax(Cuint))>>1 + chunk_offset = UInt(0) + num_bytes_left = nb + while num_bytes_left > 0 + chunk_size = min(max_chunk_size, num_bytes_left) + @assert chunk_offset + chunk_size ≤ nb + crc = ccall((:crc32, libz), + Culong, (Culong, Ptr{UInt8}, Cuint), + crc, p + chunk_offset, chunk_size, + ) + chunk_offset += chunk_size + num_bytes_left -= chunk_size + end + crc +end + +function crc32(data::AbstractArray{UInt8}, crc::Integer=0)::UInt32 + GC.@preserve data begin + unsafe_crc32(pointer(data), UInt(length(data)), crc) + end end crc32(data::AbstractString, crc::Integer=0) = crc32(convert(AbstractArray{UInt8}, data), crc) From 8a0696613ea773c41005d03940b8df26072786ed Mon Sep 17 00:00:00 2001 From: nhz2 Date: Mon, 15 May 2023 20:45:37 -0400 Subject: [PATCH 2/4] test writing a big array --- src/Zlib.jl | 4 ++-- test/bigtests.jl | 43 +++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 4 ++++ 3 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 test/bigtests.jl diff --git a/src/Zlib.jl b/src/Zlib.jl index a9600df..617e15b 100644 --- a/src/Zlib.jl +++ b/src/Zlib.jl @@ -351,7 +351,7 @@ function eof(r::Reader) bytesavailable(r.buf) == 0 && eof(r.io) end -function unsafe_crc32(p::Ptr{UInt8}, nb::UInt, crc::UInt32) +function unsafe_crc32(p::Ptr{UInt8}, nb::UInt, crc::UInt32)::UInt32 max_chunk_size::UInt = UInt(typemax(Cuint))>>1 chunk_offset = UInt(0) num_bytes_left = nb @@ -370,7 +370,7 @@ end function crc32(data::AbstractArray{UInt8}, crc::Integer=0)::UInt32 GC.@preserve data begin - unsafe_crc32(pointer(data), UInt(length(data)), crc) + unsafe_crc32(pointer(data), UInt(length(data)), UInt32(crc)) end end diff --git a/test/bigtests.jl b/test/bigtests.jl new file mode 100644 index 0000000..9a4f68c --- /dev/null +++ b/test/bigtests.jl @@ -0,0 +1,43 @@ +# These tests require over 8 GB of memory and a 64 bit Int + +using ZipFile +using Test + +@testset "big array with Zlib" begin + big_array = collect(1:2^29+2^25) + + io = IOBuffer() + w = ZipFile.Zlib.Writer(io, 1, true) + write(w, big_array) + close(w) + w = nothing + @info "done writing big_array" + seekstart(io) + r = ZipFile.Zlib.Reader(io, true) + buffer = zeros(Int, 2^22) + for bi in 1:(length(big_array)>>22) + read!(r, buffer) + @test ((bi-1)<<22+1):(bi<<22) == buffer + end + close(r) + r = nothing + close(io) + io = nothing + @info "done reading big_array" + + # Check that crc32 works + crc32_big::UInt32 = ZipFile.Zlib.crc32( + reinterpret(UInt8, big_array) + ) + crc32_parts::UInt32 = 0 + for bi in 1:(length(big_array)>>22) + crc32_parts = ZipFile.Zlib.crc32( + reinterpret(UInt8, view(big_array,((bi-1)<<22+1):(bi<<22))), + crc32_parts + ) + end + @test crc32_parts == crc32_big + + + big_array = nothing +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 5ce1221..5d83a8e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -182,4 +182,8 @@ if !Debug rm(tmp, recursive=true) end +if "bigtests" in ARGS + include("bigtests.jl") +end + println("done") From 3383a3d158220a59f97d0d76313c882679d4c1d9 Mon Sep 17 00:00:00 2001 From: nhz2 Date: Mon, 15 May 2023 21:16:53 -0400 Subject: [PATCH 3/4] add type annotation --- src/Zlib.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Zlib.jl b/src/Zlib.jl index 617e15b..6d6ee00 100644 --- a/src/Zlib.jl +++ b/src/Zlib.jl @@ -122,9 +122,9 @@ end Writer(io::IO, raw::Bool=false) = Writer(io, 9, raw) -function Base.unsafe_write(w::Writer, p::Ptr{UInt8}, nb::UInt) +function Base.unsafe_write(w::Writer, p::Ptr{UInt8}, nb::UInt)::UInt if nb == 0 - return zero(nb) + return UInt(0) end max_chunk_size::UInt = UInt(typemax(Cuint))>>1 chunk_offset = UInt(0) @@ -358,7 +358,7 @@ function unsafe_crc32(p::Ptr{UInt8}, nb::UInt, crc::UInt32)::UInt32 while num_bytes_left > 0 chunk_size = min(max_chunk_size, num_bytes_left) @assert chunk_offset + chunk_size ≤ nb - crc = ccall((:crc32, libz), + crc::UInt32 = ccall((:crc32, libz), Culong, (Culong, Ptr{UInt8}, Cuint), crc, p + chunk_offset, chunk_size, ) From 9e41792a08ba7d6e5759853f7e7155b839c7f2cb Mon Sep 17 00:00:00 2001 From: nhz2 Date: Mon, 15 May 2023 21:51:27 -0400 Subject: [PATCH 4/4] use crc32_z --- src/Zlib.jl | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/Zlib.jl b/src/Zlib.jl index 6d6ee00..1174e61 100644 --- a/src/Zlib.jl +++ b/src/Zlib.jl @@ -352,20 +352,10 @@ function eof(r::Reader) end function unsafe_crc32(p::Ptr{UInt8}, nb::UInt, crc::UInt32)::UInt32 - max_chunk_size::UInt = UInt(typemax(Cuint))>>1 - chunk_offset = UInt(0) - num_bytes_left = nb - while num_bytes_left > 0 - chunk_size = min(max_chunk_size, num_bytes_left) - @assert chunk_offset + chunk_size ≤ nb - crc::UInt32 = ccall((:crc32, libz), - Culong, (Culong, Ptr{UInt8}, Cuint), - crc, p + chunk_offset, chunk_size, - ) - chunk_offset += chunk_size - num_bytes_left -= chunk_size - end - crc + ccall((:crc32_z, libz), + Culong, (Culong, Ptr{UInt8}, Csize_t), + crc, p, nb, + ) end function crc32(data::AbstractArray{UInt8}, crc::Integer=0)::UInt32