From dde66bb9db26680e2eca9dd3abf5d361ca336131 Mon Sep 17 00:00:00 2001 From: Jonathan Marler Date: Sat, 4 May 2024 09:16:16 -0600 Subject: [PATCH] update zip --- zigup.zig | 2 +- zip.zig | 1044 ++++++++++++++++++++++++++++++++++------------------- 2 files changed, 681 insertions(+), 365 deletions(-) diff --git a/zigup.zig b/zigup.zig index cd10800..19314ff 100644 --- a/zigup.zig +++ b/zigup.zig @@ -1007,7 +1007,7 @@ fn installCompiler(allocator: Allocator, compiler_dir: []const u8, url: []const var timer = try std.time.Timer.start(); var archive_file = try std.fs.openFileAbsolute(archive_absolute, .{}); defer archive_file.close(); - try zip.pipeToFileSystem(installing_dir_opened, archive_file); + try zip.extract(installing_dir_opened, archive_file.seekableStream(), .{}); const time = timer.read(); loginfo("extracted archive in {d:.2} s", .{@as(f32, @floatFromInt(time)) / @as(f32, @floatFromInt(std.time.ns_per_s))}); } diff --git a/zip.zig b/zip.zig index e5dc14b..2bfc1fe 100644 --- a/zip.zig +++ b/zip.zig @@ -1,251 +1,165 @@ /// The .ZIP File Format Specification is found here: /// https://pkwaredownloads.blob.core.windows.net/pem/APPNOTE.txt +/// +/// Note that this file uses the abbreviation "cd" for "central directory" +/// +const builtin = @import("builtin"); const std = @import("std"); const testing = std.testing; -pub const File = @import("zip/test.zig").File; -pub const FileCache = @import("zip/test.zig").FileCache; -pub const writeFile = @import("zip/test.zig").writeFile; +pub const testutil = @import("zip/test.zig"); +const File = testutil.File; +const FileStore = testutil.FileStore; pub const CompressionMethod = enum(u16) { store = 0, deflate = 8, - deflate64 = 9, _, }; pub const central_file_header_sig = [4]u8{ 'P', 'K', 1, 2 }; pub const local_file_header_sig = [4]u8{ 'P', 'K', 3, 4 }; -pub const end_of_central_directory_sig = [4]u8{ 'P', 'K', 5, 6 }; - -pub const LocalFileHeader = struct { - signature: [4]u8, - minimum_version: u16, - flags: u16, - 
compression_method: CompressionMethod, - last_modification_time: u16, - last_modification_date: u16, - crc32: u32, - compressed_size: u32, - uncompressed_size: u32, - filename_len: u16, - extra_len: u16, - pub fn deserialize(bytes: [30]u8) LocalFileHeader { - return .{ - .signature = bytes[0..4].*, - .minimum_version = std.mem.readInt(u16, bytes[4..6], .little), - .flags = std.mem.readInt(u16, bytes[6..8], .little), - .compression_method = @enumFromInt(std.mem.readInt(u16, bytes[8..10], .little)), - .last_modification_time = std.mem.readInt(u16, bytes[10..12], .little), - .last_modification_date = std.mem.readInt(u16, bytes[12..14], .little), - .crc32 = std.mem.readInt(u32, bytes[14..18], .little), - .compressed_size = std.mem.readInt(u32, bytes[18..22], .little), - .uncompressed_size = std.mem.readInt(u32, bytes[22..26], .little), - .filename_len = std.mem.readInt(u16, bytes[26..28], .little), - .extra_len = std.mem.readInt(u16, bytes[28..30], .little), - }; - } - pub fn serialize(self: LocalFileHeader) [30]u8 { - var result: [30]u8 = undefined; - result[0..4].* = self.signature; - std.mem.writeInt(u16, result[4..6], self.minimum_version, .little); - std.mem.writeInt(u16, result[6..8], self.flags, .little); - std.mem.writeInt(u16, result[8..10], @intFromEnum(self.compression_method), .little); - std.mem.writeInt(u16, result[10..12], self.last_modification_time, .little); - std.mem.writeInt(u16, result[12..14], self.last_modification_date, .little); - std.mem.writeInt(u32, result[14..18], self.crc32, .little); - std.mem.writeInt(u32, result[18..22], self.compressed_size, .little); - std.mem.writeInt(u32, result[22..26], self.uncompressed_size, .little); - std.mem.writeInt(u16, result[26..28], self.filename_len, .little); - std.mem.writeInt(u16, result[28..30], self.extra_len, .little); - return result; - } +pub const end_record_sig = [4]u8{ 'P', 'K', 5, 6 }; +pub const end_record64_sig = [4]u8{ 'P', 'K', 6, 6 }; +pub const end_locator64_sig = [4]u8{ 'P', 'K', 6, 7 
}; +pub const ExtraHeader = enum(u16) { + zip64_info = 0x1, + _, }; -pub const CentralDirectoryFileHeader = struct { - signature: [4]u8, - version: u16, - minimum_version: u16, - flags: u16, - compression_method: CompressionMethod, - last_modification_time: u16, - last_modification_date: u16, - crc32: u32, - compressed_size: u32, - uncompressed_size: u32, - filename_len: u16, - extra_len: u16, - comment_len: u16, - disk_number: u16, - internal_file_attributes: u16, - external_file_attributes: u32, - local_file_header_offset: u32, - - pub fn deserialize(bytes: [46]u8) CentralDirectoryFileHeader { - return .{ - .signature = bytes[0..4].*, - .version = std.mem.readInt(u16, bytes[4..6], .little), - .minimum_version = std.mem.readInt(u16, bytes[6..8], .little), - .flags = std.mem.readInt(u16, bytes[8..10], .little), - .compression_method = @enumFromInt(std.mem.readInt(u16, bytes[10..12], .little)), - .last_modification_time = std.mem.readInt(u16, bytes[12..14], .little), - .last_modification_date = std.mem.readInt(u16, bytes[14..16], .little), - .crc32 = std.mem.readInt(u32, bytes[16..20], .little), - .compressed_size = std.mem.readInt(u32, bytes[20..24], .little), - .uncompressed_size = std.mem.readInt(u32, bytes[24..28], .little), - .filename_len = std.mem.readInt(u16, bytes[28..30], .little), - .extra_len = std.mem.readInt(u16, bytes[30..32], .little), - .comment_len = std.mem.readInt(u16, bytes[32..34], .little), - .disk_number = std.mem.readInt(u16, bytes[34..36], .little), - .internal_file_attributes = std.mem.readInt(u16, bytes[36..38], .little), - .external_file_attributes = std.mem.readInt(u32, bytes[38..42], .little), - .local_file_header_offset = std.mem.readInt(u32, bytes[42..46], .little), - }; - } - pub fn serialize(self: CentralDirectoryFileHeader) [46]u8 { - var result: [46]u8 = undefined; - result[0..4].* = self.signature; - std.mem.writeInt(u16, result[4..6], self.version, .little); - std.mem.writeInt(u16, result[6..8], self.minimum_version, .little); 
- std.mem.writeInt(u16, result[8..10], self.flags, .little); - std.mem.writeInt(u16, result[10..12], @intFromEnum(self.compression_method), .little); - std.mem.writeInt(u16, result[12..14], self.last_modification_time, .little); - std.mem.writeInt(u16, result[14..16], self.last_modification_date, .little); - std.mem.writeInt(u32, result[16..20], self.crc32, .little); - std.mem.writeInt(u32, result[20..24], self.compressed_size, .little); - std.mem.writeInt(u32, result[24..28], self.uncompressed_size, .little); - std.mem.writeInt(u16, result[28..30], self.filename_len, .little); - std.mem.writeInt(u16, result[30..32], self.extra_len, .little); - std.mem.writeInt(u16, result[32..34], self.comment_len, .little); - std.mem.writeInt(u16, result[34..36], self.disk_number, .little); - std.mem.writeInt(u16, result[36..38], self.internal_file_attributes, .little); - std.mem.writeInt(u32, result[38..42], self.external_file_attributes, .little); - std.mem.writeInt(u32, result[42..46], self.local_file_header_offset, .little); - return result; - } +const GeneralPurposeFlags = packed struct(u16) { + encrypted: bool, + _: u15, }; -pub const EndOfCentralDirectoryRecord = struct { - disk_number: u16, - central_directory_disk_number: u16, - record_count_disk: u16, - record_count_total: u16, - central_directory_size: u32, - central_directory_offset: u32, - comment_len: u16, - - pub fn read(bytes: [22]u8) EndOfCentralDirectoryRecord { - return EndOfCentralDirectoryRecord{ - .disk_number = std.mem.readInt(u16, bytes[4..6], .little), - .central_directory_disk_number = std.mem.readInt(u16, bytes[6..8], .little), - .record_count_disk = std.mem.readInt(u16, bytes[8..10], .little), - .record_count_total = std.mem.readInt(u16, bytes[10..12], .little), - .central_directory_size = std.mem.readInt(u32, bytes[12..16], .little), - .central_directory_offset = std.mem.readInt(u32, bytes[16..20], .little), - .comment_len = std.mem.readInt(u16, bytes[20..22], .little), - }; - } - pub fn 
serialize(self: EndOfCentralDirectoryRecord) [22]u8 { - var result: [22]u8 = undefined; - result[0..4].* = end_of_central_directory_sig; - std.mem.writeInt(u16, result[4..6], self.disk_number, .little); - std.mem.writeInt(u16, result[6..8], self.central_directory_disk_number, .little); - std.mem.writeInt(u16, result[8..10], self.record_count_disk, .little); - std.mem.writeInt(u16, result[10..12], self.record_count_total, .little); - std.mem.writeInt(u32, result[12..16], self.central_directory_size, .little); - std.mem.writeInt(u32, result[16..20], self.central_directory_offset, .little); - std.mem.writeInt(u16, result[20..22], self.comment_len, .little); - return result; - } +pub const LocalFileHeader = extern struct { + signature: [4]u8 align(1), + version_needed_to_extract: u16 align(1), + flags: GeneralPurposeFlags align(1), + compression_method: CompressionMethod align(1), + last_modification_time: u16 align(1), + last_modification_date: u16 align(1), + crc32: u32 align(1), + compressed_size: u32 align(1), + uncompressed_size: u32 align(1), + filename_len: u16 align(1), + extra_len: u16 align(1), }; -pub fn findEocdr(file: std.fs.File) ![22]u8 { - // The EOCD record can contain a variable-length comment at the end, - // which makes ZIP file parsing ambiguous in general, since a valid - // comment could contain the bytes of another valid EOCD record. - // Here we just search backwards for the first instance of the EOCD - // signature, and return an error if a valid EOCD record doesn't follow. 
- - // TODO: make this more efficient - // we need a backward_buffered_reader - const file_size = try file.getEndPos(); - - const record_len = 22; - var record: [record_len]u8 = undefined; - if (file_size < record_len) - return error.ZipTruncated; - try file.seekFromEnd(-record_len); - { - const len = try file.readAll(&record); - if (len != record_len) - return error.ZipTruncated; +pub const CentralDirectoryFileHeader = extern struct { + signature: [4]u8 align(1), + version_made_by: u16 align(1), + version_needed_to_extract: u16 align(1), + flags: GeneralPurposeFlags align(1), + compression_method: CompressionMethod align(1), + last_modification_time: u16 align(1), + last_modification_date: u16 align(1), + crc32: u32 align(1), + compressed_size: u32 align(1), + uncompressed_size: u32 align(1), + filename_len: u16 align(1), + extra_len: u16 align(1), + comment_len: u16 align(1), + disk_number: u16 align(1), + internal_file_attributes: u16 align(1), + external_file_attributes: u32 align(1), + local_file_header_offset: u32 align(1), +}; + +pub const EndRecord64 = extern struct { + signature: [4]u8 align(1), + end_record_size: u64 align(1), + version_made_by: u16 align(1), + version_needed_to_extract: u16 align(1), + disk_number: u32 align(1), + central_directory_disk_number: u32 align(1), + record_count_disk: u64 align(1), + record_count_total: u64 align(1), + central_directory_size: u64 align(1), + central_directory_offset: u64 align(1), +}; + +pub const EndLocator64 = extern struct { + signature: [4]u8 align(1), + zip64_disk_count: u32 align(1), + record_file_offset: u64 align(1), + total_disk_count: u32 align(1), +}; + +pub const EndRecord = extern struct { + signature: [4]u8 align(1), + disk_number: u16 align(1), + central_directory_disk_number: u16 align(1), + record_count_disk: u16 align(1), + record_count_total: u16 align(1), + central_directory_size: u32 align(1), + central_directory_offset: u32 align(1), + comment_len: u16 align(1), + pub fn need_zip64(self: 
EndRecord) bool { + return isMaxInt(self.record_count_disk) or + isMaxInt(self.record_count_total) or + isMaxInt(self.central_directory_size) or + isMaxInt(self.central_directory_offset); } +}; + +/// Find and return the end record for the given seekable zip stream. +/// Note that `seekable_stream` must be an instance of `std.io.SeekableStream` and +/// its context must also have a `.reader()` method that returns an instance of +/// `std.io.Reader`. +pub fn findEndRecord(seekable_stream: anytype, stream_len: u64) !EndRecord { + var buf: [@sizeOf(EndRecord) + std.math.maxInt(u16)]u8 = undefined; + const record_len_max = @min(stream_len, buf.len); + var loaded_len: u32 = 0; var comment_len: u16 = 0; while (true) { - if (std.mem.eql(u8, record[0..4], &end_of_central_directory_sig) and - std.mem.readInt(u16, record[20..22], .little) == comment_len) + const record_len: u32 = @as(u32, comment_len) + @sizeOf(EndRecord); + if (record_len > record_len_max) + return error.ZipNoEndRecord; + + if (record_len > loaded_len) { + const new_loaded_len = @min(loaded_len + 300, record_len_max); + const read_len = new_loaded_len - loaded_len; + + try seekable_stream.seekTo(stream_len - @as(u64, new_loaded_len)); + const read_buf: []u8 = buf[buf.len - new_loaded_len ..][0..read_len]; + const len = try seekable_stream.context.reader().readAll(read_buf); + if (len != read_len) + return error.ZipTruncated; + loaded_len = new_loaded_len; + } + + const record_bytes = buf[buf.len - record_len ..][0..@sizeOf(EndRecord)]; + if (std.mem.eql(u8, record_bytes[0..4], &end_record_sig) and + std.mem.readInt(u16, record_bytes[20..22], .little) == comment_len) { - break; + const record: *align(1) EndRecord = @ptrCast(record_bytes.ptr); + if (builtin.target.cpu.arch.endian() != .little) { + std.mem.byteSwapAllFields(@TypeOf(record.*), record); + } + return record.*; } if (comment_len == std.math.maxInt(u16)) - return error.ZipMissingEocdr; - std.mem.copyBackwards(u8, record[1..], record[0 .. 
record.len - 1]); + return error.ZipNoEndRecord; comment_len += 1; - - if (@as(u64, record_len) + @as(u64, comment_len) > file_size) - return error.ZipMissingEocdr; - - try file.seekFromEnd(-record_len - @as(i64, comment_len)); - { - const len = try file.readAll(record[0..1]); - if (len != 1) - return error.ZipTruncated; - } } - return record; -} - -fn LimitedReader(comptime UnderlyingReader: type) type { - return struct { - const Self = @This(); - - underlying_reader: UnderlyingReader, - remaining: usize, - - pub const Error = UnderlyingReader.Error; - pub const Reader = std.io.Reader(*Self, Error, read); - fn read(self: *Self, buffer: []u8) Error!usize { - const next_read_len = @min(buffer.len, self.remaining); - if (next_read_len == 0) return 0; - const len = try self.underlying_reader.read(buffer[0..next_read_len]); - self.remaining -= len; - return len; - } - pub fn reader(self: *Self) Reader { - return Reader{ .context = self }; - } - }; -} -fn limitedReader(reader: anytype, limit: usize) LimitedReader(@TypeOf(reader)) { - return .{ - .underlying_reader = reader, - .remaining = limit, - }; } -/// `decompress` returns the actual CRC-32 of the decompressed bytes, -/// which should be validated against the expected entry.crc32 value. +/// Decompresses the given data from `reader` into `writer`. Stops early if more +/// than `uncompressed_size` bytes are processed and verifies that exactly that +/// number of bytes are decompressed. Returns the CRC-32 of the uncompressed data. /// `writer` can be anything with a `writeAll(self: *Self, chunk: []const u8) anyerror!void` method. 
pub fn decompress( method: CompressionMethod, - uncompressed_size: u32, + uncompressed_size: u64, reader: anytype, writer: anytype, ) !u32 { var hash = std.hash.Crc32.init(); + var total_uncompressed: u64 = 0; switch (method) { .store => { var buf: [std.mem.page_size]u8 = undefined; @@ -254,228 +168,630 @@ pub fn decompress( if (len == 0) break; try writer.writeAll(buf[0..len]); hash.update(buf[0..len]); + total_uncompressed += @intCast(len); } }, - .deflate, .deflate64 => { + .deflate => { var br = std.io.bufferedReader(reader); - var total_uncompressed: u32 = 0; var decompressor = std.compress.flate.decompressor(br.reader()); while (try decompressor.next()) |chunk| { try writer.writeAll(chunk); hash.update(chunk); total_uncompressed += @intCast(chunk.len); + if (total_uncompressed > uncompressed_size) + return error.ZipUncompressSizeTooSmall; } if (br.end != br.start) return error.ZipDeflateTruncated; - if (total_uncompressed != uncompressed_size) - return error.ZipUncompressSizeMismatch; }, _ => return error.UnsupportedCompressionMethod, } + if (total_uncompressed != uncompressed_size) + return error.ZipUncompressSizeMismatch; return hash.final(); } -pub const Iterator = struct { - file: std.fs.File, - eocdr: EndOfCentralDirectoryRecord, - next_central_header_index: u16, - next_central_header_offset: u64, +fn isBadFilename(filename: []const u8) bool { + if (filename.len == 0 or filename[0] == '/') + return true; - pub fn init(file: std.fs.File) !Iterator { - const eocdr = blk: { - const eocdr_bytes = try findEocdr(file); - break :blk EndOfCentralDirectoryRecord.read(eocdr_bytes); - }; + var it = std.mem.splitScalar(u8, filename, '/'); + while (it.next()) |part| { + if (std.mem.eql(u8, part, "..")) + return true; + } - // Don't support multi-disk archives. 
- if (eocdr.disk_number != 0 or - eocdr.central_directory_disk_number != 0 or - eocdr.record_count_disk != eocdr.record_count_total) - { - return error.ZipUnsupportedMultiDisk; - } + return false; +} - return .{ - .file = file, - .eocdr = eocdr, - .next_central_header_offset = 0, - .next_central_header_index = 0, - }; +fn isMaxInt(uint: anytype) bool { + return uint == std.math.maxInt(@TypeOf(uint)); +} + +const FileExtents = struct { + uncompressed_size: u64, + compressed_size: u64, + local_file_header_offset: u64, +}; + +fn readZip64FileExtents(header: CentralDirectoryFileHeader, extents: *FileExtents, data: []u8) !void { + var data_offset: usize = 0; + if (isMaxInt(header.uncompressed_size)) { + if (data_offset + 8 > data.len) + return error.ZipBadCd64Size; + extents.uncompressed_size = std.mem.readInt(u64, data[data_offset..][0..8], .little); + data_offset += 8; + } + if (isMaxInt(header.compressed_size)) { + if (data_offset + 8 > data.len) + return error.ZipBadCd64Size; + extents.compressed_size = std.mem.readInt(u64, data[data_offset..][0..8], .little); + data_offset += 8; + } + if (isMaxInt(header.local_file_header_offset)) { + if (data_offset + 8 > data.len) + return error.ZipBadCd64Size; + extents.local_file_header_offset = std.mem.readInt(u64, data[data_offset..][0..8], .little); + data_offset += 8; + } + if (isMaxInt(header.disk_number)) { + if (data_offset + 4 > data.len) + return error.ZipInvalid; + const disk_number = std.mem.readInt(u32, data[data_offset..][0..4], .little); + if (disk_number != 0) + return error.ZipMultiDiskUnsupported; + data_offset += 4; } + if (data_offset > data.len) + return error.ZipBadCd64Size; +} - pub fn next(self: *Iterator) !?Entry { - if (self.next_central_header_index >= self.eocdr.record_count_total) { - return null; - } +pub fn Iterator(comptime SeekableStream: type) type { + return struct { + stream: SeekableStream, - const header_file_offset: u64 = @as(u64, self.eocdr.central_directory_offset) + 
self.next_central_header_offset; - const header = blk: { - try self.file.seekTo(header_file_offset); - var header: [46]u8 = undefined; - const len = try self.file.readAll(&header); - if (len != header.len) - return error.ZipTruncated; - break :blk CentralDirectoryFileHeader.deserialize(header); - }; - if (!std.mem.eql(u8, &header.signature, &central_file_header_sig)) - return error.ZipHeader; + cd_record_count: u64, + cd_zip_offset: u64, + cd_size: u64, - self.next_central_header_index += 1; - self.next_central_header_offset += 46 + header.filename_len + header.extra_len + header.comment_len; + cd_record_index: u64 = 0, + cd_record_offset: u64 = 0, - if (header.disk_number != 0) - return error.ZipUnsupportedMultiDisk; - return .{ - .header_file_offset = header_file_offset, - .header = header, - }; - } + const Self = @This(); - pub const Entry = struct { - header_file_offset: u64, - header: CentralDirectoryFileHeader, + pub fn init(stream: SeekableStream) !Self { + const stream_len = try stream.getEndPos(); - pub fn extract(self: Entry, zip_file: std.fs.File, filename_buf: []u8, dest: std.fs.Dir) !u32 { - if (filename_buf.len < self.header.filename_len) - return error.ZipInsufficientBuffer; - const filename = filename_buf[0..self.header.filename_len]; + const end_record = try findEndRecord(stream, stream_len); + + if (!isMaxInt(end_record.record_count_disk) and end_record.record_count_disk > end_record.record_count_total) + return error.ZipDiskRecordCountTooLarge; + + if (end_record.disk_number != 0 or end_record.central_directory_disk_number != 0) + return error.ZipMultiDiskUnsupported; - try zip_file.seekTo(self.header_file_offset + 46); { - const len = try zip_file.readAll(filename); - if (len != filename.len) - return error.ZipTruncated; + const counts_valid = !isMaxInt(end_record.record_count_disk) and !isMaxInt(end_record.record_count_total); + if (counts_valid and end_record.record_count_disk != end_record.record_count_total) + return 
error.ZipMultiDiskUnsupported; } - const local_data_header_offset: u64 = local_data_header_offset: { - const local_header = blk: { - try zip_file.seekTo(self.header.local_file_header_offset); - var local_header: [30]u8 = undefined; - const len = try zip_file.readAll(&local_header); - if (len != local_header.len) - return error.ZipTruncated; - break :blk LocalFileHeader.deserialize(local_header); - }; - if (!std.mem.eql(u8, &local_header.signature, &local_file_header_sig)) - return error.ZipHeader; - // TODO: verify minimum_version - // TODO: verify flags - // TODO: verify compression method - // TODO: verify last_mod_time - // TODO: verify last_mod_date - // TODO: verify filename_len and filename? - // TODO: extra? - - if (local_header.crc32 != 0 and local_header.crc32 != self.header.crc32) - return error.ZipRedundancyFail; - if (local_header.compressed_size != 0 and - local_header.compressed_size != self.header.compressed_size) - return error.ZipRedundancyFail; - if (local_header.uncompressed_size != 0 and - local_header.uncompressed_size != self.header.uncompressed_size) - return error.ZipRedundancyFail; - - break :local_data_header_offset @as(u64, local_header.filename_len) + - @as(u64, local_header.extra_len); + var result = Self{ + .stream = stream, + .cd_record_count = end_record.record_count_total, + .cd_zip_offset = end_record.central_directory_offset, + .cd_size = end_record.central_directory_size, }; + if (!end_record.need_zip64()) return result; + + const locator_end_offset: u64 = @as(u64, end_record.comment_len) + @sizeOf(EndRecord) + @sizeOf(EndLocator64); + if (locator_end_offset > stream_len) + return error.ZipTruncated; + try stream.seekTo(stream_len - locator_end_offset); + const locator = try readStructEndian(stream.context.reader(), EndLocator64, .little); + if (!std.mem.eql(u8, &locator.signature, &end_locator64_sig)) + return error.ZipBadLocatorSig; + if (locator.zip64_disk_count != 0) + return error.ZipUnsupportedZip64DiskCount; + if 
(locator.total_disk_count != 1) + return error.ZipMultiDiskUnsupported; + + try stream.seekTo(locator.record_file_offset); + + const record64 = try readStructEndian(stream.context.reader(), EndRecord64, .little); - if (filename.len == 0 or filename[0] == '/') - return error.ZipBadFilename; + if (!std.mem.eql(u8, &record64.signature, &end_record64_sig)) + return error.ZipBadEndRecord64Sig; - // All entries that end in '/' are directories - if (filename[filename.len - 1] == '/') { - if (self.header.uncompressed_size != 0) - return error.ZipBadDirectorySize; - try dest.makePath(filename[0 .. filename.len - 1]); - return std.hash.Crc32.hash(&.{}); + if (record64.end_record_size < @sizeOf(EndRecord64) - 12) + return error.ZipEndRecord64SizeTooSmall; + if (record64.end_record_size > @sizeOf(EndRecord64) - 12) + return error.ZipEndRecord64UnhandledExtraData; + + if (record64.version_needed_to_extract > 45) + return error.ZipUnsupportedVersion; + + { + const is_multidisk = record64.disk_number != 0 or + record64.central_directory_disk_number != 0 or + record64.record_count_disk != record64.record_count_total; + if (is_multidisk) + return error.ZipMultiDiskUnsupported; + } + + if (isMaxInt(end_record.record_count_total)) { + result.cd_record_count = record64.record_count_total; + } else if (end_record.record_count_total != record64.record_count_total) + return error.Zip64RecordCountTotalMismatch; + + if (isMaxInt(end_record.central_directory_offset)) { + result.cd_zip_offset = record64.central_directory_offset; + } else if (end_record.central_directory_offset != record64.central_directory_offset) + return error.Zip64CentralDirectoryOffsetMismatch; + + if (isMaxInt(end_record.central_directory_size)) { + result.cd_size = record64.central_directory_size; + } else if (end_record.central_directory_size != record64.central_directory_size) + return error.Zip64CentralDirectorySizeMismatch; + + return result; + } + + pub fn next(self: *Self) !?Entry { + if (self.cd_record_index == 
self.cd_record_count) { + if (self.cd_record_offset != self.cd_size) + return if (self.cd_size > self.cd_record_offset) + error.ZipCdOversized + else + error.ZipCdUndersized; + + return null; } - const out_file = blk: { - if (std.fs.path.dirname(filename)) |dirname| { - var parent_dir = try dest.makeOpenPath(dirname, .{}); - defer parent_dir.close(); + const header_zip_offset = self.cd_zip_offset + self.cd_record_offset; + try self.stream.seekTo(header_zip_offset); + const header = try readStructEndian(self.stream.context.reader(), CentralDirectoryFileHeader, .little); + if (!std.mem.eql(u8, &header.signature, &central_file_header_sig)) + return error.ZipBadCdOffset; + + self.cd_record_index += 1; + self.cd_record_offset += @sizeOf(CentralDirectoryFileHeader) + header.filename_len + header.extra_len + header.comment_len; + + // Note: checking the version_needed_to_extract doesn't seem to be helpful, i.e. the zip file + // at https://github.com/ninja-build/ninja/releases/download/v1.12.0/ninja-linux.zip + // has an undocumented version 788 but extracts just fine. 
+ + if (header.flags.encrypted) + return error.ZipEncryptionUnsupported; + // TODO: check/verify more flags + if (header.disk_number != 0) + return error.ZipMultiDiskUnsupported; + + var extents: FileExtents = .{ + .uncompressed_size = header.uncompressed_size, + .compressed_size = header.compressed_size, + .local_file_header_offset = header.local_file_header_offset, + }; + + if (header.extra_len > 0) { + var extra_buf: [std.math.maxInt(u16)]u8 = undefined; + const extra = extra_buf[0..header.extra_len]; + + { + try self.stream.seekTo(header_zip_offset + @sizeOf(CentralDirectoryFileHeader) + header.filename_len); + const len = try self.stream.context.reader().readAll(extra); + if (len != extra.len) + return error.ZipTruncated; + } - const basename = std.fs.path.basename(filename); - break :blk try parent_dir.createFile(basename, .{ .exclusive = true }); + var extra_offset: usize = 0; + while (extra_offset + 4 <= extra.len) { + const header_id = std.mem.readInt(u16, extra[extra_offset..][0..2], .little); + const data_size = std.mem.readInt(u16, extra[extra_offset..][2..4], .little); + const end = extra_offset + 4 + data_size; + if (end > extra.len) + return error.ZipBadExtraFieldSize; + const data = extra[extra_offset + 4 .. 
end]; + switch (@as(ExtraHeader, @enumFromInt(header_id))) { + .zip64_info => try readZip64FileExtents(header, &extents, data), + else => {}, // ignore + } + extra_offset = end; } - break :blk try dest.createFile(filename, .{ .exclusive = true }); + } + + return .{ + .version_needed_to_extract = header.version_needed_to_extract, + .flags = header.flags, + .compression_method = header.compression_method, + .last_modification_time = header.last_modification_time, + .last_modification_date = header.last_modification_date, + .header_zip_offset = header_zip_offset, + .crc32 = header.crc32, + .filename_len = header.filename_len, + .compressed_size = extents.compressed_size, + .uncompressed_size = extents.uncompressed_size, + .file_offset = extents.local_file_header_offset, }; - defer out_file.close(); - const local_data_file_offset: u64 = - @as(u64, self.header.local_file_header_offset) + - @as(u64, 30) + - local_data_header_offset; - try zip_file.seekTo(local_data_file_offset); - var limited_reader = limitedReader(zip_file.reader(), self.header.compressed_size); - const crc = try decompress( - self.header.compression_method, - self.header.uncompressed_size, - limited_reader.reader(), - out_file.writer(), - ); - if (limited_reader.remaining != 0) - return error.ZipDecompressTruncated; - return crc; } + + pub const Entry = struct { + version_needed_to_extract: u16, + flags: GeneralPurposeFlags, + compression_method: CompressionMethod, + last_modification_time: u16, + last_modification_date: u16, + header_zip_offset: u64, + crc32: u32, + filename_len: u32, + compressed_size: u64, + uncompressed_size: u64, + file_offset: u64, + + pub fn extract( + self: Entry, + stream: SeekableStream, + options: ExtractOptions, + filename_buf: []u8, + dest: std.fs.Dir, + ) !u32 { + if (filename_buf.len < self.filename_len) + return error.ZipInsufficientBuffer; + const filename = filename_buf[0..self.filename_len]; + + try stream.seekTo(self.header_zip_offset + 
@sizeOf(CentralDirectoryFileHeader)); + + { + const len = try stream.context.reader().readAll(filename); + if (len != filename.len) + return error.ZipBadFileOffset; + } + + const local_data_header_offset: u64 = local_data_header_offset: { + const local_header = blk: { + try stream.seekTo(self.file_offset); + break :blk try readStructEndian(stream.context.reader(), LocalFileHeader, .little); + }; + if (!std.mem.eql(u8, &local_header.signature, &local_file_header_sig)) + return error.ZipBadFileOffset; + if (local_header.version_needed_to_extract != self.version_needed_to_extract) + return error.ZipMismatchVersionNeeded; + if (local_header.last_modification_time != self.last_modification_time) + return error.ZipMismatchModTime; + if (local_header.last_modification_date != self.last_modification_date) + return error.ZipMismatchModDate; + + if (@as(u16, @bitCast(local_header.flags)) != @as(u16, @bitCast(self.flags))) + return error.ZipMismatchFlags; + if (local_header.crc32 != 0 and local_header.crc32 != self.crc32) + return error.ZipMismatchCrc32; + if (local_header.compressed_size != 0 and + local_header.compressed_size != self.compressed_size) + return error.ZipMismatchCompLen; + if (local_header.uncompressed_size != 0 and + local_header.uncompressed_size != self.uncompressed_size) + return error.ZipMismatchUncompLen; + if (local_header.filename_len != self.filename_len) + return error.ZipMismatchFilenameLen; + + break :local_data_header_offset @as(u64, local_header.filename_len) + + @as(u64, local_header.extra_len); + }; + + if (isBadFilename(filename)) + return error.ZipBadFilename; + + if (options.allow_backslashes) { + std.mem.replaceScalar(u8, filename, '\\', '/'); + } else { + if (std.mem.indexOfScalar(u8, filename, '\\')) |_| + return error.ZipFilenameHasBackslash; + } + + // All entries that end in '/' are directories + if (filename[filename.len - 1] == '/') { + if (self.uncompressed_size != 0) + return error.ZipBadDirectorySize; + try 
dest.makePath(filename[0 .. filename.len - 1]); + return std.hash.Crc32.hash(&.{}); + } + + const out_file = blk: { + if (std.fs.path.dirname(filename)) |dirname| { + var parent_dir = try dest.makeOpenPath(dirname, .{}); + defer parent_dir.close(); + + const basename = std.fs.path.basename(filename); + break :blk try parent_dir.createFile(basename, .{ .exclusive = true }); + } + break :blk try dest.createFile(filename, .{ .exclusive = true }); + }; + defer out_file.close(); + const local_data_file_offset: u64 = + @as(u64, self.file_offset) + + @as(u64, @sizeOf(LocalFileHeader)) + + local_data_header_offset; + try stream.seekTo(local_data_file_offset); + var limited_reader = std.io.limitedReader(stream.context.reader(), self.compressed_size); + const crc = try decompress( + self.compression_method, + self.uncompressed_size, + limited_reader.reader(), + out_file.writer(), + ); + if (limited_reader.bytes_left != 0) + return error.ZipDecompressTruncated; + return crc; + } + }; }; +} + +// returns true if `filename` starts with `root` followed by a forward slash +fn filenameInRoot(filename: []const u8, root: []const u8) bool { + return (filename.len >= root.len + 1) and + (filename[root.len] == '/') and + std.mem.eql(u8, filename[0..root.len], root); +} + +pub const Diagnostics = struct { + allocator: std.mem.Allocator, + + /// The common root directory for all extracted files if there is one. + root_dir: []const u8 = "", + + saw_first_file: bool = false, + + pub fn deinit(self: *Diagnostics) void { + self.allocator.free(self.root_dir); + self.* = undefined; + } + + // This function assumes name is a filename from a zip file which has already been verified to + // not start with a slash, backslashes have been normalized to forward slashes, and directories + // always end in a slash. 
+ pub fn nextFilename(self: *Diagnostics, name: []const u8) error{OutOfMemory}!void { + if (!self.saw_first_file) { + self.saw_first_file = true; + std.debug.assert(self.root_dir.len == 0); + const root_len = std.mem.indexOfScalar(u8, name, '/') orelse return; + std.debug.assert(root_len > 0); + self.root_dir = try self.allocator.dupe(u8, name[0..root_len]); + } else if (self.root_dir.len > 0) { + if (!filenameInRoot(name, self.root_dir)) { + self.allocator.free(self.root_dir); + self.root_dir = ""; + } + } + } }; -pub fn pipeToFileSystem(dest: std.fs.Dir, file: std.fs.File) !void { - var iter = try Iterator.init(file); +pub const ExtractOptions = struct { + /// Allow filenames within the zip to use backslashes. Backslashes are normalized + /// to forward slashes before forwarding them to platform APIs. + allow_backslashes: bool = false, + + diagnostics: ?*Diagnostics = null, +}; + +/// Extract the zipped files inside `seekable_stream` to the given `dest` directory. +/// Note that `seekable_stream` must be an instance of `std.io.SeekableStream` and +/// its context must also have a `.reader()` method that returns an instance of +/// `std.io.Reader`. 
+pub fn extract(dest: std.fs.Dir, seekable_stream: anytype, options: ExtractOptions) !void { + const SeekableStream = @TypeOf(seekable_stream); + var iter = try Iterator(SeekableStream).init(seekable_stream); var filename_buf: [std.fs.MAX_PATH_BYTES]u8 = undefined; while (try iter.next()) |entry| { - const crc32 = try entry.extract(file, &filename_buf, dest); - if (crc32 != entry.header.crc32) + const crc32 = try entry.extract(seekable_stream, options, &filename_buf, dest); + if (crc32 != entry.crc32) return error.ZipCrcMismatch; + if (options.diagnostics) |d| { + try d.nextFilename(filename_buf[0..entry.filename_len]); + } } } -fn testZip(comptime files: []const File) !void { - var cache: [files.len]FileCache = undefined; - try testZipWithCache(files, &cache); +fn testZip(options: ExtractOptions, comptime files: []const File, write_opt: testutil.WriteZipOptions) !void { + var store: [files.len]FileStore = undefined; + try testZipWithStore(options, files, write_opt, &store); } -fn testZipWithCache(files: []const File, cache: []FileCache) !void { +fn testZipWithStore( + options: ExtractOptions, + test_files: []const File, + write_opt: testutil.WriteZipOptions, + store: []FileStore, +) !void { + var zip_buf: [4096]u8 = undefined; + var fbs = try testutil.makeZipWithStore(&zip_buf, test_files, write_opt, store); + var tmp = testing.tmpDir(.{ .no_follow = true }); defer tmp.cleanup(); - const dir = tmp.dir; - - { - var file = try dir.createFile("zip", .{}); - defer file.close(); - try writeFile(file, files, cache); - } - - var zip_file = try dir.openFile("zip", .{}); - defer zip_file.close(); - try pipeToFileSystem(dir, zip_file); - - for (files) |test_file| { - var file = try dir.openFile(test_file.name, .{}); - defer file.close(); - var buf: [4096]u8 = undefined; - const n = try file.reader().readAll(&buf); - try testing.expectEqualStrings(test_file.content, buf[0..n]); - } + try extract(tmp.dir, fbs.seekableStream(), options); + try testutil.expectFiles(test_files, 
tmp.dir, .{}); +} +fn testZipError(expected_error: anyerror, file: File, options: ExtractOptions) !void { + var zip_buf: [4096]u8 = undefined; + var store: [1]FileStore = undefined; + var fbs = try testutil.makeZipWithStore(&zip_buf, &[_]File{file}, .{}, &store); + var tmp = testing.tmpDir(.{ .no_follow = true }); + defer tmp.cleanup(); + try testing.expectError(expected_error, extract(tmp.dir, fbs.seekableStream(), options)); } test "zip one file" { - try testZip(&[_]File{ + try testZip(.{}, &[_]File{ .{ .name = "onefile.txt", .content = "Just a single file\n", .compression = .store }, - }); + }, .{}); } test "zip multiple files" { - try testZip(&[_]File{ + try testZip(.{ .allow_backslashes = true }, &[_]File{ .{ .name = "foo", .content = "a foo file\n", .compression = .store }, .{ .name = "subdir/bar", .content = "bar is this right?\nanother newline\n", .compression = .store }, + .{ .name = "subdir\\whoa", .content = "you can do backslashes", .compression = .store }, .{ .name = "subdir/another/baz", .content = "bazzy mc bazzerson", .compression = .store }, - }); + }, .{}); } test "zip deflated" { - try testZip(&[_]File{ + try testZip(.{}, &[_]File{ .{ .name = "deflateme", .content = "This is a deflated file.\nIt should be smaller in the Zip file1\n", .compression = .deflate }, - .{ .name = "deflateme64", .content = "The 64k version of deflate!\n", .compression = .deflate64 }, + // TODO: re-enable this if/when we add support for deflate64 + //.{ .name = "deflateme64", .content = "The 64k version of deflate!\n", .compression = .deflate64 }, .{ .name = "raw", .content = "Not all files need to be deflated in the same Zip.\n", .compression = .store }, + }, .{}); +} +test "zip verify filenames" { + // no empty filenames + try testZipError(error.ZipBadFilename, .{ .name = "", .content = "", .compression = .store }, .{}); + // no absolute paths + try testZipError(error.ZipBadFilename, .{ .name = "/", .content = "", .compression = .store }, .{}); + try 
testZipError(error.ZipBadFilename, .{ .name = "/foo", .content = "", .compression = .store }, .{}); + try testZipError(error.ZipBadFilename, .{ .name = "/foo/bar", .content = "", .compression = .store }, .{}); + // no '..' components + try testZipError(error.ZipBadFilename, .{ .name = "..", .content = "", .compression = .store }, .{}); + try testZipError(error.ZipBadFilename, .{ .name = "foo/..", .content = "", .compression = .store }, .{}); + try testZipError(error.ZipBadFilename, .{ .name = "foo/bar/..", .content = "", .compression = .store }, .{}); + try testZipError(error.ZipBadFilename, .{ .name = "foo/bar/../", .content = "", .compression = .store }, .{}); + // no backslashes + try testZipError(error.ZipFilenameHasBackslash, .{ .name = "foo\\bar", .content = "", .compression = .store }, .{}); +} + +test "zip64" { + const test_files = [_]File{ + .{ .name = "fram", .content = "fram foo fro fraba", .compression = .store }, + .{ .name = "subdir/barro", .content = "aljdk;jal;jfd;lajkf", .compression = .store }, + }; + + try testZip(.{}, &test_files, .{ + .end = .{ + .zip64 = .{}, + .record_count_disk = std.math.maxInt(u16), // trigger zip64 + }, }); + try testZip(.{}, &test_files, .{ + .end = .{ + .zip64 = .{}, + .record_count_total = std.math.maxInt(u16), // trigger zip64 + }, + }); + try testZip(.{}, &test_files, .{ + .end = .{ + .zip64 = .{}, + .record_count_disk = std.math.maxInt(u16), // trigger zip64 + .record_count_total = std.math.maxInt(u16), // trigger zip64 + }, + }); + try testZip(.{}, &test_files, .{ + .end = .{ + .zip64 = .{}, + .central_directory_size = std.math.maxInt(u32), // trigger zip64 + }, + }); + try testZip(.{}, &test_files, .{ + .end = .{ + .zip64 = .{}, + .central_directory_offset = std.math.maxInt(u32), // trigger zip64 + }, + }); +} + +test "bad zip files" { + var tmp = testing.tmpDir(.{ .no_follow = true }); + defer tmp.cleanup(); + var zip_buf: [4096]u8 = undefined; + + const file_a = [_]File{.{ .name = "a", .content = "", 
.compression = .store }}; + + { + var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .sig = [_]u8{ 1, 2, 3, 4 } } }); + try testing.expectError(error.ZipNoEndRecord, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .comment_len = 1 } }); + try testing.expectError(error.ZipNoEndRecord, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .comment = "a", .comment_len = 0 } }); + try testing.expectError(error.ZipNoEndRecord, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .disk_number = 1 } }); + try testing.expectError(error.ZipMultiDiskUnsupported, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .central_directory_disk_number = 1 } }); + try testing.expectError(error.ZipMultiDiskUnsupported, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .record_count_disk = 1 } }); + try testing.expectError(error.ZipDiskRecordCountTooLarge, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .central_directory_size = 1 } }); + try testing.expectError(error.ZipCdOversized, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &file_a, .{ .end = .{ .central_directory_size = 0 } }); + try testing.expectError(error.ZipCdUndersized, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &file_a, .{ .end = .{ .central_directory_offset = 0 } }); + try testing.expectError(error.ZipBadCdOffset, extract(tmp.dir, fbs.seekableStream(), .{})); + } + { + var fbs = try testutil.makeZip(&zip_buf, &file_a, .{ + .end = .{ + .zip64 = .{ .locator_sig = [_]u8{ 1, 2, 3, 4 } }, + .central_directory_size = 
std.math.maxInt(u32), // trigger 64 + }, + }); + try testing.expectError(error.ZipBadLocatorSig, extract(tmp.dir, fbs.seekableStream(), .{})); + } +} + +const native_endian = @import("builtin").target.cpu.arch.endian(); +pub fn readStructEndian(reader: anytype, comptime T: type, endian: std.builtin.Endian) anyerror!T { + var res = try reader.readStruct(T); + if (native_endian != endian) { + byteSwapAllFields(T, &res); + } + return res; +} +pub fn byteSwapAllFields(comptime S: type, ptr: *S) void { + switch (@typeInfo(S)) { + .Struct => { + inline for (std.meta.fields(S)) |f| { + switch (@typeInfo(f.type)) { + .Struct => |struct_info| if (struct_info.backing_integer) |Int| { + @field(ptr, f.name) = @bitCast(@byteSwap(@as(Int, @bitCast(@field(ptr, f.name))))); + } else { + byteSwapAllFields(f.type, &@field(ptr, f.name)); + }, + .Array => byteSwapAllFields(f.type, &@field(ptr, f.name)), + .Enum => { + @field(ptr, f.name) = @enumFromInt(@byteSwap(@intFromEnum(@field(ptr, f.name)))); + }, + else => { + @field(ptr, f.name) = @byteSwap(@field(ptr, f.name)); + }, + } + } + }, + .Array => { + for (ptr) |*item| { + switch (@typeInfo(@TypeOf(item.*))) { + .Struct, .Array => byteSwapAllFields(@TypeOf(item.*), item), + .Enum => { + item.* = @enumFromInt(@byteSwap(@intFromEnum(item.*))); + }, + else => { + item.* = @byteSwap(item.*); + }, + } + } + }, + else => @compileError("byteSwapAllFields expects a struct or array as the first argument"), + } }