-
-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add string parsing + fuzzed string test set
I followed mostly the same procedure outlined here: https://www.ryanliptak.com/blog/fuzzing-as-test-case-generator/ but used a combination of Zua and fuzzing-lua to ultimately create the sets of inputs/outputs. - First, a giant corpus (16k+) of fuzzed string literals was created by iterating through all the fuzzed lexer inputs and outputting the source of every <string> token to a separate file (see test/fuzz_strings_gen.zig). This step would be difficult to do with Lua's API because strings are parsed as they are lexed, meaning any relationship to the original source is lost once the token is parsed. - Then, I used libFuzzer and fuzzing-lua to minimize the string corpus (via the -merge=1 flag). - Then, I used Lua to generate corresponding output files containing the parsed version of each input string (this code will be committed to fuzzing-lua once I clean it up). Kind of convoluted, but it ended up working well--there were a lot of bugs in my initial string parsing implementation that the fuzzed set allowed me to find.
- Loading branch information
Showing
176 changed files
with
503 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
const std = @import("std"); | ||
const lex = @import("lex.zig"); | ||
|
||
// Notes: | ||
// | ||
// Lua parser always parses into a function (called the 'main' function) which | ||
// is always varargs (the values in the varargs differs depending on Lua version) | ||
|
||
pub const Parser = struct {
    /// Parses the raw source of a string literal token (delimiters included)
    /// into the string's actual value, writing the result into dest_buf.
    /// Returns the slice of dest_buf that was written to.
    ///
    /// Because the lexer has already validated that strings don't contain
    /// any invalid characters, this function can be implemented without
    /// the possibility of failure. Any failures are a bug in the lexer.
    ///
    /// dest_buf must be at least as big as source to ensure it is large enough
    /// to hold the parsed string (parsing never makes a string longer than
    /// its source).
    /// TODO: should this function be part of lex.Token instead?
    pub fn parseStringLiteral(source_raw: []const u8, dest_buf: []u8) []u8 {
        std.debug.assert(dest_buf.len >= source_raw.len);
        var source: []const u8 = source_raw[0..];

        // trim the start/end delimiters
        var delim_len: usize = undefined;
        var skip_first_char: bool = false;
        switch (source[0]) {
            '\'', '"' => delim_len = 1,
            '[' => {
                // long-bracket string: '[' then zero or more '=' then '['
                var num_sep: usize = 0;
                while (source[1 + num_sep] == '=') : (num_sep += 1) {}
                std.debug.assert(source[1 + num_sep] == '[');
                delim_len = 2 + num_sep;
                // if the first char of a long string is a newline char, it gets skipped
                skip_first_char = source[delim_len] == '\r' or source[delim_len] == '\n';
                // NOTE(review): escape sequences are still interpreted below even
                // for long strings, but Lua does not process escapes inside
                // [[...]] -- confirm against the Lua-generated output corpus.
            },
            else => unreachable,
        }
        source = source[delim_len .. source.len - delim_len];
        if (skip_first_char) source = source[1..];

        // like std.io.SliceOutStream but no need to check bounds of slice
        // and can only append 1 character at a time (also doesn't implement Stream)
        const SliceWriter = struct {
            const Self = @This();

            pos: usize = 0,
            slice: []u8,

            fn write(self: *Self, char: u8) void {
                self.slice[self.pos] = char;
                self.pos += 1;
            }

            fn getWritten(self: Self) []u8 {
                return self.slice[0..self.pos];
            }
        };

        const State = enum {
            Normal,
            Escaped,
            EscapedNumerals,
            EscapedLineEndings,
        };

        var writer = SliceWriter{ .slice = dest_buf };

        // accumulator and digit count for \ddd decimal escapes (max 3 digits)
        var string_escape_n: u8 = 0;
        var string_escape_i: std.math.IntFittingRange(0, 3) = 0;
        var state: State = State.Normal;
        var index: usize = 0;
        while (index < source.len) : (index += 1) {
            const c = source[index];
            switch (state) {
                State.Normal => switch (c) {
                    // Lua's string parser transforms all \r to \n
                    '\r' => writer.write('\n'),
                    '\\' => state = State.Escaped,
                    else => writer.write(c),
                },
                State.Escaped => switch (c) {
                    '0'...'9' => {
                        // start of a \ddd decimal escape
                        string_escape_n = c - '0';
                        string_escape_i = 1;
                        state = State.EscapedNumerals;
                    },
                    '\r', '\n' => {
                        // escaped \r and \n get transformed to \n
                        writer.write('\n');
                        state = State.EscapedLineEndings;
                    },
                    else => {
                        switch (c) {
                            'a' => writer.write('\x07'),
                            'b' => writer.write('\x08'),
                            'f' => writer.write('\x0C'),
                            'n' => writer.write('\n'),
                            'r' => writer.write('\r'),
                            't' => writer.write('\t'),
                            'v' => writer.write('\x0B'),
                            // any other escaped char is taken literally (\\ \" \' etc)
                            else => writer.write(c),
                        }
                        state = State.Normal;
                    },
                },
                State.EscapedNumerals => switch (c) {
                    '0'...'9' => {
                        // assumes the lexer rejects \ddd escapes > 255, so this
                        // u8 arithmetic cannot overflow -- TODO confirm in lex.zig
                        string_escape_n = 10 * string_escape_n + (c - '0');
                        string_escape_i += 1;
                        if (string_escape_i == 3) {
                            // \ddd escapes are at most 3 digits long
                            writer.write(string_escape_n);
                            state = State.Normal;
                        }
                    },
                    else => {
                        writer.write(string_escape_n);
                        // backtrack so that we handle the current char properly
                        index -= 1;
                        state = State.Normal;
                    },
                },
                State.EscapedLineEndings => switch (c) {
                    '\r', '\n' => {
                        // the second char of a \r\n or \n\r pair is consumed
                        state = State.Normal;
                    },
                    else => {
                        // backtrack so that we handle the current char properly
                        index -= 1;
                        state = State.Normal;
                    },
                },
            }
        }
        // we could be in a state that still needs processing here,
        // since we could have hit the end of the string while unsure
        // if a \ddd pattern was finished
        switch (state) {
            State.EscapedNumerals => {
                writer.write(string_escape_n);
            },
            State.Normal,
            State.EscapedLineEndings,
            => {},
            // ending in State.Escaped would mean a dangling backslash,
            // which the lexer should have rejected
            else => unreachable,
        }

        return writer.getWritten();
    }
};
|
||
test "parseStringLiteral" {
    // scratch space for parsed output; large enough for every case below
    var buffer: [100]u8 = undefined;

    // each delimiter style yields the same contents
    std.testing.expectEqualSlices(u8, "hello", Parser.parseStringLiteral("'hello'", buffer[0..]));
    std.testing.expectEqualSlices(u8, "hello", Parser.parseStringLiteral("\"hello\"", buffer[0..]));
    std.testing.expectEqualSlices(u8, "hello", Parser.parseStringLiteral("[[hello]]", buffer[0..]));
    std.testing.expectEqualSlices(u8, "hello", Parser.parseStringLiteral("[=[hello]=]", buffer[0..]));
    std.testing.expectEqualSlices(u8, "hello", Parser.parseStringLiteral("[===[hello]===]", buffer[0..]));
    std.testing.expectEqualSlices(u8, "\\ \n \x0B", Parser.parseStringLiteral("'\\\\ \\n \\v'", buffer[0..]));

    // long strings skip initial newline
    std.testing.expectEqualSlices(u8, "hello", Parser.parseStringLiteral("[[\nhello]]", buffer[0..]));
    std.testing.expectEqualSlices(u8, "\nhello", Parser.parseStringLiteral("[[\r\rhello]]", buffer[0..]));

    // escaped \r gets transformed into \n
    std.testing.expectEqualSlices(u8, "\n", Parser.parseStringLiteral("\"\\\r\"", buffer[0..]));

    // escaped newlines and newline pairs
    std.testing.expectEqualSlices(u8, "\n\\ ", Parser.parseStringLiteral("\"\\\r\\\\ \"", buffer[0..]));
    std.testing.expectEqualSlices(u8, "\n\\ ", Parser.parseStringLiteral("\"\\\r\n\\\\ \"", buffer[0..]));
    std.testing.expectEqualSlices(u8, "\n", Parser.parseStringLiteral("\"\\\n\r\"", buffer[0..]));

    // escaped numerals
    std.testing.expectEqualSlices(u8, "\x01-\x02", Parser.parseStringLiteral("\"\\1-\\2\"", buffer[0..]));
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,12 @@ | ||
const std = @import("std"); | ||
|
||
pub const lex = @import("lex.zig"); | ||
pub const parse = @import("parse.zig"); | ||
|
||
// Entry point is currently a no-op; the project is exercised through its
// `zig test` blocks rather than a runnable executable.
pub fn main() void {
}
|
||
// Referencing the sub-modules here pulls their `test` blocks into this
// root file's test build, so `zig test` on this file runs everything.
test "zua" {
    _ = @import("lex.zig");
    _ = @import("parse.zig");
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
const std = @import("std"); | ||
const zua = @import("zua"); | ||
const lex = zua.lex; | ||
const parse = zua.parse; | ||
|
||
// Tests for comparing parsed strings between Zua and Lua. | ||
// Expects @import("build_options").fuzz_strings_inputs_dir to be a path to | ||
// a directory containing a corpus of inputs to test and | ||
// @import("build_options").fuzz_strings_outputs_dir to be a path to a | ||
// directory containing the corresponding expected string after | ||
// parsing. | ||
// | ||
// A usable inputs/outputs pair can be obtained from | ||
// https://github.com/squeek502/fuzzing-lua | ||
|
||
const verboseTestPrinting = false; | ||
|
||
const build_options = @import("build_options"); | ||
const inputs_dir_opt = build_options.fuzz_strings_inputs_dir; | ||
const outputs_dir_opt = build_options.fuzz_strings_outputs_dir; | ||
|
||
// For every file in the inputs dir, lex it, parse each <string> token with
// Zua's parseStringLiteral, and compare against the Lua-generated expected
// output stored under the same basename in the outputs dir.
test "string input/output pairs" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_allocator.deinit();
    var allocator = &arena_allocator.allocator;

    // resolve these now since Zig's std lib on Windows rejects paths with / as the path sep
    const inputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{inputs_dir_opt});
    const outputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{outputs_dir_opt});

    var walker = try std.fs.walkPath(allocator, inputs_dir);
    defer walker.deinit();
    // reused to build "<outputs_dir><sep><basename>" for each input file
    var path_buffer = try std.Buffer.init(allocator, outputs_dir);
    defer path_buffer.deinit();

    var n: usize = 0;
    while (try walker.next()) |entry| {
        if (verboseTestPrinting) {
            std.debug.warn("\n{}\n", .{entry.basename});
        }
        const contents = try entry.dir.readFileAlloc(allocator, entry.basename, std.math.maxInt(usize));
        defer allocator.free(contents);

        path_buffer.shrink(outputs_dir.len);
        try path_buffer.appendByte(std.fs.path.sep);
        try path_buffer.append(entry.basename);
        const expectedContents = try std.io.readFileAlloc(allocator, path_buffer.toSliceConst());
        defer allocator.free(expectedContents);

        var lexer = lex.DefaultLexer.init(contents);
        while (true) {
            // a lexer error just ends tokenization of this input early;
            // lexer error behavior is covered by the lexer's own fuzz tests
            const token = lexer.next() catch break;
            if (token.id == lex.Token.Id.Eof) break;
            if (token.id != lex.Token.Id.String) continue;

            const string_source = contents[token.start..token.end];
            var buf = try allocator.alloc(u8, string_source.len);
            defer allocator.free(buf);
            const parsed = parse.Parser.parseStringLiteral(string_source, buf);
            if (verboseTestPrinting) {
                std.debug.warn("got\n{x}\n", .{parsed});
                std.debug.warn("expected\n{x}\n", .{expectedContents});
            }
            std.testing.expectEqualSlices(u8, expectedContents, parsed);
        }
        n += 1;
    }
    std.debug.warn("{} input/output pairs checked...", .{n});
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
const std = @import("std"); | ||
const lex = @import("zua").lex; | ||
|
||
// Code for generating a potentially huge collection of | ||
// files containing the source of every string literal token | ||
// in the corpus provided in @import("build_options").fuzz_lex_inputs_dir | ||
// and outputting them to @import("build_options").fuzz_strings_gen_dir | ||
// | ||
// This is a building block for use later with fuzz_strings.zig, | ||
// after minimizing/generating outputs with https://github.com/squeek502/fuzzing-lua | ||
|
||
const build_options = @import("build_options"); | ||
const inputs_dir_opt = build_options.fuzz_lex_inputs_dir; | ||
const outputs_dir_opt = build_options.fuzz_strings_gen_dir; | ||
|
||
/// Walks every corpus file in the inputs dir, lexes it, and writes the raw
/// source of each <string> token (delimiters included) to a numbered file
/// in the outputs dir. The outputs dir is recreated from scratch each run.
pub fn main() !void {
    var arena_allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_allocator.deinit();
    var allocator = &arena_allocator.allocator;

    // resolve these now since Zig's std lib on Windows rejects paths with / as the path sep
    const inputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{inputs_dir_opt});
    const outputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{outputs_dir_opt});

    // clean the outputs dir
    std.fs.deleteTree(outputs_dir) catch |err| switch (err) {
        // already absent is fine; anything else is fatal
        error.FileNotFound => {},
        else => |e| return e,
    };
    try std.fs.makePath(allocator, outputs_dir);

    var walker = try std.fs.walkPath(allocator, inputs_dir);
    defer walker.deinit();
    // reused to build "<outputs_dir><sep><n>" for each string written
    var path_buffer = try std.Buffer.init(allocator, outputs_dir);
    defer path_buffer.deinit();

    var n: usize = 0;
    while (try walker.next()) |entry| {
        const contents = try entry.dir.readFileAlloc(allocator, entry.basename, std.math.maxInt(usize));
        defer allocator.free(contents);

        var lexer = lex.DefaultLexer.init(contents);
        while (true) {
            // a lexer error just ends tokenization of this input early
            const token = lexer.next() catch break;
            if (token.id == lex.Token.Id.Eof) break;
            if (token.id != lex.Token.Id.String) continue;

            path_buffer.shrink(outputs_dir.len);
            try path_buffer.appendByte(std.fs.path.sep);
            var buffer_out_stream = std.io.BufferOutStream.init(&path_buffer);
            try buffer_out_stream.stream.print("{}", .{n});

            try std.io.writeFile(path_buffer.toSliceConst(), contents[token.start..token.end]);

            n += 1;
            if (n % 100 == 0) {
                std.debug.warn("{}...\r", .{n});
            }
        }
    }
    std.debug.warn("{} files written to '{}'\n", .{ n, outputs_dir });
}
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[[[�]Hk*[�]] |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"\\\tr\tr\tr\t" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"Ea*G" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
[[ | ||
|
||
|
||
]] |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"\1-\2" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[[e�]] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"\"" |
Binary file not shown.
Oops, something went wrong.