From 3a38a57803439ff06eda3c64ec3acfbaadb6cb41 Mon Sep 17 00:00:00 2001 From: fuzzy-boiii23a Date: Mon, 16 Oct 2023 04:10:22 -0700 Subject: [PATCH 01/14] added fuzzing support --- gumbo-parser/.gitignore | 2 ++ gumbo-parser/Makefile | 12 +++++++++ gumbo-parser/fuzzer/build.sh | 41 +++++++++++++++++++++++++++++ gumbo-parser/fuzzer/parse_fuzzer.cc | 40 ++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+) create mode 100755 gumbo-parser/fuzzer/build.sh create mode 100644 gumbo-parser/fuzzer/parse_fuzzer.cc diff --git a/gumbo-parser/.gitignore b/gumbo-parser/.gitignore index 3d04bd296b..37a46fc2b3 100644 --- a/gumbo-parser/.gitignore +++ b/gumbo-parser/.gitignore @@ -1,3 +1,5 @@ build googletest src/*.o +fuzzer/build +src/libgumbo.a \ No newline at end of file diff --git a/gumbo-parser/Makefile b/gumbo-parser/Makefile index c2d0721344..7ffb74256d 100644 --- a/gumbo-parser/Makefile +++ b/gumbo-parser/Makefile @@ -13,6 +13,17 @@ LDFLAGS := -pthread all: check +fuzzing: fuzzer-normal fuzzer-asan fuzzer-ubsan + +fuzzer-normal: + cd fuzzer && ./build.sh && cd - + +fuzzer-asan: + cd fuzzer && SANITIZER=address ./build.sh && cd - + +fuzzer-ubsan: + cd fuzzer && SANITIZER=undefined ./build.sh && cd - + # don't try to regenerate ragel or gperf files in CI, that should be a development-only action and # the generated files should be committed to SCM ifneq ($(CI),true) @@ -81,6 +92,7 @@ coverage: clean: $(RM) -r build + $(RM) -r fuzzer/build build/src/flags: | build/src @echo 'old_CC := $(CC)' > $@ diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh new file mode 100755 index 0000000000..68a5ba0370 --- /dev/null +++ b/gumbo-parser/fuzzer/build.sh @@ -0,0 +1,41 @@ +export SANITIZER_OPTS="" +export SANITIZER_LINK="" + +if [ -z "${LLVM_CONFIG}" ] +then + echo '$LLVM_CONFIG has not been configured, expecting "export LLVM_CONFIG=/usr/bin/llvm-config-12" assuming clang-12 is installed, however any clang version works' + exit +fi + +if [ ! 
-d "build" ] +then + mkdir build +fi + +export CC="$(llvm-config-12 --bindir)/clang" +export CXX="$(llvm-config-12 --bindir)/clang++" +export CXXFLAGS="-fsanitize=fuzzer-no-link" +export CFLAGS="-fsanitize=fuzzer-no-link" +export ENGINE_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.fuzzer-x86_64.a | head -1)" + +if [ "$SANITIZER" = "undefined" ] +then + export SANITIZER_OPTS="-fsanitize=undefined" + export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.ubsan_standalone_cxx-x86_64.a | head -1)" +fi +if [ "$SANITIZER" = "address" ] +then + export SANITIZER_OPTS="-fsanitize=address" + export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.asan_cxx-x86_64.a | head -1)" +fi + +export CXXFLAGS="-O0 $CXXFLAGS $SANITIZER_OPTS" +export CFLAGS="-O0 $CFLAGS $SANITIZER_OPTS" +cd ../src && make clean && make && cd - + +if [ -z "${SANITIZER}" ] +then + $CXX $CXXFLAGS -o build/parse_fuzzer parse_fuzzer.cc ../src/libgumbo.a $ENGINE_LINK $SANITIZER_LINK +else + $CXX $CXXFLAGS -o build/parse_fuzzer-$SANITIZER parse_fuzzer.cc ../src/libgumbo.a $ENGINE_LINK $SANITIZER_LINK +fi \ No newline at end of file diff --git a/gumbo-parser/fuzzer/parse_fuzzer.cc b/gumbo-parser/fuzzer/parse_fuzzer.cc new file mode 100644 index 0000000000..690996277e --- /dev/null +++ b/gumbo-parser/fuzzer/parse_fuzzer.cc @@ -0,0 +1,40 @@ +#include "../src/nokogiri_gumbo.h" +#include + +void SanityCheckPointers( + const char* input, size_t input_length, const GumboNode* node, int depth) { + if (node->type == GUMBO_NODE_DOCUMENT || depth > 400) { + return; + } + if (node->type == GUMBO_NODE_ELEMENT) { + const GumboElement* element = &node->v.element; + const GumboVector* children = &element->children; + for (unsigned int i = 0; i < children->length; ++i) { + const GumboNode* child = static_cast(children->data[i]); + SanityCheckPointers(input, input_length, child, depth + 1); + } + } else { + const GumboText* text = &node->v.text; + } +} + +extern "C" int 
LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + if (size < 10) + { + return 0; + } + + GumboOptions options = kGumboDefaultOptions; + GumboOutput* output; + GumboNode* root; + + output = gumbo_parse_with_options(&options, (char*)data, size); + root = output->document; + SanityCheckPointers((char*)data, size, output->root, 0); + + if (output) { + gumbo_destroy_output(output); + } + + return 0; +} \ No newline at end of file From 0fc502af356fb0b46223905bc1344d3b7db9b8b4 Mon Sep 17 00:00:00 2001 From: fuzzy-boiii23a Date: Tue, 17 Oct 2023 02:38:54 -0700 Subject: [PATCH 02/14] added code to ensure nothing gets pruned or dead stripped and added memory sanitizer --- gumbo-parser/Makefile | 5 +++- gumbo-parser/fuzzer/build.sh | 9 ++++++-- gumbo-parser/fuzzer/parse_fuzzer.cc | 36 +++++++++++++++++++++++++---- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/gumbo-parser/Makefile b/gumbo-parser/Makefile index 7ffb74256d..c7189d988f 100644 --- a/gumbo-parser/Makefile +++ b/gumbo-parser/Makefile @@ -13,7 +13,7 @@ LDFLAGS := -pthread all: check -fuzzing: fuzzer-normal fuzzer-asan fuzzer-ubsan +fuzzing: fuzzer-normal fuzzer-asan fuzzer-ubsan fuzzer-msan fuzzer-normal: cd fuzzer && ./build.sh && cd - @@ -24,6 +24,9 @@ fuzzer-asan: fuzzer-ubsan: cd fuzzer && SANITIZER=undefined ./build.sh && cd - +fuzzer-msan: + cd fuzzer && SANITIZER=memory ./build.sh && cd - + # don't try to regenerate ragel or gperf files in CI, that should be a development-only action and # the generated files should be committed to SCM ifneq ($(CI),true) diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index 68a5ba0370..342ef3df3a 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -28,9 +28,14 @@ then export SANITIZER_OPTS="-fsanitize=address" export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.asan_cxx-x86_64.a | head -1)" fi +if [ "$SANITIZER" = "memory" ] +then + export SANITIZER_OPTS="-fsanitize=memory 
-fPIE -pie -Wno-unused-command-line-argument" + export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.msan_cxx-x86_64.a | head -1)" +fi -export CXXFLAGS="-O0 $CXXFLAGS $SANITIZER_OPTS" -export CFLAGS="-O0 $CFLAGS $SANITIZER_OPTS" +export CXXFLAGS="-O3 $CXXFLAGS $SANITIZER_OPTS" +export CFLAGS="-O3 $CFLAGS $SANITIZER_OPTS" cd ../src && make clean && make && cd - if [ -z "${SANITIZER}" ] diff --git a/gumbo-parser/fuzzer/parse_fuzzer.cc b/gumbo-parser/fuzzer/parse_fuzzer.cc index 690996277e..89db6814b1 100644 --- a/gumbo-parser/fuzzer/parse_fuzzer.cc +++ b/gumbo-parser/fuzzer/parse_fuzzer.cc @@ -1,21 +1,39 @@ #include "../src/nokogiri_gumbo.h" #include -void SanityCheckPointers( - const char* input, size_t input_length, const GumboNode* node, int depth) { +int SanityCheckPointers(const char* input, size_t input_length, const GumboNode* node, int depth) { if (node->type == GUMBO_NODE_DOCUMENT || depth > 400) { - return; + return -1; } if (node->type == GUMBO_NODE_ELEMENT) { const GumboElement* element = &node->v.element; + const GumboVector* attributes = &element->attributes; + + for (unsigned int i = 0; i < attributes->length; ++i) { + const GumboAttribute* attribute = static_cast(attributes->data[i]); + if (!attribute) + { + return -1; + } + } const GumboVector* children = &element->children; for (unsigned int i = 0; i < children->length; ++i) { const GumboNode* child = static_cast(children->data[i]); + if (!child) + { + return -1; + } SanityCheckPointers(input, input_length, child, depth + 1); } } else { const GumboText* text = &node->v.text; + if (!text) + { + return -1; + } } + + return 0; } extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { @@ -30,7 +48,17 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { output = gumbo_parse_with_options(&options, (char*)data, size); root = output->document; - SanityCheckPointers((char*)data, size, output->root, 0); + + int result = 
SanityCheckPointers((char*)data, size, output->root, 0); + + if (result < 0) + { + if (output) { + gumbo_destroy_output(output); + } + + return -1; + } if (output) { gumbo_destroy_output(output); From 733e993058f11c30298a1e33135535405ad41052 Mon Sep 17 00:00:00 2001 From: liam doe Date: Mon, 20 Nov 2023 02:39:04 -0800 Subject: [PATCH 03/14] added doco, fixed typo, added checking for default llvm-config --- CONTRIBUTING.md | 37 +++++++++++++++++++++++++++++++++++- gumbo-parser/fuzzer/build.sh | 10 +++++++--- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 22c4f95edc..e02ec2b461 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,6 +26,7 @@ If you're looking for guidance on filing a bug report or getting support, please - [Bumping Java dependencies](#bumping-java-dependencies) - [Rake tasks](#rake-tasks) - [Making a release](#making-a-release) +- [Fuzzing your gumbo-parser changes](#fuzzing-your-changes) @@ -382,7 +383,7 @@ To modify or add a dependency, a few things needs to be in sync: A quick summary of what this looks like for you, the developer: 1. edit the `requirements` in the gemspec -2. run `bundle exec rake vendor_jars` which updates everything under `lib/nokogiri/jruby` +2. run `bundle exec rake vendor_jars` which updates everything under `lib/nokogiri/jrubfuzzing-your-changesy` 3. run `bundle exec rake check_manifest` and if necessary update the gemspec `files` 4. 
make sure to check everything under `lib/nokogiri/jruby` into git, including the jar files @@ -408,3 +409,37 @@ A quick checklist: - [ ] submit a PR to https://github.com/rubysec/ruby-advisory-db - [ ] update nokogiri.org - [ ] bump `lib/nokogiri/version/constant.rb` to a prerelease version like `v1.14.0.dev` + +## Fuzzing your gumbo-parser changes + +When making changes or adding new features to `gumbo-parser`, it's recommended to run [libfuzzer](https://llvm.org/docs/LibFuzzer.html) against `gumbo-parser` using various [sanitizers](https://github.com/google/sanitizers/wiki). This can be done by navigating to the `nokogiri/gumbo-parser` directory and executing `make fuzzing` in order to build the `gumbo-parser` fuzzer. Once built, navigate to the `nokogiri/gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory with no arguments to start fuzzing: + +- parse_fuzzer (standard fuzzer with no sanitizer) +- parse_fuzzer-address (fuzzer built using [ASAN](https://clang.llvm.org/docs/AddressSanitizer.html)) +- parse_fuzzer-memory (fuzzer built using [MSAN](https://clang.llvm.org/docs/MemorySanitizer.html)) +- parse_fuzzer-undefined (fuzzer built using [UBSAN](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html)) + +If the fuzzer finds a "crash" (indicating that a bug has been found), the following output would be expected: + +``` +INFO: Seed: 1523017872 +INFO: Loaded 1 modules (16 guards): [0x744e60, 0x744ea0), +INFO: -max_len is not provided, using 64 +INFO: A corpus is not provided, starting from an empty corpus +#0 READ units: 1 +#1 INITED cov: 3 ft: 2 corp: 1/1b exec/s: 0 rss: 24Mb +#3811 NEW cov: 4 ft: 3 corp: 2/2b exec/s: 0 rss: 25Mb L: 1 MS: 5 ChangeBit-ChangeByte-ChangeBit-ShuffleBytes-ChangeByte- +#3827 NEW cov: 5 ft: 4 corp: 3/4b exec/s: 0 rss: 25Mb L: 2 MS: 1 CopyPart- +#3963 NEW cov: 6 ft: 5 corp: 4/6b exec/s: 0 rss: 25Mb L: 2 MS: 2 ShuffleBytes-ChangeBit- +#4167 NEW cov: 7 ft: 6 corp: 5/9b exec/s: 0 rss: 25Mb 
L: 3 MS: 1 InsertByte- +==31511== ERROR: libFuzzer: deadly signal +... +artifact_prefix='./'; Test unit written to ./crash-b13e8756b13a00cf168300179061fb4b91fefbed +``` + +The above indicates that a crash has been identified and it can be reproduced by feeding the `crash-b13e8756b13a00cf168300179061fb4b91fefbed` file back into the binary used for fuzzing (e.g. parse-fuzzer) using the following command: + +``` +parse_fuzzer crash-b13e8756b13a00cf168300179061fb4b91fefbed +``` + diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index 342ef3df3a..ffb9e5f380 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -1,9 +1,13 @@ export SANITIZER_OPTS="" export SANITIZER_LINK="" +if [ -x "$(command -v llvm-config)" ]; then + export LLVM_CONFIG=$(which llvm-config) +fi + if [ -z "${LLVM_CONFIG}" ] then - echo '$LLVM_CONFIG has not been configured, expecting "export LLVM_CONFIG=/usr/bin/llvm-config-12" assuming clang-12 is installed, however any clang version works' + echo 'llvm-config could not be found and $LLVM_CONFIG has not been set, expecting "export LLVM_CONFIG=/usr/bin/llvm-config-12" assuming clang-12 is installed, however any clang version works' exit fi @@ -12,8 +16,8 @@ then mkdir build fi -export CC="$(llvm-config-12 --bindir)/clang" -export CXX="$(llvm-config-12 --bindir)/clang++" +export CC="$($LLVM_CONFIG --bindir)/clang" +export CXX="$($LLVM_CONFIG --bindir)/clang++" export CXXFLAGS="-fsanitize=fuzzer-no-link" export CFLAGS="-fsanitize=fuzzer-no-link" export ENGINE_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.fuzzer-x86_64.a | head -1)" From 45118a61e19a4dd96e5c65b1ab5571eb5eb17280 Mon Sep 17 00:00:00 2001 From: liam doe Date: Mon, 20 Nov 2023 02:42:31 -0800 Subject: [PATCH 04/14] improved fuzzer doco --- CONTRIBUTING.md | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e02ec2b461..1c01a853c9 100644 --- 
a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -419,11 +419,32 @@ When making changes or adding new features to `gumbo-parser`, it's recommended t - parse_fuzzer-memory (fuzzer built using [MSAN](https://clang.llvm.org/docs/MemorySanitizer.html)) - parse_fuzzer-undefined (fuzzer built using [UBSAN](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html)) -If the fuzzer finds a "crash" (indicating that a bug has been found), the following output would be expected: +If the binary executed successfully you should now be seeing the following output filling up your terminal: + +``` +INFO: Seed: 4156947595 +INFO: Loaded 1 modules (7149 inline 8-bit counters): 7149 0x58a462, 0x58c04f, +INFO: Loaded 1 PC tables (7149 PCs): 7149 0x53beb0,0x557d80, +INFO: -max_len is not provided; libFuzzer will not generate inputs larger than 4096 bytes +INFO: A corpus is not provided, starting from an empty corpus +#2 INITED cov: 2 ft: 2 corp: 1/1b exec/s: 0 rss: 24Mb + NEW_FUNC[1/44]: 0x429840 in gumbo_parse_with_options (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x429840) + NEW_FUNC[2/44]: 0x42c0d0 in destroy_node (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x42c0d0) +#721 NEW cov: 180 ft: 181 corp: 2/12b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 4 ChangeByte-ChangeByte-ChangeBit-InsertRepeatedBytes- +#722 NEW cov: 186 ft: 196 corp: 3/23b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- +#723 NEW cov: 186 ft: 228 corp: 4/34b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBinInt- +#724 NEW cov: 188 ft: 241 corp: 5/45b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- +#725 NEW cov: 188 ft: 254 corp: 6/56b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeByte- +#726 NEW cov: 188 ft: 270 corp: 7/67b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 CopyPart- +#732 NEW cov: 188 ft: 279 corp: 8/78b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- + NEW_FUNC[1/1]: 0x441de0 in gumbo_token_destroy 
(/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x441de0) +``` + +However, if the fuzzer finds a "crash" (indicating that a bug has been found) it will stop fuzzing and the following output would be expected: ``` INFO: Seed: 1523017872 -INFO: Loaded 1 modules (16 guards): [0x744e60, 0x744ea0), +INFO: Loaded 1 modules (16 guards): 0x744e60, 0x744ea0, INFO: -max_len is not provided, using 64 INFO: A corpus is not provided, starting from an empty corpus #0 READ units: 1 @@ -443,3 +464,4 @@ The above indicates that a crash has been identified and it can be reproduced by parse_fuzzer crash-b13e8756b13a00cf168300179061fb4b91fefbed ``` +If you'd like to learn more about libfuzzer please give https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md a try. From 9db4d7dab88335ee175516a2deb8ad9d98c83cbe Mon Sep 17 00:00:00 2001 From: liam doe Date: Mon, 20 Nov 2023 02:46:15 -0800 Subject: [PATCH 05/14] fixed formatting --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1c01a853c9..4ca8996ccf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,7 +26,7 @@ If you're looking for guidance on filing a bug report or getting support, please - [Bumping Java dependencies](#bumping-java-dependencies) - [Rake tasks](#rake-tasks) - [Making a release](#making-a-release) -- [Fuzzing your gumbo-parser changes](#fuzzing-your-changes) +- [Fuzzing your gumbo parser changes](#fuzzing-your-gumbo-parser-changes) @@ -410,7 +410,7 @@ A quick checklist: - [ ] update nokogiri.org - [ ] bump `lib/nokogiri/version/constant.rb` to a prerelease version like `v1.14.0.dev` -## Fuzzing your gumbo-parser changes +## Fuzzing your gumbo parser changes When making changes or adding new features to `gumbo-parser`, it's recommended to run [libfuzzer](https://llvm.org/docs/LibFuzzer.html) against `gumbo-parser` using various [sanitizers](https://github.com/google/sanitizers/wiki). 
This can be done by navigating to the `nokogiri/gumbo-parser` directory and executing `make fuzzing` in order to build the `gumbo-parser` fuzzer. Once built, navigate to the `nokogiri/gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory with no arguments to start fuzzing: From 42a099c67488ee3388d2f60a807d54e27d021239 Mon Sep 17 00:00:00 2001 From: fuzzy-boiii23a Date: Mon, 20 Nov 2023 11:59:27 -0800 Subject: [PATCH 06/14] fixed doco and build.sh --- CONTRIBUTING.md | 115 +++++++++++++++++------------------ gumbo-parser/fuzzer/build.sh | 3 +- 2 files changed, 59 insertions(+), 59 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4ca8996ccf..ac5cdcf27f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,7 +26,6 @@ If you're looking for guidance on filing a bug report or getting support, please - [Bumping Java dependencies](#bumping-java-dependencies) - [Rake tasks](#rake-tasks) - [Making a release](#making-a-release) -- [Fuzzing your gumbo parser changes](#fuzzing-your-gumbo-parser-changes) @@ -237,6 +236,62 @@ git submodule update --init # test/html5lib-tests bundle exec rake compile test ``` +### Fuzzing your gumbo HTML5 parser changes + +When making changes or adding new features to `gumbo-parser`, it's recommended to run [libfuzzer](https://llvm.org/docs/LibFuzzer.html) against `gumbo-parser` using various [sanitizers](https://github.com/google/sanitizers/wiki). This can be done by navigating to the `nokogiri/gumbo-parser` directory and executing `make fuzzing` in order to build the `gumbo-parser` fuzzer. 
Once built, navigate to the `nokogiri/gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory with no arguments to start fuzzing: + +- parse_fuzzer (standard fuzzer with no sanitizer) +- parse_fuzzer-address (fuzzer built using [ASAN](https://clang.llvm.org/docs/AddressSanitizer.html)) +- parse_fuzzer-memory (fuzzer built using [MSAN](https://clang.llvm.org/docs/MemorySanitizer.html)) +- parse_fuzzer-undefined (fuzzer built using [UBSAN](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html)) + +If the binary executed successfully you should now be seeing the following output filling up your terminal: + +``` +INFO: Seed: 4156947595 +INFO: Loaded 1 modules (7149 inline 8-bit counters): 7149 0x58a462, 0x58c04f, +INFO: Loaded 1 PC tables (7149 PCs): 7149 0x53beb0,0x557d80, +INFO: -max_len is not provided; libFuzzer will not generate inputs larger than 4096 bytes +INFO: A corpus is not provided, starting from an empty corpus +#2 INITED cov: 2 ft: 2 corp: 1/1b exec/s: 0 rss: 24Mb + NEW_FUNC[1/44]: 0x429840 in gumbo_parse_with_options (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x429840) + NEW_FUNC[2/44]: 0x42c0d0 in destroy_node (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x42c0d0) +#721 NEW cov: 180 ft: 181 corp: 2/12b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 4 ChangeByte-ChangeByte-ChangeBit-InsertRepeatedBytes- +#722 NEW cov: 186 ft: 196 corp: 3/23b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- +#723 NEW cov: 186 ft: 228 corp: 4/34b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBinInt- +#724 NEW cov: 188 ft: 241 corp: 5/45b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- +#725 NEW cov: 188 ft: 254 corp: 6/56b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeByte- +#726 NEW cov: 188 ft: 270 corp: 7/67b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 CopyPart- +#732 NEW cov: 188 ft: 279 corp: 8/78b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- + NEW_FUNC[1/1]: 0x441de0 in 
gumbo_token_destroy (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x441de0) +``` + +However, if the fuzzer finds a "crash" (indicating that a bug has been found) it will stop fuzzing and the following output would be expected: + +``` +INFO: Seed: 1523017872 +INFO: Loaded 1 modules (16 guards): 0x744e60, 0x744ea0, +INFO: -max_len is not provided, using 64 +INFO: A corpus is not provided, starting from an empty corpus +#0 READ units: 1 +#1 INITED cov: 3 ft: 2 corp: 1/1b exec/s: 0 rss: 24Mb +#3811 NEW cov: 4 ft: 3 corp: 2/2b exec/s: 0 rss: 25Mb L: 1 MS: 5 ChangeBit-ChangeByte-ChangeBit-ShuffleBytes-ChangeByte- +#3827 NEW cov: 5 ft: 4 corp: 3/4b exec/s: 0 rss: 25Mb L: 2 MS: 1 CopyPart- +#3963 NEW cov: 6 ft: 5 corp: 4/6b exec/s: 0 rss: 25Mb L: 2 MS: 2 ShuffleBytes-ChangeBit- +#4167 NEW cov: 7 ft: 6 corp: 5/9b exec/s: 0 rss: 25Mb L: 3 MS: 1 InsertByte- +==31511== ERROR: libFuzzer: deadly signal +... +artifact_prefix='./'; Test unit written to ./crash-b13e8756b13a00cf168300179061fb4b91fefbed +``` + +The above indicates that a crash has been identified and it can be reproduced by feeding the `crash-b13e8756b13a00cf168300179061fb4b91fefbed` file back into the binary used for fuzzing (e.g. `parse_fuzzer`) using the following command: + +``` +./parse_fuzzer crash-b13e8756b13a00cf168300179061fb4b91fefbed +``` + +If you'd like to learn more about libfuzzer please give https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md a try. + ## Style Guide @@ -383,7 +438,7 @@ To modify or add a dependency, a few things needs to be in sync: A quick summary of what this looks like for you, the developer: 1. edit the `requirements` in the gemspec -2. run `bundle exec rake vendor_jars` which updates everything under `lib/nokogiri/jrubfuzzing-your-changesy` +2. run `bundle exec rake vendor_jars` which updates everything under `lib/nokogiri/jruby` 3. run `bundle exec rake check_manifest` and if necessary update the gemspec `files` 4.
make sure to check everything under `lib/nokogiri/jruby` into git, including the jar files @@ -409,59 +464,3 @@ A quick checklist: - [ ] submit a PR to https://github.com/rubysec/ruby-advisory-db - [ ] update nokogiri.org - [ ] bump `lib/nokogiri/version/constant.rb` to a prerelease version like `v1.14.0.dev` - -## Fuzzing your gumbo parser changes - -When making changes or adding new features to `gumbo-parser`, it's recommended to run [libfuzzer](https://llvm.org/docs/LibFuzzer.html) against `gumbo-parser` using various [sanitizers](https://github.com/google/sanitizers/wiki). This can be done by navigating to the `nokogiri/gumbo-parser` directory and executing `make fuzzing` in order to build the `gumbo-parser` fuzzer. Once built, navigate to the `nokogiri/gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory with no arguments to start fuzzing: - -- parse_fuzzer (standard fuzzer with no sanitizer) -- parse_fuzzer-address (fuzzer built using [ASAN](https://clang.llvm.org/docs/AddressSanitizer.html)) -- parse_fuzzer-memory (fuzzer built using [MSAN](https://clang.llvm.org/docs/MemorySanitizer.html)) -- parse_fuzzer-undefined (fuzzer built using [UBSAN](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html)) - -If the binary executed successfully you should now be seeing the following output filling up your terminal: - -``` -INFO: Seed: 4156947595 -INFO: Loaded 1 modules (7149 inline 8-bit counters): 7149 0x58a462, 0x58c04f, -INFO: Loaded 1 PC tables (7149 PCs): 7149 0x53beb0,0x557d80, -INFO: -max_len is not provided; libFuzzer will not generate inputs larger than 4096 bytes -INFO: A corpus is not provided, starting from an empty corpus -#2 INITED cov: 2 ft: 2 corp: 1/1b exec/s: 0 rss: 24Mb - NEW_FUNC[1/44]: 0x429840 in gumbo_parse_with_options (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x429840) - NEW_FUNC[2/44]: 0x42c0d0 in destroy_node 
(/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x42c0d0) -#721 NEW cov: 180 ft: 181 corp: 2/12b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 4 ChangeByte-ChangeByte-ChangeBit-InsertRepeatedBytes- -#722 NEW cov: 186 ft: 196 corp: 3/23b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- -#723 NEW cov: 186 ft: 228 corp: 4/34b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBinInt- -#724 NEW cov: 188 ft: 241 corp: 5/45b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- -#725 NEW cov: 188 ft: 254 corp: 6/56b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeByte- -#726 NEW cov: 188 ft: 270 corp: 7/67b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 CopyPart- -#732 NEW cov: 188 ft: 279 corp: 8/78b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- - NEW_FUNC[1/1]: 0x441de0 in gumbo_token_destroy (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x441de0) -``` - -However, if the fuzzer finds a "crash" (indicating that a bug has been found) it will stop fuzzing and the following output would be expected: - -``` -INFO: Seed: 1523017872 -INFO: Loaded 1 modules (16 guards): 0x744e60, 0x744ea0, -INFO: -max_len is not provided, using 64 -INFO: A corpus is not provided, starting from an empty corpus -#0 READ units: 1 -#1 INITED cov: 3 ft: 2 corp: 1/1b exec/s: 0 rss: 24Mb -#3811 NEW cov: 4 ft: 3 corp: 2/2b exec/s: 0 rss: 25Mb L: 1 MS: 5 ChangeBit-ChangeByte-ChangeBit-ShuffleBytes-ChangeByte- -#3827 NEW cov: 5 ft: 4 corp: 3/4b exec/s: 0 rss: 25Mb L: 2 MS: 1 CopyPart- -#3963 NEW cov: 6 ft: 5 corp: 4/6b exec/s: 0 rss: 25Mb L: 2 MS: 2 ShuffleBytes-ChangeBit- -#4167 NEW cov: 7 ft: 6 corp: 5/9b exec/s: 0 rss: 25Mb L: 3 MS: 1 InsertByte- -==31511== ERROR: libFuzzer: deadly signal -... 
-artifact_prefix='./'; Test unit written to ./crash-b13e8756b13a00cf168300179061fb4b91fefbed -``` - -The above indicates that a crash has been identified and it can be reproduced by feeding the `crash-b13e8756b13a00cf168300179061fb4b91fefbed` file back into the binary used for fuzzing (e.g. parse-fuzzer) using the following command: - -``` -parse_fuzzer crash-b13e8756b13a00cf168300179061fb4b91fefbed -``` - -If you'd like to learn more about libfuzzer please give https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md a try. diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index ffb9e5f380..e9ed05c5b8 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -1,8 +1,9 @@ export SANITIZER_OPTS="" export SANITIZER_LINK="" +export LLVM_CONFIG="" if [ -x "$(command -v llvm-config)" ]; then - export LLVM_CONFIG=$(which llvm-config) + LLVM_CONFIG=$(which llvm-config) fi if [ -z "${LLVM_CONFIG}" ] From 0206d8c88bc357a6d8a6e5166fc2e4ce05f8e6f6 Mon Sep 17 00:00:00 2001 From: fuzzy-boiii23a Date: Mon, 20 Nov 2023 12:04:53 -0800 Subject: [PATCH 07/14] fixed build.sh --- gumbo-parser/fuzzer/build.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index e9ed05c5b8..62f4e64067 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -1,17 +1,18 @@ export SANITIZER_OPTS="" export SANITIZER_LINK="" -export LLVM_CONFIG="" - -if [ -x "$(command -v llvm-config)" ]; then - LLVM_CONFIG=$(which llvm-config) -fi if [ -z "${LLVM_CONFIG}" ] then - echo 'llvm-config could not be found and $LLVM_CONFIG has not been set, expecting "export LLVM_CONFIG=/usr/bin/llvm-config-12" assuming clang-12 is installed, however any clang version works' - exit + if [ -x "$(command -v llvm-config)" ]; then + LLVM_CONFIG=$(which llvm-config) + else + echo 'llvm-config could not be found and $LLVM_CONFIG has not been set, expecting
"export LLVM_CONFIG=/usr/bin/llvm-config-12" assuming clang-12 is installed, however any clang version works' + exit 1 + fi fi +echo $LLVM_CONFIG + if [ ! -d "build" ] then mkdir build From e64381d3aa6bcf89abb853c4925ed551b1c59265 Mon Sep 17 00:00:00 2001 From: fuzzy-boiii23a Date: Mon, 20 Nov 2023 12:05:10 -0800 Subject: [PATCH 08/14] fixed typo in build.sh --- gumbo-parser/fuzzer/build.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index 62f4e64067..a1eb68d399 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -11,8 +11,6 @@ then fi fi -echo $LLVM_CONFIG - if [ ! -d "build" ] then mkdir build From a9588dd5e16139974590619740c3a68a3b6ff885 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 20 Nov 2023 15:32:25 -0500 Subject: [PATCH 09/14] dev(fuzz): convert fuzzer/build.sh to a bash script and - set some bash safety flags - make it runnable from any directory - simplify the makefile execution --- gumbo-parser/Makefile | 8 ++++---- gumbo-parser/fuzzer/build.sh | 30 ++++++++++++++---------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/gumbo-parser/Makefile b/gumbo-parser/Makefile index c7189d988f..f27a63af28 100644 --- a/gumbo-parser/Makefile +++ b/gumbo-parser/Makefile @@ -16,16 +16,16 @@ all: check fuzzing: fuzzer-normal fuzzer-asan fuzzer-ubsan fuzzer-msan fuzzer-normal: - cd fuzzer && ./build.sh && cd - + ./fuzzer/build.sh fuzzer-asan: - cd fuzzer && SANITIZER=address ./build.sh && cd - + SANITIZER=address ./fuzzer/build.sh fuzzer-ubsan: - cd fuzzer && SANITIZER=undefined ./build.sh && cd - + SANITIZER=undefined ./fuzzer/build.sh fuzzer-msan: - cd fuzzer && SANITIZER=memory ./build.sh && cd - + SANITIZER=memory ./fuzzer/build.sh # don't try to regenerate ragel or gperf files in CI, that should be a development-only action and # the generated files should be committed to SCM diff --git a/gumbo-parser/fuzzer/build.sh
b/gumbo-parser/fuzzer/build.sh index a1eb68d399..e5e03de177 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -1,9 +1,14 @@ +#!/usr/bin/env bash + +set -eu + +cd "$(dirname "$0")" + export SANITIZER_OPTS="" export SANITIZER_LINK="" -if [ -z "${LLVM_CONFIG}" ] -then - if [ -x "$(command -v llvm-config)" ]; then +if [[ -z "${LLVM_CONFIG:-}" ]] ; then + if [[ -x "$(command -v llvm-config)" ]]; then LLVM_CONFIG=$(which llvm-config) else echo 'llvm-config could not be found and $LLVM_CONFIG has not been set, expecting "export LLVM_CONFIG=/usr/bin/llvm-config-12" assuming clang-12 is installed, however any clang version works' @@ -11,10 +16,7 @@ then fi fi -if [ ! -d "build" ] -then - mkdir build -fi +mkdir -p build export CC="$($LLVM_CONFIG --bindir)/clang" export CXX="$($LLVM_CONFIG --bindir)/clang++" @@ -22,18 +24,15 @@ export CXXFLAGS="-fsanitize=fuzzer-no-link" export CFLAGS="-fsanitize=fuzzer-no-link" export ENGINE_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.fuzzer-x86_64.a | head -1)" -if [ "$SANITIZER" = "undefined" ] -then +if [[ "${SANITIZER:-}" = "undefined" ]] ; then export SANITIZER_OPTS="-fsanitize=undefined" export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.ubsan_standalone_cxx-x86_64.a | head -1)" fi -if [ "$SANITIZER" = "address" ] -then +if [[ "${SANITIZER:-}" = "address" ]] ; then export SANITIZER_OPTS="-fsanitize=address" export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.asan_cxx-x86_64.a | head -1)" fi -if [ "$SANITIZER" = "memory" ] -then +if [[ "${SANITIZER:-}" = "memory" ]] ; then export SANITIZER_OPTS="-fsanitize=memory -fPIE -pie -Wno-unused-command-line-argument" export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.msan_cxx-x86_64.a | head -1)" fi @@ -42,9 +41,8 @@ export CXXFLAGS="-O3 $CXXFLAGS $SANITIZER_OPTS" export CFLAGS="-O3 $CFLAGS $SANITIZER_OPTS" cd ../src && make clean && make && cd - -if [ -z "${SANITIZER}" ] -then +if [[ -z
"${SANITIZER:-}" ]] ; then $CXX $CXXFLAGS -o build/parse_fuzzer parse_fuzzer.cc ../src/libgumbo.a $ENGINE_LINK $SANITIZER_LINK else $CXX $CXXFLAGS -o build/parse_fuzzer-$SANITIZER parse_fuzzer.cc ../src/libgumbo.a $ENGINE_LINK $SANITIZER_LINK -fi \ No newline at end of file +fi From 8ab3a99689e0a18870f0128ee56ab0c6c3173402 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 20 Nov 2023 16:12:25 -0500 Subject: [PATCH 10/14] dev(fuzz): allow fuzzers to be build in parallel use separate compilation directories to avoid the jobs clobbering each other if users run make with a `-j` flag --- gumbo-parser/Makefile | 2 +- gumbo-parser/fuzzer/build.sh | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/gumbo-parser/Makefile b/gumbo-parser/Makefile index f27a63af28..163ed7ef64 100644 --- a/gumbo-parser/Makefile +++ b/gumbo-parser/Makefile @@ -95,7 +95,7 @@ coverage: clean: $(RM) -r build - $(RM) -r fuzzer/build + $(RM) -r fuzzer/build fuzzer/src-* build/src/flags: | build/src @echo 'old_CC := $(CC)' > $@ diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index e5e03de177..12920b92e9 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -17,6 +17,7 @@ if [[ -z "${LLVM_CONFIG:-}" ]] ; then fi mkdir -p build +srcdir=src-${SANITIZER:-normal} export CC="$($LLVM_CONFIG --bindir)/clang" export CXX="$($LLVM_CONFIG --bindir)/clang++" @@ -39,10 +40,15 @@ fi export CXXFLAGS="-O3 $CXXFLAGS $SANITIZER_OPTS" export CFLAGS="-O3 $CFLAGS $SANITIZER_OPTS" -cd ../src && make clean && make && cd - + +rm -rf $srcdir +cp -ar ../src $srcdir +pushd $srcdir +make +popd if [[ -z "${SANITIZER:-}" ]] ; then - $CXX $CXXFLAGS -o build/parse_fuzzer parse_fuzzer.cc ../src/libgumbo.a $ENGINE_LINK $SANITIZER_LINK + $CXX $CXXFLAGS -o build/parse_fuzzer parse_fuzzer.cc $srcdir/libgumbo.a $ENGINE_LINK $SANITIZER_LINK else - $CXX $CXXFLAGS -o build/parse_fuzzer-$SANITIZER parse_fuzzer.cc ../src/libgumbo.a $ENGINE_LINK $SANITIZER_LINK + $CXX
$CXXFLAGS -o build/parse_fuzzer-$SANITIZER parse_fuzzer.cc $srcdir/libgumbo.a $ENGINE_LINK $SANITIZER_LINK fi From 83900bec8fed62c1b5ff70c58afa8cdfe80fea75 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 20 Nov 2023 16:17:34 -0500 Subject: [PATCH 11/14] dev(fuzz): fuzzers match the make target names and rename the top-level target from `fuzzing` to `fuzzers` --- CONTRIBUTING.md | 14 ++++++++------ gumbo-parser/Makefile | 8 ++++---- gumbo-parser/fuzzer/build.sh | 9 +++++---- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ac5cdcf27f..beb83ffb71 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -238,14 +238,16 @@ bundle exec rake compile test ### Fuzzing your gumbo HTML5 parser changes -When making changes or adding new features to `gumbo-parser`, it's recommended to run [libfuzzer](https://llvm.org/docs/LibFuzzer.html) against `gumbo-parser` using various [sanitizers](https://github.com/google/sanitizers/wiki). This can be done by navigating to the `nokogiri/gumbo-parser` directory and executing `make fuzzing` in order to build the `gumbo-parser` fuzzer. Once built, navigate to the `nokogiri/gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory with no arguments to start fuzzing: +When making changes or adding new features to `gumbo-parser`, it's recommended to run [libfuzzer](https://llvm.org/docs/LibFuzzer.html) against `gumbo-parser` using various [sanitizers](https://github.com/google/sanitizers/wiki). 
-- parse_fuzzer (standard fuzzer with no sanitizer) -- parse_fuzzer-address (fuzzer built using [ASAN](https://clang.llvm.org/docs/AddressSanitizer.html)) -- parse_fuzzer-memory (fuzzer built using [MSAN](https://clang.llvm.org/docs/MemorySanitizer.html)) -- parse_fuzzer-undefined (fuzzer built using [UBSAN](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html)) +Build the fuzzers by navigating to the `gumbo-parser` directory and running `make fuzzers`. Once built, navigate to the `gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory with no arguments to start fuzzing: -If the binary executed successfully you should now be seeing the following output filling up your terminal: +- parse_fuzzer-normal (standard fuzzer with no sanitizer) +- parse_fuzzer-asan (fuzzer built using [ASAN](https://clang.llvm.org/docs/AddressSanitizer.html)) +- parse_fuzzer-msan (fuzzer built using [MSAN](https://clang.llvm.org/docs/MemorySanitizer.html)) +- parse_fuzzer-ubsan (fuzzer built using [UBSAN](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html)) + +If the binary executed successfully you should now be seeing the following output filling up your terminal (see https://llvm.org/docs/LibFuzzer.html#output for more information): ``` INFO: Seed: 4156947595 diff --git a/gumbo-parser/Makefile b/gumbo-parser/Makefile index 163ed7ef64..c22dfc0b09 100644 --- a/gumbo-parser/Makefile +++ b/gumbo-parser/Makefile @@ -13,19 +13,19 @@ LDFLAGS := -pthread all: check -fuzzing: fuzzer-normal fuzzer-asan fuzzer-ubsan fuzzer-msan +fuzzers: fuzzer-normal fuzzer-asan fuzzer-ubsan fuzzer-msan fuzzer-normal: ./fuzzer/build.sh fuzzer-asan: - SANITIZER=address ./fuzzer/build.sh + SANITIZER=asan ./fuzzer/build.sh fuzzer-ubsan: - SANITIZER=undefined ./fuzzer/build.sh + SANITIZER=ubsan ./fuzzer/build.sh fuzzer-msan: - SANITIZER=memory ./fuzzer/build.sh + SANITIZER=msan ./fuzzer/build.sh # don't try to regenerate ragel or gperf files in CI, that should 
be a development-only action and # the generated files should be committed to SCM diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index 12920b92e9..1ac62e2881 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -6,6 +6,7 @@ cd $(dirname $0) export SANITIZER_OPTS="" export SANITIZER_LINK="" +SANITIZER=${SANITIZER:-normal} if [[ -z "${LLVM_CONFIG:-}" ]] ; then if [[ -x "$(command -v llvm-config)" ]]; then @@ -25,17 +26,17 @@ export CXXFLAGS="-fsanitize=fuzzer-no-link" export CFLAGS="-fsanitize=fuzzer-no-link" export ENGINE_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.fuzzer-x86_64.a | head -1)" -if [[ "${SANITIZER:-}" = "undefined" ]] ; then export SANITIZER_OPTS="-fsanitize=undefined" export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.ubsan_standalone_cxx-x86_64.a | head -1)" +if [[ "${SANITIZER}" = "ubsan" ]] ; then fi -if [[ "${SANITIZER:-}" = "address" ]] ; then export SANITIZER_OPTS="-fsanitize=address" export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.asan_cxx-x86_64.a | head -1)" +if [[ "${SANITIZER}" = "asan" ]] ; then fi -if [[ "${SANITIZER:-}" = "memory" ]] ; then export SANITIZER_OPTS="-fsanitize=memory -fPIE -pie -Wno-unused-command-line-argument" export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.msan_cxx-x86_64.a | head -1)" +if [[ "${SANITIZER}" = "msan" ]] ; then fi export CXXFLAGS="-O3 $CXXFLAGS $SANITIZER_OPTS" @@ -47,7 +48,7 @@ pushd $srcdir make popd -if [[ -z "${SANITIZER:-}" ]] ; then +if [[ "${SANITIZER}" = "normal" ]] ; then $CXX $CXXFLAGS -o build/parse_fuzzer parse_fuzzer.cc $srcdir/libgumbo.a $ENGINE_LINK $SANITIZER_LINK else $CXX $CXXFLAGS -o build/parse_fuzzer-$SANITIZER parse_fuzzer.cc $srcdir/libgumbo.a $ENGINE_LINK $SANITIZER_LINK From d3e166c9620f770f0837adb3e8a3c6f8a97e73ef Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 20 Nov 2023 16:32:49 -0500 Subject: [PATCH 12/14] dev(fuzz): only export 
necessary env vars --- gumbo-parser/fuzzer/build.sh | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index 1ac62e2881..b2cee6bf08 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -4,8 +4,8 @@ set -eu cd $(dirname $0) -export SANITIZER_OPTS="" -export SANITIZER_LINK="" +SANITIZER_OPTS="" +SANITIZER_LINK="" SANITIZER=${SANITIZER:-normal} if [[ -z "${LLVM_CONFIG:-}" ]] ; then @@ -20,27 +20,29 @@ fi mkdir -p build srcdir=src-${SANITIZER} -export CC="$($LLVM_CONFIG --bindir)/clang" -export CXX="$($LLVM_CONFIG --bindir)/clang++" -export CXXFLAGS="-fsanitize=fuzzer-no-link" -export CFLAGS="-fsanitize=fuzzer-no-link" -export ENGINE_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.fuzzer-x86_64.a | head -1)" +CC="$($LLVM_CONFIG --bindir)/clang" +CXX="$($LLVM_CONFIG --bindir)/clang++" +CXXFLAGS="-fsanitize=fuzzer-no-link" +CFLAGS="-fsanitize=fuzzer-no-link" +ENGINE_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.fuzzer-x86_64.a | head -1)" - export SANITIZER_OPTS="-fsanitize=undefined" - export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.ubsan_standalone_cxx-x86_64.a | head -1)" if [[ "${SANITIZER}" = "ubsan" ]] ; then + SANITIZER_OPTS="-fsanitize=undefined" + SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.ubsan_standalone_cxx-x86_64.a | head -1)" fi - export SANITIZER_OPTS="-fsanitize=address" - export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.asan_cxx-x86_64.a | head -1)" if [[ "${SANITIZER}" = "asan" ]] ; then + SANITIZER_OPTS="-fsanitize=address" + SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.asan_cxx-x86_64.a | head -1)" fi - export SANITIZER_OPTS="-fsanitize=memory -fPIE -pie -Wno-unused-command-line-argument" - export SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.msan_cxx-x86_64.a | head -1)" if [[ 
"${SANITIZER}" = "msan" ]] ; then + SANITIZER_OPTS="-fsanitize=memory -fPIE -pie -Wno-unused-command-line-argument" + SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.msan_cxx-x86_64.a | head -1)" fi -export CXXFLAGS="-O3 $CXXFLAGS $SANITIZER_OPTS" -export CFLAGS="-O3 $CFLAGS $SANITIZER_OPTS" +CXXFLAGS="-O3 $CXXFLAGS $SANITIZER_OPTS" +CFLAGS="-O3 $CFLAGS $SANITIZER_OPTS" + +export CC CFLAGS CXX CXXFLAGS rm -rf $srcdir cp -ar ../src $srcdir From ea7154ced3cf4c0521de1eed9a908f585e5a6da6 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 20 Nov 2023 16:37:37 -0500 Subject: [PATCH 13/14] dev(fuzz): add -g to compiler flags so we see line numbers --- gumbo-parser/fuzzer/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gumbo-parser/fuzzer/build.sh b/gumbo-parser/fuzzer/build.sh index b2cee6bf08..dc75c516e3 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -39,8 +39,8 @@ if [[ "${SANITIZER}" = "msan" ]] ; then SANITIZER_LINK="$(find $($LLVM_CONFIG --libdir) -name libclang_rt.msan_cxx-x86_64.a | head -1)" fi -CXXFLAGS="-O3 $CXXFLAGS $SANITIZER_OPTS" -CFLAGS="-O3 $CFLAGS $SANITIZER_OPTS" +CXXFLAGS="-O3 -g $CXXFLAGS $SANITIZER_OPTS" +CFLAGS="-O3 -g $CFLAGS $SANITIZER_OPTS" export CC CFLAGS CXX CXXFLAGS From 65d96b2ca30b34bcd942126c4965f4c20eede37c Mon Sep 17 00:00:00 2001 From: fuzzy-boiii23a Date: Tue, 21 Nov 2023 00:33:25 -0800 Subject: [PATCH 14/14] added dictionary and corpus --- CONTRIBUTING.md | 10 +- gumbo-parser/Makefile | 2 +- gumbo-parser/fuzzer/build.sh | 6 + gumbo-parser/fuzzer/gumbo.dict | 560 +++++++++++++++++++++++++++ gumbo-parser/fuzzer/gumbo_corpus.zip | Bin 0 -> 418446 bytes 5 files changed, 575 insertions(+), 3 deletions(-) create mode 100644 gumbo-parser/fuzzer/gumbo.dict create mode 100644 gumbo-parser/fuzzer/gumbo_corpus.zip diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index beb83ffb71..acfba27312 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -240,13 +240,19 @@ 
bundle exec rake compile test When making changes or adding new features to `gumbo-parser`, it's recommended to run [libfuzzer](https://llvm.org/docs/LibFuzzer.html) against `gumbo-parser` using various [sanitizers](https://github.com/google/sanitizers/wiki). -Build the fuzzers by navigating to the `gumbo-parser` directory and running `make fuzzers`. Once built, navigate to the `gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory with no arguments to start fuzzing: +Build the fuzzers by navigating to the `gumbo-parser` directory and running `make fuzzers`. Once built, navigate to the `gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory: -- parse_fuzzer-normal (standard fuzzer with no sanitizer) +- parse_fuzzer (standard fuzzer with no sanitizer) - parse_fuzzer-asan (fuzzer built using [ASAN](https://clang.llvm.org/docs/AddressSanitizer.html)) - parse_fuzzer-msan (fuzzer built using [MSAN](https://clang.llvm.org/docs/MemorySanitizer.html)) - parse_fuzzer-ubsan (fuzzer built using [UBSAN](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html)) +To fuzz more efficiently, use the dictionary (gumbo.dict) and corpus (gumbo_corpus) found in `gumbo-parser/fuzzer` using the following arguments (assuming parse_fuzzer is in use): + +``` +./parse_fuzzer -dict=../gumbo.dict ../gumbo_corpus +``` + If the binary executed successfully you should now be seeing the following output filling up your terminal (see https://llvm.org/docs/LibFuzzer.html#output for more information): ``` diff --git a/gumbo-parser/Makefile b/gumbo-parser/Makefile index c22dfc0b09..dd729bc15d 100644 --- a/gumbo-parser/Makefile +++ b/gumbo-parser/Makefile @@ -95,7 +95,7 @@ coverage: clean: $(RM) -r build - $(RM) -r fuzzer/build fuzzer/src-* + $(RM) -r fuzzer/build fuzzer/src-* fuzzer/gumbo_corpus build/src/flags: | build/src @echo 'old_CC := $(CC)' > $@ diff --git a/gumbo-parser/fuzzer/build.sh 
b/gumbo-parser/fuzzer/build.sh index dc75c516e3..849cd12f2a 100755 --- a/gumbo-parser/fuzzer/build.sh +++ b/gumbo-parser/fuzzer/build.sh @@ -4,6 +4,12 @@ set -eu cd $(dirname $0) +echo $PWD + +if [ ! -d gumbo_corpus ]; then + unzip gumbo_corpus.zip -d gumbo_corpus +fi + SANITIZER_OPTS="" SANITIZER_LINK="" SANITIZER=${SANITIZER:-normal} diff --git a/gumbo-parser/fuzzer/gumbo.dict b/gumbo-parser/fuzzer/gumbo.dict new file mode 100644 index 0000000000..7a10b3b4d7 --- /dev/null +++ b/gumbo-parser/fuzzer/gumbo.dict @@ -0,0 +1,560 @@ +# +# AFL dictionary for HTML parsers +# ------------------------------- +# +# A basic collection of HTML string likely to matter to HTML parsers. +# +# Created by Michal Zalewski +# + +tag_a="<a>" +tag_abbr="<abbr>" +tag_acronym="<acronym>" +tag_address="<address>" +tag_annotation_xml="<annotation-xml>" +tag_applet="<applet>" +tag_area="<area>" +tag_article="<article>" +tag_aside="