diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 22c4f95edc..acfba27312 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -236,6 +236,70 @@ git submodule update --init # test/html5lib-tests bundle exec rake compile test ``` +### Fuzzing your gumbo HTML5 parser changes + +When making changes or adding new features to `gumbo-parser`, it's recommended to run [libFuzzer](https://llvm.org/docs/LibFuzzer.html) against `gumbo-parser` using various [sanitizers](https://github.com/google/sanitizers/wiki). + +Build the fuzzers by navigating to the `gumbo-parser` directory and running `make fuzzers`. Once built, navigate to the `gumbo-parser/fuzzer/build` directory and execute one of the following binaries in this directory: + +- parse_fuzzer (standard fuzzer with no sanitizer) +- parse_fuzzer-asan (fuzzer built using [ASAN](https://clang.llvm.org/docs/AddressSanitizer.html)) +- parse_fuzzer-msan (fuzzer built using [MSAN](https://clang.llvm.org/docs/MemorySanitizer.html)) +- parse_fuzzer-ubsan (fuzzer built using [UBSAN](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html)) + +To fuzz more efficiently, use the dictionary (gumbo.dict) and corpus (gumbo_corpus) found in `gumbo-parser/fuzzer` using the following arguments (assuming parse_fuzzer is in use): + +``` +./parse_fuzzer -dict=../gumbo.dict ../gumbo_corpus +``` + +If the binary executed successfully, you should now see the following output filling up your terminal (see https://llvm.org/docs/LibFuzzer.html#output for more information): + +``` +INFO: Seed: 4156947595 +INFO: Loaded 1 modules (7149 inline 8-bit counters): 7149 0x58a462, 0x58c04f, +INFO: Loaded 1 PC tables (7149 PCs): 7149 0x53beb0,0x557d80, +INFO: -max_len is not provided; libFuzzer will not generate inputs larger than 4096 bytes +INFO: A corpus is not provided, starting from an empty corpus +#2 INITED cov: 2 ft: 2 corp: 1/1b exec/s: 0 rss: 24Mb + NEW_FUNC[1/44]: 0x429840 in gumbo_parse_with_options 
(/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x429840) + NEW_FUNC[2/44]: 0x42c0d0 in destroy_node (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x42c0d0) +#721 NEW cov: 180 ft: 181 corp: 2/12b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 4 ChangeByte-ChangeByte-ChangeBit-InsertRepeatedBytes- +#722 NEW cov: 186 ft: 196 corp: 3/23b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- +#723 NEW cov: 186 ft: 228 corp: 4/34b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBinInt- +#724 NEW cov: 188 ft: 241 corp: 5/45b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- +#725 NEW cov: 188 ft: 254 corp: 6/56b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeByte- +#726 NEW cov: 188 ft: 270 corp: 7/67b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 CopyPart- +#732 NEW cov: 188 ft: 279 corp: 8/78b lim: 11 exec/s: 0 rss: 27Mb L: 11/11 MS: 1 ChangeBit- + NEW_FUNC[1/1]: 0x441de0 in gumbo_token_destroy (/home/user/nokogiri/gumbo-parser/fuzzer/build/parse_fuzzer+0x441de0) +``` + +However, if the fuzzer finds a "crash" (indicating that a bug has been found) it will stop fuzzing and the following output would be expected: + +``` +INFO: Seed: 1523017872 +INFO: Loaded 1 modules (16 guards): 0x744e60, 0x744ea0, +INFO: -max_len is not provided, using 64 +INFO: A corpus is not provided, starting from an empty corpus +#0 READ units: 1 +#1 INITED cov: 3 ft: 2 corp: 1/1b exec/s: 0 rss: 24Mb +#3811 NEW cov: 4 ft: 3 corp: 2/2b exec/s: 0 rss: 25Mb L: 1 MS: 5 ChangeBit-ChangeByte-ChangeBit-ShuffleBytes-ChangeByte- +#3827 NEW cov: 5 ft: 4 corp: 3/4b exec/s: 0 rss: 25Mb L: 2 MS: 1 CopyPart- +#3963 NEW cov: 6 ft: 5 corp: 4/6b exec/s: 0 rss: 25Mb L: 2 MS: 2 ShuffleBytes-ChangeBit- +#4167 NEW cov: 7 ft: 6 corp: 5/9b exec/s: 0 rss: 25Mb L: 3 MS: 1 InsertByte- +==31511== ERROR: libFuzzer: deadly signal +... 
#!/usr/bin/env bash
#
# Build the libFuzzer harness (parse_fuzzer.cc) for gumbo-parser.
#
# Usage:
#   [SANITIZER=normal|asan|ubsan|msan] [LLVM_CONFIG=/path/to/llvm-config] ./build.sh
#
# Environment:
#   SANITIZER    Variant to build; defaults to "normal" (no sanitizer).
#   LLVM_CONFIG  llvm-config executable used to locate clang and its runtime
#                archives; defaults to the `llvm-config` found on PATH.
#
# Output:
#   build/parse_fuzzer               when SANITIZER=normal
#   build/parse_fuzzer-${SANITIZER}  for every other variant
#
# Side effects: extracts gumbo_corpus.zip into gumbo_corpus/ if that directory
# is absent, and recreates src-${SANITIZER}/ as an instrumented copy of ../src.

# pipefail is deliberately left off: `find | head -n 1` below may end find
# with SIGPIPE when several archives match, which must not abort the build.
set -eu

# Print a message on stderr and abort with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the path of the first file named $1 below the LLVM library directory
# (prints nothing when no such file exists).
find_runtime() {
  find "${llvm_libdir}" -name "$1" | head -n 1
}

# All relative paths below are resolved from the directory holding this script.
cd -- "$(dirname -- "$0")"

printf '%s\n' "$PWD"

if [[ ! -d gumbo_corpus ]]; then
  unzip gumbo_corpus.zip -d gumbo_corpus
fi

SANITIZER=${SANITIZER:-normal}

if [[ -z "${LLVM_CONFIG:-}" ]]; then
  LLVM_CONFIG=$(command -v llvm-config || true)
  # Exit non-zero here so that `make fuzzers` does not report success when
  # nothing was built.
  [[ -x "${LLVM_CONFIG}" ]] \
    || die 'llvm-config could not be found and $LLVM_CONFIG has not been set, expecting "export LLVM_CONFIG=/usr/bin/llvm-config-12" assuming clang-12 is installed, however any clang version works'
fi

# Query llvm-config in plain assignments so a failure aborts the script
# instead of silently yielding an empty path.
llvm_bindir=$("${LLVM_CONFIG}" --bindir)
llvm_libdir=$("${LLVM_CONFIG}" --libdir)

CC="${llvm_bindir}/clang"
CXX="${llvm_bindir}/clang++"

# Flags shared by the library build and the final link. fuzzer-no-link adds
# the fuzzing instrumentation without linking libFuzzer's main(); the engine
# archive is linked explicitly at the end.
build_flags=(-O3 -g -fsanitize=fuzzer-no-link)
SANITIZER_LINK=""

case "${SANITIZER}" in
  normal)
    ;;
  ubsan)
    build_flags+=(-fsanitize=undefined)
    SANITIZER_LINK=$(find_runtime libclang_rt.ubsan_standalone_cxx-x86_64.a)
    ;;
  asan)
    build_flags+=(-fsanitize=address)
    SANITIZER_LINK=$(find_runtime libclang_rt.asan_cxx-x86_64.a)
    ;;
  msan)
    build_flags+=(-fsanitize=memory -fPIE -pie -Wno-unused-command-line-argument)
    SANITIZER_LINK=$(find_runtime libclang_rt.msan_cxx-x86_64.a)
    ;;
  *)
    # Refuse unknown names rather than producing an unsanitized binary whose
    # file name suggests otherwise.
    die "unknown SANITIZER '${SANITIZER}' (expected one of: normal, asan, ubsan, msan)"
    ;;
esac

ENGINE_LINK=$(find_runtime libclang_rt.fuzzer-x86_64.a)
[[ -n "${ENGINE_LINK}" ]] \
  || die "libclang_rt.fuzzer-x86_64.a was not found under ${llvm_libdir}"

# The nested make reads these from the environment as plain strings.
CFLAGS="${build_flags[*]}"
CXXFLAGS="${build_flags[*]}"
export CC CFLAGS CXX CXXFLAGS

mkdir -p build
srcdir="src-${SANITIZER}"

# Build libgumbo.a from a fresh copy of the sources so that each variant keeps
# its own instrumented object files.
rm -rf -- "${srcdir:?}"
cp -ar ../src "${srcdir}"
make -C "${srcdir}"

if [[ "${SANITIZER}" == "normal" ]]; then
  output=build/parse_fuzzer
else
  output="build/parse_fuzzer-${SANITIZER}"
fi

# ${SANITIZER_LINK:+...} drops the argument entirely when no extra runtime
# archive was selected or found.
"${CXX}" "${build_flags[@]}" -o "${output}" parse_fuzzer.cc \
  "${srcdir}/libgumbo.a" "${ENGINE_LINK}" ${SANITIZER_LINK:+"${SANITIZER_LINK}"}
b/gumbo-parser/fuzzer/gumbo.dict @@ -0,0 +1,560 @@ +# +# AFL dictionary for HTML parsers +# ------------------------------- +# +# A basic collection of HTML string likely to matter to HTML parsers. +# +# Created by Michal Zalewski +# + +tag_a="" +tag_abbr="" +tag_acronym="" +tag_address="
" +tag_annotation_xml="" +tag_applet="" +tag_area="" +tag_article="
" +tag_aside="