
Rewrite for current Syntax Level 3 #13

Open: wants to merge 33 commits into base: main
Changes from 27 of 33 commits
7563929
update README to point to the spec
riking Mar 13, 2018
55f8973
Export TokenType, add missing token types
riking Mar 13, 2018
6328600
Add a text/transform preprocessor for input
riking Mar 13, 2018
c02b43f
implement the 'consume a token' algorithm
riking Mar 13, 2018
77fb512
stub out the rest of the parsing algo
riking Mar 13, 2018
c626905
Finish tokenizer, start fixing tests
riking Mar 14, 2018
7690df4
fix all usages of "starts with a valid escape": cannot unread after p…
riking Mar 14, 2018
1080914
Change test data, make more fixes
riking Mar 14, 2018
b99c1dd
add fuzzing corpus from existing testdata
riking Mar 14, 2018
aa841ce
widen ParseInt calls to accept too-big codepoints
riking Mar 14, 2018
b5986f0
Add round-tripping test
riking Mar 14, 2018
6e71edb
Fix: was discarding the leading 'u'
riking Mar 14, 2018
c5a4afb
Fix more fuzzer findings
riking Mar 15, 2018
4c0a5ef
More fixes from fuzzing
riking Mar 20, 2018
4c09d63
Fix '5e', '#123', and comments
riking Mar 20, 2018
3c8aa10
fixup: add more comment tests
riking Mar 20, 2018
d163d68
add tests for '5e', '5e-', '5e-3'
riking Mar 20, 2018
f065792
fix missing space after hex escape
riking Mar 20, 2018
0386e01
call escapeIdentifer() for TokenFunction
riking Mar 20, 2018
b5c30c6
Fuzz fixes for bad-string
riking Mar 20, 2018
87fb86e
Rename package, update documentation
riking Mar 20, 2018
2fc5ca6
Restore original 'scanner' directory
riking Mar 20, 2018
08b0d9c
Ignore fuzz results
riking Mar 20, 2018
ff8d7b8
Remove failing "--\--" test, add test for #2
riking Mar 20, 2018
df4d3f6
Improve documentation, delete unused methods
riking Mar 20, 2018
2689bbf
tighten signature of TokenExtraTypeLookup
riking Mar 20, 2018
5f3baa3
improve documentation of TokenExtra.String()
riking Mar 20, 2018
ad83c8e
Change Token.WriteTo to standard signature
riking Mar 25, 2018
f4312d7
Update README, update tokenizer docs
riking Mar 25, 2018
551cdba
Suppress output from Fuzz during tests
riking Mar 25, 2018
05a2682
Oops, WriteTo returns int64 not int
riking Mar 25, 2018
35e0c2b
travis.yml: skip tokenizer package in old versions
riking Mar 25, 2018
c37ded0
travis.yml: Drop go 1.3 and 1.4 support (bufio.Reader.Discard)
riking Mar 25, 2018
2 changes: 1 addition & 1 deletion README.md
@@ -2,4 +2,4 @@ css
===
[![GoDoc](https://godoc.org/github.com/gorilla/css?status.svg)](https://godoc.org/github.com/gorilla/css) [![Build Status](https://travis-ci.org/gorilla/css.png?branch=master)](https://travis-ci.org/gorilla/css)

-A CSS3 tokenizer.
+A CSS3 tokenizer based on https://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms
1 change: 1 addition & 0 deletions tokenizer/.gitignore
@@ -0,0 +1 @@
testdata/fuzz
60 changes: 60 additions & 0 deletions tokenizer/crlf.go
@@ -0,0 +1,60 @@
// Copyright (c) 2018 Kane York. Licensed under 2-Clause BSD.

package tokenizer

// The code below is adapted from the crlf package, which helps in dealing
// with files that have DOS-style CR/LF line endings.
//
// Copyright (c) 2015 Andy Balholm. Licensed under 2-Clause BSD.

import "golang.org/x/text/transform"

// normalize takes CRLF, CR, or LF line endings in src, and converts them
// to LF in dst.
//
// cssparse: Also replaces NUL bytes with U+FFFD REPLACEMENT CHARACTER, as
// the CSS Syntax preprocessing step requires.
type normalize struct {
	prev byte
}

const replacementCharacter = "\uFFFD"

func (n *normalize) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nDst < len(dst) && nSrc < len(src) {
		c := src[nSrc]
		switch c {
		case '\r':
			dst[nDst] = '\n'
		case '\n':
			if n.prev == '\r' {
				// The CR was already written out as LF; drop this LF.
				nSrc++
				n.prev = c
				continue
			}
			dst[nDst] = '\n'
		case 0:
			// Write the 3-byte UTF-8 encoding of U+FFFD.
			// nb: len(replacementCharacter) == 3
			if nDst+3 > len(dst) {
				err = transform.ErrShortDst
				return
			}
			copy(dst[nDst:], replacementCharacter)
			// Advance by 2 here; the nDst++ at the bottom of the loop
			// covers the third byte.
			nDst += 2
		default:
			dst[nDst] = c
		}
		n.prev = c
		nDst++
		nSrc++
	}
	if nSrc < len(src) {
		err = transform.ErrShortDst
	}
	return
}

func (n *normalize) Reset() {
	n.prev = 0
}
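
For reference, a minimal sketch of how this transformer can be wired up. It
lives in the same package (normalize is unexported); exampleNormalize is a
hypothetical helper, not part of this diff, and assumes imports of fmt,
strings, io/ioutil, and golang.org/x/text/transform:

	func exampleNormalize() {
		in := strings.NewReader("a\r\nb\rc\x00d")
		// transform.NewReader applies the Transformer lazily as the
		// wrapped reader is consumed.
		r := transform.NewReader(in, &normalize{})
		out, err := ioutil.ReadAll(r)
		if err != nil {
			panic(err)
		}
		// out is now "a\nb\nc\uFFFDd": CRLF and bare CR became LF,
		// and the NUL byte became U+FFFD.
		fmt.Printf("%q\n", string(out))
	}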
52 changes: 52 additions & 0 deletions tokenizer/doc.go
@@ -0,0 +1,52 @@
// Copyright 2018 Kane York.
// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

/*
Package gorilla/css/tokenizer generates tokens for a CSS3 input.

It follows the CSS3 specification located at:

http://www.w3.org/TR/css3-syntax/

To use it, create a new tokenizer for a given CSS input and call Next() until
the token returned is a "stop token":

	s := tokenizer.NewTokenizer(strings.NewReader(myCSS))
	for {
		token := s.Next()
		if token.Type.StopToken() {
			break
		}
		// Do something with the token...
	}

If the consumer wants to accept malformed input, change the check to the
following instead:

	token := s.Next()
	if token.Type == tokenizer.TokenEOF || token.Type == tokenizer.TokenError {
		break
	}

The three potential tokenization errors are a "bad-escape" (backslash-newline
outside a "string" or url() in the input), a "bad-string" (unescaped newline
inside a "string"), and a "bad-url" (several cases, such as a stray quote or
invalid escape inside an unquoted url()). Parsers can choose to abort when
seeing one of these errors, or ignore the declaration and attempt to recover.
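
A parser that chooses to recover might skip ahead past the malformed
declaration, e.g. (a sketch that ignores block nesting for brevity):

	for {
		tok := s.Next()
		if tok.Type == tokenizer.TokenSemicolon || tok.Type.StopToken() {
			break
		}
	}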

Returned tokens that carry extra information have a non-nil .Extra value. For
TokenError, TokenBadEscape, TokenBadString, and TokenBadURI, the
TokenExtraError type carries an `error` with informative text about the nature
of the error. For TokenNumber, TokenPercentage, and TokenDimension, the
TokenExtraNumeric specifies whether the number is integral, and for
TokenDimension, contains the unit string (e.g. "px"). For TokenUnicodeRange,
the TokenExtraUnicodeRange type contains the actual start and end values of the
range.
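
A consumer can recover the extra information with a type switch; a sketch,
where token is a value returned by Next():

	switch e := token.Extra.(type) {
	case *tokenizer.TokenExtraNumeric:
		// For TokenDimension, e.Dimension holds the unit (e.g. "px").
		fmt.Println("integral:", !e.NonInteger, "unit:", e.Dimension)
	case *tokenizer.TokenExtraUnicodeRange:
		fmt.Printf("range: U+%X to U+%X\n", e.Start, e.End)
	case *tokenizer.TokenExtraError:
		fmt.Println("tokenization error:", e)
	}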

Note: the tokenizer performs only lexical analysis; in other words, it does
not care about the token context. It is intended to be used by a parser.
*/
package tokenizer
95 changes: 95 additions & 0 deletions tokenizer/fuzz.go
@@ -0,0 +1,95 @@
// Copyright 2018 Kane York.

package tokenizer

import (
	"bytes"
	"fmt"
	"io"
	"reflect"
)

// Fuzz is the entry point for fuzz testing. It tokenizes the input, renders
// the tokens back to text, retokenizes that rendering, and verifies that the
// two token streams match (comments excluded).
func Fuzz(b []byte) int {
	fmt.Printf("=== Start fuzz test ===\n%s\n", b)
	var tokens []Token

	tz := NewTokenizer(bytes.NewReader(b))
	for {
		tt := tz.Next()
		fmt.Printf("[OT] %v\n", tt)
		if tt.Type == TokenError {
			// We should not have reading errors
			panic(tt)
		} else if tt.Type == TokenEOF {
			break
		} else {
			tokens = append(tokens, tt)
		}
	}

	// Render and retokenize

	var wr TokenRenderer
	var rerenderBuf bytes.Buffer
	success := false
	defer func() {
		if !success {
			fmt.Println("RERENDER BUFFER:", rerenderBuf.String())
		}
	}()
	pr, pw := io.Pipe()
	defer pr.Close()

	go func() {
		writeTarget := io.MultiWriter(pw, &rerenderBuf)
		for _, v := range tokens {
			wr.WriteTokenTo(writeTarget, v)
		}
		pw.Close()
	}()

	tz = NewTokenizer(pr)
	i := 0
	for {
		// Comments are excluded from the comparison on both sides.
		for i < len(tokens) && tokens[i].Type == TokenComment {
			i++
		}
		tt := tz.Next()
		fmt.Printf("[RT] %v\n", tt)
		if tt.Type == TokenComment {
			// Ignore comments while comparing
			continue
		}
		if tt.Type == TokenError {
			panic(tt)
		}
		if tt.Type == TokenEOF {
			if i != len(tokens) {
				panic(fmt.Sprintf("unexpected EOF: got EOF from retokenizer, but original token stream is at %d/%d\n%v", i, len(tokens), tokens))
			}
			break
		}
		if i == len(tokens) {
			panic(fmt.Sprintf("expected EOF: reached end of original token stream but got %v from retokenizer\n%v", tt, tokens))
		}

		ot := tokens[i]
		if tt.Type != ot.Type {
			panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Type not equal)\n%v", tt, ot, tokens))
		}
		if tt.Value != ot.Value && !tt.Type.StopToken() {
			panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Value not equal)\n%v", tt, ot, tokens))
		}
		if TokenExtraTypeLookup[tt.Type] != nil {
			if !reflect.DeepEqual(tt, ot) && !tt.Type.StopToken() {
				panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Extra not equal)\n%v", tt, ot, tokens))
			}
		}
		i++
	}
	success = true
	return 1
}
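
Findings from the fuzzer can be pinned as ordinary regression tests by
calling Fuzz directly, as the test file below does. A sketch (hypothetical
file fuzz_regression_test.go in the same package; the inputs are taken from
the fuzzing results in scanner_test.go):

	func TestFuzzRegressions(t *testing.T) {
		// Each of these previously tripped the round-trip comparison;
		// Fuzz panics if tokenize -> render -> retokenize disagrees.
		for _, input := range []string{"5e", "5e-", "url(0t')", "a\\0"} {
			Fuzz([]byte(input))
		}
	}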
158 changes: 158 additions & 0 deletions tokenizer/scanner_test.go
@@ -0,0 +1,158 @@
// Copyright 2018 Kane York.
// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package tokenizer

import (
	"reflect"
	"strings"
	"testing"
)

func TestMatchers(t *testing.T) {
	// Just basic checks, not exhaustive at all.
	checkMatch := func(s string, ttList ...interface{}) {
		tz := NewTokenizer(strings.NewReader(s))

		i := 0
		for i < len(ttList) {
			tt := ttList[i].(TokenType)
			tVal := ttList[i+1].(string)
			var tExtra TokenExtra
			if TokenExtraTypeLookup[tt] != nil {
				tExtra = ttList[i+2].(TokenExtra)
			}
			if tok := tz.Next(); tok.Type != tt {
				t.Errorf("did not match: %s (got %v, wanted %v)", s, tok, tt)
			} else if tok.Value != tVal {
				t.Errorf("did not match: %s (got %s, wanted %s): %v", s, tok.Value, tVal, tok)
			} else if tExtra != nil && !reflect.DeepEqual(tok.Extra, tExtra) {
				if tt.StopToken() && tt != TokenError && tt != TokenEOF {
					// mismatch ok
				} else {
					t.Errorf("did not match .Extra: %s (got %#v, wanted %#v): %v", s, tok.Extra, tExtra, tok)
				}
			}

			i += 2
			if TokenExtraTypeLookup[tt] != nil {
				i++
			}
		}

		if tok := tz.Next(); tok.Type != TokenEOF {
			t.Errorf("missing EOF after token %s, got %+v", s, tok)
			if tok := tz.Next(); tok.Type != TokenEOF {
				t.Errorf("double missing EOF after token %s, got %+v", s, tok)
			}
		}

		Fuzz([]byte(s))
	}

	checkMatch("abcd", TokenIdent, "abcd")
	checkMatch(`"abcd"`, TokenString, `abcd`)
	checkMatch(`"ab'cd"`, TokenString, `ab'cd`)
	checkMatch(`"ab\"cd"`, TokenString, `ab"cd`)
	checkMatch(`"ab\\cd"`, TokenString, `ab\cd`)
	checkMatch("'abcd'", TokenString, "abcd")
	checkMatch(`'ab"cd'`, TokenString, `ab"cd`)
	checkMatch(`'ab\'cd'`, TokenString, `ab'cd`)
	checkMatch(`'ab\\cd'`, TokenString, `ab\cd`)
	checkMatch("#name", TokenHash, "name", &TokenExtraHash{IsIdentifier: true})
	checkMatch("##name", TokenDelim, "#", TokenHash, "name", &TokenExtraHash{IsIdentifier: true})
	checkMatch("#123", TokenHash, "123", &TokenExtraHash{IsIdentifier: false})
	checkMatch("42''", TokenNumber, "42", &TokenExtraNumeric{}, TokenString, "")
	checkMatch("+42", TokenNumber, "+42", &TokenExtraNumeric{})
	checkMatch("-42", TokenNumber, "-42", &TokenExtraNumeric{})
	checkMatch("42.", TokenNumber, "42", &TokenExtraNumeric{}, TokenDelim, ".")
	checkMatch("42.0", TokenNumber, "42.0", &TokenExtraNumeric{NonInteger: true})
	checkMatch("4.2", TokenNumber, "4.2", &TokenExtraNumeric{NonInteger: true})
	checkMatch(".42", TokenNumber, ".42", &TokenExtraNumeric{NonInteger: true})
	checkMatch("+.42", TokenNumber, "+.42", &TokenExtraNumeric{NonInteger: true})
	checkMatch("-.42", TokenNumber, "-.42", &TokenExtraNumeric{NonInteger: true})
	checkMatch("42%", TokenPercentage, "42", &TokenExtraNumeric{})
	checkMatch("4.2%", TokenPercentage, "4.2", &TokenExtraNumeric{NonInteger: true})
	checkMatch(".42%", TokenPercentage, ".42", &TokenExtraNumeric{NonInteger: true})
	checkMatch("42px", TokenDimension, "42", &TokenExtraNumeric{Dimension: "px"})

	checkMatch("5e", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e"})
	checkMatch("5e-", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e-"})
	checkMatch("5e-3", TokenNumber, "5e-3", &TokenExtraNumeric{NonInteger: true})
	checkMatch("5e-\xf1", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e-\xf1"})

	checkMatch("url(http://domain.com)", TokenURI, "http://domain.com")
	checkMatch("url( http://domain.com/uri/between/space )", TokenURI, "http://domain.com/uri/between/space")
	checkMatch("url('http://domain.com/uri/between/single/quote')", TokenURI, "http://domain.com/uri/between/single/quote")
	checkMatch(`url("http://domain.com/uri/between/double/quote")`, TokenURI, `http://domain.com/uri/between/double/quote`)
	checkMatch("url(http://domain.com/?parentheses=%28)", TokenURI, "http://domain.com/?parentheses=%28")
	checkMatch("url( http://domain.com/?parentheses=%28&between=space )", TokenURI, "http://domain.com/?parentheses=%28&between=space")
	checkMatch("url('http://domain.com/uri/(parentheses)/between/single/quote')", TokenURI, "http://domain.com/uri/(parentheses)/between/single/quote")
	checkMatch(`url("http://domain.com/uri/(parentheses)/between/double/quote")`, TokenURI, `http://domain.com/uri/(parentheses)/between/double/quote`)
	checkMatch(`url(http://domain.com/uri/\(bare%20escaped\)/parentheses)`, TokenURI, `http://domain.com/uri/(bare%20escaped)/parentheses`)
	checkMatch("url(http://domain.com/uri/1)url(http://domain.com/uri/2)",
		TokenURI, "http://domain.com/uri/1",
		TokenURI, "http://domain.com/uri/2",
	)
	checkMatch("url(http://domain.com/uri/1) url(http://domain.com/uri/2)",
		TokenURI, "http://domain.com/uri/1",
		TokenS, " ",
		TokenURI, "http://domain.com/uri/2",
	)
	checkMatch("U+0042", TokenUnicodeRange, "U+0042", &TokenExtraUnicodeRange{Start: 0x42, End: 0x42})
	checkMatch("U+FFFFFF", TokenUnicodeRange, "U+FFFFFF", &TokenExtraUnicodeRange{Start: 0xFFFFFF, End: 0xFFFFFF})
	checkMatch("U+??????", TokenUnicodeRange, "U+0000-FFFFFF", &TokenExtraUnicodeRange{Start: 0, End: 0xFFFFFF})
	checkMatch("<!--", TokenCDO, "<!--")
	checkMatch("-->", TokenCDC, "-->")
	checkMatch(" \n \t \n", TokenS, "\n") // TODO - whitespace preservation
	checkMatch("/**/", TokenComment, "")
	checkMatch("/***/", TokenComment, "*")
	checkMatch("/**", TokenComment, "*")
	checkMatch("/*foo*/", TokenComment, "foo")
	checkMatch("/* foo */", TokenComment, " foo ")
	checkMatch("bar(", TokenFunction, "bar")
	checkMatch("~=", TokenIncludes, "~=")
	checkMatch("|=", TokenDashMatch, "|=")
	checkMatch("||", TokenColumn, "||")
	checkMatch("^=", TokenPrefixMatch, "^=")
	checkMatch("$=", TokenSuffixMatch, "$=")
	checkMatch("*=", TokenSubstringMatch, "*=")
	checkMatch("{", TokenOpenBrace, "{")
	// checkMatch("\uFEFF", TokenBOM, "\uFEFF")
	checkMatch(`╯︵┻━┻"stuff"`, TokenIdent, "╯︵┻━┻", TokenString, "stuff")

	checkMatch("foo { bar: rgb(255, 0, 127); }",
		TokenIdent, "foo", TokenS, " ",
		TokenOpenBrace, "{", TokenS, " ",
		TokenIdent, "bar", TokenColon, ":", TokenS, " ",
		TokenFunction, "rgb",
		TokenNumber, "255", &TokenExtraNumeric{}, TokenComma, ",", TokenS, " ",
		TokenNumber, "0", &TokenExtraNumeric{}, TokenComma, ",", TokenS, " ",
		TokenNumber, "127", &TokenExtraNumeric{}, TokenCloseParen, ")",
		TokenSemicolon, ";", TokenS, " ",
		TokenCloseBrace, "}",
	)
	// Fuzzing results
	checkMatch("ur(0", TokenFunction, "ur", TokenNumber, "0", &TokenExtraNumeric{})
	checkMatch("1\\15", TokenDimension, "1", &TokenExtraNumeric{Dimension: "\x15"})
	checkMatch("url(0t')", TokenBadURI, "0t", &TokenExtraError{})
	checkMatch("uri/", TokenIdent, "uri", TokenDelim, "/")
	checkMatch("\x00", TokenIdent, "\uFFFD")
	checkMatch("a\\0", TokenIdent, "a\uFFFD")
	checkMatch("b\\\\0", TokenIdent, "b\\0")
	checkMatch("00\\d", TokenDimension, "00", &TokenExtraNumeric{Dimension: "\r"})
	// note: \f is form feed, which is 0x0C
	checkMatch("\\0\\0\\C\\\f\\\\0",
		TokenIdent, "\uFFFD\uFFFD\x0C\x0C\\0")
	// String running to EOF is success, not badstring
	checkMatch("\"a0\\d", TokenString, "a0\x0D")
	checkMatch("\"a0\r", TokenBadString, "a0", &TokenExtraError{}, TokenS, "\n")
	checkMatch("\\fun(", TokenFunction, "\x0fun")
	checkMatch("\"abc\\\"def\nghi", TokenBadString, "abc\"def", &TokenExtraError{}, TokenS, "\n", TokenIdent, "ghi")
	// checkMatch("---\\\x18-00", TokenDelim, "-", TokenDelim, "-", TokenIdent, "-\x18-00")
	Fuzz([]byte(
		`#sw_tfbb,#id_d{display:none}.sw_pref{border-style:solid;border-width:7px 0 7px 10px;vertical-align:bottom}#b_tween{margin-top:-28px}#b_tween>span{line-height:30px}#b_tween .ftrH{line-height:30px;height:30px}input{font:inherit;font-size:100%}.b_searchboxForm{font:18px/normal 'Segoe UI',Arial,Helvetica,Sans-Serif}.b_beta{font:11px/normal Arial,Helvetica,Sans-Serif}.b_scopebar,.id_button{line-height:30px}.sa_ec{font:13px Arial,Helvetica,Sans-Serif}#sa_ul .sa_hd{font-size:11px;line-height:16px}#sw_as strong{font-family:'Segoe UI Semibold',Arial,Helvetica,Sans-Serif}#id_h{background-color:transparent!important;position:relativ e!important;float:right;height:35px!important;width:280px!important}.sw_pref{margin:0 15px 3px 0}#id_d{left:auto;right:26px;top:35px!important}.id_avatar{vertical-align:middle;margin:10px 0 10px 10px}`),
	)
}
1 change: 1 addition & 0 deletions tokenizer/testdata/fuzz/corpus/test-1
@@ -0,0 +1 @@
abcd
1 change: 1 addition & 0 deletions tokenizer/testdata/fuzz/corpus/test-10
@@ -0,0 +1 @@
#name
1 change: 1 addition & 0 deletions tokenizer/testdata/fuzz/corpus/test-11
@@ -0,0 +1 @@
##name
1 change: 1 addition & 0 deletions tokenizer/testdata/fuzz/corpus/test-12
@@ -0,0 +1 @@
42''
1 change: 1 addition & 0 deletions tokenizer/testdata/fuzz/corpus/test-13
@@ -0,0 +1 @@
+42
1 change: 1 addition & 0 deletions tokenizer/testdata/fuzz/corpus/test-14
@@ -0,0 +1 @@
-42
1 change: 1 addition & 0 deletions tokenizer/testdata/fuzz/corpus/test-15
@@ -0,0 +1 @@
4.2