Rewrite for current Syntax Level 3 #13

Open · wants to merge 33 commits into base: main

Commits (33)
7563929
update README to point to the spec
riking Mar 13, 2018
55f8973
Export TokenType, add missing token types
riking Mar 13, 2018
6328600
Add a text/transform preprocessor for input
riking Mar 13, 2018
c02b43f
implement the 'consume a token' algorithm
riking Mar 13, 2018
77fb512
stub out the rest of the parsing algo
riking Mar 13, 2018
c626905
Finish tokenizer, start fixing tests
riking Mar 14, 2018
7690df4
fix all usages of "starts with a valid escape": cannot unread after p…
riking Mar 14, 2018
1080914
Change test data, make more fixes
riking Mar 14, 2018
b99c1dd
add fuzzing corpus from existing testdata
riking Mar 14, 2018
aa841ce
widen ParseInt calls to accept too-big codepoints
riking Mar 14, 2018
b5986f0
Add round-tripping test
riking Mar 14, 2018
6e71edb
Fix: was discarding the leading 'u'
riking Mar 14, 2018
c5a4afb
Fix more fuzzer findings
riking Mar 15, 2018
4c0a5ef
More fixes from fuzzing
riking Mar 20, 2018
4c09d63
Fix '5e', '#123', and comments
riking Mar 20, 2018
3c8aa10
fixup: add more comment tests
riking Mar 20, 2018
d163d68
add tests for '5e', '5e-', '5e-3'
riking Mar 20, 2018
f065792
fix missing space after hex escape
riking Mar 20, 2018
0386e01
call escapeIdentifer() for TokenFunction
riking Mar 20, 2018
b5c30c6
Fuzz fixes for bad-string
riking Mar 20, 2018
87fb86e
Rename package, update documentation
riking Mar 20, 2018
2fc5ca6
Restore original 'scanner' directory
riking Mar 20, 2018
08b0d9c
Ignore fuzz results
riking Mar 20, 2018
ff8d7b8
Remove failing "--\--" test, add test for #2
riking Mar 20, 2018
df4d3f6
Improve documentation, delete unused methods
riking Mar 20, 2018
2689bbf
tighten signature of TokenExtraTypeLookup
riking Mar 20, 2018
5f3baa3
improve documentation of TokenExtra.String()
riking Mar 20, 2018
ad83c8e
Change Token.WriteTo to standard signature
riking Mar 25, 2018
f4312d7
Update README, update tokenizer docs
riking Mar 25, 2018
551cdba
Suppress output from Fuzz during tests
riking Mar 25, 2018
05a2682
Oops, WriteTo returns int64 not int
riking Mar 25, 2018
35e0c2b
travis.yml: skip tokenizer package in old versions
riking Mar 25, 2018
c37ded0
travis.yml: Drop go 1.3 and 1.4 support (bufio.Reader.Discard)
riking Mar 25, 2018
12 changes: 6 additions & 6 deletions .travis.yml
@@ -3,12 +3,12 @@ sudo: false
 
 matrix:
   include:
-    - go: 1.3
-    - go: 1.4
-    - go: 1.5
-    - go: 1.6
-    - go: 1.7
-    - go: 1.8
+    - go: "1.5"
+    - go: "1.6"
+    - go: "1.7"
+    - go: "1.8"
+    - go: "1.9"
+    - go: "1.10"
     - go: tip
   allow_failures:
     - go: tip
4 changes: 4 additions & 0 deletions README.md
@@ -3,3 +3,7 @@ css
 [![GoDoc](https://godoc.org/github.com/gorilla/css?status.svg)](https://godoc.org/github.com/gorilla/css) [![Build Status](https://travis-ci.org/gorilla/css.png?branch=master)](https://travis-ci.org/gorilla/css)
 
 A CSS3 tokenizer.
+
+This repository contains two packages. The 'scanner' package is based on an older version of the CSS specification and is kept for compatibility with existing code. Minimum Go version is 1.3.
+
+The 'tokenizer' package is based on the CSS Syntax Level 3 specification at <https://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms>. Minimum Go version is 1.5.
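
Side by side, the two APIs look roughly like this. This is a minimal sketch: the scanner calls follow the existing gorilla/css/scanner package, and the tokenizer calls mirror the constructor and stop-token check used elsewhere in this diff; treat exact names as subject to each package's godoc.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/gorilla/css/scanner"
	"github.com/gorilla/css/tokenizer"
)

func main() {
	css := "a { color: red }"

	// Legacy scanner: string input, stops on EOF or error.
	s := scanner.New(css)
	for {
		tok := s.Next()
		if tok.Type == scanner.TokenEOF || tok.Type == scanner.TokenError {
			break
		}
		fmt.Println("scanner:", tok)
	}

	// New tokenizer: io.Reader input, generic stop-token check.
	tz := tokenizer.NewTokenizer(strings.NewReader(css))
	for {
		tok := tz.Next()
		if tok.Type.StopToken() {
			break
		}
		fmt.Println("tokenizer:", tok)
	}
}
```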
1 change: 1 addition & 0 deletions tokenizer/.gitignore
@@ -0,0 +1 @@
testdata/fuzz
60 changes: 60 additions & 0 deletions tokenizer/crlf.go
@@ -0,0 +1,60 @@
// Copyright (c) 2018 Kane York. Licensed under 2-Clause BSD.

package tokenizer

// The crlf package helps in dealing with files that have DOS-style CR/LF line
// endings.
//
// Copyright (c) 2015 Andy Balholm. Licensed under 2-Clause BSD.
//
// package crlf

import "golang.org/x/text/transform"

// normalize takes CRLF, CR, or LF line endings in src and converts them
// to LF in dst.
//
// cssparse: Also replaces null bytes with U+FFFD REPLACEMENT CHARACTER.
type normalize struct {
	prev byte
}

const replacementCharacter = "\uFFFD"

func (n *normalize) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nDst < len(dst) && nSrc < len(src) {
		c := src[nSrc]
		switch c {
		case '\r':
			dst[nDst] = '\n'
		case '\n':
			if n.prev == '\r' {
				// Collapse CRLF: the CR already emitted an LF.
				nSrc++
				n.prev = c
				continue
			}
			dst[nDst] = '\n'
		case 0:
			// len(replacementCharacter) == 3; bail out if dst lacks room.
			if nDst+3 > len(dst) {
				err = transform.ErrShortDst
				return
			}
			copy(dst[nDst:], replacementCharacter)
			nDst += 2 // the shared nDst++ below accounts for the third byte
		default:
			dst[nDst] = c
		}
		n.prev = c
		nDst++
		nSrc++
	}
	if nSrc < len(src) {
		err = transform.ErrShortDst
	}
	return
}

func (n *normalize) Reset() {
	n.prev = 0
}
52 changes: 52 additions & 0 deletions tokenizer/doc.go
@@ -0,0 +1,52 @@
// Copyright 2018 Kane York.
// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

/*
Package gorilla/css/tokenizer generates tokens for a CSS3 input.

It follows the CSS3 specification located at:

	http://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms

To use it, create a new tokenizer for a given CSS input and call Next() until
the token returned is a "stop token":

	s := tokenizer.New(strings.NewReader(myCSS))
	for {
		token := s.Next()
		if token.Type.StopToken() {
			break
		}
		// Do something with the token...
	}

If the consumer wants to accept malformed input, use the following check
instead:

	token := s.Next()
	if token.Type == tokenizer.TokenEOF || token.Type == tokenizer.TokenError {
		break
	}

The three potential tokenization errors are a "bad-escape" (backslash-newline
outside a "string" or url() in the input), a "bad-string" (unescaped newline
inside a "string"), and a "bad-url" (a few different cases). Parsers can
choose to abort when seeing one of these errors, or ignore the declaration and
attempt to recover.

Returned tokens that carry extra information have a non-nil .Extra value. For
TokenError, TokenBadEscape, TokenBadString, and TokenBadURI, the
TokenExtraError type carries an `error` with informative text about the nature
of the error. For TokenNumber, TokenPercentage, and TokenDimension, the
TokenExtraNumeric specifies whether the number is integral, and for
TokenDimension, contains the unit string (e.g. "px"). For TokenUnicodeRange,
the TokenExtraUnicodeRange type contains the actual start and end values of the
range.

Note: the tokenizer performs only lexical analysis; it implements Section 4 of
the CSS Syntax Level 3 specification and none of the parsing rules. See
Section 5 for those.
*/
package tokenizer
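
Given the .Extra contract above, a consumer would type-switch on the payload. The sketch below is written under stated assumptions: the type names come from the documentation, but the field names (Unit, Start, End) and the pointer cases are guesses for illustration only.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/gorilla/css/tokenizer"
)

// describeExtra sketches dispatching on Token.Extra. The type names come
// from the package documentation; the field names (Unit, Start, End) are
// assumptions for illustration only.
func describeExtra(tok tokenizer.Token) {
	switch e := tok.Extra.(type) {
	case *tokenizer.TokenExtraError:
		fmt.Println("error detail:", e)
	case *tokenizer.TokenExtraNumeric:
		fmt.Println("dimension unit (if any):", e.Unit)
	case *tokenizer.TokenExtraUnicodeRange:
		fmt.Println("range:", e.Start, e.End)
	case nil:
		// most token types carry no extra payload
	}
}

func main() {
	tz := tokenizer.NewTokenizer(strings.NewReader("u+0100-01ff 12px"))
	for {
		tok := tz.Next()
		if tok.Type.StopToken() {
			break
		}
		describeExtra(tok)
	}
}
```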
108 changes: 108 additions & 0 deletions tokenizer/fuzz.go
@@ -0,0 +1,108 @@
// Copyright 2018 Kane York.

package tokenizer

import (
	"bytes"
	"fmt"
	"io"
	"reflect"
)

// Tests should set this to true to suppress fuzzer output except on failure.
var fuzzNoPrint = false

// Fuzz is the entry point for go-fuzz.
func Fuzz(b []byte) int {
	success := false

	var testLogBuf bytes.Buffer
	fuzzPrintf := func(f string, v ...interface{}) {
		fmt.Fprintf(&testLogBuf, f, v...)
	}
	defer func() {
		if !success {
			fmt.Print(testLogBuf.String())
		}
	}()
	fuzzPrintf("=== Start fuzz test ===\n%s\n", b)

	var tokens []Token
	tz := NewTokenizer(bytes.NewReader(b))
	for {
		tt := tz.Next()
		fuzzPrintf("[OT] %v\n", tt)
		if tt.Type == TokenError {
			// We should not have reading errors
			panic(tt)
		} else if tt.Type == TokenEOF {
			break
		} else {
			tokens = append(tokens, tt)
		}
	}

	// Render and retokenize

	var wr TokenRenderer
	var rerenderBuf bytes.Buffer
	defer func() {
		if !success {
			fuzzPrintf("RE-RENDER BUFFER:\n%s\n", rerenderBuf.String())
		}
	}()
	pr, pw := io.Pipe()
	defer pr.Close()

	go func() {
		writeTarget := io.MultiWriter(&rerenderBuf, pw)
		for _, v := range tokens {
			wr.WriteTokenTo(writeTarget, v)
		}
		pw.Close()
	}()

	tz = NewTokenizer(pr)
	i := 0
	for {
		// Skip comments in the original stream; they are not compared.
		for i < len(tokens) && tokens[i].Type == TokenComment {
			i++
		}
		tt := tz.Next()
		fuzzPrintf("[RT] %v\n", tt)
		if tt.Type == TokenComment {
			// Ignore comments while comparing
			continue
		}
		if tt.Type == TokenError {
			panic(tt)
		}
		if tt.Type == TokenEOF {
			if i != len(tokens) {
				panic(fmt.Sprintf("unexpected EOF: got EOF from retokenizer, but original token stream is at %d/%d\n%v", i, len(tokens), tokens))
			}
			break
		}
		if i == len(tokens) {
			panic(fmt.Sprintf("expected EOF: reached end of original token stream but got %v from retokenizer\n%v", tt, tokens))
		}

		ot := tokens[i]
		if tt.Type != ot.Type {
			panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Type not equal)\n%v", tt, ot, tokens))
		}
		if tt.Value != ot.Value && !tt.Type.StopToken() {
			panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Value not equal)\n%v", tt, ot, tokens))
		}
		if TokenExtraTypeLookup[tt.Type] != nil {
			if !reflect.DeepEqual(tt, ot) && !tt.Type.StopToken() {
				panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Extra not equal)\n%v", tt, ot, tokens))
			}
		}
		i++
	}
	success = true
	return 1
}
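
The fuzzNoPrint flag above and the ignored testdata/fuzz directory suggest the saved corpus is also replayed under go test. Below is a hypothetical sketch of such a test; the corpus path and the test name are assumptions, not taken from this diff.

```go
package tokenizer

import (
	"io/ioutil"
	"path/filepath"
	"testing"
)

// TestFuzzCorpus (hypothetical) replays saved go-fuzz inputs through Fuzz
// so regressions surface in ordinary `go test` runs; Fuzz panics on any
// round-trip mismatch.
func TestFuzzCorpus(t *testing.T) {
	fuzzNoPrint = true // quiet fuzzer logging, per the flag's doc comment
	files, err := filepath.Glob("testdata/fuzz/corpus/*")
	if err != nil {
		t.Fatal(err)
	}
	for _, name := range files {
		data, err := ioutil.ReadFile(name)
		if err != nil {
			t.Fatal(err)
		}
		Fuzz(data)
	}
}
```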