diff --git a/decode_test.go b/decode_test.go index e67bc58b..6779d8e0 100644 --- a/decode_test.go +++ b/decode_test.go @@ -458,30 +458,6 @@ func TestDecodeBadDatetime(t *testing.T) { } } -func TestDecodeMultilineStrings(t *testing.T) { - var x struct { - S string - } - const s0 = `s = """ -a b \n c -d e f -"""` - if _, err := Decode(s0, &x); err != nil { - t.Fatal(err) - } - if want := "a b \n c\nd e f\n"; x.S != want { - t.Errorf("got: %q; want: %q", x.S, want) - } - const s1 = `s = """a b c\ -"""` - if _, err := Decode(s1, &x); err != nil { - t.Fatal(err) - } - if want := "a b c"; x.S != want { - t.Errorf("got: %q; want: %q", x.S, want) - } -} - type sphere struct { Center [3]float64 Radius float64 @@ -1004,6 +980,16 @@ func TestDecodeErrors(t *testing.T) { {`x = [{ key = 42 #`, "expected a comma or an inline table terminator", true}, // panic {`x = {a = 42 #`, "expected a comma or an inline table terminator '}', but got end of file instead", true}, {`x = [42 #`, "expected a comma or array terminator ']', but got end of file instead", false}, + + // Literal control characters are not allowed in any strings + {`x = """` + "\r" + `"""`, `control characters are not allowed`, true}, + {`x = """` + "\x01" + `"""`, `control characters are not allowed`, true}, + {`x = '''` + "\r" + `'''`, `control characters are not allowed`, true}, + {`x = '''` + "\x01" + `'''`, `control characters are not allowed`, true}, + {`x = "` + "\r" + `"`, `control characters are not allowed`, true}, + {`x = "` + "\x01" + `"`, `control characters are not allowed`, true}, + {`x = '` + "\r" + `'`, `control characters are not allowed`, true}, + {`x = '` + "\x01" + `'`, `control characters are not allowed`, true}, } for _, tt := range tests { @@ -1028,36 +1014,66 @@ func TestDecodeMultilineNewlines(t *testing.T) { in string want string }{ - // Note `NL` gets replaced by "\n"; this makes it easier to read and - // write these tests. 
+ // Note "NL" gets replaced by "\n" and "\r\n" (the tests are run twice); + // this makes it easier to read and write these tests. + + {`x = """"""`, ``}, + {`x = """\NL"""`, ``}, // Empty string + {`x = """\NL\NL\NL"""`, ``}, // Empty string - {`x=""""""`, ``}, - {`x="""\NL"""`, ``}, // Empty string - {`x="""\NL\NL\NL"""`, ``}, // Empty string + {`x = """a\NL u2222b"""`, `au2222b`}, // Remove all whitespace after \ + {`x = """a\NLNLNLu2222b"""`, `au2222b`}, // Remove all newlines + {`x = """a \NL u2222b"""`, `a u2222b`}, // Don't remove whitespace before \ - {`x="""a\NL u2222b"""`, `au2222b`}, // Remove all whitespace after \ - {`x="""a\NLNLNLu2222b"""`, `au2222b`}, // Remove all newlines - {`x="""a \NL u2222b"""`, `a u2222b`}, // Don't remove whitespace before \ + {`x = """a \ NLb"""`, `a b`}, // Allow any whitespace between \ and \n + {`x = """a \ NL b"""`, `a b`}, + {`x = """a \ NLb"""`, `a b`}, {`x="""a\NLu2222b"""`, `au2222b`}, // Ends in \ → remove {`x="""a\\NLu2222b"""`, `a\NLu2222b`}, // Ends in \\ → literal backslash, so keep NL. {`x="""a\\\NLu2222b"""`, `a\u2222b`}, // Ends in \\\ → backslash followed by NL escape, so remove. 
{`x="""a\\\\NLu2222b"""`, `a\\NLu2222b`}, // Ends in \\\\ → two lieral backslashes; keep NL + + {`x = """NLa b \n cNLd e fNL"""`, "a b \n c\nd e f\n"}, + {`x = """a b c\NL"""`, "a b c"}, + + {`x = """NLThe quick brown \NLNLNLfox jumps over \NL the lazy dog."""`, + `The quick brown fox jumps over the lazy dog.`}, + {`x = """\NL The quick brown \NLNLNL fox jumps over \NL the lazy dog.\NL """`, + `The quick brown fox jumps over the lazy dog.`}, } + replUnix := strings.NewReplacer("NL", "\n") + replWin := strings.NewReplacer("NL", "\r\n") for _, tt := range tests { t.Run("", func(t *testing.T) { - tt.in = strings.ReplaceAll(tt.in, "NL", "\n") - tt.want = strings.ReplaceAll(tt.want, "NL", "\n") - - var s struct{ X string } - _, err := Decode(tt.in, &s) - if err != nil { - t.Fatal(err) - } - if s.X != tt.want { - t.Errorf("\nhave: %s\nwant: %s", s.X, tt.want) - } + t.Run("unix", func(t *testing.T) { + in := replUnix.Replace(tt.in) + want := replUnix.Replace(tt.want) + + var s struct{ X string } + _, err := Decode(in, &s) + if err != nil { + t.Fatal(err) + } + if s.X != want { + t.Errorf("\nhave: %q\nwant: %q", s.X, want) + } + }) + + t.Run("windows", func(t *testing.T) { + in := replWin.Replace(tt.in) + want := replWin.Replace(tt.want) + + var s struct{ X string } + _, err := Decode(in, &s) + if err != nil { + t.Fatal(err) + } + if s.X != want { + t.Errorf("\nhave: %q\nwant: %q", s.X, want) + } + }) }) } } diff --git a/lex.go b/lex.go index 212947ae..86912a30 100644 --- a/lex.go +++ b/lex.go @@ -596,6 +596,8 @@ func lexString(lx *lexer) stateFn { switch { case r == eof: return lx.errorf(`unexpected EOF; expected '"'`) + case isControl(r) || r == '\r': + return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r) case isNL(r): return lx.errorf("strings cannot contain newlines") case r == '\\': @@ -614,9 +616,15 @@ func lexString(lx *lexer) stateFn { // lexMultilineString consumes the inner contents of a string. 
It assumes that // the beginning '"""' has already been consumed and ignored. func lexMultilineString(lx *lexer) stateFn { - switch lx.next() { + r := lx.next() + switch r { case eof: return lx.errorf(`unexpected EOF; expected '"""'`) + case '\r': + if lx.peek() != '\n' { + return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r) + } + return lexMultilineString case '\\': return lexMultilineStringEscape case stringEnd: @@ -635,6 +643,10 @@ func lexMultilineString(lx *lexer) stateFn { lx.backup() } } + + if isControl(r) { + return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r) + } return lexMultilineString } @@ -645,6 +657,8 @@ func lexRawString(lx *lexer) stateFn { switch { case r == eof: return lx.errorf(`unexpected EOF; expected "'"`) + case isControl(r) || r == '\r': + return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r) case isNL(r): return lx.errorf("strings cannot contain newlines") case r == rawStringEnd: @@ -661,9 +675,15 @@ func lexRawString(lx *lexer) stateFn { // a string. It assumes that the beginning "'''" has already been consumed and // ignored. func lexMultilineRawString(lx *lexer) stateFn { - switch lx.next() { + r := lx.next() + switch r { case eof: return lx.errorf(`unexpected EOF; expected "'''"`) + case '\r': + if lx.peek() != '\n' { + return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r) + } + return lexMultilineRawString case rawStringEnd: if lx.accept(rawStringEnd) { if lx.accept(rawStringEnd) { @@ -680,6 +700,10 @@ func lexMultilineRawString(lx *lexer) stateFn { lx.backup() } } + + if isControl(r) { + return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r) + } return lexMultilineRawString } @@ -710,6 +734,10 @@ func lexStringEscape(lx *lexer) stateFn { fallthrough case '"': fallthrough + // Inside """ .. 
""" strings you can use \ to escape newlines, and any + // amount of whitespace can be between the \ and \n. + case ' ', '\t': + fallthrough case '\\': return lx.pop() case 'u': @@ -908,6 +936,16 @@ func isNL(r rune) bool { return r == '\n' || r == '\r' } +// isControl reports whether r is a control character other than \t, \r, or \n. +func isControl(r rune) bool { + switch r { + case '\t', '\r', '\n': + return false + default: + return (r >= 0x00 && r <= 0x1f) || r == 0x7f + } +} + func isDigit(r rune) bool { return r >= '0' && r <= '9' } diff --git a/parse.go b/parse.go index 1505a71b..8b0f74f0 100644 --- a/parse.go +++ b/parse.go @@ -537,47 +537,54 @@ func (p *parser) current() string { } func stripFirstNewline(s string) string { - if len(s) == 0 || s[0] != '\n' { - return s + if len(s) > 0 && s[0] == '\n' { + return s[1:] + } + if len(s) > 1 && s[0] == '\r' && s[1] == '\n' { + return s[2:] } - return s[1:] + return s } // Remove newlines inside triple-quoted strings if a line ends with "\". -// -// \NL → remove -// \\NL → is escaped: do nothing -// \\\NL → is backslash and then \\n: remove func stripEscapedNewlines(s string) string { - i := strings.Index(s, "\\\n") - if i == -1 { + split := strings.Split(s, "\n") + if len(split) < 1 { return s } - // Find all instances of "\\n"; remove them unless it's prefixed by an odd - // number of "\"s, incidating this was escaped. - var ( - b strings.Builder - upto string - ) - b.Grow(len(s)) - for ; i > -1; i = strings.Index(s, "\\\n") { - upto, s = s[:i], s[i+1:] - c := 0 - for j := len(upto) - 1; j >= 0 && upto[j] == '\\'; j-- { - c++ - } - - b.WriteString(upto) - if c > 0 && c%2 == 1 { - b.WriteString("\\") - } else { - s = strings.TrimLeft(s, " \n\t") + escNL := false // Keep track of whether the last non-blank line was escaped. 
+ for i, line := range split { + line = strings.TrimRight(line, " \t\r") + + if len(line) == 0 || line[len(line)-1] != '\\' { + split[i] = strings.TrimRight(split[i], "\r") + if escNL = escNL && len(line) == 0; !escNL && i != len(split)-1 { // Only blank lines stay swallowed by an earlier escaped newline. + split[i] += "\n" + } + continue } - } - b.WriteString(s) - return b.String() + escBS := true + for j := len(line) - 1; j >= 0 && line[j] == '\\'; j-- { + escBS = !escBS + } + if escNL { + line = strings.TrimLeft(line, " \t\r") + } + escNL = !escBS + + if escBS { + split[i] += "\n" + continue + } + + split[i] = line[:len(line)-1] // Remove \ + if len(split)-1 > i { + split[i+1] = strings.TrimLeft(split[i+1], " \t\r") + } + } + return strings.Join(split, "") } func (p *parser) replaceEscapes(str string) string {