Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(stdlib): add unicode/utf16 package #1764

Merged
merged 3 commits into from
Mar 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/reference/go-gno-compatibility.md
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ Legend:
| time | `full`[^7] |
| time/tzdata | `tbd` |
| unicode | `full` |
| unicode/utf16 | `tbd` |
| unicode/utf16 | `full` |
| unicode/utf8 | `full` |
| unsafe | `nondet` |

Expand Down
1 change: 1 addition & 0 deletions gnovm/pkg/transpiler/transpiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ var stdlibWhitelist = []string{
"time",
"unicode",
"unicode/utf8",
"unicode/utf16",

// gno
"std",
Expand Down
125 changes: 125 additions & 0 deletions gnovm/stdlibs/unicode/utf16/utf16.gno
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package utf16 implements encoding and decoding of UTF-16 sequences.
package utf16

// maxRune and replacementChar are local copies of unicode.MaxRune and
// unicode.ReplacementChar. They are redeclared here so that this package
// does not have to depend on package unicode.
//
// NOTE(review): the values must stay equal to their unicode counterparts;
// no test in the accompanying test file appears to assert this — confirm
// or add one.
const (
	maxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
	replacementChar = '\uFFFD'     // Unicode replacement character
)

// Boundaries of the surrogate ranges. A code point that needs a pair has
// surrSelf subtracted from it; the remaining 20 bits are split into two
// 10-bit halves, each carried by one 16-bit unit.
const (
	surr1 = 0xd800 // values in [surr1, surr2) carry the high 10 bits of a pair
	surr2 = 0xdc00 // values in [surr2, surr3) carry the low 10 bits of a pair
	surr3 = 0xe000 // first value after the surrogate ranges

	surrSelf = 0x10000 // offset added to the 20 combined bits of a pair
)

// IsSurrogate reports whether r falls inside the surrogate range
// [surr1, surr3), that is, whether the code point can appear as one
// half of a surrogate pair.
func IsSurrogate(r rune) bool {
	if r < surr1 {
		return false
	}
	return r < surr3
}

// DecodeRune combines the surrogate pair (r1, r2) into the code point it
// encodes. r1 must be a high surrogate and r2 a low surrogate; for any
// other input the Unicode replacement code point U+FFFD is returned.
func DecodeRune(r1, r2 rune) rune {
	highOK := surr1 <= r1 && r1 < surr2
	lowOK := surr2 <= r2 && r2 < surr3
	if !highOK || !lowOK {
		return replacementChar
	}
	top := (r1 - surr1) << 10 // upper 10 bits of the 20-bit payload
	bottom := r2 - surr2      // lower 10 bits of the 20-bit payload
	return (top | bottom) + surrSelf
}

// EncodeRune splits r into its UTF-16 surrogate pair r1, r2.
// Only runes in the range [surrSelf, maxRune] have a pair; for every other
// rune (invalid, or representable as a single unit) EncodeRune returns
// U+FFFD, U+FFFD.
func EncodeRune(r rune) (r1, r2 rune) {
	if surrSelf <= r && r <= maxRune {
		payload := r - surrSelf
		high := (payload >> 10) & 0x3ff
		low := payload & 0x3ff
		return surr1 + high, surr2 + low
	}
	return replacementChar, replacementChar
}

// Encode converts the code point sequence s to UTF-16. Runes that fit in a
// single unit are copied as-is, runes in [surrSelf, maxRune] become a
// surrogate pair, and everything else (negative values, surrogates, values
// above maxRune) is replaced by U+FFFD.
func Encode(s []rune) []uint16 {
	// Upper bound on the output length: one unit per rune plus one extra
	// unit for every rune at or above surrSelf.
	size := len(s)
	for _, r := range s {
		if r >= surrSelf {
			size++
		}
	}

	out := make([]uint16, 0, size)
	for _, r := range s {
		if (0 <= r && r < surr1) || (surr3 <= r && r < surrSelf) {
			// normal rune
			out = append(out, uint16(r))
			continue
		}
		if surrSelf <= r && r <= maxRune {
			// needs surrogate sequence
			high, low := EncodeRune(r)
			out = append(out, uint16(high), uint16(low))
			continue
		}
		out = append(out, uint16(replacementChar))
	}
	return out
}

// AppendRune appends the UTF-16 encoding of the Unicode code point r
// to the end of a and returns the extended buffer. If r is not a valid
// Unicode code point, the encoding of U+FFFD is appended instead.
func AppendRune(a []uint16, r rune) []uint16 {
	// The single-unit case is checked first so that ASCII takes the
	// shortest path through the function.
	if (0 <= r && r < surr1) || (surr3 <= r && r < surrSelf) {
		// normal rune
		return append(a, uint16(r))
	}
	if r < surrSelf || r > maxRune {
		// negative, a surrogate, or beyond maxRune
		return append(a, uint16(replacementChar))
	}
	// needs surrogate sequence
	high, low := EncodeRune(r)
	return append(a, uint16(high), uint16(low))
}

// Decode converts the UTF-16 sequence s into the code points it represents.
// A high surrogate immediately followed by a low surrogate is combined into
// one rune; any other surrogate unit is replaced by U+FFFD.
func Decode(s []uint16) []rune {
	out := make([]rune, 0, len(s))
	for i := 0; i < len(s); i++ {
		unit := s[i]
		if unit < surr1 || surr3 <= unit {
			// normal rune
			out = append(out, rune(unit))
			continue
		}
		// unit is a surrogate; it only decodes when it is a high surrogate
		// directly followed by a low surrogate.
		if unit < surr2 && i+1 < len(s) {
			if next := s[i+1]; surr2 <= next && next < surr3 {
				// valid surrogate sequence
				out = append(out, DecodeRune(rune(unit), rune(next)))
				i++ // the low surrogate has been consumed too
				continue
			}
		}
		// invalid surrogate sequence
		out = append(out, replacementChar)
	}
	return out
}
221 changes: 221 additions & 0 deletions gnovm/stdlibs/unicode/utf16/utf16_test.gno
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package utf16

import (
"testing"
"unicode"
"unicode/utf16"
)

// encodeTest pairs a rune sequence with the UTF-16 units expected for it.
type encodeTest struct {
	in  []rune
	out []uint16
}

// encodeTests is the table shared by the encoding tests.
var encodeTests = []encodeTest{
	// Small values map to themselves.
	{
		in:  []rune{1, 2, 3, 4},
		out: []uint16{1, 2, 3, 4},
	},
	// 0xffff is a single unit; each of the other runes becomes a pair.
	{
		in:  []rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
		out: []uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
	},
	// 0xd800, 0xdfff, 0x110000 and -1 are expected to come out as 0xfffd.
	{
		in:  []rune{'a', 'b', 0xd7ff, 0xd800, 0xdfff, 0xe000, 0x110000, -1},
		out: []uint16{'a', 'b', 0xd7ff, 0xfffd, 0xfffd, 0xe000, 0xfffd, 0xfffd},
	},
}

// slicesEqual reports whether a and b have the same length and hold the
// same UTF-16 units in the same order.
func slicesEqual(a, b []uint16) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for i := 0; i < n; i++ {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

// TestEncode runs Encode over every entry of encodeTests and reports any
// entry whose output differs from the expected UTF-16 units.
func TestEncode(t *testing.T) {
	for _, tc := range encodeTests {
		got := Encode(tc.in)
		if slicesEqual(got, tc.out) {
			continue
		}
		t.Errorf("Encode(%x) = %x; want %x", tc.in, got, tc.out)
	}
}

// TestEncodeRune walks each encodeTests entry rune by rune. Runes that do
// not need a pair must make EncodeRune return U+FFFD twice and account for
// one expected unit; all other runes must yield the next two expected units
// and must round-trip through DecodeRune.
func TestEncodeRune(t *testing.T) {
	for idx, tc := range encodeTests {
		pos := 0 // index into tc.out of the next expected unit
		for _, r := range tc.in {
			high, low := EncodeRune(r)

			if r < 0x10000 || r > unicode.MaxRune {
				// Not encodable as a pair.
				if pos >= len(tc.out) {
					t.Errorf("#%d: ran out of tt.out", idx)
					break
				}
				if high != unicode.ReplacementChar || low != unicode.ReplacementChar {
					t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, high, low)
				}
				pos++
				continue
			}

			// Encodable as a pair: two expected units are consumed.
			if pos+1 >= len(tc.out) {
				t.Errorf("#%d: ran out of tt.out", idx)
				break
			}
			want1, want2 := tc.out[pos], tc.out[pos+1]
			if high != rune(want1) || low != rune(want2) {
				t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, high, low, want1, want2)
			}
			pos += 2
			if back := DecodeRune(high, low); back != r {
				t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", high, low, back, r)
			}
		}
		if pos != len(tc.out) {
			t.Errorf("#%d: EncodeRune didn't generate enough output", idx)
		}
	}
}

// decodeTest pairs a UTF-16 sequence with the runes expected from decoding it.
type decodeTest struct {
	in  []uint16
	out []rune
}

// decodeTests is the table used by TestDecode.
var decodeTests = []decodeTest{
	// Small values map to themselves.
	{
		in:  []uint16{1, 2, 3, 4},
		out: []rune{1, 2, 3, 4},
	},
	// 0xffff stands alone; each following pair of units combines into one rune.
	{
		in:  []uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
		out: []rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
	},
	// 0xd800 followed by 'a' is expected to decode to 0xfffd, then 'a'.
	{
		in:  []uint16{0xd800, 'a'},
		out: []rune{0xfffd, 'a'},
	},
	// A lone 0xdfff is expected to decode to 0xfffd.
	{
		in:  []uint16{0xdfff},
		out: []rune{0xfffd},
	},
}

// TestDecode runs Decode over every entry of decodeTests and reports any
// entry whose output differs from the expected runes.
func TestDecode(t *testing.T) {
	for _, tc := range decodeTests {
		if got := Decode(tc.in); !runesEqual(got, tc.out) {
			t.Errorf("Decode(%x) = %x; want %x", tc.in, got, tc.out)
		}
	}
}

// runesEqual reports whether a and b have the same length and hold the
// same runes in the same order.
func runesEqual(a, b []rune) bool {
	if len(b) != len(a) {
		return false
	}
	for i := range b {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

// decodeRuneTests lists surrogate pairs together with the rune DecodeRune
// is expected to return for each.
var decodeRuneTests = []struct {
	r1, r2 rune
	want   rune
}{
	{r1: 0xd800, r2: 0xdc00, want: 0x10000},
	{r1: 0xd800, r2: 0xdc01, want: 0x10001},
	{r1: 0xd808, r2: 0xdf45, want: 0x12345},
	{r1: 0xdbff, r2: 0xdfff, want: 0x10ffff},
	// illegal, replacement rune substituted
	{r1: 0xd800, r2: 'a', want: 0xfffd},
}

// TestDecodeRune checks DecodeRune against every entry of decodeRuneTests.
func TestDecodeRune(t *testing.T) {
	for idx := range decodeRuneTests {
		tc := decodeRuneTests[idx]
		if got := DecodeRune(tc.r1, tc.r2); got != tc.want {
			t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", idx, tc.r1, tc.r2, got, tc.want)
		}
	}
}

// surrogateTests lists code points together with the result expected from
// IsSurrogate. The first group is taken from
// https://en.wikipedia.org/wiki/UTF-16; the second group probes the
// boundaries of the surrogate range.
var surrogateTests = []struct {
	r    rune
	want bool
}{
	{r: '\u007A', want: false},     // LATIN SMALL LETTER Z
	{r: '\u6C34', want: false},     // CJK UNIFIED IDEOGRAPH-6C34 (water)
	{r: '\uFEFF', want: false},     // Byte Order Mark
	{r: '\U00010000', want: false}, // LINEAR B SYLLABLE B008 A (first non-BMP code point)
	{r: '\U0001D11E', want: false}, // MUSICAL SYMBOL G CLEF
	{r: '\U0010FFFD', want: false}, // PRIVATE USE CHARACTER-10FFFD

	{r: 0xd7ff, want: false}, // surr1-1
	{r: 0xd800, want: true},  // surr1
	{r: 0xdc00, want: true},  // surr2
	{r: 0xe000, want: false}, // surr3
	{r: 0xdfff, want: true},  // surr3-1
}

// TestIsSurrogate checks IsSurrogate against every entry of surrogateTests.
func TestIsSurrogate(t *testing.T) {
	for idx := range surrogateTests {
		tc := surrogateTests[idx]
		if got := IsSurrogate(tc.r); got != tc.want {
			t.Errorf("%d: IsSurrogate(%q) = %v; want %v", idx, tc.r, got, tc.want)
		}
	}
}

// BenchmarkDecodeValidASCII measures Decode on the UTF-16 units of the
// ASCII text "hello world".
func BenchmarkDecodeValidASCII(b *testing.B) {
	units := []uint16{'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'}
	for iter := 0; iter < b.N; iter++ {
		Decode(units)
	}
}

// BenchmarkDecodeValidJapaneseChars measures Decode on the UTF-16 units of
// "日本語日本語日本語", all of which are single-unit characters.
func BenchmarkDecodeValidJapaneseChars(b *testing.B) {
	units := []uint16{
		'日', '本', '語',
		'日', '本', '語',
		'日', '本', '語',
	}
	for iter := 0; iter < b.N; iter++ {
		Decode(units)
	}
}

// BenchmarkDecodeRune measures DecodeRune on five surrogate pairs that are
// prepared with EncodeRune before the timer is reset.
func BenchmarkDecodeRune(b *testing.B) {
	// U+1D4D0 to U+1D4D4: MATHEMATICAL BOLD SCRIPT CAPITAL LETTERS
	letters := []rune{'\U0001D4D0', '\U0001D4D1', '\U0001D4D2', '\U0001D4D3', '\U0001D4D4'}

	// pairs holds the surrogates back to back: high, low, high, low, ...
	pairs := make([]rune, 0, 10)
	for _, u := range letters {
		high, low := EncodeRune(u)
		pairs = append(pairs, high, low)
	}

	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		for k := 0; k < 10; k += 2 {
			DecodeRune(pairs[k], pairs[k+1])
		}
	}
}

// BenchmarkEncodeValidASCII measures Encode on the runes of "hello".
func BenchmarkEncodeValidASCII(b *testing.B) {
	runes := []rune("hello")
	for iter := 0; iter < b.N; iter++ {
		Encode(runes)
	}
}

// BenchmarkEncodeValidJapaneseChars measures Encode on the runes of "日本語".
func BenchmarkEncodeValidJapaneseChars(b *testing.B) {
	runes := []rune("日本語")
	for iter := 0; iter < b.N; iter++ {
		Encode(runes)
	}
}

// BenchmarkEncodeRune measures EncodeRune on five runes that each require a
// surrogate pair.
//
// The rune set is built once, before the measured loop, so that the b.N
// iterations do not re-evaluate the slice literal and time only the
// EncodeRune calls.
func BenchmarkEncodeRune(b *testing.B) {
	// U+1D4D0 to U+1D4D4: MATHEMATICAL BOLD SCRIPT CAPITAL LETTERS
	letters := []rune{'\U0001D4D0', '\U0001D4D1', '\U0001D4D2', '\U0001D4D3', '\U0001D4D4'}
	for i := 0; i < b.N; i++ {
		for _, u := range letters {
			EncodeRune(u)
		}
	}
}
Loading