Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-38718: [Go][Format][Integration] Add StringView/BinaryView to Go implementation #35769

Merged
merged 38 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
d88fc91
flatbuffer regen and add types
zeroshade May 19, 2023
a0eb736
initial impl of binary/string views
zeroshade May 22, 2023
daf1796
implement concat
zeroshade May 23, 2023
0491fd2
initial IPC stuff
zeroshade May 24, 2023
0ba9d56
ipc integration tests
zeroshade May 25, 2023
46e7034
don't bump to v6 metadata yet
zeroshade May 25, 2023
2a85125
update datagen with fix
zeroshade May 25, 2023
0b141d8
fix lint and integration issue
zeroshade May 26, 2023
7e19d39
forgot to update the test
zeroshade May 26, 2023
15888ac
fix 32bit and tinygo compat
zeroshade Jun 15, 2023
76e9bbc
update inlinedata for compatibility with go1.20 and tinygo
zeroshade Jun 15, 2023
3595e5d
Update go/arrow/datatype_binary.go
zeroshade Jun 21, 2023
92a8362
Update go/arrow/datatype_binary.go
zeroshade Jun 21, 2023
704bf82
Update go/arrow/array/binary.go
zeroshade Jun 21, 2023
dcdc1b1
embedded field
zeroshade Jun 21, 2023
c15b7ba
updates from review feedback
zeroshade Jun 21, 2023
d6bbd35
add AppendNulls and AppendEmptyValues
zeroshade Jun 21, 2023
8dbcf52
handle flaky test
zeroshade Jun 21, 2023
d0e03bb
Update go/arrow/internal/testing/gen/random_array_gen.go
zeroshade Jun 22, 2023
646b1e2
Update go/arrow/array/binarybuilder.go
zeroshade Jun 22, 2023
24fb628
Update go/arrow/array/binarybuilder.go
zeroshade Jun 22, 2023
306ee94
Update go/arrow/array/bufferbuilder.go
zeroshade Jun 22, 2023
5dc1d51
Update go/arrow/datatype_stringheader.go
zeroshade Jun 22, 2023
b620e45
updates from review feedback
zeroshade Jun 22, 2023
0b10bed
update gitattributes for generated go files
zeroshade Jun 22, 2023
a009c47
updates from merge
zeroshade Oct 25, 2023
bccebbe
rename and fix imports
zeroshade Oct 26, 2023
1792a98
implement rename
zeroshade Oct 26, 2023
5cfc237
fix endian swap default
zeroshade Oct 26, 2023
829a850
updates from round of feedback
zeroshade Oct 27, 2023
a560917
rename to InlineString
zeroshade Oct 27, 2023
a84ee2e
Update go/arrow/datatype.go
zeroshade Oct 30, 2023
be3b3a5
updates from feedback
zeroshade Oct 30, 2023
e2bbe6f
Update go/arrow/datatype_viewheader.go
zeroshade Nov 13, 2023
cab4899
Merge branch 'main' into string-view
zeroshade Nov 13, 2023
9db6557
update with rebase
zeroshade Nov 13, 2023
460c06e
fix version imports
zeroshade Nov 13, 2023
8e41840
fix formatting, updates from feedback, add tests
zeroshade Nov 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ cpp/src/generated/*.cpp linguist-generated=true
cpp/src/generated/*.h linguist-generated=true
go/**/*.s linguist-generated=true
go/arrow/unionmode_string.go linguist-generated=true
go/arrow/internal/flatbuf/*.go linguist-generated=true
go/**/*.pb.go linguist-generated=true
go/parquet/internal/gen-go/parquet/*.go linguist-generated=true
r/R/RcppExports.R linguist-generated=true
r/R/arrowExports.R linguist-generated=true
r/src/RcppExports.cpp linguist-generated=true
Expand Down
4 changes: 4 additions & 0 deletions docs/source/status.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ Data Types
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Large Utf8 |||| | ||| |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Binary View || || | | | | |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| String View || || | | | | |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+

+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift |
Expand Down
2 changes: 1 addition & 1 deletion format/Schema.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ enum MetadataVersion:short {
/// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
V4,

/// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4
/// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
/// metadata and IPC messages). Implementations are recommended to provide a
/// V4 compatibility mode with V5 format changes disabled.
///
Expand Down
3 changes: 2 additions & 1 deletion go/arrow/array/array.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ func init() {
arrow.RUN_END_ENCODED: func(data arrow.ArrayData) arrow.Array { return NewRunEndEncodedData(data) },
arrow.LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewListViewData(data) },
arrow.LARGE_LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewLargeListViewData(data) },

arrow.BINARY_VIEW: func(data arrow.ArrayData) arrow.Array { return NewBinaryViewData(data) },
arrow.STRING_VIEW: func(data arrow.ArrayData) arrow.Array { return NewStringViewData(data) },
// invalid data types to fill out array to size 2^6 - 1
63: invalidDataType,
}
Expand Down
121 changes: 121 additions & 0 deletions go/arrow/array/binary.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"unsafe"

"github.com/apache/arrow/go/v14/arrow"
"github.com/apache/arrow/go/v14/arrow/memory"
"github.com/apache/arrow/go/v14/internal/json"
)

Expand Down Expand Up @@ -318,6 +319,126 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
return true
}

// ViewLike is an arrow.Array that additionally exposes, for each element,
// a pointer to the arrow.ViewHeader describing that element.
type ViewLike interface {
arrow.Array
// ValueHeader returns the view header for the element at the given index.
ValueHeader(int) *arrow.ViewHeader
}

// BinaryView is an array whose elements are described by arrow.ViewHeader
// entries; the bytes of an element are either held by its header or located
// in one of the data buffers (see Value).
type BinaryView struct {
array
// values is the header slice obtained by casting the bytes of the array
// data's buffer at index 1 (see setData).
values []arrow.ViewHeader
// dataBuffers is the array data's buffer list from index 2 onward; Value
// selects an entry using a header's BufferIndex.
dataBuffers []*memory.Buffer
}

// NewBinaryViewData creates a BinaryView with a reference count of 1 that is
// backed by data. data must be a *Data: the unchecked type assertion below
// panics for any other arrow.ArrayData implementation.
func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
a := &BinaryView{}
a.refCount = 1
zeroshade marked this conversation as resolved.
Show resolved Hide resolved
a.setData(data.(*Data))
return a
}

// setData installs data as the backing storage of the array.
//
// It panics if data carries fewer than two buffers. After delegating to the
// embedded array's setData, the buffer at index 1 (when non-nil) is
// reinterpreted as a slice of arrow.ViewHeader, and every buffer from index 2
// onward is kept as the list of data buffers.
func (a *BinaryView) setData(data *Data) {
	if len(data.buffers) < 2 {
		panic("len(data.buffers) < 2")
	}
	a.array.setData(data)

	// Read the buffer list only after the embedded setData call has run.
	bufs := data.buffers
	headerBuf, rest := bufs[1], bufs[2:]
	if headerBuf != nil {
		a.values = arrow.ViewHeaderTraits.CastFromBytes(headerBuf.Bytes())
	}
	a.dataBuffers = rest
}

// ValueHeader returns a pointer to the view header of the element at index i.
// The index is relative to this array: the array data's offset is added
// before indexing into the header slice. It panics if i is negative or not
// less than the array data's length.
func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader {
	if !(i >= 0 && i < a.array.data.length) {
		panic("arrow/array: index out of range")
	}
	pos := a.array.data.offset + i
	return &a.values[pos]
}

// Value returns the bytes of the element at index i.
//
// The element's header is fetched with ValueHeader, which panics when i is
// out of range. If the header reports IsInline, the result is the header's
// InlineBytes. Otherwise the result is a sub-slice of the data buffer
// selected by the header's BufferIndex, starting at BufferOffset and
// spanning Len bytes. The element's validity (null) is not checked here.
func (a *BinaryView) Value(i int) []byte {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since there is no compiler guarantee in Go that the slice returned by this method will remain unchanged (whether intentionally or unintentionally) by users of this API, I would suggest adding a comment to this method. This comment should clearly specify that it's unsafe to make any kind of changes to the returned slice.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a concern present everywhere in Arrow though, so a comment here could be understood as implying that places without a comment like this allow buffers to be mutated.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably a reflex from my Rust experience where this slice will be immutable. In the context of Go, I agree with you that sporadically adding a comment might be counterproductive if we do not apply this globally.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the top level, there do already exist comments that state that it is intended that all Arrow Arrays be immutable. I agree with the concern that adding a comment here specifically could be counterproductive. If you can think of a good place to put such a comment that would be more universal, I'd be more than happy to do so.

s := a.ValueHeader(i)
// Inline case: the header itself supplies the bytes.
if s.IsInline() {
return s.InlineBytes()
}
// Out-of-line case: slice [start, start+Len) from the referenced data buffer.
start := s.BufferOffset()
buf := a.dataBuffers[s.BufferIndex()]
return buf.Bytes()[start : start+int32(s.Len())]
}

// ValueString returns the element at index i as a string rather than a byte
// slice. No bytes are copied: the slice obtained from Value is reinterpreted
// in place as a string through an unsafe pointer cast.
func (a *BinaryView) ValueString(i int) string {
	raw := a.Value(i)
	return *(*string)(unsafe.Pointer(&raw))
}

// String renders the array as a bracketed, space-separated list. Null
// elements are written as NullValueStr; every other element is written as
// the %q-quoted form of its ValueString.
func (a *BinaryView) String() string {
	var sb strings.Builder
	sb.WriteString("[")
	for idx, n := 0, a.Len(); idx < n; idx++ {
		if idx != 0 {
			sb.WriteString(" ")
		}
		if a.IsNull(idx) {
			sb.WriteString(NullValueStr)
			continue
		}
		fmt.Fprintf(&sb, "%q", a.ValueString(idx))
	}
	sb.WriteString("]")
	return sb.String()
}

// ValueStr is paired with AppendValueFromString in that it returns
// the value at index i as a string: semantically this means that for
// a null value it returns NullValueStr, otherwise it returns the value
// encoded with base64.StdEncoding, suitable for CSV/JSON.
//
// This is always going to be less performant than just using ValueString
// and exists to fulfill the Array interface by providing a method which
// can produce a printable string for a given index.
func (a *BinaryView) ValueStr(i int) string {
zeroshade marked this conversation as resolved.
Show resolved Hide resolved
zeroshade marked this conversation as resolved.
Show resolved Hide resolved
if a.IsNull(i) {
return NullValueStr
}
return base64.StdEncoding.EncodeToString(a.Value(i))
}

// GetOneForMarshal returns the value to hand to the JSON encoder for the
// element at index i: the element's bytes (as returned by Value) when it is
// not null, and nil when it is null.
func (a *BinaryView) GetOneForMarshal(i int) interface{} {
	if !a.IsNull(i) {
		return a.Value(i)
	}
	return nil
}

// MarshalJSON encodes the array by collecting GetOneForMarshal(i) for every
// index into a slice and passing that slice to json.Marshal. Null elements
// are therefore passed as nil and all other elements as []byte.
func (a *BinaryView) MarshalJSON() ([]byte, error) {
bkietz marked this conversation as resolved.
Show resolved Hide resolved
vals := make([]interface{}, a.Len())
for i := 0; i < a.Len(); i++ {
vals[i] = a.GetOneForMarshal(i)
}
// golang marshal standard says that []byte will be marshalled
// as a base64-encoded string
return json.Marshal(vals)
}

// arrayEqualBinaryView reports whether every element of left that is not
// null has a view header equal to the header at the same index of right,
// with each side's header resolved against its own data buffers.
//
// Only left's length and left's null flags are consulted here.
// NOTE(review): presumably the caller has already verified that both arrays
// have the same length and matching validity — confirm against the caller.
func arrayEqualBinaryView(left, right *BinaryView) bool {
leftBufs, rightBufs := left.dataBuffers, right.dataBuffers
for i := 0; i < left.Len(); i++ {
// Indices that are null in left are not compared.
if left.IsNull(i) {
zeroshade marked this conversation as resolved.
Show resolved Hide resolved
continue
}
if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) {
return false
}
}
return true
}

// Compile-time assertions that each binary array type implements arrow.Array.
var (
_ arrow.Array = (*Binary)(nil)
_ arrow.Array = (*LargeBinary)(nil)
_ arrow.Array = (*BinaryView)(nil)
)
Loading