Skip to content

Commit

Permalink
Implement Parquet Support For Egress Pipeline Output (#67)
Browse files Browse the repository at this point in the history
  • Loading branch information
atris authored May 13, 2021
1 parent d80834d commit d7001c3
Show file tree
Hide file tree
Showing 14 changed files with 666 additions and 15 deletions.
2 changes: 1 addition & 1 deletion internal/encoding/block/block_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func TestBlock_Types(t *testing.T) {
assert.NoError(t, err)

schema := b[0].Schema()
assert.Equal(t, 8, len(schema))
assert.Equal(t, 9, len(schema))
assert.Contains(t, schema, "boolean1")
assert.Contains(t, schema, "double1")
assert.Contains(t, schema, "int1")
Expand Down
1 change: 1 addition & 0 deletions internal/encoding/block/from_orc.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,5 +147,6 @@ func convertToString(value interface{}) (string, bool) {
if ok {
return strconv.FormatInt(valueInt, 10), true
}

return "", false
}
40 changes: 36 additions & 4 deletions internal/encoding/block/from_parquet.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package block

import (
"fmt"

"github.com/kelindar/talaria/internal/column"
"github.com/kelindar/talaria/internal/encoding/parquet"
"github.com/kelindar/talaria/internal/encoding/typeof"
Expand Down Expand Up @@ -65,10 +67,9 @@ func FromParquetBy(payload []byte, partitionBy string, filter *typeof.Schema, ap
columnName := cols[i]
columnType := schema[columnName]

// Encode to JSON
if columnType == typeof.JSON {
if encoded, ok := convertToJSON(v); ok {
v = encoded
if handler := parquetHandlerFor(columnType.String()); handler != nil {
if v, err = handler(v); err != nil {
return true
}
}

Expand All @@ -92,3 +93,34 @@ func FromParquetBy(payload []byte, partitionBy string, filter *typeof.Schema, ap
blocks = append(blocks, last...)
return blocks, nil
}

type parquetFieldHandler func(interface{}) (interface{}, error)

func parquetHandlerFor(typ string) parquetFieldHandler {
switch typ {
case "string":
return parquetStringHandler

case "json":
return parquetJsonHandler
}

return nil
}

func parquetStringHandler(s interface{}) (interface{}, error) {
switch v := s.(type) {
case []byte:
return string(v), nil
}

return nil, nil
}

func parquetJsonHandler(s interface{}) (interface{}, error) {
if encoded, ok := convertToJSON(s); ok {
return encoded, nil
}

return nil, fmt.Errorf("Failed to convert to JSON")
}
3 changes: 2 additions & 1 deletion internal/encoding/block/from_parquet_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
package block

import (
"github.com/stretchr/testify/assert"
"io/ioutil"
"testing"

"github.com/stretchr/testify/assert"
)

const testFileForParquet = "../../../test/test2.parquet"
Expand Down
2 changes: 2 additions & 0 deletions internal/encoding/merge/merge.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ func New(mergeFunc string) (Func, error) {
switch strings.ToLower(mergeFunc) {
case "orc":
return ToOrc, nil
case "parquet":
return ToParquet, nil
case "": // Default to "orc" so we don't break existing configs
return ToOrc, nil
}
Expand Down
10 changes: 10 additions & 0 deletions internal/encoding/merge/merge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
// BenchmarkFlush runs a benchmark for a Merge function for flushing
// To run it, go in the directory and do 'go test -benchmem -bench=. -benchtime=1s'
// BenchmarkMerge/orc-8 1 7195029600 ns/op 2101578032 B/op 36859501 allocs/op
// BenchmarkMerge/parquet-12 1 18666411036 ns/op 5142058320 B/op 115850463 allocs/op
func BenchmarkMerge(b *testing.B) {

// Append some files
Expand All @@ -27,6 +28,15 @@ func BenchmarkMerge(b *testing.B) {
ToOrc(blocks, blocks[0].Schema())
}
})

// Run the actual benchmark
b.Run("parquet", func(b *testing.B) {
b.ResetTimer()
b.ReportAllocs()
for n := 0; n < b.N; n++ {
ToParquet(blocks, blocks[0].Schema())
}
})
}

func TestMergeNew(t *testing.T) {
Expand Down
Loading

0 comments on commit d7001c3

Please sign in to comment.