From a3d15a21d7dcd07c696c0c1ae03a2a73d86cf7a6 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 20 Aug 2019 15:19:44 -0500 Subject: [PATCH 01/26] WIP: proof of concept for batch based ingest no channels, should be more memory efficient. Currently only supports strings. --- client.go | 5 + cmd/picsv/main.go | 114 ++++++++++++++++++++ go.mod | 1 + go.sum | 4 + importbatch.go | 220 ++++++++++++++++++++++++++++++++++++++ importbatch_test.go | 253 ++++++++++++++++++++++++++++++++++++++++++++ orm.go | 4 +- translator.go | 47 ++++++++ 8 files changed, 646 insertions(+), 2 deletions(-) create mode 100644 cmd/picsv/main.go create mode 100644 importbatch.go create mode 100644 importbatch_test.go create mode 100644 translator.go diff --git a/client.go b/client.go index ff3f0ed..db7fa01 100644 --- a/client.go +++ b/client.go @@ -89,6 +89,9 @@ type Client struct { importLogEncoder encoder logLock sync.Mutex + + translator *Translator + // TODO threadsafe key translation cache on client using embedded K/V store. } // DefaultClient creates a client with the default address and options. @@ -138,6 +141,8 @@ func newClientWithOptions(options *ClientOptions) *Client { client: newHTTPClient(options.withDefaults()), logger: log.New(os.Stderr, "go-pilosa ", log.Flags()), coordinatorLock: &sync.RWMutex{}, + + translator: NewTranslator(), } if options.importLogWriter != nil { c.importLogEncoder = newImportLogEncoder(options.importLogWriter) diff --git a/cmd/picsv/main.go b/cmd/picsv/main.go new file mode 100644 index 0000000..04bee5d --- /dev/null +++ b/cmd/picsv/main.go @@ -0,0 +1,114 @@ +package main + +import ( + "encoding/csv" + "fmt" + "io" + "log" + "os" + "time" + + "github.com/jaffee/commandeer" + "github.com/pilosa/go-pilosa" + "github.com/pkg/errors" +) + +type Main struct { + Pilosa []string + File string + Index string + BatchSize int +} + +func NewMain() *Main { + return &Main{ + Pilosa: []string{"localhost:10101"}, + File: "data.csv", + Index: "picsvtest", + BatchSize: 1000, + } +} + +func (m *Main) Run() error { + start := time.Now() + defer func() { + fmt.Println("Duration: ", time.Since(start)) + }() + f, err := os.Open(m.File) + if err != nil { + return errors.Wrap(err, "opening file") + } + defer f.Close() + reader := csv.NewReader(f) + + client, err := pilosa.NewClient(m.Pilosa) + if err != nil { + return errors.Wrap(err, "getting pilosa client") + } + schema, err := client.Schema() + if err != nil { + return errors.Wrap(err, "getting schema") + } + index := schema.Index(m.Index) + + header, err := reader.Read() + if err != nil { + return errors.Wrap(err, "reading CSV header") + } + log.Println("Got Header: ", header) + fields := processHeader(index, header) + // this has a non-obvious dependence on the previous line... 
the fields are set up in the index which comes from the schema + client.SyncSchema(schema) + batch := pilosa.NewBatch(client, m.BatchSize, fields) + record := pilosa.Row{ + Values: make([]interface{}, len(header)), + } + id := uint64(0) + row, err := reader.Read() + for ; err == nil; row, err = reader.Read() { + record.ID = id + for i, _ := range record.Values { + if i < len(row) { + record.Values[i] = row[i] + } else { + record.Values[i] = nil + log.Printf("row is shorter than header: %v", row) + } + } + err := batch.Add(record) + if err == pilosa.ErrBatchNowFull { + err := batch.Import() + if err != nil { + return errors.Wrap(err, "importing") + } + } else if err != nil { + return errors.Wrap(err, "adding to batch") + } + + id++ + } + log.Printf("processed %d ids\n", id) + if err != io.EOF && err != nil { + return errors.Wrap(err, "reading csv") + } + err = batch.Import() + if err != nil { + return errors.Wrap(err, "final import") + } + + return nil +} + +func processHeader(index *pilosa.Index, header []string) []*pilosa.Field { + ret := make([]*pilosa.Field, 0, len(header)) + for _, fieldName := range header { + ret = append(ret, index.Field(fieldName, pilosa.OptFieldKeys(true))) + } + return ret +} + +func main() { + if err := commandeer.Run(NewMain()); err != nil { + log.Fatal(err) + } +} diff --git a/go.mod b/go.mod index 595ece4..272f708 100644 --- a/go.mod +++ b/go.mod @@ -23,6 +23,7 @@ require ( github.com/hashicorp/go-sockaddr v1.0.2 // indirect github.com/hashicorp/go-uuid v1.0.1 // indirect github.com/hashicorp/memberlist v0.1.4 // indirect + github.com/jaffee/commandeer v0.1.1-0.20190726022955-4d43b78ebc4e github.com/kisielk/errcheck v1.2.0 // indirect github.com/konsorten/go-windows-terminal-sequences v1.0.2 // indirect github.com/kr/pty v1.1.5 // indirect diff --git a/go.sum b/go.sum index a52c7ab..eb03b3c 100644 --- a/go.sum +++ b/go.sum @@ -120,6 +120,10 @@ github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2p github.com/hashicorp/memberlist v0.1.4 h1:gkyML/r71w3FL8gUi74Vk76avkj/9lYAY9lvg0OcoGs= github.com/hashicorp/memberlist v0.1.4/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= +github.com/jaffee/commandeer v0.1.0 h1:UxHHnhKmtz8gAgqu67lYK5tlX5D9A86mGc9AWcEMSWU= +github.com/jaffee/commandeer v0.1.0/go.mod h1:x1WpthEI14PRNcPtVna43ontBxJ1o7plCOsZ8kksl8M= +github.com/jaffee/commandeer v0.1.1-0.20190726022955-4d43b78ebc4e h1:CC1usSIzu9p6zmz7jPj0QiP3FdpGW+PCGc9d1yhSls0= +github.com/jaffee/commandeer v0.1.1-0.20190726022955-4d43b78ebc4e/go.mod h1:N5yIzoHN6EwVFi0QCKvpFPJeECoZyEcFBQSR8r+7Mz0= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= diff --git a/importbatch.go b/importbatch.go new file mode 100644 index 0000000..ecd7f65 --- /dev/null +++ b/importbatch.go @@ -0,0 +1,220 @@ +package pilosa + +import ( + "github.com/pilosa/pilosa/roaring" + "github.com/pkg/errors" + "golang.org/x/sync/errgroup" +) + +type Row struct { + ID interface{} + Values []interface{} +} + +func (b *Batch) Import() error { + // first we need to translate the toTranslate, then fill out the missing row IDs + err := b.doTranslation() + if err != nil { + return errors.Wrap(err, "doing Translation") + } + + // create 
bitmaps out of each field in b.rowIDs and import + err = b.doImport() + if err != nil { + return errors.Wrap(err, "doing import") + } + + // clear existing structures without reclaiming the memory + b.reset() + return nil +} + +func (b *Batch) doTranslation() error { + for i, field := range b.header { + tt := b.toTranslate[i] + keys := make([]string, 0, len(tt)) + + // make a slice of keys + for k, _ := range tt { + keys = append(keys, k) + } + + if len(keys) == 0 { + continue + } + + // translate keys from Pilosa + ids, err := b.client.translateRowKeys(field, keys) + if err != nil { + return errors.Wrap(err, "translating row keys") + } + + // fill out missing IDs in local batch records with translated IDs + for j, key := range keys { + id := ids[j] + for _, recordIdx := range tt[key] { + b.rowIDs[i][recordIdx] = id + } + b.client.translator.AddRow(b.index, field.Name(), key, id) + } + } + return nil +} + +func (b *Batch) doImport() error { + eg := errgroup.Group{} + index := b.header[0].index + + frags := b.makeFragments() + uri := b.client.cluster.hosts[0] // TODO get URI per-shard performantly. + for shard, viewMap := range frags { + for viewSpec, bitmap := range viewMap { + viewSpec := viewSpec + bitmap := bitmap + eg.Go(func() error { + err := b.client.importRoaringBitmap(uri, index.Field(viewSpec.field), shard, map[string]*roaring.Bitmap{"": bitmap}, &ImportOptions{}) + return errors.Wrap(err, "doing import") + }) + } + } + return eg.Wait() +} + +func (b *Batch) makeFragments() fragments { + shardWidth := b.header[0].index.shardWidth + if shardWidth == 0 { + shardWidth = DefaultShardWidth + } + frags := make(fragments) + if len(b.ids) == 0 { + return frags // exit early if no records + } + for i, field := range b.header { + curShard := b.ids[0] / shardWidth + curBM := frags.GetOrCreate(curShard, field.Name(), "") + rowIDs := b.rowIDs[i] + for j, _ := range b.ids { + col, row := b.ids[j], rowIDs[j] + if col%shardWidth != curShard { + curShard = col / shardWidth + curBM = frags.GetOrCreate(curShard, field.Name(), "") + } + curBM.DirectAdd(row*shardWidth + (col % shardWidth)) + } + } + return frags +} + +// reset is called at the end of importing to ready the batch for the +// next round. Where possible it does not re-allocate memory. +func (b *Batch) reset() { + b.ids = b.ids[:0] + for i, rowIDs := range b.rowIDs { + b.rowIDs[i] = rowIDs[:0] + m := b.toTranslate[i] + for k := range m { + delete(m, k) + } + } +} + +type Batch struct { + client *Client + header []*Field + index string + + // ids is a slice of length batchSize of record IDs + // TODO support string IDs + ids []uint64 + // rowIDs is a slice of length len(Batch.header) which contains slices of length batchSize + rowIDs [][]uint64 + // TODO, support int fields, set fields without translation, timestamps, set fields with more than one value per record. 
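+	//
+	// The batch is columnar: ids[k] holds record k's column ID, and
+	// rowIDs[i][k] holds the row ID that record k sets in field i
+	// (0 as a placeholder until the key is translated).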
+ + // for each field, keep a map of key to which record indexes that key mapped to + toTranslate []map[string][]int +} + +func NewBatch(client *Client, size int, fields []*Field) *Batch { + if len(fields) == 0 || size == 0 { + panic("can't batch with no fields or batch size") + } + rowIDs := make([][]uint64, len(fields)) + tt := make([]map[string][]int, len(fields)) + for i, _ := range fields { + rowIDs[i] = make([]uint64, 0, size) + tt[i] = make(map[string][]int) + } + return &Batch{ + client: client, + header: fields, + index: fields[0].index.Name(), + ids: make([]uint64, 0, size), + rowIDs: rowIDs, + toTranslate: tt, + } +} + +var ErrBatchNowFull = errors.New("batch is now full - you cannot add any more records (though the one you just added was accepted)") +var ErrBatchAlreadyFull = errors.New("batch was already full, record was rejected") + +func (b *Batch) Add(rec Row) error { + if len(b.ids) == cap(b.ids) { + return ErrBatchAlreadyFull + } + if len(rec.Values) != len(b.header) { + return errors.Errorf("record needs to match up with batch fields, got %d fields and %d record", len(b.header), len(rec.Values)) + } + + if _, ok := rec.ID.(uint64); !ok { + return errors.New("TODO support non integer IDs") + } + b.ids = append(b.ids, rec.ID.(uint64)) + + for i := 0; i < len(rec.Values); i++ { + field := b.header[i] + if val, ok := rec.Values[i].(string); ok { + // translate val and append to b.rowIDs[i] + if rowID, ok := b.client.translator.GetRow(b.index, field.Name(), val); ok { + b.rowIDs[i] = append(b.rowIDs[i], rowID) + } else { + ints, ok := b.toTranslate[i][val] + if !ok { + ints = make([]int, 0) + } + ints = append(ints, len(b.rowIDs[i])) + b.toTranslate[i][val] = ints + b.rowIDs[i] = append(b.rowIDs[i], 0) + } + } else { + return errors.New("TODO support types other than string") + } + } + if len(b.ids) == cap(b.ids) { + return ErrBatchNowFull + } + return nil +} + +type viewSpec struct { + field string + view string +} + +// type fragments map[viewSpec]*roaring.Bitmap + +// map[shard][fieldview]fragmentData +type fragments map[uint64]map[viewSpec]*roaring.Bitmap + +func (f fragments) GetOrCreate(shard uint64, field, view string) *roaring.Bitmap { + viewMap, ok := f[shard] + if !ok { + viewMap = make(map[viewSpec]*roaring.Bitmap) + } + bm, ok := viewMap[viewSpec{field: field, view: view}] + if !ok { + bm = roaring.NewBTreeBitmap() + viewMap[viewSpec{field: field, view: view}] = bm + } + f[shard] = viewMap + return bm +} diff --git a/importbatch_test.go b/importbatch_test.go new file mode 100644 index 0000000..2fde849 --- /dev/null +++ b/importbatch_test.go @@ -0,0 +1,253 @@ +package pilosa + +import ( + "reflect" + "testing" +) + +func TestBatches(t *testing.T) { + client := DefaultClient() + schema := NewSchema() + idx := schema.Index("gopilosatest-blah") + fields := make([]*Field, 3) + fields[0] = idx.Field("zero", OptFieldKeys(true)) + fields[1] = idx.Field("one", OptFieldKeys(true)) + fields[2] = idx.Field("two", OptFieldKeys(true)) + err := client.SyncSchema(schema) + if err != nil { + t.Fatalf("syncing schema: %v", err) + } + defer func() { + err := client.DeleteIndex(idx) + if err != nil { + t.Logf("problem cleaning up from test: %v", err) + } + }() + b := NewBatch(client, 10, fields) + + r := Row{Values: make([]interface{}, 3)} + + for i := 0; i < 9; i++ { + r.ID = uint64(i) + if i%2 == 0 { + r.Values[0] = "a" + r.Values[1] = "b" + r.Values[2] = "c" + } else { + r.Values[0] = "x" + r.Values[1] = "y" + r.Values[2] = "z" + } + err := b.Add(r) + if err != nil { + 
t.Fatalf("unexpected err adding record: %v", err) + } + + } + + if len(b.toTranslate[0]) != 2 { + t.Fatalf("wrong number of keys in toTranslate[0]") + } + for k, ints := range b.toTranslate[0] { + if k == "a" { + if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { + t.Fatalf("wrong ints for key a in field zero: %v", ints) + } + } else if k == "x" { + if !reflect.DeepEqual(ints, []int{1, 3, 5, 7}) { + t.Fatalf("wrong ints for key x in field zero: %v", ints) + } + + } else { + t.Fatalf("unexpected key %s", k) + } + } + + if len(b.toTranslate[1]) != 2 { + t.Fatalf("wrong number of keys in toTranslate[1]") + } + for k, ints := range b.toTranslate[1] { + if k == "b" { + if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { + t.Fatalf("wrong ints for key b in field one: %v", ints) + } + } else if k == "y" { + if !reflect.DeepEqual(ints, []int{1, 3, 5, 7}) { + t.Fatalf("wrong ints for key y in field one: %v", ints) + } + + } else { + t.Fatalf("unexpected key %s", k) + } + } + + if len(b.toTranslate[2]) != 2 { + t.Fatalf("wrong number of keys in toTranslate[2]") + } + for k, ints := range b.toTranslate[2] { + if k == "c" { + if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { + t.Fatalf("wrong ints for key c in field two: %v", ints) + } + } else if k == "z" { + if !reflect.DeepEqual(ints, []int{1, 3, 5, 7}) { + t.Fatalf("wrong ints for key z in field two: %v", ints) + } + + } else { + t.Fatalf("unexpected key %s", k) + } + } + + err = b.Add(r) + if err != ErrBatchNowFull { + t.Fatalf("should have gotten full batch error, but got %v", err) + } + + err = b.Add(r) + if err != ErrBatchAlreadyFull { + t.Fatalf("should have gotten already full batch error, but got %v", err) + } + + err = b.doTranslation() + if err != nil { + t.Fatalf("doing translation: %v", err) + } + + for i, rowIDs := range b.rowIDs { + // we don't know which key will get translated first, but we do know the pattern + if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 1}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 2}) { + t.Fatalf("unexpected row ids for field %d: %v", i, rowIDs) + } + } + + err = b.doImport() + if err != nil { + t.Fatalf("doing import: %v", err) + } + + b.reset() + + for i := 9; i < 19; i++ { + r.ID = uint64(i) + if i%2 == 0 { + r.Values[0] = "a" + r.Values[1] = "b" + r.Values[2] = "c" + } else { + r.Values[0] = "x" + r.Values[1] = "y" + r.Values[2] = "z" + } + err := b.Add(r) + if i != 18 && err != nil { + t.Fatalf("unexpected err adding record: %v", err) + } + if i == 18 && err != ErrBatchNowFull { + t.Fatalf("unexpected err: %v", err) + } + } + + // should do nothing + err = b.doTranslation() + if err != nil { + t.Fatalf("doing translation: %v", err) + } + + err = b.doImport() + if err != nil { + t.Fatalf("doing import: %v", err) + } + + for i, rowIDs := range b.rowIDs { + // we don't know which key will get translated first, but we do know the pattern + if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 2}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 1}) { + t.Fatalf("unexpected row ids for field %d: %v", i, rowIDs) + } + } + + b.reset() + + for i := 19; i < 29; i++ { + r.ID = uint64(i) + if i%2 == 0 { + r.Values[0] = "d" + r.Values[1] = "e" + r.Values[2] = "f" + } else { + r.Values[0] = "u" + r.Values[1] = "v" + r.Values[2] = "w" + } + err := b.Add(r) + if i != 28 && err != nil { + t.Fatalf("unexpected err adding record: %v", err) + } + if i == 28 && err != ErrBatchNowFull { + t.Fatalf("unexpected err: %v", err) + } + } + + err = 
b.doTranslation() + if err != nil { + t.Fatalf("doing translation: %v", err) + } + + err = b.doImport() + if err != nil { + t.Fatalf("doing import: %v", err) + } + + for i, rowIDs := range b.rowIDs { + // we don't know which key will get translated first, but we do know the pattern + if !reflect.DeepEqual(rowIDs, []uint64{3, 4, 3, 4, 3, 4, 3, 4, 3, 4}) && !reflect.DeepEqual(rowIDs, []uint64{4, 3, 4, 3, 4, 3, 4, 3, 4, 3}) { + t.Fatalf("unexpected row ids for field %d: %v", i, rowIDs) + } + } + + frags := b.makeFragments() + + if len(frags) != 1 { + t.Fatalf("unexpected # of shards in fragments: %d", len(frags)) + } + viewMap, ok := frags[0] + if !ok { + t.Fatalf("shard 0 should be in frags") + } + if len(viewMap) != 3 { + t.Fatalf("there should be 3 views") + } + + // TODO query Pilosa to confirm data is in place + resp, err := client.Query(idx.BatchQuery(fields[0].Row("a"), + fields[1].Row("b"), + fields[2].Row("c"))) + if err != nil { + t.Fatalf("querying: %v", err) + } + + results := resp.Results() + for _, res := range results { + cols := res.Row().Columns + if !reflect.DeepEqual(cols, []uint64{0, 2, 4, 6, 8, 10, 12, 14, 16, 18}) { + t.Fatalf("unexpected columns: %v", cols) + } + } + + resp, err = client.Query(idx.BatchQuery(fields[0].Row("d"), + fields[1].Row("e"), + fields[2].Row("f"))) + if err != nil { + t.Fatalf("querying: %v", err) + } + + results = resp.Results() + for _, res := range results { + cols := res.Row().Columns + if !reflect.DeepEqual(cols, []uint64{20, 22, 24, 26, 28}) { + t.Fatalf("unexpected columns: %v", cols) + } + } + + // TODO test non-full batches, test behavior of doing import on empty batch +} diff --git a/orm.go b/orm.go index 2d7e814..465f74e 100644 --- a/orm.go +++ b/orm.go @@ -215,8 +215,8 @@ func (q PQLRowQuery) Error() error { // // Usage: // -// index, err := NewIndex("repository") -// stargazer, err := index.Field("stargazer") +// repo, err := NewIndex("repository") +// stargazer, err := repo.Field("stargazer") // query := repo.BatchQuery( // stargazer.Row(5), // stargazer.Row(15), diff --git a/translator.go b/translator.go new file mode 100644 index 0000000..c6417d5 --- /dev/null +++ b/translator.go @@ -0,0 +1,47 @@ +package pilosa + +type Translator struct { + indexes map[string]map[string]uint64 + fields map[indexfield]map[string]uint64 +} + +func NewTranslator() *Translator { + return &Translator{ + indexes: make(map[string]map[string]uint64), + fields: make(map[indexfield]map[string]uint64), + } +} + +type indexfield struct { + index string + field string +} + +func (t *Translator) GetCol(index, key string) (uint64, bool) { + if idx, ok := t.indexes[index]; ok { + if val, ok := idx[key]; ok { + return val, true + } + } + return 0, false +} + +func (t *Translator) GetRow(index, field, key string) (uint64, bool) { + if fld, ok := t.fields[indexfield{index: index, field: field}]; ok { + if val, ok := fld[key]; ok { + return val, true + } + } + return 0, false +} + +func (t *Translator) AddRow(index, field, key string, value uint64) { + keys, ok := t.fields[indexfield{index: index, field: field}] + if !ok { + keys = make(map[string]uint64) + } + keys[key] = value + t.fields[indexfield{index: index, field: field}] = keys +} + +// TODO AddCol From 62bc87fb0a03d95f942cd27f58750fb4b84c6980 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Wed, 21 Aug 2019 10:55:23 -0500 Subject: [PATCH 02/26] support for string record IDs, parity with old ingest I've confirmed that this and the "old" ingest produce the same results in Pilosa (at least by TopNing each 
field) --- cmd/picsv/main.go | 69 +++++++++---- importbatch.go | 236 ++++++++++++++++++++++++++------------------ importbatch_test.go | 129 +++++++++++++++++++++++- 3 files changed, 316 insertions(+), 118 deletions(-) diff --git a/cmd/picsv/main.go b/cmd/picsv/main.go index 04bee5d..230f83a 100644 --- a/cmd/picsv/main.go +++ b/cmd/picsv/main.go @@ -18,6 +18,7 @@ type Main struct { File string Index string BatchSize int + IDField string } func NewMain() *Main { @@ -26,6 +27,7 @@ func NewMain() *Main { File: "data.csv", Index: "picsvtest", BatchSize: 1000, + IDField: "id", } } @@ -49,29 +51,34 @@ func (m *Main) Run() error { if err != nil { return errors.Wrap(err, "getting schema") } - index := schema.Index(m.Index) + opts := []pilosa.IndexOption{} + if m.IDField != "" { + opts = append(opts, pilosa.OptIndexKeys(true)) + } + index := schema.Index(m.Index, opts...) - header, err := reader.Read() + headerRow, err := reader.Read() if err != nil { return errors.Wrap(err, "reading CSV header") } - log.Println("Got Header: ", header) - fields := processHeader(index, header) + log.Println("Got Header: ", headerRow) + fields, header, getIDFn := processHeader(index, m.IDField, headerRow) + // this has a non-obvious dependence on the previous line... the fields are set up in the index which comes from the schema client.SyncSchema(schema) - batch := pilosa.NewBatch(client, m.BatchSize, fields) + batch := pilosa.NewBatch(client, m.BatchSize, index, fields) record := pilosa.Row{ Values: make([]interface{}, len(header)), } - id := uint64(0) - row, err := reader.Read() - for ; err == nil; row, err = reader.Read() { - record.ID = id - for i, _ := range record.Values { - if i < len(row) { - record.Values[i] = row[i] + + numRecords := uint64(0) + for row, err := reader.Read(); err == nil; row, err = reader.Read() { + record.ID = getIDFn(row, numRecords) + for _, meta := range header { + if meta.srcIndex < len(row) { + record.Values[meta.recordIndex] = row[meta.srcIndex] } else { - record.Values[i] = nil + record.Values[meta.recordIndex] = nil log.Printf("row is shorter than header: %v", row) } } @@ -85,9 +92,9 @@ func (m *Main) Run() error { return errors.Wrap(err, "adding to batch") } - id++ + numRecords++ } - log.Printf("processed %d ids\n", id) + if err != io.EOF && err != nil { return errors.Wrap(err, "reading csv") } @@ -96,15 +103,37 @@ func (m *Main) Run() error { return errors.Wrap(err, "final import") } + log.Printf("processed %d ids\n", numRecords) + return nil } -func processHeader(index *pilosa.Index, header []string) []*pilosa.Field { - ret := make([]*pilosa.Field, 0, len(header)) - for _, fieldName := range header { - ret = append(ret, index.Field(fieldName, pilosa.OptFieldKeys(true))) +type valueMeta struct { + srcIndex int + recordIndex int +} + +type idGetter func(row []string, numRecords uint64) interface{} + +func processHeader(index *pilosa.Index, idField string, headerRow []string) ([]*pilosa.Field, map[string]valueMeta, idGetter) { + fields := make([]*pilosa.Field, 0, len(headerRow)) + header := make(map[string]valueMeta) + getIDFn := func(row []string, numRecords uint64) interface{} { + return numRecords + } + for i, fieldName := range headerRow { + if fieldName == idField { + idIndex := i + getIDFn = func(row []string, numRecords uint64) interface{} { + return row[idIndex] + } + continue + } + header[fieldName] = valueMeta{srcIndex: i, recordIndex: len(fields)} + fields = append(fields, index.Field(fieldName, pilosa.OptFieldKeys(true), pilosa.OptFieldTypeSet(pilosa.CacheTypeRanked, 
100000))) } - return ret + + return fields, header, getIDFn } func main() { diff --git a/importbatch.go b/importbatch.go index ecd7f65..a922122 100644 --- a/importbatch.go +++ b/importbatch.go @@ -6,11 +6,112 @@ import ( "golang.org/x/sync/errgroup" ) +type Batch struct { + client *Client + index *Index + header []*Field + + // ids is a slice of length batchSize of record IDs + ids []uint64 + + // rowIDs is a slice of length len(Batch.header) which contains slices of length batchSize + rowIDs [][]uint64 + // TODO, support int fields, set fields without translation, timestamps, set fields with more than one value per record. + + // for each field, keep a map of key to which record indexes that key mapped to + toTranslate []map[string][]int + + // for string ids which we weren't able to immediately translate, + // keep a map of which record(s) each string id maps to. + // + // TODO: + // this is probably super inefficient in the (common) case where + // each record has a different string ID. In that case, a simple + // slice of strings would probably work better. + toTranslateID map[string][]int +} + +func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { + if len(fields) == 0 || size == 0 { + panic("can't batch with no fields or batch size") + } + rowIDs := make([][]uint64, len(fields)) + tt := make([]map[string][]int, len(fields)) + for i, _ := range fields { + rowIDs[i] = make([]uint64, 0, size) + tt[i] = make(map[string][]int) + } + return &Batch{ + client: client, + header: fields, + index: index, + ids: make([]uint64, 0, size), + rowIDs: rowIDs, + toTranslate: tt, + toTranslateID: make(map[string][]int), + } +} + type Row struct { ID interface{} Values []interface{} } +func (b *Batch) Add(rec Row) error { + if len(b.ids) == cap(b.ids) { + return ErrBatchAlreadyFull + } + if len(rec.Values) != len(b.header) { + return errors.Errorf("record needs to match up with batch fields, got %d fields and %d record", len(b.header), len(rec.Values)) + } + + switch rid := rec.ID.(type) { + case uint64: + b.ids = append(b.ids, rid) + case string: + if colID, ok := b.client.translator.GetCol(b.index.Name(), rid); ok { + b.ids = append(b.ids, colID) + } else { + ints, ok := b.toTranslateID[rid] + if !ok { + ints = make([]int, 0) + } + ints = append(ints, len(b.ids)) + b.toTranslateID[rid] = ints + b.ids = append(b.ids, 0) + } + default: + return errors.Errorf("unsupported id type %T value %v", rid, rid) + } + + for i := 0; i < len(rec.Values); i++ { + field := b.header[i] + if val, ok := rec.Values[i].(string); ok { + // translate val and append to b.rowIDs[i] + if rowID, ok := b.client.translator.GetRow(b.index.Name(), field.Name(), val); ok { + b.rowIDs[i] = append(b.rowIDs[i], rowID) + } else { + ints, ok := b.toTranslate[i][val] + if !ok { + ints = make([]int, 0) + } + ints = append(ints, len(b.rowIDs[i])) + b.toTranslate[i][val] = ints + b.rowIDs[i] = append(b.rowIDs[i], 0) + } + } else { + return errors.New("TODO support types other than string") + } + } + if len(b.ids) == cap(b.ids) { + return ErrBatchNowFull + } + return nil +} + +var ErrBatchNowFull = errors.New("batch is now full - you cannot add any more records (though the one you just added was accepted)") +var ErrBatchAlreadyFull = errors.New("batch was already full, record was rejected") + func (b *Batch) Import() error { // first we need to translate the toTranslate, then fill out the missing row IDs err := b.doTranslation() @@ -30,9 +131,32 @@ func (b *Batch) Import() error { } func (b *Batch) doTranslation() 
error { + var keys []string + + // translate column keys if there are any + if len(b.toTranslateID) > 0 { + keys = make([]string, 0, len(b.toTranslateID)) + for k, _ := range b.toTranslateID { + keys = append(keys, k) + } + ids, err := b.client.translateColumnKeys(b.index, keys) + if err != nil { + return errors.Wrap(err, "translating col keys") + } + for j, key := range keys { + id := ids[j] + for _, recordIdx := range b.toTranslateID[key] { + b.ids[recordIdx] = id + } + } + } else { + keys = make([]string, 0, len(b.toTranslate[0])) + } + + // translate row keys for i, field := range b.header { tt := b.toTranslate[i] - keys := make([]string, 0, len(tt)) + keys = keys[:0] // make a slice of keys for k, _ := range tt { @@ -55,7 +179,7 @@ func (b *Batch) doTranslation() error { for _, recordIdx := range tt[key] { b.rowIDs[i][recordIdx] = id } - b.client.translator.AddRow(b.index, field.Name(), key, id) + b.client.translator.AddRow(b.index.Name(), field.Name(), key, id) } } return nil @@ -63,16 +187,15 @@ func (b *Batch) doTranslation() error { func (b *Batch) doImport() error { eg := errgroup.Group{} - index := b.header[0].index frags := b.makeFragments() uri := b.client.cluster.hosts[0] // TODO get URI per-shard performantly. for shard, viewMap := range frags { - for viewSpec, bitmap := range viewMap { - viewSpec := viewSpec + for fieldView, bitmap := range viewMap { + fieldView := fieldView bitmap := bitmap eg.Go(func() error { - err := b.client.importRoaringBitmap(uri, index.Field(viewSpec.field), shard, map[string]*roaring.Bitmap{"": bitmap}, &ImportOptions{}) + err := b.client.importRoaringBitmap(uri, b.index.Field(fieldView.field), shard, map[string]*roaring.Bitmap{"": bitmap}, &ImportOptions{}) return errors.Wrap(err, "doing import") }) } @@ -81,21 +204,18 @@ func (b *Batch) doImport() error { } func (b *Batch) makeFragments() fragments { - shardWidth := b.header[0].index.shardWidth + shardWidth := b.index.shardWidth if shardWidth == 0 { shardWidth = DefaultShardWidth } frags := make(fragments) - if len(b.ids) == 0 { - return frags // exit early if no records - } for i, field := range b.header { - curShard := b.ids[0] / shardWidth - curBM := frags.GetOrCreate(curShard, field.Name(), "") + curShard := ^uint64(0) // impossible sentinel value. + var curBM *roaring.Bitmap rowIDs := b.rowIDs[i] for j, _ := range b.ids { col, row := b.ids[j], rowIDs[j] - if col%shardWidth != curShard { + if col/shardWidth != curShard { curShard = col / shardWidth curBM = frags.GetOrCreate(curShard, field.Name(), "") } @@ -116,104 +236,28 @@ func (b *Batch) reset() { delete(m, k) } } -} - -type Batch struct { - client *Client - header []*Field - index string - - // ids is a slice of length batchSize of record IDs - // TODO support string IDs - ids []uint64 - // rowIDs is a slice of length len(Batch.header) which contains slices of length batchSize - rowIDs [][]uint64 - // TODO, support int fields, set fields without translation, timestamps, set fields with more than one value per record. 
- - // for each field, keep a map of key to which record indexes that key mapped to - toTranslate []map[string][]int -} - -func NewBatch(client *Client, size int, fields []*Field) *Batch { - if len(fields) == 0 || size == 0 { - panic("can't batch with no fields or batch size") - } - rowIDs := make([][]uint64, len(fields)) - tt := make([]map[string][]int, len(fields)) - for i, _ := range fields { - rowIDs[i] = make([]uint64, 0, size) - tt[i] = make(map[string][]int) - } - return &Batch{ - client: client, - header: fields, - index: fields[0].index.Name(), - ids: make([]uint64, 0, size), - rowIDs: rowIDs, - toTranslate: tt, - } -} - -var ErrBatchNowFull = errors.New("batch is now full - you cannot add any more records (though the one you just added was accepted)") -var ErrBatchAlreadyFull = errors.New("batch was already full, record was rejected") - -func (b *Batch) Add(rec Row) error { - if len(b.ids) == cap(b.ids) { - return ErrBatchAlreadyFull - } - if len(rec.Values) != len(b.header) { - return errors.Errorf("record needs to match up with batch fields, got %d fields and %d record", len(b.header), len(rec.Values)) - } - - if _, ok := rec.ID.(uint64); !ok { - return errors.New("TODO support non integer IDs") - } - b.ids = append(b.ids, rec.ID.(uint64)) - - for i := 0; i < len(rec.Values); i++ { - field := b.header[i] - if val, ok := rec.Values[i].(string); ok { - // translate val and append to b.rowIDs[i] - if rowID, ok := b.client.translator.GetRow(b.index, field.Name(), val); ok { - b.rowIDs[i] = append(b.rowIDs[i], rowID) - } else { - ints, ok := b.toTranslate[i][val] - if !ok { - ints = make([]int, 0) - } - ints = append(ints, len(b.rowIDs[i])) - b.toTranslate[i][val] = ints - b.rowIDs[i] = append(b.rowIDs[i], 0) - } - } else { - return errors.New("TODO support types other than string") - } - } - if len(b.ids) == cap(b.ids) { - return ErrBatchNowFull + for k := range b.toTranslateID { + delete(b.toTranslateID, k) } - return nil } -type viewSpec struct { +type fieldView struct { // TODO rename to fieldview field string view string } -// type fragments map[viewSpec]*roaring.Bitmap - // map[shard][fieldview]fragmentData -type fragments map[uint64]map[viewSpec]*roaring.Bitmap +type fragments map[uint64]map[fieldView]*roaring.Bitmap func (f fragments) GetOrCreate(shard uint64, field, view string) *roaring.Bitmap { viewMap, ok := f[shard] if !ok { - viewMap = make(map[viewSpec]*roaring.Bitmap) + viewMap = make(map[fieldView]*roaring.Bitmap) } - bm, ok := viewMap[viewSpec{field: field, view: view}] + bm, ok := viewMap[fieldView{field: field, view: view}] if !ok { bm = roaring.NewBTreeBitmap() - viewMap[viewSpec{field: field, view: view}] = bm + viewMap[fieldView{field: field, view: view}] = bm } f[shard] = viewMap return bm diff --git a/importbatch_test.go b/importbatch_test.go index 2fde849..0a1320b 100644 --- a/importbatch_test.go +++ b/importbatch_test.go @@ -2,6 +2,7 @@ package pilosa import ( "reflect" + "strconv" "testing" ) @@ -23,8 +24,8 @@ func TestBatches(t *testing.T) { t.Logf("problem cleaning up from test: %v", err) } }() - b := NewBatch(client, 10, fields) - + b := NewBatch(client, 10, idx, fields) + n r := Row{Values: make([]interface{}, 3)} for i := 0; i < 9; i++ { @@ -251,3 +252,127 @@ func TestBatches(t *testing.T) { // TODO test non-full batches, test behavior of doing import on empty batch } + +func TestBatchesStringIDs(t *testing.T) { + client := DefaultClient() + schema := NewSchema() + idx := schema.Index("gopilosatest-blah", OptIndexKeys(true)) + fields := 
make([]*Field, 1) + fields[0] = idx.Field("zero", OptFieldKeys(true)) + err := client.SyncSchema(schema) + if err != nil { + t.Fatalf("syncing schema: %v", err) + } + defer func() { + err := client.DeleteIndex(idx) + if err != nil { + t.Logf("problem cleaning up from test: %v", err) + } + }() + + b := NewBatch(client, 3, idx, fields) + + r := Row{Values: make([]interface{}, 1)} + + for i := 0; i < 3; i++ { + r.ID = strconv.Itoa(i) + if i%2 == 0 { + r.Values[0] = "a" + } else { + r.Values[0] = "x" + } + err := b.Add(r) + if err != nil && err != ErrBatchNowFull { + t.Fatalf("unexpected err adding record: %v", err) + } + } + + if len(b.toTranslateID) != 3 { + t.Fatalf("id translation table unexpected size: %v", b.toTranslateID) + } + for k, indexes := range b.toTranslateID { + if k == "0" { + if !reflect.DeepEqual(indexes, []int{0}) { + t.Fatalf("unexpected result k: %s, indexes: %v", k, indexes) + } + } + if k == "1" { + if !reflect.DeepEqual(indexes, []int{1}) { + t.Fatalf("unexpected result k: %s, indexes: %v", k, indexes) + } + } + if k == "2" { + if !reflect.DeepEqual(indexes, []int{2}) { + t.Fatalf("unexpected result k: %s, indexes: %v", k, indexes) + } + } + } + + err = b.doTranslation() + if err != nil { + t.Fatalf("translating: %v", err) + } + + if !reflect.DeepEqual(b.ids, []uint64{1, 2, 3}) { + t.Fatalf("unexpected ids: %v", b.ids) + } + + err = b.Import() + if err != nil { + t.Fatalf("importing: %v", err) + } + + resp, err := client.Query(idx.BatchQuery(fields[0].Row("a"), fields[0].Row("x"))) + if err != nil { + t.Fatalf("querying: %v", err) + } + + results := resp.Results() + for i, res := range results { + cols := res.Row().Keys + if i == 0 && !reflect.DeepEqual(cols, []string{"0", "2"}) { + t.Fatalf("unexpected columns: %v", cols) + } + if i == 1 && !reflect.DeepEqual(cols, []string{"1"}) { + t.Fatalf("unexpected columns: %v", cols) + } + } + + b.reset() + + r.ID = "1" + r.Values[0] = "a" + err = b.Add(r) + if err != nil { + t.Fatalf("unexpected err adding record: %v", err) + } + + r.ID = "3" + r.Values[0] = "z" + err = b.Add(r) + if err != nil { + t.Fatalf("unexpected err adding record: %v", err) + } + + err = b.Import() + if err != nil { + t.Fatalf("importing: %v", err) + } + + resp, err = client.Query(idx.BatchQuery(fields[0].Row("a"), fields[0].Row("z"))) + if err != nil { + t.Fatalf("querying: %v", err) + } + + results = resp.Results() + for i, res := range results { + cols := res.Row().Keys + if i == 0 && !reflect.DeepEqual(cols, []string{"0", "1", "2"}) { + t.Fatalf("unexpected columns: %v", cols) + } + if i == 1 && !reflect.DeepEqual(cols, []string{"3"}) { + t.Fatalf("unexpected columns: %v", cols) + } + } + +} From edf6cb889a97ba013c7500dcf50261577cae9c95 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Mon, 26 Aug 2019 10:55:41 -0500 Subject: [PATCH 03/26] implement int field support add local cache for node URIs per index/shard add exported (questionable) method to client for doing simple integer imports. add translated column keys to the local cache. 
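
The URI cache is read-through: GetURIsForShard returns the cached URIs
for an index/shard pair and only fetches fragment nodes from the
cluster on a miss. A sketch of the new call (index name and shard here
are illustrative):

    uris, err := client.GetURIsForShard("myindex", 0)
    if err != nil {
        return errors.Wrap(err, "getting uris for shard")
    }
    // the batch import sends each shard's bitmap to uris[0] for now
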
--- client.go | 55 +++++++++++++++++- client_it_test.go | 26 +++++++++ go.mod | 2 - go.sum | 9 +++ importbatch.go | 139 +++++++++++++++++++++++++++++++++----------- importbatch_test.go | 127 ++++++++++++++++++++++++++++++++-------- shardnodes.go | 39 +++++++++++++ translator.go | 11 +++- 8 files changed, 345 insertions(+), 63 deletions(-) create mode 100644 shardnodes.go diff --git a/client.go b/client.go index db7fa01..94cb934 100644 --- a/client.go +++ b/client.go @@ -90,8 +90,28 @@ type Client struct { importLogEncoder encoder logLock sync.Mutex + // TODO make this threadsafe using key translation cache on client using embedded K/V store. translator *Translator - // TODO threadsafe key translation cache on client using embedded K/V store. + + // TODO shardNodes needs to be invalidated/updated when cluster topology changes. + shardNodes shardNodes +} + +func (c *Client) GetURIsForShard(index string, shard uint64) ([]*URI, error) { + uris, ok := c.shardNodes.Get(index, shard) + if ok { + return uris, nil + } + fragmentNodes, err := c.fetchFragmentNodes(index, shard) + if err != nil { + return nil, errors.Wrap(err, "trying to look up nodes for shard") + } + uris = make([]*URI, 0, len(fragmentNodes)) + for _, fn := range fragmentNodes { + uris = append(uris, fn.URI()) + } + c.shardNodes.Put(index, shard, uris) + return uris, nil } // DefaultClient creates a client with the default address and options. @@ -143,6 +163,7 @@ func newClientWithOptions(options *ClientOptions) *Client { coordinatorLock: &sync.RWMutex{}, translator: NewTranslator(), + shardNodes: newShardNodes(), } if options.importLogWriter != nil { c.importLogEncoder = newImportLogEncoder(options.importLogWriter) @@ -646,6 +667,38 @@ func (c *Client) importValues(field *Field, return errors.Wrap(err, "importing values to nodes") } +// ImportValues takes the given integer values and column ids +// (which must all be in the given shard) and imports them into the +// given index,field,shard on all nodes which should hold that shard. 
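+//
+// For example, writing the values 1, 2, 3 to columns 1, 2, 3 of an int
+// field in shard 0 looks like:
+//
+//	err := client.ImportValues("go-testindex", "intfield", 0, []int64{1, 2, 3}, []uint64{1, 2, 3})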
+func (c *Client) ImportValues(index, field string, shard uint64, vals []int64, ids []uint64) error { + msg := &pbuf.ImportValueRequest{ + Index: index, + Field: field, + Shard: shard, + ColumnIDs: ids, + Values: vals, + } + data, err := proto.Marshal(msg) + if err != nil { + return errors.Wrap(err, "marshaling to protobuf") + } + path := fmt.Sprintf("/index/%s/field/%s/import", index, field) + c.logImport(index, path, shard, false, data) + + uris, err := c.GetURIsForShard(index, shard) + if err != nil { + return errors.Wrap(err, "getting uris") + } + + eg := errgroup.Group{} + for _, uri := range uris { + eg.Go(func() error { + return c.importData(uri, path, data) + }) + } + return errors.Wrap(eg.Wait(), "importing values to nodes") +} + func importPathData(field *Field, shard uint64, msg proto.Message, options *ImportOptions) (path string, data []byte, err error) { data, err = proto.Marshal(msg) if err != nil { diff --git a/client_it_test.go b/client_it_test.go index 97344f5..f208ac7 100644 --- a/client_it_test.go +++ b/client_it_test.go @@ -851,6 +851,32 @@ func TestImportWithBatchSize(t *testing.T) { } } +func TestImportValues(t *testing.T) { + client := getClient() + schema, err := client.Schema() + if err != nil { + t.Fatalf("getting schema: %v", err) + } + index := schema.Index("go-testindex") + intfield := index.Field("intfield", OptFieldTypeInt()) + err = client.SyncSchema(schema) + if err != nil { + t.Fatalf("syncing schema: %v", err) + } + + err = client.ImportValues("go-testindex", "intfield", 0, []int64{1, 2, 3}, []uint64{1, 2, 3}) + if err != nil { + t.Fatalf("importing values: %v", err) + } + + resp, err := client.Query(intfield.GT(0)) + result := resp.Result() + if !reflect.DeepEqual(result.Row().Columns, []uint64{1, 2, 3}) { + t.Fatalf("unexpected result: %v", result.Row().Columns) + } + +} + // Ensure that the client does not send batches of zero records to Pilosa. 
// In our case it should send: // batch 1: shard[0,1] diff --git a/go.mod b/go.mod index 272f708..7110b0e 100644 --- a/go.mod +++ b/go.mod @@ -46,8 +46,6 @@ require ( github.com/spf13/jwalterweatherman v1.1.0 // indirect github.com/spf13/viper v1.4.0 // indirect github.com/stretchr/objx v0.2.0 // indirect - github.com/uber/jaeger-client-go v2.16.0+incompatible // indirect - github.com/uber/jaeger-lib v2.0.0+incompatible // indirect github.com/ugorji/go v1.1.5-pre // indirect go.etcd.io/bbolt v1.3.3 // indirect go.opencensus.io v0.22.0 // indirect diff --git a/go.sum b/go.sum index eb03b3c..ecc454b 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,7 @@ github.com/DataDog/datadog-go v2.2.0+incompatible h1:V5BKkxACZLjzHjSgBbr2gvLA2Ae github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/OneOfOne/xxhash v1.2.5/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= +github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= @@ -54,6 +55,7 @@ github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeME github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-ole/go-ole v1.2.4/go.mod h1:XCwSNxSkXRo4vlyPy93sltvi/qJq0jqQhjqQNIwKuxM= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= @@ -170,6 +172,8 @@ github.com/pilosa/pilosa v0.0.0-20190104143002-8c4b1548bc4b h1:2H/+JUxL4dv0uJ4G4 github.com/pilosa/pilosa v0.0.0-20190104143002-8c4b1548bc4b/go.mod h1:NgpkJkefqUKUHV7O3TqBOu89tsao3ksth2wzTNe8CPQ= github.com/pilosa/pilosa v1.2.1-0.20190410162749-b973f8c96356 h1:jDxhpV4l+CpKqVVgld73e9/EyogdCcO1ftbCvifrhSc= github.com/pilosa/pilosa v1.2.1-0.20190410162749-b973f8c96356/go.mod h1:QN7EwQwoQHNPVsd7CHXFDasPznLDA6DPswmnLr4eJ6o= +github.com/pilosa/pilosa v1.2.1-0.20190807173852-bc9747cc0f19 h1:93vMMs0jAhynsJpbC3AMynz1M9g5G5vnVVPjM1cpU94= +github.com/pilosa/pilosa v1.2.1-0.20190807173852-bc9747cc0f19/go.mod h1:57zHA92sPbJ01QsMyyEDASX2TJnf8qSM7ZdUnVzM0b8= github.com/pilosa/pilosa v1.3.1 h1:rLDVqJBuRzhPtue730D+EX0YEVS4R0oDzsE4bJBwLcE= github.com/pilosa/pilosa v1.3.1/go.mod h1:97yLL9mpUqOj9naKu5XA/b/U6JLe3JGGUlc2HOTDw+A= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -209,6 +213,7 @@ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUt github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/shirou/gopsutil v2.18.12+incompatible h1:1eaJvGomDnH74/5cF4CTmTbLHAriGFsTZppLXDX93OM= github.com/shirou/gopsutil v2.18.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= 
+github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= @@ -238,6 +243,7 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= +github.com/uber-go/atomic v1.4.0/go.mod h1:/Ct5t2lcmbJ4OSe/waGBoaVvVqtO0bmtfVNex1PFV8g= github.com/uber/jaeger-client-go v2.15.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/uber/jaeger-client-go v2.16.0+incompatible h1:Q2Pp6v3QYiocMxomCaJuwQGFt7E53bPYqEgug/AoBtY= github.com/uber/jaeger-client-go v2.16.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= @@ -261,6 +267,7 @@ golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnf golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190617133340-57b3e21c3d56 h1:ZpKuNIejY8P0ExLOVyKhb0WsgG8UdvHXe6TWjY7eL6k= @@ -289,6 +296,7 @@ golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190424112056-4829fb13d2c6/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= @@ -317,6 +325,7 @@ golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190429190828-d89cdac9e872/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/importbatch.go b/importbatch.go index a922122..7103f64 100644 --- a/importbatch.go +++ b/importbatch.go @@ -7,19 +7,25 @@ import ( ) type Batch struct { - client *Client - index *Index - header []*Field + client *Client + index *Index + header []*Field + headerMap map[string]*Field // ids is a slice of length batchSize of record IDs ids []uint64 // rowIDs is a slice of length len(Batch.header) which contains slices of length batchSize - rowIDs [][]uint64 - // TODO, support int fields, set fields without translation, timestamps, set fields with more than one value per record. + rowIDs map[string][]uint64 + + // values holds the values for each record of an int field + values map[string][]int64 + + // TODO, support set fields without translation, timestamps, set fields with more than one value per record, mutex, and bool. + // also null values // for each field, keep a map of key to which record indexes that key mapped to - toTranslate []map[string][]int + toTranslate map[string]map[string][]int // for string ids which we weren't able to immediately translate, // keep a map of which record(s) each string id maps to. @@ -35,18 +41,31 @@ func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { if len(fields) == 0 || size == 0 { panic("can't batch with no fields or batch size") } - rowIDs := make([][]uint64, len(fields)) - tt := make([]map[string][]int, len(fields)) - for i, _ := range fields { - rowIDs[i] = make([]uint64, 0, size) - tt[i] = make(map[string][]int) + headerMap := make(map[string]*Field, len(fields)) + rowIDs := make(map[string][]uint64) + values := make(map[string][]int64) + tt := make(map[string]map[string][]int) + for _, field := range fields { + headerMap[field.Name()] = field + opts := field.Opts() + switch opts.Type() { + case FieldTypeDefault, FieldTypeSet: + if opts.Keys() { + tt[field.Name()] = make(map[string][]int) + } + rowIDs[field.Name()] = make([]uint64, 0, size) + case FieldTypeInt: + values[field.Name()] = make([]int64, 0, size) + } } return &Batch{ client: client, header: fields, + headerMap: headerMap, index: index, ids: make([]uint64, 0, size), rowIDs: rowIDs, + values: values, toTranslate: tt, toTranslateID: make(map[string][]int), } @@ -57,6 +76,10 @@ type Row struct { Values []interface{} } +// Add adds a record to the batch. Performance will be best if record +// IDs are shard-sorted. That is, all records which belong to the same +// Pilosa shard are added adjacent to each other. If the records are +// also in-order within a shard this will likely help as well. 
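+//
+// Callers typically Add until the batch reports full, then Import and
+// continue with the same Batch (cmd/picsv follows this pattern):
+//
+//	err := batch.Add(record)
+//	if err == ErrBatchNowFull {
+//		err = batch.Import() // translates keys, imports, and resets
+//	} else if err != nil {
+//		return err
+//	}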
func (b *Batch) Add(rec Row) error { if len(b.ids) == cap(b.ids) { return ErrBatchAlreadyFull @@ -86,21 +109,27 @@ func (b *Batch) Add(rec Row) error { for i := 0; i < len(rec.Values); i++ { field := b.header[i] - if val, ok := rec.Values[i].(string); ok { + switch val := rec.Values[i].(type) { + case string: + rowIDs := b.rowIDs[field.Name()] // translate val and append to b.rowIDs[i] if rowID, ok := b.client.translator.GetRow(b.index.Name(), field.Name(), val); ok { - b.rowIDs[i] = append(b.rowIDs[i], rowID) + b.rowIDs[field.Name()] = append(rowIDs, rowID) } else { - ints, ok := b.toTranslate[i][val] + ints, ok := b.toTranslate[field.Name()][val] if !ok { ints = make([]int, 0) } - ints = append(ints, len(b.rowIDs[i])) - b.toTranslate[i][val] = ints - b.rowIDs[i] = append(b.rowIDs[i], 0) + ints = append(ints, len(rowIDs)) + b.toTranslate[field.Name()][val] = ints + b.rowIDs[field.Name()] = append(rowIDs, 0) } - } else { - return errors.New("TODO support types other than string") + case uint64: + b.rowIDs[field.Name()] = append(b.rowIDs[field.Name()], val) + case int64: + b.values[field.Name()] = append(b.values[field.Name()], val) + default: + return errors.Errorf("Val %v Type %[1]T is not currently supported. Use string, uint64 (row id), or int64 (integer value)", val) } } if len(b.ids) == cap(b.ids) { @@ -148,14 +177,14 @@ func (b *Batch) doTranslation() error { for _, recordIdx := range b.toTranslateID[key] { b.ids[recordIdx] = id } + b.client.translator.AddCol(b.index.Name(), key, id) } } else { - keys = make([]string, 0, len(b.toTranslate[0])) + keys = make([]string, 0) } // translate row keys - for i, field := range b.header { - tt := b.toTranslate[i] + for fieldName, tt := range b.toTranslate { keys = keys[:0] // make a slice of keys @@ -168,18 +197,19 @@ func (b *Batch) doTranslation() error { } // translate keys from Pilosa - ids, err := b.client.translateRowKeys(field, keys) + ids, err := b.client.translateRowKeys(b.headerMap[fieldName], keys) if err != nil { return errors.Wrap(err, "translating row keys") } // fill out missing IDs in local batch records with translated IDs + rows := b.rowIDs[fieldName] for j, key := range keys { id := ids[j] for _, recordIdx := range tt[key] { - b.rowIDs[i][recordIdx] = id + rows[recordIdx] = id } - b.client.translator.AddRow(b.index.Name(), field.Name(), key, id) + b.client.translator.AddRow(b.index.Name(), fieldName, key, id) } } return nil @@ -189,17 +219,24 @@ func (b *Batch) doImport() error { eg := errgroup.Group{} frags := b.makeFragments() - uri := b.client.cluster.hosts[0] // TODO get URI per-shard performantly. for shard, viewMap := range frags { + uris, err := b.client.GetURIsForShard(b.index.Name(), shard) + uri := uris[0] + if err != nil { + return errors.Wrap(err, "getting uris for shard") + } for fieldView, bitmap := range viewMap { fieldView := fieldView bitmap := bitmap eg.Go(func() error { err := b.client.importRoaringBitmap(uri, b.index.Field(fieldView.field), shard, map[string]*roaring.Bitmap{"": bitmap}, &ImportOptions{}) - return errors.Wrap(err, "doing import") + return errors.Wrapf(err, "importing data for %s", fieldView.field) }) } } + eg.Go(func() error { + return b.importValueData() + }) return eg.Wait() } @@ -209,15 +246,14 @@ func (b *Batch) makeFragments() fragments { shardWidth = DefaultShardWidth } frags := make(fragments) - for i, field := range b.header { + for fname, rowIDs := range b.rowIDs { curShard := ^uint64(0) // impossible sentinel value. 
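 		// No real column maps to this shard value, so the first record
 		// always takes the shard-change branch below and sets curBM.
 		// Each bit lands at row*shardWidth + col%shardWidth within its
 		// shard's fragment (e.g. row 3, col 17 in shard 0 is bit
 		// 3*shardWidth+17).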
var curBM *roaring.Bitmap - rowIDs := b.rowIDs[i] for j, _ := range b.ids { col, row := b.ids[j], rowIDs[j] if col/shardWidth != curShard { curShard = col / shardWidth - curBM = frags.GetOrCreate(curShard, field.Name(), "") + curBM = frags.GetOrCreate(curShard, fname, "") } curBM.DirectAdd(row*shardWidth + (col % shardWidth)) } @@ -225,13 +261,47 @@ func (b *Batch) makeFragments() fragments { return frags } +func (b *Batch) importValueData() error { + shardWidth := b.index.shardWidth + if shardWidth == 0 { + shardWidth = DefaultShardWidth + } + + eg := errgroup.Group{} + curShard := b.ids[0] / shardWidth + startIdx := 0 + for i := 1; i <= len(b.ids); i++ { + // when i==len(b.ids) we ensure that the import logic gets run + // by making a fake shard once we're past the last ID + recordID := ^uint64(0) + if i < len(b.ids) { + recordID = b.ids[i] + } + if recordID/shardWidth != curShard { + endIdx := i + ids := b.ids[startIdx:endIdx] + for field, values := range b.values { + vslice := values[startIdx:endIdx] + eg.Go(func() error { + err := b.client.ImportValues(b.index.Name(), field, curShard, vslice, ids) + return errors.Wrapf(err, "importing values for %s", field) + }) + } + startIdx = i + curShard = recordID / shardWidth + } + } + + return errors.Wrap(eg.Wait(), "importing value data") +} + // reset is called at the end of importing to ready the batch for the // next round. Where possible it does not re-allocate memory. func (b *Batch) reset() { b.ids = b.ids[:0] - for i, rowIDs := range b.rowIDs { - b.rowIDs[i] = rowIDs[:0] - m := b.toTranslate[i] + for fieldName, rowIDs := range b.rowIDs { + b.rowIDs[fieldName] = rowIDs[:0] + m := b.toTranslate[fieldName] for k := range m { delete(m, k) } @@ -239,6 +309,9 @@ func (b *Batch) reset() { for k := range b.toTranslateID { delete(b.toTranslateID, k) } + for k, _ := range b.values { + delete(b.values, k) + } } type fieldView struct { // TODO rename to fieldview diff --git a/importbatch_test.go b/importbatch_test.go index 0a1320b..cfeb43f 100644 --- a/importbatch_test.go +++ b/importbatch_test.go @@ -4,16 +4,19 @@ import ( "reflect" "strconv" "testing" + + "github.com/pkg/errors" ) func TestBatches(t *testing.T) { client := DefaultClient() schema := NewSchema() idx := schema.Index("gopilosatest-blah") - fields := make([]*Field, 3) + fields := make([]*Field, 4) fields[0] = idx.Field("zero", OptFieldKeys(true)) fields[1] = idx.Field("one", OptFieldKeys(true)) fields[2] = idx.Field("two", OptFieldKeys(true)) + fields[3] = idx.Field("three", OptFieldTypeInt()) err := client.SyncSchema(schema) if err != nil { t.Fatalf("syncing schema: %v", err) @@ -25,8 +28,7 @@ func TestBatches(t *testing.T) { } }() b := NewBatch(client, 10, idx, fields) - n - r := Row{Values: make([]interface{}, 3)} + r := Row{Values: make([]interface{}, 4)} for i := 0; i < 9; i++ { r.ID = uint64(i) @@ -34,10 +36,12 @@ func TestBatches(t *testing.T) { r.Values[0] = "a" r.Values[1] = "b" r.Values[2] = "c" + r.Values[3] = int64(99) } else { r.Values[0] = "x" r.Values[1] = "y" r.Values[2] = "z" + r.Values[3] = int64(-10) } err := b.Add(r) if err != nil { @@ -46,10 +50,10 @@ func TestBatches(t *testing.T) { } - if len(b.toTranslate[0]) != 2 { + if len(b.toTranslate["zero"]) != 2 { t.Fatalf("wrong number of keys in toTranslate[0]") } - for k, ints := range b.toTranslate[0] { + for k, ints := range b.toTranslate["zero"] { if k == "a" { if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { t.Fatalf("wrong ints for key a in field zero: %v", ints) @@ -64,10 +68,14 @@ func TestBatches(t 
*testing.T) { } } - if len(b.toTranslate[1]) != 2 { - t.Fatalf("wrong number of keys in toTranslate[1]") + if !reflect.DeepEqual(b.values["three"], []int64{99, -10, 99, -10, 99, -10, 99, -10, 99}) { + t.Fatalf("unexpected values: %v", b.values["three"]) + } + + if len(b.toTranslate["one"]) != 2 { + t.Fatalf("wrong number of keys in toTranslate[\"one\"]") } - for k, ints := range b.toTranslate[1] { + for k, ints := range b.toTranslate["one"] { if k == "b" { if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { t.Fatalf("wrong ints for key b in field one: %v", ints) @@ -82,10 +90,10 @@ func TestBatches(t *testing.T) { } } - if len(b.toTranslate[2]) != 2 { + if len(b.toTranslate["two"]) != 2 { t.Fatalf("wrong number of keys in toTranslate[2]") } - for k, ints := range b.toTranslate[2] { + for k, ints := range b.toTranslate["two"] { if k == "c" { if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { t.Fatalf("wrong ints for key c in field two: %v", ints) @@ -110,15 +118,19 @@ func TestBatches(t *testing.T) { t.Fatalf("should have gotten already full batch error, but got %v", err) } + if !reflect.DeepEqual(b.values["three"], []int64{99, -10, 99, -10, 99, -10, 99, -10, 99, 99}) { + t.Fatalf("unexpected values: %v", b.values["three"]) + } + err = b.doTranslation() if err != nil { t.Fatalf("doing translation: %v", err) } - for i, rowIDs := range b.rowIDs { + for fname, rowIDs := range b.rowIDs { // we don't know which key will get translated first, but we do know the pattern if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 1}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 2}) { - t.Fatalf("unexpected row ids for field %d: %v", i, rowIDs) + t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) } } @@ -135,10 +147,12 @@ func TestBatches(t *testing.T) { r.Values[0] = "a" r.Values[1] = "b" r.Values[2] = "c" + r.Values[3] = int64(99) } else { r.Values[0] = "x" r.Values[1] = "y" r.Values[2] = "z" + r.Values[3] = int64(-10) } err := b.Add(r) if i != 18 && err != nil { @@ -160,10 +174,10 @@ func TestBatches(t *testing.T) { t.Fatalf("doing import: %v", err) } - for i, rowIDs := range b.rowIDs { + for fname, rowIDs := range b.rowIDs { // we don't know which key will get translated first, but we do know the pattern if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 2}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 1}) { - t.Fatalf("unexpected row ids for field %d: %v", i, rowIDs) + t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) } } @@ -175,10 +189,12 @@ func TestBatches(t *testing.T) { r.Values[0] = "d" r.Values[1] = "e" r.Values[2] = "f" + r.Values[3] = int64(100) } else { r.Values[0] = "u" r.Values[1] = "v" r.Values[2] = "w" + r.Values[3] = int64(0) } err := b.Add(r) if i != 28 && err != nil { @@ -199,10 +215,10 @@ func TestBatches(t *testing.T) { t.Fatalf("doing import: %v", err) } - for i, rowIDs := range b.rowIDs { + for fname, rowIDs := range b.rowIDs { // we don't know which key will get translated first, but we do know the pattern if !reflect.DeepEqual(rowIDs, []uint64{3, 4, 3, 4, 3, 4, 3, 4, 3, 4}) && !reflect.DeepEqual(rowIDs, []uint64{4, 3, 4, 3, 4, 3, 4, 3, 4, 3}) { - t.Fatalf("unexpected row ids for field %d: %v", i, rowIDs) + t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) } } @@ -219,19 +235,19 @@ func TestBatches(t *testing.T) { t.Fatalf("there should be 3 views") } - // TODO query Pilosa to confirm data is in place resp, err := client.Query(idx.BatchQuery(fields[0].Row("a"), 
fields[1].Row("b"), - fields[2].Row("c"))) + fields[2].Row("c"), + fields[3].Equals(99))) if err != nil { t.Fatalf("querying: %v", err) } results := resp.Results() - for _, res := range results { + for i, res := range results { cols := res.Row().Columns if !reflect.DeepEqual(cols, []uint64{0, 2, 4, 6, 8, 10, 12, 14, 16, 18}) { - t.Fatalf("unexpected columns: %v", cols) + t.Fatalf("unexpected columns at %d: %v", i, cols) } } @@ -250,7 +266,28 @@ func TestBatches(t *testing.T) { } } + resp, err = client.Query(idx.BatchQuery(fields[3].GT(-11), + fields[3].Equals(0), + fields[3].Equals(100))) + if err != nil { + t.Fatalf("querying: %v", err) + } + results = resp.Results() + cols := results[0].Row().Columns + if !reflect.DeepEqual(cols, []uint64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}) { + t.Fatalf("all columns should be greater than -11, but got: %v", cols) + } + cols = results[1].Row().Columns + if !reflect.DeepEqual(cols, []uint64{19, 21, 23, 25, 27}) { + t.Fatalf("wrong cols for ==0: %v", cols) + } + cols = results[2].Row().Columns + if !reflect.DeepEqual(cols, []uint64{20, 22, 24, 26, 28}) { + t.Fatalf("wrong cols for ==100: %v", cols) + } + // TODO test non-full batches, test behavior of doing import on empty batch + // TODO test importing across multiple shards } func TestBatchesStringIDs(t *testing.T) { @@ -313,8 +350,8 @@ func TestBatchesStringIDs(t *testing.T) { t.Fatalf("translating: %v", err) } - if !reflect.DeepEqual(b.ids, []uint64{1, 2, 3}) { - t.Fatalf("unexpected ids: %v", b.ids) + if err := isPermutationOfInt(b.ids, []uint64{1, 2, 3}); err != nil { + t.Fatalf("wrong ids: %v", err) } err = b.Import() @@ -330,7 +367,7 @@ func TestBatchesStringIDs(t *testing.T) { results := resp.Results() for i, res := range results { cols := res.Row().Keys - if i == 0 && !reflect.DeepEqual(cols, []string{"0", "2"}) { + if i == 0 && !reflect.DeepEqual(cols, []string{"0", "2"}) && !reflect.DeepEqual(cols, []string{"2", "0"}) { t.Fatalf("unexpected columns: %v", cols) } if i == 1 && !reflect.DeepEqual(cols, []string{"1"}) { @@ -367,8 +404,8 @@ func TestBatchesStringIDs(t *testing.T) { results = resp.Results() for i, res := range results { cols := res.Row().Keys - if i == 0 && !reflect.DeepEqual(cols, []string{"0", "1", "2"}) { - t.Fatalf("unexpected columns: %v", cols) + if err := isPermutationOf(cols, []string{"0", "1", "2"}); i == 0 && err != nil { + t.Fatalf("unexpected columns: %v: %v", cols, err) } if i == 1 && !reflect.DeepEqual(cols, []string{"3"}) { t.Fatalf("unexpected columns: %v", cols) @@ -376,3 +413,43 @@ func TestBatchesStringIDs(t *testing.T) { } } + +func isPermutationOf(one, two []string) error { + if len(one) != len(two) { + return errors.Errorf("different lengths %d and %d", len(one), len(two)) + } +outer: + for _, vOne := range one { + for j, vTwo := range two { + if vOne == vTwo { + two = append(two[:j], two[j+1:]...) + continue outer + } + } + return errors.Errorf("%s in one but not two", vOne) + } + if len(two) != 0 { + return errors.Errorf("vals in two but not one: %v", two) + } + return nil +} + +func isPermutationOfInt(one, two []uint64) error { + if len(one) != len(two) { + return errors.Errorf("different lengths %d and %d", len(one), len(two)) + } +outer: + for _, vOne := range one { + for j, vTwo := range two { + if vOne == vTwo { + two = append(two[:j], two[j+1:]...) 
+ continue outer + } + } + return errors.Errorf("%d in one but not two", vOne) + } + if len(two) != 0 { + return errors.Errorf("vals in two but not one: %v", two) + } + return nil +} diff --git a/shardnodes.go b/shardnodes.go new file mode 100644 index 0000000..d05603a --- /dev/null +++ b/shardnodes.go @@ -0,0 +1,39 @@ +package pilosa + +import ( + "sync" +) + +type shardNodes struct { + data map[string]map[uint64][]*URI + mu *sync.RWMutex +} + +func newShardNodes() shardNodes { + return shardNodes{ + data: make(map[string]map[uint64][]*URI), + mu: &sync.RWMutex{}, + } +} + +func (s shardNodes) Get(index string, shard uint64) ([]*URI, bool) { + s.mu.RLock() + defer s.mu.RUnlock() + if idx, ok := s.data[index]; ok { + if uris, ok := idx[shard]; ok { + return uris, true + } + } + return nil, false +} + +func (s shardNodes) Put(index string, shard uint64, uris []*URI) { + s.mu.Lock() + defer s.mu.Unlock() + idx, ok := s.data[index] + if !ok { + idx = make(map[uint64][]*URI) + } + idx[shard] = uris + s.data[index] = idx +} diff --git a/translator.go b/translator.go index c6417d5..35429fb 100644 --- a/translator.go +++ b/translator.go @@ -26,6 +26,15 @@ func (t *Translator) GetCol(index, key string) (uint64, bool) { return 0, false } +func (t *Translator) AddCol(index, key string, value uint64) { + idx, ok := t.indexes[index] + if !ok { + idx = make(map[string]uint64) + } + idx[key] = value + t.indexes[index] = idx +} + func (t *Translator) GetRow(index, field, key string) (uint64, bool) { if fld, ok := t.fields[indexfield{index: index, field: field}]; ok { if val, ok := fld[key]; ok { @@ -43,5 +52,3 @@ func (t *Translator) AddRow(index, field, key string, value uint64) { keys[key] = value t.fields[indexfield{index: index, field: field}] = keys } - -// TODO AddCol From 3146a3180e6e647e978a5f1e8fecb3dc21701f12 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Mon, 26 Aug 2019 22:08:52 -0500 Subject: [PATCH 04/26] add many config options to picsv, test, benchmark --- client.go | 8 +- cmd/picsv/.gitignore | 1 + cmd/picsv/Makefile | 3 + cmd/picsv/main.go | 177 ++++++++++++++++++++++++++++++++++++----- cmd/picsv/main_test.go | 139 ++++++++++++++++++++++++++++++++ 5 files changed, 308 insertions(+), 20 deletions(-) create mode 100644 cmd/picsv/.gitignore create mode 100644 cmd/picsv/Makefile create mode 100644 cmd/picsv/main_test.go diff --git a/client.go b/client.go index 94cb934..366fd6f 100644 --- a/client.go +++ b/client.go @@ -322,13 +322,17 @@ func (c *Client) EnsureField(field *Field) error { // DeleteIndex deletes an index on the server. func (c *Client) DeleteIndex(index *Index) error { + return c.DeleteIndexByName(index.Name()) +} + +// DeleteIndexByName deletes the named index on the server. +func (c *Client) DeleteIndexByName(index string) error { span := c.tracer.StartSpan("Client.DeleteIndex") defer span.Finish() - path := fmt.Sprintf("/index/%s", index.name) + path := fmt.Sprintf("/index/%s", index) _, _, err := c.httpRequest("DELETE", path, nil, nil, false) return err - } // DeleteField deletes a field on the server. diff --git a/cmd/picsv/.gitignore b/cmd/picsv/.gitignore new file mode 100644 index 0000000..d179ee1 --- /dev/null +++ b/cmd/picsv/.gitignore @@ -0,0 +1 @@ +marketing-*.csv \ No newline at end of file diff --git a/cmd/picsv/Makefile b/cmd/picsv/Makefile new file mode 100644 index 0000000..d8276e1 --- /dev/null +++ b/cmd/picsv/Makefile @@ -0,0 +1,3 @@ + +bench: + GO111MODULE=on go test -bench=. 
-run=ZZZ -benchtime=3x diff --git a/cmd/picsv/main.go b/cmd/picsv/main.go index 230f83a..e602c15 100644 --- a/cmd/picsv/main.go +++ b/cmd/picsv/main.go @@ -2,10 +2,11 @@ package main import ( "encoding/csv" - "fmt" + "encoding/json" "io" "log" "os" + "strconv" "time" "github.com/jaffee/commandeer" @@ -14,11 +15,13 @@ import ( ) type Main struct { - Pilosa []string - File string - Index string - BatchSize int - IDField string + Pilosa []string + File string + Index string + BatchSize int + ConfigFile string + + Config *Config `flag:"-"` } func NewMain() *Main { @@ -27,15 +30,28 @@ func NewMain() *Main { File: "data.csv", Index: "picsvtest", BatchSize: 1000, - IDField: "id", + + Config: NewConfig(), } } func (m *Main) Run() error { start := time.Now() - defer func() { - fmt.Println("Duration: ", time.Since(start)) - }() + + // Load Config File (if available) + if m.ConfigFile != "" { + f, err := os.Open(m.ConfigFile) + if err != nil { + return errors.Wrap(err, "opening config file") + } + dec := json.NewDecoder(f) + err = dec.Decode(m.Config) + if err != nil { + return errors.Wrap(err, "decoding config file") + } + } + log.Printf("Config: %+v\n", *m) + f, err := os.Open(m.File) if err != nil { return errors.Wrap(err, "opening file") @@ -52,7 +68,7 @@ func (m *Main) Run() error { return errors.Wrap(err, "getting schema") } opts := []pilosa.IndexOption{} - if m.IDField != "" { + if m.Config.IDField != "" { opts = append(opts, pilosa.OptIndexKeys(true)) } index := schema.Index(m.Index, opts...) @@ -62,7 +78,7 @@ func (m *Main) Run() error { return errors.Wrap(err, "reading CSV header") } log.Println("Got Header: ", headerRow) - fields, header, getIDFn := processHeader(index, m.IDField, headerRow) + fields, header, getIDFn := processHeader(m.Config, index, headerRow) // this has a non-obvious dependence on the previous line... 
the fields are set up in the index which comes from the schema client.SyncSchema(schema) @@ -76,7 +92,7 @@ func (m *Main) Run() error { record.ID = getIDFn(row, numRecords) for _, meta := range header { if meta.srcIndex < len(row) { - record.Values[meta.recordIndex] = row[meta.srcIndex] + record.Values[meta.recordIndex] = meta.valGetter(row[meta.srcIndex]) } else { record.Values[meta.recordIndex] = nil log.Printf("row is shorter than header: %v", row) @@ -104,33 +120,111 @@ func (m *Main) Run() error { } log.Printf("processed %d ids\n", numRecords) - + log.Println("Duration: ", time.Since(start)) return nil } type valueMeta struct { srcIndex int recordIndex int + valGetter func(val string) interface{} } type idGetter func(row []string, numRecords uint64) interface{} -func processHeader(index *pilosa.Index, idField string, headerRow []string) ([]*pilosa.Field, map[string]valueMeta, idGetter) { +func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]*pilosa.Field, map[string]valueMeta, idGetter) { fields := make([]*pilosa.Field, 0, len(headerRow)) header := make(map[string]valueMeta) getIDFn := func(row []string, numRecords uint64) interface{} { return numRecords } for i, fieldName := range headerRow { - if fieldName == idField { + if fieldName == config.IDField { idIndex := i getIDFn = func(row []string, numRecords uint64) interface{} { return row[idIndex] } continue } - header[fieldName] = valueMeta{srcIndex: i, recordIndex: len(fields)} - fields = append(fields, index.Field(fieldName, pilosa.OptFieldKeys(true), pilosa.OptFieldTypeSet(pilosa.CacheTypeRanked, 100000))) + + var valGetter func(val string) interface{} + srcField, ok := config.SourceFields[fieldName] + if !ok { + srcField = SourceField{ + TargetField: fieldName, + Type: "string", + } + config.SourceFields[fieldName] = srcField + } + pilosaField, ok := config.PilosaFields[srcField.TargetField] + if !ok { + pilosaField = Field{ + Type: "string", + CacheType: pilosa.CacheTypeRanked, + CacheSize: 100000, + Keys: true, + } + config.PilosaFields[fieldName] = pilosaField + } + switch srcField.Type { + case "ignore": + continue + case "int": + valGetter = func(val string) interface{} { + intVal, err := strconv.Atoi(val) + if err != nil { + return nil + } + return intVal + } + opts := []pilosa.FieldOption{pilosa.OptFieldTypeInt()} + if pilosaField.Max != 0 || pilosaField.Min != 0 { + opts[0] = pilosa.OptFieldTypeInt(pilosaField.Min, pilosaField.Max) + } + fields = append(fields, index.Field(fieldName, opts...)) + case "float": + if srcField.Multiplier != 0 { + valGetter = func(val string) interface{} { + floatVal, err := strconv.ParseFloat(val, 64) + if err != nil { + return nil + } + return int64(floatVal * srcField.Multiplier) + } + } else { + valGetter = func(val string) interface{} { + floatVal, err := strconv.ParseFloat(val, 64) + if err != nil { + return nil + } + return int64(floatVal) + } + } + opts := []pilosa.FieldOption{pilosa.OptFieldTypeInt()} + if pilosaField.Max != 0 || pilosaField.Min != 0 { + opts[0] = pilosa.OptFieldTypeInt(pilosaField.Min, pilosaField.Max) + } + fields = append(fields, index.Field(fieldName, opts...)) + case "string": + valGetter = func(val string) interface{} { + return val + } + fields = append(fields, index.Field(fieldName, pilosa.OptFieldKeys(pilosaField.Keys), pilosa.OptFieldTypeSet(pilosaField.CacheType, pilosaField.CacheSize))) + case "uint64": + valGetter = func(val string) interface{} { + uintVal, err := strconv.ParseUint(val, 0, 64) + if err != nil { + return nil + } 
+ return uintVal + } + fields = append(fields, index.Field(fieldName, pilosa.OptFieldKeys(pilosaField.Keys), pilosa.OptFieldTypeSet(pilosaField.CacheType, pilosaField.CacheSize))) + } + header[fieldName] = valueMeta{ + valGetter: valGetter, + srcIndex: i, + recordIndex: len(fields) - 1, + } } return fields, header, getIDFn @@ -141,3 +235,50 @@ func main() { log.Fatal(err) } } + +func NewConfig() *Config { + return &Config{ + PilosaFields: make(map[string]Field), + SourceFields: make(map[string]SourceField), + } +} + +type Config struct { + PilosaFields map[string]Field `json:"pilosa-fields"` + SourceFields map[string]SourceField `json:"source-fields"` + + // IDField denotes which field in the source should be used for Pilosa record IDs. + IDField string `json:"id-field"` + + // IDType denotes whether the ID field should be parsed as a string or uint64. + IDType string `json:"id-type"` +} + +type Field struct { + Type string `json:"type"` + Min int64 `json:"min"` + Max int64 `json:"max"` + Keys bool `json:"keys"` + CacheType pilosa.CacheType `json:"cache-type"` + CacheSize int `json:"cache-size"` + // TODO time stuff +} + +type SourceField struct { + // TargetField is the Pilosa field that this source field should map to. + TargetField string `json:"target-field"` + + // Type denotes how the source field should be parsed. (string, + // int, rowID, float, or ignore). rowID means that the field will + // be parsed as a uint64 and then used directly as a rowID for a + // set field. If "string", key translation must be on for that + // Pilosa field, and it must be a set field. If int or float, it + // must be a Pilosa int field. + Type string `json:"type"` + + // Multiplier is for float fields. Because Pilosa does not support + // floats natively, it is sometimes useful to store a float in + // Pilosa as an integer, but first multiplied by some constant + // factor to preserve some amount of precision. If 0 this field won't be used. 
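As a concrete illustration of that scaling trick: the float is multiplied up, truncated to an int64 for storage, and divided back down on read. A minimal sketch (the multiplier of 100 is hypothetical; the values are chosen to be exact in binary floating point, since the conversion truncates rather than rounds, so inexact products can lose a unit in the last place):

    package main

    import "fmt"

    func main() {
        const multiplier = 100.0 // hypothetical: keep two decimal places
        f := 20.25               // 81/4, exact in binary
        stored := int64(f * multiplier) // what lands in the Pilosa int field
        back := float64(stored) / multiplier
        fmt.Println(stored, back) // 2025 20.25
    }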
+ Multiplier float64 `json:"multiplier"` +} diff --git a/cmd/picsv/main_test.go b/cmd/picsv/main_test.go new file mode 100644 index 0000000..ccad440 --- /dev/null +++ b/cmd/picsv/main_test.go @@ -0,0 +1,139 @@ +package main_test + +import ( + "fmt" + "io" + "net/http" + "os" + "testing" + + "github.com/pilosa/go-pilosa" + picsv "github.com/pilosa/go-pilosa/cmd/picsv" +) + +func BenchmarkImportCSV(b *testing.B) { + m := picsv.NewMain() + m.BatchSize = 1 << 20 + m.Index = "picsvbench" + m.File = "marketing-200k.csv" + getRawData(b, m.File) + client, err := pilosa.NewClient(m.Pilosa) + if err != nil { + b.Fatalf("getting client: %v", err) + } + b.ResetTimer() + + for i := 0; i < b.N; i++ { + err := m.Run() + if err != nil { + b.Fatalf("running import: %v", err) + } + b.StopTimer() + err = client.DeleteIndexByName(m.Index) + if err != nil { + b.Fatalf("deleting index: %v", err) + } + b.StartTimer() + } + +} + +func getRawData(t testing.TB, file string) { + if _, err := os.Open(file); err == nil { + return + } else if !os.IsNotExist(err) { + t.Fatalf("opening %s: %v", file, err) + } + // if the file doesn't exist + f, err := os.Create(file) + if err != nil { + t.Fatalf("creating file: %v", err) + } + resp, err := http.Get(fmt.Sprintf("https://molecula-sample-data.s3.amazonaws.com/%s", file)) + if err != nil { + t.Fatalf("getting data: %v", err) + } + if resp.StatusCode > 299 { + t.Fatalf("getting data failed: %v", resp.Status) + } + _, err = io.Copy(f, resp.Body) + if err != nil { + t.Fatalf("copying data into file: %v", err) + } + + err = f.Close() + if err != nil { + t.Fatalf("closing file: %v", err) + } + +} + +func TestImportCSV(t *testing.T) { + m := picsv.NewMain() + m.BatchSize = 1 << 20 + m.Index = "testpicsv" + m.File = "marketing-200k.csv" + m.Config.SourceFields["age"] = picsv.SourceField{TargetField: "age", Type: "float"} + m.Config.PilosaFields["age"] = picsv.Field{Type: "int"} + getRawData(t, m.File) + client, err := pilosa.NewClient(m.Pilosa) + if err != nil { + t.Fatalf("getting client: %v", err) + } + + defer func() { + err = client.DeleteIndexByName(m.Index) + if err != nil { + t.Fatalf("deleting index: %v", err) + } + }() + err = m.Run() + if err != nil { + t.Fatalf("running ingest: %v", err) + } + + schema, err := client.Schema() + if err != nil { + t.Fatalf("getting schema: %v", err) + } + + index := schema.Index(m.Index) + marital := index.Field("marital") + converted := index.Field("converted") + age := index.Field("age") + + tests := []struct { + query *pilosa.PQLRowQuery + bash string + exp int64 + }{ + { + query: marital.Row("married"), + bash: `awk -F, '/married/ {print $1,$4}' marketing-200k.csv | sort | uniq | wc`, + exp: 125514, + }, + { + query: converted.Row("no"), + bash: `awk -F, '{print $1,$17}' marketing-200k.csv | grep "no" |sort | uniq | wc`, + exp: 199999, + }, + { + query: age.Equals(55), + bash: `awk -F, '{print $1,$2}' marketing-200k.csv | grep " 55.0" |sort | uniq | wc`, + exp: 3282, + }, + } + + for i, test := range tests { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + q := index.Count(test.query) + resp, err := client.Query(q) + if err != nil { + t.Fatalf("running query '%s': %v", q.Serialize(), err) + } + if resp.Result().Count() != test.exp { + t.Fatalf("Got unexpected result %d instead of %d for\nquery: %s\nbash: %s", resp.Result().Count(), test.exp, q.Serialize(), test.bash) + } + }) + } +} From 802f882a0aea3bccbf36fe31f3623f8a9bd02c38 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 27 Aug 2019 10:43:32 -0500 Subject: [PATCH 05/26] 
support for empty values in importbatch --- client.go | 13 ++- client_it_test.go | 2 +- cmd/picsv/.gitignore | 3 +- cmd/picsv/main.go | 75 ++++++++---- cmd/picsv/main_internal_test.go | 20 ++++ cmd/picsv/main_test.go | 194 +++++++++++++++++++++++++++++++- cmd/picsv/testdata/sample.csv | 8 ++ importbatch.go | 112 ++++++++++++++++-- importbatch_test.go | 39 +++++-- 9 files changed, 420 insertions(+), 46 deletions(-) create mode 100644 cmd/picsv/main_internal_test.go create mode 100644 cmd/picsv/testdata/sample.csv diff --git a/client.go b/client.go index 366fd6f..e1f0269 100644 --- a/client.go +++ b/client.go @@ -671,10 +671,13 @@ func (c *Client) importValues(field *Field, return errors.Wrap(err, "importing values to nodes") } -// ImportValues takes the given integer values and column ids -// (which must all be in the given shard) and imports them into the -// given index,field,shard on all nodes which should hold that shard. -func (c *Client) ImportValues(index, field string, shard uint64, vals []int64, ids []uint64) error { +// ImportValues takes the given integer values and column ids (which +// must all be in the given shard) and imports them into the given +// index,field,shard on all nodes which should hold that shard. It +// assumes that the ids have been translated from keys if necessary +// and so tells Pilosa to ignore checking if the index uses column +// keys. +func (c *Client) ImportValues(index, field string, shard uint64, vals []int64, ids []uint64, clear bool) error { msg := &pbuf.ImportValueRequest{ Index: index, Field: field, @@ -686,7 +689,7 @@ func (c *Client) ImportValues(index, field string, shard uint64, vals []int64, i if err != nil { return errors.Wrap(err, "marshaling to protobuf") } - path := fmt.Sprintf("/index/%s/field/%s/import", index, field) + path := fmt.Sprintf("/index/%s/field/%s/import?clear=%s&ignoreKeyCheck=true", index, field, strconv.FormatBool(clear)) c.logImport(index, path, shard, false, data) uris, err := c.GetURIsForShard(index, shard) diff --git a/client_it_test.go b/client_it_test.go index f208ac7..aded5a8 100644 --- a/client_it_test.go +++ b/client_it_test.go @@ -864,7 +864,7 @@ func TestImportValues(t *testing.T) { t.Fatalf("syncing schema: %v", err) } - err = client.ImportValues("go-testindex", "intfield", 0, []int64{1, 2, 3}, []uint64{1, 2, 3}) + err = client.ImportValues("go-testindex", "intfield", 0, []int64{1, 2, 3}, []uint64{1, 2, 3}, false) if err != nil { t.Fatalf("importing values: %v", err) } diff --git a/cmd/picsv/.gitignore b/cmd/picsv/.gitignore index d179ee1..d9f1f6e 100644 --- a/cmd/picsv/.gitignore +++ b/cmd/picsv/.gitignore @@ -1 +1,2 @@ -marketing-*.csv \ No newline at end of file +marketing-*.csv +config.json diff --git a/cmd/picsv/main.go b/cmd/picsv/main.go index e602c15..39368b5 100644 --- a/cmd/picsv/main.go +++ b/cmd/picsv/main.go @@ -3,6 +3,7 @@ package main import ( "encoding/csv" "encoding/json" + "fmt" "io" "log" "os" @@ -50,7 +51,8 @@ func (m *Main) Run() error { return errors.Wrap(err, "decoding config file") } } - log.Printf("Config: %+v\n", *m) + log.Printf("Flags: %+v\n", *m) + log.Printf("Config: %+v\n", *m.Config) f, err := os.Open(m.File) if err != nil { @@ -78,7 +80,10 @@ func (m *Main) Run() error { return errors.Wrap(err, "reading CSV header") } log.Println("Got Header: ", headerRow) - fields, header, getIDFn := processHeader(m.Config, index, headerRow) + fields, header, getIDFn, err := processHeader(m.Config, index, headerRow) + if err != nil { + return errors.Wrap(err, "processing header") + } // this 
has a non-obvious dependence on the previous line... the fields are set up in the index which comes from the schema client.SyncSchema(schema) @@ -132,7 +137,7 @@ type valueMeta struct { type idGetter func(row []string, numRecords uint64) interface{} -func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]*pilosa.Field, map[string]valueMeta, idGetter) { +func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]*pilosa.Field, map[string]valueMeta, idGetter, error) { fields := make([]*pilosa.Field, 0, len(headerRow)) header := make(map[string]valueMeta) getIDFn := func(row []string, numRecords uint64) interface{} { @@ -141,8 +146,21 @@ func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]* for i, fieldName := range headerRow { if fieldName == config.IDField { idIndex := i - getIDFn = func(row []string, numRecords uint64) interface{} { - return row[idIndex] + switch config.IDType { + case "uint64": + getIDFn = func(row []string, numRecords uint64) interface{} { + uintVal, err := strconv.ParseUint(row[idIndex], 0, 64) + if err != nil { + return nil + } + return uintVal + } + case "string": + getIDFn = func(row []string, numRecords uint64) interface{} { + return row[idIndex] + } + default: + return nil, nil, nil, errors.Errorf("unknown IDType: %s", config.IDType) } continue } @@ -159,29 +177,27 @@ func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]* pilosaField, ok := config.PilosaFields[srcField.TargetField] if !ok { pilosaField = Field{ - Type: "string", + Type: "set", CacheType: pilosa.CacheTypeRanked, CacheSize: 100000, Keys: true, } config.PilosaFields[fieldName] = pilosaField } + + fieldName = srcField.TargetField switch srcField.Type { case "ignore": continue case "int": valGetter = func(val string) interface{} { - intVal, err := strconv.Atoi(val) + intVal, err := strconv.ParseInt(val, 10, 64) if err != nil { return nil } return intVal } - opts := []pilosa.FieldOption{pilosa.OptFieldTypeInt()} - if pilosaField.Max != 0 || pilosaField.Min != 0 { - opts[0] = pilosa.OptFieldTypeInt(pilosaField.Min, pilosaField.Max) - } - fields = append(fields, index.Field(fieldName, opts...)) + fields = append(fields, index.Field(fieldName, pilosaField.MakeOptions()...)) case "float": if srcField.Multiplier != 0 { valGetter = func(val string) interface{} { @@ -200,16 +216,15 @@ func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]* return int64(floatVal) } } - opts := []pilosa.FieldOption{pilosa.OptFieldTypeInt()} - if pilosaField.Max != 0 || pilosaField.Min != 0 { - opts[0] = pilosa.OptFieldTypeInt(pilosaField.Min, pilosaField.Max) - } - fields = append(fields, index.Field(fieldName, opts...)) + fields = append(fields, index.Field(fieldName, pilosaField.MakeOptions()...)) case "string": valGetter = func(val string) interface{} { + if val == "" { + return nil // ignore empty strings + } return val } - fields = append(fields, index.Field(fieldName, pilosa.OptFieldKeys(pilosaField.Keys), pilosa.OptFieldTypeSet(pilosaField.CacheType, pilosaField.CacheSize))) + fields = append(fields, index.Field(fieldName, pilosaField.MakeOptions()...)) case "uint64": valGetter = func(val string) interface{} { uintVal, err := strconv.ParseUint(val, 0, 64) @@ -218,7 +233,7 @@ func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]* } return uintVal } - fields = append(fields, index.Field(fieldName, pilosa.OptFieldKeys(pilosaField.Keys), 
pilosa.OptFieldTypeSet(pilosaField.CacheType, pilosaField.CacheSize))) + fields = append(fields, index.Field(fieldName, pilosaField.MakeOptions()...)) } header[fieldName] = valueMeta{ valGetter: valGetter, @@ -227,7 +242,7 @@ func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]* } } - return fields, header, getIDFn + return fields, header, getIDFn, nil } func main() { @@ -240,6 +255,7 @@ func NewConfig() *Config { return &Config{ PilosaFields: make(map[string]Field), SourceFields: make(map[string]SourceField), + IDType: "string", } } @@ -264,6 +280,22 @@ type Field struct { // TODO time stuff } +func (f Field) MakeOptions() (opts []pilosa.FieldOption) { + switch f.Type { + case "set": + opts = append(opts, pilosa.OptFieldKeys(f.Keys), pilosa.OptFieldTypeSet(f.CacheType, f.CacheSize)) + case "int": + if f.Max != 0 || f.Min != 0 { + opts = append(opts, pilosa.OptFieldTypeInt(f.Min, f.Max)) + } else { + opts = append(opts, pilosa.OptFieldTypeInt()) + } + default: + panic(fmt.Sprintf("unknown pilosa field type: %s", f.Type)) + } + return opts +} + type SourceField struct { // TargetField is the Pilosa field that this source field should map to. TargetField string `json:"target-field"` @@ -282,3 +314,6 @@ type SourceField struct { // factor to preserve some amount of precision. If 0 this field won't be used. Multiplier float64 `json:"multiplier"` } + +// TODO we should validate the Config once it is constructed. +// What are valid mappings from source fields to pilosa fields? diff --git a/cmd/picsv/main_internal_test.go b/cmd/picsv/main_internal_test.go new file mode 100644 index 0000000..bfae155 --- /dev/null +++ b/cmd/picsv/main_internal_test.go @@ -0,0 +1,20 @@ +package main + +import ( + "strings" + "testing" +) + +func TestProcessHeader(t *testing.T) { + config := NewConfig() + headerRow := []string{"a", "b", "c"} + + t.Run("invalid IDType", func(t *testing.T) { + config.IDField = "a" + config.IDType = "nope" + _, _, _, err := processHeader(config, nil, headerRow) + if err == nil || !strings.Contains(err.Error(), "unknown IDType") { + t.Fatalf("unknown IDType gave: %v", err) + } + }) +} diff --git a/cmd/picsv/main_test.go b/cmd/picsv/main_test.go index ccad440..aa96df3 100644 --- a/cmd/picsv/main_test.go +++ b/cmd/picsv/main_test.go @@ -9,6 +9,7 @@ import ( "github.com/pilosa/go-pilosa" picsv "github.com/pilosa/go-pilosa/cmd/picsv" + "github.com/pkg/errors" ) func BenchmarkImportCSV(b *testing.B) { @@ -70,11 +71,12 @@ func getRawData(t testing.TB, file string) { func TestImportCSV(t *testing.T) { m := picsv.NewMain() - m.BatchSize = 1 << 20 + m.BatchSize = 100000 m.Index = "testpicsv" m.File = "marketing-200k.csv" m.Config.SourceFields["age"] = picsv.SourceField{TargetField: "age", Type: "float"} m.Config.PilosaFields["age"] = picsv.Field{Type: "int"} + m.Config.IDField = "id" getRawData(t, m.File) client, err := pilosa.NewClient(m.Pilosa) if err != nil { @@ -137,3 +139,193 @@ func TestImportCSV(t *testing.T) { }) } } + +func TestSmallImport(t *testing.T) { + m := picsv.NewMain() + m.BatchSize = 1 << 20 + m.Index = "testsample" + m.File = "testdata/sample.csv" + m.ConfigFile = "config.json" + client, err := pilosa.NewClient(m.Pilosa) + if err != nil { + t.Fatalf("getting client: %v", err) + } + defer func() { + err = client.DeleteIndexByName(m.Index) + if err != nil { + t.Logf("deleting index: %v", err) + } + }() + config := `{ +"pilosa-fields": {"size": {"type": "set", "keys": true, "cache-type": "ranked", "cache-size": 100000}, + "age": {"type": "int"}, + 
"color": {"type": "set", "keys": true}, + "result": {"type": "int"}, + "dayofweek": {"type": "set", "keys": false, "cache-type": "ranked", "cache-size": 7} + }, +"id-field": "ID", +"id-type": "string", +"source-fields": { + "Size": {"target-field": "size", "type": "string"}, + "Color": {"target-field": "color", "type": "string"}, + "Age": {"target-field": "age", "type": "int"}, + "Result": {"target-field": "result", "type": "float", "multiplier": 100000000}, + "dayofweek": {"target-field": "dayofweek", "type": "uint64"} + } +} +` + data := ` +ID,Size,Color,Age,Result,dayofweek +ABDJ,small,green,42,1.13106317,1 +HFZP,large,red,99,30.23959735,2 +HFZP,small,green,99,NA,3 +EJSK,medium,purple,22,20.23959735,1 +EJSK,large,green,35,25.13106317, +FEFF,,,,,6 +` + writeFile(t, m.ConfigFile, config) + writeFile(t, m.File, data) + + err = m.Run() + if err != nil { + t.Fatalf("running ingest: %v", err) + } + + schema, err := client.Schema() + if err != nil { + t.Fatalf("getting schema: %v", err) + } + + index := schema.Index(m.Index) + size := index.Field("size") + color := index.Field("color") + age := index.Field("age") + result := index.Field("result") + day := index.Field("dayofweek") + + tests := []struct { + query pilosa.PQLQuery + resType string + exp interface{} + }{ + { + query: index.Count(size.Row("small")), + resType: "count", + exp: int64(2), + }, + { + query: size.Row("small"), + resType: "rowKeys", + exp: []string{"ABDJ", "HFZP"}, + }, + { + query: color.Row("green"), + resType: "rowKeys", + exp: []string{"ABDJ", "HFZP", "EJSK"}, + }, + { + query: age.Equals(99), + resType: "rowKeys", + exp: []string{"HFZP"}, + }, + { + query: age.GT(0), + resType: "rowKeys", + exp: []string{"ABDJ", "HFZP", "EJSK"}, + }, + { + query: result.GT(0), + resType: "rowKeys", + exp: []string{"ABDJ", "EJSK"}, + }, + { + query: result.GT(100000), + resType: "rowKeys", + exp: []string{"ABDJ", "EJSK"}, + }, + { + query: day.Row(1), + resType: "rowKeys", + exp: []string{"ABDJ", "EJSK"}, + }, + { + query: day.Row(6), + resType: "rowKeys", + exp: []string{"FEFF"}, + }, + { + query: index.Count(day.Row(3)), + resType: "count", + exp: int64(1), + }, + { + query: index.Count(day.Row(2)), + resType: "count", + exp: int64(1), // not mutually exclusive! + }, + { + query: size.Row(`""`), // TODO... go-pilosa should probably serialize keys into PQL using quotes. 
+ resType: "rowKeys", + exp: []string{}, // empty strings are ignored rather than ingested + }, + { + query: color.Row(`""`), + resType: "rowKeys", + exp: []string{}, // empty strings are ignored rather than ingested + }, + } + + for i, test := range tests { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + resp, err := client.Query(test.query) + if err != nil { + t.Fatalf("running query: %v", err) + } + res := resp.Result() + switch test.resType { + case "count": + if res.Count() != test.exp.(int64) { + t.Fatalf("unexpected count %d is not %d", res.Count(), test.exp.(int64)) + } + case "rowKeys": + got := res.Row().Keys + exp := test.exp.([]string) + if err := isPermutationOf(got, exp); err != nil { + t.Fatalf("unequal rows %v expected/got:\n%v\n%v", err, exp, got) + } + } + }) + } + +} + +func writeFile(t testing.TB, name, contents string) { + cf, err := os.Create(name) + if err != nil { + t.Fatalf("creating config file: %v", err) + } + _, err = cf.Write([]byte(contents)) + if err != nil { + t.Fatalf("writing config file: %v", err) + } +} + +func isPermutationOf(one, two []string) error { + if len(one) != len(two) { + return errors.Errorf("different lengths %d and %d", len(one), len(two)) + } +outer: + for _, vOne := range one { + for j, vTwo := range two { + if vOne == vTwo { + two = append(two[:j], two[j+1:]...) + continue outer + } + } + return errors.Errorf("%s in one but not two", vOne) + } + if len(two) != 0 { + return errors.Errorf("vals in two but not one: %v", two) + } + return nil +} diff --git a/cmd/picsv/testdata/sample.csv b/cmd/picsv/testdata/sample.csv new file mode 100644 index 0000000..2804233 --- /dev/null +++ b/cmd/picsv/testdata/sample.csv @@ -0,0 +1,8 @@ + +ID,Size,Color,Age,Result,dayofweek +ABDJ,small,green,42,1.13106317,1 +HFZP,large,red,99,30.23959735,2 +HFZP,small,green,99,NA,3 +EJSK,medium,purple,22,20.23959735,1 +EJSK,large,green,35,25.13106317, +FEFF,,,,,6 diff --git a/importbatch.go b/importbatch.go index 7103f64..4d4dd0b 100644 --- a/importbatch.go +++ b/importbatch.go @@ -6,6 +6,16 @@ import ( "golang.org/x/sync/errgroup" ) +// TODO if using column translation, column ids might get way out of +// order. Could be worth sorting everything after translation (as an +// option?). Instead of sorting all simultaneously, it might be faster +// (more cache friendly) to sort ids and save the swap ops to apply to +// everything else that needs to be sorted. + +// TODO support clearing values? nil values in records are ignored, +// but perhaps we could have a special type indicating that a bit or +// value should explicitly be cleared? + type Batch struct { client *Client index *Index @@ -21,8 +31,13 @@ type Batch struct { // values holds the values for each record of an int field values map[string][]int64 - // TODO, support set fields without translation, timestamps, set fields with more than one value per record, mutex, and bool. - // also null values + // clearValues holds a slice of indexes into b.ids for each + // integer field which has nil values. After translation, these + // slices will be filled out with the actual column IDs those + // indexes pertain to so that they can be cleared. + clearValues map[string][]uint64 + + // TODO, support timestamps, set fields with more than one value per record, mutex, and bool. 
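The clearValues bookkeeping described just above is subtle enough to warrant a toy illustration: when Add sees a nil for an int field it appends a placeholder 0 value and records the record's position within the batch; only after key translation does doTranslation rewrite those positions, in place, into real column IDs for the clearing import. A self-contained sketch of that position-to-column rewrite:

    package main

    import "fmt"

    func main() {
        ids := make([]uint64, 3) // column IDs, unknown until translation
        clearIdx := []uint64{1}  // the record at batch position 1 had a nil int

        copy(ids, []uint64{17, 42, 99}) // key translation fills in column IDs

        // doTranslation's rewrite: batch position -> translated column ID.
        for i, pos := range clearIdx {
            clearIdx[i] = ids[pos]
        }
        fmt.Println(clearIdx) // [42]: column 42 will get its value cleared
    }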
// for each field, keep a map of key to which record indexes that key mapped to toTranslate map[string]map[string][]int @@ -66,6 +81,7 @@ func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { ids: make([]uint64, 0, size), rowIDs: rowIDs, values: values, + clearValues: make(map[string][]uint64), toTranslate: tt, toTranslateID: make(map[string][]int), } @@ -128,6 +144,19 @@ func (b *Batch) Add(rec Row) error { b.rowIDs[field.Name()] = append(b.rowIDs[field.Name()], val) case int64: b.values[field.Name()] = append(b.values[field.Name()], val) + case nil: + if field.Opts().Type() == FieldTypeInt { + b.values[field.Name()] = append(b.values[field.Name()], 0) + clearIndexes, ok := b.clearValues[field.Name()] + if !ok { + clearIndexes = make([]uint64, 0) + } + clearIndexes = append(clearIndexes, uint64(len(b.ids)-1)) + b.clearValues[field.Name()] = clearIndexes + + } else { + b.rowIDs[field.Name()] = append(b.rowIDs[field.Name()], nilSentinel) + } default: return errors.Errorf("Val %v Type %[1]T is not currently supported. Use string, uint64 (row id), or int64 (integer value)", val) } @@ -212,6 +241,13 @@ func (b *Batch) doTranslation() error { b.client.translator.AddRow(b.index.Name(), fieldName, key, id) } } + + for field, idIndexes := range b.clearValues { + for i, index := range idIndexes { + idIndexes[i] = b.ids[index] + } + b.clearValues[field] = idIndexes // TODO this line should be unnecessary?? If it is necessary, is it OK to modify b.clearValues while iterating over it? + } return nil } @@ -240,6 +276,12 @@ func (b *Batch) doImport() error { return eg.Wait() } +// this is kind of bad as it means we can never import column id +// ^uint64(0) which is a valid column ID. I think it's unlikely to +// matter much in practice (we could maybe special case it somewhere +// if needed though). +var nilSentinel = ^uint64(0) + func (b *Batch) makeFragments() fragments { shardWidth := b.index.shardWidth if shardWidth == 0 { @@ -247,10 +289,13 @@ func (b *Batch) makeFragments() fragments { } frags := make(fragments) for fname, rowIDs := range b.rowIDs { - curShard := ^uint64(0) // impossible sentinel value. + curShard := ^uint64(0) // impossible sentinel value for shard. var curBM *roaring.Bitmap for j, _ := range b.ids { col, row := b.ids[j], rowIDs[j] + if row == nilSentinel { + continue + } if col/shardWidth != curShard { curShard = col / shardWidth curBM = frags.GetOrCreate(curShard, fname, "") @@ -273,7 +318,7 @@ func (b *Batch) importValueData() error { for i := 1; i <= len(b.ids); i++ { // when i==len(b.ids) we ensure that the import logic gets run // by making a fake shard once we're past the last ID - recordID := ^uint64(0) + recordID := (curShard + 2) * shardWidth if i < len(b.ids) { recordID = b.ids[i] } @@ -281,9 +326,11 @@ func (b *Batch) importValueData() error { endIdx := i ids := b.ids[startIdx:endIdx] for field, values := range b.values { + field := field + shard := curShard vslice := values[startIdx:endIdx] eg.Go(func() error { - err := b.client.ImportValues(b.index.Name(), field, curShard, vslice, ids) + err := b.client.ImportValues(b.index.Name(), field, shard, vslice, ids, false) return errors.Wrapf(err, "importing values for %s", field) }) } @@ -292,7 +339,49 @@ func (b *Batch) importValueData() error { } } - return errors.Wrap(eg.Wait(), "importing value data") + err := eg.Wait() + if err != nil { + return errors.Wrap(err, "importing value data") + } + + // Now we clear any values for which we got a nil. 
+ // + // TODO we need an endpoint which lets us set and clear + // transactionally... this is kind of a hack. + maxLen := 0 + for _, ids := range b.clearValues { + if len(ids) > maxLen { + maxLen = len(ids) + } + } + eg = errgroup.Group{} + values := make([]int64, 0, maxLen) + for field, ids := range b.clearValues { + // TODO maybe sort ids here + curShard := b.ids[0] / shardWidth + startIdx := 0 + for i := 1; i <= len(ids); i++ { + recordID := (curShard + 2) * shardWidth + if i < len(ids) { + recordID = b.ids[i] + } + if recordID/shardWidth != curShard { + endIdx := i + idSlice := ids[startIdx:endIdx] + values := values[:len(idSlice)] + field := field + shard := curShard + eg.Go(func() error { + err := b.client.ImportValues(b.index.Name(), field, shard, values, idSlice, true) + return errors.Wrap(err, "clearing values") + }) + startIdx = i + curShard = recordID / shardWidth + } + } + } + + return errors.Wrap(eg.Wait(), "importing clear value data") } // reset is called at the end of importing to ready the batch for the @@ -303,18 +392,21 @@ func (b *Batch) reset() { b.rowIDs[fieldName] = rowIDs[:0] m := b.toTranslate[fieldName] for k := range m { - delete(m, k) + delete(m, k) // TODO pool these slices } } for k := range b.toTranslateID { - delete(b.toTranslateID, k) + delete(b.toTranslateID, k) // TODO pool these slices } for k, _ := range b.values { - delete(b.values, k) + delete(b.values, k) // TODO pool these slices + } + for k, _ := range b.clearValues { + delete(b.clearValues, k) // TODO pool these slices } } -type fieldView struct { // TODO rename to fieldview +type fieldView struct { field string view string } diff --git a/importbatch_test.go b/importbatch_test.go index cfeb43f..eda255f 100644 --- a/importbatch_test.go +++ b/importbatch_test.go @@ -8,6 +8,8 @@ import ( "github.com/pkg/errors" ) +// TODO test against cluster + func TestBatches(t *testing.T) { client := DefaultClient() schema := NewSchema() @@ -43,6 +45,10 @@ func TestBatches(t *testing.T) { r.Values[2] = "z" r.Values[3] = int64(-10) } + if i == 8 { + r.Values[0] = nil + r.Values[3] = nil + } err := b.Add(r) if err != nil { t.Fatalf("unexpected err adding record: %v", err) @@ -55,7 +61,7 @@ func TestBatches(t *testing.T) { } for k, ints := range b.toTranslate["zero"] { if k == "a" { - if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { + if !reflect.DeepEqual(ints, []int{0, 2, 4, 6}) { t.Fatalf("wrong ints for key a in field zero: %v", ints) } } else if k == "x" { @@ -68,9 +74,12 @@ func TestBatches(t *testing.T) { } } - if !reflect.DeepEqual(b.values["three"], []int64{99, -10, 99, -10, 99, -10, 99, -10, 99}) { + if !reflect.DeepEqual(b.values["three"], []int64{99, -10, 99, -10, 99, -10, 99, -10, 0}) { t.Fatalf("unexpected values: %v", b.values["three"]) } + if !reflect.DeepEqual(b.clearValues["three"], []uint64{8}) { + t.Fatalf("unexpected clearValues: %v", b.clearValues["three"]) + } if len(b.toTranslate["one"]) != 2 { t.Fatalf("wrong number of keys in toTranslate[\"one\"]") @@ -118,7 +127,7 @@ func TestBatches(t *testing.T) { t.Fatalf("should have gotten already full batch error, but got %v", err) } - if !reflect.DeepEqual(b.values["three"], []int64{99, -10, 99, -10, 99, -10, 99, -10, 99, 99}) { + if !reflect.DeepEqual(b.values["three"], []int64{99, -10, 99, -10, 99, -10, 99, -10, 0, 0}) { t.Fatalf("unexpected values: %v", b.values["three"]) } @@ -129,8 +138,16 @@ func TestBatches(t *testing.T) { for fname, rowIDs := range b.rowIDs { // we don't know which key will get translated first, but we do know the pattern 
- if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 1}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 2}) { - t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) + if fname == "zero" { + if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, nilSentinel, nilSentinel}) && + !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, nilSentinel, nilSentinel}) { + t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) + } + + } else { + if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 1}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 2}) { + t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) + } } } @@ -244,7 +261,13 @@ func TestBatches(t *testing.T) { } results := resp.Results() - for i, res := range results { + for _, j := range []int{0, 3} { + cols := results[j].Row().Columns + if !reflect.DeepEqual(cols, []uint64{0, 2, 4, 6, 10, 12, 14, 16, 18}) { + t.Fatalf("unexpected columns for a: %v", cols) + } + } + for i, res := range results[1:3] { cols := res.Row().Columns if !reflect.DeepEqual(cols, []uint64{0, 2, 4, 6, 8, 10, 12, 14, 16, 18}) { t.Fatalf("unexpected columns at %d: %v", i, cols) @@ -274,8 +297,8 @@ func TestBatches(t *testing.T) { } results = resp.Results() cols := results[0].Row().Columns - if !reflect.DeepEqual(cols, []uint64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}) { - t.Fatalf("all columns should be greater than -11, but got: %v", cols) + if !reflect.DeepEqual(cols, []uint64{0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}) { + t.Fatalf("all columns (but 8) should be greater than -11, but got: %v", cols) } cols = results[1].Row().Columns if !reflect.DeepEqual(cols, []uint64{19, 21, 23, 25, 27}) { From 96349545770228c14b9460e4ac90b34aee57ff25 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 27 Aug 2019 11:13:00 -0500 Subject: [PATCH 06/26] documentation for batch importer --- importbatch.go | 62 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/importbatch.go b/importbatch.go index 4d4dd0b..981df04 100644 --- a/importbatch.go +++ b/importbatch.go @@ -16,6 +16,39 @@ import ( // but perhaps we could have a special type indicating that a bit or // value should explicitly be cleared? +// RecordBatch is a Pilosa ingest interface designed to allow for +// maximum throughput on common workloads. Users should call Add() +// with a Row object until it returns ErrBatchNowFull, at which time +// they should call Import(), and then repeat. +// +// Add will not modify or otherwise retain the Row once it returns, so +// it is recommended that callers reuse the same Row with repeated +// calls to Add, just modifying its values appropriately in between +// calls. This avoids allocating a new slice of Values for each +// inserted Row. +// +// The supported types of the values in Row.Values are implementation +// defined. Similarly, the supported types for Row.ID are +// implementation defined. +type RecordBatch interface { + Add(Row) error + Import() error +} + +// Batch implements RecordBatch. +// +// It supports Values of type string, uint64, int64, or nil. The +// following table describes what Pilosa field each type of value must +// map to. Fields are set up when calling "NewBatch". 
+// +// | type | pilosa field type | options | +// |--------+-------------------+-----------| +// | string | set | keys=true | +// | uint64 | set | any | +// | int64 | int | any | +// | nil | any | | +// +// nil values are ignored. type Batch struct { client *Client index *Index @@ -31,10 +64,10 @@ type Batch struct { // values holds the values for each record of an int field values map[string][]int64 - // clearValues holds a slice of indexes into b.ids for each + // clearValues holds a slice of indices into b.ids for each // integer field which has nil values. After translation, these // slices will be filled out with the actual column IDs those - // indexes pertain to so that they can be cleared. + // indices pertain to so that they can be cleared. clearValues map[string][]uint64 // TODO, support timestamps, set fields with more than one value per record, mutex, and bool. @@ -52,6 +85,11 @@ type Batch struct { toTranslateID map[string][]int } +// NewBatch initializes a new Batch object which will use the given +// Pilosa client, index, set of fields, and will take "size" records +// before returning ErrBatchNowFull. The positions of the Fields in +// 'fields' correspond to the positions of values in the Row's Values +// passed to Batch.Add(). func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { if len(fields) == 0 || size == 0 { panic("can't batch with no fields or batch size") @@ -87,6 +125,11 @@ func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { } } +// Row represents a single record which can be added to a RecordBatch. +// +// Note: it is not named "Record" because there is a conflict with +//another type in this package. This may be rectified by deprecating +//something or splitting packages in the future. type Row struct { ID interface{} Values []interface{} @@ -167,9 +210,18 @@ func (b *Batch) Add(rec Row) error { return nil } +// ErrBatchNowFull, similar to io.EOF, is a marker error to notify the +// user of a batch that it is time to call Import. var ErrBatchNowFull = errors.New("batch is now full - you cannot add any more records (though the one you just added was accepted)") + +// ErrBatchAlreadyFull is a real error saying that Batch.Add did not +// complete because the batch was full. var ErrBatchAlreadyFull = errors.New("batch was already full, record was rejected") +// Import does all necessary key translation and then imports the +// batch data into Pilosa. It readies itself for the next set of +// records by clearing internal structures without releasing the +// associated memory. func (b *Batch) Import() error { // first we need to translate the toTranslate, then fill out the missing row IDs err := b.doTranslation() @@ -177,7 +229,8 @@ func (b *Batch) Import() error { return errors.Wrap(err, "doing Translation") } - // create bitmaps out of each field in b.rowIDs and import + // create bitmaps out of each field in b.rowIDs and import. Also + // import int data. err = b.doImport() if err != nil { return errors.Wrap(err, "doing import") @@ -242,11 +295,10 @@ func (b *Batch) doTranslation() error { } } - for field, idIndexes := range b.clearValues { + for _, idIndexes := range b.clearValues { for i, index := range idIndexes { idIndexes[i] = b.ids[index] } - b.clearValues[field] = idIndexes // TODO this line should be unnecessary?? If it is necessary, is it OK to modify b.clearValues while iterating over it? 
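Caller-side, the contract in the RecordBatch docs above works out to a simple loop: Add until ErrBatchNowFull, Import, repeat, then flush the final partial batch with one more Import. A sketch under those docs' assumptions (the in-memory records slice and the "example" index are hypothetical, and the SyncSchema/Import calls expect a reachable Pilosa at the client's default address):

    package main

    import pilosa "github.com/pilosa/go-pilosa"

    func ingest(b pilosa.RecordBatch, records [][]interface{}) error {
        rec := pilosa.Row{} // reused across Adds, as the docs recommend
        for i, vals := range records {
            rec.ID = uint64(i)
            rec.Values = vals
            err := b.Add(rec)
            if err == pilosa.ErrBatchNowFull {
                if err := b.Import(); err != nil {
                    return err
                }
            } else if err != nil {
                return err
            }
        }
        return b.Import() // flush whatever is left
    }

    func main() {
        client := pilosa.DefaultClient()
        schema := pilosa.NewSchema()
        idx := schema.Index("example")
        fields := []*pilosa.Field{idx.Field("zero", pilosa.OptFieldKeys(true))}
        if err := client.SyncSchema(schema); err != nil {
            panic(err)
        }
        b := pilosa.NewBatch(client, 2, idx, fields)
        if err := ingest(b, [][]interface{}{{"a"}, {"b"}, {"c"}}); err != nil {
            panic(err)
        }
    }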
} return nil } From 2fa57bd37eb4ffd779af1aea27c32543dfadda6a Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 27 Aug 2019 11:23:13 -0500 Subject: [PATCH 07/26] gofmt -s importbatch.go --- importbatch.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/importbatch.go b/importbatch.go index 981df04..b364d64 100644 --- a/importbatch.go +++ b/importbatch.go @@ -247,7 +247,7 @@ func (b *Batch) doTranslation() error { // translate column keys if there are any if len(b.toTranslateID) > 0 { keys = make([]string, 0, len(b.toTranslateID)) - for k, _ := range b.toTranslateID { + for k := range b.toTranslateID { keys = append(keys, k) } ids, err := b.client.translateColumnKeys(b.index, keys) @@ -270,7 +270,7 @@ func (b *Batch) doTranslation() error { keys = keys[:0] // make a slice of keys - for k, _ := range tt { + for k := range tt { keys = append(keys, k) } @@ -343,7 +343,7 @@ func (b *Batch) makeFragments() fragments { for fname, rowIDs := range b.rowIDs { curShard := ^uint64(0) // impossible sentinel value for shard. var curBM *roaring.Bitmap - for j, _ := range b.ids { + for j := range b.ids { col, row := b.ids[j], rowIDs[j] if row == nilSentinel { continue @@ -450,10 +450,10 @@ func (b *Batch) reset() { for k := range b.toTranslateID { delete(b.toTranslateID, k) // TODO pool these slices } - for k, _ := range b.values { + for k := range b.values { delete(b.values, k) // TODO pool these slices } - for k, _ := range b.clearValues { + for k := range b.clearValues { delete(b.clearValues, k) // TODO pool these slices } } From eed88d32dd6079f3d1ce8157fb39c07875208a6f Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 27 Aug 2019 17:14:51 -0500 Subject: [PATCH 08/26] add locks on translator cache, shardNodes invalidation --- client.go | 82 ++++++++++++++++++++++++++++++++++++-- client_internal_it_test.go | 9 +++++ go.mod | 61 +++------------------------- go.sum | 73 +++++++++++++++++++++++++++++++++ importbatch.go | 10 +++-- shardnodes.go | 8 ++++ uri.go | 14 +++++++ 7 files changed, 196 insertions(+), 61 deletions(-) diff --git a/client.go b/client.go index e1f0269..c283bd1 100644 --- a/client.go +++ b/client.go @@ -90,14 +90,45 @@ type Client struct { importLogEncoder encoder logLock sync.Mutex - // TODO make this threadsafe using key translation cache on client using embedded K/V store. + // TODO replace this with something like BoltDB. Need better + // concurrent performance, less lock contention. Persistence might + // be a nice bonus too. + tlock sync.RWMutex translator *Translator // TODO shardNodes needs to be invalidated/updated when cluster topology changes. 
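The tlock/translator pairing above is the standard read-mostly cache pattern: take the read lock on the hot lookup path and the write lock only when inserting a freshly translated key. A standalone sketch (keyCache is an illustrative stand-in, not the client's actual type):

    package main

    import (
        "fmt"
        "sync"
    )

    type keyCache struct {
        mu   sync.RWMutex
        keys map[string]uint64
    }

    // get takes only the read lock, so concurrent lookups don't serialize.
    func (c *keyCache) get(k string) (uint64, bool) {
        c.mu.RLock()
        defer c.mu.RUnlock()
        v, ok := c.keys[k]
        return v, ok
    }

    // add takes the write lock, briefly excluding readers.
    func (c *keyCache) add(k string, v uint64) {
        c.mu.Lock()
        defer c.mu.Unlock()
        c.keys[k] = v
    }

    func main() {
        c := &keyCache{keys: make(map[string]uint64)}
        c.add("austin", 1)
        fmt.Println(c.get("austin")) // 1 true
    }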
shardNodes shardNodes + tick *time.Ticker } -func (c *Client) GetURIsForShard(index string, shard uint64) ([]*URI, error) { +func (c *Client) translateCol(index, key string) (uint64, bool) { + c.tlock.RLock() + v, b := c.translator.GetCol(index, key) + c.tlock.RUnlock() + return v, b +} + +func (c *Client) translateRow(index, field, key string) (uint64, bool) { + c.tlock.RLock() + v, b := c.translator.GetRow(index, field, key) + c.tlock.RUnlock() + return v, b +} + +func (c *Client) addTranslateCol(index, key string, value uint64) { + c.tlock.Lock() + c.translator.AddCol(index, key, value) + c.tlock.Unlock() +} + +func (c *Client) addTranslateRow(index, field, key string, value uint64) { + c.tlock.Lock() + c.translator.AddRow(index, field, key, value) + c.tlock.Unlock() +} + +// TODO unexport this, consider unexporting ImportValues, look for other candidates, put a note on translator about it being only used by batch, do something about shardNodes. +func (c *Client) getURIsForShard(index string, shard uint64) ([]*URI, error) { uris, ok := c.shardNodes.Get(index, shard) if ok { return uris, nil @@ -114,6 +145,51 @@ func (c *Client) GetURIsForShard(index string, shard uint64) ([]*URI, error) { return uris, nil } +func (c *Client) runChangeDetection() { + c.tick = time.NewTicker(time.Minute) + + for _ = range c.tick.C { + c.detectClusterChanges() + } +} + +// detectClusterChanges chooses a random index and shard from the +// shardNodes cache and deletes it. It then looks it up from Pilosa to +// see if it still matches, and if not it drops the whole cache. +func (c *Client) detectClusterChanges() { + c.shardNodes.mu.Lock() + // we rely on Go's random map iteration order to get a random + // element. If it doesn't end up being random, it shouldn't + // actually matter. + for index, shardMap := range c.shardNodes.data { + for shard, uris := range shardMap { + delete(shardMap, shard) + c.shardNodes.data[index] = shardMap + c.shardNodes.mu.Unlock() + newURIs, err := c.getURIsForShard(index, shard) // refetch URIs from server. + if err != nil { + c.logger.Printf("problem invalidating shard node cache: %v", err) + return + } + if len(uris) != len(newURIs) { + c.logger.Printf("invalidating shard node cache old: %s, new: %s", URIs(uris), URIs(newURIs)) + c.shardNodes.Invalidate() + return + } + for i := range uris { + u1, u2 := uris[i], newURIs[i] + if *u1 != *u2 { + c.logger.Printf("invalidating shard node cache, uri mismatch at %d old: %s, new: %s", i, URIs(uris), URIs(newURIs)) + c.shardNodes.Invalidate() + return + } + } + break + } + break + } +} + // DefaultClient creates a client with the default address and options. 
func DefaultClient() *Client { return newClientWithCluster(NewClusterWithHost(DefaultURI()), nil) @@ -692,7 +768,7 @@ func (c *Client) ImportValues(index, field string, shard uint64, vals []int64, i path := fmt.Sprintf("/index/%s/field/%s/import?clear=%s&ignoreKeyCheck=true", index, field, strconv.FormatBool(clear)) c.logImport(index, path, shard, false, data) - uris, err := c.GetURIsForShard(index, shard) + uris, err := c.getURIsForShard(index, shard) if err != nil { return errors.Wrap(err, "getting uris") } diff --git a/client_internal_it_test.go b/client_internal_it_test.go index 18e209e..6c4e5a3 100644 --- a/client_internal_it_test.go +++ b/client_internal_it_test.go @@ -171,3 +171,12 @@ func TestImportWithReplayErrors(t *testing.T) { t.Fatal("import replay hanging when no schema created") } } + +func TestDetectClusterChanges(t *testing.T) { + c := getClient() + + c.shardNodes.data["blah"] = make(map[uint64][]*URI) + c.shardNodes.data["blah"][1] = []*URI{&URI{scheme: "zzz"}} + + c.detectClusterChanges() +} diff --git a/go.mod b/go.mod index 7110b0e..ac26e55 100644 --- a/go.mod +++ b/go.mod @@ -1,64 +1,15 @@ module github.com/pilosa/go-pilosa require ( - cloud.google.com/go v0.40.0 // indirect - github.com/OneOfOne/xxhash v1.2.5 // indirect - github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878 // indirect - github.com/armon/go-radix v1.0.0 // indirect - github.com/coreos/bbolt v1.3.3 // indirect - github.com/coreos/etcd v3.3.13+incompatible // indirect - github.com/coreos/go-semver v0.3.0 // indirect - github.com/coreos/go-systemd v0.0.0-20190618135430-ff7011eec365 // indirect - github.com/golang/mock v1.3.1 // indirect - github.com/golang/protobuf v1.3.1 - github.com/google/pprof v0.0.0-20190515194954-54271f7e092f // indirect - github.com/googleapis/gax-go/v2 v2.0.5 // indirect - github.com/gorilla/handlers v1.4.0 // indirect - github.com/gorilla/mux v1.7.2 // indirect - github.com/grpc-ecosystem/grpc-gateway v1.9.2 // indirect - github.com/hashicorp/go-cleanhttp v0.5.1 // indirect - github.com/hashicorp/go-immutable-radix v1.1.0 // indirect - github.com/hashicorp/go-msgpack v0.5.5 // indirect - github.com/hashicorp/go-retryablehttp v0.5.4 // indirect - github.com/hashicorp/go-sockaddr v1.0.2 // indirect - github.com/hashicorp/go-uuid v1.0.1 // indirect - github.com/hashicorp/memberlist v0.1.4 // indirect + github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d // indirect + github.com/go-ole/go-ole v1.2.4 // indirect + github.com/golang/protobuf v1.3.2 github.com/jaffee/commandeer v0.1.1-0.20190726022955-4d43b78ebc4e - github.com/kisielk/errcheck v1.2.0 // indirect - github.com/konsorten/go-windows-terminal-sequences v1.0.2 // indirect - github.com/kr/pty v1.1.5 // indirect - github.com/magiconair/properties v1.8.1 // indirect - github.com/mattn/go-colorable v0.1.2 // indirect - github.com/miekg/dns v1.1.14 // indirect github.com/opentracing/opentracing-go v1.1.0 - github.com/pelletier/go-toml v1.4.0 // indirect + github.com/pilosa/demo-taxi v0.0.0-20190604185441-6b6ef983bff7 // indirect github.com/pilosa/pilosa v1.3.1 + github.com/pilosa/tools v0.0.0-20190810124639-ee77232ff3aa // indirect github.com/pkg/errors v0.8.1 - github.com/posener/complete v1.2.1 // indirect - github.com/prometheus/common v0.6.0 // indirect - github.com/remyoudompheng/bigfft v0.0.0-20190512091148-babf20351dd7 // indirect - github.com/rogpeppe/fastuuid v1.1.0 // indirect - github.com/russross/blackfriday v2.0.0+incompatible // indirect - github.com/sirupsen/logrus v1.4.2 // indirect - 
github.com/spaolacci/murmur3 v1.1.0 // indirect - github.com/spf13/afero v1.2.2 // indirect - github.com/spf13/cobra v0.0.5 // indirect - github.com/spf13/jwalterweatherman v1.1.0 // indirect - github.com/spf13/viper v1.4.0 // indirect - github.com/stretchr/objx v0.2.0 // indirect - github.com/ugorji/go v1.1.5-pre // indirect - go.etcd.io/bbolt v1.3.3 // indirect - go.opencensus.io v0.22.0 // indirect - golang.org/x/crypto v0.0.0-20190617133340-57b3e21c3d56 // indirect - golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522 // indirect - golang.org/x/image v0.0.0-20190618124811-92942e4437e2 // indirect - golang.org/x/mobile v0.0.0-20190607214518-6fa95d984e88 // indirect - golang.org/x/mod v0.1.0 // indirect + github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect golang.org/x/sync v0.0.0-20190423024810-112230192c58 - golang.org/x/sys v0.0.0-20190618155005-516e3c20635f // indirect - golang.org/x/tools v0.0.0-20190618163018-fdf1049a943a // indirect - google.golang.org/appengine v1.6.1 // indirect - google.golang.org/genproto v0.0.0-20190611190212-a7e196e89fd3 // indirect - google.golang.org/grpc v1.21.1 // indirect - honnef.co/go/tools v0.0.0-20190614002413-cb51c254f01b // indirect ) diff --git a/go.sum b/go.sum index ecc454b..c8f688f 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= cloud.google.com/go v0.40.0/go.mod h1:Tk58MuI9rbLMKlAjeO/bDnteAx7tX2gJIXw4T5Jwlro= +cloud.google.com/go v0.43.0/go.mod h1:BOSR3VbTLkk6FDC/TcffxP4NF/FFBGA5ku+jvKOP7pg= +github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/CAFxX/gcnotifier v0.0.0-20190112062741-224a280d589d h1:n0G4ckjMEj7bWuGYUX0i8YlBeBBJuZ+HEHvHfyBDZtI= @@ -10,8 +12,11 @@ github.com/DataDog/datadog-go v0.0.0-20180822151419-281ae9f2d895/go.mod h1:LButx github.com/DataDog/datadog-go v2.2.0+incompatible h1:V5BKkxACZLjzHjSgBbr2gvLA2Ae49yhc6CSY7MLy5k4= github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= +github.com/OneOfOne/xxhash v1.2.5 h1:zl/OfRA6nftbBK9qTohYBJ5xvw6C/oNKizR7cZGl3cI= github.com/OneOfOne/xxhash v1.2.5/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= +github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d h1:G0m3OIz70MZUWq3EgK3CesDbo8upS2Vm9/P3FtgI+Jk= +github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= @@ -30,6 +35,7 @@ github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghf github.com/circonus-labs/circonus-gometrics 
v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd h1:qMd81Ts1T2OTKmB4acZcyKaMtRnY5Y44NuXGX2GFJ1w= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= github.com/coreos/bbolt v1.3.3/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= @@ -40,8 +46,10 @@ github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd v0.0.0-20190618135430-ff7011eec365/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= +github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -53,8 +61,10 @@ github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= +github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-ole/go-ole v1.2.4 h1:nNBDSCOigTSiarFpYE9J/KtEA1IOW4CNeqT9TQDqCxI= github.com/go-ole/go-ole v1.2.4/go.mod h1:XCwSNxSkXRo4vlyPy93sltvi/qJq0jqQhjqQNIwKuxM= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= @@ -63,14 +73,17 @@ github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= 
+github.com/golang/protobuf v0.0.0-20170427213220-18c9bb326172/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= @@ -83,17 +96,24 @@ github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OI github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/handlers v1.3.0/go.mod h1:Qkdc/uu4tH4g6mTK6auzZ766c4CA0Ng8+o/OAirnOIQ= github.com/gorilla/handlers v1.4.0 h1:XulKRWSQK5uChr4pEgSE4Tc/OcmnU9GJuSwdog/tZsA= github.com/gorilla/handlers v1.4.0/go.mod h1:Qkdc/uu4tH4g6mTK6auzZ766c4CA0Ng8+o/OAirnOIQ= +github.com/gorilla/handlers v1.4.1 h1:BHvcRGJe/TrL+OqFxoKQGddTgeibiOjaBssV5a/N9sw= +github.com/gorilla/handlers v1.4.1/go.mod h1:Qkdc/uu4tH4g6mTK6auzZ766c4CA0Ng8+o/OAirnOIQ= +github.com/gorilla/mux v1.4.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/mux v1.7.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/mux v1.7.2 h1:zoNxOV7WjqXptQOVngLmcSQgXmgk4NMz1HibBchjl/I= github.com/gorilla/mux v1.7.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= +github.com/gorilla/mux v1.7.3 h1:gnP5JzjVOuiZD07fKKToCAOjS0yOpj/qPETTXCCS6hw= +github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/grpc-ecosystem/grpc-gateway v1.9.2/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= +github.com/grpc-ecosystem/grpc-gateway v1.9.4/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= @@ -112,6 +132,7 @@ github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerX github.com/hashicorp/go-sockaddr v1.0.2 h1:ztczhD1jLxIRjVejw8gFomI1BQZOe2WoVOu0SyteCQc= github.com/hashicorp/go-sockaddr v1.0.2/go.mod h1:rB4wwRAUzs07qva3c5SdrY/NEtAUjGlgmH/UkBUC97A= github.com/hashicorp/go-uuid v1.0.0/go.mod 
h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.1 h1:fv1ep09latC32wFoVwnqcnKJGnMSdBanPczbHAYm1BE= github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1 h1:0hERBMJE1eitiLkihrMvRVBYAkpHzc/J3QdDN+dAcgU= @@ -136,9 +157,12 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= +github.com/kr/pty v1.1.8/go.mod h1:O1sed60cT9XZ5uDucP5qwvh+TE3NnUj51EiZO/lmSfw= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1 h1:ZC2Vc7/ZFkGmsVC9KvOjumD+G5lXy2RtTKyzRKO2BQ4= @@ -151,6 +175,8 @@ github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5 github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/miekg/dns v1.1.14 h1:wkQWn9wIp4mZbwW8XV6Km6owkvRPbOiV004ZM2CkGvA= github.com/miekg/dns v1.1.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= +github.com/miekg/dns v1.1.15 h1:CSSIDtllwGLMoA6zjdKnaE6Tx6eVUxQ29LUgGetiDCI= +github.com/miekg/dns v1.1.15/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-wordwrap v1.0.0/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo= @@ -159,15 +185,24 @@ github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.1.0 h1:pWlfV3Bxv7k65HYwkikxat0+s3pV4bsqf19k25Ur8rU= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= +github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= github.com/pascaldekloe/goe v0.1.0/go.mod 
h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pelletier/go-toml v1.4.0 h1:u3Z1r+oOXJIkxqw34zVhyPgjBsm6X2wn21NWs/HfSeg= github.com/pelletier/go-toml v1.4.0/go.mod h1:PN7xzY2wHTK0K9p34ErDQMlFxa51Fk0OUruD3k1mMwo= +github.com/pilosa/demo-taxi v0.0.0-20190604185441-6b6ef983bff7 h1:Jslbqy71rw4s8jZD0t5NJmxkFFtHAym4n5HvcOq/2F4= +github.com/pilosa/demo-taxi v0.0.0-20190604185441-6b6ef983bff7/go.mod h1:DM8Umjg0r/UscmOs49RJeE0WUb8Nj4PLUj4J02vigLk= +github.com/pilosa/go-pilosa v0.0.0-20181106203903-796d4f7d7f3b/go.mod h1:uli4HiTymHocSAXJ9XpDbkH6kS63P8Yc0xyWDzooouc= +github.com/pilosa/go-pilosa v1.3.1-0.20190715210601-8606626b90d6/go.mod h1:aFI9h49dhkkRoBLyeZFdHj+OHYtobmA7X7pn3AKVDMw= +github.com/pilosa/pdk v0.8.0 h1:of3TuThnqmnyPHSkc/kF1kFnAIx4qHecrB8CDarCXwk= +github.com/pilosa/picap v0.0.0-20190222201647-33fda7de22a1 h1:cWR2SpswkDmn5is65Fo4Ex2+65PXeqHoFDowfyScGH8= +github.com/pilosa/pilosa v0.0.0-20181115192138-84148d4ee6c0/go.mod h1:NgpkJkefqUKUHV7O3TqBOu89tsao3ksth2wzTNe8CPQ= github.com/pilosa/pilosa v0.0.0-20190104143002-8c4b1548bc4b h1:2H/+JUxL4dv0uJ4G4i+C83S1yq/+pUrHHjsF8TEY85I= github.com/pilosa/pilosa v0.0.0-20190104143002-8c4b1548bc4b/go.mod h1:NgpkJkefqUKUHV7O3TqBOu89tsao3ksth2wzTNe8CPQ= github.com/pilosa/pilosa v1.2.1-0.20190410162749-b973f8c96356 h1:jDxhpV4l+CpKqVVgld73e9/EyogdCcO1ftbCvifrhSc= @@ -176,9 +211,12 @@ github.com/pilosa/pilosa v1.2.1-0.20190807173852-bc9747cc0f19 h1:93vMMs0jAhynsJp github.com/pilosa/pilosa v1.2.1-0.20190807173852-bc9747cc0f19/go.mod h1:57zHA92sPbJ01QsMyyEDASX2TJnf8qSM7ZdUnVzM0b8= github.com/pilosa/pilosa v1.3.1 h1:rLDVqJBuRzhPtue730D+EX0YEVS4R0oDzsE4bJBwLcE= github.com/pilosa/pilosa v1.3.1/go.mod h1:97yLL9mpUqOj9naKu5XA/b/U6JLe3JGGUlc2HOTDw+A= +github.com/pilosa/tools v0.0.0-20190810124639-ee77232ff3aa h1:5HqLTUqaL8xKTtaZ4xjAEf/ICP608XcRgEE3CNfohLw= +github.com/pilosa/tools v0.0.0-20190810124639-ee77232ff3aa/go.mod h1:n/Od1ErfFlaIEueOaQjlbo06EzKuRhSPxUGR3xmfEqE= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= github.com/posener/complete v1.2.1/go.mod h1:6gapUrK/U1TAN7ciCoNRIdVC5sbdBTUh1DKN0g6uH7E= @@ -197,12 +235,17 @@ github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/prometheus/procfs v0.0.3/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ= github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/prometheus/tsdb v0.8.0/go.mod h1:fSI0j+IUQrDd7+ZtR9WKIGtoYAYAJUKcKhYLG25tN4g= +github.com/rakyll/statik v0.0.0-20170410192944-89fe3459b5c8 h1:tLAYcZxG+XKuevcBQNkeGgV2Dc/EY1t/20FKWoYVI4E= +github.com/rakyll/statik v0.0.0-20170410192944-89fe3459b5c8/go.mod 
h1:OEi9wJV/fMUAGx1eNjq75DKDsJVuEv1U0oYdX6GX8Zs= github.com/remyoudompheng/bigfft v0.0.0-20190321074620-2f0d2b0e0001/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/remyoudompheng/bigfft v0.0.0-20190512091148-babf20351dd7 h1:FUL3b97ZY2EPqg2NbXKuMHs5pXJB9hjj1fDHnF2vl28= github.com/remyoudompheng/bigfft v0.0.0-20190512091148-babf20351dd7/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= +github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= github.com/russross/blackfriday v2.0.0+incompatible/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= @@ -213,11 +256,13 @@ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUt github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/shirou/gopsutil v2.18.12+incompatible h1:1eaJvGomDnH74/5cF4CTmTbLHAriGFsTZppLXDX93OM= github.com/shirou/gopsutil v2.18.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= +github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 h1:udFKJ0aHUL60LboW/A+DfgoHVedieIzIXE8uylPue0U= github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/afero v1.2.2 h1:5jhuqJyZCZf2JRofRvN/nIFgIWNzPa3/Vz8mYylgbWc= @@ -230,6 +275,7 @@ github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tL github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/jwalterweatherman v1.1.0 h1:ue6voC5bR5F8YxI5S67j9i582FU4Qvo2bmqnqMYADFk= github.com/spf13/jwalterweatherman v1.1.0/go.mod h1:aNWZUN0dPAAO/Ljvb5BEdw96iTZ0EXowPYD95IqWIGo= +github.com/spf13/pflag v0.0.0-20170427125145-f1d95a35e132/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= @@ -240,20 +286,25 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0 
h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= github.com/uber-go/atomic v1.4.0/go.mod h1:/Ct5t2lcmbJ4OSe/waGBoaVvVqtO0bmtfVNex1PFV8g= +github.com/uber/jaeger-client-go v2.15.0+incompatible h1:NP3qsSqNxh8VYr956ur1N/1C1PjvOJnJykCzcD5QHbk= github.com/uber/jaeger-client-go v2.15.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/uber/jaeger-client-go v2.16.0+incompatible h1:Q2Pp6v3QYiocMxomCaJuwQGFt7E53bPYqEgug/AoBtY= github.com/uber/jaeger-client-go v2.16.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= +github.com/uber/jaeger-lib v1.5.0 h1:OHbgr8l656Ub3Fw5k9SWnBfIEwvoHQ+W2y+Aa9D1Uyo= github.com/uber/jaeger-lib v1.5.0/go.mod h1:ComeNDZlWwrWnDv8aPp0Ba6+uUTzImX/AauajbLI56U= github.com/uber/jaeger-lib v2.0.0+incompatible h1:iMSCV0rmXEogjNWPh2D0xk9YVKvrtGoHJNe9ebLu/pw= github.com/uber/jaeger-lib v2.0.0+incompatible/go.mod h1:ComeNDZlWwrWnDv8aPp0Ba6+uUTzImX/AauajbLI56U= github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= github.com/ugorji/go v1.1.5-pre/go.mod h1:FwP/aQVg39TXzItUBMwnWp9T9gPQnXw4Poh4/oBQZ/0= +github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/ugorji/go/codec v1.1.5-pre/go.mod h1:tULtS6Gy1AE1yCENaw4Vb//HLH5njI2tfCQDUqRd8fI= +github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= @@ -272,11 +323,15 @@ golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190617133340-57b3e21c3d56 h1:ZpKuNIejY8P0ExLOVyKhb0WsgG8UdvHXe6TWjY7eL6k= golang.org/x/crypto v0.0.0-20190617133340-57b3e21c3d56/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc= +golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190718202018-cfdd5522f6f6/go.mod h1:JhuoJpWY28nO4Vef9tZUw9qufEGTyX1+7lmHxV5q5G4= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190618124811-92942e4437e2/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20190703141733-d6a02ce849c9/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint 
v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -284,8 +339,10 @@ golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mobile v0.0.0-20190607214518-6fa95d984e88/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/net v0.0.0-20180530234432-1e491301e022/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181023162649-9b4f9f5ad519/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -303,6 +360,9 @@ golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190613194153-d28f0bde5980 h1:dfGZHvZk057jK2MCeWus/TowKpJ8y4AmooUzdBSR9GU= golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190628185345-da137c7871d7 h1:rTIdg5QFRR7XCaK4LCjBiPbx8j4DQRpdYMnGn/bJUEU= +golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -331,6 +391,9 @@ golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190618155005-516e3c20635f h1:dHNZYIYdq2QuU6w73vZ/DzesPbVlZVYZTtTZmrnsbQ8= golang.org/x/sys v0.0.0-20190618155005-516e3c20635f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190712062909-fae7ac547cb7 h1:LepdCS8Gf/MVejFIt8lsiexZATdoGVyp5bcyS+rYoUI= +golang.org/x/sys v0.0.0-20190712062909-fae7ac547cb7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= @@ -347,11 +410,15 @@ 
golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3 golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190530171427-2b03ca6e44eb/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190618163018-fdf1049a943a/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190719005602-e377ae9d6386/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.6.0/go.mod h1:btoxGiFvQNVUZQ8W08zLtrVS08CNpINPEfxXxgJL1Q4= +google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -364,12 +431,15 @@ google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRn google.golang.org/genproto v0.0.0-20190530194941-fb225487d101/go.mod h1:z3L6/3dTEVtUr6QSP8miRzeRqwQOioJ9I66odjN4I7s= google.golang.org/genproto v0.0.0-20190611190212-a7e196e89fd3 h1:0LGHEA/u5XLibPOx6D7D8FBT/ax6wT57vNKY0QckCwo= google.golang.org/genproto v0.0.0-20190611190212-a7e196e89fd3/go.mod h1:z3L6/3dTEVtUr6QSP8miRzeRqwQOioJ9I66odjN4I7s= +google.golang.org/genproto v0.0.0-20190716160619-c506a9f90610/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.22.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= @@ -380,7 +450,10 @@ gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod 
h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190614002413-cb51c254f01b/go.mod h1:JlmFZigtG9vBVR3QGIQ9g/Usz4BzH+Xm6Z8iHQWRYUw= +modernc.org/mathutil v1.0.0 h1:93vKjrJopTPrtTNpZ8XIovER7iCIH1QU7wNbOQXC60I= modernc.org/mathutil v1.0.0/go.mod h1:wU0vUrJsVWBZ4P6e7xtFJEhFSNsfRLJ8H458uRjg03k= +modernc.org/strutil v1.0.0 h1:XVFtQwFVwc02Wk+0L/Z/zDDXO81r5Lhe6iMKmGX3KhE= modernc.org/strutil v1.0.0/go.mod h1:lstksw84oURvj9y3tn8lGvRxyRC1S2+g5uuIzNfIOBs= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= diff --git a/importbatch.go b/importbatch.go index b364d64..1e14e06 100644 --- a/importbatch.go +++ b/importbatch.go @@ -151,7 +151,7 @@ func (b *Batch) Add(rec Row) error { case uint64: b.ids = append(b.ids, rid) case string: - if colID, ok := b.client.translator.GetCol(b.index.Name(), rid); ok { + if colID, ok := b.client.translateCol(b.index.Name(), rid); ok { b.ids = append(b.ids, colID) } else { ints, ok := b.toTranslateID[rid] @@ -172,7 +172,7 @@ func (b *Batch) Add(rec Row) error { case string: rowIDs := b.rowIDs[field.Name()] // translate val and append to b.rowIDs[i] - if rowID, ok := b.client.translator.GetRow(b.index.Name(), field.Name(), val); ok { + if rowID, ok := b.client.translateRow(b.index.Name(), field.Name(), val); ok { b.rowIDs[field.Name()] = append(rowIDs, rowID) } else { ints, ok := b.toTranslate[field.Name()][val] @@ -254,6 +254,7 @@ func (b *Batch) doTranslation() error { if err != nil { return errors.Wrap(err, "translating col keys") } + b.client.tlock.Lock() for j, key := range keys { id := ids[j] for _, recordIdx := range b.toTranslateID[key] { @@ -261,6 +262,7 @@ func (b *Batch) doTranslation() error { } b.client.translator.AddCol(b.index.Name(), key, id) } + b.client.tlock.Unlock() } else { keys = make([]string, 0) } @@ -286,6 +288,7 @@ func (b *Batch) doTranslation() error { // fill out missing IDs in local batch records with translated IDs rows := b.rowIDs[fieldName] + b.client.tlock.Lock() for j, key := range keys { id := ids[j] for _, recordIdx := range tt[key] { @@ -293,6 +296,7 @@ func (b *Batch) doTranslation() error { } b.client.translator.AddRow(b.index.Name(), fieldName, key, id) } + b.client.tlock.Unlock() } for _, idIndexes := range b.clearValues { @@ -308,7 +312,7 @@ func (b *Batch) doImport() error { frags := b.makeFragments() for shard, viewMap := range frags { - uris, err := b.client.GetURIsForShard(b.index.Name(), shard) + uris, err := b.client.getURIsForShard(b.index.Name(), shard) uri := uris[0] if err != nil { return errors.Wrap(err, "getting uris for shard") diff --git a/shardnodes.go b/shardnodes.go index d05603a..1ed45f5 100644 --- a/shardnodes.go +++ b/shardnodes.go @@ -37,3 +37,11 @@ func (s shardNodes) Put(index string, shard uint64, uris []*URI) { idx[shard] = uris s.data[index] = idx } + +func (s shardNodes) Invalidate() { + s.mu.Lock() + defer s.mu.Unlock() + for k := range s.data { + delete(s.data, k) + } +} diff --git a/uri.go b/uri.go index db8ba06..08a1e0a 100644 --- a/uri.go +++ b/uri.go @@ -65,6 +65,20 @@ type URI struct { error error } +type URIs []*URI + +func (uris URIs) String() string { + b := strings.Builder{} + b.WriteRune('[') + for _, u := range uris[:len(uris)-1] { + b.WriteString(u.Normalize()) + b.WriteRune(' ') + } + b.WriteString(uris[len(uris)-1].Normalize()) + b.WriteRune(']') + return b.String() +} + // DefaultURI creates and returns the default URI. 
func DefaultURI() *URI { return &URI{ From 4cc60a1eabf4af4b062fa00da0aec9494ec7fd1f Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 27 Aug 2019 20:43:07 -0500 Subject: [PATCH 09/26] gofmt -s --- client.go | 2 +- client_internal_it_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/client.go b/client.go index c283bd1..90bc75d 100644 --- a/client.go +++ b/client.go @@ -148,7 +148,7 @@ func (c *Client) getURIsForShard(index string, shard uint64) ([]*URI, error) { func (c *Client) runChangeDetection() { c.tick = time.NewTicker(time.Minute) - for _ = range c.tick.C { + for range c.tick.C { c.detectClusterChanges() } } diff --git a/client_internal_it_test.go b/client_internal_it_test.go index 6c4e5a3..2471b90 100644 --- a/client_internal_it_test.go +++ b/client_internal_it_test.go @@ -176,7 +176,7 @@ func TestDetectClusterChanges(t *testing.T) { c := getClient() c.shardNodes.data["blah"] = make(map[uint64][]*URI) - c.shardNodes.data["blah"][1] = []*URI{&URI{scheme: "zzz"}} + c.shardNodes.data["blah"][1] = []*URI{{scheme: "zzz"}} c.detectClusterChanges() } From 4d7a11e6efc741710934649b4538b60aee433514 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 27 Aug 2019 21:05:47 -0500 Subject: [PATCH 10/26] move Translator out of client to ImportBatch --- client.go | 58 ++++++++++++++++++++++---------------------------- importbatch.go | 19 ++++++++--------- 2 files changed, 34 insertions(+), 43 deletions(-) diff --git a/client.go b/client.go index 90bc75d..d0cc24d 100644 --- a/client.go +++ b/client.go @@ -90,44 +90,37 @@ type Client struct { importLogEncoder encoder logLock sync.Mutex - // TODO replace this with something like BoltDB. Need better - // concurrent performance, less lock contention. Persistence might - // be a nice bonus too. - tlock sync.RWMutex - translator *Translator - // TODO shardNodes needs to be invalidated/updated when cluster topology changes. shardNodes shardNodes tick *time.Ticker } -func (c *Client) translateCol(index, key string) (uint64, bool) { - c.tlock.RLock() - v, b := c.translator.GetCol(index, key) - c.tlock.RUnlock() - return v, b -} - -func (c *Client) translateRow(index, field, key string) (uint64, bool) { - c.tlock.RLock() - v, b := c.translator.GetRow(index, field, key) - c.tlock.RUnlock() - return v, b -} - -func (c *Client) addTranslateCol(index, key string, value uint64) { - c.tlock.Lock() - c.translator.AddCol(index, key, value) - c.tlock.Unlock() -} - -func (c *Client) addTranslateRow(index, field, key string, value uint64) { - c.tlock.Lock() - c.translator.AddRow(index, field, key, value) - c.tlock.Unlock() -} +// func (c *Client) translateCol(index, key string) (uint64, bool) { +// c.tlock.RLock() +// v, b := c.translator.GetCol(index, key) +// c.tlock.RUnlock() +// return v, b +// } + +// func (c *Client) translateRow(index, field, key string) (uint64, bool) { +// c.tlock.RLock() +// v, b := c.translator.GetRow(index, field, key) +// c.tlock.RUnlock() +// return v, b +// } + +// func (c *Client) addTranslateCol(index, key string, value uint64) { +// c.tlock.Lock() +// c.translator.AddCol(index, key, value) +// c.tlock.Unlock() +// } + +// func (c *Client) addTranslateRow(index, field, key string, value uint64) { +// c.tlock.Lock() +// c.translator.AddRow(index, field, key, value) +// c.tlock.Unlock() +// } -// TODO unexport this, consider unexporting ImportValues, look for other candidates, put a note on translator about it being only used by batch, do something about shardNodes. 
func (c *Client) getURIsForShard(index string, shard uint64) ([]*URI, error) { uris, ok := c.shardNodes.Get(index, shard) if ok { @@ -238,7 +231,6 @@ func newClientWithOptions(options *ClientOptions) *Client { logger: log.New(os.Stderr, "go-pilosa ", log.Flags()), coordinatorLock: &sync.RWMutex{}, - translator: NewTranslator(), shardNodes: newShardNodes(), } if options.importLogWriter != nil { diff --git a/importbatch.go b/importbatch.go index 1e14e06..cb11ca4 100644 --- a/importbatch.go +++ b/importbatch.go @@ -83,6 +83,8 @@ type Batch struct { // each record has a different string ID. In that case, a simple // slice of strings would probably work better. toTranslateID map[string][]int + + transCache *Translator } // NewBatch initializes a new Batch object which will use the given @@ -122,14 +124,15 @@ func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { clearValues: make(map[string][]uint64), toTranslate: tt, toTranslateID: make(map[string][]int), + transCache: NewTranslator(), } } // Row represents a single record which can be added to a RecordBatch. // // Note: it is not named "Record" because there is a conflict with -//another type in this package. This may be rectified by deprecating -//something or splitting packages in the future. +// another type in this package. This may be rectified by deprecating +// something or splitting packages in the future. type Row struct { ID interface{} Values []interface{} @@ -151,7 +154,7 @@ func (b *Batch) Add(rec Row) error { case uint64: b.ids = append(b.ids, rid) case string: - if colID, ok := b.client.translateCol(b.index.Name(), rid); ok { + if colID, ok := b.transCache.GetCol(b.index.Name(), rid); ok { b.ids = append(b.ids, colID) } else { ints, ok := b.toTranslateID[rid] @@ -172,7 +175,7 @@ func (b *Batch) Add(rec Row) error { case string: rowIDs := b.rowIDs[field.Name()] // translate val and append to b.rowIDs[i] - if rowID, ok := b.client.translateRow(b.index.Name(), field.Name(), val); ok { + if rowID, ok := b.transCache.GetRow(b.index.Name(), field.Name(), val); ok { b.rowIDs[field.Name()] = append(rowIDs, rowID) } else { ints, ok := b.toTranslate[field.Name()][val] @@ -254,15 +257,13 @@ func (b *Batch) doTranslation() error { if err != nil { return errors.Wrap(err, "translating col keys") } - b.client.tlock.Lock() for j, key := range keys { id := ids[j] for _, recordIdx := range b.toTranslateID[key] { b.ids[recordIdx] = id } - b.client.translator.AddCol(b.index.Name(), key, id) + b.transCache.AddCol(b.index.Name(), key, id) } - b.client.tlock.Unlock() } else { keys = make([]string, 0) } @@ -288,15 +289,13 @@ func (b *Batch) doTranslation() error { // fill out missing IDs in local batch records with translated IDs rows := b.rowIDs[fieldName] - b.client.tlock.Lock() for j, key := range keys { id := ids[j] for _, recordIdx := range tt[key] { rows[recordIdx] = id } - b.client.translator.AddRow(b.index.Name(), fieldName, key, id) + b.transCache.AddRow(b.index.Name(), fieldName, key, id) } - b.client.tlock.Unlock() } for _, idIndexes := range b.clearValues { From c33123ac545d9ec9f10a35b5dfe3935c9e240ebf Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 27 Aug 2019 21:55:17 -0500 Subject: [PATCH 11/26] interfacify Translator, add errors and batch addition also add the ability to set different implementations on a Batch. This is basically gearing up for a shared implementation using an embedded k/v store. 
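To illustrate the new extension point, here is a minimal sketch (not part of this commit) of supplying a custom Translator via OptTranslator. The nopTranslator type and the main wrapper are hypothetical; a real alternative implementation would wrap an embedded K/V store as described above. It only assumes the Translator interface and NewBatch signature introduced in this patch:

    package main

    import (
    	"log"

    	"github.com/pilosa/go-pilosa"
    )

    // nopTranslator is a hypothetical Translator implementation that
    // caches nothing, so every key lookup falls through to Pilosa.
    type nopTranslator struct{}

    func (nopTranslator) GetCol(index, key string) (uint64, bool, error) { return 0, false, nil }
    func (nopTranslator) GetRow(index, field, key string) (uint64, bool, error) {
    	return 0, false, nil
    }
    func (nopTranslator) AddCols(index string, keys []string, values []uint64) error { return nil }
    func (nopTranslator) AddRows(index, field string, keys []string, values []uint64) error {
    	return nil
    }

    func main() {
    	client := pilosa.DefaultClient()
    	schema := pilosa.NewSchema()
    	index := schema.Index("example")
    	fields := []*pilosa.Field{index.Field("color", pilosa.OptFieldKeys(true))}
    	if err := client.SyncSchema(schema); err != nil {
    		log.Fatal(err)
    	}
    	// As of this patch, NewBatch returns an error instead of
    	// panicking and accepts options such as OptTranslator.
    	batch, err := pilosa.NewBatch(client, 1000, index, fields,
    		pilosa.OptTranslator(nopTranslator{}))
    	if err != nil {
    		log.Fatal(err)
    	}
    	_ = batch
    }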
--- cmd/picsv/main.go | 5 +++- importbatch.go | 42 +++++++++++++++++++++++++------- importbatch_test.go | 10 ++++++-- translator.go | 59 +++++++++++++++++++++++++++++---------------- 4 files changed, 83 insertions(+), 33 deletions(-) diff --git a/cmd/picsv/main.go b/cmd/picsv/main.go index 39368b5..7fbc662 100644 --- a/cmd/picsv/main.go +++ b/cmd/picsv/main.go @@ -87,7 +87,10 @@ func (m *Main) Run() error { // this has a non-obvious dependence on the previous line... the fields are set up in the index which comes from the schema client.SyncSchema(schema) - batch := pilosa.NewBatch(client, m.BatchSize, index, fields) + batch, err := pilosa.NewBatch(client, m.BatchSize, index, fields) + if err != nil { + return errors.Wrap(err, "getting new batch") + } record := pilosa.Row{ Values: make([]interface{}, len(header)), } diff --git a/importbatch.go b/importbatch.go index cb11ca4..555e72f 100644 --- a/importbatch.go +++ b/importbatch.go @@ -84,7 +84,16 @@ type Batch struct { // slice of strings would probably work better. toTranslateID map[string][]int - transCache *Translator + transCache Translator +} + +type BatchOption func(b *Batch) error + +func OptTranslator(t Translator) BatchOption { + return func(b *Batch) error { + b.transCache = t + return nil + } } // NewBatch initializes a new Batch object which will use the given @@ -92,9 +101,9 @@ type Batch struct { // before returning ErrBatchNowFull. The positions of the Fields in // 'fields' correspond to the positions of values in the Row's Values // passed to Batch.Add(). -func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { +func NewBatch(client *Client, size int, index *Index, fields []*Field, opts ...BatchOption) (*Batch, error) { if len(fields) == 0 || size == 0 { - panic("can't batch with no fields or batch size") + return nil, errors.New("can't batch with no fields or batch size") } headerMap := make(map[string]*Field, len(fields)) rowIDs := make(map[string][]uint64) @@ -113,7 +122,7 @@ func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { values[field.Name()] = make([]int64, 0, size) } } - return &Batch{ + b := &Batch{ client: client, header: fields, headerMap: headerMap, @@ -124,8 +133,15 @@ func NewBatch(client *Client, size int, index *Index, fields []*Field) *Batch { clearValues: make(map[string][]uint64), toTranslate: tt, toTranslateID: make(map[string][]int), - transCache: NewTranslator(), + transCache: NewMapTranslator(), } + for _, opt := range opts { + err := opt(b) + if err != nil { + return nil, errors.Wrap(err, "applying options") + } + } + return b, nil } // Row represents a single record which can be added to a RecordBatch. 
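For context before the Add/Import hunks below: a caller drives the batch roughly as in this minimal sketch (not part of this commit; loadRows and package ingest are hypothetical, mirroring the picsv command's loop). Add reports ErrBatchNowFull after accepting the record that fills the batch, so the caller imports and continues without re-adding, then flushes the remainder at the end:

    package ingest

    import (
    	"github.com/pilosa/go-pilosa"
    	"github.com/pkg/errors"
    )

    // loadRows adds every row to the batch, importing whenever the
    // batch fills up and once more at the end for the partial batch.
    func loadRows(batch *pilosa.Batch, rows []pilosa.Row) error {
    	for _, row := range rows {
    		err := batch.Add(row)
    		if err == pilosa.ErrBatchNowFull {
    			if err := batch.Import(); err != nil {
    				return errors.Wrap(err, "importing full batch")
    			}
    		} else if err != nil {
    			return errors.Wrap(err, "adding row")
    		}
    	}
    	return errors.Wrap(batch.Import(), "final import")
    }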
@@ -154,7 +170,9 @@ func (b *Batch) Add(rec Row) error { case uint64: b.ids = append(b.ids, rid) case string: - if colID, ok := b.transCache.GetCol(b.index.Name(), rid); ok { + if colID, ok, err := b.transCache.GetCol(b.index.Name(), rid); err != nil { + return errors.Wrap(err, "translating column") + } else if ok { b.ids = append(b.ids, colID) } else { ints, ok := b.toTranslateID[rid] @@ -175,7 +193,9 @@ func (b *Batch) Add(rec Row) error { case string: rowIDs := b.rowIDs[field.Name()] // translate val and append to b.rowIDs[i] - if rowID, ok := b.transCache.GetRow(b.index.Name(), field.Name(), val); ok { + if rowID, ok, err := b.transCache.GetRow(b.index.Name(), field.Name(), val); err != nil { + return errors.Wrap(err, "translating row") + } else if ok { b.rowIDs[field.Name()] = append(rowIDs, rowID) } else { ints, ok := b.toTranslate[field.Name()][val] @@ -257,12 +277,14 @@ func (b *Batch) doTranslation() error { if err != nil { return errors.Wrap(err, "translating col keys") } + if err := b.transCache.AddCols(b.index.Name(), keys, ids); err != nil { + return errors.Wrap(err, "adding cols to cache") + } for j, key := range keys { id := ids[j] for _, recordIdx := range b.toTranslateID[key] { b.ids[recordIdx] = id } - b.transCache.AddCol(b.index.Name(), key, id) } } else { keys = make([]string, 0) @@ -286,6 +308,9 @@ func (b *Batch) doTranslation() error { if err != nil { return errors.Wrap(err, "translating row keys") } + if err := b.transCache.AddRows(b.index.Name(), fieldName, keys, ids); err != nil { + return errors.Wrap(err, "adding rows to cache") + } // fill out missing IDs in local batch records with translated IDs rows := b.rowIDs[fieldName] @@ -294,7 +319,6 @@ func (b *Batch) doTranslation() error { for _, recordIdx := range tt[key] { rows[recordIdx] = id } - b.transCache.AddRow(b.index.Name(), fieldName, key, id) } } diff --git a/importbatch_test.go b/importbatch_test.go index eda255f..929ab49 100644 --- a/importbatch_test.go +++ b/importbatch_test.go @@ -29,7 +29,10 @@ func TestBatches(t *testing.T) { t.Logf("problem cleaning up from test: %v", err) } }() - b := NewBatch(client, 10, idx, fields) + b, err := NewBatch(client, 10, idx, fields) + if err != nil { + t.Fatalf("getting new batch: %v", err) + } r := Row{Values: make([]interface{}, 4)} for i := 0; i < 9; i++ { @@ -330,7 +333,10 @@ func TestBatchesStringIDs(t *testing.T) { } }() - b := NewBatch(client, 3, idx, fields) + b, err := NewBatch(client, 3, idx, fields) + if err != nil { + t.Fatalf("getting new batch: %v", err) + } r := Row{Values: make([]interface{}, 1)} diff --git a/translator.go b/translator.go index 35429fb..aef126c 100644 --- a/translator.go +++ b/translator.go @@ -1,12 +1,21 @@ package pilosa -type Translator struct { +type Translator interface { + GetCol(index, key string) (uint64, bool, error) + GetRow(index, field, key string) (uint64, bool, error) + AddCols(index string, keys []string, values []uint64) error + AddRows(index, field string, keys []string, values []uint64) error +} + +// MapTranslator implements Translator using in-memory maps. It is not +// threadsafe. 
+type MapTranslator struct { indexes map[string]map[string]uint64 fields map[indexfield]map[string]uint64 } -func NewTranslator() *Translator { - return &Translator{ +func NewMapTranslator() *MapTranslator { + return &MapTranslator{ indexes: make(map[string]map[string]uint64), fields: make(map[indexfield]map[string]uint64), } @@ -17,38 +26,46 @@ type indexfield struct { field string } -func (t *Translator) GetCol(index, key string) (uint64, bool) { +func (t *MapTranslator) GetCol(index, key string) (uint64, bool, error) { if idx, ok := t.indexes[index]; ok { if val, ok := idx[key]; ok { - return val, true + return val, true, nil } } - return 0, false + return 0, false, nil } -func (t *Translator) AddCol(index, key string, value uint64) { - idx, ok := t.indexes[index] - if !ok { - idx = make(map[string]uint64) +func (t *MapTranslator) AddCols(index string, keys []string, values []uint64) error { + for i := range keys { + key, value := keys[i], values[i] + idxMap, ok := t.indexes[index] + if !ok { + idxMap = make(map[string]uint64) + } + idxMap[key] = value + t.indexes[index] = idxMap } - idx[key] = value - t.indexes[index] = idx + return nil } -func (t *Translator) GetRow(index, field, key string) (uint64, bool) { +func (t *MapTranslator) GetRow(index, field, key string) (uint64, bool, error) { if fld, ok := t.fields[indexfield{index: index, field: field}]; ok { if val, ok := fld[key]; ok { - return val, true + return val, true, nil } } - return 0, false + return 0, false, nil } -func (t *Translator) AddRow(index, field, key string, value uint64) { - keys, ok := t.fields[indexfield{index: index, field: field}] - if !ok { - keys = make(map[string]uint64) +func (t *MapTranslator) AddRows(index, field string, keys []string, values []uint64) error { + for i := range keys { + key, value := keys[i], values[i] + keyMap, ok := t.fields[indexfield{index: index, field: field}] + if !ok { + keyMap = make(map[string]uint64) + } + keyMap[key] = value + t.fields[indexfield{index: index, field: field}] = keyMap } - keys[key] = value - t.fields[indexfield{index: index, field: field}] = keys + return nil } From 6d88b1711228fdfda3c0a1bd7165393589590b4c Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 27 Aug 2019 22:09:37 -0500 Subject: [PATCH 12/26] export client KeyTranslation and ImportRoaring methods Now importbatch only uses exported client stuff. --- client.go | 20 ++++++++++++++++---- client_it_test.go | 4 ++-- importbatch.go | 11 +++-------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/client.go b/client.go index d0cc24d..f60764a 100644 --- a/client.go +++ b/client.go @@ -633,7 +633,7 @@ func (c *Client) translateRecordsRowKeys(rowKeyIDMap *lru.LRU, field *Field, col } if len(keys) > 0 { // translate missing keys - ids, err := c.translateRowKeys(field, keys) + ids, err := c.TranslateRowKeys(field, keys) if err != nil { return err } @@ -670,7 +670,7 @@ func (c *Client) translateRecordsColumnKeys(columnKeyIDMap *lru.LRU, index *Inde } if len(keys) > 0 { // translate missing keys - ids, err := c.translateColumnKeys(index, keys) + ids, err := c.TranslateColumnKeys(index, keys) if err != nil { return err } @@ -837,6 +837,18 @@ func (c *Client) importData(uri *URI, path string, data []byte) error { return nil } +// ImportRoaringBitmap can import pre-made bitmaps for a number of +// different views into the given field/shard. If the view name in the +// map is an empty string, the standard view will be used. 
+func (c *Client) ImportRoaringBitmap(field *Field, shard uint64, views map[string]*roaring.Bitmap, clear bool) error { + uris, err := c.getURIsForShard(field.index.Name(), shard) + if err != nil { + return errors.Wrap(err, "getting URIs for import") + } + err = c.importRoaringBitmap(uris[0], field, shard, views, &ImportOptions{clear: clear}) + return errors.Wrap(err, "importing bitmap") +} + func (c *Client) importRoaringBitmap(uri *URI, field *Field, shard uint64, views viewImports, options *ImportOptions) error { protoViews := []*pbuf.ImportRoaringRequestView{} for name, bmp := range views { @@ -1156,7 +1168,7 @@ func (c *Client) augmentHeaders(headers map[string]string) map[string]string { return headers } -func (c *Client) translateRowKeys(field *Field, keys []string) ([]uint64, error) { +func (c *Client) TranslateRowKeys(field *Field, keys []string) ([]uint64, error) { req := &pbuf.TranslateKeysRequest{ Index: field.index.name, Field: field.name, @@ -1165,7 +1177,7 @@ func (c *Client) translateRowKeys(field *Field, keys []string) ([]uint64, error) return c.translateKeys(req, keys) } -func (c *Client) translateColumnKeys(index *Index, keys []string) ([]uint64, error) { +func (c *Client) TranslateColumnKeys(index *Index, keys []string) ([]uint64, error) { req := &pbuf.TranslateKeysRequest{ Index: index.name, Keys: keys, diff --git a/client_it_test.go b/client_it_test.go index aded5a8..a597267 100644 --- a/client_it_test.go +++ b/client_it_test.go @@ -2485,7 +2485,7 @@ func TestTranslateRowKeys(t *testing.T) { if err != nil { t.Fatal(err) } - rowIDs, err := client.translateRowKeys(field, []string{"key1", "key2"}) + rowIDs, err := client.TranslateRowKeys(field, []string{"key1", "key2"}) if err != nil { t.Fatal(err) } @@ -2506,7 +2506,7 @@ func TestTranslateColKeys(t *testing.T) { if err != nil { t.Fatal(err) } - colIDs, err := client.translateColumnKeys(keysIndex, []string{"ten", "one-thousand"}) + colIDs, err := client.TranslateColumnKeys(keysIndex, []string{"ten", "one-thousand"}) if err != nil { t.Fatal(err) } diff --git a/importbatch.go b/importbatch.go index 555e72f..8b53710 100644 --- a/importbatch.go +++ b/importbatch.go @@ -273,7 +273,7 @@ func (b *Batch) doTranslation() error { for k := range b.toTranslateID { keys = append(keys, k) } - ids, err := b.client.translateColumnKeys(b.index, keys) + ids, err := b.client.TranslateColumnKeys(b.index, keys) if err != nil { return errors.Wrap(err, "translating col keys") } @@ -304,7 +304,7 @@ func (b *Batch) doTranslation() error { } // translate keys from Pilosa - ids, err := b.client.translateRowKeys(b.headerMap[fieldName], keys) + ids, err := b.client.TranslateRowKeys(b.headerMap[fieldName], keys) if err != nil { return errors.Wrap(err, "translating row keys") } @@ -335,16 +335,11 @@ func (b *Batch) doImport() error { frags := b.makeFragments() for shard, viewMap := range frags { - uris, err := b.client.getURIsForShard(b.index.Name(), shard) - uri := uris[0] - if err != nil { - return errors.Wrap(err, "getting uris for shard") - } for fieldView, bitmap := range viewMap { fieldView := fieldView bitmap := bitmap eg.Go(func() error { - err := b.client.importRoaringBitmap(uri, b.index.Field(fieldView.field), shard, map[string]*roaring.Bitmap{"": bitmap}, &ImportOptions{}) + err := b.client.ImportRoaringBitmap(b.index.Field(fieldView.field), shard, map[string]*roaring.Bitmap{"": bitmap}, false) return errors.Wrapf(err, "importing data for %s", fieldView.field) }) } From 994cf36c6e2379624c1a091e594810166b95d156 Mon Sep 17 00:00:00 2001 From: 
Matt Jaffee Date: Tue, 27 Aug 2019 22:28:09 -0500 Subject: [PATCH 13/26] move batch stuff to subpackage --- cmd/picsv/main.go | 7 +++-- importbatch.go => gpexp/importbatch.go | 29 ++++++++++--------- .../importbatch_test.go | 27 ++++++++--------- translator.go => gpexp/translator.go | 2 +- orm.go | 4 +++ 5 files changed, 38 insertions(+), 31 deletions(-) rename importbatch.go => gpexp/importbatch.go (95%) rename importbatch_test.go => gpexp/importbatch_test.go (95%) rename translator.go => gpexp/translator.go (99%) diff --git a/cmd/picsv/main.go b/cmd/picsv/main.go index 7fbc662..ba8b928 100644 --- a/cmd/picsv/main.go +++ b/cmd/picsv/main.go @@ -12,6 +12,7 @@ import ( "github.com/jaffee/commandeer" "github.com/pilosa/go-pilosa" + "github.com/pilosa/go-pilosa/gpexp" "github.com/pkg/errors" ) @@ -87,11 +88,11 @@ func (m *Main) Run() error { // this has a non-obvious dependence on the previous line... the fields are set up in the index which comes from the schema client.SyncSchema(schema) - batch, err := pilosa.NewBatch(client, m.BatchSize, index, fields) + batch, err := gpexp.NewBatch(client, m.BatchSize, index, fields) if err != nil { return errors.Wrap(err, "getting new batch") } - record := pilosa.Row{ + record := gpexp.Row{ Values: make([]interface{}, len(header)), } @@ -107,7 +108,7 @@ func (m *Main) Run() error { } } err := batch.Add(record) - if err == pilosa.ErrBatchNowFull { + if err == gpexp.ErrBatchNowFull { err := batch.Import() if err != nil { return errors.Wrap(err, "importing") diff --git a/importbatch.go b/gpexp/importbatch.go similarity index 95% rename from importbatch.go rename to gpexp/importbatch.go index 8b53710..d4a1150 100644 --- a/importbatch.go +++ b/gpexp/importbatch.go @@ -1,6 +1,7 @@ -package pilosa +package gpexp import ( + "github.com/pilosa/go-pilosa" "github.com/pilosa/pilosa/roaring" "github.com/pkg/errors" "golang.org/x/sync/errgroup" @@ -50,10 +51,10 @@ type RecordBatch interface { // // nil values are ignored. type Batch struct { - client *Client - index *Index - header []*Field - headerMap map[string]*Field + client *pilosa.Client + index *pilosa.Index + header []*pilosa.Field + headerMap map[string]*pilosa.Field // ids is a slice of length batchSize of record IDs ids []uint64 @@ -101,11 +102,11 @@ func OptTranslator(t Translator) BatchOption { // before returning ErrBatchNowFull. The positions of the Fields in // 'fields' correspond to the positions of values in the Row's Values // passed to Batch.Add(). 
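// Illustrative sketch (not part of this patch): assuming "color" is a
// keyed set field and "age" an int field,
//
//	fields := []*pilosa.Field{
//		idx.Field("color", pilosa.OptFieldKeys(true)),
//		idx.Field("age", pilosa.OptFieldTypeInt()),
//	}
//	rec := Row{ID: uint64(1), Values: []interface{}{"red", int64(34)}}
//
// Values[0] ("red") lands in fields[0] ("color") and Values[1] (34) in
// fields[1] ("age") when rec is passed to Batch.Add.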
-func NewBatch(client *Client, size int, index *Index, fields []*Field, opts ...BatchOption) (*Batch, error) { +func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pilosa.Field, opts ...BatchOption) (*Batch, error) { if len(fields) == 0 || size == 0 { return nil, errors.New("can't batch with no fields or batch size") } - headerMap := make(map[string]*Field, len(fields)) + headerMap := make(map[string]*pilosa.Field, len(fields)) rowIDs := make(map[string][]uint64) values := make(map[string][]int64) tt := make(map[string]map[string][]int) @@ -113,12 +114,12 @@ func NewBatch(client *Client, size int, index *Index, fields []*Field, opts ...B headerMap[field.Name()] = field opts := field.Opts() switch opts.Type() { - case FieldTypeDefault, FieldTypeSet: + case pilosa.FieldTypeDefault, pilosa.FieldTypeSet: if opts.Keys() { tt[field.Name()] = make(map[string][]int) } rowIDs[field.Name()] = make([]uint64, 0, size) - case FieldTypeInt: + case pilosa.FieldTypeInt: values[field.Name()] = make([]int64, 0, size) } } @@ -211,7 +212,7 @@ func (b *Batch) Add(rec Row) error { case int64: b.values[field.Name()] = append(b.values[field.Name()], val) case nil: - if field.Opts().Type() == FieldTypeInt { + if field.Opts().Type() == pilosa.FieldTypeInt { b.values[field.Name()] = append(b.values[field.Name()], 0) clearIndexes, ok := b.clearValues[field.Name()] if !ok { @@ -357,9 +358,9 @@ func (b *Batch) doImport() error { var nilSentinel = ^uint64(0) func (b *Batch) makeFragments() fragments { - shardWidth := b.index.shardWidth + shardWidth := b.index.ShardWidth() if shardWidth == 0 { - shardWidth = DefaultShardWidth + shardWidth = pilosa.DefaultShardWidth } frags := make(fragments) for fname, rowIDs := range b.rowIDs { @@ -381,9 +382,9 @@ func (b *Batch) makeFragments() fragments { } func (b *Batch) importValueData() error { - shardWidth := b.index.shardWidth + shardWidth := b.index.ShardWidth() if shardWidth == 0 { - shardWidth = DefaultShardWidth + shardWidth = pilosa.DefaultShardWidth } eg := errgroup.Group{} diff --git a/importbatch_test.go b/gpexp/importbatch_test.go similarity index 95% rename from importbatch_test.go rename to gpexp/importbatch_test.go index 929ab49..f9ac887 100644 --- a/importbatch_test.go +++ b/gpexp/importbatch_test.go @@ -1,24 +1,25 @@ -package pilosa +package gpexp import ( "reflect" "strconv" "testing" + "github.com/pilosa/go-pilosa" "github.com/pkg/errors" ) // TODO test against cluster func TestBatches(t *testing.T) { - client := DefaultClient() - schema := NewSchema() + client := pilosa.DefaultClient() + schema := pilosa.NewSchema() idx := schema.Index("gopilosatest-blah") - fields := make([]*Field, 4) - fields[0] = idx.Field("zero", OptFieldKeys(true)) - fields[1] = idx.Field("one", OptFieldKeys(true)) - fields[2] = idx.Field("two", OptFieldKeys(true)) - fields[3] = idx.Field("three", OptFieldTypeInt()) + fields := make([]*pilosa.Field, 4) + fields[0] = idx.Field("zero", pilosa.OptFieldKeys(true)) + fields[1] = idx.Field("one", pilosa.OptFieldKeys(true)) + fields[2] = idx.Field("two", pilosa.OptFieldKeys(true)) + fields[3] = idx.Field("three", pilosa.OptFieldTypeInt()) err := client.SyncSchema(schema) if err != nil { t.Fatalf("syncing schema: %v", err) @@ -317,11 +318,11 @@ func TestBatches(t *testing.T) { } func TestBatchesStringIDs(t *testing.T) { - client := DefaultClient() - schema := NewSchema() - idx := schema.Index("gopilosatest-blah", OptIndexKeys(true)) - fields := make([]*Field, 1) - fields[0] = idx.Field("zero", OptFieldKeys(true)) + client 
:= pilosa.DefaultClient() + schema := pilosa.NewSchema() + idx := schema.Index("gopilosatest-blah", pilosa.OptIndexKeys(true)) + fields := make([]*pilosa.Field, 1) + fields[0] = idx.Field("zero", pilosa.OptFieldKeys(true)) err := client.SyncSchema(schema) if err != nil { t.Fatalf("syncing schema: %v", err) diff --git a/translator.go b/gpexp/translator.go similarity index 99% rename from translator.go rename to gpexp/translator.go index aef126c..74785e5 100644 --- a/translator.go +++ b/gpexp/translator.go @@ -1,4 +1,4 @@ -package pilosa +package gpexp type Translator interface { GetCol(index, key string) (uint64, bool, error) diff --git a/orm.go b/orm.go index 465f74e..7dc3480 100644 --- a/orm.go +++ b/orm.go @@ -412,6 +412,10 @@ func NewIndex(name string) *Index { } } +func (idx *Index) ShardWidth() uint64 { + return idx.shardWidth +} + // Fields return a copy of the fields in this index func (idx *Index) Fields() map[string]*Field { result := make(map[string]*Field) From a93000430cbb5b26a9d7ee22333f1c4cd2b843a3 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Wed, 28 Aug 2019 08:02:31 -0500 Subject: [PATCH 14/26] update circle CI to Go 1.12/13-rc --- .circleci/config.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4b53161..b48d376 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 defaults: &defaults working_directory: /go/src/github.com/pilosa/go-pilosa docker: - - image: circleci/golang:1.11 + - image: circleci/golang:1.12 environment: GO111MODULE: "on" fast-checkout: &fast-checkout @@ -30,18 +30,18 @@ jobs: - *fast-checkout - run: make install-gometalinter - run: make gometalinter - test-golang-1.12-rc: &base-test + test-golang-1.13-rc: &base-test <<: *defaults steps: - *fast-checkout - run: make test-all docker: - - image: circleci/golang:1.12-rc + - image: circleci/golang:1.13-rc - image: pilosa/pilosa:master - test-golang-1.11: + test-golang-1.12: <<: *base-test docker: - - image: circleci/golang:1.11 + - image: circleci/golang:1.12 - image: pilosa/pilosa:master workflows: version: 2 @@ -51,9 +51,9 @@ workflows: - linter: requires: - build - - test-golang-1.12-rc: + - test-golang-1.13-rc: requires: - build - - test-golang-1.11: + - test-golang-1.12: requires: - build From 8e2f334f484f6de92d6b4a46b222a7fc84a82f12 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Wed, 28 Aug 2019 08:48:44 -0500 Subject: [PATCH 15/26] use shardnode cluster change detector w/ cleanup --- client.go | 48 ++++++++++------------------ client_internal_it_test.go | 2 +- client_it_test.go | 65 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 32 deletions(-) diff --git a/client.go b/client.go index f60764a..5d84220 100644 --- a/client.go +++ b/client.go @@ -90,37 +90,11 @@ type Client struct { importLogEncoder encoder logLock sync.Mutex - // TODO shardNodes needs to be invalidated/updated when cluster topology changes. 
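	// done (added below) pairs with tick: Close stops the ticker and
	// closes done so that the change-detection goroutine started in
	// newClientWithOptions exits instead of leaking with every client.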
shardNodes shardNodes tick *time.Ticker + done chan struct{} } -// func (c *Client) translateCol(index, key string) (uint64, bool) { -// c.tlock.RLock() -// v, b := c.translator.GetCol(index, key) -// c.tlock.RUnlock() -// return v, b -// } - -// func (c *Client) translateRow(index, field, key string) (uint64, bool) { -// c.tlock.RLock() -// v, b := c.translator.GetRow(index, field, key) -// c.tlock.RUnlock() -// return v, b -// } - -// func (c *Client) addTranslateCol(index, key string, value uint64) { -// c.tlock.Lock() -// c.translator.AddCol(index, key, value) -// c.tlock.Unlock() -// } - -// func (c *Client) addTranslateRow(index, field, key string, value uint64) { -// c.tlock.Lock() -// c.translator.AddRow(index, field, key, value) -// c.tlock.Unlock() -// } - func (c *Client) getURIsForShard(index string, shard uint64) ([]*URI, error) { uris, ok := c.shardNodes.Get(index, shard) if ok { @@ -139,13 +113,22 @@ func (c *Client) getURIsForShard(index string, shard uint64) ([]*URI, error) { } func (c *Client) runChangeDetection() { - c.tick = time.NewTicker(time.Minute) - - for range c.tick.C { - c.detectClusterChanges() + for { + select { + case <-c.tick.C: + c.detectClusterChanges() + case <-c.done: + return + } } } +func (c *Client) Close() error { + c.tick.Stop() + close(c.done) + return nil +} + // detectClusterChanges chooses a random index and shard from the // shardNodes cache and deletes it. It then looks it up from Pilosa to // see if it still matches, and if not it drops the whole cache. @@ -232,6 +215,8 @@ func newClientWithOptions(options *ClientOptions) *Client { coordinatorLock: &sync.RWMutex{}, shardNodes: newShardNodes(), + tick: time.NewTicker(time.Minute), + done: make(chan struct{}, 0), } if options.importLogWriter != nil { c.importLogEncoder = newImportLogEncoder(options.importLogWriter) @@ -245,6 +230,7 @@ func newClientWithOptions(options *ClientOptions) *Client { c.minRetrySleepTime = 1 * time.Second c.maxRetrySleepTime = 2 * time.Minute c.importManager = newRecordImportManager(c) + go c.runChangeDetection() return c } diff --git a/client_internal_it_test.go b/client_internal_it_test.go index 2471b90..3baf7dd 100644 --- a/client_internal_it_test.go +++ b/client_internal_it_test.go @@ -174,7 +174,7 @@ func TestImportWithReplayErrors(t *testing.T) { func TestDetectClusterChanges(t *testing.T) { c := getClient() - + defer c.Close() c.shardNodes.data["blah"] = make(map[uint64][]*URI) c.shardNodes.data["blah"][1] = []*URI{{scheme: "zzz"}} diff --git a/client_it_test.go b/client_it_test.go index a597267..a58ab01 100644 --- a/client_it_test.go +++ b/client_it_test.go @@ -77,10 +77,12 @@ func Setup() { if err != nil { panic(err) } + _ = client.Close() } func TearDown() { client := getClient() + defer client.Close() err := client.DeleteIndex(index) if err != nil { panic(err) @@ -93,6 +95,7 @@ func TearDown() { func Reset() { client := getClient() + defer client.Close() client.DeleteIndex(index) Setup() } @@ -106,6 +109,7 @@ func TestCreateDefaultClient(t *testing.T) { func TestClientReturnsResponse(t *testing.T) { client := getClient() + defer client.Close() response, err := client.Query(testField.Row(1)) if err != nil { t.Fatalf("Error querying: %s", err) @@ -119,6 +123,7 @@ func TestQueryWithShards(t *testing.T) { Reset() const shardWidth = 1048576 client := getClient() + defer client.Close() if _, err := client.Query(testField.Set(1, 100)); err != nil { t.Fatal(err) } @@ -141,6 +146,7 @@ func TestQueryWithShards(t *testing.T) { func TestQueryWithColumns(t *testing.T) { 
Reset() client := getClient() + defer client.Close() targetAttrs := map[string]interface{}{ "name": "some string", "age": int64(95), @@ -184,6 +190,7 @@ func TestQueryWithColumns(t *testing.T) { func TestSetRowAttrs(t *testing.T) { Reset() client := getClient() + defer client.Close() targetAttrs := map[string]interface{}{ "name": "some string", "age": int64(95), @@ -209,6 +216,7 @@ func TestSetRowAttrs(t *testing.T) { func TestOrmCount(t *testing.T) { client := getClient() + defer client.Close() countField := index.Field("count-test") err := client.EnsureField(countField) if err != nil { @@ -231,6 +239,7 @@ func TestOrmCount(t *testing.T) { func TestIntersectReturns(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("segments") err := client.EnsureField(field) if err != nil { @@ -258,6 +267,7 @@ func TestIntersectReturns(t *testing.T) { func TestTopNReturns(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("topn_test") err := client.EnsureField(field) if err != nil { @@ -309,6 +319,7 @@ func TestTopNReturns(t *testing.T) { func TestMinMaxRow(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("test-minmaxrow-field") err := client.EnsureField(field) if err != nil { @@ -347,6 +358,7 @@ func TestMinMaxRow(t *testing.T) { func TestSetMutexField(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("mutex-test", OptFieldTypeMutex(CacheTypeDefault, 0)) err := client.EnsureField(field) if err != nil { @@ -391,6 +403,7 @@ func TestSetMutexField(t *testing.T) { func TestSetBoolField(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("bool-test", OptFieldTypeBool()) err := client.EnsureField(field) if err != nil { @@ -414,6 +427,7 @@ func TestSetBoolField(t *testing.T) { func TestClearRowQuery(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("clear-row-test") err := client.EnsureField(field) if err != nil { @@ -452,6 +466,7 @@ func TestClearRowQuery(t *testing.T) { func TestRowsQuery(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("rows-test") err := client.EnsureField(field) if err != nil { @@ -480,6 +495,7 @@ func TestRowsQuery(t *testing.T) { func TestGroupByQuery(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("group-by-test") err := client.EnsureField(field) if err != nil { @@ -509,6 +525,7 @@ func TestGroupByQuery(t *testing.T) { func TestCreateDeleteIndexField(t *testing.T) { client := getClient() + defer client.Close() index1 := NewIndex("to-be-deleted") field1 := index1.Field("foo") err := client.CreateIndex(index1) @@ -531,6 +548,7 @@ func TestCreateDeleteIndexField(t *testing.T) { func TestEnsureIndexExists(t *testing.T) { client := getClient() + defer client.Close() err := client.EnsureIndex(index) if err != nil { t.Fatal(err) @@ -539,6 +557,7 @@ func TestEnsureIndexExists(t *testing.T) { func TestEnsureFieldExists(t *testing.T) { client := getClient() + defer client.Close() err := client.EnsureField(testField) if err != nil { t.Fatal(err) @@ -547,6 +566,7 @@ func TestEnsureFieldExists(t *testing.T) { func TestCreateFieldWithTimeQuantum(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("field-with-timequantum", OptFieldTypeTime(TimeQuantumYear)) err := client.CreateField(field) if err != nil { @@ -556,6 +576,7 @@ func TestCreateFieldWithTimeQuantum(t *testing.T) { func TestErrorCreatingIndex(t *testing.T) { 
client := getClient() + defer client.Close() err := client.CreateIndex(index) if err == nil { t.Fatal() @@ -564,6 +585,7 @@ func TestErrorCreatingIndex(t *testing.T) { func TestErrorCreatingField(t *testing.T) { client := getClient() + defer client.Close() err := client.CreateField(testField) if err == nil { t.Fatal() @@ -572,6 +594,7 @@ func TestErrorCreatingField(t *testing.T) { func TestIndexAlreadyExists(t *testing.T) { client := getClient() + defer client.Close() err := client.CreateIndex(index) if err != ErrIndexExists { t.Fatal(err) @@ -620,6 +643,7 @@ func TestQueryFailsIfAddressNotResolved(t *testing.T) { func TestQueryFails(t *testing.T) { client := getClient() + defer client.Close() _, err := client.Query(index.RawQuery("Invalid query")) if err == nil { t.Fatal() @@ -628,6 +652,7 @@ func TestQueryFails(t *testing.T) { func TestInvalidHttpRequest(t *testing.T) { client := getClient() + defer client.Close() _, _, err := client.httpRequest("INVALID METHOD", "/foo", nil, nil, false) if err == nil { t.Fatal() @@ -664,6 +689,7 @@ func TestResponseNotRead(t *testing.T) { func TestSchema(t *testing.T) { client := getClient() + defer client.Close() schema, err := client.Schema() if err != nil { t.Fatal(err) @@ -712,6 +738,7 @@ func TestSchema(t *testing.T) { func TestSync(t *testing.T) { client := getClient() + defer client.Close() remoteIndex := NewIndex("remote-index-1") err := client.EnsureIndex(remoteIndex) if err != nil { @@ -813,6 +840,7 @@ func NewGivenColumnGenerator(recs []Record) *GivenColumnGenerator { func TestImportWithBatchSize(t *testing.T) { client := getClient() + defer client.Close() // the first iterator for creating the target iterator := &ColumnGenerator{numRows: 10, numColumns: 1000} target := map[uint64][]uint64{} @@ -853,6 +881,7 @@ func TestImportWithBatchSize(t *testing.T) { func TestImportValues(t *testing.T) { client := getClient() + defer client.Close() schema, err := client.Schema() if err != nil { t.Fatalf("getting schema: %v", err) @@ -885,6 +914,7 @@ func TestImportValues(t *testing.T) { func TestImportWithBatchSizeExpectingZero(t *testing.T) { const shardWidth = 1048576 client := getClient() + defer client.Close() iterator := NewGivenColumnGenerator( []Record{ @@ -925,6 +955,7 @@ func failingImportColumns(field *Field, shard uint64, records []Record, nodes [] func TestImportWithBatchSizeFails(t *testing.T) { client := getClient() + defer client.Close() iterator := &ColumnGenerator{numRows: 10, numColumns: 1000} field := index.Field("importfield-batchsize") err := client.EnsureField(field) @@ -978,6 +1009,7 @@ func TestExportReaderReadBodyFailure(t *testing.T) { func TestFetchFragmentNodes(t *testing.T) { client := getClient() + defer client.Close() nodes, err := client.fetchFragmentNodes(index.Name(), 0) if err != nil { t.Fatal(err) @@ -997,6 +1029,7 @@ func TestFetchFragmentNodes(t *testing.T) { func TestFetchStatus(t *testing.T) { client := getClient() + defer client.Close() status, err := client.Status() if err != nil { t.Fatal(err) @@ -1008,6 +1041,7 @@ func TestFetchStatus(t *testing.T) { func TestFetchInfo(t *testing.T) { client := getClient() + defer client.Close() info, err := client.Info() if err != nil { t.Fatal(err) @@ -1031,6 +1065,7 @@ func TestFetchInfo(t *testing.T) { func TestRowRangeQuery(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("test-rowrangefield", OptFieldTypeTime(TimeQuantumMonthDayHour)) err := client.EnsureField(field) if err != nil { @@ -1058,6 +1093,7 @@ func TestRowRangeQuery(t 
*testing.T) { func TestRangeField(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("rangefield", OptFieldTypeInt()) field2 := index.Field("rangefield-set") err := client.EnsureField(field) @@ -1092,6 +1128,7 @@ func TestRangeField(t *testing.T) { func TestRangeField2(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("rangefield", OptFieldTypeInt(10, 20)) field2 := index.Field("rangefield-set") err := client.EnsureField(field) @@ -1139,6 +1176,7 @@ func TestRangeField2(t *testing.T) { func TestNotQuery(t *testing.T) { client := getClient() + defer client.Close() index := schema.Index("not-query-index", OptIndexTrackExistence(true)) field := index.Field("not-field") err := client.SyncSchema(schema) @@ -1166,6 +1204,7 @@ func TestNotQuery(t *testing.T) { func TestStoreQuery(t *testing.T) { client := getClient() + defer client.Close() schema := NewSchema() index := schema.Index("store-test") fromField := index.Field("x-from-field") @@ -1196,6 +1235,7 @@ func TestStoreQuery(t *testing.T) { func TestExcludeAttrsColumns(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("excludecolumnsattrsfield") err := client.EnsureField(field) if err != nil { @@ -1239,6 +1279,7 @@ func TestExcludeAttrsColumns(t *testing.T) { func TestMultipleClientKeyQuery(t *testing.T) { client := getClient() + defer client.Close() field := keysIndex.Field("multiple-client-field") err := client.EnsureField(field) if err != nil { @@ -1416,6 +1457,7 @@ func TestStatusUnmarshalFails(t *testing.T) { func TestStatusToNodeShardsForIndex(t *testing.T) { client := getClient() + defer client.Close() status := Status{ Nodes: []StatusNode{ { @@ -1444,6 +1486,7 @@ func TestStatusToNodeShardsForIndex(t *testing.T) { func TestHttpRequest(t *testing.T) { client := getClient() + defer client.Close() _, _, err := client.HttpRequest("GET", "/status", nil, nil) if err != nil { t.Fatal(err) @@ -1638,6 +1681,7 @@ func TestServerWarning(t *testing.T) { func TestRowIDColumnIDImport(t *testing.T) { client := getClient() + defer client.Close() iterator := newTestIterator() field := index.Field("importfield-rowid-colid") err := client.EnsureField(field) @@ -1700,6 +1744,7 @@ func TestRowIDColumnIDImport(t *testing.T) { func TestRowIDColumnIDImportTimestamp(t *testing.T) { client := getClient() + defer client.Close() iterator := newTestIteratorWithTimestamp() field := index.Field("importfield-csv-rowid-colid-time", OptFieldTypeTime(TimeQuantumYearMonthDayHour)) err := client.EnsureField(field) @@ -1774,6 +1819,7 @@ func TestRowIDColumnIDImportTimestamp(t *testing.T) { func TestRowIDColumnIDImportManualAddress(t *testing.T) { client := getClientManualAddress() + defer client.Close() iterator := newTestIterator() field := index.Field("importfield-rowid-colid") err := client.EnsureField(field) @@ -1836,6 +1882,7 @@ func TestRowIDColumnIDImportManualAddress(t *testing.T) { func TestRowIDColumnIDImportRoaring(t *testing.T) { client := getClient() + defer client.Close() iterator := newTestIterator() field := index.Field("importfield-rowid-colid") err := client.EnsureField(field) @@ -1899,6 +1946,7 @@ func TestRowIDColumnIDImportRoaring(t *testing.T) { func TestRowIDColumnIDTimestampImportRoaring(t *testing.T) { client := getClient() + defer client.Close() iterator := newTestIteratorWithTimestamp() field := index.Field("importfield-rowid-colid-time", OptFieldTypeTime(TimeQuantumYearMonthDayHour)) err := client.EnsureField(field) @@ -1973,6 +2021,7 @@ func 
TestRowIDColumnIDTimestampImportRoaring(t *testing.T) { func TestRowIDColumnIDTimestampImportRoaringNoStandardView(t *testing.T) { client := getClient() + defer client.Close() iterator := newTestIteratorWithTimestamp() field := index.Field("importfield-rowid-colid-time-nostd", OptFieldTypeTime(TimeQuantumMonthDayHour, true)) err := client.EnsureField(field) @@ -2066,6 +2115,7 @@ func TestRowIDColumnIDImportFailsRoaring(t *testing.T) { func TestCSVRowIDColumnKeyImport(t *testing.T) { client := getClient() + defer client.Close() iterator := NewArrayRecordIterator([]Record{ Column{RowID: 10, ColumnKey: "five"}, Column{RowID: 2, ColumnKey: "three"}, @@ -2108,6 +2158,7 @@ func TestCSVRowIDColumnKeyImport(t *testing.T) { func TestCSVRowIDColumnKeyImportManualAddress(t *testing.T) { client := getClientManualAddress() + defer client.Close() iterator := NewArrayRecordIterator([]Record{ Column{RowID: 10, ColumnKey: "five"}, Column{RowID: 2, ColumnKey: "three"}, @@ -2166,6 +2217,7 @@ func TestRowIDColumnKeyImportFails(t *testing.T) { func TestRowKeyColumnIDImport(t *testing.T) { client := getClient() + defer client.Close() iterator := NewArrayRecordIterator([]Record{ Column{RowKey: "ten", ColumnID: 7}, Column{RowKey: "ten", ColumnID: 5}, @@ -2208,6 +2260,7 @@ func TestRowKeyColumnIDImport(t *testing.T) { func TestRowKeyColumnKeyImport(t *testing.T) { client := getClient() + defer client.Close() iterator := NewArrayRecordIterator([]Record{ Column{RowKey: "ten", ColumnKey: "five"}, Column{RowKey: "two", ColumnKey: "three"}, @@ -2249,6 +2302,7 @@ func TestRowKeyColumnKeyImport(t *testing.T) { func TestRowKeyColumnKeyImportRoaring(t *testing.T) { client := getClient() + defer client.Close() iterator := NewArrayRecordIterator([]Record{ Column{RowKey: "ten", ColumnKey: "five"}, Column{RowKey: "two", ColumnKey: "three"}, @@ -2296,6 +2350,7 @@ func TestValueFieldImport(t *testing.T) { }) } client := getClient() + defer client.Close() iterator := newIterator() field := index.Field("importvaluefield", OptFieldTypeInt(0, 100)) err := client.EnsureField(field) @@ -2346,6 +2401,7 @@ func TestValueFieldImport(t *testing.T) { func TestValueFieldWithKeysImport(t *testing.T) { client := getClient() + defer client.Close() iterator := NewArrayRecordIterator([]Record{ FieldValue{ColumnKey: "ten", Value: 7}, FieldValue{ColumnKey: "seven", Value: 1}, @@ -2384,6 +2440,7 @@ func TestValueFieldWithKeysImport(t *testing.T) { func TestExportRowIDColumnID(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("exportfield-rowid-colid") client.EnsureField(field) _, err := client.Query(index.BatchQuery( @@ -2407,6 +2464,7 @@ func TestExportRowIDColumnID(t *testing.T) { func TestExportRowIDColumnKey(t *testing.T) { client := getClient() + defer client.Close() field := keysIndex.Field("exportfield-rowid-colkey") client.EnsureField(field) _, err := client.Query(keysIndex.BatchQuery( @@ -2430,6 +2488,7 @@ func TestExportRowIDColumnKey(t *testing.T) { func TestExportRowKeyColumnID(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("exportfield-rowkey-colid", OptFieldKeys(true)) client.EnsureField(field) _, err := client.Query(index.BatchQuery( @@ -2453,6 +2512,7 @@ func TestExportRowKeyColumnID(t *testing.T) { func TestExportRowKeyColumnKey(t *testing.T) { client := getClient() + defer client.Close() field := keysIndex.Field("exportfield-rowkey-colkey", OptFieldKeys(true)) client.EnsureField(field) _, err := client.Query(keysIndex.BatchQuery( @@ -2476,6 +2536,7 @@ func 
TestExportRowKeyColumnKey(t *testing.T) { func TestTranslateRowKeys(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("translate-rowkeys", OptFieldKeys(true)) client.EnsureField(field) _, err := client.Query(index.BatchQuery( @@ -2497,6 +2558,7 @@ func TestTranslateRowKeys(t *testing.T) { func TestTranslateColKeys(t *testing.T) { client := getClient() + defer client.Close() field := keysIndex.Field("translate-colkeys") client.EnsureField(field) _, err := client.Query(keysIndex.BatchQuery( @@ -2529,6 +2591,7 @@ func TestCSVExportFailure(t *testing.T) { func TestImportColumnIteratorError(t *testing.T) { client := getClient() + defer client.Close() field := index.Field("not-important") iterator := &BrokenRecordIterator{} err := client.ImportField(field, iterator) @@ -2541,6 +2604,7 @@ func TestErrorReturningImportOption(t *testing.T) { iterator := NewArrayRecordIterator([]Record{}) field := index.Field("importfield") client := getClient() + defer client.Close() optionErr := errors.New("ERR") err := client.ImportField(field, iterator, ErrorImportOption(optionErr)) if err != optionErr { @@ -2550,6 +2614,7 @@ func TestErrorReturningImportOption(t *testing.T) { func TestImportColumnsNoNodesError(t *testing.T) { client := getClient() + defer client.Close() field := &Field{ index: &Index{ options: &IndexOptions{}, From c2b019c94ab7aef2adbb800229b89432812e4461 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Sat, 31 Aug 2019 16:06:35 -0500 Subject: [PATCH 16/26] remove picsv command (moving to pdk) --- cmd/picsv/.gitignore | 2 - cmd/picsv/Makefile | 3 - cmd/picsv/main.go | 323 ------------------------------- cmd/picsv/main_internal_test.go | 20 -- cmd/picsv/main_test.go | 331 -------------------------------- cmd/picsv/testdata/sample.csv | 8 - gpexp/importbatch.go | 2 +- 7 files changed, 1 insertion(+), 688 deletions(-) delete mode 100644 cmd/picsv/.gitignore delete mode 100644 cmd/picsv/Makefile delete mode 100644 cmd/picsv/main.go delete mode 100644 cmd/picsv/main_internal_test.go delete mode 100644 cmd/picsv/main_test.go delete mode 100644 cmd/picsv/testdata/sample.csv diff --git a/cmd/picsv/.gitignore b/cmd/picsv/.gitignore deleted file mode 100644 index d9f1f6e..0000000 --- a/cmd/picsv/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -marketing-*.csv -config.json diff --git a/cmd/picsv/Makefile b/cmd/picsv/Makefile deleted file mode 100644 index d8276e1..0000000 --- a/cmd/picsv/Makefile +++ /dev/null @@ -1,3 +0,0 @@ - -bench: - GO111MODULE=on go test -bench=. 
-run=ZZZ -benchtime=3x diff --git a/cmd/picsv/main.go b/cmd/picsv/main.go deleted file mode 100644 index ba8b928..0000000 --- a/cmd/picsv/main.go +++ /dev/null @@ -1,323 +0,0 @@ -package main - -import ( - "encoding/csv" - "encoding/json" - "fmt" - "io" - "log" - "os" - "strconv" - "time" - - "github.com/jaffee/commandeer" - "github.com/pilosa/go-pilosa" - "github.com/pilosa/go-pilosa/gpexp" - "github.com/pkg/errors" -) - -type Main struct { - Pilosa []string - File string - Index string - BatchSize int - ConfigFile string - - Config *Config `flag:"-"` -} - -func NewMain() *Main { - return &Main{ - Pilosa: []string{"localhost:10101"}, - File: "data.csv", - Index: "picsvtest", - BatchSize: 1000, - - Config: NewConfig(), - } -} - -func (m *Main) Run() error { - start := time.Now() - - // Load Config File (if available) - if m.ConfigFile != "" { - f, err := os.Open(m.ConfigFile) - if err != nil { - return errors.Wrap(err, "opening config file") - } - dec := json.NewDecoder(f) - err = dec.Decode(m.Config) - if err != nil { - return errors.Wrap(err, "decoding config file") - } - } - log.Printf("Flags: %+v\n", *m) - log.Printf("Config: %+v\n", *m.Config) - - f, err := os.Open(m.File) - if err != nil { - return errors.Wrap(err, "opening file") - } - defer f.Close() - reader := csv.NewReader(f) - - client, err := pilosa.NewClient(m.Pilosa) - if err != nil { - return errors.Wrap(err, "getting pilosa client") - } - schema, err := client.Schema() - if err != nil { - return errors.Wrap(err, "getting schema") - } - opts := []pilosa.IndexOption{} - if m.Config.IDField != "" { - opts = append(opts, pilosa.OptIndexKeys(true)) - } - index := schema.Index(m.Index, opts...) - - headerRow, err := reader.Read() - if err != nil { - return errors.Wrap(err, "reading CSV header") - } - log.Println("Got Header: ", headerRow) - fields, header, getIDFn, err := processHeader(m.Config, index, headerRow) - if err != nil { - return errors.Wrap(err, "processing header") - } - - // this has a non-obvious dependence on the previous line... 
the fields are set up in the index which comes from the schema - client.SyncSchema(schema) - batch, err := gpexp.NewBatch(client, m.BatchSize, index, fields) - if err != nil { - return errors.Wrap(err, "getting new batch") - } - record := gpexp.Row{ - Values: make([]interface{}, len(header)), - } - - numRecords := uint64(0) - for row, err := reader.Read(); err == nil; row, err = reader.Read() { - record.ID = getIDFn(row, numRecords) - for _, meta := range header { - if meta.srcIndex < len(row) { - record.Values[meta.recordIndex] = meta.valGetter(row[meta.srcIndex]) - } else { - record.Values[meta.recordIndex] = nil - log.Printf("row is shorter than header: %v", row) - } - } - err := batch.Add(record) - if err == gpexp.ErrBatchNowFull { - err := batch.Import() - if err != nil { - return errors.Wrap(err, "importing") - } - } else if err != nil { - return errors.Wrap(err, "adding to batch") - } - - numRecords++ - } - - if err != io.EOF && err != nil { - return errors.Wrap(err, "reading csv") - } - err = batch.Import() - if err != nil { - return errors.Wrap(err, "final import") - } - - log.Printf("processed %d ids\n", numRecords) - log.Println("Duration: ", time.Since(start)) - return nil -} - -type valueMeta struct { - srcIndex int - recordIndex int - valGetter func(val string) interface{} -} - -type idGetter func(row []string, numRecords uint64) interface{} - -func processHeader(config *Config, index *pilosa.Index, headerRow []string) ([]*pilosa.Field, map[string]valueMeta, idGetter, error) { - fields := make([]*pilosa.Field, 0, len(headerRow)) - header := make(map[string]valueMeta) - getIDFn := func(row []string, numRecords uint64) interface{} { - return numRecords - } - for i, fieldName := range headerRow { - if fieldName == config.IDField { - idIndex := i - switch config.IDType { - case "uint64": - getIDFn = func(row []string, numRecords uint64) interface{} { - uintVal, err := strconv.ParseUint(row[idIndex], 0, 64) - if err != nil { - return nil - } - return uintVal - } - case "string": - getIDFn = func(row []string, numRecords uint64) interface{} { - return row[idIndex] - } - default: - return nil, nil, nil, errors.Errorf("unknown IDType: %s", config.IDType) - } - continue - } - - var valGetter func(val string) interface{} - srcField, ok := config.SourceFields[fieldName] - if !ok { - srcField = SourceField{ - TargetField: fieldName, - Type: "string", - } - config.SourceFields[fieldName] = srcField - } - pilosaField, ok := config.PilosaFields[srcField.TargetField] - if !ok { - pilosaField = Field{ - Type: "set", - CacheType: pilosa.CacheTypeRanked, - CacheSize: 100000, - Keys: true, - } - config.PilosaFields[fieldName] = pilosaField - } - - fieldName = srcField.TargetField - switch srcField.Type { - case "ignore": - continue - case "int": - valGetter = func(val string) interface{} { - intVal, err := strconv.ParseInt(val, 10, 64) - if err != nil { - return nil - } - return intVal - } - fields = append(fields, index.Field(fieldName, pilosaField.MakeOptions()...)) - case "float": - if srcField.Multiplier != 0 { - valGetter = func(val string) interface{} { - floatVal, err := strconv.ParseFloat(val, 64) - if err != nil { - return nil - } - return int64(floatVal * srcField.Multiplier) - } - } else { - valGetter = func(val string) interface{} { - floatVal, err := strconv.ParseFloat(val, 64) - if err != nil { - return nil - } - return int64(floatVal) - } - } - fields = append(fields, index.Field(fieldName, pilosaField.MakeOptions()...)) - case "string": - valGetter = func(val string) interface{} 
{ - if val == "" { - return nil // ignore empty strings - } - return val - } - fields = append(fields, index.Field(fieldName, pilosaField.MakeOptions()...)) - case "uint64": - valGetter = func(val string) interface{} { - uintVal, err := strconv.ParseUint(val, 0, 64) - if err != nil { - return nil - } - return uintVal - } - fields = append(fields, index.Field(fieldName, pilosaField.MakeOptions()...)) - } - header[fieldName] = valueMeta{ - valGetter: valGetter, - srcIndex: i, - recordIndex: len(fields) - 1, - } - } - - return fields, header, getIDFn, nil -} - -func main() { - if err := commandeer.Run(NewMain()); err != nil { - log.Fatal(err) - } -} - -func NewConfig() *Config { - return &Config{ - PilosaFields: make(map[string]Field), - SourceFields: make(map[string]SourceField), - IDType: "string", - } -} - -type Config struct { - PilosaFields map[string]Field `json:"pilosa-fields"` - SourceFields map[string]SourceField `json:"source-fields"` - - // IDField denotes which field in the source should be used for Pilosa record IDs. - IDField string `json:"id-field"` - - // IDType denotes whether the ID field should be parsed as a string or uint64. - IDType string `json:"id-type"` -} - -type Field struct { - Type string `json:"type"` - Min int64 `json:"min"` - Max int64 `json:"max"` - Keys bool `json:"keys"` - CacheType pilosa.CacheType `json:"cache-type"` - CacheSize int `json:"cache-size"` - // TODO time stuff -} - -func (f Field) MakeOptions() (opts []pilosa.FieldOption) { - switch f.Type { - case "set": - opts = append(opts, pilosa.OptFieldKeys(f.Keys), pilosa.OptFieldTypeSet(f.CacheType, f.CacheSize)) - case "int": - if f.Max != 0 || f.Min != 0 { - opts = append(opts, pilosa.OptFieldTypeInt(f.Min, f.Max)) - } else { - opts = append(opts, pilosa.OptFieldTypeInt()) - } - default: - panic(fmt.Sprintf("unknown pilosa field type: %s", f.Type)) - } - return opts -} - -type SourceField struct { - // TargetField is the Pilosa field that this source field should map to. - TargetField string `json:"target-field"` - - // Type denotes how the source field should be parsed. (string, - // int, rowID, float, or ignore). rowID means that the field will - // be parsed as a uint64 and then used directly as a rowID for a - // set field. If "string", key translation must be on for that - // Pilosa field, and it must be a set field. If int or float, it - // must be a Pilosa int field. - Type string `json:"type"` - - // Multiplier is for float fields. Because Pilosa does not support - // floats natively, it is sometimes useful to store a float in - // Pilosa as an integer, but first multiplied by some constant - // factor to preserve some amount of precision. If 0 this field won't be used. - Multiplier float64 `json:"multiplier"` -} - -// TODO we should validate the Config once it is constructed. -// What are valid mappings from source fields to pilosa fields? 
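The SourceField.Multiplier mechanism removed above is plain fixed-point
quantization, and the test removed below exercises it with a multiplier of
100000000. A minimal sketch of the idea (the helper name quantize is
hypothetical, not from the deleted code):

// quantize stores a float as an int64 after scaling, preserving decimal
// digits that a bare int64 conversion would truncate. This mirrors the
// deleted valGetter for "float" fields with a non-zero Multiplier.
func quantize(val, multiplier float64) int64 {
	return int64(val * multiplier)
}

With multiplier 100000000, a source value like 1.13106317 is stored as the
integer 113106317 (up to float rounding), which is why the deleted test can
run range queries such as result.GT(100000) against what was originally a
float column.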
diff --git a/cmd/picsv/main_internal_test.go b/cmd/picsv/main_internal_test.go deleted file mode 100644 index bfae155..0000000 --- a/cmd/picsv/main_internal_test.go +++ /dev/null @@ -1,20 +0,0 @@ -package main - -import ( - "strings" - "testing" -) - -func TestProcessHeader(t *testing.T) { - config := NewConfig() - headerRow := []string{"a", "b", "c"} - - t.Run("invalid IDType", func(t *testing.T) { - config.IDField = "a" - config.IDType = "nope" - _, _, _, err := processHeader(config, nil, headerRow) - if err == nil || !strings.Contains(err.Error(), "unknown IDType") { - t.Fatalf("unknown IDType gave: %v", err) - } - }) -} diff --git a/cmd/picsv/main_test.go b/cmd/picsv/main_test.go deleted file mode 100644 index aa96df3..0000000 --- a/cmd/picsv/main_test.go +++ /dev/null @@ -1,331 +0,0 @@ -package main_test - -import ( - "fmt" - "io" - "net/http" - "os" - "testing" - - "github.com/pilosa/go-pilosa" - picsv "github.com/pilosa/go-pilosa/cmd/picsv" - "github.com/pkg/errors" -) - -func BenchmarkImportCSV(b *testing.B) { - m := picsv.NewMain() - m.BatchSize = 1 << 20 - m.Index = "picsvbench" - m.File = "marketing-200k.csv" - getRawData(b, m.File) - client, err := pilosa.NewClient(m.Pilosa) - if err != nil { - b.Fatalf("getting client: %v", err) - } - b.ResetTimer() - - for i := 0; i < b.N; i++ { - err := m.Run() - if err != nil { - b.Fatalf("running import: %v", err) - } - b.StopTimer() - err = client.DeleteIndexByName(m.Index) - if err != nil { - b.Fatalf("deleting index: %v", err) - } - b.StartTimer() - } - -} - -func getRawData(t testing.TB, file string) { - if _, err := os.Open(file); err == nil { - return - } else if !os.IsNotExist(err) { - t.Fatalf("opening %s: %v", file, err) - } - // if the file doesn't exist - f, err := os.Create(file) - if err != nil { - t.Fatalf("creating file: %v", err) - } - resp, err := http.Get(fmt.Sprintf("https://molecula-sample-data.s3.amazonaws.com/%s", file)) - if err != nil { - t.Fatalf("getting data: %v", err) - } - if resp.StatusCode > 299 { - t.Fatalf("getting data failed: %v", resp.Status) - } - _, err = io.Copy(f, resp.Body) - if err != nil { - t.Fatalf("copying data into file: %v", err) - } - - err = f.Close() - if err != nil { - t.Fatalf("closing file: %v", err) - } - -} - -func TestImportCSV(t *testing.T) { - m := picsv.NewMain() - m.BatchSize = 100000 - m.Index = "testpicsv" - m.File = "marketing-200k.csv" - m.Config.SourceFields["age"] = picsv.SourceField{TargetField: "age", Type: "float"} - m.Config.PilosaFields["age"] = picsv.Field{Type: "int"} - m.Config.IDField = "id" - getRawData(t, m.File) - client, err := pilosa.NewClient(m.Pilosa) - if err != nil { - t.Fatalf("getting client: %v", err) - } - - defer func() { - err = client.DeleteIndexByName(m.Index) - if err != nil { - t.Fatalf("deleting index: %v", err) - } - }() - err = m.Run() - if err != nil { - t.Fatalf("running ingest: %v", err) - } - - schema, err := client.Schema() - if err != nil { - t.Fatalf("getting schema: %v", err) - } - - index := schema.Index(m.Index) - marital := index.Field("marital") - converted := index.Field("converted") - age := index.Field("age") - - tests := []struct { - query *pilosa.PQLRowQuery - bash string - exp int64 - }{ - { - query: marital.Row("married"), - bash: `awk -F, '/married/ {print $1,$4}' marketing-200k.csv | sort | uniq | wc`, - exp: 125514, - }, - { - query: converted.Row("no"), - bash: `awk -F, '{print $1,$17}' marketing-200k.csv | grep "no" |sort | uniq | wc`, - exp: 199999, - }, - { - query: age.Equals(55), - bash: `awk -F, '{print $1,$2}' 
marketing-200k.csv | grep " 55.0" |sort | uniq | wc`, - exp: 3282, - }, - } - - for i, test := range tests { - t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { - q := index.Count(test.query) - resp, err := client.Query(q) - if err != nil { - t.Fatalf("running query '%s': %v", q.Serialize(), err) - } - if resp.Result().Count() != test.exp { - t.Fatalf("Got unexpected result %d instead of %d for\nquery: %s\nbash: %s", resp.Result().Count(), test.exp, q.Serialize(), test.bash) - } - }) - } -} - -func TestSmallImport(t *testing.T) { - m := picsv.NewMain() - m.BatchSize = 1 << 20 - m.Index = "testsample" - m.File = "testdata/sample.csv" - m.ConfigFile = "config.json" - client, err := pilosa.NewClient(m.Pilosa) - if err != nil { - t.Fatalf("getting client: %v", err) - } - defer func() { - err = client.DeleteIndexByName(m.Index) - if err != nil { - t.Logf("deleting index: %v", err) - } - }() - config := `{ -"pilosa-fields": {"size": {"type": "set", "keys": true, "cache-type": "ranked", "cache-size": 100000}, - "age": {"type": "int"}, - "color": {"type": "set", "keys": true}, - "result": {"type": "int"}, - "dayofweek": {"type": "set", "keys": false, "cache-type": "ranked", "cache-size": 7} - }, -"id-field": "ID", -"id-type": "string", -"source-fields": { - "Size": {"target-field": "size", "type": "string"}, - "Color": {"target-field": "color", "type": "string"}, - "Age": {"target-field": "age", "type": "int"}, - "Result": {"target-field": "result", "type": "float", "multiplier": 100000000}, - "dayofweek": {"target-field": "dayofweek", "type": "uint64"} - } -} -` - data := ` -ID,Size,Color,Age,Result,dayofweek -ABDJ,small,green,42,1.13106317,1 -HFZP,large,red,99,30.23959735,2 -HFZP,small,green,99,NA,3 -EJSK,medium,purple,22,20.23959735,1 -EJSK,large,green,35,25.13106317, -FEFF,,,,,6 -` - writeFile(t, m.ConfigFile, config) - writeFile(t, m.File, data) - - err = m.Run() - if err != nil { - t.Fatalf("running ingest: %v", err) - } - - schema, err := client.Schema() - if err != nil { - t.Fatalf("getting schema: %v", err) - } - - index := schema.Index(m.Index) - size := index.Field("size") - color := index.Field("color") - age := index.Field("age") - result := index.Field("result") - day := index.Field("dayofweek") - - tests := []struct { - query pilosa.PQLQuery - resType string - exp interface{} - }{ - { - query: index.Count(size.Row("small")), - resType: "count", - exp: int64(2), - }, - { - query: size.Row("small"), - resType: "rowKeys", - exp: []string{"ABDJ", "HFZP"}, - }, - { - query: color.Row("green"), - resType: "rowKeys", - exp: []string{"ABDJ", "HFZP", "EJSK"}, - }, - { - query: age.Equals(99), - resType: "rowKeys", - exp: []string{"HFZP"}, - }, - { - query: age.GT(0), - resType: "rowKeys", - exp: []string{"ABDJ", "HFZP", "EJSK"}, - }, - { - query: result.GT(0), - resType: "rowKeys", - exp: []string{"ABDJ", "EJSK"}, - }, - { - query: result.GT(100000), - resType: "rowKeys", - exp: []string{"ABDJ", "EJSK"}, - }, - { - query: day.Row(1), - resType: "rowKeys", - exp: []string{"ABDJ", "EJSK"}, - }, - { - query: day.Row(6), - resType: "rowKeys", - exp: []string{"FEFF"}, - }, - { - query: index.Count(day.Row(3)), - resType: "count", - exp: int64(1), - }, - { - query: index.Count(day.Row(2)), - resType: "count", - exp: int64(1), // not mutually exclusive! - }, - { - query: size.Row(`""`), // TODO... go-pilosa should probably serialize keys into PQL using quotes. 
- resType: "rowKeys", - exp: []string{}, // empty strings are ignored rather than ingested - }, - { - query: color.Row(`""`), - resType: "rowKeys", - exp: []string{}, // empty strings are ignored rather than ingested - }, - } - - for i, test := range tests { - t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { - resp, err := client.Query(test.query) - if err != nil { - t.Fatalf("running query: %v", err) - } - res := resp.Result() - switch test.resType { - case "count": - if res.Count() != test.exp.(int64) { - t.Fatalf("unexpected count %d is not %d", res.Count(), test.exp.(int64)) - } - case "rowKeys": - got := res.Row().Keys - exp := test.exp.([]string) - if err := isPermutationOf(got, exp); err != nil { - t.Fatalf("unequal rows %v expected/got:\n%v\n%v", err, exp, got) - } - } - }) - } - -} - -func writeFile(t testing.TB, name, contents string) { - cf, err := os.Create(name) - if err != nil { - t.Fatalf("creating config file: %v", err) - } - _, err = cf.Write([]byte(contents)) - if err != nil { - t.Fatalf("writing config file: %v", err) - } -} - -func isPermutationOf(one, two []string) error { - if len(one) != len(two) { - return errors.Errorf("different lengths %d and %d", len(one), len(two)) - } -outer: - for _, vOne := range one { - for j, vTwo := range two { - if vOne == vTwo { - two = append(two[:j], two[j+1:]...) - continue outer - } - } - return errors.Errorf("%s in one but not two", vOne) - } - if len(two) != 0 { - return errors.Errorf("vals in two but not one: %v", two) - } - return nil -} diff --git a/cmd/picsv/testdata/sample.csv b/cmd/picsv/testdata/sample.csv deleted file mode 100644 index 2804233..0000000 --- a/cmd/picsv/testdata/sample.csv +++ /dev/null @@ -1,8 +0,0 @@ - -ID,Size,Color,Age,Result,dayofweek -ABDJ,small,green,42,1.13106317,1 -HFZP,large,red,99,30.23959735,2 -HFZP,small,green,99,NA,3 -EJSK,medium,purple,22,20.23959735,1 -EJSK,large,green,35,25.13106317, -FEFF,,,,,6 diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index d4a1150..642facd 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -184,7 +184,7 @@ func (b *Batch) Add(rec Row) error { b.toTranslateID[rid] = ints b.ids = append(b.ids, 0) } - default: + default: // TODO support nil ID as being auto-allocated. return errors.Errorf("unsupported id type %T value %v", rid, rid) } From b811c3b5398eb8988b614ab434a3df4aae1ed29e Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Mon, 9 Sep 2019 14:31:37 -0500 Subject: [PATCH 17/26] start adding time quantum support to batch import adds QuantizedTime type to support efficiently tracking a time quantum with each Row which will be applied to all the values of time-capable fields in that row. --- gpexp/importbatch.go | 68 +++++++++++++++++++++++ gpexp/importbatch_test.go | 110 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index 642facd..468877a 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -1,6 +1,8 @@ package gpexp import ( + "time" + "github.com/pilosa/go-pilosa" "github.com/pilosa/pilosa/roaring" "github.com/pkg/errors" @@ -153,6 +155,72 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi type Row struct { ID interface{} Values []interface{} + Time *QuantizedTime +} + +// QuantizedTime represents a moment in time down to some granularity +// (year, month, day, or hour). +type QuantizedTime struct { + ymdh [10]byte +} + +// Set sets the Quantized time to the given timestamp (down to hour +// granularity). 
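// For example (sketch, mirroring the tests below):
//
//	qt := &QuantizedTime{}
//	qt.Set(time.Date(2013, time.October, 16, 17, 0, 0, 0, time.UTC))
//
// leaves qt holding "2013101617", so views for quantum "YMDH" come back
// as "2013", "201310", "20131016", and "2013101617".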
+func (qt *QuantizedTime) Set(t time.Time) { + copy(qt.ymdh[:], t.Format("2006010215")) +} + +// SetYear sets the quantized time's year, but leaves month, day, and +// hour untouched. +func (qt *QuantizedTime) SetYear(year string) { + copy(qt.ymdh[:4], year) +} + +// SetMonth sets the QuantizedTime's month, but leaves year, day, and +// hour untouched. +func (qt *QuantizedTime) SetMonth(month string) { + copy(qt.ymdh[4:6], month) +} + +// SetDay sets the QuantizedTime's day, but leaves year, month, and +// hour untouched. +func (qt *QuantizedTime) SetDay(day string) { + copy(qt.ymdh[6:8], day) +} + +// SetHour sets the QuantizedTime's hour, but leaves year, month, and +// day untouched. +func (qt *QuantizedTime) SetHour(hour string) { + copy(qt.ymdh[8:10], hour) +} + +func (qt *QuantizedTime) views(q pilosa.TimeQuantum) ([]string, error) { + views := make([]string, 0, len(q)) + for _, unit := range q { + switch unit { + case 'Y': + if qt.ymdh[0] == 0 { + return nil, errors.New("no data set for year") + } + views = append(views, string(qt.ymdh[:4])) + case 'M': + if qt.ymdh[4] == 0 { + return nil, errors.New("no data set for month") + } + views = append(views, string(qt.ymdh[:6])) + case 'D': + if qt.ymdh[6] == 0 { + return nil, errors.New("no data set for day") + } + views = append(views, string(qt.ymdh[:8])) + case 'H': + if qt.ymdh[8] == 0 { + return nil, errors.New("no data set for hour") + } + views = append(views, string(qt.ymdh[:10])) + } + } + return views, nil } // Add adds a record to the batch. Performance will be best if record diff --git a/gpexp/importbatch_test.go b/gpexp/importbatch_test.go index f9ac887..7bd4110 100644 --- a/gpexp/importbatch_test.go +++ b/gpexp/importbatch_test.go @@ -4,6 +4,7 @@ import ( "reflect" "strconv" "testing" + "time" "github.com/pilosa/go-pilosa" "github.com/pkg/errors" @@ -483,3 +484,112 @@ outer: } return nil } + +func TestQuantizedTime(t *testing.T) { + cases := []struct { + name string + time time.Time + year string + month string + day string + hour string + quantum pilosa.TimeQuantum + exp []string + expErr string + }{ + { + name: "no time quantum", + exp: []string{}, + expErr: "", + }, + { + name: "no data", + quantum: pilosa.TimeQuantumYear, + expErr: "no data set for year", + }, + { + name: "timestamp", + time: time.Date(2013, time.October, 16, 17, 34, 43, 0, time.FixedZone("UTC-5", -5*60*60)), + quantum: "YMDH", + exp: []string{"2013", "201310", "20131016", "2013101617"}, + }, + { + name: "timestamp-less-granular", + time: time.Date(2013, time.October, 16, 17, 34, 43, 0, time.FixedZone("UTC-5", -5*60*60)), + quantum: "YM", + exp: []string{"2013", "201310"}, + }, + { + name: "timestamp-mid-granular", + time: time.Date(2013, time.October, 16, 17, 34, 43, 0, time.FixedZone("UTC-5", -5*60*60)), + quantum: "MD", + exp: []string{"201310", "20131016"}, + }, + { + name: "justyear", + year: "2013", + quantum: "Y", + exp: []string{"2013"}, + }, + { + name: "justyear-wantmonth", + year: "2013", + quantum: "YM", + expErr: "no data set for month", + }, + { + name: "timestamp-changeyear", + time: time.Date(2013, time.October, 16, 17, 34, 43, 0, time.FixedZone("UTC-5", -5*60*60)), + year: "2019", + quantum: "YMDH", + exp: []string{"2019", "201910", "20191016", "2019101617"}, + }, + { + name: "yearmonthdayhour", + year: "2013", + month: "10", + day: "16", + hour: "17", + quantum: "YMDH", + exp: []string{"2013", "201310", "20131016", "2013101617"}, + }, + { + name: "timestamp-changehour", + time: time.Date(2013, time.October, 16, 17, 34, 43, 0, 
time.FixedZone("UTC-5", -5*60*60)), + hour: "05", + quantum: "MDH", + exp: []string{"201310", "20131016", "2013101605"}, + }, + } + + for i, test := range cases { + t.Run(test.name+strconv.Itoa(i), func(t *testing.T) { + tq := &QuantizedTime{} + var zt time.Time + if zt != test.time { + tq.Set(test.time) + } + if test.year != "" { + tq.SetYear(test.year) + } + if test.month != "" { + tq.SetMonth(test.month) + } + if test.day != "" { + tq.SetDay(test.day) + } + if test.hour != "" { + tq.SetHour(test.hour) + } + + views, err := tq.views(test.quantum) + if !reflect.DeepEqual(views, test.exp) { + t.Errorf("unexpected views, got/want:\n%v\n%v\n", views, test.exp) + } + if (err != nil && err.Error() != test.expErr) || (err == nil && test.expErr != "") { + t.Errorf("unexpected error, got/want:\n%v\n%s\n", err, test.expErr) + } + }) + } + +} From e40d84aa7666d13de0521162899eeec21407e082 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Mon, 9 Sep 2019 18:42:46 -0500 Subject: [PATCH 18/26] complete per-record timestamp support --- gpexp/importbatch.go | 108 +++++++++++++++++++++++++++++--------- gpexp/importbatch_test.go | 64 +++++++++++++++++++--- 2 files changed, 139 insertions(+), 33 deletions(-) diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index 468877a..47151c0 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -67,10 +67,19 @@ type Batch struct { // values holds the values for each record of an int field values map[string][]int64 + // times holds a time for each record. (if any of the fields are time fields) + times []QuantizedTime + // clearValues holds a slice of indices into b.ids for each // integer field which has nil values. After translation, these // slices will be filled out with the actual column IDs those // indices pertain to so that they can be cleared. + // + // TODO: This is actually a problem — a nil value doesn't mean + // "clear this value", it should mean "don't touch this value", so + // there is no way currently to update a record with int values + // without knowing all the int values, clearing them, or setting + // them to something else in the process. clearValues map[string][]uint64 // TODO, support timestamps, set fields with more than one value per record, mutex, and bool. @@ -90,8 +99,11 @@ type Batch struct { transCache Translator } +// BatchOption is a functional option for Batch objects. type BatchOption func(b *Batch) error +// OptTranslator allows one to pass in a custom Translator +// implementation for mapping keys to IDs. 
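// Usage sketch (myTranslator is a hypothetical type satisfying the
// Translator interface in translator.go):
//
//	b, err := NewBatch(client, 1000, index, fields, OptTranslator(myTranslator))
//
// Without the option, NewBatch defaults to NewMapTranslator().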
func OptTranslator(t Translator) BatchOption { return func(b *Batch) error { b.transCache = t @@ -112,15 +124,17 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi rowIDs := make(map[string][]uint64) values := make(map[string][]int64) tt := make(map[string]map[string][]int) + hasTime := false for _, field := range fields { headerMap[field.Name()] = field opts := field.Opts() - switch opts.Type() { - case pilosa.FieldTypeDefault, pilosa.FieldTypeSet: + switch typ := opts.Type(); typ { + case pilosa.FieldTypeDefault, pilosa.FieldTypeSet, pilosa.FieldTypeTime: if opts.Keys() { tt[field.Name()] = make(map[string][]int) } rowIDs[field.Name()] = make([]uint64, 0, size) + hasTime = typ == pilosa.FieldTypeTime || hasTime case pilosa.FieldTypeInt: values[field.Name()] = make([]int64, 0, size) } @@ -138,6 +152,9 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi toTranslateID: make(map[string][]int), transCache: NewMapTranslator(), } + if hasTime { + b.times = make([]QuantizedTime, 0, size) + } for _, opt := range opts { err := opt(b) if err != nil { @@ -155,7 +172,7 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi type Row struct { ID interface{} Values []interface{} - Time *QuantizedTime + Time QuantizedTime } // QuantizedTime represents a moment in time down to some granularity @@ -194,7 +211,20 @@ func (qt *QuantizedTime) SetHour(hour string) { copy(qt.ymdh[8:10], hour) } +// Reset sets the time to the zero value which generates no time views. +func (qt *QuantizedTime) Reset() { + for i := range qt.ymdh { + qt.ymdh[i] = 0 + } +} + +// views builds the list of Pilosa views for this particular time, +// given a quantum. func (qt *QuantizedTime) views(q pilosa.TimeQuantum) ([]string, error) { + zero := QuantizedTime{} + if *qt == zero { + return nil, nil + } views := make([]string, 0, len(q)) for _, unit := range q { switch unit { @@ -256,6 +286,10 @@ func (b *Batch) Add(rec Row) error { return errors.Errorf("unsupported id type %T value %v", rid, rid) } + if b.times != nil { + b.times = append(b.times, rec.Time) + } + for i := 0; i < len(rec.Values); i++ { field := b.header[i] switch val := rec.Values[i].(type) { @@ -402,14 +436,17 @@ func (b *Batch) doTranslation() error { func (b *Batch) doImport() error { eg := errgroup.Group{} - frags := b.makeFragments() - for shard, viewMap := range frags { - for fieldView, bitmap := range viewMap { - fieldView := fieldView - bitmap := bitmap + frags, err := b.makeFragments() + if err != nil { + return errors.Wrap(err, "making fragments") + } + for shard, fieldMap := range frags { + for field, viewMap := range fieldMap { + field := field + viewMap := viewMap eg.Go(func() error { - err := b.client.ImportRoaringBitmap(b.index.Field(fieldView.field), shard, map[string]*roaring.Bitmap{"": bitmap}, false) - return errors.Wrapf(err, "importing data for %s", fieldView.field) + err := b.client.ImportRoaringBitmap(b.index.Field(field), shard, viewMap, false) + return errors.Wrapf(err, "importing data for %s", field) }) } } @@ -425,13 +462,15 @@ func (b *Batch) doImport() error { // if needed though). 
var nilSentinel = ^uint64(0) -func (b *Batch) makeFragments() fragments { +func (b *Batch) makeFragments() (fragments, error) { shardWidth := b.index.ShardWidth() if shardWidth == 0 { shardWidth = pilosa.DefaultShardWidth } frags := make(fragments) for fname, rowIDs := range b.rowIDs { + field := b.headerMap[fname] + opts := field.Opts() curShard := ^uint64(0) // impossible sentinel value for shard. var curBM *roaring.Bitmap for j := range b.ids { @@ -443,12 +482,30 @@ func (b *Batch) makeFragments() fragments { curShard = col / shardWidth curBM = frags.GetOrCreate(curShard, fname, "") } - curBM.DirectAdd(row*shardWidth + (col % shardWidth)) + // TODO this is super ugly, but we want to avoid setting + // bits on the standard view in the specific case when + // there isn't one. Should probably refactor this whole + // loop to be more general w.r.t. views. Also... tests for + // the NoStandardView case would be great. + if !(opts.Type() == pilosa.FieldTypeTime && opts.NoStandardView()) { + curBM.DirectAdd(row*shardWidth + (col % shardWidth)) + } + if opts.Type() == pilosa.FieldTypeTime { + views, err := b.times[j].views(opts.TimeQuantum()) + if err != nil { + return nil, errors.Wrap(err, "calculating views") + } + for _, view := range views { + tbm := frags.GetOrCreate(curShard, fname, view) + tbm.DirectAdd(row*shardWidth + (col % shardWidth)) + } + } } } - return frags + return frags, nil } +// importValueData imports data for int fields. func (b *Batch) importValueData() error { shardWidth := b.index.ShardWidth() if shardWidth == 0 { @@ -531,6 +588,7 @@ func (b *Batch) importValueData() error { // next round. Where possible it does not re-allocate memory. func (b *Batch) reset() { b.ids = b.ids[:0] + b.times = b.times[:0] for fieldName, rowIDs := range b.rowIDs { b.rowIDs[fieldName] = rowIDs[:0] m := b.toTranslate[fieldName] @@ -549,24 +607,24 @@ func (b *Batch) reset() { } } -type fieldView struct { - field string - view string -} - -// map[shard][fieldview]fragmentData -type fragments map[uint64]map[fieldView]*roaring.Bitmap +// map[shard][field][view]fragmentData +type fragments map[uint64]map[string]map[string]*roaring.Bitmap func (f fragments) GetOrCreate(shard uint64, field, view string) *roaring.Bitmap { - viewMap, ok := f[shard] + fieldMap, ok := f[shard] + if !ok { + fieldMap = make(map[string]map[string]*roaring.Bitmap) + } + viewMap, ok := fieldMap[field] if !ok { - viewMap = make(map[fieldView]*roaring.Bitmap) + viewMap = make(map[string]*roaring.Bitmap) } - bm, ok := viewMap[fieldView{field: field, view: view}] + bm, ok := viewMap[view] if !ok { bm = roaring.NewBTreeBitmap() - viewMap[fieldView{field: field, view: view}] = bm + viewMap[view] = bm } - f[shard] = viewMap + fieldMap[field] = viewMap + f[shard] = fieldMap return bm } diff --git a/gpexp/importbatch_test.go b/gpexp/importbatch_test.go index 7bd4110..db94fab 100644 --- a/gpexp/importbatch_test.go +++ b/gpexp/importbatch_test.go @@ -16,11 +16,13 @@ func TestBatches(t *testing.T) { client := pilosa.DefaultClient() schema := pilosa.NewSchema() idx := schema.Index("gopilosatest-blah") - fields := make([]*pilosa.Field, 4) + numFields := 5 + fields := make([]*pilosa.Field, numFields) fields[0] = idx.Field("zero", pilosa.OptFieldKeys(true)) fields[1] = idx.Field("one", pilosa.OptFieldKeys(true)) fields[2] = idx.Field("two", pilosa.OptFieldKeys(true)) fields[3] = idx.Field("three", pilosa.OptFieldTypeInt()) + fields[4] = idx.Field("four", pilosa.OptFieldTypeTime(pilosa.TimeQuantumYearMonthDay)) err := 
client.SyncSchema(schema)
 	if err != nil {
 		t.Fatalf("syncing schema: %v", err)
 	}
@@ -35,7 +37,8 @@ func TestBatches(t *testing.T) {
 	if err != nil {
 		t.Fatalf("getting new batch: %v", err)
 	}
-	r := Row{Values: make([]interface{}, 4)}
+	r := Row{Values: make([]interface{}, numFields)}
+	r.Time.Set(time.Date(2019, time.January, 2, 15, 45, 0, 0, time.UTC))
 
 	for i := 0; i < 9; i++ {
 		r.ID = uint64(i)
@@ -44,15 +47,20 @@ func TestBatches(t *testing.T) {
 			r.Values[1] = "b"
 			r.Values[2] = "c"
 			r.Values[3] = int64(99)
+			r.Values[4] = uint64(1)
+			r.Time.SetMonth("01")
 		} else {
 			r.Values[0] = "x"
 			r.Values[1] = "y"
 			r.Values[2] = "z"
 			r.Values[3] = int64(-10)
+			r.Values[4] = uint64(1)
+			r.Time.SetMonth("02")
 		}
 		if i == 8 {
 			r.Values[0] = nil
 			r.Values[3] = nil
+			r.Values[4] = nil
 		}
 		err := b.Add(r)
 		if err != nil {
@@ -149,6 +157,10 @@ func TestBatches(t *testing.T) {
 				t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs)
 			}
 
+		} else if fname == "four" {
+			if !reflect.DeepEqual(rowIDs, []uint64{1, 1, 1, 1, 1, 1, 1, 1, nilSentinel, nilSentinel}) {
+				t.Fatalf("unexpected rowids for time field")
+			}
 		} else {
 			if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 1}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 2}) {
 				t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs)
@@ -170,11 +182,13 @@ func TestBatches(t *testing.T) {
 			r.Values[1] = "b"
 			r.Values[2] = "c"
 			r.Values[3] = int64(99)
+			r.Values[4] = uint64(1)
 		} else {
 			r.Values[0] = "x"
 			r.Values[1] = "y"
 			r.Values[2] = "z"
 			r.Values[3] = int64(-10)
+			r.Values[4] = uint64(2)
 		}
 		err := b.Add(r)
 		if i != 18 && err != nil {
@@ -212,11 +226,13 @@ func TestBatches(t *testing.T) {
 			r.Values[1] = "e"
 			r.Values[2] = "f"
 			r.Values[3] = int64(100)
+			r.Values[4] = uint64(3)
 		} else {
 			r.Values[0] = "u"
 			r.Values[1] = "v"
 			r.Values[2] = "w"
 			r.Values[3] = int64(0)
+			r.Values[4] = uint64(4)
 		}
 		err := b.Add(r)
 		if i != 28 && err != nil {
@@ -244,7 +260,10 @@ func TestBatches(t *testing.T) {
 		}
 	}
 
-	frags := b.makeFragments()
+	frags, err := b.makeFragments()
+	if err != nil {
+		t.Fatalf("making fragments: %v", err)
+	}
 
 	if len(frags) != 1 {
 		t.Fatalf("unexpected # of shards in fragments: %d", len(frags))
@@ -253,8 +272,8 @@ func TestBatches(t *testing.T) {
 	if !ok {
 		t.Fatalf("shard 0 should be in frags")
 	}
-	if len(viewMap) != 3 {
-		t.Fatalf("there should be 3 views")
+	if len(viewMap) != 4 {
+		t.Fatalf("there should be 4 views")
 	}
 
 	resp, err := client.Query(idx.BatchQuery(fields[0].Row("a"),
@@ -296,7 +315,9 @@ func TestBatches(t *testing.T) {
 
 	resp, err = client.Query(idx.BatchQuery(fields[3].GT(-11),
 		fields[3].Equals(0),
-		fields[3].Equals(100)))
+		fields[3].Equals(100),
+		fields[4].Range(1, time.Date(2019, time.January, 1, 0, 0, 0, 0, time.UTC), time.Date(2019, time.January, 29, 0, 0, 0, 0, time.UTC)),
+		fields[4].Range(1, time.Date(2019, time.February, 1, 0, 0, 0, 0, time.UTC), time.Date(2019, time.February, 29, 0, 0, 0, 0, time.UTC))))
 	if err != nil {
 		t.Fatalf("querying: %v", err)
 	}
@@ -313,6 +334,16 @@ func TestBatches(t *testing.T) {
 	if !reflect.DeepEqual(cols, []uint64{20, 22, 24, 26, 28}) {
 		t.Fatalf("wrong cols for ==100: %v", cols)
 	}
+	cols = results[3].Row().Columns
+	exp := []uint64{0, 2, 4, 6, 10, 12, 14, 16, 18}
+	if !reflect.DeepEqual(cols, exp) {
+		t.Fatalf("wrong cols for January: got/want\n%v\n%v", cols, exp)
+	}
+	cols = results[4].Row().Columns
+	exp = []uint64{1, 3, 5, 7}
+	if !reflect.DeepEqual(cols, exp) {
+		t.Fatalf("wrong cols for February: got/want\n%v\n%v", cols, exp)
+	}
 
 	// TODO test non-full batches, test behavior of doing import on
empty batch // TODO test importing across multiple shards @@ -494,18 +525,25 @@ func TestQuantizedTime(t *testing.T) { day string hour string quantum pilosa.TimeQuantum + reset bool exp []string expErr string }{ { name: "no time quantum", + expErr: "", + }, + { + name: "no time quantum with data", + year: "2017", exp: []string{}, expErr: "", }, { name: "no data", quantum: pilosa.TimeQuantumYear, - expErr: "no data set for year", + exp: nil, + expErr: "", }, { name: "timestamp", @@ -560,11 +598,18 @@ func TestQuantizedTime(t *testing.T) { quantum: "MDH", exp: []string{"201310", "20131016", "2013101605"}, }, + { + name: "timestamp", + time: time.Date(2013, time.October, 16, 17, 34, 43, 0, time.FixedZone("UTC-5", -5*60*60)), + quantum: "YMDH", + reset: true, + exp: nil, + }, } for i, test := range cases { t.Run(test.name+strconv.Itoa(i), func(t *testing.T) { - tq := &QuantizedTime{} + tq := QuantizedTime{} var zt time.Time if zt != test.time { tq.Set(test.time) @@ -581,6 +626,9 @@ func TestQuantizedTime(t *testing.T) { if test.hour != "" { tq.SetHour(test.hour) } + if test.reset { + tq.Reset() + } views, err := tq.views(test.quantum) if !reflect.DeepEqual(views, test.exp) { From 2dfc4b7bd86cf48aac01d4124d4dbd26906afe58 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Tue, 10 Sep 2019 13:00:53 -0500 Subject: [PATCH 19/26] test and fix clearValues bug The test demonstrates how a nil integer value added to a Batch would clear an existing integer value instead of ignoring it. This fixes it by tracking which records have nil integer values for each field, and removing those ids/values before importing. In order to be able to execute import requests concurrently, while re-using the same slices for ids and values for each import, we expose encoding separately from importing in the Client, so that we can encode serially reusing the same slices, and import the encoded data concurrently. --- client.go | 30 +++++++-- gpexp/importbatch.go | 128 +++++++++++++------------------------- gpexp/importbatch_test.go | 64 ++++++++++++++++++- 3 files changed, 132 insertions(+), 90 deletions(-) diff --git a/client.go b/client.go index 5d84220..f20c3a4 100644 --- a/client.go +++ b/client.go @@ -730,8 +730,23 @@ func (c *Client) importValues(field *Field, // index,field,shard on all nodes which should hold that shard. It // assumes that the ids have been translated from keys if necessary // and so tells Pilosa to ignore checking if the index uses column -// keys. +// keys. ImportValues wraps EncodeImportValues and DoImportValues — +// these are broken out and exported so that performance conscious +// users can re-use the same vals and ids byte buffers for local +// encoding, while performing the imports concurrently. func (c *Client) ImportValues(index, field string, shard uint64, vals []int64, ids []uint64, clear bool) error { + path, data, err := c.EncodeImportValues(index, field, shard, vals, ids, clear) + if err != nil { + return errors.Wrap(err, "encoding import-values request") + } + err = c.DoImportValues(index, shard, path, data) + return errors.Wrap(err, "doing import values") +} + +// EncodeImportValues computes the HTTP path and payload for an +// import-values request. It is typically followed by a call to +// DoImportValues. 
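+//
+// The intended pattern, sketched (error handling elided; the shard
+// numbers and the vals/ids slices here are hypothetical):
+//
+//	pathA, dataA, _ := c.EncodeImportValues("idx", "fld", 0, valsA, idsA, false)
+//	pathB, dataB, _ := c.EncodeImportValues("idx", "fld", 1, valsB, idsB, false)
+//	var eg errgroup.Group
+//	eg.Go(func() error { return c.DoImportValues("idx", 0, pathA, dataA) })
+//	eg.Go(func() error { return c.DoImportValues("idx", 1, pathB, dataB) })
+//	err := eg.Wait()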
+func (c *Client) EncodeImportValues(index, field string, shard uint64, vals []int64, ids []uint64, clear bool) (path string, data []byte, err error) { msg := &pbuf.ImportValueRequest{ Index: index, Field: field, @@ -739,11 +754,18 @@ func (c *Client) ImportValues(index, field string, shard uint64, vals []int64, i ColumnIDs: ids, Values: vals, } - data, err := proto.Marshal(msg) + data, err = proto.Marshal(msg) if err != nil { - return errors.Wrap(err, "marshaling to protobuf") + return "", nil, errors.Wrap(err, "marshaling to protobuf") } - path := fmt.Sprintf("/index/%s/field/%s/import?clear=%s&ignoreKeyCheck=true", index, field, strconv.FormatBool(clear)) + path = fmt.Sprintf("/index/%s/field/%s/import?clear=%s&ignoreKeyCheck=true", index, field, strconv.FormatBool(clear)) + return path, data, nil +} + +// DoImportValues takes a path and data payload (normally from +// EncodeImportValues), logs the import, finds all nodes which own +// this shard, and concurrently imports to those nodes. +func (c *Client) DoImportValues(index string, shard uint64, path string, data []byte) error { c.logImport(index, path, shard, false, data) uris, err := c.getURIsForShard(index, shard) diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index 47151c0..be74b59 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -61,7 +61,8 @@ type Batch struct { // ids is a slice of length batchSize of record IDs ids []uint64 - // rowIDs is a slice of length len(Batch.header) which contains slices of length batchSize + // rowIDs is a map of field names to slices of length batchSize + // which contain row IDs. rowIDs map[string][]uint64 // values holds the values for each record of an int field @@ -70,19 +71,11 @@ type Batch struct { // times holds a time for each record. (if any of the fields are time fields) times []QuantizedTime - // clearValues holds a slice of indices into b.ids for each - // integer field which has nil values. After translation, these - // slices will be filled out with the actual column IDs those - // indices pertain to so that they can be cleared. - // - // TODO: This is actually a problem — a nil value doesn't mean - // "clear this value", it should mean "don't touch this value", so - // there is no way currently to update a record with int values - // without knowing all the int values, clearing them, or setting - // them to something else in the process. - clearValues map[string][]uint64 + // nullIndices holds a slice of indices into b.ids for each + // integer field which has nil values. + nullIndices map[string][]uint64 - // TODO, support timestamps, set fields with more than one value per record, mutex, and bool. + // TODO support mutex and bool fields. // for each field, keep a map of key to which record indexes that key mapped to toTranslate map[string]map[string][]int @@ -147,7 +140,7 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi ids: make([]uint64, 0, size), rowIDs: rowIDs, values: values, - clearValues: make(map[string][]uint64), + nullIndices: make(map[string][]uint64), toTranslate: tt, toTranslateID: make(map[string][]int), transCache: NewMapTranslator(), @@ -164,11 +157,7 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi return b, nil } -// Row represents a single record which can be added to a RecordBatch. -// -// Note: it is not named "Record" because there is a conflict with -// another type in this package. 
This may be rectified by deprecating -// something or splitting packages in the future. +// Row represents a single record which can be added to a Batch. type Row struct { ID interface{} Values []interface{} @@ -316,12 +305,12 @@ func (b *Batch) Add(rec Row) error { case nil: if field.Opts().Type() == pilosa.FieldTypeInt { b.values[field.Name()] = append(b.values[field.Name()], 0) - clearIndexes, ok := b.clearValues[field.Name()] + nullIndices, ok := b.nullIndices[field.Name()] if !ok { - clearIndexes = make([]uint64, 0) + nullIndices = make([]uint64, 0) } - clearIndexes = append(clearIndexes, uint64(len(b.ids)-1)) - b.clearValues[field.Name()] = clearIndexes + nullIndices = append(nullIndices, uint64(len(b.ids)-1)) + b.nullIndices[field.Name()] = nullIndices } else { b.rowIDs[field.Name()] = append(b.rowIDs[field.Name()], nilSentinel) @@ -425,11 +414,6 @@ func (b *Batch) doTranslation() error { } } - for _, idIndexes := range b.clearValues { - for i, index := range idIndexes { - idIndexes[i] = b.ids[index] - } - } return nil } @@ -511,77 +495,53 @@ func (b *Batch) importValueData() error { if shardWidth == 0 { shardWidth = pilosa.DefaultShardWidth } - eg := errgroup.Group{} - curShard := b.ids[0] / shardWidth - startIdx := 0 - for i := 1; i <= len(b.ids); i++ { - // when i==len(b.ids) we ensure that the import logic gets run - // by making a fake shard once we're past the last ID - recordID := (curShard + 2) * shardWidth - if i < len(b.ids) { - recordID = b.ids[i] - } - if recordID/shardWidth != curShard { - endIdx := i - ids := b.ids[startIdx:endIdx] - for field, values := range b.values { - field := field - shard := curShard - vslice := values[startIdx:endIdx] - eg.Go(func() error { - err := b.client.ImportValues(b.index.Name(), field, shard, vslice, ids, false) - return errors.Wrapf(err, "importing values for %s", field) - }) - } - startIdx = i - curShard = recordID / shardWidth - } - } - - err := eg.Wait() - if err != nil { - return errors.Wrap(err, "importing value data") - } - // Now we clear any values for which we got a nil. - // - // TODO we need an endpoint which lets us set and clear - // transactionally... this is kind of a hack. - maxLen := 0 - for _, ids := range b.clearValues { - if len(ids) > maxLen { - maxLen = len(ids) + ids := make([]uint64, len(b.ids)) + for field, values := range b.values { + // grow our temp ids slice to full length + ids = ids[:len(b.ids)] + // copy orig ids back in + copy(ids, b.ids) + + // trim out null values from ids and values. + nullIndices := b.nullIndices[field] + for i, nullIndex := range nullIndices { + nullIndex -= uint64(i) // offset the index by the number of items removed so far + ids = append(ids[:nullIndex], ids[nullIndex+1:]...) + values = append(values[:nullIndex], values[nullIndex+1:]...) 
} - } - eg = errgroup.Group{} - values := make([]int64, 0, maxLen) - for field, ids := range b.clearValues { - // TODO maybe sort ids here - curShard := b.ids[0] / shardWidth + + // now do imports by shard + curShard := ids[0] / shardWidth startIdx := 0 for i := 1; i <= len(ids); i++ { - recordID := (curShard + 2) * shardWidth + var recordID uint64 if i < len(ids) { - recordID = b.ids[i] + recordID = ids[i] + } else { + recordID = (curShard + 2) * shardWidth } + if recordID/shardWidth != curShard { endIdx := i - idSlice := ids[startIdx:endIdx] - values := values[:len(idSlice)] - field := field shard := curShard + field := field + path, data, err := b.client.EncodeImportValues(b.index.Name(), field, shard, values[startIdx:endIdx], ids[startIdx:endIdx], false) + if err != nil { + return errors.Wrap(err, "encoding import values") + } eg.Go(func() error { - err := b.client.ImportValues(b.index.Name(), field, shard, values, idSlice, true) - return errors.Wrap(err, "clearing values") + err := b.client.DoImportValues(b.index.Name(), shard, path, data) + return errors.Wrapf(err, "importing values for %s", field) }) startIdx = i curShard = recordID / shardWidth } } } - - return errors.Wrap(eg.Wait(), "importing clear value data") + err := eg.Wait() + return errors.Wrap(err, "importing value data") } // reset is called at the end of importing to ready the batch for the @@ -602,8 +562,8 @@ func (b *Batch) reset() { for k := range b.values { delete(b.values, k) // TODO pool these slices } - for k := range b.clearValues { - delete(b.clearValues, k) // TODO pool these slices + for k := range b.nullIndices { + delete(b.nullIndices, k) // TODO pool these slices } } diff --git a/gpexp/importbatch_test.go b/gpexp/importbatch_test.go index db94fab..6e3f6ec 100644 --- a/gpexp/importbatch_test.go +++ b/gpexp/importbatch_test.go @@ -12,6 +12,66 @@ import ( // TODO test against cluster +func TestImportBatchInts(t *testing.T) { + client := pilosa.DefaultClient() + schema := pilosa.NewSchema() + idx := schema.Index("gopilosatest-blah") + field := idx.Field("anint", pilosa.OptFieldTypeInt()) + err := client.SyncSchema(schema) + if err != nil { + t.Fatalf("syncing schema: %v", err) + } + + b, err := NewBatch(client, 3, idx, []*pilosa.Field{field}) + if err != nil { + t.Fatalf("getting batch: %v", err) + } + + r := Row{Values: make([]interface{}, 1)} + + for i := uint64(0); i < 3; i++ { + r.ID = i + r.Values[0] = int64(i) + err := b.Add(r) + if err != nil && err != ErrBatchNowFull { + t.Fatalf("adding to batch: %v", err) + } + } + err = b.Import() + if err != nil { + t.Fatalf("importing: %v", err) + } + + r.ID = uint64(0) + r.Values[0] = nil + err = b.Add(r) + if err != nil { + t.Fatalf("adding after import: %v", err) + } + r.ID = uint64(1) + r.Values[0] = int64(7) + err = b.Add(r) + if err != nil { + t.Fatalf("adding second after import: %v", err) + } + + err = b.Import() + if err != nil { + t.Fatalf("second import: %v", err) + } + + resp, err := client.Query(idx.BatchQuery(field.Equals(0), field.Equals(7), field.Equals(2))) + if err != nil { + t.Fatalf("querying: %v", err) + } + + for i, result := range resp.Results() { + if !reflect.DeepEqual(result.Row().Columns, []uint64{uint64(i)}) { + t.Errorf("expected %v for %d, but got %v", []uint64{uint64(i)}, i, result.Row().Columns) + } + } +} + func TestBatches(t *testing.T) { client := pilosa.DefaultClient() schema := pilosa.NewSchema() @@ -90,8 +150,8 @@ func TestBatches(t *testing.T) { if !reflect.DeepEqual(b.values["three"], []int64{99, -10, 99, -10, 99, -10, 99, 
-10, 0}) { t.Fatalf("unexpected values: %v", b.values["three"]) } - if !reflect.DeepEqual(b.clearValues["three"], []uint64{8}) { - t.Fatalf("unexpected clearValues: %v", b.clearValues["three"]) + if !reflect.DeepEqual(b.nullIndices["three"], []uint64{8}) { + t.Fatalf("unexpected nullIndices: %v", b.nullIndices["three"]) } if len(b.toTranslate["one"]) != 2 { From bfe8680b131f1e94a1bc91426504578b02f266ea Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Fri, 4 Oct 2019 09:21:16 -0500 Subject: [PATCH 20/26] handle byte slice batch record IDs also check for unsupported field types and error --- gpexp/importbatch.go | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index be74b59..adff9ee 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -130,6 +130,8 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi hasTime = typ == pilosa.FieldTypeTime || hasTime case pilosa.FieldTypeInt: values[field.Name()] = make([]int64, 0, size) + default: + return nil, errors.Errorf("field type %s is not currently supported through Batch", typ) } } b := &Batch{ @@ -254,10 +256,7 @@ func (b *Batch) Add(rec Row) error { return errors.Errorf("record needs to match up with batch fields, got %d fields and %d record", len(b.header), len(rec.Values)) } - switch rid := rec.ID.(type) { - case uint64: - b.ids = append(b.ids, rid) - case string: + handleStringID := func(rid string) error { if colID, ok, err := b.transCache.GetCol(b.index.Name(), rid); err != nil { return errors.Wrap(err, "translating column") } else if ok { @@ -271,6 +270,23 @@ func (b *Batch) Add(rec Row) error { b.toTranslateID[rid] = ints b.ids = append(b.ids, 0) } + return nil + } + var err error + + switch rid := rec.ID.(type) { + case uint64: + b.ids = append(b.ids, rid) + case string: + err := handleStringID(rid) + if err != nil { + return err + } + case []byte: + err = handleStringID(string(rid)) + if err != nil { + return err + } default: // TODO support nil ID as being auto-allocated. return errors.Errorf("unsupported id type %T value %v", rid, rid) } From 9b81c1a481260544cd2c0a5f4e826bdda4f8f82c Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Fri, 4 Oct 2019 15:27:28 -0500 Subject: [PATCH 21/26] fix and comment for importvalues --- gpexp/importbatch.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index adff9ee..4269c49 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -522,6 +522,20 @@ func (b *Batch) importValueData() error { // trim out null values from ids and values. nullIndices := b.nullIndices[field] + // TODO(jaffee) I think this may be very inefficient. It looks + // like we're copying the `ids` and `values` slices over + // themselves (an O(n) operation) for each nullIndex so this + // is effectively O(n^2). What we could do is iterate through + // ids and values each once, while simultaneously iterating + // through nullindices and keeping track of how many + // nullIndices we've passed, and so how far back we need to + // copy each item. + // + // It was a couple weeks ago that I wrote this code, and I + // vaguely remember thinking about this, so I may just be + // missing something now. We should benchmark on what should + // be a bad case (an int field which is mostly null), and see + // if the improved implementation helps a lot. 
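+		//
+		// A sketch of that single-pass compaction (untested; assumes
+		// nullIndices is sorted ascending, which it is since Add
+		// appends record positions in order):
+		//
+		//	w, n := 0, 0
+		//	for r := range ids {
+		//		if n < len(nullIndices) && uint64(r) == nullIndices[n] {
+		//			n++ // drop the null slot
+		//			continue
+		//		}
+		//		ids[w], values[w] = ids[r], values[r]
+		//		w++
+		//	}
+		//	ids, values = ids[:w], values[:w]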
for i, nullIndex := range nullIndices {
 			nullIndex -= uint64(i) // offset the index by the number of items removed so far
 			ids = append(ids[:nullIndex], ids[nullIndex+1:]...)
 			values = append(values[:nullIndex], values[nullIndex+1:]...)
 		}
 
 		// now do imports by shard
+		if len(ids) == 0 {
+			continue // TODO test this "all nil" case
+		}
 		curShard := ids[0] / shardWidth
 		startIdx := 0
 		for i := 1; i <= len(ids); i++ {

From 4129aee12032289abdc85fe923dd10b527598f2f Mon Sep 17 00:00:00 2001
From: Matt Jaffee
Date: Tue, 8 Oct 2019 14:27:29 -0500
Subject: [PATCH 22/26] support for string slice values in batch ingest

slight retry improvements
---
 client.go                 |   7 +-
 go.mod                    |   2 +
 gpexp/importbatch.go      | 143 ++++++++++++++++++++++++++++++++++----
 gpexp/importbatch_test.go | 108 ++++++++++++++++++++++++++++
 4 files changed, 244 insertions(+), 16 deletions(-)

diff --git a/client.go b/client.go
index f20c3a4..f2c2f72 100644
--- a/client.go
+++ b/client.go
@@ -227,7 +227,7 @@ func newClientWithOptions(options *ClientOptions) *Client {
 		c.tracer = options.tracer
 	}
 	c.retries = *options.retries
-	c.minRetrySleepTime = 1 * time.Second
+	c.minRetrySleepTime = 100 * time.Millisecond
 	c.maxRetrySleepTime = 2 * time.Minute
 	c.importManager = newRecordImportManager(c)
 	go c.runChangeDetection()
@@ -1124,7 +1124,10 @@ func (c *Client) doRequest(host *URI, method, path string, headers map[string]st
 			}
 			err = errors.New(strings.TrimSpace(string(content)))
 		}
-		c.logger.Printf("request failed with: %s, retrying (%d)", err.Error(), tries)
+		if tries == 0 {
+			break
+		}
+		c.logger.Printf("request failed with: %s status: %d, retrying %d more time(s) after %v", err.Error(), resp.StatusCode, tries, sleepTime)
 		time.Sleep(sleepTime)
 		sleepTime *= 2
 		if sleepTime > c.maxRetrySleepTime {
diff --git a/go.mod b/go.mod
index ac26e55..81c5c19 100644
--- a/go.mod
+++ b/go.mod
@@ -13,3 +13,5 @@ require (
 	github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect
 	golang.org/x/sync v0.0.0-20190423024810-112230192c58
 )
+
+go 1.12
diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go
index 4269c49..9ee21b6 100644
--- a/gpexp/importbatch.go
+++ b/gpexp/importbatch.go
@@ -65,6 +65,11 @@ type Batch struct {
 	// which contain row IDs.
 	rowIDs map[string][]uint64
 
+	// rowIDSets is a map from field name to a batchSize slice of
+	// slices of row IDs. When a given record can have more than one
+	// value for a field, rowIDSets stores that information.
+	rowIDSets map[string][][]uint64
+
 	// values holds the values for each record of an int field
 	values map[string][]int64
 
@@ -80,6 +85,11 @@ type Batch struct {
 	// for each field, keep a map of key to which record indexes that key mapped to
 	toTranslate map[string]map[string][]int
 
+	// toTranslateSets is a map from field name to a map of string
+	// keys that need to be translated to sets of record indexes which
+	// those keys map to.
+	toTranslateSets map[string]map[string][]int
+
 	// for string ids which we weren't able to immediately translate,
 	// keep a map of which record(s) each string id maps to.
// @@ -117,6 +127,7 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi rowIDs := make(map[string][]uint64) values := make(map[string][]int64) tt := make(map[string]map[string][]int) + ttSets := make(map[string]map[string][]int) hasTime := false for _, field := range fields { headerMap[field.Name()] = field @@ -125,6 +136,7 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi case pilosa.FieldTypeDefault, pilosa.FieldTypeSet, pilosa.FieldTypeTime: if opts.Keys() { tt[field.Name()] = make(map[string][]int) + ttSets[field.Name()] = make(map[string][]int) } rowIDs[field.Name()] = make([]uint64, 0, size) hasTime = typ == pilosa.FieldTypeTime || hasTime @@ -135,17 +147,19 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi } } b := &Batch{ - client: client, - header: fields, - headerMap: headerMap, - index: index, - ids: make([]uint64, 0, size), - rowIDs: rowIDs, - values: values, - nullIndices: make(map[string][]uint64), - toTranslate: tt, - toTranslateID: make(map[string][]int), - transCache: NewMapTranslator(), + client: client, + header: fields, + headerMap: headerMap, + index: index, + ids: make([]uint64, 0, size), + rowIDs: rowIDs, + rowIDSets: make(map[string][][]uint64), + values: values, + nullIndices: make(map[string][]uint64), + toTranslate: tt, + toTranslateSets: ttSets, + toTranslateID: make(map[string][]int), + transCache: NewMapTranslator(), } if hasTime { b.times = make([]QuantizedTime, 0, size) @@ -291,6 +305,9 @@ func (b *Batch) Add(rec Row) error { return errors.Errorf("unsupported id type %T value %v", rid, rid) } + // curPos is the current position in b.ids, rowIDs[*], etc. + curPos := len(b.ids) - 1 + if b.times != nil { b.times = append(b.times, rec.Time) } @@ -310,7 +327,7 @@ func (b *Batch) Add(rec Row) error { if !ok { ints = make([]int, 0) } - ints = append(ints, len(rowIDs)) + ints = append(ints, curPos) b.toTranslate[field.Name()][val] = ints b.rowIDs[field.Name()] = append(rowIDs, 0) } @@ -318,6 +335,30 @@ func (b *Batch) Add(rec Row) error { b.rowIDs[field.Name()] = append(b.rowIDs[field.Name()], val) case int64: b.values[field.Name()] = append(b.values[field.Name()], val) + case []string: + rowIDSets, ok := b.rowIDSets[field.Name()] + if !ok { + rowIDSets = make([][]uint64, len(b.ids)-1, cap(b.ids)) + } else { + rowIDSets = rowIDSets[:len(b.ids)-1] // grow this field's rowIDSets if necessary + } + + rowIDs := make([]uint64, 0, len(val)) + for _, k := range val { + if rowID, ok, err := b.transCache.GetRow(b.index.Name(), field.Name(), k); err != nil { + return errors.Wrap(err, "translating row from []string") + } else if ok { + rowIDs = append(rowIDs, rowID) + } else { + ints, ok := b.toTranslateSets[field.Name()][k] + if !ok { + ints = make([]int, 0, 1) + } + ints = append(ints, curPos) + b.toTranslateSets[field.Name()][k] = ints + } + } + b.rowIDSets[field.Name()] = append(rowIDSets, rowIDs) case nil: if field.Opts().Type() == pilosa.FieldTypeInt { b.values[field.Name()] = append(b.values[field.Name()], 0) @@ -325,7 +366,7 @@ func (b *Batch) Add(rec Row) error { if !ok { nullIndices = make([]uint64, 0) } - nullIndices = append(nullIndices, uint64(len(b.ids)-1)) + nullIndices = append(nullIndices, uint64(curPos)) b.nullIndices[field.Name()] = nullIndices } else { @@ -430,6 +471,33 @@ func (b *Batch) doTranslation() error { } } + for fieldName, tt := range b.toTranslateSets { + keys = keys[:0] + + for k := range tt { + keys = append(keys, k) + } + + if len(keys) == 0 { + 
continue + } + // translate keys from Pilosa + ids, err := b.client.TranslateRowKeys(b.headerMap[fieldName], keys) + if err != nil { + return errors.Wrap(err, "translating row keys") + } + if err := b.transCache.AddRows(b.index.Name(), fieldName, keys, ids); err != nil { + return errors.Wrap(err, "adding rows to cache") + } + rowIDSets := b.rowIDSets[fieldName] + for j, key := range keys { + rowID := ids[j] + for _, recordIdx := range tt[key] { + rowIDSets[recordIdx] = append(rowIDSets[recordIdx], rowID) + } + } + } + return nil } @@ -469,6 +537,9 @@ func (b *Batch) makeFragments() (fragments, error) { } frags := make(fragments) for fname, rowIDs := range b.rowIDs { + if len(rowIDs) == 0 { + continue // this can happen when the values that came in for this field were string slices + } field := b.headerMap[fname] opts := field.Opts() curShard := ^uint64(0) // impossible sentinel value for shard. @@ -502,6 +573,45 @@ func (b *Batch) makeFragments() (fragments, error) { } } } + + for fname, rowIDSets := range b.rowIDSets { + field := b.headerMap[fname] + opts := field.Opts() + curShard := ^uint64(0) // impossible sentinel value for shard. + var curBM *roaring.Bitmap + for j := range b.ids { + col, rowIDs := b.ids[j], rowIDSets[j] + if len(rowIDs) == 0 { + continue + } + if col/shardWidth != curShard { + curShard = col / shardWidth + curBM = frags.GetOrCreate(curShard, fname, "") + } + // TODO this is super ugly, but we want to avoid setting + // bits on the standard view in the specific case when + // there isn't one. Should probably refactor this whole + // loop to be more general w.r.t. views. Also... tests for + // the NoStandardView case would be great. + if !(opts.Type() == pilosa.FieldTypeTime && opts.NoStandardView()) { + for _, row := range rowIDs { + curBM.DirectAdd(row*shardWidth + (col % shardWidth)) + } + } + if opts.Type() == pilosa.FieldTypeTime { + views, err := b.times[j].views(opts.TimeQuantum()) + if err != nil { + return nil, errors.Wrap(err, "calculating views") + } + for _, view := range views { + tbm := frags.GetOrCreate(curShard, fname, view) + for _, row := range rowIDs { + tbm.DirectAdd(row*shardWidth + (col % shardWidth)) + } + } + } + } + } return frags, nil } @@ -512,7 +622,6 @@ func (b *Batch) importValueData() error { shardWidth = pilosa.DefaultShardWidth } eg := errgroup.Group{} - ids := make([]uint64, len(b.ids)) for field, values := range b.values { // grow our temp ids slice to full length @@ -584,10 +693,16 @@ func (b *Batch) reset() { b.times = b.times[:0] for fieldName, rowIDs := range b.rowIDs { b.rowIDs[fieldName] = rowIDs[:0] + rowIDSet := b.rowIDSets[fieldName] + b.rowIDSets[fieldName] = rowIDSet[:0] m := b.toTranslate[fieldName] for k := range m { delete(m, k) // TODO pool these slices } + m = b.toTranslateSets[fieldName] + for k := range m { + delete(m, k) + } } for k := range b.toTranslateID { delete(b.toTranslateID, k) // TODO pool these slices diff --git a/gpexp/importbatch_test.go b/gpexp/importbatch_test.go index 6e3f6ec..d99f746 100644 --- a/gpexp/importbatch_test.go +++ b/gpexp/importbatch_test.go @@ -72,6 +72,114 @@ func TestImportBatchInts(t *testing.T) { } } +func TestStringSlice(t *testing.T) { + client := pilosa.DefaultClient() + schema := pilosa.NewSchema() + idx := schema.Index("test-string-slice") + fields := make([]*pilosa.Field, 1) + fields[0] = idx.Field("strslice", pilosa.OptFieldKeys(true), pilosa.OptFieldTypeSet(pilosa.CacheTypeRanked, 100)) + err := client.SyncSchema(schema) + if err != nil { + t.Fatalf("syncing schema: %v", 
err)
+	}
+	defer func() {
+		err := client.DeleteIndex(idx)
+		if err != nil {
+			t.Logf("problem cleaning up from test: %v", err)
+		}
+	}()
+
+	trans := NewMapTranslator()
+	err = trans.AddRows("test-string-slice", "strslice", []string{"c", "d", "f"}, []uint64{9, 10, 13})
+	if err != nil {
+		t.Fatalf("adding to translator: %v", err)
+	}
+
+	b, err := NewBatch(client, 3, idx, fields, OptTranslator(trans))
+	if err != nil {
+		t.Fatalf("creating new batch: %v", err)
+	}
+
+	r := Row{Values: make([]interface{}, len(fields))}
+	r.ID = uint64(0)
+	r.Values[0] = []string{"a"}
+	err = b.Add(r)
+	if err != nil {
+		t.Fatalf("adding to batch: %v", err)
+	}
+	if got := b.toTranslateSets["strslice"]["a"]; !reflect.DeepEqual(got, []int{0}) {
+		t.Fatalf("expected []int{0}, got: %v", got)
+	}
+
+	r.ID = uint64(1)
+	r.Values[0] = []string{"a", "b", "c"}
+	err = b.Add(r)
+	if err != nil {
+		t.Fatalf("adding to batch: %v", err)
+	}
+	if got := b.toTranslateSets["strslice"]["a"]; !reflect.DeepEqual(got, []int{0, 1}) {
+		t.Fatalf("expected []int{0,1}, got: %v", got)
+	}
+	if got := b.toTranslateSets["strslice"]["b"]; !reflect.DeepEqual(got, []int{1}) {
+		t.Fatalf("expected []int{1}, got: %v", got)
+	}
+	if got, ok := b.toTranslateSets["strslice"]["c"]; ok {
+		t.Fatalf("should be nothing at c, got: %v", got)
+	}
+	if got := b.rowIDSets["strslice"][1]; !reflect.DeepEqual(got, []uint64{9}) {
+		t.Fatalf("expected c to map to rowID 9 but got %v", got)
+	}
+
+	r.ID = uint64(2)
+	r.Values[0] = []string{"d", "e", "f"}
+	err = b.Add(r)
+	if err != ErrBatchNowFull {
+		t.Fatalf("adding to batch: %v", err)
+	}
+	if got, ok := b.toTranslateSets["strslice"]["d"]; ok {
+		t.Fatalf("should be nothing at d, got: %v", got)
+	}
+	if got, ok := b.toTranslateSets["strslice"]["f"]; ok {
+		t.Fatalf("should be nothing at f, got: %v", got)
+	}
+	if got := b.toTranslateSets["strslice"]["e"]; !reflect.DeepEqual(got, []int{2}) {
+		t.Fatalf("expected []int{2}, got: %v", got)
+	}
+	if got := b.rowIDSets["strslice"][2]; !reflect.DeepEqual(got, []uint64{10, 13}) {
+		t.Fatalf("expected d and f to map to rowIDs 10 and 13, but got %v", got)
+	}
+
+	err = b.doTranslation()
+	if err != nil {
+		t.Fatalf("translating: %v", err)
+	}
+
+	if got := b.rowIDSets["strslice"][0]; !reflect.DeepEqual(got, []uint64{1}) {
+		t.Fatalf("after translation, rec 0: %v", got)
+	}
+	if got := b.rowIDSets["strslice"][1]; !reflect.DeepEqual(got, []uint64{9, 1, 2}) {
+		t.Fatalf("after translation, rec 1: %v", got)
+	}
+	if got := b.rowIDSets["strslice"][2]; !reflect.DeepEqual(got, []uint64{10, 13, 3}) {
+		t.Fatalf("after translation, rec 2: %v", got)
+	}
+
+	err = b.doImport()
+	if err != nil {
+		t.Fatalf("doing import: %v", err)
+	}
+
+	resp, err := client.Query(idx.BatchQuery(fields[0].Row("a")))
+	if err != nil {
+		t.Fatalf("querying: %v", err)
+	}
+	result := resp.Result()
+	if !reflect.DeepEqual(result.Row().Columns, []uint64{0, 1}) {
+		t.Fatalf("expected a to be [0,1], got %v", result.Row().Columns)
+	}
+
+}
 
 func TestBatches(t *testing.T) {
 	client := pilosa.DefaultClient()
 	schema := pilosa.NewSchema()

From 6791c1437ec4a725c93d20b74ab7b3c7cb62ceae Mon Sep 17 00:00:00 2001
From: Matt Jaffee
Date: Tue, 8 Oct 2019 14:46:51 -0500
Subject: [PATCH 23/26] fix importbatch bug, wrap some errs

---
 client.go            | 4 ++--
 gpexp/importbatch.go | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/client.go b/client.go
index f2c2f72..aca3350 100644
--- a/client.go
+++ b/client.go
@@ -362,7 +362,7 @@ func (c *Client) EnsureIndex(index *Index) error {
 	if err == ErrIndexExists {
 		return
nil } - return err + return errors.Wrap(err, "creating index") } // EnsureField creates a field on the server if it doesn't exists. @@ -424,7 +424,7 @@ func (c *Client) syncSchema(schema *Schema, serverSchema *Schema) error { if _, ok := serverSchema.indexes[indexName]; !ok { err = c.EnsureIndex(index) if err != nil { - return err + return errors.Wrap(err, "ensuring index") } } for _, field := range index.fields { diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index 9ee21b6..4dd2fab 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -575,6 +575,9 @@ func (b *Batch) makeFragments() (fragments, error) { } for fname, rowIDSets := range b.rowIDSets { + if len(rowIDSets) == 0 { + continue + } field := b.headerMap[fname] opts := field.Opts() curShard := ^uint64(0) // impossible sentinel value for shard. From 1d062fd72d5682ef262765ed094f16075392c654 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Wed, 9 Oct 2019 17:31:59 -0500 Subject: [PATCH 24/26] support multiple of the same field in batch needed to convert rowIDs and toTranslate to map from field index rather than field name... going to continue refactoring this to use a map from index rather than a slice --- gpexp/importbatch.go | 52 ++++++++++++++++++++------------------- gpexp/importbatch_test.go | 48 ++++++++++++++++++++++++------------ 2 files changed, 59 insertions(+), 41 deletions(-) diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index 4dd2fab..6b62050 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -61,9 +61,9 @@ type Batch struct { // ids is a slice of length batchSize of record IDs ids []uint64 - // rowIDs is a map of field names to slices of length batchSize - // which contain row IDs. - rowIDs map[string][]uint64 + // rowIDs is a map of field index (in the header) to slices of + // length batchSize which contain row IDs. + rowIDs [][]uint64 // rowIDSets is a map from field name to a batchSize slice of // slices of row IDs. When a given record can have more than one @@ -83,7 +83,7 @@ type Batch struct { // TODO support mutex and bool fields. // for each field, keep a map of key to which record indexes that key mapped to - toTranslate map[string]map[string][]int + toTranslate []map[string][]int // toTranslateSets is a map from field name to a map of string // keys that need to be translated to sets of record indexes which @@ -124,21 +124,21 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi return nil, errors.New("can't batch with no fields or batch size") } headerMap := make(map[string]*pilosa.Field, len(fields)) - rowIDs := make(map[string][]uint64) + rowIDs := make([][]uint64, len(fields)) values := make(map[string][]int64) - tt := make(map[string]map[string][]int) + tt := make([]map[string][]int, len(fields)) ttSets := make(map[string]map[string][]int) hasTime := false - for _, field := range fields { + for i, field := range fields { headerMap[field.Name()] = field opts := field.Opts() switch typ := opts.Type(); typ { case pilosa.FieldTypeDefault, pilosa.FieldTypeSet, pilosa.FieldTypeTime: if opts.Keys() { - tt[field.Name()] = make(map[string][]int) + tt[i] = make(map[string][]int) ttSets[field.Name()] = make(map[string][]int) } - rowIDs[field.Name()] = make([]uint64, 0, size) + rowIDs[i] = make([]uint64, 0, size) // TODO make this on-demand when it gets used. could be a string array field. 
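+			// Keying rowIDs and toTranslate by field position rather
+			// than field name is what allows the same field to appear
+			// more than once in the batch header.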
hasTime = typ == pilosa.FieldTypeTime || hasTime case pilosa.FieldTypeInt: values[field.Name()] = make([]int64, 0, size) @@ -316,23 +316,23 @@ func (b *Batch) Add(rec Row) error { field := b.header[i] switch val := rec.Values[i].(type) { case string: - rowIDs := b.rowIDs[field.Name()] + rowIDs := b.rowIDs[i] // translate val and append to b.rowIDs[i] if rowID, ok, err := b.transCache.GetRow(b.index.Name(), field.Name(), val); err != nil { return errors.Wrap(err, "translating row") } else if ok { - b.rowIDs[field.Name()] = append(rowIDs, rowID) + b.rowIDs[i] = append(rowIDs, rowID) } else { - ints, ok := b.toTranslate[field.Name()][val] + ints, ok := b.toTranslate[i][val] if !ok { ints = make([]int, 0) } ints = append(ints, curPos) - b.toTranslate[field.Name()][val] = ints - b.rowIDs[field.Name()] = append(rowIDs, 0) + b.toTranslate[i][val] = ints + b.rowIDs[i] = append(rowIDs, 0) } case uint64: - b.rowIDs[field.Name()] = append(b.rowIDs[field.Name()], val) + b.rowIDs[i] = append(b.rowIDs[i], val) case int64: b.values[field.Name()] = append(b.values[field.Name()], val) case []string: @@ -370,7 +370,7 @@ func (b *Batch) Add(rec Row) error { b.nullIndices[field.Name()] = nullIndices } else { - b.rowIDs[field.Name()] = append(b.rowIDs[field.Name()], nilSentinel) + b.rowIDs[i] = append(b.rowIDs[i], nilSentinel) } default: return errors.Errorf("Val %v Type %[1]T is not currently supported. Use string, uint64 (row id), or int64 (integer value)", val) @@ -440,7 +440,8 @@ func (b *Batch) doTranslation() error { } // translate row keys - for fieldName, tt := range b.toTranslate { + for i, tt := range b.toTranslate { + fieldName := b.header[i].Name() keys = keys[:0] // make a slice of keys @@ -462,7 +463,7 @@ func (b *Batch) doTranslation() error { } // fill out missing IDs in local batch records with translated IDs - rows := b.rowIDs[fieldName] + rows := b.rowIDs[i] for j, key := range keys { id := ids[j] for _, recordIdx := range tt[key] { @@ -536,11 +537,11 @@ func (b *Batch) makeFragments() (fragments, error) { shardWidth = pilosa.DefaultShardWidth } frags := make(fragments) - for fname, rowIDs := range b.rowIDs { + for i, rowIDs := range b.rowIDs { if len(rowIDs) == 0 { continue // this can happen when the values that came in for this field were string slices } - field := b.headerMap[fname] + field := b.header[i] opts := field.Opts() curShard := ^uint64(0) // impossible sentinel value for shard. 
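+		// curShard and curBM cache the standard-view bitmap for the
+		// shard currently being filled, so frags.GetOrCreate is only
+		// consulted when a record crosses into a new shard (time views
+		// still look up their bitmap per record below).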
var curBM *roaring.Bitmap @@ -551,7 +552,7 @@ func (b *Batch) makeFragments() (fragments, error) { } if col/shardWidth != curShard { curShard = col / shardWidth - curBM = frags.GetOrCreate(curShard, fname, "") + curBM = frags.GetOrCreate(curShard, field.Name(), "") } // TODO this is super ugly, but we want to avoid setting // bits on the standard view in the specific case when @@ -567,7 +568,7 @@ func (b *Batch) makeFragments() (fragments, error) { return nil, errors.Wrap(err, "calculating views") } for _, view := range views { - tbm := frags.GetOrCreate(curShard, fname, view) + tbm := frags.GetOrCreate(curShard, field.Name(), view) tbm.DirectAdd(row*shardWidth + (col % shardWidth)) } } @@ -694,11 +695,12 @@ func (b *Batch) importValueData() error { func (b *Batch) reset() { b.ids = b.ids[:0] b.times = b.times[:0] - for fieldName, rowIDs := range b.rowIDs { - b.rowIDs[fieldName] = rowIDs[:0] + for i, rowIDs := range b.rowIDs { + fieldName := b.header[i].Name() + b.rowIDs[i] = rowIDs[:0] rowIDSet := b.rowIDSets[fieldName] b.rowIDSets[fieldName] = rowIDSet[:0] - m := b.toTranslate[fieldName] + m := b.toTranslate[i] for k := range m { delete(m, k) // TODO pool these slices } diff --git a/gpexp/importbatch_test.go b/gpexp/importbatch_test.go index d99f746..c097de2 100644 --- a/gpexp/importbatch_test.go +++ b/gpexp/importbatch_test.go @@ -237,10 +237,10 @@ func TestBatches(t *testing.T) { } - if len(b.toTranslate["zero"]) != 2 { + if len(b.toTranslate[0]) != 2 { t.Fatalf("wrong number of keys in toTranslate[0]") } - for k, ints := range b.toTranslate["zero"] { + for k, ints := range b.toTranslate[0] { if k == "a" { if !reflect.DeepEqual(ints, []int{0, 2, 4, 6}) { t.Fatalf("wrong ints for key a in field zero: %v", ints) @@ -262,10 +262,10 @@ func TestBatches(t *testing.T) { t.Fatalf("unexpected nullIndices: %v", b.nullIndices["three"]) } - if len(b.toTranslate["one"]) != 2 { - t.Fatalf("wrong number of keys in toTranslate[\"one\"]") + if len(b.toTranslate[1]) != 2 { + t.Fatalf("wrong number of keys in toTranslate[1]") } - for k, ints := range b.toTranslate["one"] { + for k, ints := range b.toTranslate[1] { if k == "b" { if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { t.Fatalf("wrong ints for key b in field one: %v", ints) @@ -280,10 +280,10 @@ func TestBatches(t *testing.T) { } } - if len(b.toTranslate["two"]) != 2 { + if len(b.toTranslate[2]) != 2 { t.Fatalf("wrong number of keys in toTranslate[2]") } - for k, ints := range b.toTranslate["two"] { + for k, ints := range b.toTranslate[2] { if k == "c" { if !reflect.DeepEqual(ints, []int{0, 2, 4, 6, 8}) { t.Fatalf("wrong ints for key c in field two: %v", ints) @@ -317,21 +317,25 @@ func TestBatches(t *testing.T) { t.Fatalf("doing translation: %v", err) } - for fname, rowIDs := range b.rowIDs { + for fidx, rowIDs := range b.rowIDs { // we don't know which key will get translated first, but we do know the pattern - if fname == "zero" { + if fidx == 0 { if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, nilSentinel, nilSentinel}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, nilSentinel, nilSentinel}) { - t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) + t.Fatalf("unexpected row ids for field %d: %v", fidx, rowIDs) } - } else if fname == "four" { + } else if fidx == 4 { if !reflect.DeepEqual(rowIDs, []uint64{1, 1, 1, 1, 1, 1, 1, 1, nilSentinel, nilSentinel}) { t.Fatalf("unexpected rowids for time field") } + } else if fidx == 3 { + if len(rowIDs) != 0 { + t.Fatalf("expected no rowIDs for int field, but 
got: %v", rowIDs) + } } else { if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 1}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 2}) { - t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) + t.Fatalf("unexpected row ids for field %d: %v", fidx, rowIDs) } } } @@ -378,10 +382,16 @@ func TestBatches(t *testing.T) { t.Fatalf("doing import: %v", err) } - for fname, rowIDs := range b.rowIDs { + for fidx, rowIDs := range b.rowIDs { + if fidx == 3 { + if len(rowIDs) != 0 { + t.Fatalf("expected no rowIDs for int field, but got: %v", rowIDs) + } + continue + } // we don't know which key will get translated first, but we do know the pattern if !reflect.DeepEqual(rowIDs, []uint64{1, 2, 1, 2, 1, 2, 1, 2, 1, 2}) && !reflect.DeepEqual(rowIDs, []uint64{2, 1, 2, 1, 2, 1, 2, 1, 2, 1}) { - t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) + t.Fatalf("unexpected row ids for field %d: %v", fidx, rowIDs) } } @@ -421,10 +431,16 @@ func TestBatches(t *testing.T) { t.Fatalf("doing import: %v", err) } - for fname, rowIDs := range b.rowIDs { + for fidx, rowIDs := range b.rowIDs { // we don't know which key will get translated first, but we do know the pattern + if fidx == 3 { + if len(rowIDs) != 0 { + t.Fatalf("expected no rowIDs for int field, but got: %v", rowIDs) + } + continue + } if !reflect.DeepEqual(rowIDs, []uint64{3, 4, 3, 4, 3, 4, 3, 4, 3, 4}) && !reflect.DeepEqual(rowIDs, []uint64{4, 3, 4, 3, 4, 3, 4, 3, 4, 3}) { - t.Fatalf("unexpected row ids for field %s: %v", fname, rowIDs) + t.Fatalf("unexpected row ids for field %d: %v", fidx, rowIDs) } } From 58c281632e8c6355e1386b0cd99b9f008fbd3104 Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Wed, 9 Oct 2019 17:38:37 -0500 Subject: [PATCH 25/26] map from int instead of slice --- gpexp/importbatch.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gpexp/importbatch.go b/gpexp/importbatch.go index 6b62050..c121e2e 100644 --- a/gpexp/importbatch.go +++ b/gpexp/importbatch.go @@ -63,7 +63,7 @@ type Batch struct { // rowIDs is a map of field index (in the header) to slices of // length batchSize which contain row IDs. - rowIDs [][]uint64 + rowIDs map[int][]uint64 // rowIDSets is a map from field name to a batchSize slice of // slices of row IDs. When a given record can have more than one @@ -83,7 +83,7 @@ type Batch struct { // TODO support mutex and bool fields. 
// for each field, keep a map of key to which record indexes that key mapped to - toTranslate []map[string][]int + toTranslate map[int]map[string][]int // toTranslateSets is a map from field name to a map of string // keys that need to be translated to sets of record indexes which @@ -124,9 +124,9 @@ func NewBatch(client *pilosa.Client, size int, index *pilosa.Index, fields []*pi return nil, errors.New("can't batch with no fields or batch size") } headerMap := make(map[string]*pilosa.Field, len(fields)) - rowIDs := make([][]uint64, len(fields)) + rowIDs := make(map[int][]uint64, len(fields)) values := make(map[string][]int64) - tt := make([]map[string][]int, len(fields)) + tt := make(map[int]map[string][]int, len(fields)) ttSets := make(map[string]map[string][]int) hasTime := false for i, field := range fields { From d00044fc73de53a1c3472b1ea66a72410b64d86e Mon Sep 17 00:00:00 2001 From: Matt Jaffee Date: Thu, 10 Oct 2019 22:24:44 -0500 Subject: [PATCH 26/26] fix flaky test due to unpredictable translation order --- .circleci/config.yml | 6 +++--- gpexp/importbatch_test.go | 14 ++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b48d376..99d0942 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -30,13 +30,13 @@ jobs: - *fast-checkout - run: make install-gometalinter - run: make gometalinter - test-golang-1.13-rc: &base-test + test-golang-1.13: &base-test <<: *defaults steps: - *fast-checkout - run: make test-all docker: - - image: circleci/golang:1.13-rc + - image: circleci/golang:1.13 - image: pilosa/pilosa:master test-golang-1.12: <<: *base-test @@ -51,7 +51,7 @@ workflows: - linter: requires: - build - - test-golang-1.13-rc: + - test-golang-1.13: requires: - build - test-golang-1.12: diff --git a/gpexp/importbatch_test.go b/gpexp/importbatch_test.go index c097de2..7764d10 100644 --- a/gpexp/importbatch_test.go +++ b/gpexp/importbatch_test.go @@ -154,14 +154,12 @@ func TestStringSlice(t *testing.T) { t.Fatalf("translating: %v", err) } - if got := b.rowIDSets["strslice"][0]; !reflect.DeepEqual(got, []uint64{1}) { - t.Fatalf("after translation, rec 0: %v", got) - } - if got := b.rowIDSets["strslice"][1]; !reflect.DeepEqual(got, []uint64{9, 1, 2}) { - t.Fatalf("after translation, rec 1: %v", got) - } - if got := b.rowIDSets["strslice"][2]; !reflect.DeepEqual(got, []uint64{10, 13, 3}) { - t.Fatalf("after translation, rec 2: %v", got) + if got0 := b.rowIDSets["strslice"][0]; len(got0) != 1 { + t.Errorf("after translation, rec 0, wrong len: %v", got0) + } else if got1 := b.rowIDSets["strslice"][1]; len(got1) != 3 || got1[0] != 9 || (got1[1] != got0[0] && got1[2] != got0[0]) { + t.Errorf("after translation, rec 1: %v, rec 0: %v", got1, got0) + } else if got2 := b.rowIDSets["strslice"][2]; len(got2) != 3 || got2[0] != 10 || got2[1] != 13 || got2[2] == got1[2] || got2[2] == got0[0] { + t.Errorf("after translation, rec 2: %v", got2) } err = b.doImport()