From ef24eed390973d51799ee27f90f00ae2ce1db57d Mon Sep 17 00:00:00 2001 From: Marie Giraud Date: Tue, 5 Apr 2022 10:22:39 +0000 Subject: [PATCH 1/7] feat: add analyzer to order qi --- internal/infra/source.go | 4 ++++ pkg/sigo/analyzer.go | 50 ++++++++++++++++++++++++++++++++++++++++ pkg/sigo/driver.go | 5 ++++ pkg/sigo/model.go | 6 +++++ pkg/sigo/stats.go | 10 ++++++++ pkg/sigo/stats_test.go | 13 +++++++++++ 6 files changed, 88 insertions(+) create mode 100644 pkg/sigo/analyzer.go diff --git a/internal/infra/source.go b/internal/infra/source.go index 8b016ee..b0e1bd9 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -117,6 +117,10 @@ func (s *JSONLineSource) QuasiIdentifer() []string { return s.quasiIdentifers } +func (s *JSONLineSource) UpdateQI(newQI []string) { + s.quasiIdentifers = newQI +} + func (s *JSONLineSource) Sensitive() []string { return s.sensitives } diff --git a/pkg/sigo/analyzer.go b/pkg/sigo/analyzer.go new file mode 100644 index 0000000..00843df --- /dev/null +++ b/pkg/sigo/analyzer.go @@ -0,0 +1,50 @@ +package sigo + +import ( + "sort" +) + +type File struct { + source RecordSource + values map[string][]float64 +} + +func New(s RecordSource) Analyzer { + return File{source: s, values: make(map[string][]float64)} +} + +func (f File) Add(r Record) { + for i, key := range f.source.QuasiIdentifer() { + f.values[key] = append(f.values[key], r.QuasiIdentifer()[i]) + } +} + +func (f File) CountUniqueValues() map[string]int { + uniques := make(map[string]int) + + for _, key := range f.source.QuasiIdentifer() { + uniques[key] = Unique(f.values[key]) + } + + return uniques +} + +func order(countUnique map[string]int) (qiOrdered []string) { + switched := make(map[int][]string) + slice := []int{} + + for key, count := range countUnique { + switched[count] = append(switched[count], key) + + slice = append(slice, count) + } + + sort.Sort(sort.Reverse(sort.IntSlice(slice))) + + for _, val := range slice { + qiOrdered = append(qiOrdered, switched[val]...) + delete(switched, val) + } + + return qiOrdered +} diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index 33ee23f..df9bdd0 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -26,6 +26,7 @@ import ( func Anonymize(source RecordSource, factory GeneralizerFactory, k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger) error { generalizer := factory.New(k, l, dim, source.QuasiIdentifer()) + analyzer := New(source) count := 0 log.Info().Msg("Reading source") @@ -35,6 +36,7 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, return fmt.Errorf("%w", source.Err()) } + analyzer.Add(source.Value()) generalizer.Add(source.Value()) count++ } @@ -42,6 +44,9 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") + orderedQI := order(analyzer.CountUniqueValues()) + source.UpdateQI(orderedQI) + generalizer.Build() log.Info().Msg("Cluster Anonymization") diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index 5a9bd41..b0a88aa 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -22,6 +22,7 @@ type RecordSource interface { Err() error Value() Record QuasiIdentifer() []string + UpdateQI([]string) Sensitive() []string } @@ -59,3 +60,8 @@ type Anonymizer interface { type Debugger interface { Information(Record, Cluster) Record } + +type Analyzer interface { + Add(Record) + CountUniqueValues() map[string]int +} diff --git a/pkg/sigo/stats.go b/pkg/sigo/stats.go index d4192ef..2cdda69 100644 --- a/pkg/sigo/stats.go +++ b/pkg/sigo/stats.go @@ -210,3 +210,13 @@ func BoxMuller() (float64, float64) { return z1, z2 } + +func Unique(values []float64) int { + tmp := make(map[float64]int) + + for _, val := range values { + tmp[val]++ + } + + return len(tmp) +} diff --git a/pkg/sigo/stats_test.go b/pkg/sigo/stats_test.go index a1c8735..44d37d7 100644 --- a/pkg/sigo/stats_test.go +++ b/pkg/sigo/stats_test.go @@ -32,3 +32,16 @@ func TestQuartiles(t *testing.T) { assert.Equal(t, q.Q2, sigo.Median(values)) assert.Equal(t, 5.00, sigo.IQR(values)) } + +func TestUnique(t *testing.T) { + t.Parallel() + + values1 := []float64{12, 10, 5, 6, 9, 10, 4, 5, 10, 12, 9, 6, 4, 3, 9, 10} + values2 := []float64{1, 9, 8, 5, 2, 6, 7, 10, 3, 12, 4, 11} + + res1 := sigo.Unique(values1) + res2 := sigo.Unique(values2) + + assert.Equal(t, 7, res1) + assert.Equal(t, 12, res2) +} From 8f5f6d79227ecdc25e80b0924e70cd5adc8f28bd Mon Sep 17 00:00:00 2001 From: Marie Giraud Date: Wed, 6 Apr 2022 13:08:26 +0000 Subject: [PATCH 2/7] test: add unit tests for analyzer --- pkg/sigo/analyzer.go | 23 +++++++++- pkg/sigo/analyzer_test.go | 97 +++++++++++++++++++++++++++++++++++++++ pkg/sigo/driver.go | 2 +- pkg/sigo/model.go | 1 + 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 pkg/sigo/analyzer_test.go diff --git a/pkg/sigo/analyzer.go b/pkg/sigo/analyzer.go index 00843df..582727c 100644 --- a/pkg/sigo/analyzer.go +++ b/pkg/sigo/analyzer.go @@ -1,3 +1,20 @@ +// Copyright (C) 2022 CGI France +// +// This file is part of SIGO. +// +// SIGO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// SIGO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with SIGO. If not, see . + package sigo import ( @@ -19,6 +36,10 @@ func (f File) Add(r Record) { } } +func (f File) Values(key string) []float64 { + return f.values[key] +} + func (f File) CountUniqueValues() map[string]int { uniques := make(map[string]int) @@ -29,7 +50,7 @@ func (f File) CountUniqueValues() map[string]int { return uniques } -func order(countUnique map[string]int) (qiOrdered []string) { +func Order(countUnique map[string]int) (qiOrdered []string) { switched := make(map[int][]string) slice := []int{} diff --git a/pkg/sigo/analyzer_test.go b/pkg/sigo/analyzer_test.go new file mode 100644 index 0000000..07c1c9e --- /dev/null +++ b/pkg/sigo/analyzer_test.go @@ -0,0 +1,97 @@ +// Copyright (C) 2022 CGI France +// +// This file is part of SIGO. +// +// SIGO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// SIGO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with SIGO. If not, see . +package sigo_test + +import ( + "strings" + "testing" + + "github.com/cgi-fr/jsonline/pkg/jsonline" + "github.com/cgi-fr/sigo/internal/infra" + "github.com/cgi-fr/sigo/pkg/sigo" + "github.com/stretchr/testify/assert" +) + +func TestAddRecordToAnalyzer(t *testing.T) { + t.Parallel() + + sourceText := `{"x":2, "y":1, "foo":"baz"} + {"x":3, "y":2, "foo":"baz"} + {"x":2, "y":3, "foo":"baz"}` + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"}) + assert.Nil(t, err) + + analyzer := sigo.New(source) + + row := jsonline.NewRow() + row.Set("x", 3) + row.Set("y", 2) + row.Set("z", "bar") + record := infra.NewJSONLineRecord(&row, &[]string{"x", "y"}, &[]string{"foo"}) + + analyzer.Add(record) + + assert.Equal(t, analyzer.Values("x"), []float64{3}) + assert.Equal(t, analyzer.Values("y"), []float64{2}) +} + +func TestCountUniqueValues(t *testing.T) { + t.Parallel() + + sourceText := `{"x":4, "y":1} + {"x":3, "y":2} + {"x":4, "y":3}` + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{}) + assert.Nil(t, err) + + analyzer := sigo.New(source) + + qi := []string{"x", "y"} + analyzer.Add(createRow(4, 1, qi)) + analyzer.Add(createRow(3, 2, qi)) + analyzer.Add(createRow(4, 3, qi)) + + res := analyzer.CountUniqueValues() + + assert.Equal(t, 2, res["x"]) + assert.Equal(t, 3, res["y"]) +} + +func TestOrderQI(t *testing.T) { + t.Parallel() + + sourceText := `{"x":1, "y":1} + {"x":2, "y":2} + {"x":1, "y":3}` + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{}) + assert.Nil(t, err) + + analyzer := sigo.New(source) + + qi := []string{"x", "y"} + analyzer.Add(createRow(1, 1, qi)) + analyzer.Add(createRow(2, 2, qi)) + analyzer.Add(createRow(1, 3, qi)) + + unique := analyzer.CountUniqueValues() + res := sigo.Order(unique) + + assert.Equal(t, []string{"y", "x"}, res) +} diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index df9bdd0..2210aa7 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -44,7 +44,7 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") - orderedQI := order(analyzer.CountUniqueValues()) + orderedQI := Order(analyzer.CountUniqueValues()) source.UpdateQI(orderedQI) generalizer.Build() diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index b0a88aa..1b55bea 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -63,5 +63,6 @@ type Debugger interface { type Analyzer interface { Add(Record) + Values(string) []float64 CountUniqueValues() map[string]int } From 29fb7c0507ad3008a822aec3b5c77e720715eab1 Mon Sep 17 00:00:00 2001 From: Marie Giraud Date: Wed, 6 Apr 2022 17:15:46 +0000 Subject: [PATCH 3/7] feat: remove analyzer, add values in KDTree --- pkg/sigo/analyzer.go | 71 --------------------------- pkg/sigo/analyzer_test.go | 97 ------------------------------------- pkg/sigo/driver.go | 5 +- pkg/sigo/kdtree.go | 21 +++++++- pkg/sigo/kdtree_test.go | 16 ++++++ pkg/sigo/model.go | 2 + pkg/sigo/stats.go | 20 ++++++++ pkg/sigo/stats_test.go | 10 ++++ test/suites/02-order-QI.yml | 22 +++++++++ 9 files changed, 91 insertions(+), 173 deletions(-) delete mode 100644 pkg/sigo/analyzer.go delete mode 100644 pkg/sigo/analyzer_test.go create mode 100644 test/suites/02-order-QI.yml diff --git a/pkg/sigo/analyzer.go b/pkg/sigo/analyzer.go deleted file mode 100644 index 582727c..0000000 --- a/pkg/sigo/analyzer.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (C) 2022 CGI France -// -// This file is part of SIGO. -// -// SIGO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// SIGO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with SIGO. If not, see . - -package sigo - -import ( - "sort" -) - -type File struct { - source RecordSource - values map[string][]float64 -} - -func New(s RecordSource) Analyzer { - return File{source: s, values: make(map[string][]float64)} -} - -func (f File) Add(r Record) { - for i, key := range f.source.QuasiIdentifer() { - f.values[key] = append(f.values[key], r.QuasiIdentifer()[i]) - } -} - -func (f File) Values(key string) []float64 { - return f.values[key] -} - -func (f File) CountUniqueValues() map[string]int { - uniques := make(map[string]int) - - for _, key := range f.source.QuasiIdentifer() { - uniques[key] = Unique(f.values[key]) - } - - return uniques -} - -func Order(countUnique map[string]int) (qiOrdered []string) { - switched := make(map[int][]string) - slice := []int{} - - for key, count := range countUnique { - switched[count] = append(switched[count], key) - - slice = append(slice, count) - } - - sort.Sort(sort.Reverse(sort.IntSlice(slice))) - - for _, val := range slice { - qiOrdered = append(qiOrdered, switched[val]...) - delete(switched, val) - } - - return qiOrdered -} diff --git a/pkg/sigo/analyzer_test.go b/pkg/sigo/analyzer_test.go deleted file mode 100644 index 07c1c9e..0000000 --- a/pkg/sigo/analyzer_test.go +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (C) 2022 CGI France -// -// This file is part of SIGO. -// -// SIGO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// SIGO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with SIGO. If not, see . -package sigo_test - -import ( - "strings" - "testing" - - "github.com/cgi-fr/jsonline/pkg/jsonline" - "github.com/cgi-fr/sigo/internal/infra" - "github.com/cgi-fr/sigo/pkg/sigo" - "github.com/stretchr/testify/assert" -) - -func TestAddRecordToAnalyzer(t *testing.T) { - t.Parallel() - - sourceText := `{"x":2, "y":1, "foo":"baz"} - {"x":3, "y":2, "foo":"baz"} - {"x":2, "y":3, "foo":"baz"}` - - source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"}) - assert.Nil(t, err) - - analyzer := sigo.New(source) - - row := jsonline.NewRow() - row.Set("x", 3) - row.Set("y", 2) - row.Set("z", "bar") - record := infra.NewJSONLineRecord(&row, &[]string{"x", "y"}, &[]string{"foo"}) - - analyzer.Add(record) - - assert.Equal(t, analyzer.Values("x"), []float64{3}) - assert.Equal(t, analyzer.Values("y"), []float64{2}) -} - -func TestCountUniqueValues(t *testing.T) { - t.Parallel() - - sourceText := `{"x":4, "y":1} - {"x":3, "y":2} - {"x":4, "y":3}` - - source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{}) - assert.Nil(t, err) - - analyzer := sigo.New(source) - - qi := []string{"x", "y"} - analyzer.Add(createRow(4, 1, qi)) - analyzer.Add(createRow(3, 2, qi)) - analyzer.Add(createRow(4, 3, qi)) - - res := analyzer.CountUniqueValues() - - assert.Equal(t, 2, res["x"]) - assert.Equal(t, 3, res["y"]) -} - -func TestOrderQI(t *testing.T) { - t.Parallel() - - sourceText := `{"x":1, "y":1} - {"x":2, "y":2} - {"x":1, "y":3}` - - source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{}) - assert.Nil(t, err) - - analyzer := sigo.New(source) - - qi := []string{"x", "y"} - analyzer.Add(createRow(1, 1, qi)) - analyzer.Add(createRow(2, 2, qi)) - analyzer.Add(createRow(1, 3, qi)) - - unique := analyzer.CountUniqueValues() - res := sigo.Order(unique) - - assert.Equal(t, []string{"y", "x"}, res) -} diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index 2210aa7..9640ea6 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -26,7 +26,6 @@ import ( func Anonymize(source RecordSource, factory GeneralizerFactory, k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger) error { generalizer := factory.New(k, l, dim, source.QuasiIdentifer()) - analyzer := New(source) count := 0 log.Info().Msg("Reading source") @@ -36,15 +35,15 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, return fmt.Errorf("%w", source.Err()) } - analyzer.Add(source.Value()) generalizer.Add(source.Value()) + generalizer.AddValues(source.Value()) count++ } log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") - orderedQI := Order(analyzer.CountUniqueValues()) + orderedQI := Order(generalizer.CountUniqueValues()) source.UpdateQI(orderedQI) generalizer.Build() diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index c049434..0769099 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -35,12 +35,12 @@ type KDTreeFactory struct{} func (f KDTreeFactory) New(k int, l int, dim int, qi []string) Generalizer { // nolint: exhaustivestruct - tree := KDTree{k: k, l: l, dim: dim, clusterID: make(map[string]int), qi: qi} + tree := KDTree{k: k, l: l, dim: dim, clusterID: make(map[string]int), qi: qi, values: make(map[string][]float64)} root := NewNode(&tree, "root", 0) root.validate() tree.root = &root - return tree + return &tree } type KDTree struct { @@ -50,6 +50,7 @@ type KDTree struct { dim int clusterID map[string]int qi []string + values map[string][]float64 } func NewKDTree(k, l, dim int, clusterID map[string]int) KDTree { @@ -61,6 +62,22 @@ func (t KDTree) Add(r Record) { t.root.Add(r) } +func (t *KDTree) AddValues(r Record) { + for i, key := range t.qi { + t.values[key] = append(t.values[key], r.QuasiIdentifer()[i]) + } +} + +func (t KDTree) CountUniqueValues() map[string]int { + uniques := make(map[string]int) + + for _, key := range t.qi { + uniques[key] = Unique(t.values[key]) + } + + return uniques +} + func (t KDTree) Build() { t.root.build() } diff --git a/pkg/sigo/kdtree_test.go b/pkg/sigo/kdtree_test.go index 86493e3..83e60c2 100644 --- a/pkg/sigo/kdtree_test.go +++ b/pkg/sigo/kdtree_test.go @@ -200,3 +200,19 @@ func TestAddClusterInfos(t *testing.T) { } } } + +func TestCountUniqueValues(t *testing.T) { + t.Parallel() + + qi := []string{"x", "y"} + kdtree := sigo.NewKDTreeFactory().New(3, 1, 2, qi) + + kdtree.AddValues(createRow(4, 1, qi)) + kdtree.AddValues(createRow(3, 2, qi)) + kdtree.AddValues(createRow(4, 3, qi)) + + res := kdtree.CountUniqueValues() + + assert.Equal(t, 2, res["x"]) + assert.Equal(t, 3, res["y"]) +} diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index 1b55bea..87e7f8e 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -44,9 +44,11 @@ type Cluster interface { type Generalizer interface { Add(Record) + AddValues(Record) Clusters() []Cluster String() string Build() + CountUniqueValues() map[string]int } type GeneralizerFactory interface { diff --git a/pkg/sigo/stats.go b/pkg/sigo/stats.go index 2cdda69..bb516b1 100644 --- a/pkg/sigo/stats.go +++ b/pkg/sigo/stats.go @@ -220,3 +220,23 @@ func Unique(values []float64) int { return len(tmp) } + +func Order(countUnique map[string]int) (ordered []string) { + switched := make(map[int][]string) + slice := []int{} + + for key, count := range countUnique { + switched[count] = append(switched[count], key) + + slice = append(slice, count) + } + + sort.Sort(sort.Reverse(sort.IntSlice(slice))) + + for _, val := range slice { + ordered = append(ordered, switched[val]...) + delete(switched, val) + } + + return ordered +} diff --git a/pkg/sigo/stats_test.go b/pkg/sigo/stats_test.go index 44d37d7..128d79a 100644 --- a/pkg/sigo/stats_test.go +++ b/pkg/sigo/stats_test.go @@ -45,3 +45,13 @@ func TestUnique(t *testing.T) { assert.Equal(t, 7, res1) assert.Equal(t, 12, res2) } + +func TestOrderMap(t *testing.T) { + t.Parallel() + + values := map[string]int{"x": 2, "y": 3} + + res := sigo.Order(values) + + assert.Equal(t, []string{"y", "x"}, res) +} diff --git a/test/suites/02-order-QI.yml b/test/suites/02-order-QI.yml new file mode 100644 index 0000000..00eee34 --- /dev/null +++ b/test/suites/02-order-QI.yml @@ -0,0 +1,22 @@ +# Venom Test Suite definition +# Check Venom documentation for more information : https://github.com/ovh/venom + +# name: sigo odrer qi +# testcases: +# - name: sort qi +# steps: +# - script: |- +# sigo -q x,y -i id < Date: Thu, 7 Apr 2022 10:14:43 +0000 Subject: [PATCH 4/7] feat: add analyzer to generalizer --- examples/order.json | 15 +++++++++ internal/infra/source.go | 4 --- pkg/sigo/analyzer.go | 70 +++++++++++++++++++++++++++++++++++++++ pkg/sigo/analyzer_test.go | 27 +++++++++++++++ pkg/sigo/driver.go | 4 --- pkg/sigo/kdtree.go | 40 +++++++++------------- pkg/sigo/kdtree_test.go | 16 --------- pkg/sigo/model.go | 7 ++-- pkg/sigo/stats.go | 20 ----------- pkg/sigo/stats_test.go | 10 ------ 10 files changed, 131 insertions(+), 82 deletions(-) create mode 100644 examples/order.json create mode 100644 pkg/sigo/analyzer.go create mode 100644 pkg/sigo/analyzer_test.go diff --git a/examples/order.json b/examples/order.json new file mode 100644 index 0000000..0315ec5 --- /dev/null +++ b/examples/order.json @@ -0,0 +1,15 @@ +{"x":4, "y":1} +{"x":4, "y":2} +{"x":4, "y":3} +{"x":3, "y":4} +{"x":3, "y":5} +{"x":3, "y":6} +{"x":3, "y":7} +{"x":4, "y":8} +{"x":4, "y":9} +{"x":4, "y":10} +{"x":3, "y":11} +{"x":4, "y":12} +{"x":4, "y":13} +{"x":3, "y":14} +{"x":3, "y":15} diff --git a/internal/infra/source.go b/internal/infra/source.go index b0e1bd9..8b016ee 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -117,10 +117,6 @@ func (s *JSONLineSource) QuasiIdentifer() []string { return s.quasiIdentifers } -func (s *JSONLineSource) UpdateQI(newQI []string) { - s.quasiIdentifers = newQI -} - func (s *JSONLineSource) Sensitive() []string { return s.sensitives } diff --git a/pkg/sigo/analyzer.go b/pkg/sigo/analyzer.go new file mode 100644 index 0000000..c386551 --- /dev/null +++ b/pkg/sigo/analyzer.go @@ -0,0 +1,70 @@ +package sigo + +import ( + "sort" +) + +type Source struct { + qi map[string]int + values map[string][]float64 +} + +func NewAnalyzer(qi []string) Analyzer { + dict := make(map[string]int) + for i, key := range qi { + dict[key] = i + } + + return Source{qi: dict, values: make(map[string][]float64)} +} + +func (s Source) Add(r Record) { + for key, i := range s.qi { + s.values[key] = append(s.values[key], r.QuasiIdentifer()[i]) + } +} + +func (s Source) QI(i int) string { + return s.Order()[i] +} + +func (s Source) CountUniqueValues() map[string]int { + uniques := make(map[string]int) + + for key := range s.qi { + uniques[key] = Unique(s.values[key]) + } + + return uniques +} + +func (s Source) Order() map[int]string { + order := make(map[int]string) + switched := make(map[int][]string) + slice := []int{} + + for key, count := range s.CountUniqueValues() { + switched[count] = append(switched[count], key) + + slice = append(slice, count) + } + + sort.Sort(sort.Reverse(sort.IntSlice(slice))) + + i := 0 + + for _, count := range slice { + for _, qi := range switched[count] { + order[i] = qi + i++ + } + + delete(switched, count) + } + + return order +} + +func (s Source) Dimension(rot int) int { + return s.qi[s.Order()[rot]] +} diff --git a/pkg/sigo/analyzer_test.go b/pkg/sigo/analyzer_test.go new file mode 100644 index 0000000..ff2cd6d --- /dev/null +++ b/pkg/sigo/analyzer_test.go @@ -0,0 +1,27 @@ +package sigo_test + +// func TestCountUniqueValues(t *testing.T) { +// t.Parallel() + +// qi := []string{"x", "y"} +// kdtree := sigo.NewKDTreeFactory().New(3, 1, 2, qi) + +// kdtree.AddValues(createRow(4, 1, qi)) +// kdtree.AddValues(createRow(3, 2, qi)) +// kdtree.AddValues(createRow(4, 3, qi)) + +// res := kdtree.CountUniqueValues() + +// assert.Equal(t, 2, res["x"]) +// assert.Equal(t, 3, res["y"]) +// } + +// func TestOrderMap(t *testing.T) { +// t.Parallel() + +// values := map[string]int{"x": 2, "y": 3} + +// res := sigo.Order(values) + +// assert.Equal(t, []string{"y", "x"}, res) +// } diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index 9640ea6..33ee23f 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -36,16 +36,12 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, } generalizer.Add(source.Value()) - generalizer.AddValues(source.Value()) count++ } log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") - orderedQI := Order(generalizer.CountUniqueValues()) - source.UpdateQI(orderedQI) - generalizer.Build() log.Info().Msg("Cluster Anonymization") diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index 0769099..8c40e7c 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -35,12 +35,12 @@ type KDTreeFactory struct{} func (f KDTreeFactory) New(k int, l int, dim int, qi []string) Generalizer { // nolint: exhaustivestruct - tree := KDTree{k: k, l: l, dim: dim, clusterID: make(map[string]int), qi: qi, values: make(map[string][]float64)} + tree := KDTree{k: k, l: l, dim: dim, clusterID: make(map[string]int), analyzer: NewAnalyzer(qi)} root := NewNode(&tree, "root", 0) root.validate() tree.root = &root - return &tree + return tree } type KDTree struct { @@ -49,8 +49,7 @@ type KDTree struct { root *node dim int clusterID map[string]int - qi []string - values map[string][]float64 + analyzer Analyzer } func NewKDTree(k, l, dim int, clusterID map[string]int) KDTree { @@ -60,22 +59,7 @@ func NewKDTree(k, l, dim int, clusterID map[string]int) KDTree { func (t KDTree) Add(r Record) { t.root.Add(r) -} - -func (t *KDTree) AddValues(r Record) { - for i, key := range t.qi { - t.values[key] = append(t.values[key], r.QuasiIdentifer()[i]) - } -} - -func (t KDTree) CountUniqueValues() map[string]int { - uniques := make(map[string]int) - - for _, key := range t.qi { - uniques[key] = Unique(t.values[key]) - } - - return uniques + t.analyzer.Add(r) } func (t KDTree) Build() { @@ -123,13 +107,18 @@ func (n *node) Add(r Record) { n.cluster = append(n.cluster, r) } +func (n *node) initiateRot() { + n.rot = n.tree.analyzer.Dimension(0) +} + func (n *node) incRot() { - n.rot = (n.rot + 1) % n.tree.dim + dim := (n.rot + 1) % n.tree.dim + n.rot = n.tree.analyzer.Dimension(dim) } func (n *node) build() { log.Debug(). - Str("Dimension", n.tree.qi[n.rot]). + Str("Dimension", n.tree.analyzer.QI(n.rot)). Str("Path", n.clusterPath). Int("Size", len(n.cluster)). Msg("Cluster:") @@ -146,6 +135,8 @@ func (n *node) build() { ) for i := 1; i <= n.tree.dim; i++ { + n.initiateRot() + lower, upper, valide = n.split() if !valide { n.incRot() @@ -196,9 +187,10 @@ func (n *node) split() (node, node, bool) { }) n.pivot = nil - lower := NewNode(n.tree, n.clusterPath+"-l", n.rot+1) + dim := (n.rot + 1) % n.tree.dim + lower := NewNode(n.tree, n.clusterPath+"-l", n.tree.analyzer.Dimension(dim+1)) // n.rot+1 copy(lower.bounds, n.bounds) - upper := NewNode(n.tree, n.clusterPath+"-u", n.rot+1) + upper := NewNode(n.tree, n.clusterPath+"-u", n.tree.analyzer.Dimension(dim+1)) // n.rot+1 copy(upper.bounds, n.bounds) lowerSize := 0 diff --git a/pkg/sigo/kdtree_test.go b/pkg/sigo/kdtree_test.go index 83e60c2..86493e3 100644 --- a/pkg/sigo/kdtree_test.go +++ b/pkg/sigo/kdtree_test.go @@ -200,19 +200,3 @@ func TestAddClusterInfos(t *testing.T) { } } } - -func TestCountUniqueValues(t *testing.T) { - t.Parallel() - - qi := []string{"x", "y"} - kdtree := sigo.NewKDTreeFactory().New(3, 1, 2, qi) - - kdtree.AddValues(createRow(4, 1, qi)) - kdtree.AddValues(createRow(3, 2, qi)) - kdtree.AddValues(createRow(4, 3, qi)) - - res := kdtree.CountUniqueValues() - - assert.Equal(t, 2, res["x"]) - assert.Equal(t, 3, res["y"]) -} diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index 87e7f8e..e2ebca4 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -22,7 +22,6 @@ type RecordSource interface { Err() error Value() Record QuasiIdentifer() []string - UpdateQI([]string) Sensitive() []string } @@ -44,11 +43,9 @@ type Cluster interface { type Generalizer interface { Add(Record) - AddValues(Record) Clusters() []Cluster String() string Build() - CountUniqueValues() map[string]int } type GeneralizerFactory interface { @@ -65,6 +62,8 @@ type Debugger interface { type Analyzer interface { Add(Record) - Values(string) []float64 + QI(i int) string CountUniqueValues() map[string]int + Order() map[int]string + Dimension(int) int } diff --git a/pkg/sigo/stats.go b/pkg/sigo/stats.go index bb516b1..2cdda69 100644 --- a/pkg/sigo/stats.go +++ b/pkg/sigo/stats.go @@ -220,23 +220,3 @@ func Unique(values []float64) int { return len(tmp) } - -func Order(countUnique map[string]int) (ordered []string) { - switched := make(map[int][]string) - slice := []int{} - - for key, count := range countUnique { - switched[count] = append(switched[count], key) - - slice = append(slice, count) - } - - sort.Sort(sort.Reverse(sort.IntSlice(slice))) - - for _, val := range slice { - ordered = append(ordered, switched[val]...) - delete(switched, val) - } - - return ordered -} diff --git a/pkg/sigo/stats_test.go b/pkg/sigo/stats_test.go index 128d79a..44d37d7 100644 --- a/pkg/sigo/stats_test.go +++ b/pkg/sigo/stats_test.go @@ -45,13 +45,3 @@ func TestUnique(t *testing.T) { assert.Equal(t, 7, res1) assert.Equal(t, 12, res2) } - -func TestOrderMap(t *testing.T) { - t.Parallel() - - values := map[string]int{"x": 2, "y": 3} - - res := sigo.Order(values) - - assert.Equal(t, []string{"y", "x"}, res) -} From d5a7b5cbef579c95d16740d250f0dea53c5e87d4 Mon Sep 17 00:00:00 2001 From: Marie Giraud Date: Thu, 7 Apr 2022 12:08:31 +0000 Subject: [PATCH 5/7] test: add unit test for analyzer --- pkg/sigo/analyzer_test.go | 78 ++++++++++++++++++++++++++++++--------- pkg/sigo/kdtree.go | 17 +++------ 2 files changed, 66 insertions(+), 29 deletions(-) diff --git a/pkg/sigo/analyzer_test.go b/pkg/sigo/analyzer_test.go index ff2cd6d..dae56f5 100644 --- a/pkg/sigo/analyzer_test.go +++ b/pkg/sigo/analyzer_test.go @@ -1,27 +1,71 @@ +// Copyright (C) 2022 CGI France +// +// This file is part of SIGO. +// +// SIGO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// SIGO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with SIGO. If not, see . package sigo_test -// func TestCountUniqueValues(t *testing.T) { -// t.Parallel() +import ( + "testing" -// qi := []string{"x", "y"} -// kdtree := sigo.NewKDTreeFactory().New(3, 1, 2, qi) + "github.com/cgi-fr/sigo/pkg/sigo" + "github.com/stretchr/testify/assert" +) -// kdtree.AddValues(createRow(4, 1, qi)) -// kdtree.AddValues(createRow(3, 2, qi)) -// kdtree.AddValues(createRow(4, 3, qi)) +func TestCountUniqueValues(t *testing.T) { + t.Parallel() -// res := kdtree.CountUniqueValues() + qi := []string{"x", "y"} + source := sigo.NewAnalyzer(qi) -// assert.Equal(t, 2, res["x"]) -// assert.Equal(t, 3, res["y"]) -// } + source.Add(createRow(4, 1, qi)) + source.Add(createRow(3, 2, qi)) + source.Add(createRow(4, 3, qi)) -// func TestOrderMap(t *testing.T) { -// t.Parallel() + res := source.CountUniqueValues() -// values := map[string]int{"x": 2, "y": 3} + assert.Equal(t, 2, res["x"]) + assert.Equal(t, 3, res["y"]) +} -// res := sigo.Order(values) +func TestOrderMap(t *testing.T) { + t.Parallel() -// assert.Equal(t, []string{"y", "x"}, res) -// } + qi := []string{"x", "y"} + source := sigo.NewAnalyzer(qi) + + source.Add(createRow(4, 1, qi)) + source.Add(createRow(3, 2, qi)) + source.Add(createRow(4, 3, qi)) + + res := source.Order() + + assert.Equal(t, "y", res[0]) + assert.Equal(t, "x", res[1]) +} + +func TestDimension(t *testing.T) { + t.Parallel() + + qi := []string{"x", "y"} + source := sigo.NewAnalyzer(qi) + + source.Add(createRow(4, 1, qi)) + source.Add(createRow(3, 2, qi)) + source.Add(createRow(4, 3, qi)) + + res := source.Dimension(0) + + assert.Equal(t, 1, res) +} diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index 8c40e7c..f023293 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -107,13 +107,8 @@ func (n *node) Add(r Record) { n.cluster = append(n.cluster, r) } -func (n *node) initiateRot() { - n.rot = n.tree.analyzer.Dimension(0) -} - func (n *node) incRot() { - dim := (n.rot + 1) % n.tree.dim - n.rot = n.tree.analyzer.Dimension(dim) + n.rot = (n.rot + 1) % n.tree.dim } func (n *node) build() { @@ -135,8 +130,6 @@ func (n *node) build() { ) for i := 1; i <= n.tree.dim; i++ { - n.initiateRot() - lower, upper, valide = n.split() if !valide { n.incRot() @@ -182,15 +175,15 @@ func (n *node) Bounds() []bounds { } func (n *node) split() (node, node, bool) { + dim := n.tree.analyzer.Dimension(n.rot) sort.SliceStable(n.cluster, func(i int, j int) bool { - return n.cluster[i].QuasiIdentifer()[n.rot] < n.cluster[j].QuasiIdentifer()[n.rot] + return n.cluster[i].QuasiIdentifer()[dim] < n.cluster[j].QuasiIdentifer()[dim] }) n.pivot = nil - dim := (n.rot + 1) % n.tree.dim - lower := NewNode(n.tree, n.clusterPath+"-l", n.tree.analyzer.Dimension(dim+1)) // n.rot+1 + lower := NewNode(n.tree, n.clusterPath+"-l", n.rot+1) copy(lower.bounds, n.bounds) - upper := NewNode(n.tree, n.clusterPath+"-u", n.tree.analyzer.Dimension(dim+1)) // n.rot+1 + upper := NewNode(n.tree, n.clusterPath+"-u", n.rot+1) copy(upper.bounds, n.bounds) lowerSize := 0 From 6d9a2aea9c34a7d65de39ddecfe3ba50c1ad7ae6 Mon Sep 17 00:00:00 2001 From: Marie Giraud Date: Thu, 7 Apr 2022 18:01:18 +0000 Subject: [PATCH 6/7] test: add test venom --- pkg/sigo/kdtree.go | 2 +- test/suites/02-order-QI.yml | 46 ++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index f023293..0342460 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -191,7 +191,7 @@ func (n *node) split() (node, node, bool) { previous := n.cluster[0] for _, row := range n.cluster { - if lowerSize < len(n.cluster)/2 || row.QuasiIdentifer()[n.rot] == previous.QuasiIdentifer()[n.rot] { + if lowerSize < len(n.cluster)/2 { // || row.QuasiIdentifer()[n.rot] == previous.QuasiIdentifer()[n.rot] { lower.Add(row) previous = row lowerSize++ diff --git a/test/suites/02-order-QI.yml b/test/suites/02-order-QI.yml index 00eee34..82b8973 100644 --- a/test/suites/02-order-QI.yml +++ b/test/suites/02-order-QI.yml @@ -1,22 +1,30 @@ # Venom Test Suite definition # Check Venom documentation for more information : https://github.com/ovh/venom -# name: sigo odrer qi -# testcases: -# - name: sort qi -# steps: -# - script: |- -# sigo -q x,y -i id < Date: Thu, 18 Apr 2024 20:27:30 +0000 Subject: [PATCH 7/7] fix: test result order --- test/suites/04-run-anonymizer.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/suites/04-run-anonymizer.yml b/test/suites/04-run-anonymizer.yml index 7d9203a..0ffd346 100644 --- a/test/suites/04-run-anonymizer.yml +++ b/test/suites/04-run-anonymizer.yml @@ -25,8 +25,8 @@ testcases: {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"saumon"} {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"chouette"} {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"canard"} - {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"loup"} {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"singe"} + {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"loup"} {"fruit":[1,1],"taille":[4,5],"poids":[4,5],"meurtre":1,"natation":[1,1],"course":[1,1],"voltige":[0,0],"animal":"ours"} {"fruit":[1,1],"taille":[4,5],"poids":[4,5],"meurtre":0,"natation":[1,1],"course":[1,1],"voltige":[0,0],"animal":"elephant"} EOF @@ -51,4 +51,3 @@ testcases: assertions: - result.code ShouldEqual 0 - result.systemout ShouldBeEmpty -