diff --git a/examples/order.json b/examples/order.json new file mode 100644 index 0000000..0315ec5 --- /dev/null +++ b/examples/order.json @@ -0,0 +1,15 @@ +{"x":4, "y":1} +{"x":4, "y":2} +{"x":4, "y":3} +{"x":3, "y":4} +{"x":3, "y":5} +{"x":3, "y":6} +{"x":3, "y":7} +{"x":4, "y":8} +{"x":4, "y":9} +{"x":4, "y":10} +{"x":3, "y":11} +{"x":4, "y":12} +{"x":4, "y":13} +{"x":3, "y":14} +{"x":3, "y":15} diff --git a/pkg/sigo/analyzer.go b/pkg/sigo/analyzer.go new file mode 100644 index 0000000..c386551 --- /dev/null +++ b/pkg/sigo/analyzer.go @@ -0,0 +1,70 @@ +package sigo + +import ( + "sort" +) + +type Source struct { + qi map[string]int + values map[string][]float64 +} + +func NewAnalyzer(qi []string) Analyzer { + dict := make(map[string]int) + for i, key := range qi { + dict[key] = i + } + + return Source{qi: dict, values: make(map[string][]float64)} +} + +func (s Source) Add(r Record) { + for key, i := range s.qi { + s.values[key] = append(s.values[key], r.QuasiIdentifer()[i]) + } +} + +func (s Source) QI(i int) string { + return s.Order()[i] +} + +func (s Source) CountUniqueValues() map[string]int { + uniques := make(map[string]int) + + for key := range s.qi { + uniques[key] = Unique(s.values[key]) + } + + return uniques +} + +func (s Source) Order() map[int]string { + order := make(map[int]string) + switched := make(map[int][]string) + slice := []int{} + + for key, count := range s.CountUniqueValues() { + switched[count] = append(switched[count], key) + + slice = append(slice, count) + } + + sort.Sort(sort.Reverse(sort.IntSlice(slice))) + + i := 0 + + for _, count := range slice { + for _, qi := range switched[count] { + order[i] = qi + i++ + } + + delete(switched, count) + } + + return order +} + +func (s Source) Dimension(rot int) int { + return s.qi[s.Order()[rot]] +} diff --git a/pkg/sigo/analyzer_test.go b/pkg/sigo/analyzer_test.go new file mode 100644 index 0000000..dae56f5 --- /dev/null +++ b/pkg/sigo/analyzer_test.go @@ -0,0 +1,71 @@ +// Copyright (C) 
2022 CGI France +// +// This file is part of SIGO. +// +// SIGO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// SIGO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with SIGO. If not, see . +package sigo_test + +import ( + "testing" + + "github.com/cgi-fr/sigo/pkg/sigo" + "github.com/stretchr/testify/assert" +) + +func TestCountUniqueValues(t *testing.T) { + t.Parallel() + + qi := []string{"x", "y"} + source := sigo.NewAnalyzer(qi) + + source.Add(createRow(4, 1, qi)) + source.Add(createRow(3, 2, qi)) + source.Add(createRow(4, 3, qi)) + + res := source.CountUniqueValues() + + assert.Equal(t, 2, res["x"]) + assert.Equal(t, 3, res["y"]) +} + +func TestOrderMap(t *testing.T) { + t.Parallel() + + qi := []string{"x", "y"} + source := sigo.NewAnalyzer(qi) + + source.Add(createRow(4, 1, qi)) + source.Add(createRow(3, 2, qi)) + source.Add(createRow(4, 3, qi)) + + res := source.Order() + + assert.Equal(t, "y", res[0]) + assert.Equal(t, "x", res[1]) +} + +func TestDimension(t *testing.T) { + t.Parallel() + + qi := []string{"x", "y"} + source := sigo.NewAnalyzer(qi) + + source.Add(createRow(4, 1, qi)) + source.Add(createRow(3, 2, qi)) + source.Add(createRow(4, 3, qi)) + + res := source.Dimension(0) + + assert.Equal(t, 1, res) +} diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index a866e63..8ddc3e7 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -35,7 +35,7 @@ type KDTreeFactory struct{} func (f KDTreeFactory) New(k int, l int, dim int, qi []string) Generalizer { // nolint: exhaustivestruct - tree := 
KDTree{k: k, l: l, dim: dim, clusterID: make(map[string]int), qi: qi} + tree := KDTree{k: k, l: l, dim: dim, clusterID: make(map[string]int), analyzer: NewAnalyzer(qi)} root := NewNode(&tree, "root", 0) root.validate() tree.root = &root @@ -49,7 +49,7 @@ type KDTree struct { root *node dim int clusterID map[string]int - qi []string + analyzer Analyzer } func NewKDTree(k, l, dim int, clusterID map[string]int) KDTree { @@ -60,6 +60,7 @@ func NewKDTree(k, l, dim int, clusterID map[string]int) KDTree { // Add add a record to the tree (root node). func (t KDTree) Add(r Record) { t.root.Add(r) + t.analyzer.Add(r) } // Build starts building the tree. @@ -113,7 +114,7 @@ func (n *node) incRot() { // build creates nodes. func (n *node) build() { log.Debug(). - Str("Dimension", n.tree.qi[n.rot]). + Str("Dimension", n.tree.analyzer.QI(n.rot)). Str("Path", n.clusterPath). Int("Size", len(n.cluster)). Msg("Cluster:") @@ -155,8 +156,9 @@ func (n *node) build() { // split creates 2 subnodes by ordering the node and splitting in order to have 2 equal parts // and all elements having the same value in the same subnode. 
func (n *node) split() (node, node, bool) { + dim := n.tree.analyzer.Dimension(n.rot) sort.SliceStable(n.cluster, func(i int, j int) bool { - return n.cluster[i].QuasiIdentifer()[n.rot] < n.cluster[j].QuasiIdentifer()[n.rot] + return n.cluster[i].QuasiIdentifer()[dim] < n.cluster[j].QuasiIdentifer()[dim] }) n.pivot = nil diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index b0cc461..892a5e1 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -58,3 +58,11 @@ type Anonymizer interface { type Debugger interface { Information(Record, Cluster) Record } + +type Analyzer interface { + Add(Record) + QI(i int) string + CountUniqueValues() map[string]int + Order() map[int]string + Dimension(int) int +} diff --git a/pkg/sigo/stats.go b/pkg/sigo/stats.go index 3e3674f..a1d0c35 100644 --- a/pkg/sigo/stats.go +++ b/pkg/sigo/stats.go @@ -247,6 +247,16 @@ func BoxMuller() (float64, float64) { return z1, z2 } +func Unique(values []float64) int { + tmp := make(map[float64]int) + + for _, val := range values { + tmp[val]++ + } + + return len(tmp) +} + // Secure shuffle of the order of the elements. 
func Shuffle(s []float64) []float64 { slice := s diff --git a/pkg/sigo/stats_test.go b/pkg/sigo/stats_test.go index 10daa05..3ae35ef 100644 --- a/pkg/sigo/stats_test.go +++ b/pkg/sigo/stats_test.go @@ -33,6 +33,19 @@ func TestQuartiles(t *testing.T) { assert.Equal(t, 5.00, sigo.IQR(values)) } +func TestUnique(t *testing.T) { + t.Parallel() + + values1 := []float64{12, 10, 5, 6, 9, 10, 4, 5, 10, 12, 9, 6, 4, 3, 9, 10} + values2 := []float64{1, 9, 8, 5, 2, 6, 7, 10, 3, 12, 4, 11} + + res1 := sigo.Unique(values1) + res2 := sigo.Unique(values2) + + assert.Equal(t, 7, res1) + assert.Equal(t, 12, res2) +} + func TestRandInt(t *testing.T) { t.Parallel() diff --git a/test/suites/02-order-QI.yml b/test/suites/02-order-QI.yml new file mode 100644 index 0000000..82b8973 --- /dev/null +++ b/test/suites/02-order-QI.yml @@ -0,0 +1,30 @@ +# Venom Test Suite definition +# Check Venom documentation for more information : https://github.com/ovh/venom + +name: sigo order qi +testcases: + - name: sort qi + steps: + - script: |- + sigo -q x,y -i id <