diff --git a/examples/order.json b/examples/order.json
new file mode 100644
index 0000000..0315ec5
--- /dev/null
+++ b/examples/order.json
@@ -0,0 +1,15 @@
+{"x":4, "y":1}
+{"x":4, "y":2}
+{"x":4, "y":3}
+{"x":3, "y":4}
+{"x":3, "y":5}
+{"x":3, "y":6}
+{"x":3, "y":7}
+{"x":4, "y":8}
+{"x":4, "y":9}
+{"x":4, "y":10}
+{"x":3, "y":11}
+{"x":4, "y":12}
+{"x":4, "y":13}
+{"x":3, "y":14}
+{"x":3, "y":15}
diff --git a/pkg/sigo/analyzer.go b/pkg/sigo/analyzer.go
new file mode 100644
index 0000000..c386551
--- /dev/null
+++ b/pkg/sigo/analyzer.go
@@ -0,0 +1,70 @@
+package sigo
+
+import (
+ "sort"
+)
+
+// Source is the Analyzer implementation returned by NewAnalyzer: it collects
+// the quasi-identifier values of every record passed to Add.
+type Source struct {
+ // qi maps each quasi-identifier name to its position in the list given
+ // to NewAnalyzer.
+ qi map[string]int
+ // values holds, per quasi-identifier name, the values appended by Add.
+ values map[string][]float64
+}
+
+// NewAnalyzer builds an Analyzer for the quasi-identifiers named in qi.
+// Each name is associated with its index in qi; when a name occurs more than
+// once, the last index wins.
+func NewAnalyzer(qi []string) Analyzer {
+	positions := make(map[string]int, len(qi))
+	for idx, name := range qi {
+		positions[name] = idx
+	}
+
+	return Source{
+		qi:     positions,
+		values: make(map[string][]float64),
+	}
+}
+
+// Add records the quasi-identifier values of r: for every known name, the
+// value found at that name's index in r.QuasiIdentifer() is appended to the
+// name's value list.
+//
+// The receiver is a value, but the update is visible to the caller because
+// the values map is shared between copies of Source.
+func (s Source) Add(r Record) {
+	// Evaluate the record's quasi-identifier once rather than once per name.
+	row := r.QuasiIdentifer()
+
+	for name, idx := range s.qi {
+		s.values[name] = append(s.values[name], row[idx])
+	}
+}
+
+// QI returns the name of the quasi-identifier ranked i-th by Order, or the
+// empty string when Order has no entry for i.
+func (s Source) QI(i int) string {
+	ranking := s.Order()
+
+	return ranking[i]
+}
+
+// CountUniqueValues reports, for every quasi-identifier name, how many
+// distinct values have been collected for it so far (as counted by Unique).
+func (s Source) CountUniqueValues() map[string]int {
+	counts := make(map[string]int, len(s.qi))
+	for name := range s.qi {
+		counts[name] = Unique(s.values[name])
+	}
+
+	return counts
+}
+
+// Order ranks the quasi-identifiers by decreasing number of distinct values
+// and returns the ranking as a map from rank (0 first) to name.
+//
+// Names with the same number of distinct values are ranked by their index in
+// the list given to NewAnalyzer. This makes the result deterministic: QI and
+// Dimension each call Order, so a ranking that depended on map iteration
+// order could differ between those calls and between runs.
+func (s Source) Order() map[int]string {
+	counts := s.CountUniqueValues()
+	names := make([]string, 0, len(counts))
+
+	for name := range counts {
+		names = append(names, name)
+	}
+
+	sort.Slice(names, func(a, b int) bool {
+		ca, cb := counts[names[a]], counts[names[b]]
+		if ca != cb {
+			// Most distinct values first.
+			return ca > cb
+		}
+
+		// Tie: fall back to declaration order, which is unique per name.
+		return s.qi[names[a]] < s.qi[names[b]]
+	})
+
+	order := make(map[int]string, len(names))
+	for rank, name := range names {
+		order[rank] = name
+	}
+
+	return order
+}
+
+// Dimension returns the index, within the list given to NewAnalyzer, of the
+// quasi-identifier that Order ranks at position rot. When Order has no entry
+// for rot the looked-up name is empty, so the result is the map's zero value
+// unless an empty name was registered.
+func (s Source) Dimension(rot int) int {
+	name := s.Order()[rot]
+
+	return s.qi[name]
+}
diff --git a/pkg/sigo/analyzer_test.go b/pkg/sigo/analyzer_test.go
new file mode 100644
index 0000000..dae56f5
--- /dev/null
+++ b/pkg/sigo/analyzer_test.go
@@ -0,0 +1,71 @@
+// Copyright (C) 2022 CGI France
+//
+// This file is part of SIGO.
+//
+// SIGO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// SIGO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with SIGO. If not, see <https://www.gnu.org/licenses/>.
+package sigo_test
+
+import (
+ "testing"
+
+ "github.com/cgi-fr/sigo/pkg/sigo"
+ "github.com/stretchr/testify/assert"
+)
+
+// TestCountUniqueValues adds three rows and checks the number of distinct
+// values reported for each quasi-identifier.
+// NOTE(review): createRow is defined elsewhere; presumably its first two
+// arguments are the "x" and "y" values of the row.
+func TestCountUniqueValues(t *testing.T) {
+	t.Parallel()
+
+	names := []string{"x", "y"}
+	analyzer := sigo.NewAnalyzer(names)
+
+	analyzer.Add(createRow(4, 1, names))
+	analyzer.Add(createRow(3, 2, names))
+	analyzer.Add(createRow(4, 3, names))
+
+	counts := analyzer.CountUniqueValues()
+
+	assert.Equal(t, 2, counts["x"])
+	assert.Equal(t, 3, counts["y"])
+}
+
+// TestOrderMap adds three rows and checks that Order ranks "y" before "x".
+// NOTE(review): createRow is defined elsewhere; presumably its first two
+// arguments are the "x" and "y" values of the row.
+func TestOrderMap(t *testing.T) {
+	t.Parallel()
+
+	names := []string{"x", "y"}
+	analyzer := sigo.NewAnalyzer(names)
+
+	analyzer.Add(createRow(4, 1, names))
+	analyzer.Add(createRow(3, 2, names))
+	analyzer.Add(createRow(4, 3, names))
+
+	ranking := analyzer.Order()
+
+	assert.Equal(t, "y", ranking[0])
+	assert.Equal(t, "x", ranking[1])
+}
+
+// TestDimension adds three rows and checks that the first-ranked dimension is
+// index 1, the position of "y" in the quasi-identifier list.
+// NOTE(review): createRow is defined elsewhere; presumably its first two
+// arguments are the "x" and "y" values of the row.
+func TestDimension(t *testing.T) {
+	t.Parallel()
+
+	names := []string{"x", "y"}
+	analyzer := sigo.NewAnalyzer(names)
+
+	analyzer.Add(createRow(4, 1, names))
+	analyzer.Add(createRow(3, 2, names))
+	analyzer.Add(createRow(4, 3, names))
+
+	first := analyzer.Dimension(0)
+
+	assert.Equal(t, 1, first)
+}
diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go
index a866e63..8ddc3e7 100644
--- a/pkg/sigo/kdtree.go
+++ b/pkg/sigo/kdtree.go
@@ -35,7 +35,7 @@ type KDTreeFactory struct{}
func (f KDTreeFactory) New(k int, l int, dim int, qi []string) Generalizer {
// nolint: exhaustivestruct
- tree := KDTree{k: k, l: l, dim: dim, clusterID: make(map[string]int), qi: qi}
+ tree := KDTree{k: k, l: l, dim: dim, clusterID: make(map[string]int), analyzer: NewAnalyzer(qi)}
root := NewNode(&tree, "root", 0)
root.validate()
tree.root = &root
@@ -49,7 +49,7 @@ type KDTree struct {
root *node
dim int
clusterID map[string]int
- qi []string
+ analyzer Analyzer
}
func NewKDTree(k, l, dim int, clusterID map[string]int) KDTree {
@@ -60,6 +60,7 @@ func NewKDTree(k, l, dim int, clusterID map[string]int) KDTree {
// Add add a record to the tree (root node).
func (t KDTree) Add(r Record) {
t.root.Add(r)
+ t.analyzer.Add(r)
}
// Build starts building the tree.
@@ -113,7 +114,7 @@ func (n *node) incRot() {
// build creates nodes.
func (n *node) build() {
log.Debug().
- Str("Dimension", n.tree.qi[n.rot]).
+ Str("Dimension", n.tree.analyzer.QI(n.rot)).
Str("Path", n.clusterPath).
Int("Size", len(n.cluster)).
Msg("Cluster:")
@@ -155,8 +156,9 @@ func (n *node) build() {
// split creates 2 subnodes by ordering the node and splitting in order to have 2 equal parts
// and all elements having the same value in the same subnode.
func (n *node) split() (node, node, bool) {
+ dim := n.tree.analyzer.Dimension(n.rot)
sort.SliceStable(n.cluster, func(i int, j int) bool {
- return n.cluster[i].QuasiIdentifer()[n.rot] < n.cluster[j].QuasiIdentifer()[n.rot]
+ return n.cluster[i].QuasiIdentifer()[dim] < n.cluster[j].QuasiIdentifer()[dim]
})
n.pivot = nil
diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go
index b0cc461..892a5e1 100644
--- a/pkg/sigo/model.go
+++ b/pkg/sigo/model.go
@@ -58,3 +58,11 @@ type Anonymizer interface {
type Debugger interface {
Information(Record, Cluster) Record
}
+
+// Analyzer collects the quasi-identifier values of the records it is given
+// and ranks the quasi-identifiers by their number of distinct values.
+type Analyzer interface {
+ // Add feeds the quasi-identifier values of a record to the analyzer.
+ Add(Record)
+ // QI returns the name of the quasi-identifier ranked i-th by Order.
+ QI(i int) string
+ // CountUniqueValues returns the number of distinct values seen for each
+ // quasi-identifier name.
+ CountUniqueValues() map[string]int
+ // Order maps each rank to a quasi-identifier name; in the Source
+ // implementation rank 0 is the name with the most distinct values.
+ Order() map[int]string
+ // Dimension returns the quasi-identifier index to use for a given rank
+ // (Source returns the name's position in the list passed to NewAnalyzer).
+ Dimension(int) int
+}
diff --git a/pkg/sigo/stats.go b/pkg/sigo/stats.go
index 3e3674f..a1d0c35 100644
--- a/pkg/sigo/stats.go
+++ b/pkg/sigo/stats.go
@@ -247,6 +247,16 @@ func BoxMuller() (float64, float64) {
return z1, z2
}
+// Unique returns the number of distinct values contained in values.
+// A nil or empty slice yields 0.
+func Unique(values []float64) int {
+	seen := make(map[float64]struct{})
+	for _, v := range values {
+		seen[v] = struct{}{}
+	}
+
+	return len(seen)
+}
+
// Secure shuffle of the order of the elements.
func Shuffle(s []float64) []float64 {
slice := s
diff --git a/pkg/sigo/stats_test.go b/pkg/sigo/stats_test.go
index 10daa05..3ae35ef 100644
--- a/pkg/sigo/stats_test.go
+++ b/pkg/sigo/stats_test.go
@@ -33,6 +33,19 @@ func TestQuartiles(t *testing.T) {
assert.Equal(t, 5.00, sigo.IQR(values))
}
+// TestUnique checks Unique on a slice containing repeated values and on a
+// slice whose values are all different.
+func TestUnique(t *testing.T) {
+	t.Parallel()
+
+	withRepeats := []float64{12, 10, 5, 6, 9, 10, 4, 5, 10, 12, 9, 6, 4, 3, 9, 10}
+	allDistinct := []float64{1, 9, 8, 5, 2, 6, 7, 10, 3, 12, 4, 11}
+
+	gotRepeats := sigo.Unique(withRepeats)
+	gotDistinct := sigo.Unique(allDistinct)
+
+	assert.Equal(t, 7, gotRepeats)
+	assert.Equal(t, 12, gotDistinct)
+}
+
func TestRandInt(t *testing.T) {
t.Parallel()
diff --git a/test/suites/02-order-QI.yml b/test/suites/02-order-QI.yml
new file mode 100644
index 0000000..82b8973
--- /dev/null
+++ b/test/suites/02-order-QI.yml
@@ -0,0 +1,30 @@
+# Venom Test Suite definition
+# Check Venom documentation for more information : https://github.com/ovh/venom
+
+name: sigo order qi
+testcases:
+ - name: sort qi
+ steps:
+ - script: |-
+ sigo -q x,y -i id <