Skip to content

Commit 0cc8976

Browse files
authored
Merge pull request #43 from axiomhq/intmap
Improve set operation speeds
2 parents 53e9214 + 4528df8 commit 0cc8976

File tree

5 files changed

+103
-37
lines changed

5 files changed

+103
-37
lines changed

go.mod

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
module github.com/axiomhq/hyperloglog
22

3-
go 1.21
3+
go 1.23
44

5-
toolchain go1.23.0
5+
toolchain go1.23.4
66

77
require (
88
github.com/davecgh/go-spew v1.1.1
@@ -11,6 +11,7 @@ require (
1111
)
1212

1313
require (
14+
github.com/kamstrup/intmap v0.5.0 // indirect
1415
github.com/kr/pretty v0.3.0 // indirect
1516
github.com/pmezard/go-difflib v1.0.0 // indirect
1617
github.com/rogpeppe/go-internal v1.9.0 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
33
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
44
github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc h1:8WFBn63wegobsYAX0YjD+8suexZDga5CctH4CCTx2+8=
55
github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
6+
github.com/kamstrup/intmap v0.5.0 h1:WY7OJQeG7Ujc9zpPTO6PraDGSveG9js9wCPoI2q8wJQ=
7+
github.com/kamstrup/intmap v0.5.0/go.mod h1:gWUVWHKzWj8xpJVFf5GC0O26bWmv3GqdnIX/LMT6Aq4=
68
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
79
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
810
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=

hyperloglog.go

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ type Sketch struct {
1818
p uint8
1919
m uint32
2020
alpha float64
21-
tmpSet set
21+
tmpSet *set
2222
sparseList *compressedList
2323
regs []uint8
2424
}
@@ -45,7 +45,7 @@ func NewSketch(precision uint8, sparse bool) (*Sketch, error) {
4545
alpha: alpha(float64(m)),
4646
}
4747
if sparse {
48-
s.tmpSet = set{}
48+
s.tmpSet = newSet(0)
4949
s.sparseList = newCompressedList(0)
5050
} else {
5151
s.regs = make([]uint8, m)
@@ -65,7 +65,7 @@ func (sk *Sketch) Clone() *Sketch {
6565
}
6666

6767
func (sk *Sketch) maybeToNormal() {
68-
if uint32(len(sk.tmpSet))*100 > sk.m {
68+
if uint32(sk.tmpSet.Len())*100 > sk.m {
6969
sk.mergeSparse()
7070
if uint32(sk.sparseList.Len()) > sk.m {
7171
sk.toNormal()
@@ -90,9 +90,7 @@ func (sk *Sketch) Merge(other *Sketch) error {
9090
}
9191

9292
func (sk *Sketch) mergeSparseSketch(other *Sketch) {
93-
for k := range other.tmpSet {
94-
sk.tmpSet.add(k)
95-
}
93+
sk.tmpSet.Merge(other.tmpSet)
9694
for iter := other.sparseList.Iter(); iter.HasNext(); {
9795
sk.tmpSet.add(iter.Next())
9896
}
@@ -105,10 +103,10 @@ func (sk *Sketch) mergeDenseSketch(other *Sketch) {
105103
}
106104

107105
if other.sparse() {
108-
for k := range other.tmpSet {
106+
other.tmpSet.ForEach(func(k uint32) {
109107
i, r := decodeHash(k, other.p, pp)
110108
sk.insert(i, r)
111-
}
109+
})
112110
for iter := other.sparseList.Iter(); iter.HasNext(); {
113111
i, r := decodeHash(iter.Next(), other.p, pp)
114112
sk.insert(i, r)
@@ -123,7 +121,7 @@ func (sk *Sketch) mergeDenseSketch(other *Sketch) {
123121
}
124122

125123
func (sk *Sketch) toNormal() {
126-
if len(sk.tmpSet) > 0 {
124+
if sk.tmpSet.Len() > 0 {
127125
sk.mergeSparse()
128126
}
129127

@@ -165,17 +163,17 @@ func (sk *Sketch) Estimate() uint64 {
165163
}
166164

167165
func (sk *Sketch) mergeSparse() {
168-
if len(sk.tmpSet) == 0 {
166+
if sk.tmpSet.Len() == 0 {
169167
return
170168
}
171169

172-
keys := make(uint64Slice, 0, len(sk.tmpSet))
173-
for k := range sk.tmpSet {
170+
keys := make(uint64Slice, 0, sk.tmpSet.Len())
171+
sk.tmpSet.ForEach(func(k uint32) {
174172
keys = append(keys, k)
175-
}
173+
})
176174
sort.Sort(keys)
177175

178-
newList := newCompressedList(4*len(sk.tmpSet) + len(sk.sparseList.b))
176+
newList := newCompressedList(4*sk.tmpSet.Len() + sk.sparseList.Len())
179177
for iter, i := sk.sparseList.Iter(), 0; iter.HasNext() || i < len(keys); {
180178
if !iter.HasNext() {
181179
newList.Append(keys[i])
@@ -201,7 +199,7 @@ func (sk *Sketch) mergeSparse() {
201199
}
202200

203201
sk.sparseList = newList
204-
sk.tmpSet = set{}
202+
sk.tmpSet = newSet(0)
205203
}
206204

207205
// MarshalBinary implements the encoding.BinaryMarshaler interface.
@@ -277,7 +275,7 @@ func (sk *Sketch) UnmarshalBinary(data []byte) error {
277275
sparse := data[3] == byte(1)
278276

279277
// Make a new Sketch if the precision doesn't match or if the Sketch was used
280-
if sk.p != p || sk.regs != nil || len(sk.tmpSet) > 0 || (sk.sparseList != nil && sk.sparseList.Len() > 0) {
278+
if sk.p != p || sk.regs != nil || sk.tmpSet.Len() > 0 || (sk.sparseList != nil && sk.sparseList.Len() > 0) {
281279
newh, err := NewSketch(p, sparse)
282280
if err != nil {
283281
return err
@@ -292,14 +290,14 @@ func (sk *Sketch) UnmarshalBinary(data []byte) error {
292290

293291
// Unmarshal the tmp_set.
294292
tssz := binary.BigEndian.Uint32(data[4:8])
295-
sk.tmpSet = make(map[uint32]struct{}, tssz)
293+
sk.tmpSet = newSet(int(tssz))
296294

297295
// We need to unmarshal tssz values in total, and each value requires us
298296
// to read 4 bytes.
299297
tsLastByte := int((tssz * 4) + 8)
300298
for i := 8; i < tsLastByte; i += 4 {
301299
k := binary.BigEndian.Uint32(data[i : i+4])
302-
sk.tmpSet[k] = struct{}{}
300+
sk.tmpSet.add(k)
303301
}
304302

305303
// Unmarshal the sparse Sketch.

hyperloglog_test.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,9 @@ func TestHLL_Error(t *testing.T) {
317317

318318
func TestHLL_Marshal_Unmarshal_Sparse(t *testing.T) {
319319
sk, _ := NewSketch(4, true)
320-
sk.tmpSet = map[uint32]struct{}{26: {}, 40: {}}
320+
sk.tmpSet = newSet(2)
321+
sk.tmpSet.add(26)
322+
sk.tmpSet.add(40)
321323

322324
// Add a bunch of values to the sparse representation.
323325
for i := 0; i < 10; i++ {
@@ -811,3 +813,39 @@ func TestHLL_Add_Out_Of_Order(t *testing.T) {
811813
})
812814
}
813815
}
816+
817+
func benchmarkMerge(b *testing.B, size1, size2 int) {
818+
// Generate data for first sketch
819+
sk1 := New14()
820+
for i := 0; i < size1; i++ {
821+
sk1.Insert([]byte(fmt.Sprintf("a%d", i)))
822+
}
823+
824+
// Generate data for second sketch
825+
sk2 := New14()
826+
for i := 0; i < size2; i++ {
827+
sk2.Insert([]byte(fmt.Sprintf("b%d", i)))
828+
}
829+
830+
b.ResetTimer()
831+
b.ReportAllocs()
832+
833+
for i := 0; i < b.N; i++ {
834+
sk := New14()
835+
sk.Merge(sk1)
836+
sk.Merge(sk2)
837+
}
838+
}
839+
840+
func Benchmark_Merge(b *testing.B) {
841+
sizes := []int{100, 10000, 1000000}
842+
843+
for _, size1 := range sizes {
844+
for _, size2 := range sizes {
845+
name := fmt.Sprintf("size1=%d/size2=%d", size1, size2)
846+
b.Run(name, func(b *testing.B) {
847+
benchmarkMerge(b, size1, size2)
848+
})
849+
}
850+
}
851+
}

sparse.go

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package hyperloglog
22

33
import (
44
"math/bits"
5+
6+
"github.com/kamstrup/intmap"
57
)
68

79
func getIndex(k uint32, p, pp uint8) uint32 {
@@ -34,37 +36,61 @@ func decodeHash(k uint32, p, pp uint8) (uint32, uint8) {
3436
return getIndex(k, p, pp), r
3537
}
3638

37-
type set map[uint32]struct{}
39+
type set struct {
40+
m *intmap.Set[uint32]
41+
}
42+
43+
func newSet(size int) *set {
44+
return &set{m: intmap.NewSet[uint32](size)}
45+
}
46+
47+
func (s *set) ForEach(fn func(v uint32)) {
48+
s.m.ForEach(func(v uint32) bool {
49+
fn(v)
50+
return true
51+
})
52+
}
53+
54+
func (s *set) Merge(other *set) {
55+
other.m.ForEach(func(v uint32) bool {
56+
s.m.Add(v)
57+
return true
58+
})
59+
}
60+
61+
func (s *set) Len() int {
62+
return s.m.Len()
63+
}
3864

39-
func (s set) add(v uint32) bool {
40-
_, ok := s[v]
41-
if ok {
65+
func (s *set) add(v uint32) bool {
66+
if s.m.Has(v) {
4267
return false
4368
}
44-
s[v] = struct{}{}
69+
s.m.Add(v)
4570
return true
4671
}
4772

48-
func (s set) Clone() set {
73+
func (s *set) Clone() *set {
4974
if s == nil {
5075
return nil
5176
}
5277

53-
newS := make(map[uint32]struct{}, len(s))
54-
for k, v := range s {
55-
newS[k] = v
56-
}
57-
return newS
78+
newS := intmap.NewSet[uint32](s.m.Len())
79+
s.m.ForEach(func(v uint32) bool {
80+
newS.Add(v)
81+
return true
82+
})
83+
return &set{m: newS}
5884
}
5985

60-
func (s set) MarshalBinary() (data []byte, err error) {
86+
func (s *set) MarshalBinary() (data []byte, err error) {
6187
// 4 bytes for the size of the set, and 4 bytes for each key.
6288
// list.
63-
data = make([]byte, 0, 4+(4*len(s)))
89+
data = make([]byte, 0, 4+(4*s.m.Len()))
6490

6591
// Length of the set. We only need 32 bits because the size of the set
6692
// couldn't exceed that on 32-bit architectures.
67-
sl := len(s)
93+
sl := s.m.Len()
6894
data = append(data, []byte{
6995
byte(sl >> 24),
7096
byte(sl >> 16),
@@ -73,14 +99,15 @@ func (s set) MarshalBinary() (data []byte, err error) {
7399
}...)
74100

75101
// Marshal each element in the set.
76-
for k := range s {
102+
s.m.ForEach(func(k uint32) bool {
77103
data = append(data, []byte{
78104
byte(k >> 24),
79105
byte(k >> 16),
80106
byte(k >> 8),
81107
byte(k),
82108
}...)
83-
}
109+
return true
110+
})
84111

85112
return data, nil
86113
}

0 commit comments

Comments
 (0)