Skip to content

Commit

Permalink
nip45: simplify algorithm and hardcode to precision 8.
Browse files Browse the repository at this point in the history
  • Loading branch information
fiatjaf committed Nov 3, 2024
1 parent d9a99db commit ff6cd5c
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 98 deletions.
36 changes: 5 additions & 31 deletions nip45/helpers.go
Original file line number Diff line number Diff line change
@@ -1,23 +1,19 @@
package nip45

import "math"
import (
"math"
)

const two32 = 1 << 32

// eb extracts the bit field [lo, hi) from bits using LSB 0 numbering: bit lo
// is included, bit hi is not, and the field is returned right-aligned.
func eb(bits uint64, hi uint8, lo uint8) uint64 {
	width := hi - lo
	// Mask with the low `width` bits set; a width of 64 or more shifts the 1
	// out entirely, so the subtraction wraps to an all-ones mask.
	fieldMask := uint64(1)<<width - 1
	return (bits >> lo) & fieldMask
}

// linearCounting returns the linear-counting cardinality estimate
// m * ln(m/v), where m is the number of registers and v is how many of them
// are still zero.
//
// v is expected to be non-zero: with v == 0 the division yields +Inf for any
// non-zero m, so callers check v before calling.
func linearCounting(m uint32, v uint32) float64 {
	registers, empty := float64(m), float64(v)
	return registers * math.Log(registers/empty)
}

func clz64(x uint64) uint8 {
func clz56(x uint64) uint8 {
var c uint8
for m := uint64(1 << 63); m&x == 0 && m != 0; m >>= 1 {
for m := uint64(1 << 55); m&x == 0 && m != 0; m >>= 1 {
c++
}
return c
Expand All @@ -32,25 +28,3 @@ func countZeros(s []uint8) uint32 {
}
return c
}

// calculateEstimate returns the raw HyperLogLog cardinality estimate
// alpha(m) * m^2 / sum(2^-s[i]), where m = len(s) and s holds the register
// values.
//
// An empty s returns 0 (the formula would otherwise evaluate 0/0 and produce
// NaN).
func calculateEstimate(s []uint8) float64 {
	if len(s) == 0 {
		return 0
	}

	sum := 0.0
	for _, val := range s {
		// math.Ldexp yields exactly 2^-val for every uint8 value. Computing it
		// as 1.0/float64(uint64(1)<<val) shifts the bit out for val >= 64,
		// divides by zero, and drags the whole estimate down to 0.
		sum += math.Ldexp(1, -int(val))
	}

	m := uint32(len(s))
	fm := float64(m)
	return alpha(m) * fm * fm / sum
}

// alpha returns the constant that scales the raw estimate for m registers:
// fixed values for m of 16, 32 and 64, and 0.7213/(1+1.079/m) for any other m.
func alpha(m uint32) float64 {
	switch m {
	case 16:
		return 0.673
	case 32:
		return 0.697
	case 64:
		return 0.709
	default:
		return 0.7213 / (1 + 1.079/float64(m))
	}
}
82 changes: 28 additions & 54 deletions nip45/hll.go
Original file line number Diff line number Diff line change
@@ -1,29 +1,21 @@
package nip45

import (
"fmt"
"strconv"
"encoding/binary"
"encoding/hex"
)

var threshold = []uint{
10, 20, 40, 80, 220, 400, 900, 1800, 3100,
6500, 11500, 20000, 50000, 120000, 350000,
}

// Everything is hardcoded to use precision 8, i.e. 256 registers.
type HyperLogLog struct {
// registers holds one uint8 value per register; Add and Merge only overwrite
// an entry when the new value is larger than the stored one.
registers []uint8
// precision is compared by Merge, which refuses to combine two sketches whose
// precisions differ.
precision uint8
}

func New(precision uint8) (*HyperLogLog, error) {
if precision > 16 || precision < 4 {
return nil, fmt.Errorf("precision must be between 4 and 16")
}

func New() *HyperLogLog {
// precision is always 8
// the number of registers is always 256 (1<<8)
hll := &HyperLogLog{}
hll.precision = precision
hll.registers = make([]uint8, 1<<precision)
return hll, nil
hll.registers = make([]uint8, 256)
return hll
}

func (hll *HyperLogLog) Clear() {
Expand All @@ -33,70 +25,52 @@ func (hll *HyperLogLog) Clear() {
}

func (hll *HyperLogLog) Add(id string) {
x, _ := strconv.ParseUint(id[32:32+8*2], 16, 64)
x, _ := hex.DecodeString(id[32 : 32+8*2])
j := x[0] // register address (first 8 bits, i.e. first byte)

i := eb(x, 64, 64-hll.precision) // {x31,...,x32-p}
w := x<<hll.precision | 1<<(hll.precision-1) // {x32-p,...,x0}
w := binary.BigEndian.Uint64(x) // number that we will use
zeroBits := clz56(w) + 1 // count zeroes (skip the first byte, so only use 56 bits)

zeroBits := clz64(w) + 1
if zeroBits > hll.registers[i] {
hll.registers[i] = zeroBits
if zeroBits > hll.registers[j] {
hll.registers[j] = zeroBits
}
}

// Merge folds other into hll by keeping, for every register index, the larger
// of the two stored values. It returns an error and leaves hll untouched when
// the two sketches have different precisions; otherwise it returns nil.
func (hll *HyperLogLog) Merge(other *HyperLogLog) error {
	if other.precision != hll.precision {
		return fmt.Errorf("precisions must be equal")
	}

	for idx := range other.registers {
		if incoming := other.registers[idx]; incoming > hll.registers[idx] {
			hll.registers[idx] = incoming
		}
	}
	return nil
}

func (hll *HyperLogLog) Count() uint64 {
m := uint32(len(hll.registers))
v := countZeros(hll.registers)

if v := countZeros(hll.registers); v != 0 {
lc := linearCounting(m, v)
if lc <= float64(threshold[hll.precision-4]) {
if v != 0 {
lc := linearCounting(256 /* nregisters */, v)

if lc <= 220 /* threshold */ {
return uint64(lc)
}
}

est := calculateEstimate(hll.registers)
if est <= float64(len(hll.registers))*5.0 {
if v := countZeros(hll.registers); v != 0 {
return uint64(linearCounting(m, v))
est := hll.calculateEstimate()
if est <= 256 /* nregisters */ *3 {
if v != 0 {
return uint64(linearCounting(256 /* nregisters */, v))
}
}

return uint64(est)
}

func (hll *HyperLogLog) estimateBias(est float64) float64 {
estTable, biasTable := rawEstimateData[hll.precision-4], biasData[hll.precision-4]

if estTable[0] > est {
return biasTable[0]
}

lastEstimate := estTable[len(estTable)-1]
if lastEstimate < est {
return biasTable[len(biasTable)-1]
func (hll HyperLogLog) calculateEstimate() float64 {
sum := 0.0
for _, val := range hll.registers {
sum += 1.0 / float64(uint64(1)<<val) // this is the same as 2^(-val)
}

var i int
for i = 0; i < len(estTable) && estTable[i] < est; i++ {
}

e1, b1 := estTable[i-1], biasTable[i-1]
e2, b2 := estTable[i], biasTable[i]

c := (est - e1) / (e2 - e1)
return b1*(1-c) + b2*c
return 0.7182725932495458 /* alpha for 256 registers */ * 256 /* nregisters */ * 256 /* nregisters */ / sum
}
27 changes: 14 additions & 13 deletions nip45/hll_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"github.com/stretchr/testify/require"
)

func TestHyperLogLog(t *testing.T) {
func TestHyperLogLogBasic(t *testing.T) {
rand := rand.New(rand.NewPCG(1, 0))

for _, count := range []int{
Expand All @@ -18,7 +18,7 @@ func TestHyperLogLog(t *testing.T) {
777, 922, 1000, 1500, 2222, 9999,
13600, 80000, 133333, 200000,
} {
hll, _ := New(8)
hll := New()

for range count {
b := make([]byte, 32)
Expand All @@ -29,9 +29,10 @@ func TestHyperLogLog(t *testing.T) {
hll.Add(id)
}

res100 := int(hll.Count() * 100)
require.Greater(t, res100, count*85, "result too low (actual %d < %d)", hll.Count(), count)
require.Less(t, res100, count*115, "result too high (actual %d > %d)", hll.Count(), count)
c := hll.Count()
res100 := int(c * 100)
require.Greater(t, res100, count*85, "result too low (actual %d < %d)", c, count)
require.Less(t, res100, count*115, "result too high (actual %d > %d)", c, count)
}
}

Expand All @@ -45,8 +46,8 @@ func TestHyperLogLogMerge(t *testing.T) {
777, 922, 1000, 1500, 2222, 9999,
13600, 80000, 133333, 200000,
} {
hllA, _ := New(8)
hllB, _ := New(8)
hllA := New()
hllB := New()

for range count / 2 {
b := make([]byte, 32)
Expand All @@ -65,7 +66,7 @@ func TestHyperLogLogMerge(t *testing.T) {
hllB.Add(id)
}

hll, _ := New(8)
hll := New()
hll.Merge(hllA)
hll.Merge(hllB)

Expand All @@ -76,7 +77,7 @@ func TestHyperLogLogMerge(t *testing.T) {
}

func TestHyperLogLogMergeComplex(t *testing.T) {
rand := rand.New(rand.NewPCG(2, 0))
rand := rand.New(rand.NewPCG(4, 0))

for _, count := range []int{
3, 6, 9, 12, 15, 22, 36, 46, 57,
Expand All @@ -85,9 +86,9 @@ func TestHyperLogLogMergeComplex(t *testing.T) {
777, 922, 1000, 1500, 2222, 9999,
13600, 80000, 133333, 200000,
} {
hllA, _ := New(8)
hllB, _ := New(8)
hllC, _ := New(8)
hllA := New()
hllB := New()
hllC := New()

for range count / 3 {
b := make([]byte, 32)
Expand Down Expand Up @@ -117,7 +118,7 @@ func TestHyperLogLogMergeComplex(t *testing.T) {
hllA.Add(id)
}

hll, _ := New(8)
hll := New()
hll.Merge(hllA)
hll.Merge(hllB)
hll.Merge(hllC)
Expand Down

0 comments on commit ff6cd5c

Please sign in to comment.