Skip to content

Commit

Permalink
huff0: Speed up compression of short blocks
Browse files Browse the repository at this point in the history
Packs nodeElt into a single uint64 so that the compiler merges loads and stores of nodeElts instead of accessing their fields separately.

name                                      old speed      new speed      delta
Compress4XReuseNone/digits-8               433MB/s ± 1%   434MB/s ± 0%     ~     (p=0.393 n=10+10)
Compress4XReuseNone/gettysburg-8           245MB/s ± 1%   254MB/s ± 1%   +3.55%  (p=0.000 n=10+10)
Compress4XReuseNone/twain-8                363MB/s ± 0%   363MB/s ± 0%     ~     (p=0.360 n=8+10)
Compress4XReuseNone/low-ent.10k-8          465MB/s ± 0%   467MB/s ± 0%   +0.32%  (p=0.008 n=9+10)
Compress4XReuseNone/superlow-ent-10k-8     305MB/s ± 2%   304MB/s ± 1%     ~     (p=0.143 n=10+10)
Compress4XReuseNone/case1-8               13.1MB/s ± 1%  14.6MB/s ± 2%  +11.44%  (p=0.000 n=10+10)
Compress4XReuseNone/case2-8               10.9MB/s ± 1%  12.3MB/s ± 2%  +12.28%  (p=0.000 n=10+9)
Compress4XReuseNone/case3-8               11.5MB/s ± 1%  12.9MB/s ± 2%  +12.37%  (p=0.000 n=10+10)
Compress4XReuseNone/pngdata.001-8          300MB/s ± 0%   298MB/s ± 2%     ~     (p=0.762 n=8+10)
Compress4XReuseNone/normcount2-8          31.5MB/s ± 1%  33.7MB/s ± 2%   +6.87%  (p=0.000 n=9+10)
Compress4XReuseAllow/digits-8              435MB/s ± 1%   432MB/s ± 2%     ~     (p=0.043 n=10+10)
Compress4XReuseAllow/gettysburg-8          272MB/s ± 1%   281MB/s ± 1%   +3.19%  (p=0.000 n=10+10)
Compress4XReuseAllow/twain-8               363MB/s ± 0%   361MB/s ± 1%     ~     (p=0.150 n=9+10)
Compress4XReuseAllow/low-ent.10k-8         469MB/s ± 1%   469MB/s ± 0%     ~     (p=0.616 n=10+10)
Compress4XReuseAllow/superlow-ent-10k-8    305MB/s ± 0%   306MB/s ± 0%   +0.44%  (p=0.001 n=9+9)
Compress4XReuseAllow/case1-8              15.8MB/s ± 0%  18.0MB/s ± 2%  +13.88%  (p=0.000 n=9+10)
Compress4XReuseAllow/case2-8              13.3MB/s ± 0%  15.4MB/s ± 1%  +15.86%  (p=0.000 n=9+10)
Compress4XReuseAllow/case3-8              14.1MB/s ± 0%  15.8MB/s ± 1%  +12.15%  (p=0.000 n=10+8)
Compress4XReuseAllow/pngdata.001-8         302MB/s ± 1%   304MB/s ± 0%   +0.68%  (p=0.000 n=9+10)
Compress4XReuseAllow/normcount2-8         40.2MB/s ± 1%  43.8MB/s ± 2%   +8.86%  (p=0.000 n=10+10)
Compress4XReusePrefer/digits-8             436MB/s ± 0%   436MB/s ± 1%     ~     (p=0.560 n=9+9)
Compress4XReusePrefer/gettysburg-8         422MB/s ± 1%   421MB/s ± 2%     ~     (p=0.579 n=10+10)
Compress4XReusePrefer/twain-8              363MB/s ± 1%   365MB/s ± 0%     ~     (p=0.018 n=10+10)
Compress4XReusePrefer/low-ent.10k-8        473MB/s ± 0%   472MB/s ± 0%     ~     (p=0.021 n=7+8)
Compress4XReusePrefer/superlow-ent-10k-8   312MB/s ± 1%   312MB/s ± 0%     ~     (p=0.278 n=10+9)
Compress4XReusePrefer/case1-8              134MB/s ± 1%   134MB/s ± 1%     ~     (p=0.780 n=9+10)
Compress4XReusePrefer/case2-8              122MB/s ± 2%   122MB/s ± 0%     ~     (p=1.000 n=10+8)
Compress4XReusePrefer/case3-8              129MB/s ± 3%   129MB/s ± 0%     ~     (p=0.698 n=10+10)
Compress4XReusePrefer/pngdata.001-8        313MB/s ± 1%   313MB/s ± 0%     ~     (p=0.481 n=10+10)
Compress4XReusePrefer/normcount2-8         179MB/s ± 2%   182MB/s ± 0%   +1.96%  (p=0.000 n=10+9)
Compress4XSizes/digits-100-8              57.6MB/s ± 1%  61.7MB/s ± 1%   +7.11%  (p=0.000 n=9+10)
Compress4XSizes/digits-200-8               100MB/s ± 2%   108MB/s ± 2%   +7.90%  (p=0.000 n=10+10)
Compress4XSizes/digits-500-8               191MB/s ± 1%   201MB/s ± 1%   +5.55%  (p=0.000 n=10+9)
Compress4XSizes/digits-1000-8              273MB/s ± 1%   283MB/s ± 1%   +3.75%  (p=0.000 n=10+10)
Compress4XSizes/digits-5000-8              416MB/s ± 0%   418MB/s ± 1%     ~     (p=0.165 n=10+10)
Compress4XSizes/digits-10000-8             434MB/s ± 1%   437MB/s ± 1%   +0.65%  (p=0.002 n=10+10)
Compress4XSizes/digits-50000-8             434MB/s ± 0%   434MB/s ± 0%     ~     (p=0.604 n=10+9)
[Geo mean]                                 152MB/s        157MB/s        +3.33%
  • Loading branch information
greatroar committed Jan 20, 2023
1 parent 4b0abf4 commit 292f2d2
Showing 1 changed file with 55 additions and 35 deletions.
90 changes: 55 additions & 35 deletions huff0/compress.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,53 +484,55 @@ func (s *Scratch) buildCTable() error {
// Different from reference implementation.
huffNode0 := s.nodes[0 : huffNodesLen+1]

for huffNode[nonNullRank].count == 0 {
for huffNode[nonNullRank].count() == 0 {
nonNullRank--
}

lowS := int16(nonNullRank)
nodeRoot := nodeNb + lowS - 1
lowN := nodeNb
huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count
huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb)
huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
huffNode[lowS].setParent(nodeNb)
huffNode[lowS-1].setParent(nodeNb)
nodeNb++
lowS -= 2
for n := nodeNb; n <= nodeRoot; n++ {
huffNode[n].count = 1 << 30
huffNode[n].setCount(1 << 30)
}
// fake entry, strong barrier
huffNode0[0].count = 1 << 31
huffNode0[0].setCount(1 << 31)

// create parents
for nodeNb <= nodeRoot {
var n1, n2 int16
if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n1 = lowS
lowS--
} else {
n1 = lowN
lowN++
}
if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n2 = lowS
lowS--
} else {
n2 = lowN
lowN++
}

huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count
huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb)
huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
huffNode0[n1+1].setParent(nodeNb)
huffNode0[n2+1].setParent(nodeNb)
nodeNb++
}

// distribute weights (unlimited tree height)
huffNode[nodeRoot].nbBits = 0
huffNode[nodeRoot].setNbBits(0)
for n := nodeRoot - 1; n >= startNode; n-- {
huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
for n := uint16(0); n <= nonNullRank; n++ {
huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
s.actualTableLog = s.setMaxHeight(int(nonNullRank))
maxNbBits := s.actualTableLog
Expand All @@ -542,7 +544,7 @@ func (s *Scratch) buildCTable() error {
var nbPerRank [tableLogMax + 1]uint16
var valPerRank [16]uint16
for _, v := range huffNode[:nonNullRank+1] {
nbPerRank[v.nbBits]++
nbPerRank[v.nbBits()]++
}
// determine starting value per rank
{
Expand All @@ -557,7 +559,7 @@ func (s *Scratch) buildCTable() error {

// push nbBits per symbol, symbol order
for _, v := range huffNode[:nonNullRank+1] {
s.cTable[v.symbol].nBits = v.nbBits
s.cTable[v.symbol()].nBits = v.nbBits()
}

// assign value within rank, symbol order
Expand Down Expand Up @@ -603,12 +605,12 @@ func (s *Scratch) huffSort() {
pos := rank[r].current
rank[r].current++
prev := nodes[(pos-1)&huffNodesMask]
for pos > rank[r].base && c > prev.count {
for pos > rank[r].base && c > prev.count() {
nodes[pos&huffNodesMask] = prev
pos--
prev = nodes[(pos-1)&huffNodesMask]
}
nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
}
}

Expand All @@ -617,7 +619,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
huffNode := s.nodes[1 : huffNodesLen+1]
//huffNode = huffNode[: huffNodesLen]

largestBits := huffNode[lastNonNull].nbBits
largestBits := huffNode[lastNonNull].nbBits()

// early exit : no elt > maxNbBits
if largestBits <= maxNbBits {
Expand All @@ -627,14 +629,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
baseCost := int(1) << (largestBits - maxNbBits)
n := uint32(lastNonNull)

for huffNode[n].nbBits > maxNbBits {
totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits))
huffNode[n].nbBits = maxNbBits
for huffNode[n].nbBits() > maxNbBits {
totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
huffNode[n].setNbBits(maxNbBits)
n--
}
// n stops at huffNode[n].nbBits <= maxNbBits

for huffNode[n].nbBits == maxNbBits {
for huffNode[n].nbBits() == maxNbBits {
n--
}
// n ends at the index of the smallest symbol using < maxNbBits
Expand All @@ -655,10 +657,10 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
{
currentNbBits := maxNbBits
for pos := int(n); pos >= 0; pos-- {
if huffNode[pos].nbBits >= currentNbBits {
if huffNode[pos].nbBits() >= currentNbBits {
continue
}
currentNbBits = huffNode[pos].nbBits // < maxNbBits
currentNbBits = huffNode[pos].nbBits() // < maxNbBits
rankLast[maxNbBits-currentNbBits] = uint32(pos)
}
}
Expand All @@ -675,8 +677,8 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
if lowPos == noSymbol {
break
}
highTotal := huffNode[highPos].count
lowTotal := 2 * huffNode[lowPos].count
highTotal := huffNode[highPos].count()
lowTotal := 2 * huffNode[lowPos].count()
if highTotal <= lowTotal {
break
}
Expand All @@ -692,39 +694,57 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
// this rank is no longer empty
rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
}
huffNode[rankLast[nBitsToDecrease]].nbBits++
huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
huffNode[rankLast[nBitsToDecrease]].nbBits())
if rankLast[nBitsToDecrease] == 0 {
/* special case, reached largest symbol */
rankLast[nBitsToDecrease] = noSymbol
} else {
rankLast[nBitsToDecrease]--
if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease {
if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
}
}
}

for totalCost < 0 { /* Sometimes, cost correction overshoot */
if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
for huffNode[n].nbBits == maxNbBits {
for huffNode[n].nbBits() == maxNbBits {
n--
}
huffNode[n+1].nbBits--
huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
rankLast[1] = n + 1
totalCost++
continue
}
huffNode[rankLast[1]+1].nbBits--
huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
rankLast[1]++
totalCost++
}
}
return maxNbBits
}

type nodeElt struct {
count uint32
parent uint16
symbol byte
nbBits uint8
// nodeElt packs the four fields of a Huffman tree node into one 64-bit word:
//
//	bits  0-31  count  (uint32)
//	bits 32-47  parent (uint16)
//	bits 48-55  symbol (byte)
//	bits 56-63  nbBits (uint8)
//
// Keeping everything in a single integer means the compiler always loads
// and stores a whole nodeElt rather than touching the fields one by one.
type nodeElt uint64

// makeNodeElt returns a nodeElt holding count and symbol, with parent and
// nbBits both zero.
func makeNodeElt(count uint32, symbol byte) nodeElt {
	return nodeElt(symbol)<<48 | nodeElt(count)
}

// count returns the count field (bits 0-31).
func (e *nodeElt) count() uint32 {
	return uint32(*e & 0xffffffff)
}

// parent returns the parent field (bits 32-47).
func (e *nodeElt) parent() uint16 {
	return uint16(*e >> 32 & 0xffff)
}

// symbol returns the symbol field (bits 48-55).
func (e *nodeElt) symbol() byte {
	return byte(*e >> 48 & 0xff)
}

// nbBits returns the nbBits field (bits 56-63).
func (e *nodeElt) nbBits() uint8 {
	return uint8(*e >> 56)
}

// setCount overwrites the count field and leaves the other fields untouched.
func (e *nodeElt) setCount(c uint32) {
	*e = *e&^0xffffffff | nodeElt(c)
}

// setParent overwrites the parent field and leaves the other fields untouched.
// p is reinterpreted as a uint16 before being stored.
func (e *nodeElt) setParent(p int16) {
	*e = *e&^(0xffff<<32) | nodeElt(uint16(p))<<32
}

// setNbBits overwrites the nbBits field and leaves the other fields untouched.
func (e *nodeElt) setNbBits(n uint8) {
	*e = *e&^(0xff<<56) | nodeElt(n)<<56
}

0 comments on commit 292f2d2

Please sign in to comment.