From 292f2d20f91465b45226fe3ff1f317cc603e5a00 Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Fri, 20 Jan 2023 10:23:30 +0100 Subject: [PATCH] huff0: Speed up compression of short blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tells the compiler to merge loads and stores of nodeElts. name old speed new speed delta Compress4XReuseNone/digits-8 433MB/s ± 1% 434MB/s ± 0% ~ (p=0.393 n=10+10) Compress4XReuseNone/gettysburg-8 245MB/s ± 1% 254MB/s ± 1% +3.55% (p=0.000 n=10+10) Compress4XReuseNone/twain-8 363MB/s ± 0% 363MB/s ± 0% ~ (p=0.360 n=8+10) Compress4XReuseNone/low-ent.10k-8 465MB/s ± 0% 467MB/s ± 0% +0.32% (p=0.008 n=9+10) Compress4XReuseNone/superlow-ent-10k-8 305MB/s ± 2% 304MB/s ± 1% ~ (p=0.143 n=10+10) Compress4XReuseNone/case1-8 13.1MB/s ± 1% 14.6MB/s ± 2% +11.44% (p=0.000 n=10+10) Compress4XReuseNone/case2-8 10.9MB/s ± 1% 12.3MB/s ± 2% +12.28% (p=0.000 n=10+9) Compress4XReuseNone/case3-8 11.5MB/s ± 1% 12.9MB/s ± 2% +12.37% (p=0.000 n=10+10) Compress4XReuseNone/pngdata.001-8 300MB/s ± 0% 298MB/s ± 2% ~ (p=0.762 n=8+10) Compress4XReuseNone/normcount2-8 31.5MB/s ± 1% 33.7MB/s ± 2% +6.87% (p=0.000 n=9+10) Compress4XReuseAllow/digits-8 435MB/s ± 1% 432MB/s ± 2% ~ (p=0.043 n=10+10) Compress4XReuseAllow/gettysburg-8 272MB/s ± 1% 281MB/s ± 1% +3.19% (p=0.000 n=10+10) Compress4XReuseAllow/twain-8 363MB/s ± 0% 361MB/s ± 1% ~ (p=0.150 n=9+10) Compress4XReuseAllow/low-ent.10k-8 469MB/s ± 1% 469MB/s ± 0% ~ (p=0.616 n=10+10) Compress4XReuseAllow/superlow-ent-10k-8 305MB/s ± 0% 306MB/s ± 0% +0.44% (p=0.001 n=9+9) Compress4XReuseAllow/case1-8 15.8MB/s ± 0% 18.0MB/s ± 2% +13.88% (p=0.000 n=9+10) Compress4XReuseAllow/case2-8 13.3MB/s ± 0% 15.4MB/s ± 1% +15.86% (p=0.000 n=9+10) Compress4XReuseAllow/case3-8 14.1MB/s ± 0% 15.8MB/s ± 1% +12.15% (p=0.000 n=10+8) Compress4XReuseAllow/pngdata.001-8 302MB/s ± 1% 304MB/s ± 0% +0.68% (p=0.000 n=9+10) Compress4XReuseAllow/normcount2-8 40.2MB/s 
± 1% 43.8MB/s ± 2% +8.86% (p=0.000 n=10+10) Compress4XReusePrefer/digits-8 436MB/s ± 0% 436MB/s ± 1% ~ (p=0.560 n=9+9) Compress4XReusePrefer/gettysburg-8 422MB/s ± 1% 421MB/s ± 2% ~ (p=0.579 n=10+10) Compress4XReusePrefer/twain-8 363MB/s ± 1% 365MB/s ± 0% ~ (p=0.018 n=10+10) Compress4XReusePrefer/low-ent.10k-8 473MB/s ± 0% 472MB/s ± 0% ~ (p=0.021 n=7+8) Compress4XReusePrefer/superlow-ent-10k-8 312MB/s ± 1% 312MB/s ± 0% ~ (p=0.278 n=10+9) Compress4XReusePrefer/case1-8 134MB/s ± 1% 134MB/s ± 1% ~ (p=0.780 n=9+10) Compress4XReusePrefer/case2-8 122MB/s ± 2% 122MB/s ± 0% ~ (p=1.000 n=10+8) Compress4XReusePrefer/case3-8 129MB/s ± 3% 129MB/s ± 0% ~ (p=0.698 n=10+10) Compress4XReusePrefer/pngdata.001-8 313MB/s ± 1% 313MB/s ± 0% ~ (p=0.481 n=10+10) Compress4XReusePrefer/normcount2-8 179MB/s ± 2% 182MB/s ± 0% +1.96% (p=0.000 n=10+9) Compress4XSizes/digits-100-8 57.6MB/s ± 1% 61.7MB/s ± 1% +7.11% (p=0.000 n=9+10) Compress4XSizes/digits-200-8 100MB/s ± 2% 108MB/s ± 2% +7.90% (p=0.000 n=10+10) Compress4XSizes/digits-500-8 191MB/s ± 1% 201MB/s ± 1% +5.55% (p=0.000 n=10+9) Compress4XSizes/digits-1000-8 273MB/s ± 1% 283MB/s ± 1% +3.75% (p=0.000 n=10+10) Compress4XSizes/digits-5000-8 416MB/s ± 0% 418MB/s ± 1% ~ (p=0.165 n=10+10) Compress4XSizes/digits-10000-8 434MB/s ± 1% 437MB/s ± 1% +0.65% (p=0.002 n=10+10) Compress4XSizes/digits-50000-8 434MB/s ± 0% 434MB/s ± 0% ~ (p=0.604 n=10+9) [Geo mean] 152MB/s 157MB/s +3.33% --- huff0/compress.go | 90 +++++++++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/huff0/compress.go b/huff0/compress.go index d9223a91ef..cdc94856f2 100644 --- a/huff0/compress.go +++ b/huff0/compress.go @@ -484,34 +484,35 @@ func (s *Scratch) buildCTable() error { // Different from reference implementation. 
huffNode0 := s.nodes[0 : huffNodesLen+1] - for huffNode[nonNullRank].count == 0 { + for huffNode[nonNullRank].count() == 0 { nonNullRank-- } lowS := int16(nonNullRank) nodeRoot := nodeNb + lowS - 1 lowN := nodeNb - huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count - huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb) + huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count()) + huffNode[lowS].setParent(nodeNb) + huffNode[lowS-1].setParent(nodeNb) nodeNb++ lowS -= 2 for n := nodeNb; n <= nodeRoot; n++ { - huffNode[n].count = 1 << 30 + huffNode[n].setCount(1 << 30) } // fake entry, strong barrier - huffNode0[0].count = 1 << 31 + huffNode0[0].setCount(1 << 31) // create parents for nodeNb <= nodeRoot { var n1, n2 int16 - if huffNode0[lowS+1].count < huffNode0[lowN+1].count { + if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() { n1 = lowS lowS-- } else { n1 = lowN lowN++ } - if huffNode0[lowS+1].count < huffNode0[lowN+1].count { + if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() { n2 = lowS lowS-- } else { @@ -519,18 +520,19 @@ func (s *Scratch) buildCTable() error { lowN++ } - huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count - huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb) + huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count()) + huffNode0[n1+1].setParent(nodeNb) + huffNode0[n2+1].setParent(nodeNb) nodeNb++ } // distribute weights (unlimited tree height) - huffNode[nodeRoot].nbBits = 0 + huffNode[nodeRoot].setNbBits(0) for n := nodeRoot - 1; n >= startNode; n-- { - huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1 + huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1) } for n := uint16(0); n <= nonNullRank; n++ { - huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1 + huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1) } s.actualTableLog = 
s.setMaxHeight(int(nonNullRank)) maxNbBits := s.actualTableLog @@ -542,7 +544,7 @@ func (s *Scratch) buildCTable() error { var nbPerRank [tableLogMax + 1]uint16 var valPerRank [16]uint16 for _, v := range huffNode[:nonNullRank+1] { - nbPerRank[v.nbBits]++ + nbPerRank[v.nbBits()]++ } // determine stating value per rank { @@ -557,7 +559,7 @@ func (s *Scratch) buildCTable() error { // push nbBits per symbol, symbol order for _, v := range huffNode[:nonNullRank+1] { - s.cTable[v.symbol].nBits = v.nbBits + s.cTable[v.symbol()].nBits = v.nbBits() } // assign value within rank, symbol order @@ -603,12 +605,12 @@ func (s *Scratch) huffSort() { pos := rank[r].current rank[r].current++ prev := nodes[(pos-1)&huffNodesMask] - for pos > rank[r].base && c > prev.count { + for pos > rank[r].base && c > prev.count() { nodes[pos&huffNodesMask] = prev pos-- prev = nodes[(pos-1)&huffNodesMask] } - nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)} + nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n)) } } @@ -617,7 +619,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { huffNode := s.nodes[1 : huffNodesLen+1] //huffNode = huffNode[: huffNodesLen] - largestBits := huffNode[lastNonNull].nbBits + largestBits := huffNode[lastNonNull].nbBits() // early exit : no elt > maxNbBits if largestBits <= maxNbBits { @@ -627,14 +629,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { baseCost := int(1) << (largestBits - maxNbBits) n := uint32(lastNonNull) - for huffNode[n].nbBits > maxNbBits { - totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)) - huffNode[n].nbBits = maxNbBits + for huffNode[n].nbBits() > maxNbBits { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits())) + huffNode[n].setNbBits(maxNbBits) n-- } // n stops at huffNode[n].nbBits <= maxNbBits - for huffNode[n].nbBits == maxNbBits { + for huffNode[n].nbBits() == maxNbBits { n-- } // n end at index of smallest symbol using < maxNbBits @@ -655,10 +657,10 @@ func (s *Scratch) 
setMaxHeight(lastNonNull int) uint8 { { currentNbBits := maxNbBits for pos := int(n); pos >= 0; pos-- { - if huffNode[pos].nbBits >= currentNbBits { + if huffNode[pos].nbBits() >= currentNbBits { continue } - currentNbBits = huffNode[pos].nbBits // < maxNbBits + currentNbBits = huffNode[pos].nbBits() // < maxNbBits rankLast[maxNbBits-currentNbBits] = uint32(pos) } } @@ -675,8 +677,8 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { if lowPos == noSymbol { break } - highTotal := huffNode[highPos].count - lowTotal := 2 * huffNode[lowPos].count + highTotal := huffNode[highPos].count() + lowTotal := 2 * huffNode[lowPos].count() if highTotal <= lowTotal { break } @@ -692,13 +694,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { // this rank is no longer empty rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease] } - huffNode[rankLast[nBitsToDecrease]].nbBits++ + huffNode[rankLast[nBitsToDecrease]].setNbBits(1 + + huffNode[rankLast[nBitsToDecrease]].nbBits()) if rankLast[nBitsToDecrease] == 0 { /* special case, reached largest symbol */ rankLast[nBitsToDecrease] = noSymbol } else { rankLast[nBitsToDecrease]-- - if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease { + if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease { rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */ } } @@ -706,15 +709,15 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { for totalCost < 0 { /* Sometimes, cost correction overshoot */ if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */ - for huffNode[n].nbBits == maxNbBits { + for huffNode[n].nbBits() == maxNbBits { n-- } - huffNode[n+1].nbBits-- + huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1) rankLast[1] = n + 1 totalCost++ continue } - huffNode[rankLast[1]+1].nbBits-- + huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1) rankLast[1]++ 
totalCost++ } @@ -722,9 +725,26 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { return maxNbBits } -type nodeElt struct { - count uint32 - parent uint16 - symbol byte - nbBits uint8 +// A nodeElt packs the fields +// +// count uint32 (bits 0-31) +// parent uint16 (bits 32-47) +// symbol byte (bits 48-55) +// nbBits uint8 (bits 56-63) +// +// into a single uint64, so that the compiler +// always loads and stores entire nodeElts instead of separate fields. +type nodeElt uint64 + +func makeNodeElt(count uint32, symbol byte) nodeElt { + return nodeElt(count) | nodeElt(symbol)<<48 } + +func (e *nodeElt) count() uint32 { return uint32(*e) } +func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) } +func (e *nodeElt) symbol() byte { return byte(*e >> 48) } +func (e *nodeElt) nbBits() uint8 { return uint8(*e >> 56) } + +func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) } +func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 } +func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 }