From 8b1c2b8c4fa97028a2938a4ddeb0d0dcc480acaa Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 21 Nov 2022 12:19:46 +0100 Subject: [PATCH 1/4] Avoid copy on Leopard GF8 WIP: Still a subtle bug Speed is encouraging though: ``` BenchmarkEncodeLeopard50x20x1M-32 100 10201946 ns/op 7194.74 MB/s 3403 B/op 3 allocs/op BenchmarkEncodeLeopard50x20x1M-32 153 7816916 ns/op 9389.93 MB/s 5016 B/op 35 allocs/op ``` --- _gen/gf8.go | 107 +- galois_amd64.go | 81 ++ galois_arm64.go | 5 + galois_gen_amd64.go | 96 ++ galois_gen_amd64.s | 2660 +++++++++++++++++++++++++++++++++++++++++-- galois_noasm.go | 5 + galois_ppc64le.go | 5 + leopard8.go | 87 +- 8 files changed, 2914 insertions(+), 132 deletions(-) diff --git a/_gen/gf8.go b/_gen/gf8.go index a542a336..d1d4dbf2 100644 --- a/_gen/gf8.go +++ b/_gen/gf8.go @@ -119,9 +119,14 @@ func genGF8() { x := [8]int{} for skipMask := range x[:] { - { + for _, withDst := range []bool{false, true} { var suffix = "avx2_" + fmt.Sprint(skipMask) - TEXT("ifftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 *[2*16]uint8)")) + dstString := "" + if withDst { + dstString = "dst, " + suffix = "dst_" + suffix + } + TEXT("ifftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(%swork [][]byte, dist int, t01, t23, t02 *[2*16]uint8)", dstString)) Pragma("noescape") var t01, t23, t02 table256 // Load and expand tables @@ -153,6 +158,11 @@ func genGF8() { var work [4]reg.GPVirtual workTable := Load(Param("work").Base(), GP64()) // &work[0] + var dst [4]reg.GPVirtual + dstTable := GP64() + if withDst { + Load(Param("dst").Base(), dstTable) // &dst[0] + } bytes := GP64() MOVQ(Mem{Base: workTable, Disp: 8}, bytes) @@ -162,6 +172,10 @@ func genGF8() { work[i] = GP64() // work[i] = &workTable[dist*i] MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) + if withDst { + dst[i] = GP64() + MOVQ(Mem{Base: dstTable, Index: offset, Scale: 1}, dst[i]) + } if i < len(work)-1 { ADDQ(dist, offset) } @@ -231,8 +245,14 @@ func genGF8() { // Store + Next loop: for i := range work { - VMOVDQU(workReg[i], Mem{Base: work[i], Disp: 0}) - VMOVDQU(workReg2[i], Mem{Base: work[i], Disp: 32}) + if withDst { + VMOVDQU(workReg[i], Mem{Base: dst[i], Disp: 0}) + VMOVDQU(workReg2[i], Mem{Base: dst[i], Disp: 32}) + ADDQ(U8(64), dst[i]) + } else { + VMOVDQU(workReg[i], Mem{Base: work[i], Disp: 0}) + VMOVDQU(workReg2[i], Mem{Base: work[i], Disp: 32}) + } ADDQ(U8(64), work[i]) } @@ -242,9 +262,14 @@ func genGF8() { VZEROUPPER() RET() } - { + for _, withDst := range []bool{false, true} { var suffix = "avx2_" + fmt.Sprint(skipMask) - TEXT("fftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 *[2*16]uint8)")) + dstString := "" + if withDst { + dstString = "dst, " + suffix = "dst_" + suffix + } + TEXT("fftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(%swork [][]byte, dist int, t01, t23, t02 *[2*16]uint8)", dstString)) Pragma("noescape") var t01, t23, t02 table256 // Load and expand tables @@ -280,9 +305,15 @@ func genGF8() { dist := Load(Param("dist"), GP64()) var work [4]reg.GPVirtual + var dst [4]reg.GPVirtual + workTable := Load(Param("work").Base(), GP64()) // &work[0] bytes := GP64() MOVQ(Mem{Base: workTable, Disp: 8}, bytes) + dstTable := GP64() + if withDst { + Load(Param("dst").Base(), dstTable) // &dst[0] + } offset := GP64() XORQ(offset, offset) @@ -290,6 +321,10 @@ func genGF8() { work[i] = GP64() // work[i] = &workTable[dist*i] MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) + if withDst { + dst[i] = GP64() + 
MOVQ(Mem{Base: dstTable, Index: offset, Scale: 1}, dst[i]) + } if i < len(work)-1 { ADDQ(dist, offset) } @@ -356,8 +391,14 @@ func genGF8() { // Store + Next loop: for i := range work { - VMOVDQU(workReg[i], Mem{Base: work[i], Disp: 0}) - VMOVDQU(workReg2[i], Mem{Base: work[i], Disp: 32}) + if withDst { + VMOVDQU(workReg[i], Mem{Base: dst[i], Disp: 0}) + VMOVDQU(workReg2[i], Mem{Base: dst[i], Disp: 32}) + ADDQ(U8(64), dst[i]) + } else { + VMOVDQU(workReg[i], Mem{Base: work[i], Disp: 0}) + VMOVDQU(workReg2[i], Mem{Base: work[i], Disp: 32}) + } ADDQ(U8(64), work[i]) } @@ -371,9 +412,14 @@ func genGF8() { // GFNI for skipMask := range x[:] { - { + for _, withDst := range []bool{false, true} { var suffix = "gfni_" + fmt.Sprint(skipMask) - TEXT("ifftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 uint64)")) + dstString := "" + if withDst { + dstString = "dst, " + suffix = "dst_" + suffix + } + TEXT("ifftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(%swork [][]byte, dist int, t01, t23, t02 uint64)", dstString)) Pragma("noescape") var t01, t23, t02 table512 = ZMM(), ZMM(), ZMM() // Load and expand tables @@ -394,6 +440,11 @@ func genGF8() { var work [4]reg.GPVirtual workTable := Load(Param("work").Base(), GP64()) // &work[0] + var dst [4]reg.GPVirtual + dstTable := GP64() + if withDst { + Load(Param("dst").Base(), dstTable) // &dst[0] + } bytes := GP64() MOVQ(Mem{Base: workTable, Disp: 8}, bytes) @@ -403,6 +454,10 @@ func genGF8() { work[i] = GP64() // work[i] = &workTable[dist*i] MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) + if withDst { + dst[i] = GP64() + MOVQ(Mem{Base: dstTable, Index: offset, Scale: 1}, dst[i]) + } if i < len(work)-1 { ADDQ(dist, offset) } @@ -442,7 +497,12 @@ func genGF8() { // Store + Next loop: for i := range work { - VMOVDQU64(workReg[i], Mem{Base: work[i], Disp: 0}) + if withDst { + VMOVDQU64(workReg[i], Mem{Base: dst[i], Disp: 0}) + ADDQ(U8(64), dst[i]) + } else { + VMOVDQU64(workReg[i], Mem{Base: work[i], Disp: 0}) + } ADDQ(U8(64), work[i]) } @@ -452,9 +512,14 @@ func genGF8() { VZEROUPPER() RET() } - { + for _, withDst := range []bool{false, true} { var suffix = "gfni_" + fmt.Sprint(skipMask) - TEXT("fftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 uint64)")) + dstString := "" + if withDst { + dstString = "dst, " + suffix = "dst_" + suffix + } + TEXT("fftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(%swork [][]byte, dist int, t01, t23, t02 uint64)", dstString)) Pragma("noescape") var t01, t23, t02 table512 = ZMM(), ZMM(), ZMM() // Load and expand tables @@ -475,6 +540,11 @@ func genGF8() { var work [4]reg.GPVirtual workTable := Load(Param("work").Base(), GP64()) // &work[0] + var dst [4]reg.GPVirtual + dstTable := GP64() + if withDst { + Load(Param("dst").Base(), dstTable) // &dst[0] + } bytes := GP64() MOVQ(Mem{Base: workTable, Disp: 8}, bytes) @@ -484,6 +554,10 @@ func genGF8() { work[i] = GP64() // work[i] = &workTable[dist*i] MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) + if withDst { + dst[i] = GP64() + MOVQ(Mem{Base: dstTable, Index: offset, Scale: 1}, dst[i]) + } if i < len(work)-1 { ADDQ(dist, offset) } @@ -522,7 +596,12 @@ func genGF8() { // Store + Next loop: for i := range work { - VMOVDQU64(workReg[i], Mem{Base: work[i], Disp: 0}) + if withDst { + VMOVDQU64(workReg[i], Mem{Base: dst[i], Disp: 0}) + ADDQ(U8(64), dst[i]) + } else { + VMOVDQU64(workReg[i], Mem{Base: work[i], Disp: 0}) + } ADDQ(U8(64), work[i]) } diff --git a/galois_amd64.go 
b/galois_amd64.go index 120ccaee..76a3ebd7 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -308,6 +308,87 @@ func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *optio ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) } +// 4-way butterfly +func ifftDIT48Dst(dst, work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + if len(work[0]) == 0 { + return + } + + if o.useGFNI { + // Note that these currently require that length is multiple of 64. + t01 := gf2p811dMulMatrices[log_m01] + t23 := gf2p811dMulMatrices[log_m23] + t02 := gf2p811dMulMatrices[log_m02] + if log_m01 == modulus8 { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_dst_gfni_7(dst, work, dist*24, t01, t23, t02) + } else { + ifftDIT48_dst_gfni_3(dst, work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_dst_gfni_5(dst, work, dist*24, t01, t23, t02) + } else { + ifftDIT48_dst_gfni_1(dst, work, dist*24, t01, t23, t02) + } + } + } else { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_dst_gfni_6(dst, work, dist*24, t01, t23, t02) + } else { + ifftDIT48_dst_gfni_2(dst, work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_dst_gfni_4(dst, work, dist*24, t01, t23, t02) + } else { + ifftDIT48_dst_gfni_0(dst, work, dist*24, t01, t23, t02) + } + } + } + return + } + if o.useAVX2 { + // Note that these currently require that length is multiple of 64. + t01 := &multiply256LUT8[log_m01] + t23 := &multiply256LUT8[log_m23] + t02 := &multiply256LUT8[log_m02] + if log_m01 == modulus8 { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_dst_avx2_7(dst, work, dist*24, t01, t23, t02) + } else { + ifftDIT48_dst_avx2_3(dst, work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_dst_avx2_5(dst, work, dist*24, t01, t23, t02) + } else { + ifftDIT48_dst_avx2_1(dst, work, dist*24, t01, t23, t02) + } + } + } else { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_dst_avx2_6(dst, work, dist*24, t01, t23, t02) + } else { + ifftDIT48_dst_avx2_2(dst, work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_dst_avx2_4(dst, work, dist*24, t01, t23, t02) + } else { + ifftDIT48_dst_avx2_0(dst, work, dist*24, t01, t23, t02) + } + } + } + return + } + ifftDIT4DstRef8(dst, work, dist, log_m01, log_m23, log_m02, o) +} + func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { if len(work[0]) == 0 { return diff --git a/galois_arm64.go b/galois_arm64.go index 9ab27941..dc29db6e 100644 --- a/galois_arm64.go +++ b/galois_arm64.go @@ -75,6 +75,11 @@ func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *optio ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) } +// 4-way butterfly with separate destination +func ifftDIT48Dst(dst, work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + ifftDIT4DstRef8(dst, work, dist, log_m01, log_m23, log_m02, o) +} + // 4-way butterfly func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 5f53c3b4..2fe1f948 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -2659,95 +2659,191 @@ func fftDIT28_avx2(x []byte, y []byte, table *[32]uint8) //go:noescape func ifftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_dst_avx2_0(dst 
[][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func fftDIT48_dst_avx2_0(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_dst_avx2_1(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func fftDIT48_dst_avx2_1(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_dst_avx2_2(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func fftDIT48_dst_avx2_2(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_dst_avx2_3(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func fftDIT48_dst_avx2_3(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_dst_avx2_4(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func fftDIT48_dst_avx2_4(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_dst_avx2_5(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func fftDIT48_dst_avx2_5(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_dst_avx2_6(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func fftDIT48_dst_avx2_6(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_dst_avx2_7(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape 
+func fftDIT48_dst_avx2_7(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func ifftDIT48_dst_gfni_0(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func fftDIT48_dst_gfni_0(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func ifftDIT48_dst_gfni_1(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func fftDIT48_dst_gfni_1(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func ifftDIT48_dst_gfni_2(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func fftDIT48_dst_gfni_2(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func ifftDIT48_dst_gfni_3(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func fftDIT48_dst_gfni_3(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func ifftDIT48_dst_gfni_4(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func fftDIT48_dst_gfni_4(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func ifftDIT48_dst_gfni_5(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func fftDIT48_dst_gfni_5(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func ifftDIT48_dst_gfni_6(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func fftDIT48_dst_gfni_6(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +//go:noescape +func ifftDIT48_dst_gfni_7(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + //go:noescape func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_dst_gfni_7(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) 
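
As a rough reference for what the new `_dst_` kernels compute (judging from the loads and stores in the generated assembly below): they read the four `work` shards, run the usual 4-way IFFT/FFT butterfly, and write the results into the corresponding `dst` shards instead of back into `work`. A minimal Go sketch of the IFFT-side semantics, assuming stand-in helpers `mulAdd8` and `xorBytes` for the GF(2^8) multiply-add and XOR (these are illustrative names, not the library's actual identifiers), and with `dist` used as a shard index rather than the byte-scaled value the assembly receives:

```go
// Sketch only: the semantics the ifftDIT48_dst_* kernels implement, in plain Go.
// The real kernels stream directly from the work loads to the dst stores; this
// sketch copies instead so the work shards are visibly left unmodified.
package sketch

type ffe8 uint8

const modulus8 = ffe8(255)

// xorBytes: x[i] ^= y[i].
func xorBytes(x, y []byte) {
	for i := range x {
		x[i] ^= y[i]
	}
}

// mulAdd8 is a hypothetical helper: x[i] ^= mulLog(y[i], logM) over GF(2^8).
func mulAdd8(x, y []byte, logM ffe8) {
	// Table-driven GF(2^8) multiply-add elided for brevity.
}

func ifftDIT4DstSketch(dst, work [][]byte, dist int, logM01, logM23, logM02 ffe8) {
	t := make([][]byte, 4)
	for i := range t {
		t[i] = append([]byte(nil), work[i*dist]...)
	}

	// First layer: butterflies on (0,1) and (2,3); a modulus8 log skips the multiply.
	xorBytes(t[1], t[0])
	if logM01 != modulus8 {
		mulAdd8(t[0], t[1], logM01)
	}
	xorBytes(t[3], t[2])
	if logM23 != modulus8 {
		mulAdd8(t[2], t[3], logM23)
	}

	// Second layer: butterflies on (0,2) and (1,3).
	xorBytes(t[2], t[0])
	xorBytes(t[3], t[1])
	if logM02 != modulus8 {
		mulAdd8(t[0], t[2], logM02)
		mulAdd8(t[1], t[3], logM02)
	}

	// Results go to dst; work is untouched.
	for i := range t {
		copy(dst[i*dist], t[i])
	}
}
```

The eight `_dst_` variants per instruction set correspond to the three skip bits (log_m01/log_m23/log_m02 equal to modulus8), mirroring the existing in-place kernels, which is why the dispatch in galois_amd64.go selects among `ifftDIT48_dst_*_0` through `_7`.
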
diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index c29ae526..a26e2220 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -102713,6 +102713,140 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_avx2_0(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_dst_avx2_0(SB), NOSPLIT, $0-80 + MOVQ t01+56(FP), AX + VBROADCASTI128 16(AX), Y0 + MOVQ t23+64(FP), CX + VBROADCASTI128 (CX), Y1 + VBROADCASTI128 16(CX), Y2 + MOVQ t02+72(FP), CX + VBROADCASTI128 (CX), Y3 + VBROADCASTI128 16(CX), Y4 + MOVQ dist+48(FP), CX + MOVQ work_base+24(FP), DX + MOVQ dst_base+0(FP), BX + MOVQ 8(DX), SI + XORQ DI, DI + MOVQ (DX)(DI*1), R8 + MOVQ (BX)(DI*1), R9 + ADDQ CX, DI + MOVQ (DX)(DI*1), R10 + MOVQ (BX)(DI*1), R11 + ADDQ CX, DI + MOVQ (DX)(DI*1), R12 + MOVQ (BX)(DI*1), R13 + ADDQ CX, DI + MOVQ (DX)(DI*1), CX + MOVQ (BX)(DI*1), DX + MOVQ $0x0000000f, BX + MOVQ BX, X5 + VPBROADCASTB X5, Y5 + +loop: + VMOVDQU (R8), Y6 + VMOVDQU (R10), Y7 + VMOVDQU 32(R8), Y8 + VMOVDQU 32(R10), Y9 + VPXOR Y7, Y6, Y7 + VPXOR Y9, Y8, Y9 + VBROADCASTI128 (AX), Y10 + + // LEO_MULADD_256 + VPAND Y7, Y5, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y10, Y11 + VPAND Y12, Y5, Y12 + VPSHUFB Y12, Y0, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + + // LEO_MULADD_256 + VPAND Y9, Y5, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y10, Y11 + VPAND Y12, Y5, Y12 + VPSHUFB Y12, Y0, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU (R12), Y10 + VMOVDQU (CX), Y11 + VMOVDQU 32(R12), Y12 + VMOVDQU 32(CX), Y13 + VPXOR Y10, Y11, Y11 + VPXOR Y12, Y13, Y13 + + // LEO_MULADD_256 + VPAND Y11, Y5, Y14 + VPSRLQ $0x04, Y11, Y15 + VPSHUFB Y14, Y1, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y2, Y15 + XOR3WAY( $0x00, Y14, Y15, Y10) + + // LEO_MULADD_256 + VPAND Y13, Y5, Y14 + VPSRLQ $0x04, Y13, Y15 + VPSHUFB Y14, Y1, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y2, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + VPXOR Y9, Y13, Y13 + + // LEO_MULADD_256 + VPAND Y10, Y5, Y14 + VPSRLQ $0x04, Y10, Y15 + VPSHUFB Y14, Y3, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y4, Y15 + XOR3WAY( $0x00, Y14, Y15, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y5, Y14 + VPSRLQ $0x04, Y11, Y15 + VPSHUFB Y14, Y3, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y4, Y15 + XOR3WAY( $0x00, Y14, Y15, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y5, Y14 + VPSRLQ $0x04, Y12, Y15 + VPSHUFB Y14, Y3, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y4, Y15 + XOR3WAY( $0x00, Y14, Y15, Y8) + + // LEO_MULADD_256 + VPAND Y13, Y5, Y14 + VPSRLQ $0x04, Y13, Y15 + VPSHUFB Y14, Y3, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y4, Y15 + XOR3WAY( $0x00, Y14, Y15, Y9) + VMOVDQU Y6, (R9) + VMOVDQU Y8, 32(R9) + ADDQ $0x40, R9 + ADDQ $0x40, R8 + VMOVDQU Y7, (R11) + VMOVDQU Y9, 32(R11) + ADDQ $0x40, R11 + ADDQ $0x40, R10 + VMOVDQU Y10, (R13) + VMOVDQU Y12, 32(R13) + ADDQ $0x40, R13 + ADDQ $0x40, R12 + VMOVDQU Y11, (DX) + VMOVDQU Y13, 32(DX) + ADDQ $0x40, DX + ADDQ $0x40, CX + SUBQ $0x40, SI + JA loop + VZEROUPPER + RET + // func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT48_avx2_0(SB), NOSPLIT, $0-56 @@ -102838,6 +102972,140 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_avx2_0(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_dst_avx2_0(SB), NOSPLIT, $0-80 + MOVQ t01+56(FP), AX + VBROADCASTI128 16(AX), Y0 + 
MOVQ t23+64(FP), CX + VBROADCASTI128 16(CX), Y1 + MOVQ t02+72(FP), DX + VBROADCASTI128 (DX), Y2 + VBROADCASTI128 16(DX), Y3 + MOVQ dist+48(FP), DX + MOVQ work_base+24(FP), BX + MOVQ 8(BX), SI + MOVQ dst_base+0(FP), DI + XORQ R8, R8 + MOVQ (BX)(R8*1), R9 + MOVQ (DI)(R8*1), R10 + ADDQ DX, R8 + MOVQ (BX)(R8*1), R11 + MOVQ (DI)(R8*1), R12 + ADDQ DX, R8 + MOVQ (BX)(R8*1), R13 + MOVQ (DI)(R8*1), R14 + ADDQ DX, R8 + MOVQ (BX)(R8*1), DX + MOVQ (DI)(R8*1), BX + MOVQ $0x0000000f, DI + MOVQ DI, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (R13), Y9 + VMOVDQU 32(R13), Y10 + VMOVDQU (R11), Y7 + VMOVDQU 32(R11), Y8 + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + VBROADCASTI128 (AX), Y13 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y14 + VPSRLQ $0x04, Y7, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y0, Y15 + XOR3WAY( $0x00, Y14, Y15, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y14 + VPSRLQ $0x04, Y8, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y0, Y15 + XOR3WAY( $0x00, Y14, Y15, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + VBROADCASTI128 (CX), Y13 + + // LEO_MULADD_256 + VPAND Y11, Y4, Y14 + VPSRLQ $0x04, Y11, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y1, Y15 + XOR3WAY( $0x00, Y14, Y15, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y14 + VPSRLQ $0x04, Y12, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y1, Y15 + XOR3WAY( $0x00, Y14, Y15, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y7, (R12) + VMOVDQU Y8, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y9, (R14) + VMOVDQU Y10, 32(R14) + ADDQ $0x40, R14 + ADDQ $0x40, R13 + VMOVDQU Y11, (BX) + VMOVDQU Y12, 32(BX) + ADDQ $0x40, BX + ADDQ $0x40, DX + SUBQ $0x40, SI + JA loop + VZEROUPPER + RET + // func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT48_avx2_1(SB), NOSPLIT, $0-56 @@ -102944,47 +103212,162 @@ loop: VZEROUPPER RET -// func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func ifftDIT48_dst_avx2_1(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT48_avx2_1(SB), NOSPLIT, $0-56 - MOVQ t01+32(FP), AX +TEXT ·ifftDIT48_dst_avx2_1(SB), NOSPLIT, $0-80 + MOVQ t23+64(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 - MOVQ t23+40(FP), AX + MOVQ t02+72(FP), AX VBROADCASTI128 (AX), Y2 VBROADCASTI128 16(AX), Y3 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), 
R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - MOVQ $0x0000000f, CX - MOVQ CX, X4 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + MOVQ $0x0000000f, DX + MOVQ DX, X4 VPBROADCASTB X4, Y4 loop: - VMOVDQU (SI), Y5 - VMOVDQU 32(SI), Y6 - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y10 - VMOVDQU (DI), Y7 - VMOVDQU 32(DI), Y8 - VMOVDQU (AX), Y11 + VMOVDQU (DI), Y5 + VMOVDQU (R9), Y6 + VMOVDQU 32(DI), Y7 + VMOVDQU 32(R9), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 + VMOVDQU (R11), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R11), Y11 VMOVDQU 32(AX), Y12 - VPXOR Y5, Y9, Y9 - VPXOR Y7, Y11, Y11 - VPXOR Y6, Y10, Y10 - VPXOR Y8, Y12, Y12 + VPXOR Y9, Y10, Y10 + VPXOR Y11, Y12, Y12 // LEO_MULADD_256 - VPAND Y7, Y4, Y13 - VPSRLQ $0x04, Y7, Y14 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y11) + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VMOVDQU Y5, (R8) + VMOVDQU Y7, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y6, (R10) + VMOVDQU Y8, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y9, (R12) + VMOVDQU Y11, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y10, (CX) + VMOVDQU Y12, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_1(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 VPSHUFB Y13, Y0, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y1, Y14 @@ -103034,6 +103417,105 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_avx2_1(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, 
t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_dst_avx2_1(SB), NOSPLIT, $0-80 + MOVQ t01+56(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+64(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ 8(CX), DX + MOVQ dst_base+0(FP), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (BX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (BX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (BX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (BX)(SI*1), CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y10 + VMOVDQU (R9), Y7 + VMOVDQU 32(R9), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y13 + VPSRLQ $0x04, Y8, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y7, (R10) + VMOVDQU Y8, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y9, (R12) + VMOVDQU Y10, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y11, (CX) + VMOVDQU Y12, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + // func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT48_avx2_2(SB), NOSPLIT, $0-56 @@ -103140,6 +103622,121 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_avx2_2(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_dst_avx2_2(SB), NOSPLIT, $0-80 + MOVQ t01+56(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t02+72(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + MOVQ $0x0000000f, DX + MOVQ DX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (DI), Y5 + VMOVDQU (R9), Y6 + VMOVDQU 32(DI), Y7 + VMOVDQU 32(R9), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 + + // LEO_MULADD_256 + VPAND Y6, Y4, Y9 + VPSRLQ $0x04, Y6, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y9 + VPSRLQ $0x04, Y8, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + VMOVDQU (R11), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R11), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y9, Y10, Y10 
+ VPXOR Y11, Y12, Y12 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VMOVDQU Y5, (R8) + VMOVDQU Y7, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y6, (R10) + VMOVDQU Y8, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y9, (R12) + VMOVDQU Y11, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y10, (CX) + VMOVDQU Y12, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT48_avx2_2(SB), NOSPLIT, $0-56 @@ -103165,12 +103762,123 @@ TEXT ·fftDIT48_avx2_2(SB), NOSPLIT, $0-56 VPBROADCASTB X4, Y4 loop: - VMOVDQU (SI), Y5 - VMOVDQU 32(SI), Y6 - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y10 - VMOVDQU (DI), Y7 - VMOVDQU 32(DI), Y8 + VMOVDQU (SI), Y5 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y6, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y7, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y10, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y11, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_dst_avx2_2(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_dst_avx2_2(SB), NOSPLIT, $0-80 + MOVQ t23+64(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t02+72(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + 
MOVQ 8(CX), DX + MOVQ dst_base+0(FP), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (BX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (BX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (BX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (BX)(SI*1), CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y10 + VMOVDQU (R9), Y7 + VMOVDQU 32(R9), Y8 VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y12 @@ -103229,17 +103937,21 @@ loop: XOR3WAY( $0x00, Y13, Y14, Y10) VPXOR Y9, Y11, Y11 VPXOR Y10, Y12, Y12 - VMOVDQU Y5, (SI) - VMOVDQU Y6, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y7, (DI) - VMOVDQU Y8, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y9, (R8) - VMOVDQU Y10, 32(R8) + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y11, (AX) - VMOVDQU Y12, 32(AX) + ADDQ $0x40, DI + VMOVDQU Y7, (R10) + VMOVDQU Y8, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y9, (R12) + VMOVDQU Y10, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y11, (CX) + VMOVDQU Y12, 32(CX) + ADDQ $0x40, CX ADDQ $0x40, AX SUBQ $0x40, DX JA loop @@ -103333,6 +104045,102 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_avx2_3(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_dst_avx2_3(SB), NOSPLIT, $0-80 + MOVQ t02+72(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + MOVQ $0x0000000f, DX + MOVQ DX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (DI), Y3 + VMOVDQU (R9), Y4 + VMOVDQU 32(DI), Y5 + VMOVDQU 32(R9), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + VMOVDQU (R11), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R11), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + + // LEO_MULADD_256 + VPAND Y7, Y2, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + + // LEO_MULADD_256 + VPAND Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + + // LEO_MULADD_256 + VPAND Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU Y3, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y4, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y7, (R12) + VMOVDQU Y9, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y8, (CX) + VMOVDQU Y10, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT48_avx2_3(SB), NOSPLIT, $0-56 @@ -103404,6 +104212,86 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_avx2_3(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// 
Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_dst_avx2_3(SB), NOSPLIT, $0-80 + MOVQ t23+64(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ 8(CX), DX + MOVQ dst_base+0(FP), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (BX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (BX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (BX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (BX)(SI*1), CX + MOVQ $0x0000000f, BX + MOVQ BX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VMOVDQU (R11), Y7 + VMOVDQU 32(R11), Y8 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + + // LEO_MULADD_256 + VPAND Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y7, (R12) + VMOVDQU Y8, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y9, (CX) + VMOVDQU Y10, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + // func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT48_avx2_4(SB), NOSPLIT, $0-56 @@ -103429,10 +104317,105 @@ TEXT ·ifftDIT48_avx2_4(SB), NOSPLIT, $0-56 VPBROADCASTB X4, Y4 loop: - VMOVDQU (SI), Y5 - VMOVDQU (DI), Y6 - VMOVDQU 32(SI), Y7 - VMOVDQU 32(DI), Y8 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU 32(SI), Y7 + VMOVDQU 32(DI), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 + + // LEO_MULADD_256 + VPAND Y6, Y4, Y9 + VPSRLQ $0x04, Y6, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y9 + VPSRLQ $0x04, Y8, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + VMOVDQU (R8), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R8), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y9, Y10, Y10 + VPXOR Y11, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y11) + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y7, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y11, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y10, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_dst_avx2_4(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_dst_avx2_4(SB), NOSPLIT, $0-80 + MOVQ t01+56(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+64(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 
16(AX), Y3 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + MOVQ $0x0000000f, DX + MOVQ DX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (DI), Y5 + VMOVDQU (R9), Y6 + VMOVDQU 32(DI), Y7 + VMOVDQU 32(R9), Y8 VPXOR Y6, Y5, Y6 VPXOR Y8, Y7, Y8 @@ -103451,9 +104434,9 @@ loop: VPAND Y10, Y4, Y10 VPSHUFB Y10, Y1, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) - VMOVDQU (R8), Y9 + VMOVDQU (R11), Y9 VMOVDQU (AX), Y10 - VMOVDQU 32(R8), Y11 + VMOVDQU 32(R11), Y11 VMOVDQU 32(AX), Y12 VPXOR Y9, Y10, Y10 VPXOR Y11, Y12, Y12 @@ -103477,19 +104460,23 @@ loop: VPXOR Y6, Y10, Y10 VPXOR Y7, Y11, Y11 VPXOR Y8, Y12, Y12 - VMOVDQU Y5, (SI) - VMOVDQU Y7, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y6, (DI) - VMOVDQU Y8, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y9, (R8) - VMOVDQU Y11, 32(R8) + VMOVDQU Y5, (R8) + VMOVDQU Y7, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y10, (AX) - VMOVDQU Y12, 32(AX) + ADDQ $0x40, DI + VMOVDQU Y6, (R10) + VMOVDQU Y8, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y9, (R12) + VMOVDQU Y11, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y10, (CX) + VMOVDQU Y12, 32(CX) + ADDQ $0x40, CX ADDQ $0x40, AX - SUBQ $0x40, DX + SUBQ $0x40, BX JA loop VZEROUPPER RET @@ -103600,6 +104587,121 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_avx2_4(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_dst_avx2_4(SB), NOSPLIT, $0-80 + MOVQ t01+56(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t02+72(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ 8(CX), DX + MOVQ dst_base+0(FP), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (BX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (BX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (BX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (BX)(SI*1), CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y10 + VMOVDQU (R9), Y7 + VMOVDQU 32(R9), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y13 + VPSRLQ $0x04, Y8, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + VPXOR Y9, Y11, Y11 + VPXOR 
Y10, Y12, Y12 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y7, (R10) + VMOVDQU Y8, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y9, (R12) + VMOVDQU Y10, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y11, (CX) + VMOVDQU Y12, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + // func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT48_avx2_5(SB), NOSPLIT, $0-56 @@ -103661,9 +104763,160 @@ loop: VMOVDQU Y6, 32(DI) ADDQ $0x40, DI VMOVDQU Y7, (R8) - VMOVDQU Y9, 32(R8) + VMOVDQU Y9, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y8, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_dst_avx2_5(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_dst_avx2_5(SB), NOSPLIT, $0-80 + MOVQ t23+64(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + MOVQ $0x0000000f, DX + MOVQ DX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (DI), Y3 + VMOVDQU (R9), Y4 + VMOVDQU 32(DI), Y5 + VMOVDQU 32(R9), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + VMOVDQU (R11), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R11), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + + // LEO_MULADD_256 + VPAND Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VMOVDQU Y3, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y4, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y7, (R12) + VMOVDQU Y9, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y8, (CX) + VMOVDQU Y10, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_5(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU 32(SI), Y4 + VMOVDQU (R8), Y7 + VMOVDQU 32(R8), Y8 + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + + // LEO_MULADD_256 + VPAND Y5, Y2, Y11 + VPSRLQ $0x04, Y5, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + + // LEO_MULADD_256 + VPAND Y6, Y2, Y11 + VPSRLQ $0x04, Y6, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, 
Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y4, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y5, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y8, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y8, (AX) + VMOVDQU Y9, (AX) VMOVDQU Y10, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX @@ -103671,34 +104924,39 @@ loop: VZEROUPPER RET -// func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func fftDIT48_dst_avx2_5(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT48_avx2_5(SB), NOSPLIT, $0-56 - MOVQ t01+32(FP), AX +TEXT ·fftDIT48_dst_avx2_5(SB), NOSPLIT, $0-80 + MOVQ t01+56(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - MOVQ $0x0000000f, CX - MOVQ CX, X2 + MOVQ dst_base+0(FP), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (BX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (BX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (BX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (BX)(SI*1), CX + MOVQ $0x0000000f, BX + MOVQ BX, X2 VPBROADCASTB X2, Y2 loop: - VMOVDQU (SI), Y3 - VMOVDQU 32(SI), Y4 - VMOVDQU (R8), Y7 - VMOVDQU 32(R8), Y8 - VMOVDQU (DI), Y5 - VMOVDQU 32(DI), Y6 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VMOVDQU (R11), Y7 + VMOVDQU 32(R11), Y8 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y10 VPXOR Y3, Y7, Y7 @@ -103725,17 +104983,21 @@ loop: VPXOR Y6, Y4, Y6 VPXOR Y7, Y9, Y9 VPXOR Y8, Y10, Y10 - VMOVDQU Y3, (SI) - VMOVDQU Y4, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y5, (DI) - VMOVDQU Y6, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y7, (R8) - VMOVDQU Y8, 32(R8) + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y9, (AX) - VMOVDQU Y10, 32(AX) + ADDQ $0x40, DI + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y7, (R12) + VMOVDQU Y8, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y9, (CX) + VMOVDQU Y10, 32(CX) + ADDQ $0x40, CX ADDQ $0x40, AX SUBQ $0x40, DX JA loop @@ -103813,6 +105075,86 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_avx2_6(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_dst_avx2_6(SB), NOSPLIT, $0-80 + MOVQ t01+56(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + MOVQ $0x0000000f, DX + MOVQ DX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (DI), Y3 + VMOVDQU (R9), Y4 + VMOVDQU 32(DI), Y5 + VMOVDQU 32(R9), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + + // LEO_MULADD_256 + VPAND Y4, Y2, Y7 + VPSRLQ $0x04, Y4, Y8 + VPSHUFB Y7, Y0, Y7 + VPAND Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + + // LEO_MULADD_256 + VPAND Y6, Y2, Y7 + VPSRLQ $0x04, Y6, Y8 + VPSHUFB Y7, Y0, Y7 + VPAND Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( 
$0x00, Y7, Y8, Y5) + VMOVDQU (R11), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R11), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VMOVDQU Y3, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y4, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y7, (R12) + VMOVDQU Y9, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y8, (CX) + VMOVDQU Y10, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT48_avx2_6(SB), NOSPLIT, $0-56 @@ -103900,6 +105242,102 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_avx2_6(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_dst_avx2_6(SB), NOSPLIT, $0-80 + MOVQ t02+72(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ 8(CX), DX + MOVQ dst_base+0(FP), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (BX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (BX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (BX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (BX)(SI*1), CX + MOVQ $0x0000000f, BX + MOVQ BX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VMOVDQU (R11), Y7 + VMOVDQU 32(R11), Y8 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 + + // LEO_MULADD_256 + VPAND Y7, Y2, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + + // LEO_MULADD_256 + VPAND Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + + // LEO_MULADD_256 + VPAND Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y3, Y7, Y7 + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y7, (R12) + VMOVDQU Y8, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y9, (CX) + VMOVDQU Y10, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + // func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, SSE2 TEXT ·ifftDIT48_avx2_7(SB), NOSPLIT, $0-56 @@ -103952,6 +105390,67 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_avx2_7(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·ifftDIT48_dst_avx2_7(SB), NOSPLIT, $0-80 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ 
(DX)(SI*1), CX + MOVQ $0x0000000f, DX + MOVQ DX, X0 + VPBROADCASTB X0, Y0 + +loop: + VMOVDQU (DI), Y0 + VMOVDQU (R9), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU 32(R9), Y3 + VPXOR Y1, Y0, Y1 + VPXOR Y3, Y2, Y3 + VMOVDQU (R11), Y4 + VMOVDQU (AX), Y5 + VMOVDQU 32(R11), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y4, Y5, Y5 + VPXOR Y6, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VMOVDQU Y0, (R8) + VMOVDQU Y2, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y1, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y4, (R12) + VMOVDQU Y6, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y5, (CX) + VMOVDQU Y7, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, SSE2 TEXT ·fftDIT48_avx2_7(SB), NOSPLIT, $0-56 @@ -103996,8 +105495,69 @@ loop: VMOVDQU Y4, (R8) VMOVDQU Y5, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y6, (AX) - VMOVDQU Y7, 32(AX) + VMOVDQU Y6, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_dst_avx2_7(dst [][]byte, work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·fftDIT48_dst_avx2_7(SB), NOSPLIT, $0-80 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ 8(CX), DX + MOVQ dst_base+0(FP), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (BX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (BX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (BX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (BX)(SI*1), CX + MOVQ $0x0000000f, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + +loop: + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU (R11), Y4 + VMOVDQU 32(R11), Y5 + VMOVDQU (R9), Y2 + VMOVDQU 32(R9), Y3 + VMOVDQU (AX), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y2, Y6, Y6 + VPXOR Y1, Y5, Y5 + VPXOR Y3, Y7, Y7 + VPXOR Y2, Y0, Y2 + VPXOR Y3, Y1, Y3 + VPXOR Y4, Y6, Y6 + VPXOR Y5, Y7, Y7 + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU Y4, (R12) + VMOVDQU Y5, 32(R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU Y6, (CX) + VMOVDQU Y7, 32(CX) + ADDQ $0x40, CX ADDQ $0x40, AX SUBQ $0x40, DX JA loop @@ -104059,6 +105619,70 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_gfni_0(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_dst_gfni_0(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t01+56(FP), Z0 + VBROADCASTF32X2 t23+64(FP), Z1 + VBROADCASTF32X2 t02+72(FP), Z2 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R9), Z4 + VMOVDQU64 (R11), Z5 + VMOVDQU64 (AX), Z6 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z5, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VPTERNLOGD $0x96, Z7, Z3, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + 
ADDQ $0x40, DI + VMOVDQU64 Z4, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z5, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z6, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56 @@ -104115,6 +105739,71 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_gfni_0(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_dst_gfni_0(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t01+56(FP), Z0 + VBROADCASTF32X2 t23+64(FP), Z1 + VBROADCASTF32X2 t02+72(FP), Z2 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R9), Z4 + VMOVDQU64 (R11), Z5 + VMOVDQU64 (AX), Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + VXORPD Z5, Z6, Z6 + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z4, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z5, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z6, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56 @@ -104165,6 +105854,65 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_gfni_1(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_dst_gfni_1(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t23+64(FP), Z0 + VBROADCASTF32X2 t02+72(FP), Z1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R9), Z3 + VMOVDQU64 (R11), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z3, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z5, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56 @@ -104212,6 +105960,62 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_gfni_1(dst [][]byte, work [][]byte, 
dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_dst_gfni_1(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t01+56(FP), Z0 + VBROADCASTF32X2 t23+64(FP), Z1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R9), Z3 + VMOVDQU64 (R11), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z3, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z5, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56 @@ -104263,6 +106067,66 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_gfni_2(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_dst_gfni_2(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t01+56(FP), Z0 + VBROADCASTF32X2 t02+72(FP), Z1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R9), Z3 + VMOVDQU64 (R11), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z3, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z5, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56 @@ -104314,6 +106178,66 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_gfni_2(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_dst_gfni_2(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t23+64(FP), Z0 + VBROADCASTF32X2 t02+72(FP), Z1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R9), Z3 + VMOVDQU64 (R11), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + 
VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z3, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z5, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56 @@ -104331,9 +106255,60 @@ TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56 MOVQ (CX)(BX*1), AX loop: - VMOVDQU64 (SI), Z1 - VMOVDQU64 (DI), Z2 - VMOVDQU64 (R8), Z3 + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_dst_gfni_3(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_dst_gfni_3(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t02+72(FP), Z0 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R9), Z2 + VMOVDQU64 (R11), Z3 VMOVDQU64 (AX), Z4 VXORPD Z2, Z1, Z2 VXORPD Z3, Z4, Z4 @@ -104347,15 +106322,19 @@ loop: // LEO_MULADD_512 VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 VXORPD Z2, Z5, Z2 - VMOVDQU64 Z1, (SI) - ADDQ $0x40, SI - VMOVDQU64 Z2, (DI) - ADDQ $0x40, DI - VMOVDQU64 Z3, (R8) + VMOVDQU64 Z1, (R8) ADDQ $0x40, R8 - VMOVDQU64 Z4, (AX) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z4, (CX) + ADDQ $0x40, CX ADDQ $0x40, AX - SUBQ $0x40, DX + SUBQ $0x40, BX JA loop VZEROUPPER RET @@ -104402,6 +106381,57 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_gfni_3(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_dst_gfni_3(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t23+64(FP), Z0 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R9), Z2 + VMOVDQU64 (R11), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z3, Z5, Z3 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + ADDQ 
$0x40, R11 + VMOVDQU64 Z4, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56 @@ -104448,6 +106478,61 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_gfni_4(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_dst_gfni_4(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t01+56(FP), Z0 + VBROADCASTF32X2 t23+64(FP), Z1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R9), Z3 + VMOVDQU64 (R11), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z3, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z5, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56 @@ -104499,6 +106584,66 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_gfni_4(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_dst_gfni_4(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t01+56(FP), Z0 + VBROADCASTF32X2 t02+72(FP), Z1 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R9), Z3 + VMOVDQU64 (R11), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z3, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z5, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56 @@ -104540,6 +106685,56 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_gfni_5(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_dst_gfni_5(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t23+64(FP), Z0 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ 
(DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R9), Z2 + VMOVDQU64 (R11), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VPTERNLOGD $0x96, Z5, Z1, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z4, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56 @@ -104582,6 +106777,57 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_gfni_5(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_dst_gfni_5(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t01+56(FP), Z0 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R9), Z2 + VMOVDQU64 (R11), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z4, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56 @@ -104624,6 +106870,57 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_gfni_6(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_dst_gfni_6(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t01+56(FP), Z0 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R9), Z2 + VMOVDQU64 (R11), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z4, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56 @@ -104670,6 +106967,61 @@ loop: VZEROUPPER RET +// func fftDIT48_dst_gfni_6(dst [][]byte, work [][]byte, dist 
int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_dst_gfni_6(SB), NOSPLIT, $0-80 + VBROADCASTF32X2 t02+72(FP), Z0 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R9), Z2 + VMOVDQU64 (R11), Z3 + VMOVDQU64 (AX), Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z4, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56 @@ -104707,6 +107059,52 @@ loop: VZEROUPPER RET +// func ifftDIT48_dst_gfni_7(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_dst_gfni_7(SB), NOSPLIT, $0-80 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z0 + VMOVDQU64 (R9), Z1 + VMOVDQU64 (R11), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VMOVDQU64 Z0, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z1, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z2, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z3, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + // func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56 @@ -104743,3 +107141,49 @@ loop: JA loop VZEROUPPER RET + +// func fftDIT48_dst_gfni_7(dst [][]byte, work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_dst_gfni_7(SB), NOSPLIT, $0-80 + MOVQ dist+48(FP), AX + MOVQ work_base+24(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ 8(CX), BX + XORQ SI, SI + MOVQ (CX)(SI*1), DI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (CX)(SI*1), R9 + MOVQ (DX)(SI*1), R10 + ADDQ AX, SI + MOVQ (CX)(SI*1), R11 + MOVQ (DX)(SI*1), R12 + ADDQ AX, SI + MOVQ (CX)(SI*1), AX + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU64 (DI), Z0 + VMOVDQU64 (R9), Z1 + VMOVDQU64 (R11), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VMOVDQU64 Z0, (R8) + ADDQ $0x40, R8 + ADDQ $0x40, DI + VMOVDQU64 Z1, (R10) + ADDQ $0x40, R10 + ADDQ $0x40, R9 + VMOVDQU64 Z2, (R12) + ADDQ $0x40, R12 + ADDQ $0x40, R11 + VMOVDQU64 Z3, (CX) + ADDQ $0x40, CX + ADDQ $0x40, AX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET diff --git a/galois_noasm.go b/galois_noasm.go index 9043601a..18ac46c0 100644 --- a/galois_noasm.go +++ 
b/galois_noasm.go @@ -50,6 +50,11 @@ func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *optio ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) } +// 4-way butterfly with separate destination +func ifftDIT48Dst(dst, work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + ifftDIT4DstRef8(dst, work, dist, log_m01, log_m23, log_m02, o) +} + // 4-way butterfly func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) diff --git a/galois_ppc64le.go b/galois_ppc64le.go index 8cd7b52b..096a22d7 100644 --- a/galois_ppc64le.go +++ b/galois_ppc64le.go @@ -81,6 +81,11 @@ func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *optio ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) } +// 4-way butterfly with separate destination +func ifftDIT48Dst(dst, work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + ifftDIT4DstRef8(dst, work, dist, log_m01, log_m23, log_m02, o) +} + // 4-way butterfly func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) diff --git a/leopard8.go b/leopard8.go index e72ac5fd..087bc944 100644 --- a/leopard8.go +++ b/leopard8.go @@ -169,7 +169,9 @@ func (r *leopardFF8) encode(shards [][]byte) error { // Split large shards. // More likely on lower shard count. off := 0 - sh := make([][]byte, len(shards)) + sh := make([][]byte, len(shards)+m) + tmp := sh[len(shards):] + sh = sh[:len(shards)] // work slice we can modify wMod := make([][]byte, len(work)) @@ -199,6 +201,7 @@ func (r *leopardFF8) encode(shards [][]byte) error { ifftDITEncoder8( sh[:r.dataShards], + tmp, mtrunc, work, nil, // No xor output @@ -222,6 +225,7 @@ func (r *leopardFF8) encode(shards [][]byte) error { ifftDITEncoder8( sh, // data source + tmp, m, work[m:], // temporary workspace work, // xor destination @@ -240,6 +244,7 @@ func (r *leopardFF8) encode(shards [][]byte) error { ifftDITEncoder8( sh, // data source + tmp, lastCount, work[m:], // temporary workspace work, // xor destination @@ -766,16 +771,23 @@ func fftDIT4Ref8(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *opt } } +var zeroBufferPool = sync.Pool{New: func() interface{} { return make([]byte, workSize8) }} + // Unrolled IFFT for encoder -func ifftDITEncoder8(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe8, o *options) { +func ifftDITEncoder8(data, tmp [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe8, o *options) { // I tried rolling the memcpy/memset into the first layer of the FFT and // found that it only yields a 4% performance improvement, which is not // worth the extra complexity. 
+ in := tmp[:m] for i := 0; i < mtrunc; i++ { - copy(work[i], data[i]) + in[i] = data[i] } - for i := mtrunc; i < m; i++ { - memclr(work[i]) + if mtrunc < m { + zero := zeroBufferPool.Get().([]byte)[:len(data[mtrunc])] + defer zeroBufferPool.Put(zero) + for i := mtrunc; i < m; i++ { + in[i] = zero + } } // Decimation in time: Unroll 2 layers at a time @@ -791,14 +803,19 @@ func ifftDITEncoder8(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, // For each set of dist elements: for i := r; i < iend; i++ { - ifftDIT48( + ifftDIT48Dst( work[i:], + in[i:], dist, log_m01, log_m23, log_m02, o, ) + in[i] = work[i] + in[i+dist] = work[i+dist] + in[i+dist+dist] = work[i+dist+dist] + in[i+dist+dist+dist] = work[i+dist+dist+dist] } } @@ -817,12 +834,19 @@ func ifftDITEncoder8(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, } logm := skewLUT[dist] - + for i := range work[:dist] { + if &in[i][0] != &work[i][0] { + copy(work[i], in[i]) + copy(work[i+dist], in[i+dist]) + in[i] = work[i] + in[i+dist] = work[i+dist] + } + } if logm == modulus8 { - slicesXor(work[dist:dist*2], work[:dist], o) + slicesXor(in[dist:dist*2], in[:dist], o) } else { for i := 0; i < dist; i++ { - ifftDIT28(work[i], work[i+dist], logm, o) + ifftDIT28(in[i], in[i+dist], logm, o) } } } @@ -830,7 +854,17 @@ func ifftDITEncoder8(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, // I tried unrolling this but it does not provide more than 5% performance // improvement for 16-bit finite fields, so it's not worth the complexity. if xorRes != nil { - slicesXor(xorRes[:m], work[:m], o) + slicesXor(xorRes[:m], in[:m], o) + } + if false { + // Safety xfer + // Shouldn't be needed. + for i := range in { + if &in[i][0] != &work[i][0] { + copy(work[i], in[i]) + in[i] = work[i] + } + } } } @@ -858,6 +892,39 @@ func ifftDIT4Ref8(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *op } } +func ifftDIT4DstRef8(dst, work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + // First layer: + copyIf := func(dst, src [][]byte, i int) { + if &dst[i][0] != &src[i][0] { + copy(dst[i], src[i]) + } + } + copyIf(dst, work, dist) + copyIf(dst, work, 0) + if log_m01 == modulus8 { + sliceXor(dst[0], dst[dist], o) + } else { + ifftDIT28(dst[0], dst[dist], log_m01, o) + } + + copyIf(dst, work, dist*3) + copyIf(dst, work, dist*2) + if log_m23 == modulus8 { + sliceXor(dst[dist*2], dst[dist*3], o) + } else { + ifftDIT28(dst[dist*2], dst[dist*3], log_m23, o) + } + + // Second layer: + if log_m02 == modulus8 { + sliceXor(dst[0], dst[dist*2], o) + sliceXor(dst[dist], dst[dist*3], o) + } else { + ifftDIT28(dst[0], dst[dist*2], log_m02, o) + ifftDIT28(dst[dist], dst[dist*3], log_m02, o) + } +} + // Reference version of muladd: x[] ^= y[] * log_m func refMulAdd8(x, y []byte, log_m ffe8) { lut := &mul8LUTs[log_m] From a97e582dc56d98b6d5a2165ad5dda56bdad03d42 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 22 Nov 2022 13:03:48 +0100 Subject: [PATCH 2/4] Pool work shards. --- leopard8.go | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/leopard8.go b/leopard8.go index 087bc944..a91f09b3 100644 --- a/leopard8.go +++ b/leopard8.go @@ -25,10 +25,10 @@ type leopardFF8 struct { totalShards int // Total number of shards. Calculated, and should not be modified. 
workPool sync.Pool + shardPool sync.Pool inversion map[[inversion8Bytes]byte]leopardGF8cache inversionMu sync.Mutex - - o options + o options } const inversion8Bytes = 256 / 8 @@ -61,6 +61,11 @@ func newFF8(dataShards, parityShards int, opt options) (*leopardFF8, error) { // r.totalShards is not covering the space, but an estimate. r.inversion = make(map[[inversion8Bytes]byte]leopardGF8cache, r.totalShards) } + m := ceilPow2(r.parityShards) + tmpSize := r.totalShards + m + m*2 + r.shardPool.New = func() interface{} { + return make([][]byte, tmpSize) + } return r, nil } @@ -169,12 +174,20 @@ func (r *leopardFF8) encode(shards [][]byte) error { // Split large shards. // More likely on lower shard count. off := 0 - sh := make([][]byte, len(shards)+m) - tmp := sh[len(shards):] - sh = sh[:len(shards)] + shardPool := r.shardPool.Get().([][]byte) + // Slice into 3 // work slice we can modify - wMod := make([][]byte, len(work)) + wMod := shardPool[:len(work)] + sh := shardPool[len(work) : len(work)+len(shards)] + tmp := shardPool[len(work)+len(shards) : len(work)+len(shards)+m] + + defer func() { + for i := range shardPool { + shardPool[i] = nil + } + r.shardPool.Put(shardPool) + }() copy(wMod, work) for off < shardSize { work := wMod From 87f23e8b64f116bfa349bfa79f20a5484923123c Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 22 Nov 2022 13:14:52 +0100 Subject: [PATCH 3/4] Use zero array instead of pool. --- leopard8.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/leopard8.go b/leopard8.go index a91f09b3..79e7aa57 100644 --- a/leopard8.go +++ b/leopard8.go @@ -784,7 +784,8 @@ func fftDIT4Ref8(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *opt } } -var zeroBufferPool = sync.Pool{New: func() interface{} { return make([]byte, workSize8) }} +// zeroBufferPool returns a pointer to a zeroed array. +var zeroBufferPool = &[workSize8]byte{} // Unrolled IFFT for encoder func ifftDITEncoder8(data, tmp [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe8, o *options) { @@ -796,8 +797,7 @@ func ifftDITEncoder8(data, tmp [][]byte, mtrunc int, work [][]byte, xorRes [][]b in[i] = data[i] } if mtrunc < m { - zero := zeroBufferPool.Get().([]byte)[:len(data[mtrunc])] - defer zeroBufferPool.Put(zero) + zero := (*zeroBufferPool)[:len(data[mtrunc])] for i := mtrunc; i < m; i++ { in[i] = zero } From 52b7f16254292a695fda0a8786ef2f60a244be71 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 22 Nov 2022 13:25:32 +0100 Subject: [PATCH 4/4] Use first for size. --- leopard8.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/leopard8.go b/leopard8.go index 79e7aa57..415420ed 100644 --- a/leopard8.go +++ b/leopard8.go @@ -797,7 +797,7 @@ func ifftDITEncoder8(data, tmp [][]byte, mtrunc int, work [][]byte, xorRes [][]b in[i] = data[i] } if mtrunc < m { - zero := (*zeroBufferPool)[:len(data[mtrunc])] + zero := (*zeroBufferPool)[:len(data[0])] for i := mtrunc; i < m; i++ { in[i] = zero }
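The shard-pool change in PATCH 2/4 boils down to reusing one pooled [][]byte header slice and cutting it into the three views the encoder needs (wMod, sh, tmp), then nilling it out before returning it to the pool so the pooled headers do not keep shard buffers alive. The sketch below is illustrative only and is not part of the patch: encoder, newEncoder and the plain header copies are stand-ins for the real leopardFF8.encode, which hands tmp to ifftDITEncoder8 as its scratch destination view.

```
package main

import (
	"fmt"
	"sync"
)

// encoder holds a pool of [][]byte headers so encode can slice one pooled
// allocation into the three views it needs instead of allocating per call.
// Hypothetical stand-in for leopardFF8, for illustration only.
type encoder struct {
	shardPool sync.Pool
}

func newEncoder(totalShards, m int) *encoder {
	// Mirrors the sizing in PATCH 2/4: room for the work view (2*m),
	// the shard view (totalShards) and the tmp view (m).
	tmpSize := totalShards + m + m*2
	e := &encoder{}
	e.shardPool.New = func() interface{} {
		return make([][]byte, tmpSize)
	}
	return e
}

func (e *encoder) encode(shards, work [][]byte, m int) {
	pool := e.shardPool.Get().([][]byte)
	defer func() {
		// Drop references before returning the headers to the pool so the
		// pooled slice does not pin the shard buffers it pointed at.
		for i := range pool {
			pool[i] = nil
		}
		e.shardPool.Put(pool)
	}()

	// Slice the single pooled allocation into three non-overlapping views.
	wMod := pool[:len(work)]
	sh := pool[len(work) : len(work)+len(shards)]
	tmp := pool[len(work)+len(shards) : len(work)+len(shards)+m]

	copy(wMod, work) // modifiable copy of the work headers
	copy(sh, shards) // modifiable copy of the shard headers
	// tmp would be handed to the IFFT encoder as scratch destinations.
	fmt.Println("view sizes:", len(wMod), len(sh), len(tmp))
}

func main() {
	dataShards, parityShards := 50, 20
	m := 32 // ceilPow2(parityShards)
	e := newEncoder(dataShards+parityShards, m)
	e.encode(make([][]byte, dataShards+parityShards), make([][]byte, 2*m), m)
}
```

Clearing the slice before Put matters because sync.Pool retains the backing array; stale pointers left in it would keep the previous call's shard data reachable.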