|
@@ -0,0 +1,249 @@
|
|
|
|
|
+// Copyright 2016 The Go Authors. All rights reserved.
|
|
|
|
|
+// Use of this source code is governed by a BSD-style
|
|
|
|
|
+// license that can be found in the LICENSE file.
|
|
|
|
|
+
|
|
|
|
|
+// +build s390x
|
|
|
|
|
+
|
|
|
|
|
+#include "textflag.h"
|
|
|
|
|
+
|
|
|
|
|
+// Vector register range (V9..V14) holding the CRC-32 constants; vectorizedBody fills it with one VLM from the constant pool at R5
|
|
|
|
|
+
|
|
|
|
|
+#define CONST_PERM_LE2BE V9
|
|
|
|
|
+#define CONST_R2R1 V10
|
|
|
|
|
+#define CONST_R4R3 V11
|
|
|
|
|
+#define CONST_R5 V12
|
|
|
|
|
+#define CONST_RU_POLY V13
|
|
|
|
|
+#define CONST_CRC_POLY V14
|
|
|
|
|
+
|
|
|
|
|
+// The CRC-32 constant block contains reduction constants to fold and
|
|
|
|
|
+// process particular chunks of the input data stream in parallel.
|
|
|
|
|
+//
|
|
|
|
|
+// Note that the constant definitions below are extended in order to compute
|
|
|
|
|
+// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
|
|
|
|
|
+// The rightmost doubleword can be 0 to prevent contribution to the result or
|
|
|
|
|
+// can be multiplied by 1 to perform an XOR without the need for a separate
|
|
|
|
|
+// VECTOR EXCLUSIVE OR instruction.
|
|
|
|
|
+//
|
|
|
|
|
+// The polynomials used are bit-reflected:
|
|
|
|
|
+//
|
|
|
|
|
+// IEEE: P'(x) = 0x0edb88320
|
|
|
|
|
+// Castagnoli: P'(x) = 0x082f63b78
|
|
|
|
|
+
|
|
|
|
|
+// IEEE polynomial constants (six 16-byte entries, laid out in the order of the CONST_* registers V9..V14)
|
|
|
|
|
+DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask (first half of CONST_PERM_LE2BE)
|
|
|
|
|
+DATA ·crcleconskp+8(SB)/8, $0x0706050403020100 // LE-to-BE mask (second half of CONST_PERM_LE2BE)
|
|
|
|
|
+DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
|
|
|
|
|
+DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
|
|
|
|
|
+DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
|
|
|
|
|
+DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
|
|
|
|
|
+DATA ·crcleconskp+48(SB)/8, $0x0000000000000000 // zero doubleword preceding R5 in CONST_R5
|
|
|
|
|
+DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
|
|
|
|
|
+DATA ·crcleconskp+64(SB)/8, $0x0000000000000000 // zero doubleword preceding u' in CONST_RU_POLY
|
|
|
|
|
+DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
|
|
|
|
|
+DATA ·crcleconskp+80(SB)/8, $0x0000000000000000 // zero doubleword preceding P'(x) << 1 in CONST_CRC_POLY
|
|
|
|
|
+DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
|
|
|
|
|
+
|
|
|
|
|
+GLOBL ·crcleconskp(SB), RODATA, $144 // NOTE(review): the DATA lines above initialize 96 bytes; confirm the $144 size is intentional
|
|
|
|
|
+
|
|
|
|
|
+// Castagnoli polynomial constants (six 16-byte entries, laid out in the order of the CONST_* registers V9..V14)
|
|
|
|
|
+DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask (first half of CONST_PERM_LE2BE)
|
|
|
|
|
+DATA ·crccleconskp+8(SB)/8, $0x0706050403020100 // LE-to-BE mask (second half of CONST_PERM_LE2BE)
|
|
|
|
|
+DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
|
|
|
|
|
+DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
|
|
|
|
|
+DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
|
|
|
|
|
+DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
|
|
|
|
|
+DATA ·crccleconskp+48(SB)/8, $0x0000000000000000 // zero doubleword preceding R5 in CONST_R5
|
|
|
|
|
+DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
|
|
|
|
|
+DATA ·crccleconskp+64(SB)/8, $0x0000000000000000 // zero doubleword preceding u' in CONST_RU_POLY
|
|
|
|
|
+DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
|
|
|
|
|
+DATA ·crccleconskp+80(SB)/8, $0x0000000000000000 // zero doubleword preceding P'(x) << 1 in CONST_CRC_POLY
|
|
|
|
|
+DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
|
|
|
|
|
+
|
|
|
|
|
+GLOBL ·crccleconskp(SB), RODATA, $144 // NOTE(review): the DATA lines above initialize 96 bytes; confirm the $144 size is intentional
|
|
|
|
|
+
|
|
|
|
|
+// func hasVectorFacility() bool
|
|
|
|
|
+TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 // true only if the facility bit is set and the VLEIB/VLGVB probe reads back 0xF
|
|
|
|
|
+ MOVD $x-24(SP), R1 // R1 = address of the 24-byte scratch area in the frame
|
|
|
|
|
+ XC $24, 0(R1), 0(R1) // clear the storage
|
|
|
|
|
+ MOVD $2, R0 // R0 = number of doublewords to store, minus 1 (so 3 doublewords = 24 bytes)
|
|
|
|
|
+ WORD $0xB2B01000 // STFLE 0(R1): store the facility list into the scratch area
|
|
|
|
|
+ XOR R0, R0 // reset the value of R0 to zero
|
|
|
|
|
+ MOVBZ z-8(SP), R1 // byte 16 of the stored list, i.e. facility bits 128-135
|
|
|
|
|
+ AND $0x40, R1 // isolate facility bit 129 (the vector facility)
|
|
|
|
|
+ BEQ novector // bit clear: vector facility not installed
|
|
|
|
|
+
|
|
|
|
|
+vectorinstalled:
|
|
|
|
|
+ // check if the vector instruction has been enabled by executing a small write/read probe
|
|
|
|
|
+ VLEIB $0, $0xF, V16 // write 0xF into byte element 0 of V16
|
|
|
|
|
+ VLGVB $0, V16, R1 // read byte element 0 of V16 back into R1
|
|
|
|
|
+ CMPBNE R1, $0xF, novector // mismatch: treat vector instructions as unavailable
|
|
|
|
|
+ MOVB $1, ret+0(FP) // have vx
|
|
|
|
|
+ RET
|
|
|
|
|
+
|
|
|
|
|
+novector:
|
|
|
|
|
+ MOVB $0, ret+0(FP) // no vx
|
|
|
|
|
+ RET
|
|
|
|
|
+
|
|
|
|
|
+// The CRC-32 function(s) use these calling conventions:
|
|
|
|
|
+//
|
|
|
|
|
+// Parameters:
|
|
|
|
|
+//
|
|
|
|
|
+// R2: Initial CRC value, typically ~0; and final CRC (return) value.
|
|
|
|
|
+// R3: Input buffer pointer, performance might be improved if the
|
|
|
|
|
+// buffer is on a doubleword boundary.
|
|
|
|
|
+// R4: Length of the buffer, must be 64 bytes or greater.
|
|
|
|
|
+//
|
|
|
|
|
+// Register usage:
|
|
|
|
|
+//
|
|
|
|
|
+// R5: CRC-32 constant pool base pointer.
|
|
|
|
|
+// V0: Initial CRC value and intermediate constants and results.
|
|
|
|
|
+// V1..V4: Data for CRC computation.
|
|
|
|
|
+// V5..V8: Next data chunks that are fetched from the input buffer.
|
|
|
|
|
+//
|
|
|
|
|
+// V9..V14: CRC-32 constants.
|
|
|
|
|
+
|
|
|
|
|
+// func vectorizedIEEE(crc uint32, p []byte) uint32
|
|
|
|
|
+TEXT ·vectorizedIEEE(SB), NOSPLIT, $0 // loads the arguments, selects the IEEE constant pool, then jumps to vectorizedBody
|
|
|
|
|
+ MOVWZ crc+0(FP), R2 // R2 stores the CRC value (zero-extended 32-bit load)
|
|
|
|
|
+ MOVD p+8(FP), R3 // data pointer
|
|
|
|
|
+ MOVD p_len+16(FP), R4 // len(p); vectorizedBody faults deliberately if this is below 64
|
|
|
|
|
+
|
|
|
|
|
+ MOVD $·crcleconskp(SB), R5 // R5 = base address of the IEEE constant pool
|
|
|
|
|
+ BR vectorizedBody<>(SB) // branch without linking; vectorizedBody stores ret and executes RET
|
|
|
|
|
+
|
|
|
|
|
+// func vectorizedCastagnoli(crc uint32, p []byte) uint32
|
|
|
|
|
+TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0 // loads the arguments, selects the Castagnoli constant pool, then jumps to vectorizedBody
|
|
|
|
|
+ MOVWZ crc+0(FP), R2 // R2 stores the CRC value (zero-extended 32-bit load)
|
|
|
|
|
+ MOVD p+8(FP), R3 // data pointer
|
|
|
|
|
+ MOVD p_len+16(FP), R4 // len(p); vectorizedBody faults deliberately if this is below 64
|
|
|
|
|
+
|
|
|
|
|
+ // R5: CRC-32 constant pool base pointer; the pool holds the Castagnoli reduction constants
|
|
|
|
|
+ MOVD $·crccleconskp(SB), R5
|
|
|
|
|
+ BR vectorizedBody<>(SB) // branch without linking; vectorizedBody stores ret and executes RET
|
|
|
|
|
+
|
|
|
|
|
+TEXT vectorizedBody<>(SB), NOSPLIT, $0 // shared CRC-32 core; expects R2 = crc, R3 = buffer, R4 = length (>= 64), R5 = constant pool
|
|
|
|
|
+ XOR $0xffffffff, R2 // NOTW R2: complement the low 32 bits of the incoming CRC
|
|
|
|
|
+ VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // load V9..V14 from the constant pool at R5
|
|
|
|
|
+
|
|
|
|
|
+ // Load the initial CRC value into the rightmost word of V0
|
|
|
|
|
+ VZERO V0
|
|
|
|
|
+ VLVGF $3, R2, V0
|
|
|
|
|
+
|
|
|
|
|
+ // Crash if the input size is less than 64 bytes.
|
|
|
|
|
+ CMP R4, $64
|
|
|
|
|
+ BLT crash
|
|
|
|
|
+
|
|
|
|
|
+ // Load a 64-byte data chunk and XOR with CRC
|
|
|
|
|
+ VLM 0(R3), V1, V4 // 64-bytes into V1..V4
|
|
|
|
|
+
|
|
|
|
|
+ // Reflect the data if the CRC operation is in the bit-reflected domain
|
|
|
|
|
+ VPERM V1, V1, CONST_PERM_LE2BE, V1
|
|
|
|
|
+ VPERM V2, V2, CONST_PERM_LE2BE, V2
|
|
|
|
|
+ VPERM V3, V3, CONST_PERM_LE2BE, V3
|
|
|
|
|
+ VPERM V4, V4, CONST_PERM_LE2BE, V4
|
|
|
|
|
+
|
|
|
|
|
+ VX V0, V1, V1 // V1 ^= CRC
|
|
|
|
|
+ ADD $64, R3 // BUF = BUF + 64
|
|
|
|
|
+ ADD $(-64), R4 // LEN = LEN - 64
|
|
|
|
|
+
|
|
|
|
|
+ // Check remaining buffer size and jump to proper folding method
|
|
|
|
|
+ CMP R4, $64
|
|
|
|
|
+ BLT less_than_64bytes
|
|
|
|
|
+
|
|
|
|
|
+fold_64bytes_loop:
|
|
|
|
|
+ // Load the next 64-byte data chunk into V5 to V8
|
|
|
|
|
+ VLM 0(R3), V5, V8
|
|
|
|
|
+ VPERM V5, V5, CONST_PERM_LE2BE, V5
|
|
|
|
|
+ VPERM V6, V6, CONST_PERM_LE2BE, V6
|
|
|
|
|
+ VPERM V7, V7, CONST_PERM_LE2BE, V7
|
|
|
|
|
+ VPERM V8, V8, CONST_PERM_LE2BE, V8
|
|
|
|
|
+
|
|
|
|
|
+ // Perform a GF(2) multiplication of the doublewords in V1 with
|
|
|
|
|
+ // the reduction constants in CONST_R2R1. The intermediate result is
|
|
|
|
|
+ // then folded (accumulated) with the next data chunk in V5 and
|
|
|
|
|
+ // stored in V1. Repeat this step for the register contents
|
|
|
|
|
+ // in V2, V3, and V4 respectively.
|
|
|
|
|
+
|
|
|
|
|
+ VGFMAG CONST_R2R1, V1, V5, V1
|
|
|
|
|
+ VGFMAG CONST_R2R1, V2, V6, V2
|
|
|
|
|
+ VGFMAG CONST_R2R1, V3, V7, V3
|
|
|
|
|
+ VGFMAG CONST_R2R1, V4, V8, V4
|
|
|
|
|
+
|
|
|
|
|
+ // Adjust buffer pointer and length for next loop
|
|
|
|
|
+ ADD $64, R3 // BUF = BUF + 64
|
|
|
|
|
+ ADD $(-64), R4 // LEN = LEN - 64
|
|
|
|
|
+
|
|
|
|
|
+ CMP R4, $64
|
|
|
|
|
+ BGE fold_64bytes_loop
|
|
|
|
|
+
|
|
|
|
|
+less_than_64bytes:
|
|
|
|
|
+ // Fold V1 to V4 into a single 128-bit value in V1
|
|
|
|
|
+ VGFMAG CONST_R4R3, V1, V2, V1
|
|
|
|
|
+ VGFMAG CONST_R4R3, V1, V3, V1
|
|
|
|
|
+ VGFMAG CONST_R4R3, V1, V4, V1
|
|
|
|
|
+
|
|
|
|
|
+ // Check whether at least one more 16-byte chunk remains to be folded
|
|
|
|
|
+ CMP R4, $16
|
|
|
|
|
+ BLT final_fold
|
|
|
|
|
+
|
|
|
|
|
+fold_16bytes_loop:
|
|
|
|
|
+ VL 0(R3), V2 // Load next data chunk
|
|
|
|
|
+ VPERM V2, V2, CONST_PERM_LE2BE, V2
|
|
|
|
|
+
|
|
|
|
|
+ VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
|
|
|
|
|
+
|
|
|
|
|
+ // Adjust buffer pointer and size for folding next data chunk
|
|
|
|
|
+ ADD $16, R3
|
|
|
|
|
+ ADD $-16, R4
|
|
|
|
|
+
|
|
|
|
|
+ // Process remaining data chunks
|
|
|
|
|
+ CMP R4, $16
|
|
|
|
|
+ BGE fold_16bytes_loop
|
|
|
|
|
+
|
|
|
|
|
+final_fold:
|
|
|
|
|
+ VLEIB $7, $0x40, V9 // V9 (CONST_PERM_LE2BE, no longer needed) becomes the byte-shift count: 8 bytes
|
|
|
|
|
+ VSRLB V9, CONST_R4R3, V0 // V0 = CONST_R4R3 shifted right by 8 bytes, moving R4 into the rightmost doubleword
|
|
|
|
|
+ VLEIG $0, $1, V0 // set the leftmost doubleword of V0 to 1
|
|
|
|
|
+
|
|
|
|
|
+ VGFMG V0, V1, V1 // V1 = (leftmost doubleword of V1 * 1) XOR (rightmost doubleword of V1 * R4)
|
|
|
|
|
+
|
|
|
|
|
+ VLEIB $7, $0x20, V9 // Shift by one word (4 bytes)
|
|
|
|
|
+ VSRLB V9, V1, V2 // Store remaining bits in V2
|
|
|
|
|
+ VUPLLF V1, V1 // Split rightmost doubleword
|
|
|
|
|
+ VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
|
|
|
|
|
+
|
|
|
|
|
+ // The input values to the Barrett reduction are the degree-63 polynomial
|
|
|
|
|
+ // in V1 (R(x)), degree-32 generator polynomial, and the reduction
|
|
|
|
|
+ // constant u. The Barrett reduction result is the CRC value of R(x) mod
|
|
|
|
|
+ // P(x).
|
|
|
|
|
+ //
|
|
|
|
|
+ // The Barrett reduction algorithm is defined as:
|
|
|
|
|
+ //
|
|
|
|
|
+ // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
|
|
|
|
|
+ // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
|
|
|
|
|
+ // 3. C(x) = R(x) XOR T2(x) mod x^32
|
|
|
|
|
+ //
|
|
|
|
|
+ // Note: To compensate the division by x^32, use the vector unpack
|
|
|
|
|
+ // instruction to move the leftmost word into the leftmost doubleword
|
|
|
|
|
+ // of the vector register. The rightmost doubleword is multiplied
|
|
|
|
|
+ // with zero to not contribute to the intermediate results.
|
|
|
|
|
+
|
|
|
|
|
+ // T1(x) = floor( R(x) / x^32 ) GF2MUL u
|
|
|
|
|
+ VUPLLF V1, V2
|
|
|
|
|
+ VGFMG CONST_RU_POLY, V2, V2
|
|
|
|
|
+
|
|
|
|
|
+ // Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with T1(x) in
|
|
|
|
|
+ // V2 and XOR the intermediate result, T2(x), with the value in V1.
|
|
|
|
|
+ // The final result is in word element 2 of V2.
|
|
|
|
|
+
|
|
|
|
|
+ VUPLLF V2, V2
|
|
|
|
|
+ VGFMAG CONST_CRC_POLY, V2, V1, V2
|
|
|
|
|
+
|
|
|
|
|
+done:
|
|
|
|
|
+ VLGVF $2, V2, R2 // R2 = word element 2 of V2
|
|
|
|
|
+ XOR $0xffffffff, R2 // NOTW R2
|
|
|
|
|
+ MOVWZ R2, ret + 32(FP) // store the result in the caller's return slot
|
|
|
|
|
+ RET
|
|
|
|
|
+
|
|
|
|
|
+crash:
|
|
|
|
|
+ MOVD $0, (R0) // input size is less than 64-bytes
|