crc32_amd64.s 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. //+build gc
  2. // Copyright 2011 The Go Authors. All rights reserved.
  3. // Use of this source code is governed by a BSD-style
  4. // license that can be found in the LICENSE file.
  5. #define NOSPLIT 4
  6. #define RODATA 8
  7. // func castagnoliSSE42(crc uint32, p []byte) uint32
  8. TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
  9. MOVL crc+0(FP), AX // CRC value
  10. MOVQ p+8(FP), SI // data pointer
  11. MOVQ p_len+16(FP), CX // len(p)
  12. NOTL AX
  13. // If there's less than 8 bytes to process, we do it byte-by-byte.
  14. CMPQ CX, $8
  15. JL cleanup
  16. // Process individual bytes until the input is 8-byte aligned.
  17. startup:
  18. MOVQ SI, BX
  19. ANDQ $7, BX
  20. JZ aligned
  21. CRC32B (SI), AX
  22. DECQ CX
  23. INCQ SI
  24. JMP startup
  25. aligned:
  26. // The input is now 8-byte aligned and we can process 8-byte chunks.
  27. CMPQ CX, $8
  28. JL cleanup
  29. CRC32Q (SI), AX
  30. ADDQ $8, SI
  31. SUBQ $8, CX
  32. JMP aligned
  33. cleanup:
  34. // We may have some bytes left over that we process one at a time.
  35. CMPQ CX, $0
  36. JE done
  37. CRC32B (SI), AX
  38. INCQ SI
  39. DECQ CX
  40. JMP cleanup
  41. done:
  42. NOTL AX
  43. MOVL AX, ret+32(FP)
  44. RET
  45. // func haveSSE42() bool
  46. TEXT ·haveSSE42(SB), NOSPLIT, $0
  47. XORQ AX, AX
  48. INCL AX
  49. CPUID
  50. SHRQ $20, CX
  51. ANDQ $1, CX
  52. MOVB CX, ret+0(FP)
  53. RET
  54. // func haveCLMUL() bool
  55. TEXT ·haveCLMUL(SB), NOSPLIT, $0
  56. XORQ AX, AX
  57. INCL AX
  58. CPUID
  59. SHRQ $1, CX
  60. ANDQ $1, CX
  61. MOVB CX, ret+0(FP)
  62. RET
  63. // func haveSSE41() bool
  64. TEXT ·haveSSE41(SB), NOSPLIT, $0
  65. XORQ AX, AX
  66. INCL AX
  67. CPUID
  68. SHRQ $19, CX
  69. ANDQ $1, CX
  70. MOVB CX, ret+0(FP)
  71. RET
  72. // CRC32 polynomial data
  73. //
  74. // These constants are lifted from the
  75. // Linux kernel, since they avoid the costly
  76. // PSHUFB 16 byte reversal proposed in the
  77. // original Intel paper.
  78. DATA r2r1kp<>+0(SB)/8, $0x154442bd4
  79. DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
  80. DATA r4r3kp<>+0(SB)/8, $0x1751997d0
  81. DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
  82. DATA rupolykp<>+0(SB)/8, $0x1db710641
  83. DATA rupolykp<>+8(SB)/8, $0x1f7011641
  84. DATA r5kp<>+0(SB)/8, $0x163cd6124
  85. GLOBL r2r1kp<>(SB), RODATA, $16
  86. GLOBL r4r3kp<>(SB), RODATA, $16
  87. GLOBL rupolykp<>(SB), RODATA, $16
  88. GLOBL r5kp<>(SB), RODATA, $8
  89. // Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  90. // len(p) must be at least 64, and must be a multiple of 16.
  91. // func ieeeCLMUL(crc uint32, p []byte) uint32
  92. TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
  93. MOVL crc+0(FP), X0 // Initial CRC value
  94. MOVQ p+8(FP), SI // data pointer
  95. MOVQ p_len+16(FP), CX // len(p)
  96. MOVOU (SI), X1
  97. MOVOU 16(SI), X2
  98. MOVOU 32(SI), X3
  99. MOVOU 48(SI), X4
  100. PXOR X0, X1
  101. ADDQ $64, SI // buf+=64
  102. SUBQ $64, CX // len-=64
  103. CMPQ CX, $64 // Less than 64 bytes left
  104. JB remain64
  105. MOVOU r2r1kp<>+0(SB), X0
  106. loopback64:
  107. MOVOA X1, X5
  108. MOVOA X2, X6
  109. MOVOA X3, X7
  110. MOVOA X4, X8
  111. PCLMULQDQ $0, X0, X1
  112. PCLMULQDQ $0, X0, X2
  113. PCLMULQDQ $0, X0, X3
  114. PCLMULQDQ $0, X0, X4
  115. // Load next early
  116. MOVOU (SI), X11
  117. MOVOU 16(SI), X12
  118. MOVOU 32(SI), X13
  119. MOVOU 48(SI), X14
  120. PCLMULQDQ $0x11, X0, X5
  121. PCLMULQDQ $0x11, X0, X6
  122. PCLMULQDQ $0x11, X0, X7
  123. PCLMULQDQ $0x11, X0, X8
  124. PXOR X5, X1
  125. PXOR X6, X2
  126. PXOR X7, X3
  127. PXOR X8, X4
  128. PXOR X11, X1
  129. PXOR X12, X2
  130. PXOR X13, X3
  131. PXOR X14, X4
  132. ADDQ $0x40, DI
  133. ADDQ $64, SI // buf+=64
  134. SUBQ $64, CX // len-=64
  135. CMPQ CX, $64 // Less than 64 bytes left?
  136. JGE loopback64
  137. // Fold result into a single register (X1)
  138. remain64:
  139. MOVOU r4r3kp<>+0(SB), X0
  140. MOVOA X1, X5
  141. PCLMULQDQ $0, X0, X1
  142. PCLMULQDQ $0x11, X0, X5
  143. PXOR X5, X1
  144. PXOR X2, X1
  145. MOVOA X1, X5
  146. PCLMULQDQ $0, X0, X1
  147. PCLMULQDQ $0x11, X0, X5
  148. PXOR X5, X1
  149. PXOR X3, X1
  150. MOVOA X1, X5
  151. PCLMULQDQ $0, X0, X1
  152. PCLMULQDQ $0x11, X0, X5
  153. PXOR X5, X1
  154. PXOR X4, X1
  155. // More than 16 bytes left?
  156. CMPQ CX, $16
  157. JB finish
  158. // Encode 16 bytes
  159. remain16:
  160. MOVOU (SI), X10
  161. MOVOA X1, X5
  162. PCLMULQDQ $0, X0, X1
  163. PCLMULQDQ $0x11, X0, X5
  164. PXOR X5, X1
  165. PXOR X10, X1
  166. SUBQ $16, CX
  167. ADDQ $16, SI
  168. CMPQ CX, $16
  169. JGE remain16
  170. finish:
  171. // Fold final result into 32 bits and return it
  172. PCMPEQB X3, X3
  173. PCLMULQDQ $1, X1, X0
  174. PSRLDQ $8, X1
  175. PXOR X0, X1
  176. MOVOA X1, X2
  177. MOVQ r5kp<>+0(SB), X0
  178. // Creates 32 bit mask. Note that we don't care about upper half.
  179. PSRLQ $32, X3
  180. PSRLDQ $4, X2
  181. PAND X3, X1
  182. PCLMULQDQ $0, X0, X1
  183. PXOR X2, X1
  184. MOVOU rupolykp<>+0(SB), X0
  185. MOVOA X1, X2
  186. PAND X3, X1
  187. PCLMULQDQ $0x10, X0, X1
  188. PAND X3, X1
  189. PCLMULQDQ $0, X0, X1
  190. PXOR X2, X1
  191. // PEXTRD $1, X1, AX (SSE 4.1)
  192. BYTE $0x66; BYTE $0x0f; BYTE $0x3a
  193. BYTE $0x16; BYTE $0xc8; BYTE $0x01
  194. MOVL AX, ret+32(FP)
  195. RET