crc32_amd64.s 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. //+build !noasm !appengine
  2. // Copyright 2015, Klaus Post, see LICENSE for details.
  // func crc32sse(a []byte) hash
  //
  // Runs the 32-bit word at the start of a through the SSE 4.2 CRC32
  // instruction with a zero seed and returns the 32-bit result.
  // The length of a is never inspected here, so the caller must make sure
  // at least 4 bytes are readable at a's base address.
  TEXT ·crc32sse(SB), 7, $0
      XORL BX, BX            // seed = 0 (a 32-bit write clears all of RBX)
      MOVQ a+0(FP), R10      // R10 = &a[0]

      // CRC32 dword (R10), EBX -- hand-encoded, the bytes pin R10 and EBX.
      BYTE $0xF2; BYTE $0x41; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x1a

      MOVL BX, ret+24(FP)    // result slot follows the 24-byte slice header
      RET
  // func crc32sseAll(a []byte, dst []hash)
  //
  // For every offset i in [0, len(a)-4] stores into dst[i] the result of the
  // SSE 4.2 CRC32 instruction applied, with a zero seed, to the 32-bit word
  // at a[i:i+4]. That is len(a)-3 stores in total; nothing is written when
  // len(a) < 4. The length of dst is never read, so the caller must size it.
  //
  // Register roles:
  //   R8  = read cursor in a        R9  = write cursor in dst
  //   R10 = remaining 4-hash rounds R13 = remaining single-hash rounds
  //   AX/CX/SI = CRC inputs, BX/DX/DI = CRC accumulators (fixed by the
  //   hand-encoded instructions below).
  TEXT ·crc32sseAll(SB), 7, $0
      MOVQ a+0(FP), R8
      MOVQ a_len+8(FP), R10
      MOVQ dst+24(FP), R9

      SUBQ $4, R10           // R10 = len - 4 = index of the last hashable offset
      JS   crc_all_done      // fewer than 4 bytes: nothing to hash

      // Split the len-3 hashes into groups of four plus a tail of 1..4.
      // len == 4 needs no special case: it yields zero groups and a tail of 1.
      MOVQ R10, R13
      ANDQ $3, R13
      INCQ R13               // R13 = ((len-4) & 3) + 1
      XORL BX, BX            // the tail loop expects a zero seed in EBX
      SHRQ $2, R10           // R10 = (len-4) / 4, ZF set when that is zero
      JZ   crc_all_tail

  crc_all_quad:
      // One 8-byte load supplies the four overlapping dwords at offsets 0..3.
      MOVQ (R8), AX          // EAX = bytes 0..3
      XORL BX, BX
      XORL DX, DX
      XORL DI, DI
      MOVQ AX, CX
      MOVQ AX, SI
      MOVQ AX, R11
      SHRQ $8, CX            // ECX = bytes 1..4
      SHRQ $16, SI           // ESI = bytes 2..5
      SHRQ $24, R11          // low half of R11 = bytes 3..6

      // CRC32 EAX, EBX
      BYTE $0xF2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0xd8
      // CRC32 ECX, EDX
      BYTE $0xF2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0xd1
      // CRC32 ESI, EDI
      BYTE $0xF2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0xfe

      MOVL BX, (R9)
      MOVL DX, 4(R9)
      MOVL DI, 8(R9)

      // Fourth hash reuses EAX/EBX.
      XORL BX, BX
      MOVL R11, AX
      // CRC32 EAX, EBX
      BYTE $0xF2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0xd8
      MOVL BX, 12(R9)

      ADDQ $4, R8            // four source offsets consumed
      ADDQ $16, R9           // four 32-bit results produced
      XORL BX, BX            // leave a zero seed for the next round / the tail
      SUBQ $1, R10
      JNZ  crc_all_quad

  crc_all_tail:
      // One hash per iteration; EBX is zero on entry to every iteration.
      MOVL (R8), AX
      // CRC32 EAX, EBX
      BYTE $0xF2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0xd8
      MOVL BX, (R9)
      INCQ R8
      ADDQ $4, R9
      XORL BX, BX
      DECQ R13
      JNZ  crc_all_tail

  crc_all_done:
      RET
  // func matchLenSSE4(a, b []byte, max int) int
  //
  // Returns how many leading bytes of a and b are equal, examining at most
  // max bytes. Full 16-byte blocks are compared with PCMPESTRI; the remaining
  // max&15 bytes are compared one at a time.
  // The slice lengths are never read, so the caller must guarantee that both
  // a and b have at least max readable bytes.
  //
  // Register roles:
  //   R8/R9 = cursors in a/b       R10 = 16-byte blocks left
  //   R12   = trailing bytes left  R11 = match length so far
  //   AX/DX = explicit string lengths consumed by PCMPESTRI
  //   CX    = index of first mismatch reported by PCMPESTRI
  TEXT ·matchLenSSE4(SB), 7, $0
      MOVQ a+0(FP), R8       // R8: &a
      MOVQ b+24(FP), R9      // R9: &b
      MOVQ max+48(FP), R10   // R10: max
      XORQ R11, R11          // R11: match length
      MOVQ R10, R12
      SHRQ $4, R10           // R10: max / 16
      ANDQ $15, R12          // R12: max & 15

      // PCMPESTRI reads the lengths of its two operands from EAX and EDX.
      // Both operands are always full 16-byte blocks; without these loads the
      // comparison would run on whatever values the caller left behind.
      MOVQ $16, AX
      MOVQ $16, DX

      TESTQ R10, R10
      JZ   matchlen_verysmall

  loopback_matchlen:
      MOVOU (R8), X0         // a[x]
      MOVOU (R9), X1         // b[x]

      // PCMPESTRI $0x18, X1, X0
      // 0x18 = unsigned bytes | equal-each | negative polarity:
      // CF is set when any byte differs, ECX = index of the first difference.
      BYTE $0x66; BYTE $0x0f; BYTE $0x3a
      BYTE $0x61; BYTE $0xc1; BYTE $0x18

      JC   match_ended
      ADDQ $16, R8
      ADDQ $16, R9
      ADDQ $16, R11
      SUBQ $1, R10
      JNZ  loopback_matchlen

  matchlen_verysmall:
      TESTQ R12, R12
      JZ   done_matchlen

  loopback_matchlen_single:
      // Naive byte-by-byte compare; at most 15 iterations.
      MOVB (R8), R13
      MOVB (R9), R14
      CMPB R13, R14
      JNE  done_matchlen
      ADDQ $1, R8
      ADDQ $1, R9
      ADDQ $1, R11
      SUBQ $1, R12
      JNZ  loopback_matchlen_single
      MOVQ R11, ret+56(FP)
      RET

  match_ended:
      ADDQ CX, R11           // add the offset of the mismatch inside the block

  done_matchlen:
      MOVQ R11, ret+56(FP)
      RET
  // func histogram(b []byte, h []int32)
  //
  // Adds one to h[v] for every byte v in b. Counters are 32-bit and are
  // incremented in place, so existing contents of h are kept. The length of
  // h is never read; since any byte value can be used as an index, the caller
  // must supply room for 256 counters.
  //
  // Register roles:
  //   SI = read cursor in b     DI = base of h
  //   R8 = 8-byte blocks left   R9 = trailing bytes left
  //   R10 = current data word   R11 = current byte, zero-extended
  TEXT ·histogram(SB), 7, $0
      MOVQ b+0(FP), SI
      MOVQ b_len+8(FP), R9
      MOVQ h+24(FP), DI

      MOVQ R9, R8
      ANDQ $7, R9            // R9 = len(b) & 7
      SHRQ $3, R8            // R8 = len(b) / 8, ZF set when that is zero
      JZ   hist_tail

  hist_block:
      // Load eight bytes at once, then peel them off from the low end.
      MOVQ (SI), R10
      ADDQ $8, SI

      MOVBQZX R10, R11
      SHRQ $8, R10
      INCL (DI)(R11*4)

      MOVBQZX R10, R11
      SHRQ $8, R10
      INCL (DI)(R11*4)

      MOVBQZX R10, R11
      SHRQ $8, R10
      INCL (DI)(R11*4)

      MOVBQZX R10, R11
      SHRQ $8, R10
      INCL (DI)(R11*4)

      MOVBQZX R10, R11
      SHRQ $8, R10
      INCL (DI)(R11*4)

      MOVBQZX R10, R11
      SHRQ $8, R10
      INCL (DI)(R11*4)

      MOVBQZX R10, R11
      SHRQ $8, R10
      INCL (DI)(R11*4)

      // After seven shifts only the eighth byte is left in R10.
      INCL (DI)(R10*4)

      DECQ R8
      JNZ  hist_block

  hist_tail:
      TESTQ R9, R9
      JZ   hist_done

  hist_byte:
      MOVBQZX (SI), R10
      INCQ SI
      INCL (DI)(R10*4)
      DECQ R9
      JNZ  hist_byte

  hist_done:
      RET