| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212 |
- //+build !noasm !appengine
- // Copyright 2015, Klaus Post, see LICENSE for details.
- // func crc32sse(a []byte) hash
- //
- // Runs one SSE4.2 CRC32 step over the four bytes at a[0..3], starting from
- // an accumulator of zero, and returns the 32-bit result.
- // len(a) is never read, so the caller must pass at least four readable bytes.
- TEXT ·crc32sse(SB), 7, $0
- XORL BX, BX // accumulator = 0 (a 32-bit write also clears the upper half of RBX)
- MOVQ a+0(FP), R10 // R10 = &a[0]
- // CRC32 dword (R10), EBX -- hand-encoded: F2 REX.B 0F 38 F1, ModRM 0x1A = [R10], EBX
- BYTE $0xF2; BYTE $0x41; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x1a
- MOVL BX, ret+24(FP)
- RET
- // func crc32sseAll(a []byte, dst []hash) -- for every i in [0, len(a)-4] stores into dst[i] the SSE4.2 CRC32 step (accumulator 0) of the four bytes a[i..i+3]; writes nothing when len(a) < 4. Each entry is stored as 4 bytes.
- TEXT ·crc32sseAll(SB), 7, $0 // len(dst) is never read: the caller must provide room for len(a)-3 entries
- MOVQ a+0(FP), R8 // R8: src, read cursor into a
- MOVQ a_len+8(FP), R10 // R10: len(a)
- MOVQ dst+24(FP), R9 // R9: dst, write cursor
- SUBQ $4, R10 // R10 = len(a)-4, the index of the last 4-byte window
- JS end // len(a) < 4: no complete window
- JZ one_crc // len(a) == 4: exactly one window
- MOVQ R10, R13
- SHRQ $2, R10 // R10 = (len(a)-4)/4: iterations of the four-at-a-time loop
- ANDQ $3, R13 // R13 = (len(a)-4)&3
- XORQ BX, BX // clear the CRC accumulator
- ADDQ $1, R13 // R13 = ((len(a)-4)&3)+1: windows left for rem_loop, always >= 1
- TESTQ R10, R10
- JZ rem_loop // no full group of four: go straight to the single-window loop
- crc_loop: // four windows per iteration; BX is zero on entry
- MOVQ (R8), R11 // load src[0..7]; the four windows below use src[0..6]
- XORQ BX, BX // accumulator for window 0
- XORQ DX, DX // accumulator for window 1
- XORQ DI, DI // accumulator for window 2
- MOVQ R11, R12
- SHRQ $8, R11 // R11 = word >> 8
- MOVQ R12, AX // EAX = src[0..3] (window 0)
- MOVQ R11, CX // ECX = src[1..4] (window 1)
- SHRQ $16, R12 // R12 = word >> 16
- SHRQ $16, R11 // R11 = word >> 24, low 32 bits = src[3..6] (window 3, used below)
- MOVQ R12, SI // ESI = src[2..5] (window 2)
- // CRC32 EAX, EBX (hand-encoded F2 0F 38 F1 /r; source first, destination second)
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xd8
- // CRC32 ECX, EDX
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xd1
- // CRC32 ESI, EDI
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xfe
- MOVL BX, (R9) // dst[0] = hash of window 0
- MOVL DX, 4(R9) // dst[1] = hash of window 1
- MOVL DI, 8(R9) // dst[2] = hash of window 2
- XORQ BX, BX // reuse BX as the accumulator for window 3
- MOVL R11, AX // EAX = src[3..6]
- // CRC32 EAX, EBX
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xd8
- MOVL BX, 12(R9) // dst[3] = hash of window 3
- ADDQ $16, R9 // four 4-byte entries written
- ADDQ $4, R8 // advance the read cursor by four windows
- XORQ BX, BX // leave the accumulator cleared for the next iteration or rem_loop
- SUBQ $1, R10
- JNZ crc_loop
- rem_loop: // one window per iteration, R13 times; BX is zero on entry
- MOVL (R8), AX // EAX = src[0..3]
- // CRC32 EAX, EBX
- BYTE $0xF2; BYTE $0x0f
- BYTE $0x38; BYTE $0xf1; BYTE $0xd8
- MOVL BX, (R9)
- ADDQ $4, R9 // next dst entry
- ADDQ $1, R8 // windows overlap: advance by a single byte
- XORQ BX, BX // fresh accumulator for the next window
- SUBQ $1, R13
- JNZ rem_loop
- end:
- RET
- one_crc: // len(a) == 4: run rem_loop exactly once
- MOVQ $1, R13
- XORQ BX, BX
- JMP rem_loop
- // func matchLenSSE4(a, b []byte, max int) int
- //
- // Returns the number of leading bytes, at most max, that are equal in a and b.
- // Neither len(a) nor len(b) is read: the caller must guarantee that both
- // slices hold at least max readable bytes.
- //
- // Whole 16-byte blocks are compared with PCMPESTRI; the remaining max&15
- // bytes are compared one at a time.
- //
- // Registers: R8 = cursor in a, R9 = cursor in b, R10 = 16-byte blocks left,
- // R12 = tail bytes left, R11 = match length so far.
- // Clobbers AX, CX, DX, R13, R14, X0, X1 and flags.
- TEXT ·matchLenSSE4(SB), 7, $0
- MOVQ a+0(FP), R8 // R8: &a[0]
- MOVQ b+24(FP), R9 // R9: &b[0]
- MOVQ max+48(FP), R10 // R10: max
- XORQ R11, R11 // R11: match length
- MOVQ R10, R12
- SHRQ $4, R10 // R10: max/16, whole blocks
- ANDQ $15, R12 // R12: max&15, tail bytes
- TESTQ R10, R10
- JZ matchlen_verysmall
- // PCMPESTRI reads the explicit lengths of its two operands from EAX and EDX.
- // Every block compared here is a full 16 bytes, so both must be 16; without
- // this the result depends on whatever the caller left in those registers.
- // The instruction only writes ECX, so one initialisation covers the loop.
- MOVQ $16, AX
- MOVQ $16, DX
- loopback_matchlen:
- MOVOU (R8), X0 // a[x:x+16]
- MOVOU (R9), X1 // b[x:x+16]
- // PCMPESTRI $0x18, X1, X0 (hand-encoded)
- // imm8 0x18: unsigned bytes, "equal each", negative polarity, least-significant
- // index => CF is set iff some byte differs, ECX = index of the first difference.
- BYTE $0x66; BYTE $0x0f; BYTE $0x3a
- BYTE $0x61; BYTE $0xc1; BYTE $0x18
- JC match_ended
- ADDQ $16, R8
- ADDQ $16, R9
- ADDQ $16, R11
- SUBQ $1, R10
- JNZ loopback_matchlen
- matchlen_verysmall:
- TESTQ R12, R12
- JZ done_matchlen
- loopback_matchlen_single:
- // Naive byte-by-byte compare; runs at most 15 times.
- MOVB (R8), R13
- MOVB (R9), R14
- CMPB R13, R14
- JNE done_matchlen
- ADDQ $1, R8
- ADDQ $1, R9
- ADDQ $1, R11
- SUBQ $1, R12
- JNZ loopback_matchlen_single
- MOVQ R11, ret+56(FP)
- RET
- match_ended:
- ADDQ CX, R11 // add the offset of the first differing byte inside the block
- done_matchlen:
- MOVQ R11, ret+56(FP)
- RET
- // func histogram(b []byte, h []int32)
- //
- // Adds one to h[v] for each byte v in b. Existing counts in h are kept and
- // incremented, not reset. len(h) is never read; since v ranges over every
- // byte value, the caller must supply at least 256 entries.
- //
- // Input is consumed eight bytes at a time, then the len(b)&7 trailing bytes
- // one by one.
- TEXT ·histogram(SB), 7, $0
- MOVQ h+24(FP), DI // DI: &h[0]
- MOVQ b+0(FP), SI // SI: read cursor in b
- MOVQ b_len+8(FP), R9 // R9: len(b)
- MOVQ R9, R8
- SHRQ $3, R8 // R8: number of whole 8-byte groups
- JZ hist1
- loop_hist8:
- MOVQ (SI), R10 // eight input bytes, first byte in bits 0-7
- MOVBQZX R10, R11 // R11 = byte 0
- ADDL $1, (DI)(R11*4)
- SHRQ $8, R10
- MOVBQZX R10, R11 // byte 1
- ADDL $1, (DI)(R11*4)
- SHRQ $8, R10
- MOVBQZX R10, R11 // byte 2
- ADDL $1, (DI)(R11*4)
- SHRQ $8, R10
- MOVBQZX R10, R11 // byte 3
- ADDL $1, (DI)(R11*4)
- SHRQ $8, R10
- MOVBQZX R10, R11 // byte 4
- ADDL $1, (DI)(R11*4)
- SHRQ $8, R10
- MOVBQZX R10, R11 // byte 5
- ADDL $1, (DI)(R11*4)
- SHRQ $8, R10
- MOVBQZX R10, R11 // byte 6
- ADDL $1, (DI)(R11*4)
- SHRQ $8, R10
- ADDL $1, (DI)(R10*4) // after seven shifts only byte 7 is left in R10
- ADDQ $8, SI
- SUBQ $1, R8
- JNZ loop_hist8
- hist1:
- ANDQ $7, R9 // R9: trailing bytes
- JZ end_hist
- loop_hist1:
- MOVBQZX (SI), R10 // R10 = next byte, zero-extended
- ADDL $1, (DI)(R10*4)
- ADDQ $1, SI
- SUBQ $1, R9
- JNZ loop_hist1
- end_hist:
- RET
|