214 lines
3.5 KiB
ArmAsm
214 lines
3.5 KiB
ArmAsm
|
//+build !noasm
|
||
|
//+build !appengine
|
||
|
|
||
|
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||
|
|
||
|
// func crc32sse(a []byte) uint32
//
// Runs a single hardware CRC32 step over the 32-bit word at the start of a,
// beginning from a zero accumulator, and returns the resulting 32-bit value.
//
// Only the base pointer of a is read; len(a) is never consulted, so the
// caller has to guarantee that at least four bytes are readable.
TEXT ·crc32sse(SB), 4, $0
	XORQ BX, BX        // BX: accumulator, starts at zero
	MOVQ a+0(FP), R10  // R10: &a[0]

	// Hand-encoded instruction:
	// CRC32 dword (R10), EBX
	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0x1a

	MOVL BX, ret+24(FP)
	RET
|
||
|
|
||
|
// func crc32sseAll(a []byte, dst []uint32)
//
// For every offset i from 0 through len(a)-4, stores in dst[i] the result of
// one hardware CRC32 step applied to the 32-bit word at a[i], each time
// starting from a zero accumulator. When len(a) < 4 nothing is written.
//
// len(dst) is never read. NOTE(review): the caller presumably guarantees
// room for len(a)-3 entries - confirm against the Go call site.
//
// Register roles:
//   R8   read cursor into a
//   R9   write cursor into dst
//   R10  groups of four offsets still to process
//   R13  single offsets still to process after the groups
//   BX, DX, DI  CRC accumulators (always zeroed before use)
TEXT ·crc32sseAll(SB), 4, $0
	MOVQ a_len+8(FP), R10
	MOVQ a+0(FP), R8
	MOVQ dst+24(FP), R9

	SUBQ $4, R10             // R10 = len(a) - 4 = last valid offset
	JS   crcAllDone          // fewer than four bytes: no output
	JZ   crcAllSingle        // exactly four bytes: one output

	MOVQ  R10, R13
	ANDQ  $3, R13
	ADDQ  $1, R13            // R13 = ((len-4) & 3) + 1 single offsets
	SHRQ  $2, R10            // R10 = (len-4) / 4 groups
	XORQ  BX, BX             // tail loop expects a zero accumulator
	TESTQ R10, R10
	JZ    crcAllTail

crcAllQuad:
	// A single 8-byte load provides the four overlapping 32-bit windows at
	// byte offsets 0, 1, 2 and 3; only the low 32 bits of each shifted copy
	// are fed to CRC32.
	MOVQ (R8), R11
	MOVQ R11, R12
	MOVQ R11, AX             // AX:  window at offset 0
	SHRQ $8, R11
	MOVQ R11, CX             // CX:  window at offset 1
	SHRQ $16, R12
	MOVQ R12, SI             // SI:  window at offset 2
	SHRQ $16, R11            // R11: window at offset 3 (total shift 24)

	XORQ BX, BX
	XORQ DX, DX
	XORQ DI, DI

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	// CRC32 ECX, EDX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd1

	// CRC32 ESI, EDI
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xfe

	MOVL BX, (R9)
	MOVL DX, 4(R9)
	MOVL DI, 8(R9)

	// Fourth window reuses AX/BX.
	XORQ BX, BX
	MOVL R11, AX

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
	MOVL BX, 12(R9)

	ADDQ $4, R8              // four input offsets consumed
	ADDQ $16, R9             // four uint32 results produced
	XORQ BX, BX              // leave a zero accumulator for the tail loop
	SUBQ $1, R10
	JNZ  crcAllQuad

crcAllTail:
	// One offset per iteration; BX is zero on entry to every iteration.
	MOVL (R8), AX

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, (R9)
	XORQ BX, BX
	ADDQ $1, R8
	ADDQ $4, R9
	SUBQ $1, R13
	JNZ  crcAllTail

crcAllDone:
	RET

crcAllSingle:
	// len(a) == 4: exactly one result, produced by the tail loop.
	XORQ BX, BX
	MOVQ $1, R13
	JMP  crcAllTail
|
||
|
|
||
|
// func matchLenSSE4(a, b []byte, max int) int
//
// Returns the number of leading bytes, up to max, on which a and b agree.
// Despite the name, only general-purpose instructions are used.
//
// Neither len(a) nor len(b) is read; the caller must ensure that both
// slices hold at least max bytes. A zero or negative max yields 0.
//
// Register roles:
//   SI  cursor into a
//   DI  cursor into b
//   DX  &b[0], kept to turn the final DI back into an index
//   CX  bytes still allowed to be compared
TEXT ·matchLenSSE4(SB), 4, $0
	MOVQ a_base+0(FP), SI
	MOVQ b_base+24(FP), DI
	MOVQ DI, DX
	MOVQ max+48(FP), CX

cmp8:
	// As long as we are 8 or more bytes before the end of max, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ CX, $8
	JLT  cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, SI
	ADDQ $8, DI
	SUBQ $8, CX
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

cmp1:
	// In the slices' tail, compare 1 byte at a time.
	//
	// The signed "<= 0" test (rather than "== 0") matters: a negative max
	// falls through the JLT above with CX < 0, and an equality-only check
	// would never fire, letting the loop run past the end of both slices.
	CMPQ CX, $0
	JLE  matchLenEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX
	JNE  matchLenEnd
	ADDQ $1, SI
	ADDQ $1, DI
	SUBQ $1, CX
	JMP  cmp1

matchLenEnd:
	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET
|
||
|
|
||
|
// func histogram(b []byte, h []int32)
//
// Adds one to the 32-bit counter h[v] for every byte value v found in b.
// Counters are incremented in place; h is not cleared by this routine.
//
// len(h) is never read. NOTE(review): h presumably holds at least 256
// entries, since any byte value can be used as an index - confirm against
// the Go call site.
//
// Register roles:
//   SI   read cursor into b
//   DI   &h[0]
//   R8   8-byte groups still to process
//   R9   len(b), later the count of leftover bytes
//   R10  current 8-byte group (shifted down one byte at a time) / tail byte
//   R11  index register; upper bits stay zero, only the low byte is rewritten
TEXT ·histogram(SB), 4, $0
	MOVQ h+24(FP), DI
	MOVQ b+0(FP), SI
	MOVQ b_len+8(FP), R9
	MOVQ R9, R8
	SHRQ $3, R8              // R8 = len(b) / 8
	JZ   histTailSetup       // fewer than 8 bytes: byte loop only
	XORQ R11, R11            // clear once; MOVB below only touches the low byte

histQword:
	MOVQ (SI), R10
	ADDQ $8, SI

	// Bytes 0..6: copy the low byte into the zeroed index register, count
	// it, then shift the next byte down.
	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	// Byte 7: after seven shifts R10 holds just the top byte, so it can be
	// used as the index directly.
	INCL (DI)(R10*4)

	SUBQ $1, R8
	JNZ  histQword

histTailSetup:
	ANDQ $7, R9              // R9 = len(b) % 8
	JZ   histDone
	XORQ R10, R10            // clear upper bits before the byte loads below

histByte:
	MOVB (SI), R10
	ADDQ $1, SI
	INCL (DI)(R10*4)
	SUBQ $1, R9
	JNZ  histByte

histDone:
	RET
|