320 lines
6 KiB
ArmAsm
320 lines
6 KiB
ArmAsm
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
// +build gc
|
||
|
|
||
|
#define NOSPLIT 4
|
||
|
#define RODATA 8
|
||
|
|
||
|
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
|
||
|
//
|
||
|
// func castagnoliSSE42(crc uint32, p []byte) uint32
|
||
|
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
|
||
|
MOVL crc+0(FP), AX // CRC value
|
||
|
MOVQ p+8(FP), SI // data pointer
|
||
|
MOVQ p_len+16(FP), CX // len(p)
|
||
|
|
||
|
// If there are fewer than 8 bytes to process, skip alignment.
|
||
|
CMPQ CX, $8
|
||
|
JL less_than_8
|
||
|
|
||
|
MOVQ SI, BX
|
||
|
ANDQ $7, BX
|
||
|
JZ aligned
|
||
|
|
||
|
// Process the first few bytes to 8-byte align the input.
|
||
|
|
||
|
// BX = 8 - BX. We need to process this many bytes to align.
|
||
|
SUBQ $1, BX
|
||
|
XORQ $7, BX
|
||
|
|
||
|
BTQ $0, BX
|
||
|
JNC align_2
|
||
|
|
||
|
CRC32B (SI), AX
|
||
|
DECQ CX
|
||
|
INCQ SI
|
||
|
|
||
|
align_2:
|
||
|
BTQ $1, BX
|
||
|
JNC align_4
|
||
|
|
||
|
// CRC32W (SI), AX
|
||
|
BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
|
||
|
|
||
|
SUBQ $2, CX
|
||
|
ADDQ $2, SI
|
||
|
|
||
|
align_4:
|
||
|
BTQ $2, BX
|
||
|
JNC aligned
|
||
|
|
||
|
// CRC32L (SI), AX
|
||
|
BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
|
||
|
|
||
|
SUBQ $4, CX
|
||
|
ADDQ $4, SI
|
||
|
|
||
|
aligned:
|
||
|
// The input is now 8-byte aligned and we can process 8-byte chunks.
|
||
|
CMPQ CX, $8
|
||
|
JL less_than_8
|
||
|
|
||
|
CRC32Q (SI), AX
|
||
|
ADDQ $8, SI
|
||
|
SUBQ $8, CX
|
||
|
JMP aligned
|
||
|
|
||
|
less_than_8:
|
||
|
// We may have some bytes left over; process 4 bytes, then 2, then 1.
|
||
|
BTQ $2, CX
|
||
|
JNC less_than_4
|
||
|
|
||
|
// CRC32L (SI), AX
|
||
|
BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
|
||
|
ADDQ $4, SI
|
||
|
|
||
|
less_than_4:
|
||
|
BTQ $1, CX
|
||
|
JNC less_than_2
|
||
|
|
||
|
// CRC32W (SI), AX
|
||
|
BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
|
||
|
ADDQ $2, SI
|
||
|
|
||
|
less_than_2:
|
||
|
BTQ $0, CX
|
||
|
JNC done
|
||
|
|
||
|
CRC32B (SI), AX
|
||
|
|
||
|
done:
|
||
|
MOVL AX, ret+32(FP)
|
||
|
RET
|
||
|
|
||
|
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
|
||
|
// bytes from each buffer.
|
||
|
//
|
||
|
// func castagnoliSSE42Triple(
|
||
|
// crc1, crc2, crc3 uint32,
|
||
|
// a, b, c []byte,
|
||
|
// rounds uint32,
|
||
|
// ) (retA uint32, retB uint32, retC uint32)
|
||
|
TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
|
||
|
MOVL crcA+0(FP), AX
|
||
|
MOVL crcB+4(FP), CX
|
||
|
MOVL crcC+8(FP), DX
|
||
|
|
||
|
MOVQ a+16(FP), R8 // data pointer
|
||
|
MOVQ b+40(FP), R9 // data pointer
|
||
|
MOVQ c+64(FP), R10 // data pointer
|
||
|
|
||
|
MOVL rounds+88(FP), R11
|
||
|
|
||
|
loop:
|
||
|
CRC32Q (R8), AX
|
||
|
CRC32Q (R9), CX
|
||
|
CRC32Q (R10), DX
|
||
|
|
||
|
CRC32Q 8(R8), AX
|
||
|
CRC32Q 8(R9), CX
|
||
|
CRC32Q 8(R10), DX
|
||
|
|
||
|
CRC32Q 16(R8), AX
|
||
|
CRC32Q 16(R9), CX
|
||
|
CRC32Q 16(R10), DX
|
||
|
|
||
|
ADDQ $24, R8
|
||
|
ADDQ $24, R9
|
||
|
ADDQ $24, R10
|
||
|
|
||
|
DECQ R11
|
||
|
JNZ loop
|
||
|
|
||
|
MOVL AX, retA+96(FP)
|
||
|
MOVL CX, retB+100(FP)
|
||
|
MOVL DX, retC+104(FP)
|
||
|
RET
|
||
|
|
||
|
// func haveSSE42() bool
|
||
|
TEXT ·haveSSE42(SB), NOSPLIT, $0
|
||
|
XORQ AX, AX
|
||
|
INCL AX
|
||
|
CPUID
|
||
|
SHRQ $20, CX
|
||
|
ANDQ $1, CX
|
||
|
MOVB CX, ret+0(FP)
|
||
|
RET
|
||
|
|
||
|
// func haveCLMUL() bool
|
||
|
TEXT ·haveCLMUL(SB), NOSPLIT, $0
|
||
|
XORQ AX, AX
|
||
|
INCL AX
|
||
|
CPUID
|
||
|
SHRQ $1, CX
|
||
|
ANDQ $1, CX
|
||
|
MOVB CX, ret+0(FP)
|
||
|
RET
|
||
|
|
||
|
// func haveSSE41() bool
|
||
|
TEXT ·haveSSE41(SB), NOSPLIT, $0
|
||
|
XORQ AX, AX
|
||
|
INCL AX
|
||
|
CPUID
|
||
|
SHRQ $19, CX
|
||
|
ANDQ $1, CX
|
||
|
MOVB CX, ret+0(FP)
|
||
|
RET
|
||
|
|
||
|
// CRC32 polynomial data
|
||
|
//
|
||
|
// These constants are lifted from the
|
||
|
// Linux kernel, since they avoid the costly
|
||
|
// PSHUFB 16 byte reversal proposed in the
|
||
|
// original Intel paper.
|
||
|
DATA r2r1kp<>+0(SB)/8, $0x154442bd4
|
||
|
DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
|
||
|
DATA r4r3kp<>+0(SB)/8, $0x1751997d0
|
||
|
DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
|
||
|
DATA rupolykp<>+0(SB)/8, $0x1db710641
|
||
|
DATA rupolykp<>+8(SB)/8, $0x1f7011641
|
||
|
DATA r5kp<>+0(SB)/8, $0x163cd6124
|
||
|
|
||
|
GLOBL r2r1kp<>(SB), RODATA, $16
|
||
|
GLOBL r4r3kp<>(SB), RODATA, $16
|
||
|
GLOBL rupolykp<>(SB), RODATA, $16
|
||
|
GLOBL r5kp<>(SB), RODATA, $8
|
||
|
|
||
|
// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
|
||
|
// len(p) must be at least 64, and must be a multiple of 16.
|
||
|
|
||
|
// func ieeeCLMUL(crc uint32, p []byte) uint32
|
||
|
TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
|
||
|
MOVL crc+0(FP), X0 // Initial CRC value
|
||
|
MOVQ p+8(FP), SI // data pointer
|
||
|
MOVQ p_len+16(FP), CX // len(p)
|
||
|
|
||
|
MOVOU (SI), X1
|
||
|
MOVOU 16(SI), X2
|
||
|
MOVOU 32(SI), X3
|
||
|
MOVOU 48(SI), X4
|
||
|
PXOR X0, X1
|
||
|
ADDQ $64, SI // buf+=64
|
||
|
SUBQ $64, CX // len-=64
|
||
|
CMPQ CX, $64 // Less than 64 bytes left
|
||
|
JB remain64
|
||
|
|
||
|
MOVOA r2r1kp<>+0(SB), X0
|
||
|
|
||
|
loopback64:
|
||
|
MOVOA X1, X5
|
||
|
MOVOA X2, X6
|
||
|
MOVOA X3, X7
|
||
|
MOVOA X4, X8
|
||
|
|
||
|
PCLMULQDQ $0, X0, X1
|
||
|
PCLMULQDQ $0, X0, X2
|
||
|
PCLMULQDQ $0, X0, X3
|
||
|
PCLMULQDQ $0, X0, X4
|
||
|
|
||
|
// Load next early
|
||
|
MOVOU (SI), X11
|
||
|
MOVOU 16(SI), X12
|
||
|
MOVOU 32(SI), X13
|
||
|
MOVOU 48(SI), X14
|
||
|
|
||
|
PCLMULQDQ $0x11, X0, X5
|
||
|
PCLMULQDQ $0x11, X0, X6
|
||
|
PCLMULQDQ $0x11, X0, X7
|
||
|
PCLMULQDQ $0x11, X0, X8
|
||
|
|
||
|
PXOR X5, X1
|
||
|
PXOR X6, X2
|
||
|
PXOR X7, X3
|
||
|
PXOR X8, X4
|
||
|
|
||
|
PXOR X11, X1
|
||
|
PXOR X12, X2
|
||
|
PXOR X13, X3
|
||
|
PXOR X14, X4
|
||
|
|
||
|
ADDQ $0x40, DI
|
||
|
ADDQ $64, SI // buf+=64
|
||
|
SUBQ $64, CX // len-=64
|
||
|
CMPQ CX, $64 // Less than 64 bytes left?
|
||
|
JGE loopback64
|
||
|
|
||
|
// Fold result into a single register (X1)
|
||
|
remain64:
|
||
|
MOVOA r4r3kp<>+0(SB), X0
|
||
|
|
||
|
MOVOA X1, X5
|
||
|
PCLMULQDQ $0, X0, X1
|
||
|
PCLMULQDQ $0x11, X0, X5
|
||
|
PXOR X5, X1
|
||
|
PXOR X2, X1
|
||
|
|
||
|
MOVOA X1, X5
|
||
|
PCLMULQDQ $0, X0, X1
|
||
|
PCLMULQDQ $0x11, X0, X5
|
||
|
PXOR X5, X1
|
||
|
PXOR X3, X1
|
||
|
|
||
|
MOVOA X1, X5
|
||
|
PCLMULQDQ $0, X0, X1
|
||
|
PCLMULQDQ $0x11, X0, X5
|
||
|
PXOR X5, X1
|
||
|
PXOR X4, X1
|
||
|
|
||
|
// If there is less than 16 bytes left we are done
|
||
|
CMPQ CX, $16
|
||
|
JB finish
|
||
|
|
||
|
// Encode 16 bytes
|
||
|
remain16:
|
||
|
MOVOU (SI), X10
|
||
|
MOVOA X1, X5
|
||
|
PCLMULQDQ $0, X0, X1
|
||
|
PCLMULQDQ $0x11, X0, X5
|
||
|
PXOR X5, X1
|
||
|
PXOR X10, X1
|
||
|
SUBQ $16, CX
|
||
|
ADDQ $16, SI
|
||
|
CMPQ CX, $16
|
||
|
JGE remain16
|
||
|
|
||
|
finish:
|
||
|
// Fold final result into 32 bits and return it
|
||
|
PCMPEQB X3, X3
|
||
|
PCLMULQDQ $1, X1, X0
|
||
|
PSRLDQ $8, X1
|
||
|
PXOR X0, X1
|
||
|
|
||
|
MOVOA X1, X2
|
||
|
MOVQ r5kp<>+0(SB), X0
|
||
|
|
||
|
// Creates 32 bit mask. Note that we don't care about upper half.
|
||
|
PSRLQ $32, X3
|
||
|
|
||
|
PSRLDQ $4, X2
|
||
|
PAND X3, X1
|
||
|
PCLMULQDQ $0, X0, X1
|
||
|
PXOR X2, X1
|
||
|
|
||
|
MOVOA rupolykp<>+0(SB), X0
|
||
|
|
||
|
MOVOA X1, X2
|
||
|
PAND X3, X1
|
||
|
PCLMULQDQ $0x10, X0, X1
|
||
|
PAND X3, X1
|
||
|
PCLMULQDQ $0, X0, X1
|
||
|
PXOR X2, X1
|
||
|
|
||
|
// PEXTRD $1, X1, AX (SSE 4.1)
|
||
|
BYTE $0x66; BYTE $0x0f; BYTE $0x3a
|
||
|
BYTE $0x16; BYTE $0xc8; BYTE $0x01
|
||
|
MOVL AX, ret+32(FP)
|
||
|
|
||
|
RET
|