// +build !noasm
// +build !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

#include "textflag.h"

// func crc32sse(a []byte) uint32
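// crc32sse hashes the first 4 bytes of a with the SSE4.2 CRC32 instruction
// (Castagnoli polynomial), starting from an initial value of zero. The
// instruction applies no pre/post bit inversion, so the result differs from
// hash/crc32's Castagnoli checksum. A rough bit-by-bit Go sketch of the same
// computation (reflected polynomial 0x82F63B78; an illustrative sketch, not
// code from this package):
//
//	func crc32sseRef(a []byte) uint32 {
//		crc := uint32(0)
//		for _, v := range a[:4] {
//			crc ^= uint32(v)
//			for i := 0; i < 8; i++ {
//				if crc&1 != 0 {
//					crc = crc>>1 ^ 0x82F63B78
//				} else {
//					crc >>= 1
//				}
//			}
//		}
//		return crc
//	}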
TEXT ·crc32sse(SB), NOSPLIT, $0
	MOVQ a+0(FP), R10 // R10: &a[0]
	XORQ BX, BX       // BX: CRC accumulator, initial value 0

	// CRC32   dword (R10), EBX
	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0x1a

	MOVL BX, ret+24(FP)
	RET

// func crc32sseAll(a []byte, dst []uint32)
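// crc32sseAll computes one crc32sse-style hash per 4-byte window of a and
// stores the results in dst. A rough Go sketch of the intended semantics
// (assumes len(dst) >= len(a)-3; illustrative only):
//
//	for i := 0; i+4 <= len(a); i++ {
//		dst[i] = crc32sse(a[i:])
//	}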
TEXT ·crc32sseAll(SB), NOSPLIT, $0
	MOVQ  a+0(FP), R8      // R8: src
	MOVQ  a_len+8(FP), R10 // input length
	MOVQ  dst+24(FP), R9   // R9: dst
	SUBQ  $4, R10          // R10 = len(a) - 4
	JS    end              // fewer than 4 bytes: nothing to do
	JZ    one_crc          // exactly 4 bytes: a single CRC
	MOVQ  R10, R13
	SHRQ  $2, R10          // (len(a)-4)/4 iterations of crc_loop
	ANDQ  $3, R13          // (len(a)-4)&3 ...
	XORQ  BX, BX           // BX: CRC accumulator, initial value 0
	ADDQ  $1, R13          // ... plus one final CRC in rem_loop
	TESTQ R10, R10
	JZ    rem_loop

crc_loop:
	// Compute the CRCs of four consecutive 4-byte windows per iteration.
	MOVQ (R8), R11 // R11: the next 8 source bytes
	XORQ BX, BX
	XORQ DX, DX
	XORQ DI, DI
	MOVQ R11, R12
	SHRQ $8, R11
	MOVQ R12, AX   // AX: bytes 0-3
	MOVQ R11, CX   // CX: bytes 1-4
	SHRQ $16, R12
	SHRQ $16, R11
	MOVQ R12, SI   // SI: bytes 2-5

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	// CRC32   ECX, EDX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd1

	// CRC32   ESI, EDI
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xfe
	MOVL BX, (R9)
	MOVL DX, 4(R9)
	MOVL DI, 8(R9)

	XORQ BX, BX
	MOVL R11, AX // AX: bytes 3-6

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
	MOVL BX, 12(R9)

	ADDQ $16, R9
	ADDQ $4, R8
	XORQ BX, BX
	SUBQ $1, R10
	JNZ  crc_loop

rem_loop:
	// Compute the remaining CRCs one 4-byte window at a time.
	MOVL (R8), AX

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, (R9)
	ADDQ $4, R9
	ADDQ $1, R8
	XORQ BX, BX
	SUBQ $1, R13
	JNZ  rem_loop

end:
	RET

one_crc:
	// Exactly 4 input bytes: compute a single CRC in rem_loop.
	MOVQ $1, R13
	XORQ BX, BX
	JMP  rem_loop

// func matchLenSSE4(a, b []byte, max int) int
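// matchLenSSE4 returns the number of leading bytes that are equal in a and b,
// comparing at most max bytes. A rough Go sketch of the same approach, using
// encoding/binary and math/bits (assumes max <= len(a) and max <= len(b);
// illustrative only):
//
//	func matchLenRef(a, b []byte, max int) int {
//		n := 0
//		for max >= 8 {
//			x := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
//			if x != 0 {
//				// First differing byte; mirrors the BSF path below.
//				return n + bits.TrailingZeros64(x)>>3
//			}
//			n += 8
//			max -= 8
//		}
//		for max > 0 && a[n] == b[n] {
//			n++
//			max--
//		}
//		return n
//	}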
TEXT ·matchLenSSE4(SB), NOSPLIT, $0
	MOVQ a_base+0(FP), SI  // SI: &a[0]
	MOVQ b_base+24(FP), DI // DI: &b[0]
	MOVQ DI, DX            // DX: &b[0], kept to compute the return value
	MOVQ max+48(FP), CX    // CX: bytes left to compare

cmp8:
	// As long as at least 8 of the max bytes remain to be compared, load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ CX, $8
	JLT  cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, SI
	ADDQ $8, DI
	SUBQ $8, CX
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
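	// For example, if the quadwords first differ in byte 2, the lowest set
	// bit of the XOR result lies in bits 16-23, so BSF yields a value in
	// that range and the shift by 3 produces 2.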
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

cmp1:
	// In the slices' tail, compare 1 byte at a time.
	CMPQ CX, $0
	JEQ  matchLenEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX
	JNE  matchLenEnd
	ADDQ $1, SI
	ADDQ $1, DI
	SUBQ $1, CX
	JMP  cmp1

matchLenEnd:
	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

// func histogram(b []byte, h []int32)
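// histogram counts byte values in b, incrementing the matching entries of h.
// Equivalent Go (a sketch; assumes len(h) >= 256):
//
//	for _, v := range b {
//		h[v]++
//	}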
TEXT ·histogram(SB), NOSPLIT, $0
	MOVQ b+0(FP), SI     // SI: &b[0]
	MOVQ b_len+8(FP), R9 // R9: len(b)
	MOVQ h+24(FP), DI    // DI: &h[0]
	MOVQ R9, R8
	SHRQ $3, R8          // R8: len(b)/8 iterations of loop_hist8
	JZ   hist1
	XORQ R11, R11        // R11: current byte value; only the low byte is ever written

loop_hist8:
	// Count eight bytes per iteration, peeling them off one at a time.
	MOVQ (SI), R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	INCL (DI)(R10*4)

	ADDQ $8, SI
	DECQ R8
	JNZ  loop_hist8

hist1:
	ANDQ $7, R9  // R9: remaining 0-7 bytes
	JZ   end_hist
	XORQ R10, R10

loop_hist1:
	MOVB (SI), R10
	INCL (DI)(R10*4)
	INCQ SI
	DECQ R9
	JNZ  loop_hist1

end_hist:
	RET