// +build amd64,!appengine,!go1.9

TEXT ·hasAsm(SB),4,$0-1
MOVQ $1, AX
CPUID
SHRQ $23, CX
ANDQ $1, CX
MOVB CX, ret+0(FP)
RET

#define POPCNTQ_DX_DX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0xd2

TEXT ·popcntSliceAsm(SB),4,$0-32
XORQ	AX, AX
MOVQ	s+0(FP), SI
MOVQ	s_len+8(FP), CX
TESTQ	CX, CX
JZ		popcntSliceEnd
popcntSliceLoop:
BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0x16 // POPCNTQ (SI), DX
ADDQ	DX, AX
ADDQ	$8, SI
LOOP	popcntSliceLoop
popcntSliceEnd:
MOVQ	AX, ret+24(FP)
RET

TEXT ·popcntMaskSliceAsm(SB),4,$0-56
XORQ	AX, AX
MOVQ	s+0(FP), SI
MOVQ	s_len+8(FP), CX
TESTQ	CX, CX
JZ		popcntMaskSliceEnd
MOVQ	m+24(FP), DI
popcntMaskSliceLoop:
MOVQ	(DI), DX
NOTQ	DX
ANDQ	(SI), DX
POPCNTQ_DX_DX
ADDQ	DX, AX
ADDQ	$8, SI
ADDQ	$8, DI
LOOP	popcntMaskSliceLoop
popcntMaskSliceEnd:
MOVQ	AX, ret+48(FP)
RET

TEXT ·popcntAndSliceAsm(SB),4,$0-56
XORQ	AX, AX
MOVQ	s+0(FP), SI
MOVQ	s_len+8(FP), CX
TESTQ	CX, CX
JZ		popcntAndSliceEnd
MOVQ	m+24(FP), DI
popcntAndSliceLoop:
MOVQ	(DI), DX
ANDQ	(SI), DX
POPCNTQ_DX_DX
ADDQ	DX, AX
ADDQ	$8, SI
ADDQ	$8, DI
LOOP	popcntAndSliceLoop
popcntAndSliceEnd:
MOVQ	AX, ret+48(FP)
RET

TEXT ·popcntOrSliceAsm(SB),4,$0-56
XORQ	AX, AX
MOVQ	s+0(FP), SI
MOVQ	s_len+8(FP), CX
TESTQ	CX, CX
JZ		popcntOrSliceEnd
MOVQ	m+24(FP), DI
popcntOrSliceLoop:
MOVQ	(DI), DX
ORQ		(SI), DX
POPCNTQ_DX_DX
ADDQ	DX, AX
ADDQ	$8, SI
ADDQ	$8, DI
LOOP	popcntOrSliceLoop
popcntOrSliceEnd:
MOVQ	AX, ret+48(FP)
RET

TEXT ·popcntXorSliceAsm(SB),4,$0-56
XORQ	AX, AX
MOVQ	s+0(FP), SI
MOVQ	s_len+8(FP), CX
TESTQ	CX, CX
JZ		popcntXorSliceEnd
MOVQ	m+24(FP), DI
popcntXorSliceLoop:
MOVQ	(DI), DX
XORQ	(SI), DX
POPCNTQ_DX_DX
ADDQ	DX, AX
ADDQ	$8, SI
ADDQ	$8, DI
LOOP	popcntXorSliceLoop
popcntXorSliceEnd:
MOVQ	AX, ret+48(FP)
RET