1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
|
#include "go_asm.h" #include "textflag.h"
// Index reads its arguments from the 56-byte argument frame, places them in
// the registers indexbody<> expects, and tail-jumps there. indexbody<> stores
// the result through R11 and returns to Index's caller.
//
// Frame slots read: a_base+0, a_len+8, b_base+24, b_len+32; result at ret+48.
// NOTE(review): the 24-byte spacing matches two slice headers; the Go
// declaration is outside this file (presumably func Index(a, b []byte) int).
TEXT ·Index(SB),NOSPLIT,$0-56
	LEAQ ret+48(FP), R11   // R11 = address of the result slot
	MOVQ b_len+32(FP), AX  // AX  = length of b, the pattern searched for
	MOVQ b_base+24(FP), R8 // R8  = pointer to b
	MOVQ a_len+8(FP), DX   // DX  = length of a, the text searched in
	MOVQ a_base+0(FP), DI  // DI  = pointer to a; indexbody<> advances it
	MOVQ DI, R10           // R10 = start of a, kept so DI can become an offset
	JMP  indexbody<>(SB)
// IndexString reads its arguments from the 40-byte argument frame, places
// them in the registers indexbody<> expects, and tail-jumps there.
// indexbody<> stores the result through R11 and returns to IndexString's
// caller.
//
// Frame slots read: a_base+0, a_len+8, b_base+16, b_len+24; result at ret+32.
// NOTE(review): the 16-byte spacing matches two string headers; the Go
// declaration is outside this file (presumably
// func IndexString(a, b string) int).
TEXT ·IndexString(SB),NOSPLIT,$0-40
	LEAQ ret+32(FP), R11   // R11 = address of the result slot
	MOVQ b_len+24(FP), AX  // AX  = length of b, the pattern searched for
	MOVQ b_base+16(FP), R8 // R8  = pointer to b
	MOVQ a_len+8(FP), DX   // DX  = length of a, the text searched in
	MOVQ a_base+0(FP), DI  // DI  = pointer to a; indexbody<> advances it
	MOVQ DI, R10           // R10 = start of a, kept so DI can become an offset
	JMP  indexbody<>(SB)
// indexbody<> is the search routine shared by Index and IndexString, which
// reach it with a tail JMP. Arguments arrive in registers, not on the stack:
//
//	AX  = length of the pattern (the bytes searched for)
//	DX  = length of the text (the bytes searched in)
//	DI  = pointer to the text; used as the scan cursor
//	R8  = pointer to the pattern
//	R10 = copy of the original text pointer
//	R11 = address where the result is stored
//
// Result: the offset (DI - R10) of the first match is stored to (R11), or -1
// when the pattern does not occur or is longer than the text.
//
// Clobbers BX, CX, DX, SI, DI, R8, R9, X0-X3, Y0-Y4 and the flags.
//
// The routine dispatches on the pattern length. Lengths 2, 4, 8, 16 and 32
// are matched with a single load-and-compare per position. The lengths in
// between are matched with two overlapping loads: the pattern's first bytes
// are compared first, and only on a hit are its last bytes compared too.
//
// NOTE(review): a pattern length below 2 is handled as if it were 2, and the
// widest case compares two overlapping 32-byte blocks, which covers at most
// 64 pattern bytes. Callers presumably keep the length inside that range.
TEXT indexbody<>(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA   fail                // pattern longer than text
	CMPQ DX, $16
	JAE  sse42               // text holds at least one 16-byte window

no_sse42:
	// ---- pattern of 2 bytes: one 16-bit compare per position ----
	CMPQ AX, $2
	JA   _3_or_more
	MOVW (R8), R8            // R8 low word = pattern; the pointer is no longer needed
	LEAQ -1(DI)(DX*1), DX    // DX = one past the last valid start position
loop2:
	MOVW (DI), SI
	CMPW SI, R8
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop2
	JMP  fail

_3_or_more:
	// ---- pattern of 3 bytes: 16-bit compares of bytes 0-1 and bytes 1-2 ----
	CMPQ AX, $3
	JA   _4_or_more
	MOVW 1(R8), BX           // BX = pattern bytes 1-2; loaded before R8 is overwritten
	MOVW (R8), R8            // R8 = pattern bytes 0-1
	LEAQ -2(DI)(DX*1), DX    // DX = one past the last valid start position
loop3:
	MOVW (DI), SI
	CMPW SI, R8
	JZ   partial_success3
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop3
	JMP  fail
partial_success3:
	// First two bytes matched; check the last two.
	MOVW 1(DI), SI
	CMPW SI, BX
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop3
	JMP  fail

_4_or_more:
	// ---- pattern of 4 bytes: one 32-bit compare per position ----
	CMPQ AX, $4
	JA   _5_or_more
	MOVL (R8), R8
	LEAQ -3(DI)(DX*1), DX    // DX = one past the last valid start position
loop4:
	MOVL (DI), SI
	CMPL SI, R8
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop4
	JMP  fail

_5_or_more:
	// ---- pattern of 5-7 bytes: 32-bit compares of the first and last 4 bytes ----
	CMPQ AX, $7
	JA   _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX              // DX = text end - len(pattern) + 1
	MOVL -4(R8)(AX*1), BX    // BX = last 4 pattern bytes; loaded before R8 is overwritten
	MOVL (R8), R8            // R8 = first 4 pattern bytes
loop5to7:
	MOVL (DI), SI
	CMPL SI, R8
	JZ   partial_success5to7
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop5to7
	JMP  fail
partial_success5to7:
	// Head matched; compare the 4 bytes ending at DI+len(pattern).
	MOVL -4(AX)(DI*1), SI
	CMPL SI, BX
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop5to7
	JMP  fail

_8_or_more:
	// ---- pattern of 8 bytes: one 64-bit compare per position ----
	CMPQ AX, $8
	JA   _9_or_more
	MOVQ (R8), R8
	LEAQ -7(DI)(DX*1), DX    // DX = one past the last valid start position
loop8:
	MOVQ (DI), SI
	CMPQ SI, R8
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop8
	JMP  fail

_9_or_more:
	// ---- pattern of 9-15 bytes: 64-bit compares of the first and last 8 bytes ----
	// Also entered directly from the sse42 dispatch for patterns of 12+ bytes.
	CMPQ AX, $15
	JA   _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX              // DX = text end - len(pattern) + 1
	MOVQ -8(R8)(AX*1), BX    // BX = last 8 pattern bytes; loaded before R8 is overwritten
	MOVQ (R8), R8            // R8 = first 8 pattern bytes
loop9to15:
	MOVQ (DI), SI
	CMPQ SI, R8
	JZ   partial_success9to15
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop9to15
	JMP  fail
partial_success9to15:
	// Head matched; compare the 8 bytes ending at DI+len(pattern).
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI, BX
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop9to15
	JMP  fail

_16_or_more:
	// ---- pattern of 16 bytes: one 128-bit compare per position ----
	CMPQ AX, $16
	JA   _17_or_more
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), DX   // DX = one past the last valid start position
loop16:
	MOVOU    (DI), X2
	PCMPEQB  X1, X2
	PMOVMSKB X2, SI
	CMPQ     SI, $0xffff     // all 16 byte lanes equal?
	JE       success
	ADDQ     $1, DI
	CMPQ     DI, DX
	JB       loop16
	JMP      fail

_17_or_more:
	// ---- pattern of 17-31 bytes: 128-bit compares of the first and last 16 bytes ----
	CMPQ AX, $31
	JA   _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX              // DX = text end - len(pattern) + 1
	MOVOU -16(R8)(AX*1), X0  // X0 = last 16 pattern bytes
	MOVOU (R8), X1           // X1 = first 16 pattern bytes
loop17to31:
	MOVOU    (DI), X2
	PCMPEQB  X1, X2
	PMOVMSKB X2, SI
	CMPQ     SI, $0xffff
	JE       partial_success17to31
	ADDQ     $1, DI
	CMPQ     DI, DX
	JB       loop17to31
	JMP      fail
partial_success17to31:
	// Head matched; compare the 16 bytes ending at DI+len(pattern).
	MOVOU    -16(AX)(DI*1), X3
	PCMPEQB  X0, X3
	PMOVMSKB X3, SI
	CMPQ     SI, $0xffff
	JE       success
	ADDQ     $1, DI
	CMPQ     DI, DX
	JB       loop17to31
	JMP      fail

_32_or_more:
	// ---- pattern of 32 bytes: one 256-bit compare per position ----
	// The paths from here on use AVX2 instructions and no CPU feature test
	// guards them in this routine.
	// NOTE(review): presumably callers only pass 32+ byte patterns when AVX2
	// is available - confirm against the Go-side callers.
	CMPQ AX, $32
	JA   _33_to_63
	VMOVDQU (R8), Y1
	LEAQ -31(DI)(DX*1), DX   // DX = one past the last valid start position
loop32:
	VMOVDQU   (DI), Y2
	VPCMPEQB  Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL      SI, $0xffffffff // all 32 byte lanes equal?
	JE        success_avx2
	ADDQ      $1, DI
	CMPQ      DI, DX
	JB        loop32
	JMP       fail_avx2

_33_to_63:
	// ---- pattern of 33+ bytes: 256-bit compares of the first and last 32 bytes ----
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX               // DX = text end - len(pattern) + 1
	VMOVDQU -32(R8)(AX*1), Y0 // Y0 = last 32 pattern bytes
	VMOVDQU (R8), Y1          // Y1 = first 32 pattern bytes
loop33to63:
	VMOVDQU   (DI), Y2
	VPCMPEQB  Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL      SI, $0xffffffff
	JE        partial_success33to63
	ADDQ      $1, DI
	CMPQ      DI, DX
	JB        loop33to63
	JMP       fail_avx2
partial_success33to63:
	// Head matched; compare the 32 bytes ending at DI+len(pattern).
	VMOVDQU   -32(AX)(DI*1), Y3
	VPCMPEQB  Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL      SI, $0xffffffff
	JE        success_avx2
	ADDQ      $1, DI
	CMPQ      DI, DX
	JB        loop33to63
	// Text exhausted: fall through to fail_avx2.

fail_avx2:
	VZEROUPPER               // clear the upper YMM halves before returning
fail:
	MOVQ $-1, (R11)
	RET

success_avx2:
	VZEROUPPER               // clear the upper YMM halves before returning
	JMP  success

sse42:
	// ---- text of 16+ bytes: try the PCMPESTRI substring search ----
	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
	JNE  no_sse42            // SSE4.2 flag not set: use the compare loops
	// Patterns of 12+ bytes go to the compare loops instead. The PCMPESTRI
	// loop below advances by 16-len(pattern), i.e. at most 4 bytes for them.
	CMPQ AX, $12
	JAE  _9_or_more
	// The pattern is shorter than 16 bytes here, so the 16-byte load below
	// reads past its end. Skip this path when (R8+16) mod 4096 < 16, i.e.
	// when the pattern starts in the last 16 bytes of a 4 KiB page and the
	// load could touch the following page.
	LEAQ  16(R8), SI
	TESTW $0xff0, SI
	JEQ   no_sse42
	MOVOU (R8), X1           // X1 = pattern, plus bytes beyond it that PCMPESTRI ignores
	LEAQ  -15(DI)(DX*1), SI  // SI = one past the last position with 16 readable bytes
	// PCMPESTRI takes the text length implicitly from DX (EDX in the 32-bit
	// form), uses its absolute value, and treats 16 or more as 16. Every
	// window compared below is a full 16 bytes, so pin the length to 16 now
	// that SI has been derived from it. Without this, a text length whose
	// low 32 bits are small in magnitude (possible for texts of about 4 GiB
	// and up) would make the instruction ignore part of each window and
	// miss matches.
	MOVQ  $16, DX
	MOVQ  $16, R9
	SUBQ  AX, R9             // R9 = 16 - len(pattern): largest in-window match offset, and the step
loop_sse42:
	// imm8 0x0c: unsigned bytes, "equal ordered" (substring) compare,
	// positive polarity, least significant index returned in CX.
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16: no match in this window.
	// CX >  R9: only a partial match running off the end of the window.
	// CX <= R9: complete match at offset CX.
	CMPQ CX, R9
	JBE  sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB   loop_sse42
	// Fewer than 16 bytes remain past DI: check the final 16 bytes of the text.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA   fail
	LEAQ -1(SI), DI          // DI = start of the final window
sse42_success:
	ADDQ CX, DI              // DI = address of the match
success:
	SUBQ R10, DI             // convert the address to an offset from the text start
	MOVQ DI, (R11)
	RET
|