I think `sse_probe` should be as follows (note the `movdqu` line):

```asm
        .global sse_probe
sse_probe:
        movdqu  (%rsi), %xmm0
        movq    %xmm0, %rax
        andq    $0xff, %rax
        shl     $STRIDE_SHIFT, %rax
        movq    (%rdi, %rax), %rax
        ret
```