wiki:I2Result

Version 5 (modified by cameron, 3 years ago) (diff)

--

I2Result Demo

Here is a small program pr.cpp.

// Demo driver: builds two constant bit blocks, compares them byte-wise, and
// prints the 64-bit-field sign mask of the comparison result.
// argc/argv are unused.
int main(int argc, char * argv[]) {
  // First operand, built with simd<8>::constant<0x33>().
  // NOTE(review): volatile presumably keeps the compiler from constant-folding
  // the comparison below at -O3 -- confirm.
  BitBlock volatile temp1 = simd<8>::constant<0x33>();
  // Print it, labelled "temp1".
  print_register<BitBlock>("temp1", temp1);
  // Second operand, built with simd<16>::constant<0x3344>(), printed the same way.
  BitBlock volatile temp2 = simd<16>::constant<0x3344>();
  print_register<BitBlock>("temp2", temp2);
  // Compare the two blocks with simd<8>::eq, print the result, then take
  // hsimd<64>::signmask of it and print that as an unsigned integer.
  BitBlock rslt = simd<8>::eq(temp1, temp2);
  print_register("simd<8>::eq(temp1, temp2)", rslt);
  uint32_t msk = hsimd<64>::signmask(rslt);
  printf("  hsimd<64>::signmask(rslt) = %u\n", msk);

  // Done.   Exit normally.
  return(0);
}

We can compile to LLVM IR.

clang++ -msse2  -O3  -o pr.ll pr.cpp -I../lib/ -S -emit-llvm

We get the following main.

; main as emitted by clang++ at -O3 for pr.cpp.
define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
  %temp1 = alloca <2 x i64>, align 16
  %temp2 = alloca <2 x i64>, align 16
  ; temp1: each i64 element is 3689348814741910323 = 0x3333333333333333.
  store volatile <2 x i64> <i64 3689348814741910323, i64 3689348814741910323>, <2 x i64>* %temp1, align 16
  %1 = load volatile <2 x i64>* %temp1, align 16
  tail call void @_Z14print_registerIDv2_xEvPKcT_(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), <2 x i64> %1)
  ; temp2: each i64 element is 3694133962361549636 = 0x3344334433443344.
  store volatile <2 x i64> <i64 3694133962361549636, i64 3694133962361549636>, <2 x i64>* %temp2, align 16
  %2 = load volatile <2 x i64>* %temp2, align 16
  tail call void @_Z14print_registerIDv2_xEvPKcT_(i8* getelementptr inbounds ([6 x i8]* @.str1, i64 0, i64 0), <2 x i64> %2)
  ; simd<8>::eq: both operands are reloaded (volatile), compared as sixteen i8
  ; elements, and each i1 result is sign-extended to an all-ones or all-zeros byte.
  %3 = load volatile <2 x i64>* %temp1, align 16
  %4 = load volatile <2 x i64>* %temp2, align 16
  %5 = bitcast <2 x i64> %3 to <16 x i8>
  %6 = bitcast <2 x i64> %4 to <16 x i8>
  %7 = icmp eq <16 x i8> %5, %6
  %8 = sext <16 x i1> %7 to <16 x i8>
  %9 = bitcast <16 x i8> %8 to <2 x i64>
  tail call void @_Z14print_registerIDv2_xEvPKcT_(i8* getelementptr inbounds ([26 x i8]* @.str2, i64 0, i64 0), <2 x i64> %9)
  ; hsimd<64>::signmask: the comparison result is viewed as <2 x double> and
  ; passed to the movmsk.pd intrinsic; the result is masked with 255 and printed.
  %10 = bitcast <16 x i8> %8 to <2 x double>
  %11 = tail call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %10) #3
  %12 = and i32 %11, 255
  %13 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([34 x i8]* @.str3, i64 0, i64 0), i32 %12)
  ret i32 0
}

We can replace the call to llvm.x86.sse2.movmsk.pd with our own implementation, signmaskd, and call the resulting program pr2.ll.

; signmaskd: target-independent stand-in for llvm.x86.sse2.movmsk.pd.
; Takes a <2 x double> and returns an i32 built from one bit per element,
; where each bit is 1 when that element's 64-bit pattern is negative as a
; signed integer (i.e. its sign bit is set).
define i32 @signmaskd(<2 x double> %a) alwaysinline #5
{
	; Reinterpret the two doubles as two 64-bit integers.
	%bits = bitcast <2 x double> %a to <2 x i64>
	; Signed "less than zero" yields one i1 per element.
	%b = icmp slt <2 x i64> %bits, zeroinitializer
        ; Pack the two i1 results into a single 2-bit integer.
        %c = bitcast <2 x i1> %b to i2
        ; Zero-extend the 2-bit mask to the i32 return type.
        %result = zext i2 %c to i32
	ret i32 %result
}

But the two programs produce different results: pr reports a sign mask of 3, while pr2 reports 1. Why?

$ ./pr
                                   temp1 = 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 
                                   temp2 = 33 44 33 44 33 44 33 44 33 44 33 44 33 44 33 44 
               simd<8>::eq(temp1, temp2) = FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 
  hsimd<64>::signmask(rslt) = 3
$ ./pr2
                                   temp1 = 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 
                                   temp2 = 33 44 33 44 33 44 33 44 33 44 33 44 33 44 33 44 
               simd<8>::eq(temp1, temp2) = FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 
  hsimd<64>::signmask(rslt) = 1

The assembly output for signmaskd shows a code-generation problem for the LLVM IR above: the bit extracted from the first element is clobbered by the bit extracted from the second, because both are stored to the same memory location (-2(%rsp)) instead of being combined into a two-bit value.

signmaskd:                              # @signmaskd
# BB#0:
	# Vector compare built from 32-bit operations; the per-element result
	# ends up in %xmm1.
	# NOTE(review): the contents of .LCPI2_0 are not shown here; presumably
	# the constant used to lower the 64-bit signed compare -- confirm.
	movdqa	.LCPI2_0(%rip), %xmm1
	pxor	%xmm1, %xmm0
	movdqa	%xmm1, %xmm2
	pcmpgtd	%xmm0, %xmm2
	pshufd	$-96, %xmm2, %xmm3      # xmm3 = xmm2[0,0,2,2]
	pcmpeqd	%xmm1, %xmm0
	pshufd	$-11, %xmm0, %xmm0      # xmm0 = xmm0[1,1,3,3]
	pand	%xmm3, %xmm0
	pshufd	$-11, %xmm2, %xmm1      # xmm1 = xmm2[1,1,3,3]
	por	%xmm0, %xmm1
	# Low 64-bit element: keep bit 0 and store it as a byte at -2(%rsp).
	movd	%xmm1, %rax
	andl	$1, %eax
	movb	%al, -2(%rsp)
	# High 64-bit element: keep bit 0 and store it to the SAME byte at
	# -2(%rsp), overwriting the value stored just above.
	punpckhqdq	%xmm1, %xmm1    # xmm1 = xmm1[1,1]
	movd	%xmm1, %rax
	andl	$1, %eax
	movb	%al, -2(%rsp)
	# The return value is reloaded from that single byte, so only the bit
	# from the second store is returned.
	movzbl	-2(%rsp), %eax
	ret