wiki:I2Result

Version 2 (modified by cameron, 3 years ago) (diff)

--

I2Result Demo

Here is a small program pr.cpp.

// Demo driver: builds two constant bit blocks, compares them byte by byte,
// prints the comparison result, then prints the value returned by
// hsimd<64>::signmask for that result.
int main(int argc, char * argv[]) {
  // First operand: per the program output, every 8-bit field holds 0x33.
  // NOTE(review): volatile is presumably here to stop the optimizer from
  // constant-folding the whole demo at -O3 -- confirm.
  BitBlock volatile temp1 = simd<8>::constant<0x33>();
  // Print the first operand, labelled "temp1".
  print_register<BitBlock>("temp1", temp1);
  // Second operand: per the program output, every 16-bit field holds 0x3344.
  BitBlock volatile temp2 = simd<16>::constant<0x3344>();
  print_register<BitBlock>("temp2", temp2);
  // Compare the operands as 8-bit fields; in the printed result, matching
  // bytes show as FF and differing bytes as 00.
  BitBlock rslt = simd<8>::eq(temp1, temp2);
  print_register("simd<8>::eq(temp1, temp2)", rslt);
  // Reduce the comparison result to an integer mask and print it.
  // NOTE(review): presumably one bit per 64-bit field, taken from that
  // field's sign bit -- confirm against the hsimd<64>::signmask definition.
  uint32_t msk = hsimd<64>::signmask(rslt);
  printf("  hsimd<64>::signmask(rslt) = %u\n", msk);

  // Done.   Exit normally.
  return(0);
}

We can compile it to LLVM IR.

clang++ -msse2  -O3  -o pr.ll pr.cpp -I../lib/ -S -emit-llvm

We get the following main.

; Annotations (lines starting with ';') were added for readers; the
; instructions themselves are the unmodified compiler output.
define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
  ; Stack slots for the two volatile locals temp1 and temp2.
  %temp1 = alloca <2 x i64>, align 16
  %temp2 = alloca <2 x i64>, align 16
  ; temp1 = 0x3333333333333333 in each 64-bit lane (0x33 in every byte).
  store volatile <2 x i64> <i64 3689348814741910323, i64 3689348814741910323>, <2 x i64>* %temp1, align 16
  %1 = load volatile <2 x i64>* %temp1, align 16
  tail call void @_Z14print_registerIDv2_xEvPKcT_(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), <2 x i64> %1)
  ; temp2 = 0x3344334433443344 in each 64-bit lane (0x3344 in every 16-bit field).
  store volatile <2 x i64> <i64 3694133962361549636, i64 3694133962361549636>, <2 x i64>* %temp2, align 16
  %2 = load volatile <2 x i64>* %temp2, align 16
  tail call void @_Z14print_registerIDv2_xEvPKcT_(i8* getelementptr inbounds ([6 x i8]* @.str1, i64 0, i64 0), <2 x i64> %2)
  ; Both values are reloaded because the locals are volatile.
  %3 = load volatile <2 x i64>* %temp1, align 16
  %4 = load volatile <2 x i64>* %temp2, align 16
  ; simd<8>::eq: compare as sixteen i8 lanes, then sign-extend each i1
  ; so that equal lanes become 0xFF and unequal lanes become 0x00.
  %5 = bitcast <2 x i64> %3 to <16 x i8>
  %6 = bitcast <2 x i64> %4 to <16 x i8>
  %7 = icmp eq <16 x i8> %5, %6
  %8 = sext <16 x i1> %7 to <16 x i8>
  %9 = bitcast <16 x i8> %8 to <2 x i64>
  tail call void @_Z14print_registerIDv2_xEvPKcT_(i8* getelementptr inbounds ([26 x i8]* @.str2, i64 0, i64 0), <2 x i64> %9)
  ; hsimd<64>::signmask: the comparison result is reinterpreted as
  ; <2 x double> and passed to the SSE2 movmsk.pd intrinsic; the returned
  ; i32 is then masked to its low 8 bits.
  %10 = bitcast <16 x i8> %8 to <2 x double>
  %11 = tail call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %10) #3
  %12 = and i32 %11, 255
  %13 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([34 x i8]* @.str3, i64 0, i64 0), i32 %12)
  ret i32 0
}

We can replace the call to llvm.x86.sse2.movmsk.pd with our own implementation, signmaskd. Call the resulting program pr2.ll.

; signmaskd: hand-written stand-in for llvm.x86.sse2.movmsk.pd.
; Takes a <2 x double>, reinterprets it as two i64 lanes, and returns an i32
; built from the per-lane "is negative" tests (one bit per lane, zero-extended).
define i32 @signmaskd(<2 x double> %a) alwaysinline #5
{
	; View the two doubles as raw 64-bit integers.
	%bits = bitcast <2 x double> %a to <2 x i64>
	; Signed less-than-zero is true exactly when a lane's sign bit is set.
	%b = icmp slt <2 x i64> %bits, zeroinitializer
        ; Pack the two i1 results into a single i2.
        ; NOTE(review): this <2 x i1> -> i2 bitcast looks like the step the page
        ; is probing, given the differing program outputs -- confirm.
        %c = bitcast <2 x i1> %b to i2
        ; Widen the 2-bit mask to the i32 return type without sign extension.
        %result = zext i2 %c to i32
	ret i32 %result
}

But the two programs produce different results: pr reports a sign mask of 3, while pr2 reports 1. Why?

$ ./pr
                                   temp1 = 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 
                                   temp2 = 33 44 33 44 33 44 33 44 33 44 33 44 33 44 33 44 
               simd<8>::eq(temp1, temp2) = FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 
  hsimd<64>::signmask(rslt) = 3
$ ./pr2
                                   temp1 = 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 
                                   temp2 = 33 44 33 44 33 44 33 44 33 44 33 44 33 44 33 44 
               simd<8>::eq(temp1, temp2) = FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 FF 00 
  hsimd<64>::signmask(rslt) = 1

}}}