[amd64] Fast OP_LREM_IMM for power of two operands.
author Mark Probst <mark.probst@gmail.com>
Tue, 24 Jun 2014 23:04:26 +0000 (16:04 -0700)
committer Mark Probst <mark.probst@gmail.com>
Tue, 24 Jun 2014 23:04:26 +0000 (16:04 -0700)
This was the reason for one of our bad performance results in the
paper "Clash of the Lambdas" by Biboudis, Palladinos, Smaragdakis:

http://cgi.di.uoa.gr/~biboudis/clashofthelambdas.pdf

mono/mini/basic-long.cs
mono/mini/cpu-amd64.md
mono/mini/method-to-ir.c
mono/mini/mini-amd64.c
mono/mini/mini.c

index 96db6ca37fb2cd8d7858d539776a272fc3dcc405..e0c5129052ace838f77a4265bc21e662f178bdf0 100644 (file)
@@ -1211,5 +1211,47 @@ class Tests
 
                return (int)res;
        }
+
+       public static int test_0_lrem_imm_2 ()
+       {
+               long x = 245345634L;
+               return (int)(x % 2L);
+       }
+
+       public static int test_1_lrem_imm_2 ()
+       {
+               long x = 24534553245L;
+               return (int)(x % 2L);
+       }
+
+       public static int test_1_lrem_imm_2_neg ()
+       {
+               long x = -24534553245L;
+               return -(int)(x % 2L);
+       }
+
+       public static int test_13_lrem_imm_32 ()
+       {
+               long x = 17389L;
+               return (int)(x % 32L);
+       }
+
+       public static int test_27_lrem_imm_32_neg ()
+       {
+               long x = -2435323L;
+               return -(int)(x % 32L);
+       }
+
+       public static int test_5_lrem_imm_large ()
+       {
+               long x = 0x1000000005L;
+               return (int)(x % 0x40000000L);
+       }
+
+       public static int test_5_lrem_imm_too_large ()
+       {
+               long x = 0x1000000005L;
+               return (int)(x % 0x80000000L);
+       }
 }
 
index 5107e999fa308434c59ea086b6991c022a4a57a2..cf472d6e5003f1346af209ea6ab6d9f80295a0f1 100644 (file)
@@ -98,6 +98,7 @@ long_conv_to_u1: dest:i src1:i len:4
 zext_i4: dest:i src1:i len:4
 
 long_mul_imm: dest:i src1:i clob:1 len:12
+long_rem_imm: dest:a src1:a len:32 clob:d
 long_min: dest:i src1:i src2:i len:16 clob:1
 long_min_un: dest:i src1:i src2:i len:16 clob:1
 long_max: dest:i src1:i src2:i len:16 clob:1
index e667acb1664d222b395ec77452d428da21591e29..8d9ac98d2ba018d31cf77d9d7098308c83e4c709 100644 (file)
@@ -12364,7 +12364,11 @@ mono_op_to_op_imm (int opcode)
        case OP_LSHR:
                return OP_LSHR_IMM;
        case OP_LSHR_UN:
-               return OP_LSHR_UN_IMM;          
+               return OP_LSHR_UN_IMM;
+#ifdef TARGET_AMD64
+       case OP_LREM:
+               return OP_LREM_IMM;
+#endif
 
        case OP_COMPARE:
                return OP_COMPARE_IMM;
index b5fed14d8d22f584ac059dfff08b2e0dff1a9586..944f5557386123055ae8c05a3559b49138022fda 100644 (file)
@@ -3306,9 +3306,10 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_IREM_UN_IMM:
                        mono_decompose_op_imm (cfg, bb, ins);
                        break;
+               case OP_LREM_IMM:
                case OP_IREM_IMM:
                        /* Keep the opcode if we can implement it efficiently */
-                       if (!((ins->inst_imm > 0) && (mono_is_power_of_two (ins->inst_imm) != -1)))
+                       if (!(amd64_is_imm32 (ins->inst_imm) && (ins->inst_imm > 0) && (mono_is_power_of_two (ins->inst_imm) != -1)))
                                mono_decompose_op_imm (cfg, bb, ins);
                        break;
                case OP_COMPARE_IMM:
@@ -4503,6 +4504,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_div_reg (code, ins->sreg2, FALSE);
                        }
                        break;
+               case OP_LREM_IMM: {
+                       int power = mono_is_power_of_two (ins->inst_imm);
+
+                       g_assert (ins->sreg1 == AMD64_RAX);
+                       g_assert (ins->dreg == AMD64_RAX);
+                       g_assert (power >= 0);
+
+                       if (power == 0) {
+                               amd64_mov_reg_imm (code, ins->dreg, 0);
+                               break;
+                       }
+
+                       /* Based on gcc code */
+
+                       /* Add compensation for negative dividends */
+                       amd64_mov_reg_reg_size (code, AMD64_RDX, AMD64_RAX, 8);
+                       if (power > 1)
+                               amd64_shift_reg_imm_size (code, X86_SAR, AMD64_RDX, 63, 8);
+                       amd64_shift_reg_imm_size (code, X86_SHR, AMD64_RDX, 64 - power, 8);
+                       amd64_alu_reg_reg_size (code, X86_ADD, AMD64_RAX, AMD64_RDX, 8);
+                       /* Compute remainder */
+                       amd64_alu_reg_imm_size (code, X86_AND, AMD64_RAX, (1 << power) - 1, 8);
+                       /* Remove compensation */
+                       amd64_alu_reg_reg_size (code, X86_SUB, AMD64_RAX, AMD64_RDX, 8);
+                       break;
+               }
                case OP_IDIV:
                case OP_IREM:
 #if defined( __native_client_codegen__ )
index 679fdcda71a67d121441b242afbc3780d80865c5..4d4c39c2f2aaaeb2280ca0711c62abb057f422ad 100644 (file)
@@ -1135,6 +1135,8 @@ mono_op_imm_to_op (int opcode)
                return OP_IREM_UN;
        case OP_IREM_IMM:
                return OP_IREM;
+       case OP_LREM_IMM:
+               return OP_LREM;
        case OP_DIV_IMM:
 #if SIZEOF_REGISTER == 4
                return OP_IDIV;