2007-06-01 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-amd64.c
index 73c5ead209c54826eb2e1bbf1a4d4470443b1bcd..8c2737509341e564f49a8bceabbb448dbf902a62 100644 (file)
@@ -44,6 +44,8 @@ static gboolean use_sse2 = !MONO_ARCH_USE_FPSTACK;
 
 #define IS_IMM32(val) ((((guint64)val) >> 32) == 0)
 
+#define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f)) /* TRUE if the byte lies in 0x40..0x4f, the x86-64 REX prefix range */
+
 #ifdef PLATFORM_WIN32
 /* Under windows, the default pinvoke calling convention is stdcall */
 #define CALLCONV_IS_STDCALL(call_conv) (((call_conv) == MONO_CALL_STDCALL) || ((call_conv) == MONO_CALL_DEFAULT))
@@ -1030,7 +1032,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        }
 
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
                if (inst->opcode != OP_REGVAR) {
                        ArgInfo *ainfo = &cinfo->args [i];
                        gboolean inreg = TRUE;
@@ -1537,8 +1539,250 @@ store_membase_imm_to_store_membase_reg (int opcode)
        return -1;
 }
 
-/* FIXME: Add more instructions */
-#define INST_IGNORES_CFLAGS(ins) (((ins)->opcode == CEE_BR) || ((ins)->opcode == OP_STORE_MEMBASE_IMM) || ((ins)->opcode == OP_STOREI8_MEMBASE_REG) || ((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_ICONST) || ((ins)->opcode == OP_I8CONST) || ((ins)->opcode == OP_LOAD_MEMBASE))
+#define INST_IGNORES_CFLAGS(opcode) (!(((opcode) == OP_ADC) || ((opcode) == OP_ADC_IMM) || ((opcode) == OP_IADC) || ((opcode) == OP_IADC_IMM) || ((opcode) == OP_SBB) || ((opcode) == OP_SBB_IMM) || ((opcode) == OP_ISBB) || ((opcode) == OP_ISBB_IMM))) /* TRUE for every opcode except the ADC/SBB variants listed here */
+
+/*
+ * peephole_pass_1:
+ *
+ *   Walk the instruction list of BB once and perform the peephole opts which should/can be performed before local regalloc, rewriting opcodes in place, unlinking redundant instructions and resetting bb->last_ins.
+ */
+static void
+peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
+{
+       MonoInst *ins, *last_ins = NULL; /* last_ins trails ins: the most recent instruction that was kept */
+       ins = bb->code;
+
+       while (ins) {
+
+               switch (ins->opcode) {
+               case OP_ADD_IMM:
+               case OP_IADD_IMM:
+               case OP_LADD_IMM:
+                       if ((ins->sreg1 < MONO_MAX_IREGS) && (ins->dreg >= MONO_MAX_IREGS) && (ins->inst_imm > 0)) {
+                               /* 
+                                * X86_LEA is like ADD, but doesn't have the
+                                * sreg1==dreg restriction. inst_imm > 0 is needed since LEA sign-extends 
+                                * its operand to 64 bit.
+                                */
+                               ins->opcode = OP_X86_LEA_MEMBASE;
+                               ins->inst_basereg = ins->sreg1;
+                               /* Fall through into the CEE_XOR case below */
+                       }
+                       else
+                               break;
+               case CEE_XOR:
+                       if ((ins->sreg1 == ins->sreg2) && (ins->sreg1 == ins->dreg)) {
+                               MonoInst *ins2;
+
+                               /* 
+                                * Replace STORE_MEMBASE_IMM 0 with STORE_MEMBASE_REG since 
+                                * the latter has length 2-3 instead of 6 (reverse constant
+                                * propagation). These instruction sequences are very common
+                                * in the initlocals bblock. NOTE(review): OP_STORE_MEMBASE_IMM is tested twice in the condition below.
+                                */
+                               for (ins2 = ins->next; ins2; ins2 = ins2->next) {
+                                       if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM) || (ins2->opcode == OP_STORE_MEMBASE_IMM)) && (ins2->inst_imm == 0)) {
+                                               ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode);
+                                               ins2->sreg1 = ins->dreg;
+                                       } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG)) {
+                                               /* Leave this store as it is and keep scanning */
+                                       } else if (((ins2->opcode == OP_ICONST) || (ins2->opcode == OP_I8CONST)) && (ins2->dreg == ins->dreg) && (ins2->inst_c0 == 0)) {
+                                               NULLIFY_INS (ins2);
+                                               /* Redundant load of 0 into the same register; keep scanning */
+                                       } else {
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
+               case OP_COMPARE_IMM:
+                       /* OP_COMPARE_IMM (reg, 0) 
+                        * --> 
+                        * OP_AMD64_TEST_NULL (reg) 
+                        */
+                       if (!ins->inst_imm)
+                               ins->opcode = OP_AMD64_TEST_NULL;
+                       break;
+               case OP_ICOMPARE_IMM:
+                       if (!ins->inst_imm) /* OP_ICOMPARE_IMM (reg, 0) --> OP_X86_TEST_NULL (reg) */
+                               ins->opcode = OP_X86_TEST_NULL;
+                       break;
+               case OP_AMD64_ICOMPARE_MEMBASE_IMM:
+                       /* 
+                        * OP_STOREI4_MEMBASE_REG reg, offset(basereg)
+                        * OP_AMD64_ICOMPARE_MEMBASE_IMM offset(basereg), imm
+                        * -->
+                        * OP_STOREI4_MEMBASE_REG reg, offset(basereg)
+                        * OP_ICOMPARE_IMM reg, imm
+                        *
+                        * Note: if imm = 0 then OP_ICOMPARE_IMM is replaced with OP_X86_TEST_NULL
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG) &&
+                           ins->inst_basereg == last_ins->inst_destbasereg &&
+                           ins->inst_offset == last_ins->inst_offset) {
+                                       ins->opcode = OP_ICOMPARE_IMM;
+                                       ins->sreg1 = last_ins->sreg1;
+
+                                       /* check if we can replace cmp reg,0 with test null */
+                                       if (!ins->inst_imm)
+                                               ins->opcode = OP_X86_TEST_NULL;
+                               }
+
+                       break;
+               case OP_LOAD_MEMBASE:
+               case OP_LOADI4_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOAD_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG 
+                                        || last_ins->opcode == OP_STORE_MEMBASE_REG) &&
+                           ins->inst_basereg == last_ins->inst_destbasereg &&
+                           ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+
+                       /* NOTE(review): the test that follows is a plain 'if', not 'else if'.
+                        * Note: reg1 must be different from the basereg in the second load
+                        * Note: if reg1 = reg2 then the second load is removed
+                        *
+                        * OP_LOAD_MEMBASE offset(basereg), reg1
+                        * OP_LOAD_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_LOAD_MEMBASE offset(basereg), reg1
+                        * OP_MOVE reg1, reg2
+                        */
+                       } if (last_ins && (last_ins->opcode == OP_LOADI4_MEMBASE
+                                          || last_ins->opcode == OP_LOAD_MEMBASE) &&
+                             ins->inst_basereg != last_ins->dreg &&
+                             ins->inst_basereg == last_ins->inst_basereg &&
+                             ins->inst_offset == last_ins->inst_offset) {
+
+                               if (ins->dreg == last_ins->dreg) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->dreg;
+                               }
+
+                               //g_assert_not_reached ();
+
+#if 0
+                       /* 
+                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
+                        * OP_LOAD_MEMBASE offset(basereg), reg
+                        * -->
+                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
+                        * OP_ICONST reg, imm
+                        */
+                       } else if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_IMM
+                                               || last_ins->opcode == OP_STORE_MEMBASE_IMM) &&
+                                  ins->inst_basereg == last_ins->inst_destbasereg &&
+                                  ins->inst_offset == last_ins->inst_offset) {
+                               //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                               ins->opcode = OP_ICONST;
+                               ins->inst_c0 = last_ins->inst_imm;
+                               g_assert_not_reached (); // check this rule
+#endif
+                       }
+                       break;
+               case OP_LOADI1_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STOREI1_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOADI1_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STOREI1_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI1_MEMBASE_REG) &&
+                                       ins->inst_basereg == last_ins->inst_destbasereg &&
+                                       ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+                       }
+                       break;
+               case OP_LOADI2_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STOREI2_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOADI2_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STOREI2_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI2_MEMBASE_REG) &&
+                                       ins->inst_basereg == last_ins->inst_destbasereg &&
+                                       ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+                       }
+                       break;
+               case CEE_CONV_I4:
+               case CEE_CONV_U4:
+               case OP_MOVE:
+               case OP_FMOVE:
+                       /*
+                        * Removes:
+                        *
+                        * OP_MOVE reg, reg 
+                        */
+                       if (ins->dreg == ins->sreg1) {
+                               if (last_ins) /* NOTE(review): with no predecessor nothing is unlinked and bb->code is not updated here */
+                                       last_ins->next = ins->next;                             
+                               ins = ins->next;
+                               continue;
+                       }
+                       /* 
+                        * Removes the second instruction of:
+                        *
+                        * OP_MOVE sreg, dreg 
+                        * OP_MOVE dreg, sreg
+                        */
+                       if (last_ins && last_ins->opcode == OP_MOVE &&
+                           ins->sreg1 == last_ins->dreg &&
+                           ins->dreg == last_ins->sreg1) {
+                               last_ins->next = ins->next;                             
+                               ins = ins->next;                                
+                               continue;
+                       }
+                       break;
+               }
+               last_ins = ins;
+               ins = ins->next;
+       }
+       bb->last_ins = last_ins; /* the previous tail may have been unlinked above */
+}
 
 static void
 peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
@@ -1553,7 +1797,7 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_I8CONST:
                        /* reg = 0 -> XOR (reg, reg) */
                        /* XOR sets cflags on x86, so we cant do it always */
-                       if (ins->inst_c0 == 0 && (ins->next && INST_IGNORES_CFLAGS (ins->next))) {
+                       if (ins->inst_c0 == 0 && (ins->next && INST_IGNORES_CFLAGS (ins->next->opcode))) {
                                ins->opcode = CEE_XOR;
                                ins->sreg1 = ins->dreg;
                                ins->sreg2 = ins->dreg;
@@ -1754,6 +1998,7 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CONV_I4:
                case CEE_CONV_U4:
                case OP_MOVE:
+               case OP_FMOVE:
                        /*
                         * Removes:
                         *
@@ -1898,40 +2143,18 @@ branch_cc_table [] = {
        X86_CC_O, X86_CC_NO, X86_CC_C, X86_CC_NC
 };
 
-static int
-opcode_to_x86_cond (int opcode)
-{
-       switch (opcode) {
-       case OP_IBEQ:
-               return X86_CC_EQ;
-       case OP_IBNE_UN:
-               return X86_CC_NE;
-       case OP_IBLT:
-               return X86_CC_LT;
-       case OP_IBLT_UN:
-               return X86_CC_LT;
-       case OP_IBGT:
-               return X86_CC_GT;
-       case OP_IBGT_UN:
-               return X86_CC_GT;
-       case OP_IBGE:
-               return X86_CC_GE;
-       case OP_IBGE_UN:
-               return X86_CC_GE;
-       case OP_IBLE:
-               return X86_CC_LE;
-       case OP_IBLE_UN:
-               return X86_CC_LE;
-       case OP_COND_EXC_IOV:
-               return X86_CC_O;
-       case OP_COND_EXC_IC:
-               return X86_CC_C;
-       default:
-               g_assert_not_reached ();
-       }
+/* Maps CMP_... constants (the index used is the result of mono_opcode_to_cond ()) to X86_CC_... constants; cc_signed_table holds the matching signedness flag */
+static const int
+cc_table [] = {
+       X86_CC_EQ, X86_CC_NE, X86_CC_LE, X86_CC_GE, X86_CC_LT, X86_CC_GT,
+       X86_CC_LE, X86_CC_GE, X86_CC_LT, X86_CC_GT
+};
 
-       return -1;
-}
+static const int
+cc_signed_table [] = { /* signedness flag for each cc_table entry, indexed the same way: first six signed, last four unsigned */
+       TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
+       FALSE, FALSE, FALSE, FALSE
+};
 
 /*#include "cprop.c"*/
 
@@ -1950,6 +2173,9 @@ mono_arch_local_regalloc (MonoCompile *cfg, MonoBasicBlock *bb)
 
        mono_arch_lowering_pass (cfg, bb);
 
+       if (cfg->opt & MONO_OPT_PEEPHOLE)
+               peephole_pass_1 (cfg, bb);
+
        mono_local_regalloc (cfg, bb);
 }
 
@@ -2198,7 +2424,7 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = cinfo->args + i;
                MonoType *arg_type;
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
 
                if (sig->hasthis && (i == 0))
                        arg_type = &mono_defaults.object_class->byval_arg;
@@ -2466,7 +2692,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_AMD64_ICOMPARE_REG_MEMBASE:
                        amd64_alu_reg_membase_size (code, X86_CMP, ins->sreg1, ins->sreg2, ins->inst_offset, 4);
                        break;
-               case CEE_BREAK:
+               case OP_BREAK:
                        amd64_breakpoint (code);
                        break;
                case OP_ADDCC:
@@ -2800,22 +3026,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_IBGT:
                case OP_IBGE:
                case OP_IBLE:
-                       EMIT_COND_BRANCH (ins, opcode_to_x86_cond (ins->opcode), TRUE);
-                       break;
                case OP_IBNE_UN:
                case OP_IBLT_UN:
                case OP_IBGT_UN:
                case OP_IBGE_UN:
                case OP_IBLE_UN:
-                       EMIT_COND_BRANCH (ins, opcode_to_x86_cond (ins->opcode), FALSE);
-                       break;
-               case OP_COND_EXC_IOV:
-                       EMIT_COND_SYSTEM_EXCEPTION (opcode_to_x86_cond (ins->opcode),
-                                                                               TRUE, ins->inst_p1);
-                       break;
-               case OP_COND_EXC_IC:
-                       EMIT_COND_SYSTEM_EXCEPTION (opcode_to_x86_cond (ins->opcode),
-                                                                               FALSE, ins->inst_p1);
+                       EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        break;
                case CEE_NOT:
                        amd64_not_reg (code, ins->sreg1);
@@ -2871,7 +3087,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                }
-               case CEE_JMP: {
+               case OP_JMP: {
                        /*
                         * Note: this 'frame destruction' logic is useful for tail calls, too.
                         * Keep in sync with the code in emit_epilog.
@@ -3063,7 +3279,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_RET:
                        amd64_ret (code);
                        break;
-               case CEE_THROW: {
+               case OP_THROW: {
                        amd64_mov_reg_reg (code, AMD64_RDI, ins->sreg1, 8);
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
                                             (gpointer)"mono_arch_throw_exception");
@@ -3083,12 +3299,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        /* Restore stack alignment */
                        amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
                        break;
+
                case OP_LABEL:
                        ins->inst_c0 = code - cfg->native_code;
                        break;
-               case CEE_NOP:
+               case OP_NOP:
                        break;
-               case CEE_BR:
+               case OP_BR:
                        //g_print ("target: %p, next: %p, curr: %p, last: %p\n", ins->inst_target_bb, bb->next_bb, ins, bb->last_ins);
                        //if ((ins->inst_target_bb == bb->next_bb) && ins == bb->last_ins)
                        //break;
@@ -3121,27 +3338,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_CEQ:
                case OP_ICEQ:
-                       amd64_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
                case OP_CLT:
                case OP_ICLT:
-                       amd64_set_reg (code, X86_CC_LT, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
-               case OP_CLT_UN:
-               case OP_ICLT_UN:
-                       amd64_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
                case OP_CGT:
                case OP_ICGT:
-                       amd64_set_reg (code, X86_CC_GT, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
+               case OP_CLT_UN:
+               case OP_ICLT_UN:
                case OP_CGT_UN:
                case OP_ICGT_UN:
-                       amd64_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
+                       amd64_set_reg (code, cc_table [mono_opcode_to_cond (ins->opcode)], ins->dreg, cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
                        break;
                case OP_COND_EXC_EQ:
@@ -3154,6 +3359,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_COND_EXC_GE_UN:
                case OP_COND_EXC_LE:
                case OP_COND_EXC_LE_UN:
+                       EMIT_COND_SYSTEM_EXCEPTION (cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)], ins->inst_p1);
+                       break;
                case OP_COND_EXC_OV:
                case OP_COND_EXC_NO:
                case OP_COND_EXC_C:
@@ -3161,6 +3368,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_EQ], 
                                                    (ins->opcode < OP_COND_EXC_NE_UN), ins->inst_p1);
                        break;
+               case OP_COND_EXC_IOV:
+               case OP_COND_EXC_IC:
+                       EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_IEQ], 
+                                                   (ins->opcode < OP_COND_EXC_INE_UN), ins->inst_p1);
+                       break;
                case CEE_BEQ:
                case CEE_BNE_UN:
                case CEE_BLT:
@@ -3171,7 +3383,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_BGE_UN:
                case CEE_BLE:
                case CEE_BLE_UN:
-                       EMIT_COND_BRANCH (ins, branch_cc_table [ins->opcode - CEE_BEQ], (ins->opcode < CEE_BNE_UN));
+                       EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        break;
 
                /* floating point opcodes */
@@ -3831,7 +4043,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
                        EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
                        break;
-               case CEE_CKFINITE: {
+               case OP_CKFINITE: {
                        if (use_sse2) {
                                /* Transfer value to the fp stack */
                                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 16);
@@ -4197,7 +4409,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                ArgInfo *ainfo = cinfo->args + i;
                gint32 stack_offset;
                MonoType *arg_type;
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
 
                if (sig->hasthis && (i == 0))
                        arg_type = &mono_defaults.object_class->byval_arg;
@@ -4385,7 +4597,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
                code = mono_arch_instrument_epilog (cfg, mono_trace_leave_method, code, TRUE);
 
-       /* the code restoring the registers must be kept in sync with CEE_JMP */
+       /* the code restoring the registers must be kept in sync with OP_JMP */
        pos = 0;
        
        if (method->save_lmf) {
@@ -4613,10 +4825,15 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
 
                        pos = cfg->native_code + patch_info->ip.i;
 
-                       if (use_sse2)
-                               *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
-                       else
+
+                       if (use_sse2) {
+                               if (IS_REX (pos [1]))
+                                       *(guint32*)(pos + 5) = (guint8*)code - pos - 9;
+                               else
+                                       *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
+                       } else {
                                *(guint32*)(pos + 3) = (guint8*)code - pos - 7;
+                       }
 
                        if (patch_info->type == MONO_PATCH_INFO_R8) {
                                *(double*)code = *(double*)patch_info->data.target;
@@ -4676,7 +4893,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, stack_area);
 
                for (i = 0; i < n; ++i) {
-                       inst = cfg->varinfo [i];
+                       inst = cfg->args [i];
 
                        if (inst->opcode == OP_REGVAR)
                                amd64_mov_membase_reg (code, AMD64_RSP, (i * 8), inst->dreg, 8);
@@ -4824,8 +5041,6 @@ mono_arch_is_inst_imm (gint64 imm)
        return amd64_is_imm32 (imm);
 }
 
-#define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f))
-
 /*
  * Determine whenever the trap whose info is in SIGINFO is caused by
  * integer overflow.
@@ -4991,26 +5206,62 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
        return (gpointer)(((guint64)(regs [reg])) + disp);
 }
 
-gpointer*
-mono_arch_get_delegate_method_ptr_addr (guint8* code, gpointer *regs)
+gpointer
+mono_arch_get_this_arg_from_call (MonoMethodSignature *sig, gssize *regs, guint8 *code) /* returns the saved register value holding 'this'; code is not used here */
 {
-       guint32 reg;
-       guint32 disp;
+       if (MONO_TYPE_ISSTRUCT (sig->ret)) /* struct return: 'this' is read from RSI; presumably RDI then carries the return buffer address -- TODO confirm */
+               return (gpointer)regs [AMD64_RSI];
+       else
+               return (gpointer)regs [AMD64_RDI]; /* otherwise 'this' is read from RDI */
+}
+
+gpointer
+mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_target)
+{
+       guint8 *code, *start;
+       MonoDomain *domain = mono_domain_get ();
+       int i;
+
+       /* FIXME: Support more cases; for now a struct return value yields no invoke stub (NULL) */
+       if (MONO_TYPE_ISSTRUCT (sig->ret))
+               return NULL;
 
-       code -= 10;
+       if (has_target) {
+               mono_domain_lock (domain);
+               start = code = mono_code_manager_reserve (domain->code_mp, 64); /* 64 bytes reserved; size is checked by the g_assert below */
+               mono_domain_unlock (domain);
 
-       if (IS_REX (code [0]) && (code [1] == 0x8b) && (code [3] == 0x48) && (code [4] == 0x8b) && (code [5] == 0x40) && (code [7] == 0x48) && (code [8] == 0xff) && (code [9] == 0xd0)) {
-               /* mov REG, %rax; mov <OFFSET>(%rax), %rax; call *%rax */
-               reg = amd64_rex_b (code [0]) + amd64_modrm_rm (code [2]);
-               disp = code [6];
+               /* Replace the this argument (the delegate, in RDI) with delegate->target, then jump through delegate->method_ptr */
+               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RDI, 8);
+               amd64_mov_reg_membase (code, AMD64_RDI, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, target), 8);
+               amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
 
-               if (reg == AMD64_RAX)
+               g_assert ((code - start) < 64);
+       } else {
+               for (i = 0; i < sig->param_count; ++i)
+                       if (!mono_is_regsize_var (sig->params [i]))
+                               return NULL;
+               if (sig->param_count > 4) /* more than 4 parameters is not handled by this stub */
                        return NULL;
-               else
-                       return (gpointer*)(((guint64)(regs [reg])) + disp);
+
+               mono_domain_lock (domain);
+               start = code = mono_code_manager_reserve (domain->code_mp, 64);
+               mono_domain_unlock (domain);
+
+               if (sig->param_count == 0) {
+                       amd64_jump_membase (code, AMD64_RDI, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               } else {
+                       /* We have to shift the arguments left: the delegate pointer is saved in RAX, then each param_regs [i] receives param_regs [i + 1] */
+                       amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RDI, 8);
+                       for (i = 0; i < sig->param_count; ++i)
+                               amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
+
+                       amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               }
+               g_assert ((code - start) < 64);
        }
 
-       return NULL;
+       return start; /* address of the generated code */
 }
 
 /*