2007-06-01 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-amd64.c
index 8e6d7e879d9df0bd488b4350d638ee9950d3e68a..8c2737509341e564f49a8bceabbb448dbf902a62 100644 (file)
 #include "cpu-amd64.h"
 
 static gint lmf_tls_offset = -1;
+static gint lmf_addr_tls_offset = -1;
 static gint appdomain_tls_offset = -1;
 static gint thread_tls_offset = -1;
 
 #ifdef MONO_XEN_OPT
-/* TRUE by default until we add runtime detection of Xen */
 static gboolean optimize_for_xen = TRUE;
 #else
 #define optimize_for_xen 0
@@ -40,13 +40,12 @@ static gboolean optimize_for_xen = TRUE;
 
 static gboolean use_sse2 = !MONO_ARCH_USE_FPSTACK;
 
-const char * const amd64_desc [OP_LAST];
-static const char*const * ins_spec = amd64_desc;
-
 #define ALIGN_TO(val,align) ((((guint64)val) + ((align) - 1)) & ~((align) - 1))
 
 #define IS_IMM32(val) ((((guint64)val) >> 32) == 0)
 
+#define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f))
+
 #ifdef PLATFORM_WIN32
 /* Under windows, the default pinvoke calling convention is stdcall */
 #define CALLCONV_IS_STDCALL(call_conv) (((call_conv) == MONO_CALL_STDCALL) || ((call_conv) == MONO_CALL_DEFAULT))
@@ -488,7 +487,7 @@ add_valuetype (MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type,
  * Draft Version 0.23" document for more information.
  */
 static CallInfo*
-get_call_info (MonoMethodSignature *sig, gboolean is_pinvoke)
+get_call_info (MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke)
 {
        guint32 i, gr, fr;
        MonoType *ret_type;
@@ -496,7 +495,10 @@ get_call_info (MonoMethodSignature *sig, gboolean is_pinvoke)
        guint32 stack_size = 0;
        CallInfo *cinfo;
 
-       cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));
+       if (mp)
+               cinfo = mono_mempool_alloc0 (mp, sizeof (CallInfo) + (sizeof (ArgInfo) * n));
+       else
+               cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));
 
        gr = 0;
        fr = 0;
@@ -689,7 +691,7 @@ int
 mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJitArgumentInfo *arg_info)
 {
        int k;
-       CallInfo *cinfo = get_call_info (csig, FALSE);
+       CallInfo *cinfo = get_call_info (NULL, csig, FALSE);
        guint32 args_size = cinfo->stack_usage;
 
        /* The arguments are saved to a stack area in mono_arch_instrument_prolog */
@@ -720,6 +722,7 @@ cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx)
 void
 mono_arch_cpu_init (void)
 {
+#ifndef _MSC_VER
        guint16 fpcw;
 
        /* spec compliance requires running with double precision */
@@ -728,6 +731,9 @@ mono_arch_cpu_init (void)
        fpcw |= X86_FPCW_PREC_DOUBLE;
        __asm__  __volatile__ ("fldcw %0\n": : "m" (fpcw));
        __asm__  __volatile__ ("fnstcw %0\n": "=m" (fpcw));
+#else
+       _control87 (_PC_53, MCW_PC);
+#endif
 }
 
 /*
@@ -762,35 +768,6 @@ mono_amd64_is_sse2 (void)
        return use_sse2;
 }
 
-static gboolean
-is_regsize_var (MonoType *t) {
-       if (t->byref)
-               return TRUE;
-       t = mono_type_get_underlying_type (t);
-       switch (t->type) {
-       case MONO_TYPE_I4:
-       case MONO_TYPE_U4:
-       case MONO_TYPE_I:
-       case MONO_TYPE_U:
-       case MONO_TYPE_PTR:
-       case MONO_TYPE_FNPTR:
-               return TRUE;
-       case MONO_TYPE_OBJECT:
-       case MONO_TYPE_STRING:
-       case MONO_TYPE_CLASS:
-       case MONO_TYPE_SZARRAY:
-       case MONO_TYPE_ARRAY:
-               return TRUE;
-       case MONO_TYPE_GENERICINST:
-               if (!mono_type_generic_inst_is_valuetype (t))
-                       return TRUE;
-               return FALSE;
-       case MONO_TYPE_VALUETYPE:
-               return FALSE;
-       }
-       return FALSE;
-}
-
 GList *
 mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
 {
@@ -809,11 +786,7 @@ mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
                    (ins->opcode != OP_LOCAL && ins->opcode != OP_ARG))
                        continue;
 
-               /* we dont allocate I1 to registers because there is no simply way to sign extend 
-                * 8bit quantities in caller saved registers on x86 */
-               if (is_regsize_var (ins->inst_vtype) || (ins->inst_vtype->type == MONO_TYPE_BOOLEAN) || 
-                   (ins->inst_vtype->type == MONO_TYPE_U1) || (ins->inst_vtype->type == MONO_TYPE_U2)||
-                   (ins->inst_vtype->type == MONO_TYPE_I2) || (ins->inst_vtype->type == MONO_TYPE_CHAR)) {
+               if (mono_is_regsize_var (ins->inst_vtype)) {
                        g_assert (MONO_VARINFO (cfg, i)->reg == -1);
                        g_assert (i == vmv->idx);
                        vars = g_list_prepend (vars, vmv);
@@ -845,7 +818,9 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
 
        sig = mono_method_signature (cfg->method);
 
-       cinfo = get_call_info (sig, FALSE);
+       if (!cfg->arch.cinfo)
+               cfg->arch.cinfo = get_call_info (cfg->mempool, sig, FALSE);
+       cinfo = cfg->arch.cinfo;
 
        /*
         * FIXME: Remove some of the restrictions.
@@ -891,8 +866,6 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
                /* Avoid hitting the stack_alloc_size < (1 << 16) assertion in emit_epilog () */
                cfg->arch.omit_fp = FALSE;
        }
-
-       g_free (cinfo);
 }
 
 GList *
@@ -951,7 +924,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 
        sig = mono_method_signature (cfg->method);
 
-       cinfo = get_call_info (sig, FALSE);
+       cinfo = cfg->arch.cinfo;
 
        mono_arch_compute_omit_fp (cfg);
 
@@ -982,7 +955,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                /* Reserve stack space for saving LMF + argument regs */
                guint32 size = sizeof (MonoLMF);
 
-               if (lmf_tls_offset == -1)
+               if (lmf_addr_tls_offset == -1)
                        /* Need to save argument regs too */
                        size += (AMD64_NREG * 8) + (8 * 8);
 
@@ -1050,7 +1023,6 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                        //printf ("allocated local %d to ", i); mono_print_tree_nl (inst);
                }
        }
-       g_free (offsets);
        offset += locals_stack_size;
 
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG)) {
@@ -1060,7 +1032,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        }
 
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
                if (inst->opcode != OP_REGVAR) {
                        ArgInfo *ainfo = &cinfo->args [i];
                        gboolean inreg = TRUE;
@@ -1120,8 +1092,6 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        }
 
        cfg->stack_offset = offset;
-
-       g_free (cinfo);
 }
 
 void
@@ -1132,12 +1102,12 @@ mono_arch_create_vars (MonoCompile *cfg)
 
        sig = mono_method_signature (cfg->method);
 
-       cinfo = get_call_info (sig, FALSE);
+       if (!cfg->arch.cinfo)
+               cfg->arch.cinfo = get_call_info (cfg->mempool, sig, FALSE);
+       cinfo = cfg->arch.cinfo;
 
        if (cinfo->ret.storage == ArgValuetypeInReg)
                cfg->ret_var_is_local = TRUE;
-
-       g_free (cinfo);
 }
 
 static void
@@ -1243,7 +1213,7 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
        sig = call->signature;
        n = sig->param_count + sig->hasthis;
 
-       cinfo = get_call_info (sig, sig->pinvoke);
+       cinfo = get_call_info (cfg->mempool, sig, sig->pinvoke);
 
        for (i = 0; i < n; ++i) {
                ainfo = cinfo->args + i;
@@ -1401,8 +1371,6 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
        cfg->param_area = MAX (cfg->param_area, call->stack_usage);
        cfg->flags |= MONO_CFG_HAS_CALLS;
 
-       g_free (cinfo);
-
        return call;
 }
 
@@ -1556,8 +1524,265 @@ emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer dat
        return emit_call_body (cfg, code, patch_type, data);
 }
 
-/* FIXME: Add more instructions */
-#define INST_IGNORES_CFLAGS(ins) (((ins)->opcode == CEE_BR) || ((ins)->opcode == OP_STORE_MEMBASE_IMM) || ((ins)->opcode == OP_STOREI8_MEMBASE_REG) || ((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_ICONST) || ((ins)->opcode == OP_I8CONST) || ((ins)->opcode == OP_LOAD_MEMBASE))
+static inline int
+store_membase_imm_to_store_membase_reg (int opcode)
+{      /*
+        * Maps OP_STORE_MEMBASE_IMM, OP_STOREI4_MEMBASE_IMM and
+        * OP_STOREI8_MEMBASE_IMM to the *_MEMBASE_REG opcode with the same
+        * name stem. Returns -1 for any other OPCODE.
+        */
+       switch (opcode) {
+       case OP_STORE_MEMBASE_IMM:
+               return OP_STORE_MEMBASE_REG;
+       case OP_STOREI4_MEMBASE_IMM:
+               return OP_STOREI4_MEMBASE_REG;
+       case OP_STOREI8_MEMBASE_IMM:
+               return OP_STOREI8_MEMBASE_REG;
+       }
+       /* No mapping exists for this opcode */
+       return -1;
+}
+
+#define INST_IGNORES_CFLAGS(opcode) (!(((opcode) == OP_ADC) || ((opcode) == OP_ADC_IMM) || ((opcode) == OP_IADC) || ((opcode) == OP_IADC_IMM) || ((opcode) == OP_SBB) || ((opcode) == OP_SBB_IMM) || ((opcode) == OP_ISBB) || ((opcode) == OP_ISBB_IMM))) /* Non-zero unless OPCODE is one of the eight ADC/SBB opcodes listed. NOTE(review): no other opcode is treated as a flag consumer here; confirm that is intended. */
+
+/*
+ * peephole_pass_1:
+ *
+ *   Perform peephole opts which should/can be performed before local regalloc.
+ *
+ *   Walks the instruction list of BB starting at bb->code and rewrites or
+ *   unlinks instructions in place. LAST_INS tracks the most recent
+ *   instruction that was kept; it is stored into bb->last_ins on exit.
+ *   CFG is only referenced from commented-out debug printfs.
+ */
+static void
+peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
+{
+       MonoInst *ins, *last_ins = NULL;
+       ins = bb->code;
+
+       while (ins) {
+
+               switch (ins->opcode) {
+               case OP_ADD_IMM:
+               case OP_IADD_IMM:
+               case OP_LADD_IMM:
+                       if ((ins->sreg1 < MONO_MAX_IREGS) && (ins->dreg >= MONO_MAX_IREGS) && (ins->inst_imm > 0)) {
+                               /* 
+                                * X86_LEA is like ADD, but doesn't have the
+                                * sreg1==dreg restriction. inst_imm > 0 is needed since LEA sign-extends 
+                                * its operand to 64 bit.
+                                */
+                               ins->opcode = OP_X86_LEA_MEMBASE;
+                               ins->inst_basereg = ins->sreg1;
+                               /* Fall through into the CEE_XOR case below */
+                       }
+                       else
+                               break;
+               case CEE_XOR:
+                       if ((ins->sreg1 == ins->sreg2) && (ins->sreg1 == ins->dreg)) {
+                               MonoInst *ins2;
+
+                               /* 
+                                * Replace STORE_MEMBASE_IMM 0 with STORE_MEMBASE_REG since 
+                                * the latter has length 2-3 instead of 6 (reverse constant
+                                * propagation). These instruction sequences are very common
+                                * in the initlocals bblock.
+                                *
+                                * The scan starts at ins->next and stops at the first
+                                * instruction that matches none of the three cases below.
+                                *
+                                * NOTE(review): OP_STORE_MEMBASE_IMM is tested twice in the
+                                * first condition; the second test is redundant.
+                                */
+                               for (ins2 = ins->next; ins2; ins2 = ins2->next) {
+                                       if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM) || (ins2->opcode == OP_STORE_MEMBASE_IMM)) && (ins2->inst_imm == 0)) {
+                                               ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode);
+                                               ins2->sreg1 = ins->dreg;
+                                       } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG)) {
+                                               /* Left untouched; keep scanning */
+                                       } else if (((ins2->opcode == OP_ICONST) || (ins2->opcode == OP_I8CONST)) && (ins2->dreg == ins->dreg) && (ins2->inst_c0 == 0)) {
+                                               NULLIFY_INS (ins2);
+                                               /* Reloading zero into the same dreg is dropped; keep scanning */
+                                       } else {
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
+               case OP_COMPARE_IMM:
+                       /* OP_COMPARE_IMM (reg, 0) 
+                        * --> 
+                        * OP_AMD64_TEST_NULL (reg) 
+                        */
+                       if (!ins->inst_imm)
+                               ins->opcode = OP_AMD64_TEST_NULL;
+                       break;
+               case OP_ICOMPARE_IMM:
+                       if (!ins->inst_imm)
+                               ins->opcode = OP_X86_TEST_NULL;
+                       break;
+               case OP_AMD64_ICOMPARE_MEMBASE_IMM:
+                       /* 
+                        * OP_STOREI4_MEMBASE_REG reg, offset(basereg)
+                        * OP_AMD64_ICOMPARE_MEMBASE_IMM offset(basereg), imm
+                        * -->
+                        * OP_STOREI4_MEMBASE_REG reg, offset(basereg)
+                        * OP_ICOMPARE_IMM reg, imm
+                        *
+                        * Note: if imm = 0 then OP_ICOMPARE_IMM is replaced with OP_X86_TEST_NULL
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG) &&
+                           ins->inst_basereg == last_ins->inst_destbasereg &&
+                           ins->inst_offset == last_ins->inst_offset) {
+                                       ins->opcode = OP_ICOMPARE_IMM;
+                                       ins->sreg1 = last_ins->sreg1;
+
+                                       /* replace cmp reg,0 with test null when the immediate is zero */
+                                       if (!ins->inst_imm)
+                                               ins->opcode = OP_X86_TEST_NULL;
+                               }
+
+                       break;
+               case OP_LOAD_MEMBASE:
+               case OP_LOADI4_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOAD_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        *
+                        * The preceding store may be OP_STOREI4_MEMBASE_REG or
+                        * OP_STORE_MEMBASE_REG.
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG 
+                                        || last_ins->opcode == OP_STORE_MEMBASE_REG) &&
+                           ins->inst_basereg == last_ins->inst_destbasereg &&
+                           ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+
+                       /* 
+                        * Note: reg1 must be different from the basereg in the second load
+                        * Note: if reg1 == reg2 the second load is removed
+                        *
+                        * OP_LOAD_MEMBASE offset(basereg), reg1
+                        * OP_LOAD_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_LOAD_MEMBASE offset(basereg), reg1
+                        * OP_MOVE reg1, reg2
+                        *
+                        * NOTE(review): the test below is a separate `if', not an
+                        * `else if', so it is also evaluated after the store/load
+                        * rule above has rewritten ins.
+                        */
+                       } if (last_ins && (last_ins->opcode == OP_LOADI4_MEMBASE
+                                          || last_ins->opcode == OP_LOAD_MEMBASE) &&
+                             ins->inst_basereg != last_ins->dreg &&
+                             ins->inst_basereg == last_ins->inst_basereg &&
+                             ins->inst_offset == last_ins->inst_offset) {
+
+                               if (ins->dreg == last_ins->dreg) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->dreg;
+                               }
+
+                               //g_assert_not_reached ();
+
+#if 0
+                       /* 
+                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
+                        * OP_LOAD_MEMBASE offset(basereg), reg
+                        * -->
+                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
+                        * OP_ICONST reg, imm
+                        */
+                       } else if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_IMM
+                                               || last_ins->opcode == OP_STORE_MEMBASE_IMM) &&
+                                  ins->inst_basereg == last_ins->inst_destbasereg &&
+                                  ins->inst_offset == last_ins->inst_offset) {
+                               //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                               ins->opcode = OP_ICONST;
+                               ins->inst_c0 = last_ins->inst_imm;
+                               g_assert_not_reached (); // check this rule
+#endif
+                       }
+                       break;
+               case OP_LOADI1_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STOREI1_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOADI1_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STOREI1_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI1_MEMBASE_REG) &&
+                                       ins->inst_basereg == last_ins->inst_destbasereg &&
+                                       ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+                       }
+                       break;
+               case OP_LOADI2_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STOREI2_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOADI2_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STOREI2_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI2_MEMBASE_REG) &&
+                                       ins->inst_basereg == last_ins->inst_destbasereg &&
+                                       ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+                       }
+                       break;
+               case CEE_CONV_I4:
+               case CEE_CONV_U4:
+               case OP_MOVE:
+               case OP_FMOVE:
+                       /*
+                        * Removes:
+                        *
+                        * OP_MOVE reg, reg 
+                        *
+                        * When last_ins is still NULL nothing is relinked and
+                        * bb->code is not updated; the walk simply skips past ins.
+                        */
+                       if (ins->dreg == ins->sreg1) {
+                               if (last_ins)
+                                       last_ins->next = ins->next;                             
+                               ins = ins->next;
+                               continue;
+                       }
+                       /* 
+                        * Removes:
+                        *
+                        * OP_MOVE sreg, dreg 
+                        * OP_MOVE dreg, sreg
+                        *
+                        * Only the second instruction (ins) is unlinked, and only
+                        * when the previous instruction is exactly OP_MOVE.
+                        */
+                       if (last_ins && last_ins->opcode == OP_MOVE &&
+                           ins->sreg1 == last_ins->dreg &&
+                           ins->dreg == last_ins->sreg1) {
+                               last_ins->next = ins->next;                             
+                               ins = ins->next;                                
+                               continue;
+                       }
+                       break;
+               }
+               last_ins = ins;
+               ins = ins->next;
+       }
+       bb->last_ins = last_ins;
+}
 
 static void
 peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
@@ -1572,10 +1797,37 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_I8CONST:
                        /* reg = 0 -> XOR (reg, reg) */
                        /* XOR sets cflags on x86, so we cant do it always */
-                       if (ins->inst_c0 == 0 && (ins->next && INST_IGNORES_CFLAGS (ins->next))) {
+                       if (ins->inst_c0 == 0 && (ins->next && INST_IGNORES_CFLAGS (ins->next->opcode))) {
                                ins->opcode = CEE_XOR;
                                ins->sreg1 = ins->dreg;
                                ins->sreg2 = ins->dreg;
+                               /* Fall through */
+                       }
+                       else
+                               break;
+               case CEE_XOR:
+                       if ((ins->sreg1 == ins->sreg2) && (ins->sreg1 == ins->dreg)) {
+                               MonoInst *ins2;
+
+                               /* 
+                                * Replace STORE_MEMBASE_IMM 0 with STORE_MEMBASE_REG since 
+                                * the latter has length 2-3 instead of 6 (reverse constant
+                                * propagation). These instruction sequences are very common
+                                * in the initlocals bblock.
+                                */
+                               for (ins2 = ins->next; ins2; ins2 = ins2->next) {
+                                       if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM) || (ins2->opcode == OP_STORE_MEMBASE_IMM)) && (ins2->inst_imm == 0)) {
+                                               ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode);
+                                               ins2->sreg1 = ins->dreg;
+                                       } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG)) {
+                                               /* Continue */
+                                       } else if (((ins2->opcode == OP_ICONST) || (ins2->opcode == OP_I8CONST)) && (ins2->dreg == ins->dreg) && (ins2->inst_c0 == 0)) {
+                                               NULLIFY_INS (ins2);
+                                               /* Continue */
+                                       } else {
+                                               break;
+                                       }
+                               }
                        }
                        break;
                case OP_MUL_IMM: 
@@ -1695,7 +1947,6 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 #endif
                        }
                        break;
-               case OP_LOADU1_MEMBASE:
                case OP_LOADI1_MEMBASE:
                        /* 
                         * Note: if reg1 = reg2 the load op is removed
@@ -1720,7 +1971,6 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                                }
                        }
                        break;
-               case OP_LOADU2_MEMBASE:
                case OP_LOADI2_MEMBASE:
                        /* 
                         * Note: if reg1 = reg2 the load op is removed
@@ -1748,6 +1998,7 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CONV_I4:
                case CEE_CONV_U4:
                case OP_MOVE:
+               case OP_FMOVE:
                        /*
                         * Removes:
                         *
@@ -1812,10 +2063,8 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
        MonoInst *ins, *temp, *last_ins = NULL;
        ins = bb->code;
 
-       if (bb->max_ireg > cfg->rs->next_vireg)
-               cfg->rs->next_vireg = bb->max_ireg;
-       if (bb->max_freg > cfg->rs->next_vfreg)
-               cfg->rs->next_vfreg = bb->max_freg;
+       if (bb->max_vreg > cfg->rs->next_vreg)
+               cfg->rs->next_vreg = bb->max_vreg;
 
        /*
         * FIXME: Need to add more instructions, but the current machine 
@@ -1884,8 +2133,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
        }
        bb->last_ins = last_ins;
 
-       bb->max_ireg = cfg->rs->next_vireg;
-       bb->max_freg = cfg->rs->next_vfreg;
+       bb->max_vreg = cfg->rs->next_vreg;
 }
 
 static const int 
@@ -1895,40 +2143,18 @@ branch_cc_table [] = {
        X86_CC_O, X86_CC_NO, X86_CC_C, X86_CC_NC
 };
 
-static int
-opcode_to_x86_cond (int opcode)
-{
-       switch (opcode) {
-       case OP_IBEQ:
-               return X86_CC_EQ;
-       case OP_IBNE_UN:
-               return X86_CC_NE;
-       case OP_IBLT:
-               return X86_CC_LT;
-       case OP_IBLT_UN:
-               return X86_CC_LT;
-       case OP_IBGT:
-               return X86_CC_GT;
-       case OP_IBGT_UN:
-               return X86_CC_GT;
-       case OP_IBGE:
-               return X86_CC_GE;
-       case OP_IBGE_UN:
-               return X86_CC_GE;
-       case OP_IBLE:
-               return X86_CC_LE;
-       case OP_IBLE_UN:
-               return X86_CC_LE;
-       case OP_COND_EXC_IOV:
-               return X86_CC_O;
-       case OP_COND_EXC_IC:
-               return X86_CC_C;
-       default:
-               g_assert_not_reached ();
-       }
+/* Maps CMP_... constants to X86_CC_... constants.
+ * Indexed with the value returned by mono_opcode_to_cond (). The last four
+ * entries repeat LE/GE/LT/GT; they line up with the FALSE entries of
+ * cc_signed_table. */
+static const int
+cc_table [] = {
+       X86_CC_EQ, X86_CC_NE, X86_CC_LE, X86_CC_GE, X86_CC_LT, X86_CC_GT,
+       X86_CC_LE, X86_CC_GE, X86_CC_LT, X86_CC_GT
+};
 
-       return -1;
-}
+static const int
+cc_signed_table [] = { /* Parallel to cc_table: the value passed as the is-signed argument of EMIT_COND_BRANCH for the same index */
+       TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
+       FALSE, FALSE, FALSE, FALSE
+};
 
 /*#include "cprop.c"*/
 
@@ -1947,6 +2173,9 @@ mono_arch_local_regalloc (MonoCompile *cfg, MonoBasicBlock *bb)
 
        mono_arch_lowering_pass (cfg, bb);
 
+       if (cfg->opt & MONO_OPT_PEEPHOLE)
+               peephole_pass_1 (cfg, bb);
+
        mono_local_regalloc (cfg, bb);
 }
 
@@ -2108,7 +2337,7 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
        case OP_VCALL:
        case OP_VCALL_REG:
        case OP_VCALL_MEMBASE:
-               cinfo = get_call_info (((MonoCallInst*)ins)->signature, FALSE);
+               cinfo = get_call_info (cfg->mempool, ((MonoCallInst*)ins)->signature, FALSE);
                if (cinfo->ret.storage == ArgValuetypeInReg) {
                        /* Pop the destination address from the stack */
                        amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
@@ -2132,7 +2361,6 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
                                }
                        }
                }
-               g_free (cinfo);
                break;
        }
 
@@ -2183,7 +2411,7 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
 
        sig = mono_method_signature (method);
 
-       cinfo = get_call_info (sig, FALSE);
+       cinfo = cfg->arch.cinfo;
        
        /* This is the opposite of the code in emit_prolog */
 
@@ -2196,7 +2424,7 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = cinfo->args + i;
                MonoType *arg_type;
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
 
                if (sig->hasthis && (i == 0))
                        arg_type = &mono_defaults.object_class->byval_arg;
@@ -2229,8 +2457,6 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
                }
        }
 
-       g_free (cinfo);
-
        return code;
 }
 
@@ -2303,7 +2529,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        while (ins) {
                offset = code - cfg->native_code;
 
-               max_len = ((guint8 *)ins_spec [ins->opcode])[MONO_INST_LEN];
+               max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
 
                if (offset > (cfg->code_size - max_len - 16)) {
                        cfg->code_size *= 2;
@@ -2466,7 +2692,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_AMD64_ICOMPARE_REG_MEMBASE:
                        amd64_alu_reg_membase_size (code, X86_CMP, ins->sreg1, ins->sreg2, ins->inst_offset, 4);
                        break;
-               case CEE_BREAK:
+               case OP_BREAK:
                        amd64_breakpoint (code);
                        break;
                case OP_ADDCC:
@@ -2800,22 +3026,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_IBGT:
                case OP_IBGE:
                case OP_IBLE:
-                       EMIT_COND_BRANCH (ins, opcode_to_x86_cond (ins->opcode), TRUE);
-                       break;
                case OP_IBNE_UN:
                case OP_IBLT_UN:
                case OP_IBGT_UN:
                case OP_IBGE_UN:
                case OP_IBLE_UN:
-                       EMIT_COND_BRANCH (ins, opcode_to_x86_cond (ins->opcode), FALSE);
-                       break;
-               case OP_COND_EXC_IOV:
-                       EMIT_COND_SYSTEM_EXCEPTION (opcode_to_x86_cond (ins->opcode),
-                                                                               TRUE, ins->inst_p1);
-                       break;
-               case OP_COND_EXC_IC:
-                       EMIT_COND_SYSTEM_EXCEPTION (opcode_to_x86_cond (ins->opcode),
-                                                                               FALSE, ins->inst_p1);
+                       EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        break;
                case CEE_NOT:
                        amd64_not_reg (code, ins->sreg1);
@@ -2871,7 +3087,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                }
-               case CEE_JMP: {
+               case OP_JMP: {
                        /*
                         * Note: this 'frame destruction' logic is useful for tail calls, too.
                         * Keep in sync with the code in emit_epilog.
@@ -3063,7 +3279,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_RET:
                        amd64_ret (code);
                        break;
-               case CEE_THROW: {
+               case OP_THROW: {
                        amd64_mov_reg_reg (code, AMD64_RDI, ins->sreg1, 8);
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
                                             (gpointer)"mono_arch_throw_exception");
@@ -3083,10 +3299,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        /* Restore stack alignment */
                        amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
                        break;
+
                case OP_LABEL:
                        ins->inst_c0 = code - cfg->native_code;
                        break;
-               case CEE_BR:
+               case OP_NOP:
+                       break;
+               case OP_BR:
                        //g_print ("target: %p, next: %p, curr: %p, last: %p\n", ins->inst_target_bb, bb->next_bb, ins, bb->last_ins);
                        //if ((ins->inst_target_bb == bb->next_bb) && ins == bb->last_ins)
                        //break;
@@ -3119,27 +3338,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_CEQ:
                case OP_ICEQ:
-                       amd64_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
                case OP_CLT:
                case OP_ICLT:
-                       amd64_set_reg (code, X86_CC_LT, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
-               case OP_CLT_UN:
-               case OP_ICLT_UN:
-                       amd64_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
                case OP_CGT:
                case OP_ICGT:
-                       amd64_set_reg (code, X86_CC_GT, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
+               case OP_CLT_UN:
+               case OP_ICLT_UN:
                case OP_CGT_UN:
                case OP_ICGT_UN:
-                       amd64_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
+                       amd64_set_reg (code, cc_table [mono_opcode_to_cond (ins->opcode)], ins->dreg, cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
                        break;
                case OP_COND_EXC_EQ:
@@ -3152,6 +3359,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_COND_EXC_GE_UN:
                case OP_COND_EXC_LE:
                case OP_COND_EXC_LE_UN:
+                       EMIT_COND_SYSTEM_EXCEPTION (cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)], ins->inst_p1);
+                       break;
                case OP_COND_EXC_OV:
                case OP_COND_EXC_NO:
                case OP_COND_EXC_C:
@@ -3159,6 +3368,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_EQ], 
                                                    (ins->opcode < OP_COND_EXC_NE_UN), ins->inst_p1);
                        break;
+               case OP_COND_EXC_IOV:
+               case OP_COND_EXC_IC:
+                       EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_IEQ], 
+                                                   (ins->opcode < OP_COND_EXC_INE_UN), ins->inst_p1);
+                       break;
                case CEE_BEQ:
                case CEE_BNE_UN:
                case CEE_BLT:
@@ -3169,7 +3383,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_BGE_UN:
                case CEE_BLE:
                case CEE_BLE_UN:
-                       EMIT_COND_BRANCH (ins, branch_cc_table [ins->opcode - CEE_BEQ], (ins->opcode < CEE_BNE_UN));
+                       EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        break;
 
                /* floating point opcodes */
@@ -3747,7 +3961,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_FBGT:
                case OP_FBGT_UN:
                        if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
+                               if (ins->opcode == OP_FBGT) {
+                                       guchar *br1;
+
+                                       /* skip branch if C1=1 */
+                                       br1 = code;
+                                       x86_branch8 (code, X86_CC_P, 0, FALSE);
+                                       /* branch if (C0 | C3) = 1 */
+                                       EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
+                                       amd64_patch (br1, code);
+                                       break;
+                               } else {
+                                       EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
+                               }
                                break;
                        }
                        amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
@@ -3817,7 +4043,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
                        EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
                        break;
-               case CEE_CKFINITE: {
+               case OP_CKFINITE: {
                        if (use_sse2) {
                                /* Transfer value to the fp stack */
                                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 16);
@@ -3900,6 +4126,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                         * hack to overcome limits in x86 reg allocator 
                         * (req: dreg == eax and sreg2 != eax and breg != eax) 
                         */
+                       /* The pushes invalidate rsp */
+                       if ((breg == AMD64_RAX) || (breg == AMD64_RSP)) {
+                               amd64_mov_reg_reg (code, AMD64_R11, breg, 8);
+                               breg = AMD64_R11;
+                       }
+
                        if (ins->dreg != AMD64_RAX)
                                amd64_push_reg (code, AMD64_RAX);
                        
@@ -3910,11 +4142,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                sreg2 = AMD64_RDX;
                        }
 
-                       if (breg == AMD64_RAX) {
-                               amd64_mov_reg_reg (code, AMD64_R11, AMD64_RAX, 8);
-                               breg = AMD64_R11;
-                       }
-
                        amd64_mov_reg_membase (code, AMD64_RAX, breg, ins->inst_offset, size);
 
                        br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
@@ -4119,14 +4346,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), AMD64_RBP, 8);
                /* Save sp */
                amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
-               /* Save method */
-               /* FIXME: add a relocation for this */
-               if (IS_IMM32 (cfg->method))
-                       amd64_mov_membase_imm (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), (guint64)cfg->method, 8);
-               else {
-                       amd64_mov_reg_imm (code, AMD64_R11, cfg->method);
-                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8);
-               }
+               /* Skip method (only needed for trampoline LMF frames) */
                /* Save callee saved regs */
                amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
                amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
@@ -4166,7 +4386,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                if (ins->opcode == OP_LABEL)
                                        ins->inst_c1 = max_offset;
                                
-                               max_offset += ((guint8 *)ins_spec [ins->opcode])[MONO_INST_LEN];
+                               max_offset += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
                                ins = ins->next;
                        }
                }
@@ -4175,7 +4395,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        sig = mono_method_signature (method);
        pos = 0;
 
-       cinfo = get_call_info (sig, FALSE);
+       cinfo = cfg->arch.cinfo;
 
        if (sig->ret->type != MONO_TYPE_VOID) {
                if ((cinfo->ret.storage == ArgInIReg) && (cfg->ret->opcode != OP_REGVAR)) {
@@ -4189,7 +4409,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                ArgInfo *ainfo = cinfo->args + i;
                gint32 stack_offset;
                MonoType *arg_type;
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
 
                if (sig->hasthis && (i == 0))
                        arg_type = &mono_defaults.object_class->byval_arg;
@@ -4271,10 +4491,10 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                 * The call might clobber argument registers, but they are already
                 * saved to the stack/global regs.
                 */
-               if (lmf_tls_offset != -1) {
+               if (lmf_addr_tls_offset != -1) {
                        guint8 *buf;
 
-                       code = emit_tls_get ( code, AMD64_RAX, lmf_tls_offset);
+                       code = emit_tls_get ( code, AMD64_RAX, lmf_addr_tls_offset);
                        amd64_test_reg_reg (code, AMD64_RAX, AMD64_RAX);
                        buf = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
@@ -4295,33 +4515,46 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        }
 
        if (method->save_lmf) {
-               if (lmf_tls_offset != -1) {
-                       /* Load lmf quicky using the FS register */
-                       code = emit_tls_get (code, AMD64_RAX, lmf_tls_offset);
-               }
-               else {
-                       /* 
-                        * The call might clobber argument registers, but they are already
-                        * saved to the stack/global regs.
+               if ((lmf_tls_offset != -1) && !optimize_for_xen) {
+                       /*
+                        * Optimized version which uses the mono_lmf TLS variable instead of indirection
+                        * through the mono_lmf_addr TLS variable.
                         */
+                       /* %rax = previous_lmf */
+                       x86_prefix (code, X86_FS_PREFIX);
+                       amd64_mov_reg_mem (code, AMD64_RAX, lmf_tls_offset, 8);
+
+                       /* Save previous_lmf */
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_RAX, 8);
+                       /* Set new lmf */
+                       amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
+                       x86_prefix (code, X86_FS_PREFIX);
+                       amd64_mov_mem_reg (code, lmf_tls_offset, AMD64_R11, 8);
+               } else {
+                       if (lmf_addr_tls_offset != -1) {
+                               /* Load lmf quickly using the FS register */
+                               code = emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
+                       }
+                       else {
+                               /* 
+                                * The call might clobber argument registers, but they are already
+                                * saved to the stack/global regs.
+                                */
+                               code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
+                                                                 (gpointer)"mono_get_lmf_addr");               
+                       }
 
-                       code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
-                                                                (gpointer)"mono_get_lmf_addr");                
+                       /* Save lmf_addr */
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
+                       /* Save previous_lmf */
+                       amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8);
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
+                       /* Set new lmf */
+                       amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
+                       amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8);
                }
-
-               /* Save lmf_addr */
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
-               /* Save previous_lmf */
-               amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
-               /* Set new lmf */
-               amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
-               amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8);
        }
 
-
-       g_free (cinfo);
-
        if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
                code = mono_arch_instrument_prolog (cfg, mono_trace_enter_method, code, TRUE);
 
@@ -4364,14 +4597,25 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
                code = mono_arch_instrument_epilog (cfg, mono_trace_leave_method, code, TRUE);
 
-       /* the code restoring the registers must be kept in sync with CEE_JMP */
+       /* the code restoring the registers must be kept in sync with OP_JMP */
        pos = 0;
        
        if (method->save_lmf) {
-               /* Restore previous lmf */
-               amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
-               amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
-               amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8);
+               if ((lmf_tls_offset != -1) && !optimize_for_xen) {
+                       /*
+                        * Optimized version which uses the mono_lmf TLS variable instead of indirection
+                        * through the mono_lmf_addr TLS variable.
+                        */
+                       /* reg = previous_lmf */
+                       amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
+                       x86_prefix (code, X86_FS_PREFIX);
+                       amd64_mov_mem_reg (code, lmf_tls_offset, AMD64_R11, 8);
+               } else {
+                       /* Restore previous lmf */
+                       amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
+                       amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
+                       amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8);
+               }
 
                /* Restore caller saved regs */
                if (cfg->used_int_regs & (1 << AMD64_RBP)) {
@@ -4430,7 +4674,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        }
 
        /* Load returned vtypes into registers if needed */
-       cinfo = get_call_info (mono_method_signature (method), FALSE);
+       cinfo = cfg->arch.cinfo;
        if (cinfo->ret.storage == ArgValuetypeInReg) {
                ArgInfo *ainfo = &cinfo->ret;
                MonoInst *inst = cfg->ret;
@@ -4453,7 +4697,6 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                        }
                }
        }
-       g_free (cinfo);
 
        if (cfg->arch.omit_fp) {
                if (cfg->arch.stack_alloc_size)
@@ -4582,10 +4825,15 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
 
                        pos = cfg->native_code + patch_info->ip.i;
 
-                       if (use_sse2)
-                               *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
-                       else
+
+                       if (use_sse2) {
+                               if (IS_REX (pos [1]))
+                                       *(guint32*)(pos + 5) = (guint8*)code - pos - 9;
+                               else
+                                       *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
+                       } else {
                                *(guint32*)(pos + 3) = (guint8*)code - pos - 7;
+                       }
 
                        if (patch_info->type == MONO_PATCH_INFO_R8) {
                                *(double*)code = *(double*)patch_info->data.target;
@@ -4636,7 +4884,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
                /* Allocate a new area on the stack and save arguments there */
                sig = mono_method_signature (cfg->method);
 
-               cinfo = get_call_info (sig, FALSE);
+               cinfo = get_call_info (cfg->mempool, sig, FALSE);
 
                n = sig->param_count + sig->hasthis;
 
@@ -4645,7 +4893,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, stack_area);
 
                for (i = 0; i < n; ++i) {
-                       inst = cfg->varinfo [i];
+                       inst = cfg->args [i];
 
                        if (inst->opcode == OP_REGVAR)
                                amd64_mov_membase_reg (code, AMD64_RSP, (i * 8), inst->dreg, 8);
@@ -4661,12 +4909,9 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
        amd64_mov_reg_reg (code, AMD64_RSI, AMD64_RSP, 8);
        code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func);
 
-       if (enable_arguments) {
+       if (enable_arguments)
                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, stack_area);
 
-               g_free (cinfo);
-       }
-
        return code;
 }
 
@@ -4796,8 +5041,6 @@ mono_arch_is_inst_imm (gint64 imm)
        return amd64_is_imm32 (imm);
 }
 
-#define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f))
-
 /*
  * Determine whenever the trap whose info is in SIGINFO is caused by
  * integer overflow.
@@ -4963,26 +5206,62 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
        return (gpointer)(((guint64)(regs [reg])) + disp);
 }
 
-gpointer*
-mono_arch_get_delegate_method_ptr_addr (guint8* code, gpointer *regs)
+gpointer
+mono_arch_get_this_arg_from_call (MonoMethodSignature *sig, gssize *regs, guint8 *code) /* Returns the 'this' argument of a call, read from the register array REGS; CODE is not used in this body. */
 {
-       guint32 reg;
-       guint32 disp;
+       if (MONO_TYPE_ISSTRUCT (sig->ret)) /* struct return: 'this' is taken from the RSI slot; presumably RDI carries the hidden return-buffer address -- TODO confirm */
+               return (gpointer)regs [AMD64_RSI];
+       else
+               return (gpointer)regs [AMD64_RDI];
+}
+
+gpointer
+mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_target) /* Emits a small delegate-invoke stub for SIG and returns its address, or NULL when the signature is not supported. */
+{
+       guint8 *code, *start;
+       MonoDomain *domain = mono_domain_get ();
+       int i;
 
-       code -= 10;
+       /* FIXME: Support more cases */
+       if (MONO_TYPE_ISSTRUCT (sig->ret)) /* struct-returning signatures get no stub */
+               return NULL;
 
-       if (IS_REX (code [0]) && (code [1] == 0x8b) && (code [3] == 0x48) && (code [4] == 0x8b) && (code [5] == 0x40) && (code [7] == 0x48) && (code [8] == 0xff) && (code [9] == 0xd0)) {
-               /* mov REG, %rax; mov <OFFSET>(%rax), %rax; call *%rax */
-               reg = amd64_rex_b (code [0]) + amd64_modrm_rm (code [2]);
-               disp = code [6];
+       if (has_target) {
+               mono_domain_lock (domain);
+               start = code = mono_code_manager_reserve (domain->code_mp, 64); /* reserve 64 bytes for the stub while holding the domain lock */
+               mono_domain_unlock (domain);
 
-               if (reg == AMD64_RAX)
+               /* Replace the this argument with the target */
+               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RDI, 8); /* RAX = RDI; RAX is then used as the base for the MonoDelegate field offsets */
+               amd64_mov_reg_membase (code, AMD64_RDI, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, target), 8);
+               amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr)); /* jump (not call) through the method_ptr field */
+
+               g_assert ((code - start) < 64); /* the emitted stub must fit in the reserved 64 bytes */
+       } else {
+               for (i = 0; i < sig->param_count; ++i) /* validate before reserving code memory: every parameter must pass mono_is_regsize_var */
+                       if (!mono_is_regsize_var (sig->params [i]))
+                               return NULL;
+               if (sig->param_count > 4) /* more than 4 parameters: no stub */
                        return NULL;
-               else
-                       return (gpointer*)(((guint64)(regs [reg])) + disp);
+
+               mono_domain_lock (domain);
+               start = code = mono_code_manager_reserve (domain->code_mp, 64);
+               mono_domain_unlock (domain);
+
+               if (sig->param_count == 0) {
+                       amd64_jump_membase (code, AMD64_RDI, G_STRUCT_OFFSET (MonoDelegate, method_ptr)); /* nothing to shift: jump through method_ptr using RDI as the base */
+               } else {
+                       /* We have to shift the arguments left */
+                       amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RDI, 8); /* save RDI in RAX before the loop overwrites param_regs [0] */
+                       for (i = 0; i < sig->param_count; ++i)
+                               amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8); /* param_regs [i] = param_regs [i + 1] */
+
+                       amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               }
+               g_assert ((code - start) < 64);
        }
 
-       return NULL;
+       return start; /* address of the generated stub */
 }
 
 /*
@@ -5001,7 +5280,8 @@ mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
                optimize_for_xen = access ("/proc/xen", F_OK) == 0;
 #endif
                appdomain_tls_offset = mono_domain_get_tls_offset ();
-               lmf_tls_offset = mono_get_lmf_tls_offset ();
+               lmf_tls_offset = mono_get_lmf_tls_offset ();
+               lmf_addr_tls_offset = mono_get_lmf_addr_tls_offset ();
                thread_tls_offset = mono_thread_get_tls_offset ();
        }               
 }
@@ -5015,7 +5295,7 @@ void
 mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
 {
        MonoCallInst *call = (MonoCallInst*)inst;
-       CallInfo * cinfo = get_call_info (inst->signature, FALSE);
+       CallInfo * cinfo = get_call_info (cfg->mempool, inst->signature, FALSE);
 
        if (vt_reg != -1) {
                MonoInst *vtarg;
@@ -5054,8 +5334,6 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
 
                mono_call_inst_add_outarg_reg (cfg, call, this->dreg, cinfo->args [0].reg, FALSE);
        }
-
-       g_free (cinfo);
 }
 
 MonoInst*