2007-10-10 Mark Probst <mark.probst@gmail.com>
[mono.git] / mono / mini / mini-amd64.c
index 53ad3c2809027314896ab4e20369abf90eb0775e..8381c2fa1a18ac0bfdf039cb16e021bd914b5a1c 100644 (file)
 #include "cpu-amd64.h"
 
 static gint lmf_tls_offset = -1;
+static gint lmf_addr_tls_offset = -1;
 static gint appdomain_tls_offset = -1;
 static gint thread_tls_offset = -1;
 
 #ifdef MONO_XEN_OPT
-/* TRUE by default until we add runtime detection of Xen */
 static gboolean optimize_for_xen = TRUE;
 #else
 #define optimize_for_xen 0
@@ -40,13 +40,12 @@ static gboolean optimize_for_xen = TRUE;
 
 static gboolean use_sse2 = !MONO_ARCH_USE_FPSTACK;
 
-const char * const amd64_desc [OP_LAST];
-static const char*const * ins_spec = amd64_desc;
-
 #define ALIGN_TO(val,align) ((((guint64)val) + ((align) - 1)) & ~((align) - 1))
 
 #define IS_IMM32(val) ((((guint64)val) >> 32) == 0)
 
+#define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f))
+
 #ifdef PLATFORM_WIN32
 /* Under windows, the default pinvoke calling convention is stdcall */
 #define CALLCONV_IS_STDCALL(call_conv) (((call_conv) == MONO_CALL_STDCALL) || ((call_conv) == MONO_CALL_DEFAULT))
@@ -54,6 +53,11 @@ static const char*const * ins_spec = amd64_desc;
 #define CALLCONV_IS_STDCALL(call_conv) ((call_conv) == MONO_CALL_STDCALL)
 #endif
 
+/* This mutex protects architecture specific caches */
+#define mono_mini_arch_lock() EnterCriticalSection (&mini_arch_mutex)
+#define mono_mini_arch_unlock() LeaveCriticalSection (&mini_arch_mutex)
+static CRITICAL_SECTION mini_arch_mutex;
+
 #define ARGS_OFFSET 16
 #define GP_SCRATCH_REG AMD64_R11
 
@@ -187,6 +191,12 @@ amd64_patch (unsigned char* code, gpointer target)
                x86_patch (code, (unsigned char*)target);
 }
 
+void 
+mono_amd64_patch (unsigned char* code, gpointer target)
+{
+       amd64_patch (code, target);
+}
+
 typedef enum {
        ArgInIReg,
        ArgInFloatSSEReg,
@@ -488,7 +498,7 @@ add_valuetype (MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type,
  * Draft Version 0.23" document for more information.
  */
 static CallInfo*
-get_call_info (MonoMethodSignature *sig, gboolean is_pinvoke)
+get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke)
 {
        guint32 i, gr, fr;
        MonoType *ret_type;
@@ -496,7 +506,10 @@ get_call_info (MonoMethodSignature *sig, gboolean is_pinvoke)
        guint32 stack_size = 0;
        CallInfo *cinfo;
 
-       cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));
+       if (mp)
+               cinfo = mono_mempool_alloc0 (mp, sizeof (CallInfo) + (sizeof (ArgInfo) * n));
+       else
+               cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));
 
        gr = 0;
        fr = 0;
@@ -504,6 +517,7 @@ get_call_info (MonoMethodSignature *sig, gboolean is_pinvoke)
        /* return value */
        {
                ret_type = mono_type_get_underlying_type (sig->ret);
+               ret_type = mini_get_basic_type_from_generic (cfg, ret_type);
                switch (ret_type->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -600,6 +614,7 @@ get_call_info (MonoMethodSignature *sig, gboolean is_pinvoke)
                        continue;
                }
                ptype = mono_type_get_underlying_type (sig->params [i]);
+               ptype = mini_get_basic_type_from_generic (cfg, ptype);
                switch (ptype->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -689,7 +704,7 @@ int
 mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJitArgumentInfo *arg_info)
 {
        int k;
-       CallInfo *cinfo = get_call_info (csig, FALSE);
+       CallInfo *cinfo = get_call_info (NULL, NULL, csig, FALSE);
        guint32 args_size = cinfo->stack_usage;
 
        /* The arguments are saved to a stack area in mono_arch_instrument_prolog */
@@ -720,6 +735,7 @@ cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx)
 void
 mono_arch_cpu_init (void)
 {
+#ifndef _MSC_VER
        guint16 fpcw;
 
        /* spec compliance requires running with double precision */
@@ -728,6 +744,27 @@ mono_arch_cpu_init (void)
        fpcw |= X86_FPCW_PREC_DOUBLE;
        __asm__  __volatile__ ("fldcw %0\n": : "m" (fpcw));
        __asm__  __volatile__ ("fnstcw %0\n": "=m" (fpcw));
+#else
+       _control87 (_PC_53, MCW_PC);
+#endif
+}
+
+/*
+ * Initialize architecture specific code.
+ */
+void
+mono_arch_init (void)
+{
+       InitializeCriticalSection (&mini_arch_mutex);
+}
+
+/*
+ * Cleanup architecture specific code.
+ */
+void
+mono_arch_cleanup (void)
+{
+       DeleteCriticalSection (&mini_arch_mutex);
 }
 
 /*
@@ -762,35 +799,6 @@ mono_amd64_is_sse2 (void)
        return use_sse2;
 }
 
-static gboolean
-is_regsize_var (MonoType *t) {
-       if (t->byref)
-               return TRUE;
-       t = mono_type_get_underlying_type (t);
-       switch (t->type) {
-       case MONO_TYPE_I4:
-       case MONO_TYPE_U4:
-       case MONO_TYPE_I:
-       case MONO_TYPE_U:
-       case MONO_TYPE_PTR:
-       case MONO_TYPE_FNPTR:
-               return TRUE;
-       case MONO_TYPE_OBJECT:
-       case MONO_TYPE_STRING:
-       case MONO_TYPE_CLASS:
-       case MONO_TYPE_SZARRAY:
-       case MONO_TYPE_ARRAY:
-               return TRUE;
-       case MONO_TYPE_GENERICINST:
-               if (!mono_type_generic_inst_is_valuetype (t))
-                       return TRUE;
-               return FALSE;
-       case MONO_TYPE_VALUETYPE:
-               return FALSE;
-       }
-       return FALSE;
-}
-
 GList *
 mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
 {
@@ -809,11 +817,7 @@ mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
                    (ins->opcode != OP_LOCAL && ins->opcode != OP_ARG))
                        continue;
 
-               /* we dont allocate I1 to registers because there is no simply way to sign extend 
-                * 8bit quantities in caller saved registers on x86 */
-               if (is_regsize_var (ins->inst_vtype) || (ins->inst_vtype->type == MONO_TYPE_BOOLEAN) || 
-                   (ins->inst_vtype->type == MONO_TYPE_U1) || (ins->inst_vtype->type == MONO_TYPE_U2)||
-                   (ins->inst_vtype->type == MONO_TYPE_I2) || (ins->inst_vtype->type == MONO_TYPE_CHAR)) {
+               if (mono_is_regsize_var (ins->inst_vtype)) {
                        g_assert (MONO_VARINFO (cfg, i)->reg == -1);
                        g_assert (i == vmv->idx);
                        vars = g_list_prepend (vars, vmv);
@@ -835,7 +839,7 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
 {
        MonoMethodSignature *sig;
        MonoMethodHeader *header;
-       int i;
+       int i, locals_size;
        CallInfo *cinfo;
 
        if (cfg->arch.omit_fp_computed)
@@ -845,7 +849,9 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
 
        sig = mono_method_signature (cfg->method);
 
-       cinfo = get_call_info (sig, FALSE);
+       if (!cfg->arch.cinfo)
+               cfg->arch.cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+       cinfo = cfg->arch.cinfo;
 
        /*
         * FIXME: Remove some of the restrictions.
@@ -887,12 +893,18 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
                }
        }
 
-       if (cfg->num_varinfo > 10000) {
+       locals_size = 0;
+       for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
+               MonoInst *ins = cfg->varinfo [i];
+               int ialign;
+
+               locals_size += mono_type_size (ins->inst_vtype, &ialign);
+       }
+
+       if ((cfg->num_varinfo > 10000) || (locals_size >= (1 << 15))) {
                /* Avoid hitting the stack_alloc_size < (1 << 16) assertion in emit_epilog () */
                cfg->arch.omit_fp = FALSE;
        }
-
-       g_free (cinfo);
 }
 
 GList *
@@ -951,7 +963,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 
        sig = mono_method_signature (cfg->method);
 
-       cinfo = get_call_info (sig, FALSE);
+       cinfo = cfg->arch.cinfo;
 
        mono_arch_compute_omit_fp (cfg);
 
@@ -982,7 +994,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                /* Reserve stack space for saving LMF + argument regs */
                guint32 size = sizeof (MonoLMF);
 
-               if (lmf_tls_offset == -1)
+               if (lmf_addr_tls_offset == -1)
                        /* Need to save argument regs too */
                        size += (AMD64_NREG * 8) + (8 * 8);
 
@@ -1050,7 +1062,6 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                        //printf ("allocated local %d to ", i); mono_print_tree_nl (inst);
                }
        }
-       g_free (offsets);
        offset += locals_stack_size;
 
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG)) {
@@ -1060,7 +1071,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        }
 
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
                if (inst->opcode != OP_REGVAR) {
                        ArgInfo *ainfo = &cinfo->args [i];
                        gboolean inreg = TRUE;
@@ -1120,8 +1131,6 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        }
 
        cfg->stack_offset = offset;
-
-       g_free (cinfo);
 }
 
 void
@@ -1132,12 +1141,12 @@ mono_arch_create_vars (MonoCompile *cfg)
 
        sig = mono_method_signature (cfg->method);
 
-       cinfo = get_call_info (sig, FALSE);
+       if (!cfg->arch.cinfo)
+               cfg->arch.cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+       cinfo = cfg->arch.cinfo;
 
        if (cinfo->ret.storage == ArgValuetypeInReg)
                cfg->ret_var_is_local = TRUE;
-
-       g_free (cinfo);
 }
 
 static void
@@ -1147,19 +1156,19 @@ add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, MonoInst *arg, ArgStorage
        case ArgInIReg:
                arg->opcode = OP_OUTARG_REG;
                arg->inst_left = tree;
-               arg->inst_right = (MonoInst*)call;
+               arg->inst_call = call;
                arg->backend.reg3 = reg;
                break;
        case ArgInFloatSSEReg:
                arg->opcode = OP_AMD64_OUTARG_XMMREG_R4;
                arg->inst_left = tree;
-               arg->inst_right = (MonoInst*)call;
+               arg->inst_call = call;
                arg->backend.reg3 = reg;
                break;
        case ArgInDoubleSSEReg:
                arg->opcode = OP_AMD64_OUTARG_XMMREG_R8;
                arg->inst_left = tree;
-               arg->inst_right = (MonoInst*)call;
+               arg->inst_call = call;
                arg->backend.reg3 = reg;
                break;
        default:
@@ -1243,7 +1252,7 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
        sig = call->signature;
        n = sig->param_count + sig->hasthis;
 
-       cinfo = get_call_info (sig, sig->pinvoke);
+       cinfo = get_call_info (cfg, cfg->mempool, sig, sig->pinvoke);
 
        for (i = 0; i < n; ++i) {
                ainfo = cinfo->args + i;
@@ -1401,8 +1410,6 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
        cfg->param_area = MAX (cfg->param_area, call->stack_usage);
        cfg->flags |= MONO_CFG_HAS_CALLS;
 
-       g_free (cinfo);
-
        return call;
 }
 
@@ -1544,21 +1551,280 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                        amd64_call_reg (code, GP_SCRATCH_REG);
                }
        }
-
-       return code;
-}
-
-static inline guint8*
-emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data)
-{
-       mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
-
-       return emit_call_body (cfg, code, patch_type, data);
+
+       return code;
+}
+
+static inline guint8*
+emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data)
+{
+       mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
+
+       return emit_call_body (cfg, code, patch_type, data);
+}
+
+static inline int
+store_membase_imm_to_store_membase_reg (int opcode)
+{
+       switch (opcode) {
+       case OP_STORE_MEMBASE_IMM:
+               return OP_STORE_MEMBASE_REG;
+       case OP_STOREI4_MEMBASE_IMM:
+               return OP_STOREI4_MEMBASE_REG;
+       case OP_STOREI8_MEMBASE_IMM:
+               return OP_STOREI8_MEMBASE_REG;
+       }
+
+       return -1;
+}
+
+#define INST_IGNORES_CFLAGS(opcode) (!(((opcode) == OP_ADC) || ((opcode) == OP_ADC_IMM) || ((opcode) == OP_IADC) || ((opcode) == OP_IADC_IMM) || ((opcode) == OP_SBB) || ((opcode) == OP_SBB_IMM) || ((opcode) == OP_ISBB) || ((opcode) == OP_ISBB_IMM)))
+
+/*
+ * peephole_pass_1:
+ *
+ *   Perform peephole opts which should/can be performed before local regalloc
+ */
+static void
+peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
+{
+       MonoInst *ins, *last_ins = NULL;
+       ins = bb->code;
+
+       while (ins) {
+
+               switch (ins->opcode) {
+               case OP_ADD_IMM:
+               case OP_IADD_IMM:
+               case OP_LADD_IMM:
+                       if ((ins->sreg1 < MONO_MAX_IREGS) && (ins->dreg >= MONO_MAX_IREGS) && (ins->inst_imm > 0)) {
+                               /* 
+                                * X86_LEA is like ADD, but doesn't have the
+                                * sreg1==dreg restriction. inst_imm > 0 is needed since LEA sign-extends 
+                                * its operand to 64 bit.
+                                */
+                               ins->opcode = OP_X86_LEA_MEMBASE;
+                               ins->inst_basereg = ins->sreg1;
+                               /* Fall through */
+                       }
+                       else
+                               break;
+               case CEE_XOR:
+                       if ((ins->sreg1 == ins->sreg2) && (ins->sreg1 == ins->dreg)) {
+                               MonoInst *ins2;
+
+                               /* 
+                                * Replace STORE_MEMBASE_IMM 0 with STORE_MEMBASE_REG since 
+                                * the latter has length 2-3 instead of 6 (reverse constant
+                                * propagation). These instruction sequences are very common
+                                * in the initlocals bblock.
+                                */
+                               for (ins2 = ins->next; ins2; ins2 = ins2->next) {
+                                       if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM)) && (ins2->inst_imm == 0)) {
+                                               ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode);
+                                               ins2->sreg1 = ins->dreg;
+                                       } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG)) {
+                                               /* Continue */
+                                       } else if (((ins2->opcode == OP_ICONST) || (ins2->opcode == OP_I8CONST)) && (ins2->dreg == ins->dreg) && (ins2->inst_c0 == 0)) {
+                                               NULLIFY_INS (ins2);
+                                               /* Continue */
+                                       } else {
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
+               case OP_COMPARE_IMM:
+                       /* OP_COMPARE_IMM (reg, 0) 
+                        * --> 
+                        * OP_AMD64_TEST_NULL (reg) 
+                        */
+                       if (!ins->inst_imm)
+                               ins->opcode = OP_AMD64_TEST_NULL;
+                       break;
+               case OP_ICOMPARE_IMM:
+                       if (!ins->inst_imm)
+                               ins->opcode = OP_X86_TEST_NULL;
+                       break;
+               case OP_AMD64_ICOMPARE_MEMBASE_IMM:
+                       /* 
+                        * OP_STORE_MEMBASE_REG reg, offset(basereg)
+                        * OP_X86_COMPARE_MEMBASE_IMM offset(basereg), imm
+                        * -->
+                        * OP_STORE_MEMBASE_REG reg, offset(basereg)
+                        * OP_COMPARE_IMM reg, imm
+                        *
+                        * Note: if imm = 0 then OP_COMPARE_IMM replaced with OP_X86_TEST_NULL
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG) &&
+                           ins->inst_basereg == last_ins->inst_destbasereg &&
+                           ins->inst_offset == last_ins->inst_offset) {
+                                       ins->opcode = OP_ICOMPARE_IMM;
+                                       ins->sreg1 = last_ins->sreg1;
+
+                                       /* check if we can remove cmp reg,0 with test null */
+                                       if (!ins->inst_imm)
+                                               ins->opcode = OP_X86_TEST_NULL;
+                               }
+
+                       break;
+               case OP_LOAD_MEMBASE:
+               case OP_LOADI4_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOAD_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG 
+                                        || last_ins->opcode == OP_STORE_MEMBASE_REG) &&
+                           ins->inst_basereg == last_ins->inst_destbasereg &&
+                           ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+
+                       /* 
+                        * Note: reg1 must be different from the basereg in the second load
+                        * Note: if reg1 = reg2 then the second load is removed
+                        *
+                        * OP_LOAD_MEMBASE offset(basereg), reg1
+                        * OP_LOAD_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_LOAD_MEMBASE offset(basereg), reg1
+                        * OP_MOVE reg1, reg2
+                        */
+                       } if (last_ins && (last_ins->opcode == OP_LOADI4_MEMBASE
+                                          || last_ins->opcode == OP_LOAD_MEMBASE) &&
+                             ins->inst_basereg != last_ins->dreg &&
+                             ins->inst_basereg == last_ins->inst_basereg &&
+                             ins->inst_offset == last_ins->inst_offset) {
+
+                               if (ins->dreg == last_ins->dreg) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->dreg;
+                               }
+
+                               //g_assert_not_reached ();
+
+#if 0
+                       /* 
+                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
+                        * OP_LOAD_MEMBASE offset(basereg), reg
+                        * -->
+                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
+                        * OP_ICONST reg, imm
+                        */
+                       } else if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_IMM
+                                               || last_ins->opcode == OP_STORE_MEMBASE_IMM) &&
+                                  ins->inst_basereg == last_ins->inst_destbasereg &&
+                                  ins->inst_offset == last_ins->inst_offset) {
+                               //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                               ins->opcode = OP_ICONST;
+                               ins->inst_c0 = last_ins->inst_imm;
+                               g_assert_not_reached (); // check this rule
+#endif
+                       }
+                       break;
+               case OP_LOADI1_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOAD_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI1_MEMBASE_REG) &&
+                                       ins->inst_basereg == last_ins->inst_destbasereg &&
+                                       ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+                       }
+                       break;
+               case OP_LOADI2_MEMBASE:
+                       /* 
+                        * Note: if reg1 = reg2 the load op is removed
+                        *
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
+                        * OP_LOAD_MEMBASE offset(basereg), reg2
+                        * -->
+                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
+                        * OP_MOVE reg1, reg2
+                        */
+                       if (last_ins && (last_ins->opcode == OP_STOREI2_MEMBASE_REG) &&
+                                       ins->inst_basereg == last_ins->inst_destbasereg &&
+                                       ins->inst_offset == last_ins->inst_offset) {
+                               if (ins->dreg == last_ins->sreg1) {
+                                       last_ins->next = ins->next;                             
+                                       ins = ins->next;                                
+                                       continue;
+                               } else {
+                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
+                                       ins->opcode = OP_MOVE;
+                                       ins->sreg1 = last_ins->sreg1;
+                               }
+                       }
+                       break;
+               case CEE_CONV_I4:
+               case CEE_CONV_U4:
+               case OP_MOVE:
+               case OP_FMOVE:
+                       /*
+                        * Removes:
+                        *
+                        * OP_MOVE reg, reg 
+                        */
+                       if (ins->dreg == ins->sreg1) {
+                               if (last_ins)
+                                       last_ins->next = ins->next;                             
+                               else
+                                       bb->code = ins->next;
+                               ins = ins->next;
+                               continue;
+                       }
+                       /* 
+                        * Removes:
+                        *
+                        * OP_MOVE sreg, dreg 
+                        * OP_MOVE dreg, sreg
+                        */
+                       if (last_ins && last_ins->opcode == OP_MOVE &&
+                           ins->sreg1 == last_ins->dreg &&
+                           ins->dreg == last_ins->sreg1) {
+                               last_ins->next = ins->next;                             
+                               ins = ins->next;                                
+                               continue;
+                       }
+                       break;
+               }
+               last_ins = ins;
+               ins = ins->next;
+       }
+       bb->last_ins = last_ins;
 }
 
-/* FIXME: Add more instructions */
-#define INST_IGNORES_CFLAGS(ins) (((ins)->opcode == CEE_BR) || ((ins)->opcode == OP_STORE_MEMBASE_IMM) || ((ins)->opcode == OP_STOREI8_MEMBASE_REG) || ((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_ICONST) || ((ins)->opcode == OP_I8CONST) || ((ins)->opcode == OP_LOAD_MEMBASE))
-
 static void
 peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 {
@@ -1572,12 +1838,48 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_I8CONST:
                        /* reg = 0 -> XOR (reg, reg) */
                        /* XOR sets cflags on x86, so we cant do it always */
-                       if (ins->inst_c0 == 0 && (ins->next && INST_IGNORES_CFLAGS (ins->next))) {
-                               ins->opcode = CEE_XOR;
+                       if (ins->inst_c0 == 0 && (!ins->next || (ins->next && INST_IGNORES_CFLAGS (ins->next->opcode)))) {
+                               ins->opcode = OP_LXOR;
                                ins->sreg1 = ins->dreg;
                                ins->sreg2 = ins->dreg;
+                               /* Fall through */
+                       }
+                       else
+                               break;
+               case CEE_XOR:
+               case OP_LXOR:
+                       if ((ins->sreg1 == ins->sreg2) && (ins->sreg1 == ins->dreg)) {
+                               MonoInst *ins2;
+
+                               /* 
+                                * Replace STORE_MEMBASE_IMM 0 with STORE_MEMBASE_REG since 
+                                * the latter has length 2-3 instead of 6 (reverse constant
+                                * propagation). These instruction sequences are very common
+                                * in the initlocals bblock.
+                                */
+                               for (ins2 = ins->next; ins2; ins2 = ins2->next) {
+                                       if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM)) && (ins2->inst_imm == 0)) {
+                                               ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode);
+                                               ins2->sreg1 = ins->dreg;
+                                       } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG)) {
+                                               /* Continue */
+                                       } else if (((ins2->opcode == OP_ICONST) || (ins2->opcode == OP_I8CONST)) && (ins2->dreg == ins->dreg) && (ins2->inst_c0 == 0)) {
+                                               NULLIFY_INS (ins2);
+                                               /* Continue */
+                                       } else {
+                                               break;
+                                       }
+                               }
                        }
                        break;
+               case OP_IADD_IMM:
+                       if ((ins->inst_imm == 1) && (ins->dreg == ins->sreg1))
+                               ins->opcode = OP_X86_INC_REG;
+                       break;
+               case OP_ISUB_IMM:
+                       if ((ins->inst_imm == 1) && (ins->dreg == ins->sreg1))
+                               ins->opcode = OP_X86_DEC_REG;
+                       break;
                case OP_MUL_IMM: 
                        /* remove unnecessary multiplication with 1 */
                        if (ins->inst_imm == 1) {
@@ -1695,7 +1997,6 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 #endif
                        }
                        break;
-               case OP_LOADU1_MEMBASE:
                case OP_LOADI1_MEMBASE:
                        /* 
                         * Note: if reg1 = reg2 the load op is removed
@@ -1720,7 +2021,6 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                                }
                        }
                        break;
-               case OP_LOADU2_MEMBASE:
                case OP_LOADI2_MEMBASE:
                        /* 
                         * Note: if reg1 = reg2 the load op is removed
@@ -1748,6 +2048,7 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CONV_I4:
                case CEE_CONV_U4:
                case OP_MOVE:
+               case OP_FMOVE:
                        /*
                         * Removes:
                         *
@@ -1755,7 +2056,9 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                         */
                        if (ins->dreg == ins->sreg1) {
                                if (last_ins)
-                                       last_ins->next = ins->next;                             
+                                       last_ins->next = ins->next;
+                               else
+                                       bb->code = ins->next;
                                ins = ins->next;
                                continue;
                        }
@@ -1812,10 +2115,8 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
        MonoInst *ins, *temp, *last_ins = NULL;
        ins = bb->code;
 
-       if (bb->max_ireg > cfg->rs->next_vireg)
-               cfg->rs->next_vireg = bb->max_ireg;
-       if (bb->max_freg > cfg->rs->next_vfreg)
-               cfg->rs->next_vfreg = bb->max_freg;
+       if (bb->max_vreg > cfg->rs->next_vreg)
+               cfg->rs->next_vreg = bb->max_vreg;
 
        /*
         * FIXME: Need to add more instructions, but the current machine 
@@ -1884,8 +2185,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
        }
        bb->last_ins = last_ins;
 
-       bb->max_ireg = cfg->rs->next_vireg;
-       bb->max_freg = cfg->rs->next_vfreg;
+       bb->max_vreg = cfg->rs->next_vreg;
 }
 
 static const int 
@@ -1895,40 +2195,18 @@ branch_cc_table [] = {
        X86_CC_O, X86_CC_NO, X86_CC_C, X86_CC_NC
 };
 
-static int
-opcode_to_x86_cond (int opcode)
-{
-       switch (opcode) {
-       case OP_IBEQ:
-               return X86_CC_EQ;
-       case OP_IBNE_UN:
-               return X86_CC_NE;
-       case OP_IBLT:
-               return X86_CC_LT;
-       case OP_IBLT_UN:
-               return X86_CC_LT;
-       case OP_IBGT:
-               return X86_CC_GT;
-       case OP_IBGT_UN:
-               return X86_CC_GT;
-       case OP_IBGE:
-               return X86_CC_GE;
-       case OP_IBGE_UN:
-               return X86_CC_GE;
-       case OP_IBLE:
-               return X86_CC_LE;
-       case OP_IBLE_UN:
-               return X86_CC_LE;
-       case OP_COND_EXC_IOV:
-               return X86_CC_O;
-       case OP_COND_EXC_IC:
-               return X86_CC_C;
-       default:
-               g_assert_not_reached ();
-       }
+/* Maps CMP_... constants to X86_CC_... constants */
+static const int
+cc_table [] = {
+       X86_CC_EQ, X86_CC_NE, X86_CC_LE, X86_CC_GE, X86_CC_LT, X86_CC_GT,
+       X86_CC_LE, X86_CC_GE, X86_CC_LT, X86_CC_GT
+};
 
-       return -1;
-}
+static const int
+cc_signed_table [] = {
+       TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
+       FALSE, FALSE, FALSE, FALSE
+};
 
 /*#include "cprop.c"*/
 
@@ -1947,6 +2225,9 @@ mono_arch_local_regalloc (MonoCompile *cfg, MonoBasicBlock *bb)
 
        mono_arch_lowering_pass (cfg, bb);
 
+       if (cfg->opt & MONO_OPT_PEEPHOLE)
+               peephole_pass_1 (cfg, bb);
+
        mono_local_regalloc (cfg, bb);
 }
 
@@ -2108,7 +2389,7 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
        case OP_VCALL:
        case OP_VCALL_REG:
        case OP_VCALL_MEMBASE:
-               cinfo = get_call_info (((MonoCallInst*)ins)->signature, FALSE);
+               cinfo = get_call_info (cfg, cfg->mempool, ((MonoCallInst*)ins)->signature, FALSE);
                if (cinfo->ret.storage == ArgValuetypeInReg) {
                        /* Pop the destination address from the stack */
                        amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
@@ -2132,7 +2413,6 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
                                }
                        }
                }
-               g_free (cinfo);
                break;
        }
 
@@ -2183,7 +2463,7 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
 
        sig = mono_method_signature (method);
 
-       cinfo = get_call_info (sig, FALSE);
+       cinfo = cfg->arch.cinfo;
        
        /* This is the opposite of the code in emit_prolog */
 
@@ -2196,7 +2476,7 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = cinfo->args + i;
                MonoType *arg_type;
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
 
                if (sig->hasthis && (i == 0))
                        arg_type = &mono_defaults.object_class->byval_arg;
@@ -2229,8 +2509,6 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
                }
        }
 
-       g_free (cinfo);
-
        return code;
 }
 
@@ -2303,7 +2581,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        while (ins) {
                offset = code - cfg->native_code;
 
-               max_len = ((guint8 *)ins_spec [ins->opcode])[MONO_INST_LEN];
+               max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
 
                if (offset > (cfg->code_size - max_len - 16)) {
                        cfg->code_size *= 2;
@@ -2466,7 +2744,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_AMD64_ICOMPARE_REG_MEMBASE:
                        amd64_alu_reg_membase_size (code, X86_CMP, ins->sreg1, ins->sreg2, ins->inst_offset, 4);
                        break;
-               case CEE_BREAK:
+               case OP_BREAK:
                        amd64_breakpoint (code);
                        break;
                case OP_ADDCC:
@@ -2576,23 +2854,52 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                }
                case CEE_DIV:
                case OP_LDIV:
-                       amd64_cdq (code);
-                       amd64_div_reg (code, ins->sreg2, TRUE);
-                       break;
-               case CEE_DIV_UN:
-               case OP_LDIV_UN:
-                       amd64_alu_reg_reg (code, X86_XOR, AMD64_RDX, AMD64_RDX);
-                       amd64_div_reg (code, ins->sreg2, FALSE);
-                       break;
                case CEE_REM:
                case OP_LREM:
-                       amd64_cdq (code);
-                       amd64_div_reg (code, ins->sreg2, TRUE);
+                       /* Regalloc magic makes the div/rem cases the same */
+                       if (ins->sreg2 == AMD64_RDX) {
+                               amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RDX, 8);
+                               amd64_cdq (code);
+                               amd64_div_membase (code, AMD64_RSP, -8, TRUE);
+                       } else {
+                               amd64_cdq (code);
+                               amd64_div_reg (code, ins->sreg2, TRUE);
+                       }
                        break;
+               case CEE_DIV_UN:
+               case OP_LDIV_UN:
                case CEE_REM_UN:
                case OP_LREM_UN:
-                       amd64_alu_reg_reg (code, X86_XOR, AMD64_RDX, AMD64_RDX);
-                       amd64_div_reg (code, ins->sreg2, FALSE);
+                       if (ins->sreg2 == AMD64_RDX) {
+                               amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RDX, 8);
+                               amd64_alu_reg_reg (code, X86_XOR, AMD64_RDX, AMD64_RDX);
+                               amd64_div_membase (code, AMD64_RSP, -8, FALSE);
+                       } else {
+                               amd64_alu_reg_reg (code, X86_XOR, AMD64_RDX, AMD64_RDX);
+                               amd64_div_reg (code, ins->sreg2, FALSE);
+                       }
+                       break;
+               case OP_IDIV:
+               case OP_IREM:
+                       if (ins->sreg2 == AMD64_RDX) {
+                               amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RDX, 8);
+                               amd64_cdq_size (code, 4);
+                               amd64_div_membase_size (code, AMD64_RSP, -8, TRUE, 4);
+                       } else {
+                               amd64_cdq_size (code, 4);
+                               amd64_div_reg_size (code, ins->sreg2, TRUE, 4);
+                       }
+                       break;
+               case OP_IDIV_UN:
+               case OP_IREM_UN:
+                       if (ins->sreg2 == AMD64_RDX) {
+                               amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RDX, 8);
+                               amd64_alu_reg_reg (code, X86_XOR, AMD64_RDX, AMD64_RDX);
+                               amd64_div_membase_size (code, AMD64_RSP, -8, FALSE, 4);
+                       } else {
+                               amd64_alu_reg_reg (code, X86_XOR, AMD64_RDX, AMD64_RDX);
+                               amd64_div_reg_size (code, ins->sreg2, FALSE, 4);
+                       }
                        break;
                case OP_LMUL_OVF:
                        amd64_imul_reg_reg (code, ins->sreg1, ins->sreg2);
@@ -2606,6 +2913,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_reg_imm (code, X86_OR, ins->sreg1, ins->inst_imm);
                        break;
                case CEE_XOR:
+               case OP_LXOR:
                        amd64_alu_reg_reg (code, X86_XOR, ins->sreg1, ins->sreg2);
                        break;
                case OP_XOR_IMM:
@@ -2773,22 +3081,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
                        break;
                }
-               case OP_IDIV:
-                       amd64_cdq_size (code, 4);
-                       amd64_div_reg_size (code, ins->sreg2, TRUE, 4);
-                       break;
-               case OP_IDIV_UN:
-                       amd64_alu_reg_reg (code, X86_XOR, AMD64_RDX, AMD64_RDX);
-                       amd64_div_reg_size (code, ins->sreg2, FALSE, 4);
-                       break;
-               case OP_IREM:
-                       amd64_cdq_size (code, 4);
-                       amd64_div_reg_size (code, ins->sreg2, TRUE, 4);
-                       break;
-               case OP_IREM_UN:
-                       amd64_alu_reg_reg (code, X86_XOR, AMD64_RDX, AMD64_RDX);
-                       amd64_div_reg_size (code, ins->sreg2, FALSE, 4);
-                       break;
                case OP_ICOMPARE:
                        amd64_alu_reg_reg_size (code, X86_CMP, ins->sreg1, ins->sreg2, 4);
                        break;
@@ -2800,22 +3092,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_IBGT:
                case OP_IBGE:
                case OP_IBLE:
-                       EMIT_COND_BRANCH (ins, opcode_to_x86_cond (ins->opcode), TRUE);
-                       break;
                case OP_IBNE_UN:
                case OP_IBLT_UN:
                case OP_IBGT_UN:
                case OP_IBGE_UN:
                case OP_IBLE_UN:
-                       EMIT_COND_BRANCH (ins, opcode_to_x86_cond (ins->opcode), FALSE);
-                       break;
-               case OP_COND_EXC_IOV:
-                       EMIT_COND_SYSTEM_EXCEPTION (opcode_to_x86_cond (ins->opcode),
-                                                                               TRUE, ins->inst_p1);
-                       break;
-               case OP_COND_EXC_IC:
-                       EMIT_COND_SYSTEM_EXCEPTION (opcode_to_x86_cond (ins->opcode),
-                                                                               FALSE, ins->inst_p1);
+                       EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        break;
                case CEE_NOT:
                        amd64_not_reg (code, ins->sreg1);
@@ -2871,7 +3153,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                }
-               case CEE_JMP: {
+               case OP_JMP: {
                        /*
                         * Note: this 'frame destruction' logic is useful for tail calls, too.
                         * Keep in sync with the code in emit_epilog.
@@ -3063,7 +3345,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_RET:
                        amd64_ret (code);
                        break;
-               case CEE_THROW: {
+               case OP_THROW: {
                        amd64_mov_reg_reg (code, AMD64_RDI, ins->sreg1, 8);
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
                                             (gpointer)"mono_arch_throw_exception");
@@ -3083,10 +3365,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        /* Restore stack alignment */
                        amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
                        break;
+
                case OP_LABEL:
                        ins->inst_c0 = code - cfg->native_code;
                        break;
-               case CEE_BR:
+               case OP_NOP:
+                       break;
+               case OP_BR:
                        //g_print ("target: %p, next: %p, curr: %p, last: %p\n", ins->inst_target_bb, bb->next_bb, ins, bb->last_ins);
                        //if ((ins->inst_target_bb == bb->next_bb) && ins == bb->last_ins)
                        //break;
@@ -3119,27 +3404,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_CEQ:
                case OP_ICEQ:
-                       amd64_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
                case OP_CLT:
                case OP_ICLT:
-                       amd64_set_reg (code, X86_CC_LT, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
-               case OP_CLT_UN:
-               case OP_ICLT_UN:
-                       amd64_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
                case OP_CGT:
                case OP_ICGT:
-                       amd64_set_reg (code, X86_CC_GT, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-                       break;
+               case OP_CLT_UN:
+               case OP_ICLT_UN:
                case OP_CGT_UN:
                case OP_ICGT_UN:
-                       amd64_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
+                       amd64_set_reg (code, cc_table [mono_opcode_to_cond (ins->opcode)], ins->dreg, cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
                        break;
                case OP_COND_EXC_EQ:
@@ -3152,6 +3425,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_COND_EXC_GE_UN:
                case OP_COND_EXC_LE:
                case OP_COND_EXC_LE_UN:
+                       EMIT_COND_SYSTEM_EXCEPTION (cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)], ins->inst_p1);
+                       break;
                case OP_COND_EXC_OV:
                case OP_COND_EXC_NO:
                case OP_COND_EXC_C:
@@ -3159,6 +3434,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_EQ], 
                                                    (ins->opcode < OP_COND_EXC_NE_UN), ins->inst_p1);
                        break;
+               case OP_COND_EXC_IOV:
+               case OP_COND_EXC_IC:
+                       EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_IEQ], 
+                                                   (ins->opcode < OP_COND_EXC_INE_UN), ins->inst_p1);
+                       break;
                case CEE_BEQ:
                case CEE_BNE_UN:
                case CEE_BLT:
@@ -3169,7 +3449,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_BGE_UN:
                case CEE_BLE:
                case CEE_BLE_UN:
-                       EMIT_COND_BRANCH (ins, branch_cc_table [ins->opcode - CEE_BEQ], (ins->opcode < CEE_BNE_UN));
+                       EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        break;
 
                /* floating point opcodes */
@@ -3326,7 +3606,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        br = code; x86_branch8 (code, X86_CC_GEZ, 0, TRUE);
        
                        /* add correction constant mn */
-                       x86_fld80_mem (code, mn);
+                       x86_fld80_mem (code, (gssize)mn);
                        x86_fld80_membase (code, AMD64_RSP, 0);
                        amd64_fp_op_reg (code, X86_FADD, 1, TRUE);
                        x86_fst80_membase (code, AMD64_RSP, 0);
@@ -3747,7 +4027,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_FBGT:
                case OP_FBGT_UN:
                        if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
+                               if (ins->opcode == OP_FBGT) {
+                                       guchar *br1;
+
+                                       /* skip branch if C1=1 */
+                                       br1 = code;
+                                       x86_branch8 (code, X86_CC_P, 0, FALSE);
+                                       /* branch if (C0 | C3) = 1 */
+                                       EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
+                                       amd64_patch (br1, code);
+                                       break;
+                               } else {
+                                       EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
+                               }
                                break;
                        }
                        amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
@@ -3817,7 +4109,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
                        EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
                        break;
-               case CEE_CKFINITE: {
+               case OP_CKFINITE: {
                        if (use_sse2) {
                                /* Transfer value to the fp stack */
                                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 16);
@@ -3900,6 +4192,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                         * hack to overcome limits in x86 reg allocator 
                         * (req: dreg == eax and sreg2 != eax and breg != eax) 
                         */
+                       /* The pushes invalidate rsp */
+                       if ((breg == AMD64_RAX) || (breg == AMD64_RSP)) {
+                               amd64_mov_reg_reg (code, AMD64_R11, breg, 8);
+                               breg = AMD64_R11;
+                       }
+
                        if (ins->dreg != AMD64_RAX)
                                amd64_push_reg (code, AMD64_RAX);
                        
@@ -3910,11 +4208,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                sreg2 = AMD64_RDX;
                        }
 
-                       if (breg == AMD64_RAX) {
-                               amd64_mov_reg_reg (code, AMD64_R11, AMD64_RAX, 8);
-                               breg = AMD64_R11;
-                       }
-
                        amd64_mov_reg_membase (code, AMD64_RAX, breg, ins->inst_offset, size);
 
                        br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
@@ -4023,6 +4316,19 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono
        }
 }
 
+/*
+ * This macro is used for testing whether the unwinder works correctly at every point
+ * where an async exception can happen.
+ */
+/* This will generate a SIGSEGV at the given point in the code */
+#define async_exc_point(code) do { \
+    if (mono_inject_async_exc_method && mono_method_desc_full_match (mono_inject_async_exc_method, cfg->method)) { \
+         if (cfg->arch.async_point_count == mono_inject_async_exc_pos) \
+             amd64_mov_reg_mem (code, AMD64_RAX, 0, 4); \
+         cfg->arch.async_point_count ++; \
+    } \
+} while (0)
+
 guint8 *
 mono_arch_emit_prolog (MonoCompile *cfg)
 {
@@ -4035,7 +4341,11 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        CallInfo *cinfo;
        gint32 lmf_offset = cfg->arch.lmf_offset;
 
-       cfg->code_size =  MAX (((MonoMethodNormal *)method)->header->code_size * 4, 512);
+       cfg->code_size =  MAX (((MonoMethodNormal *)method)->header->code_size * 4, 1024);
+
+       if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
+               cfg->code_size += 512;
+
        code = cfg->native_code = g_malloc (cfg->code_size);
 
        /* Amount of stack space allocated by register saving code */
@@ -4054,9 +4364,13 @@ mono_arch_emit_prolog (MonoCompile *cfg)
         * - save callee saved regs using moves
         */
 
+       async_exc_point (code);
+
        if (!cfg->arch.omit_fp) {
                amd64_push_reg (code, AMD64_RBP);
+               async_exc_point (code);
                amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
+               async_exc_point (code);
        }
 
        /* Save callee saved registers */
@@ -4065,6 +4379,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_push_reg (code, i);
                                pos += sizeof (gpointer);
+                               async_exc_point (code);
                        }
        }
 
@@ -4089,13 +4404,17 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                guint32 remaining_size = alloc_size;
                while (remaining_size >= 0x1000) {
                        amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 0x1000);
+                       async_exc_point (code);
                        amd64_test_membase_reg (code, AMD64_RSP, 0, AMD64_RSP);
                        remaining_size -= 0x1000;
                }
-               if (remaining_size)
+               if (remaining_size) {
                        amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, remaining_size);
+                       async_exc_point (code);
+               }
 #else
                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, alloc_size);
+               async_exc_point (code);
 #endif
        }
 
@@ -4119,14 +4438,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), AMD64_RBP, 8);
                /* Save sp */
                amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
-               /* Save method */
-               /* FIXME: add a relocation for this */
-               if (IS_IMM32 (cfg->method))
-                       amd64_mov_membase_imm (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), (guint64)cfg->method, 8);
-               else {
-                       amd64_mov_reg_imm (code, AMD64_R11, cfg->method);
-                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8);
-               }
+               /* Skip method (only needed for trampoline LMF frames) */
                /* Save callee saved regs */
                amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
                amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
@@ -4146,6 +4458,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_mov_membase_reg (code, AMD64_RSP, save_area_offset, i, 8);
                                save_area_offset += 8;
+                               async_exc_point (code);
                        }
        }
 
@@ -4166,7 +4479,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                if (ins->opcode == OP_LABEL)
                                        ins->inst_c1 = max_offset;
                                
-                               max_offset += ((guint8 *)ins_spec [ins->opcode])[MONO_INST_LEN];
+                               max_offset += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
                                ins = ins->next;
                        }
                }
@@ -4175,7 +4488,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        sig = mono_method_signature (method);
        pos = 0;
 
-       cinfo = get_call_info (sig, FALSE);
+       cinfo = cfg->arch.cinfo;
 
        if (sig->ret->type != MONO_TYPE_VOID) {
                if ((cinfo->ret.storage == ArgInIReg) && (cfg->ret->opcode != OP_REGVAR)) {
@@ -4189,7 +4502,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                ArgInfo *ainfo = cinfo->args + i;
                gint32 stack_offset;
                MonoType *arg_type;
-               inst = cfg->varinfo [i];
+               inst = cfg->args [i];
 
                if (sig->hasthis && (i == 0))
                        arg_type = &mono_defaults.object_class->byval_arg;
@@ -4271,10 +4584,10 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                 * The call might clobber argument registers, but they are already
                 * saved to the stack/global regs.
                 */
-               if (lmf_tls_offset != -1) {
+               if (lmf_addr_tls_offset != -1) {
                        guint8 *buf;
 
-                       code = emit_tls_get ( code, AMD64_RAX, lmf_tls_offset);
+                       code = emit_tls_get ( code, AMD64_RAX, lmf_addr_tls_offset);
                        amd64_test_reg_reg (code, AMD64_RAX, AMD64_RAX);
                        buf = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
@@ -4295,33 +4608,46 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        }
 
        if (method->save_lmf) {
-               if (lmf_tls_offset != -1) {
-                       /* Load lmf quicky using the FS register */
-                       code = emit_tls_get (code, AMD64_RAX, lmf_tls_offset);
-               }
-               else {
-                       /* 
-                        * The call might clobber argument registers, but they are already
-                        * saved to the stack/global regs.
+               if ((lmf_tls_offset != -1) && !optimize_for_xen) {
+                       /*
+                        * Optimized version which uses the mono_lmf TLS variable instead of indirection
+                        * through the mono_lmf_addr TLS variable.
                         */
+                       /* %rax = previous_lmf */
+                       x86_prefix (code, X86_FS_PREFIX);
+                       amd64_mov_reg_mem (code, AMD64_RAX, lmf_tls_offset, 8);
+
+                       /* Save previous_lmf */
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_RAX, 8);
+                       /* Set new lmf */
+                       amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
+                       x86_prefix (code, X86_FS_PREFIX);
+                       amd64_mov_mem_reg (code, lmf_tls_offset, AMD64_R11, 8);
+               } else {
+                       if (lmf_addr_tls_offset != -1) {
+                               /* Load lmf quickly using the FS register */
+                               code = emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
+                       }
+                       else {
+                               /* 
+                                * The call might clobber argument registers, but they are already
+                                * saved to the stack/global regs.
+                                */
+                               code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
+                                                                 (gpointer)"mono_get_lmf_addr");               
+                       }
 
-                       code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
-                                                                (gpointer)"mono_get_lmf_addr");                
+                       /* Save lmf_addr */
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
+                       /* Save previous_lmf */
+                       amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8);
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
+                       /* Set new lmf */
+                       amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
+                       amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8);
                }
-
-               /* Save lmf_addr */
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
-               /* Save previous_lmf */
-               amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
-               /* Set new lmf */
-               amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
-               amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8);
        }
 
-
-       g_free (cinfo);
-
        if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
                code = mono_arch_instrument_prolog (cfg, mono_trace_enter_method, code, TRUE);
 
@@ -4364,14 +4690,25 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
                code = mono_arch_instrument_epilog (cfg, mono_trace_leave_method, code, TRUE);
 
-       /* the code restoring the registers must be kept in sync with CEE_JMP */
+       /* the code restoring the registers must be kept in sync with OP_JMP */
        pos = 0;
        
        if (method->save_lmf) {
-               /* Restore previous lmf */
-               amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
-               amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
-               amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8);
+               if ((lmf_tls_offset != -1) && !optimize_for_xen) {
+                       /*
+                        * Optimized version which uses the mono_lmf TLS variable instead of indirection
+                        * through the mono_lmf_addr TLS variable.
+                        */
+                       /* reg = previous_lmf */
+                       amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
+                       x86_prefix (code, X86_FS_PREFIX);
+                       amd64_mov_mem_reg (code, lmf_tls_offset, AMD64_R11, 8);
+               } else {
+                       /* Restore previous lmf */
+                       amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
+                       amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
+                       amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8);
+               }
 
                /* Restore caller saved regs */
                if (cfg->used_int_regs & (1 << AMD64_RBP)) {
@@ -4430,7 +4767,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        }
 
        /* Load returned vtypes into registers if needed */
-       cinfo = get_call_info (mono_method_signature (method), FALSE);
+       cinfo = cfg->arch.cinfo;
        if (cinfo->ret.storage == ArgValuetypeInReg) {
                ArgInfo *ainfo = &cinfo->ret;
                MonoInst *inst = cfg->ret;
@@ -4453,7 +4790,6 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                        }
                }
        }
-       g_free (cinfo);
 
        if (cfg->arch.omit_fp) {
                if (cfg->arch.stack_alloc_size)
@@ -4461,6 +4797,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        } else {
                amd64_leave (code);
        }
+       async_exc_point (code);
        amd64_ret (code);
 
        cfg->code_len = code - cfg->native_code;
@@ -4582,10 +4919,15 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
 
                        pos = cfg->native_code + patch_info->ip.i;
 
-                       if (use_sse2)
-                               *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
-                       else
+
+                       if (use_sse2) {
+                               if (IS_REX (pos [1]))
+                                       *(guint32*)(pos + 5) = (guint8*)code - pos - 9;
+                               else
+                                       *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
+                       } else {
                                *(guint32*)(pos + 3) = (guint8*)code - pos - 7;
+                       }
 
                        if (patch_info->type == MONO_PATCH_INFO_R8) {
                                *(double*)code = *(double*)patch_info->data.target;
@@ -4636,7 +4978,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
                /* Allocate a new area on the stack and save arguments there */
                sig = mono_method_signature (cfg->method);
 
-               cinfo = get_call_info (sig, FALSE);
+               cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
 
                n = sig->param_count + sig->hasthis;
 
@@ -4645,7 +4987,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, stack_area);
 
                for (i = 0; i < n; ++i) {
-                       inst = cfg->varinfo [i];
+                       inst = cfg->args [i];
 
                        if (inst->opcode == OP_REGVAR)
                                amd64_mov_membase_reg (code, AMD64_RSP, (i * 8), inst->dreg, 8);
@@ -4661,12 +5003,9 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
        amd64_mov_reg_reg (code, AMD64_RSI, AMD64_RSP, 8);
        code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func);
 
-       if (enable_arguments) {
+       if (enable_arguments)
                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, stack_area);
 
-               g_free (cinfo);
-       }
-
        return code;
 }
 
@@ -4796,8 +5135,6 @@ mono_arch_is_inst_imm (gint64 imm)
        return amd64_is_imm32 (imm);
 }
 
-#define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f))
-
 /*
  * Determine whenever the trap whose info is in SIGINFO is caused by
  * integer overflow.
@@ -4884,7 +5221,7 @@ gpointer*
 mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
 {
        guint32 reg;
-       guint32 disp;
+       gint32 disp;
        guint8 rex = 0;
 
        /* go to the start of the call instruction
@@ -4901,7 +5238,22 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
         * really careful about the ordering of the cases. Longer sequences
         * come first.
         */
-       if ((code [-1] == 0x8b) && (amd64_modrm_mod (code [0]) == 0x2) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
+#ifdef MONO_ARCH_HAVE_IMT
+       if ((code [-2] == 0x41) && (code [-1] == 0xbb) && (code [4] == 0xff) && (x86_modrm_mod (code [5]) == 1) && (x86_modrm_reg (code [5]) == 2) && ((signed char)code [6] < 0)) {
+               /* IMT-based interface calls: with MONO_ARCH_IMT_REG == r11
+                * 41 bb 14 f8 28 08       mov    $0x828f814,%r11d
+                * ff 50 fc                call   *0xfffffffc(%rax)
+                */
+               reg = amd64_modrm_rm (code [5]);
+               disp = (signed char)code [6];
+               /* R10 is clobbered by the IMT thunk code */
+               g_assert (reg != AMD64_R10);
+       }
+#else
+       if (0) {
+       }
+#endif
+       else if ((code [-1] == 0x8b) && (amd64_modrm_mod (code [0]) == 0x2) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
                        /*
                         * This is a interface call
                         * 48 8b 80 f0 e8 ff ff   mov    0xffffffffffffe8f0(%rax),%rax
@@ -4911,8 +5263,9 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
                        rex = code [4];
                reg = amd64_modrm_rm (code [6]);
                disp = 0;
-       }
-       else if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
+               /* R10 is clobbered by the IMT thunk code */
+               g_assert (reg != AMD64_R10);
+       } else if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
                /* call OFFSET(%rip) */
                disp = *(guint32*)(code + 3);
                return (gpointer*)(code + disp + 7);
@@ -4922,8 +5275,9 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
                if (IS_REX (code [0]))
                        rex = code [0];
                reg = amd64_modrm_rm (code [2]);
-               disp = *(guint32*)(code + 3);
-               //printf ("B: [%%r%d+0x%x]\n", reg, disp);
+               disp = *(gint32*)(code + 3);
+               /* R10 is clobbered by the IMT thunk code */
+               g_assert (reg != AMD64_R10);
        }
        else if (code [2] == 0xe8) {
                /* call <ADDR> */
@@ -4938,7 +5292,7 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
                if (IS_REX (code [3]))
                        rex = code [3];
                reg = amd64_modrm_rm (code [5]);
-               disp = *(guint8*)(code + 6);
+               disp = *(gint8*)(code + 6);
                //printf ("B: [%%r%d+0x%x]\n", reg, disp);
        }
        else if ((code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
@@ -4963,26 +5317,84 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
        return (gpointer)(((guint64)(regs [reg])) + disp);
 }
 
-gpointer*
-mono_arch_get_delegate_method_ptr_addr (guint8* code, gpointer *regs)
+/*
+ * mono_arch_get_this_arg_from_call:
+ *
+ *   Return the 'this' argument of a call, given the saved integer register
+ * state in REGS.  The CODE argument is not used on amd64.
+ */
+gpointer
+mono_arch_get_this_arg_from_call (MonoMethodSignature *sig, gssize *regs, guint8 *code)
 {
-       guint32 reg;
-       guint32 disp;
+       /* A value-type return puts the hidden return-buffer pointer in the
+        * first argument register (RDI), shifting 'this' into the second
+        * one (RSI) — System V AMD64 calling convention. */
+       if (MONO_TYPE_ISSTRUCT (sig->ret))
+               return (gpointer)regs [AMD64_RSI];
+       else
+               return (gpointer)regs [AMD64_RDI];
+}
 
-       code -= 10;
+#define MAX_ARCH_DELEGATE_PARAMS 10
+
+/*
+ * mono_arch_get_delegate_invoke_impl:
+ *
+ *   Return a small cached native trampoline used as the invoke
+ * implementation of a delegate.  With HAS_TARGET the trampoline replaces
+ * the 'this' register with delegate->target and tail-jumps to
+ * delegate->method_ptr; without a target it shifts the register arguments
+ * left by one slot and performs the same jump.  Returns NULL for
+ * signatures this fast path does not handle (struct returns, non-regsize
+ * arguments, more than 4 parameters).  Both caches are guarded by the
+ * arch mutex.
+ */
+gpointer
+mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_target)
+{
+       guint8 *code, *start;
+       int i;
+
+       if (sig->param_count > MAX_ARCH_DELEGATE_PARAMS)
+               return NULL;
+
+       /* FIXME: Support more cases */
+       if (MONO_TYPE_ISSTRUCT (sig->ret))
+               return NULL;
+
+       if (has_target) {
+               static guint8* cached = NULL;
+               mono_mini_arch_lock ();
+               if (cached) {
+                       mono_mini_arch_unlock ();
+                       return cached;
+               }
+
+               start = code = mono_global_codeman_reserve (64);
+
+               /* Replace the this argument with the target */
+               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RDI, 8);
+               amd64_mov_reg_membase (code, AMD64_RDI, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, target), 8);
+               amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
 
-       if (IS_REX (code [0]) && (code [1] == 0x8b) && (code [3] == 0x48) && (code [4] == 0x8b) && (code [5] == 0x40) && (code [7] == 0x48) && (code [8] == 0xff) && (code [9] == 0xd0)) {
-               /* mov REG, %rax; mov <OFFSET>(%rax), %rax; call *%rax */
-               reg = amd64_rex_b (code [0]) + amd64_modrm_rm (code [2]);
-               disp = code [6];
+               g_assert ((code - start) < 64);
 
-               if (reg == AMD64_RAX)
+               cached = start;
+               mono_mini_arch_unlock ();
+       } else {
+               static guint8* cache [MAX_ARCH_DELEGATE_PARAMS + 1] = {NULL};
+               for (i = 0; i < sig->param_count; ++i)
+                       if (!mono_is_regsize_var (sig->params [i]))
+                               return NULL;
+               if (sig->param_count > 4)
                        return NULL;
-               else
-                       return (gpointer*)(((guint64)(regs [reg])) + disp);
+
+               mono_mini_arch_lock ();
+               code = cache [sig->param_count];
+               if (code) {
+                       mono_mini_arch_unlock ();
+                       return code;
+               }
+
+               start = code = mono_global_codeman_reserve (64);
+
+               if (sig->param_count == 0) {
+                       amd64_jump_membase (code, AMD64_RDI, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               } else {
+                       /* We have to shift the arguments left */
+                       amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RDI, 8);
+                       /* arg i+1 -> arg i; NOTE(review): assumes param_regs has
+                        * at least param_count+1 entries — holds with the <= 4
+                        * cap above on the 6-register SysV convention. */
+                       for (i = 0; i < sig->param_count; ++i)
+                               amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
+
+                       amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               }
+               g_assert ((code - start) < 64);
+
+               cache [sig->param_count] = start;
+               
+               mono_mini_arch_unlock ();
        }
 
-       return NULL;
+       return start;
 }
 
 /*
@@ -5001,7 +5413,8 @@ mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
                optimize_for_xen = access ("/proc/xen", F_OK) == 0;
 #endif
                appdomain_tls_offset = mono_domain_get_tls_offset ();
-               lmf_tls_offset = mono_get_lmf_tls_offset ();
+               lmf_tls_offset = mono_get_lmf_tls_offset ();
+               lmf_addr_tls_offset = mono_get_lmf_addr_tls_offset ();
                thread_tls_offset = mono_thread_get_tls_offset ();
        }               
 }
@@ -5015,7 +5428,7 @@ void
 mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
 {
        MonoCallInst *call = (MonoCallInst*)inst;
-       CallInfo * cinfo = get_call_info (inst->signature, FALSE);
+       CallInfo * cinfo = get_call_info (cfg, cfg->mempool, inst->signature, FALSE);
 
        if (vt_reg != -1) {
                MonoInst *vtarg;
@@ -5054,10 +5467,199 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
 
                mono_call_inst_add_outarg_reg (cfg, call, this->dreg, cinfo->args [0].reg, FALSE);
        }
+}
 
-       g_free (cinfo);
+#ifdef MONO_ARCH_HAVE_IMT
+
+/* Worst-case encoded sizes (in bytes) of the instructions emitted by
+ * mono_arch_build_imt_thunk below, used to pre-compute chunk sizes. */
+#define CMP_SIZE (6 + 1)
+#define CMP_REG_REG_SIZE (4 + 1)
+#define BR_SMALL_SIZE 2
+#define BR_LARGE_SIZE 6
+#define MOV_REG_IMM_SIZE 10
+#define MOV_REG_IMM_32BIT_SIZE 6
+#define JUMP_REG_SIZE (2 + 1)
+
+/*
+ * imt_branch_distance:
+ *
+ *   Return the number of code bytes between IMT entry START and entry
+ * TARGET, i.e. the sum of the already-computed chunk sizes in between.
+ * Used to decide whether a forward branch fits in an imm8 displacement.
+ */
+static int
+imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target)
+{
+       int i, distance = 0;
+       for (i = start; i < target; ++i)
+               distance += imt_entries [i]->chunk_size;
+       return distance;
+}
+
+/*
+ * mono_arch_build_imt_thunk:
+ *
+ *   Build the native search thunk for an IMT slot: a chain of
+ * compare/branch/jump chunks, one per entry of IMT_ENTRIES.  A first pass
+ * accumulates a conservative size for each chunk so the right amount of
+ * code memory can be reserved, a second pass emits the code, and a final
+ * pass back-patches the forward branches between chunks.
+ *
+ * LOCKING: called with the domain lock held
+ */
+gpointer
+mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count)
+{
+       int i;
+       int size = 0;
+       guint8 *code, *start;
+       /* a vtable address that fits in 32 bits can be loaded with the
+        * shorter mov-imm32 encoding */
+       gboolean vtable_is_32bit = ((long)(vtable) == (long)(int)(long)(vtable));
+
+       /* Pass 1: compute a worst-case chunk size for every entry */
+       for (i = 0; i < count; ++i) {
+               MonoIMTCheckItem *item = imt_entries [i];
+               if (item->is_equals) {
+                       if (item->check_target_idx) {
+                               if (!item->compare_done) {
+                                       if (amd64_is_imm32 (item->method))
+                                               item->chunk_size += CMP_SIZE;
+                                       else
+                                               item->chunk_size += MOV_REG_IMM_SIZE + CMP_REG_REG_SIZE;
+                               }
+                               if (vtable_is_32bit)
+                                       item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
+                               else
+                                       item->chunk_size += MOV_REG_IMM_SIZE;
+                               item->chunk_size += BR_SMALL_SIZE + JUMP_REG_SIZE;
+                       } else {
+                               if (vtable_is_32bit)
+                                       item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
+                               else
+                                       item->chunk_size += MOV_REG_IMM_SIZE;
+                               item->chunk_size += JUMP_REG_SIZE;
+                               /* with assert below:
+                                * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
+                                */
+                       }
+               } else {
+                       if (amd64_is_imm32 (item->method))
+                               item->chunk_size += CMP_SIZE;
+                       else
+                               item->chunk_size += MOV_REG_IMM_SIZE + CMP_REG_REG_SIZE;
+                       item->chunk_size += BR_LARGE_SIZE;
+                       imt_entries [item->check_target_idx]->compare_done = TRUE;
+               }
+               size += item->chunk_size;
+       }
+       /* Pass 2: emit the chunks into memory reserved from the domain's
+        * code manager */
+       code = mono_code_manager_reserve (domain->code_mp, size);
+       start = code;
+       for (i = 0; i < count; ++i) {
+               MonoIMTCheckItem *item = imt_entries [i];
+               item->code_target = code;
+               if (item->is_equals) {
+                       if (item->check_target_idx) {
+                               if (!item->compare_done) {
+                                       if (amd64_is_imm32 (item->method))
+                                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                                       else {
+                                               amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                                               amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
+                                       }
+                               }
+                               item->jmp_code = code;
+                               amd64_branch8 (code, X86_CC_NE, 0, FALSE);
+                               amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->vtable_slot]));
+                               amd64_jump_membase (code, AMD64_R11, 0);
+                       } else {
+                               /* enable the commented code to assert on wrong method */
+#if 0
+                               if (amd64_is_imm32 (item->method))
+                                       amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                               else {
+                                       amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                                       amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
+                               }
+                               item->jmp_code = code;
+                               amd64_branch8 (code, X86_CC_NE, 0, FALSE);
+                               amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->vtable_slot]));
+                               amd64_jump_membase (code, AMD64_R11, 0);
+                               amd64_patch (item->jmp_code, code);
+                               amd64_breakpoint (code);
+                               item->jmp_code = NULL;
+#else
+                               amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->vtable_slot]));
+                               amd64_jump_membase (code, AMD64_R11, 0);
+#endif
+                       }
+               } else {
+                       if (amd64_is_imm32 (item->method))
+                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                       else {
+                               amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                               amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
+                       }
+                       item->jmp_code = code;
+                       if (x86_is_imm8 (imt_branch_distance (imt_entries, i, item->check_target_idx)))
+                               x86_branch8 (code, X86_CC_GE, 0, FALSE);
+                       else
+                               x86_branch32 (code, X86_CC_GE, 0, FALSE);
+               }
+               g_assert (code - item->code_target <= item->chunk_size);
+       }
+       /* patch the branches to get to the target items */
+       for (i = 0; i < count; ++i) {
+               MonoIMTCheckItem *item = imt_entries [i];
+               if (item->jmp_code) {
+                       if (item->check_target_idx) {
+                               amd64_patch (item->jmp_code, imt_entries [item->check_target_idx]->code_target);
+                       }
+               }
+       }
+               
+       mono_stats.imt_thunks_size += code - start;
+       g_assert (code - start <= size);
+
+       return start;
+}
+
+/*
+ * mono_arch_find_imt_method:
+ *
+ *   Recover the interface MonoMethod* of an IMT call by decoding the
+ * caller's instruction stream (CODE points just past the call).
+ */
+MonoMethod*
+mono_arch_find_imt_method (gpointer *regs, guint8 *code)
+{
+       /* 
+        * R11 is clobbered by the trampoline code, so we have to retrieve the method 
+        * from the code.
+        * 41 bb c0 f7 89 00     mov    $0x89f7c0,%r11d
+        * ff 90 68 ff ff ff     callq  *0xffffffffffffff68(%rax)
+        */
+       /* Similar to get_vcall_slot_addr () */
+
+       /* Find the start of the call instruction */
+       code -= 7;
+       if ((code [-2] == 0x41) && (code [-1] == 0xbb) && (code [4] == 0xff) && (x86_modrm_mod (code [5]) == 1) && (x86_modrm_reg (code [5]) == 2) && ((signed char)code [6] < 0)) {
+               /* IMT-based interface calls
+                * 41 bb 14 f8 28 08       mov    $0x828f814,%r11d
+                * ff 50 fc                call   *0xfffffffc(%rax)
+                */
+               code += 4;
+       } else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) {
+               /* call *[reg+disp32] */
+               code += 1;
+       } else if ((code [4] == 0xff) && (amd64_modrm_reg (code [5]) == 0x2) && (amd64_modrm_mod (code [5]) == 0x1)) {
+               /* call *[reg+disp8] */
+               code += 4;
+       } else
+               g_assert_not_reached ();
+
+       /* Find the start of the mov instruction: back up by the longest
+        * encoding (the 10-byte 49 bb movabs); the shorter encodings are
+        * recognized below by matching at fixed offsets into the window. */
+       code -= 10;
+       if (code [0] == 0x49 && code [1] == 0xbb) {
+               return (MonoMethod*)*(gssize*)(code + 2);
+       } else if (code [3] == 0x4d && code [4] == 0x8b && code [5] == 0x1d) {
+               /* mov    <OFFSET>(%rip),%r11 */
+               return (MonoMethod*)*(gssize*)(code + 10 + *(guint32*)(code + 6));
+       } else if (code [4] == 0x41 && code [5] == 0xbb) {
+               /* mov imm32, %r11d */
+               return (MonoMethod*)(gssize)*(guint32*)(code + 6);
+       } else {
+               int i;
+
+               printf ("Unknown call sequence: ");
+               for (i = -10; i < 20; ++i)
+                       printf ("%x ", code [i]);
+               g_assert_not_reached ();
+               return NULL;
+       }
+}
 
+/*
+ * mono_arch_find_this_argument:
+ *
+ *   Return the 'this' object of the call being dispatched, by delegating
+ * to mono_arch_get_this_arg_from_call with the method's signature.
+ */
+MonoObject*
+mono_arch_find_this_argument (gpointer *regs, MonoMethod *method)
+{
+       return mono_arch_get_this_arg_from_call (mono_method_signature (method), (gssize*)regs, NULL);
+}
+#endif
+
 MonoInst*
 mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
 {