2005-12-12 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-amd64.c
index f9799ba0210821af43aacca55f678da8deba6b7a..b4a530e6c04326f327817a375981bb294409c11c 100644 (file)
 #include "mini.h"
 #include <string.h>
 #include <math.h>
-#include <unistd.h>
-#include <sys/mman.h>
 
 #include <mono/metadata/appdomain.h>
 #include <mono/metadata/debug-helpers.h>
 #include <mono/metadata/threads.h>
 #include <mono/metadata/profiler-private.h>
+#include <mono/metadata/mono-debug.h>
 #include <mono/utils/mono-math.h>
 
 #include "trace.h"
@@ -47,8 +46,6 @@ static const char*const * ins_spec = amd64_desc;
 #define CALLCONV_IS_STDCALL(call_conv) ((call_conv) == MONO_CALL_STDCALL)
 #endif
 
-#define SIGNAL_STACK_SIZE (64 * 1024)
-
 #define ARGS_OFFSET 16
 #define GP_SCRATCH_REG AMD64_R11
 
@@ -108,6 +105,41 @@ mono_arch_fregname (int reg)
                return "unknown";
 }
 
+/*
+ * break_count:
+ *
+ *   Empty function used purely as a debugger breakpoint target: debug_count ()
+ * calls it when the invocation count matches the COUNT environment variable,
+ * so a breakpoint set here stops on the interesting compilation.
+ */
+G_GNUC_UNUSED static void
+break_count (void)
+{
+}
+
+/*
+ * debug_count:
+ *
+ *   Debugging aid for bisecting miscompilations: counts its own invocations
+ * and returns FALSE once the count exceeds the value of the COUNT environment
+ * variable, calling break_count () on the exact match so a debugger
+ * breakpoint can be set there.  Always returns TRUE when COUNT is unset.
+ */
+G_GNUC_UNUSED static gboolean
+debug_count (void)
+{
+       static int count = 0;
+       const char *limit_str;
+       int limit;
+
+       count ++;
+
+       /* Read COUNT once instead of calling getenv ()/atoi () repeatedly */
+       limit_str = getenv ("COUNT");
+       if (!limit_str)
+               return TRUE;
+
+       limit = atoi (limit_str);
+
+       if (count == limit) {
+               break_count ();
+       }
+
+       if (count > limit) {
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
+/*
+ * debug_omit_fp:
+ *
+ *   Whether frame pointer omission should be attempted at all.  The disabled
+ * #if 0 branch allows bisecting omit-fp related failures per compiled method
+ * via debug_count () and the COUNT environment variable.
+ */
+static gboolean
+debug_omit_fp (void)
+{
+#if 0
+       return debug_count ();
+#else
+       return TRUE;
+#endif
+}
+
 static inline void 
 amd64_patch (unsigned char* code, gpointer target)
 {
@@ -749,11 +781,79 @@ mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
        return vars;
 }
 
+/**
+ * mono_arch_compute_omit_fp:
+ *
+ *   Determine whether the frame pointer can be eliminated, caching the
+ * result in cfg->arch.omit_fp / cfg->arch.omit_fp_computed.
+ */
+static void
+mono_arch_compute_omit_fp (MonoCompile *cfg)
+{
+       MonoMethodSignature *sig;
+       MonoMethodHeader *header;
+       int i;
+       CallInfo *cinfo;
+
+       /* The result is cached, so repeated calls are cheap */
+       if (cfg->arch.omit_fp_computed)
+               return;
+
+       header = mono_method_get_header (cfg->method);
+
+       sig = mono_method_signature (cfg->method);
+
+       cinfo = get_call_info (sig, FALSE);
+
+       /*
+        * FIXME: Remove some of the restrictions.
+        */
+       cfg->arch.omit_fp = TRUE;
+       cfg->arch.omit_fp_computed = TRUE;
+
+       /* Temporarily disable this when running in the debugger until we have support
+        * for this in the debugger. */
+       if (mono_debug_using_mono_debugger ())
+               cfg->arch.omit_fp = FALSE;
+
+       if (!debug_omit_fp ())
+               cfg->arch.omit_fp = FALSE;
+       /*
+       if (cfg->method->save_lmf)
+               cfg->arch.omit_fp = FALSE;
+       */
+       if (cfg->flags & MONO_CFG_HAS_ALLOCA)
+               cfg->arch.omit_fp = FALSE;
+       /* Exception clauses need a real frame to unwind through */
+       if (header->num_clauses)
+               cfg->arch.omit_fp = FALSE;
+       if (cfg->param_area)
+               cfg->arch.omit_fp = FALSE;
+       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG))
+               cfg->arch.omit_fp = FALSE;
+       if ((mono_jit_trace_calls != NULL && mono_trace_eval (cfg->method)) ||
+               (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE))
+               cfg->arch.omit_fp = FALSE;
+       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
+               ArgInfo *ainfo = &cinfo->args [i];
+
+               if (ainfo->storage == ArgOnStack) {
+                       /* 
+                        * The stack offset can only be determined when the frame
+                        * size is known.
+                        */
+                       cfg->arch.omit_fp = FALSE;
+               }
+       }
+
+       /* get_call_info () allocates; other callers free the result with g_free () */
+       g_free (cinfo);
+}
+
 GList *
 mono_arch_get_global_int_regs (MonoCompile *cfg)
 {
        GList *regs = NULL;
 
+       mono_arch_compute_omit_fp (cfg);
+
+       if (cfg->arch.omit_fp)
+               regs = g_list_prepend (regs, (gpointer)AMD64_RBP);
+
        /* We use the callee saved registers for global allocation */
        regs = g_list_prepend (regs, (gpointer)AMD64_RBX);
        regs = g_list_prepend (regs, (gpointer)AMD64_R12);
@@ -786,7 +886,7 @@ mono_arch_regalloc_cost (MonoCompile *cfg, MonoMethodVar *vmv)
 }
  
 void
-mono_arch_allocate_vars (MonoCompile *m)
+mono_arch_allocate_vars (MonoCompile *cfg)
 {
        MonoMethodSignature *sig;
        MonoMethodHeader *header;
@@ -796,34 +896,53 @@ mono_arch_allocate_vars (MonoCompile *m)
        gint32 *offsets;
        CallInfo *cinfo;
 
-       header = mono_method_get_header (m->method);
+       header = mono_method_get_header (cfg->method);
 
-       sig = mono_method_signature (m->method);
+       sig = mono_method_signature (cfg->method);
 
        cinfo = get_call_info (sig, FALSE);
 
+       mono_arch_compute_omit_fp (cfg);
+
        /*
         * We use the ABI calling conventions for managed code as well.
         * Exception: valuetypes are never passed or returned in registers.
         */
 
-       /* Locals are allocated backwards from %fp */
-       m->frame_reg = AMD64_RBP;
-       offset = 0;
+       if (cfg->arch.omit_fp) {
+               cfg->flags |= MONO_CFG_HAS_SPILLUP;
+               cfg->frame_reg = AMD64_RSP;
+               offset = 0;
+       } else {
+               /* Locals are allocated backwards from %fp */
+               cfg->frame_reg = AMD64_RBP;
+               offset = 0;
+       }
+
+       cfg->arch.reg_save_area_offset = offset;
 
        /* Reserve space for caller saved registers */
        for (i = 0; i < AMD64_NREG; ++i)
-               if (AMD64_IS_CALLEE_SAVED_REG (i) && (m->used_int_regs & (1 << i))) {
+               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                        offset += sizeof (gpointer);
                }
 
-       if (m->method->save_lmf) {
+       if (cfg->method->save_lmf) {
                /* Reserve stack space for saving LMF + argument regs */
-               offset += sizeof (MonoLMF);
+               guint32 size = sizeof (MonoLMF);
+
                if (lmf_tls_offset == -1)
                        /* Need to save argument regs too */
-                       offset += (AMD64_NREG * 8) + (8 * 8);
-               m->arch.lmf_offset = offset;
+                       size += (AMD64_NREG * 8) + (8 * 8);
+
+               if (cfg->arch.omit_fp) {
+                       cfg->arch.lmf_offset = offset;
+                       offset += size;
+               }
+               else {
+                       offset += size;
+                       cfg->arch.lmf_offset = -offset;
+               }
        }
 
        if (sig->ret->type != MONO_TYPE_VOID) {
@@ -833,41 +952,50 @@ mono_arch_allocate_vars (MonoCompile *m)
                case ArgInDoubleSSEReg:
                        if ((MONO_TYPE_ISSTRUCT (sig->ret) && !mono_class_from_mono_type (sig->ret)->enumtype) || (sig->ret->type == MONO_TYPE_TYPEDBYREF)) {
                                /* The register is volatile */
-                               m->ret->opcode = OP_REGOFFSET;
-                               m->ret->inst_basereg = AMD64_RBP;
-                               offset += 8;
-                               m->ret->inst_offset = - offset;
+                               cfg->ret->opcode = OP_REGOFFSET;
+                               cfg->ret->inst_basereg = cfg->frame_reg;
+                               if (cfg->arch.omit_fp) {
+                                       cfg->ret->inst_offset = offset;
+                                       offset += 8;
+                               } else {
+                                       offset += 8;
+                                       cfg->ret->inst_offset = -offset;
+                               }
                        }
                        else {
-                               m->ret->opcode = OP_REGVAR;
-                               m->ret->inst_c0 = cinfo->ret.reg;
+                               cfg->ret->opcode = OP_REGVAR;
+                               cfg->ret->inst_c0 = cinfo->ret.reg;
                        }
                        break;
                case ArgValuetypeInReg:
                        /* Allocate a local to hold the result, the epilog will copy it to the correct place */
+                       g_assert (!cfg->arch.omit_fp);
                        offset += 16;
-                       m->ret->opcode = OP_REGOFFSET;
-                       m->ret->inst_basereg = AMD64_RBP;
-                       m->ret->inst_offset = - offset;
+                       cfg->ret->opcode = OP_REGOFFSET;
+                       cfg->ret->inst_basereg = cfg->frame_reg;
+                       cfg->ret->inst_offset = - offset;
                        break;
                default:
                        g_assert_not_reached ();
                }
-               m->ret->dreg = m->ret->inst_c0;
+               cfg->ret->dreg = cfg->ret->inst_c0;
        }
 
        /* Allocate locals */
-       offsets = mono_allocate_stack_slots (m, &locals_stack_size, &locals_stack_align);
+       offsets = mono_allocate_stack_slots_full (cfg, cfg->arch.omit_fp ? FALSE: TRUE, &locals_stack_size, &locals_stack_align);
        if (locals_stack_align) {
                offset += (locals_stack_align - 1);
                offset &= ~(locals_stack_align - 1);
        }
-       for (i = m->locals_start; i < m->num_varinfo; i++) {
+       for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
                if (offsets [i] != -1) {
-                       MonoInst *inst = m->varinfo [i];
+                       MonoInst *inst = cfg->varinfo [i];
                        inst->opcode = OP_REGOFFSET;
-                       inst->inst_basereg = AMD64_RBP;
-                       inst->inst_offset = - (offset + offsets [i]);
+                       inst->inst_basereg = cfg->frame_reg;
+                       if (cfg->arch.omit_fp)
+                               inst->inst_offset = (offset + offsets [i]);
+                       else
+                               inst->inst_offset = - (offset + offsets [i]);
                        //printf ("allocated local %d to ", i); mono_print_tree_nl (inst);
                }
        }
@@ -875,12 +1003,13 @@ mono_arch_allocate_vars (MonoCompile *m)
        offset += locals_stack_size;
 
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG)) {
+               g_assert (!cfg->arch.omit_fp);
                g_assert (cinfo->sig_cookie.storage == ArgOnStack);
-               m->sig_cookie = cinfo->sig_cookie.offset + ARGS_OFFSET;
+               cfg->sig_cookie = cinfo->sig_cookie.offset + ARGS_OFFSET;
        }
 
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               inst = m->varinfo [i];
+               inst = cfg->varinfo [i];
                if (inst->opcode != OP_REGVAR) {
                        ArgInfo *ainfo = &cinfo->args [i];
                        gboolean inreg = TRUE;
@@ -913,8 +1042,9 @@ mono_arch_allocate_vars (MonoCompile *m)
                                inst->dreg = ainfo->reg;
                                break;
                        case ArgOnStack:
+                               g_assert (!cfg->arch.omit_fp);
                                inst->opcode = OP_REGOFFSET;
-                               inst->inst_basereg = AMD64_RBP;
+                               inst->inst_basereg = cfg->frame_reg;
                                inst->inst_offset = ainfo->offset + ARGS_OFFSET;
                                break;
                        case ArgValuetypeInReg:
@@ -925,18 +1055,20 @@ mono_arch_allocate_vars (MonoCompile *m)
 
                        if (!inreg && (ainfo->storage != ArgOnStack)) {
                                inst->opcode = OP_REGOFFSET;
-                               inst->inst_basereg = AMD64_RBP;
+                               inst->inst_basereg = cfg->frame_reg;
                                /* These arguments are saved to the stack in the prolog */
-                               if (ainfo->storage == ArgValuetypeInReg)
-                                       offset += 2 * sizeof (gpointer);
-                               else
-                                       offset += sizeof (gpointer);
-                               inst->inst_offset = - offset;
+                               if (cfg->arch.omit_fp) {
+                                       inst->inst_offset = offset;
+                                       offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                               } else {
+                                       offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                                       inst->inst_offset = - offset;
+                               }
                        }
                }
        }
 
-       m->stack_offset = offset;
+       cfg->stack_offset = offset;
 
        g_free (cinfo);
 }
@@ -1242,9 +1374,14 @@ if (ins->flags & MONO_INST_BRLABEL) { \
 /* emit an exception if condition is fail */
 #define EMIT_COND_SYSTEM_EXCEPTION(cond,signed,exc_name)            \
         do {                                                        \
-               mono_add_patch_info (cfg, code - cfg->native_code,   \
-                                   MONO_PATCH_INFO_EXC, exc_name);  \
-               x86_branch32 (code, cond, 0, signed);               \
+               MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
+               if (tins == NULL) {                                                                             \
+                       mono_add_patch_info (cfg, code - cfg->native_code,   \
+                                       MONO_PATCH_INFO_EXC, exc_name);  \
+                       x86_branch32 (code, cond, 0, signed);               \
+               } else {        \
+                       EMIT_COND_BRANCH (tins, cond, signed);  \
+               }                       \
        } while (0); 
 
 #define EMIT_FPCOMPARE(code) do { \
@@ -1342,7 +1479,7 @@ emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer dat
 }
 
 /* FIXME: Add more instructions */
-#define INST_IGNORES_CFLAGS(ins) (((ins)->opcode == CEE_BR) || ((ins)->opcode == OP_STORE_MEMBASE_IMM) || ((ins)->opcode == OP_STOREI8_MEMBASE_REG) || ((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_SETREG) || ((ins)->opcode == OP_ICONST) || ((ins)->opcode == OP_I8CONST) || ((ins)->opcode == OP_LOAD_MEMBASE))
+#define INST_IGNORES_CFLAGS(ins) (((ins)->opcode == CEE_BR) || ((ins)->opcode == OP_STORE_MEMBASE_IMM) || ((ins)->opcode == OP_STOREI8_MEMBASE_REG) || ((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_ICONST) || ((ins)->opcode == OP_I8CONST) || ((ins)->opcode == OP_LOAD_MEMBASE))
 
 static void
 peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
@@ -1533,7 +1670,6 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CONV_I4:
                case CEE_CONV_U4:
                case OP_MOVE:
-               case OP_SETREG:
                        /*
                         * Removes:
                         *
@@ -1948,6 +2084,12 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
        
        /* This is the opposite of the code in emit_prolog */
 
+       if (sig->ret->type != MONO_TYPE_VOID) {
+               if ((cinfo->ret.storage == ArgInIReg) && (cfg->ret->opcode != OP_REGVAR)) {
+                       amd64_mov_reg_membase (code, cinfo->ret.reg, cfg->ret->inst_basereg, cfg->ret->inst_offset, 8);
+               }
+       }
+
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = cinfo->args + i;
                MonoType *arg_type;
@@ -1977,6 +2119,11 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
                                break;
                        }
                }
+               else {
+                       g_assert (ainfo->storage == ArgInIReg);
+
+                       amd64_mov_reg_reg (code, ainfo->reg, inst->dreg, 8);
+               }
        }
 
        g_free (cinfo);
@@ -2259,8 +2406,67 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_MUL_IMM:
                case OP_LMUL_IMM:
-                       amd64_imul_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_imm);
+               case OP_IMUL_IMM: {
+                       guint32 size = (ins->opcode == OP_IMUL_IMM) ? 4 : 8;
+                       
+                       switch (ins->inst_imm) {
+                       case 2:
+                               /* MOV r1, r2 */
+                               /* ADD r1, r1 */
+                               if (ins->dreg != ins->sreg1)
+                                       amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, size);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 3:
+                               /* LEA r1, [r2 + r2*2] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               break;
+                       case 5:
+                               /* LEA r1, [r2 + r2*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               break;
+                       case 6:
+                               /* LEA r1, [r2 + r2*2] */
+                               /* ADD r1, r1          */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 9:
+                               /* LEA r1, [r2 + r2*8] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 3);
+                               break;
+                       case 10:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* ADD r1, r1          */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 12:
+                               /* LEA r1, [r2 + r2*2] */
+                               /* SHL r1, 2           */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               amd64_shift_reg_imm (code, X86_SHL, ins->dreg, 2);
+                               break;
+                       case 25:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* LEA r1, [r1 + r1*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_lea_memindex (code, ins->dreg, ins->dreg, 0, ins->dreg, 2);
+                               break;
+                       case 100:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* SHL r1, 2           */
+                               /* LEA r1, [r1 + r1*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_shift_reg_imm (code, X86_SHL, ins->dreg, 2);
+                               amd64_lea_memindex (code, ins->dreg, ins->dreg, 0, ins->dreg, 2);
+                               break;
+                       default:
+                               amd64_imul_reg_reg_imm_size (code, ins->dreg, ins->sreg1, ins->inst_imm, size);
+                               break;
+                       }
                        break;
+               }
                case CEE_DIV:
                case OP_LDIV:
                        amd64_cdq (code);
@@ -2416,9 +2622,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_IMUL:
                        amd64_imul_reg_reg_size (code, ins->sreg1, ins->sreg2, 4);
                        break;
-               case OP_IMUL_IMM:
-                       amd64_imul_reg_reg_imm_size (code, ins->dreg, ins->sreg1, ins->inst_imm, 4);
-                       break;
                case OP_IMUL_OVF:
                        amd64_imul_reg_reg_size (code, ins->sreg1, ins->sreg2, 4);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
@@ -2536,7 +2739,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CONV_I4:
                case CEE_CONV_U4:
                case OP_MOVE:
-               case OP_SETREG:
                        amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof (gpointer));
                        break;
                case OP_AMD64_SET_XMMREG_R4: {
@@ -2577,20 +2779,33 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        code = emit_load_volatile_arguments (cfg, code);
 
-                       for (i = 0; i < AMD64_NREG; ++i)
-                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
-                                       pos -= sizeof (gpointer);
+                       if (cfg->arch.omit_fp) {
+                               guint32 save_offset = 0;
+                               /* Pop callee-saved registers */
+                               for (i = 0; i < AMD64_NREG; ++i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                               amd64_mov_reg_membase (code, i, AMD64_RSP, save_offset, 8);
+                                               save_offset += 8;
+                                       }
+                               amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, cfg->arch.stack_alloc_size);
+                       }
+                       else {
+                               for (i = 0; i < AMD64_NREG; ++i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
+                                               pos -= sizeof (gpointer);
                        
-                       if (pos)
-                               amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+                               if (pos)
+                                       amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
 
-                       /* Pop registers in reverse order */
-                       for (i = AMD64_NREG - 1; i > 0; --i)
-                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                       amd64_pop_reg (code, i);
-                               }
+                               /* Pop registers in reverse order */
+                               for (i = AMD64_NREG - 1; i > 0; --i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                               amd64_pop_reg (code, i);
+                                       }
+
+                               amd64_leave (code);
+                       }
 
-                       amd64_leave (code);
                        offset = code - cfg->native_code;
                        mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_METHOD_JUMP, ins->inst_p0);
                        if (cfg->compile_aot)
@@ -2605,7 +2820,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_membase_imm (code, X86_CMP, ins->sreg1, 0, 0);
                        break;
                case OP_ARGLIST: {
-                       amd64_lea_membase (code, AMD64_R11, AMD64_RBP, cfg->sig_cookie);
+                       amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, cfg->sig_cookie);
                        amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, 8);
                        break;
                }
@@ -3016,41 +3231,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        break;
                }
-               case OP_LCONV_TO_OVF_I: {
-                       guint8 *br [3], *label [1];
-
-                       if (use_sse2)
-                               g_assert_not_reached ();
-
-                       /* 
-                        * Valid ints: 0xffffffff:8000000 to 00000000:0x7f000000
-                        */
-                       amd64_test_reg_reg (code, ins->sreg1, ins->sreg1);
-
-                       /* If the low word top bit is set, see if we are negative */
-                       br [0] = code; x86_branch8 (code, X86_CC_LT, 0, TRUE);
-                       /* We are not negative (no top bit set, check for our top word to be zero */
-                       amd64_test_reg_reg (code, ins->sreg2, ins->sreg2);
-                       br [1] = code; x86_branch8 (code, X86_CC_EQ, 0, TRUE);
-                       label [0] = code;
-
-                       /* throw exception */
-                       mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_EXC, "OverflowException");
-                       x86_jump32 (code, 0);
-       
-                       amd64_patch (br [0], code);
-                       /* our top bit is set, check that top word is 0xfffffff */
-                       amd64_alu_reg_imm (code, X86_CMP, ins->sreg2, 0xffffffff);
-               
-                       amd64_patch (br [1], code);
-                       /* nope, emit exception */
-                       br [2] = code; x86_branch8 (code, X86_CC_NE, 0, TRUE);
-                       amd64_patch (br [2], label [0]);
-
-                       if (ins->dreg != ins->sreg1)
-                               amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, 4);
-                       break;
-               }
                case CEE_CONV_OVF_U4:
                        amd64_alu_reg_imm (code, X86_CMP, ins->sreg1, 0);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_LT, TRUE, "OverflowException");
@@ -3555,6 +3735,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_mov_reg_mem (code, ins->dreg, ins->inst_offset, 8);
                        break;
                }
+               case OP_MEMORY_BARRIER: {
+                       /* Not needed on amd64 */
+                       break;
+               }
                case OP_ATOMIC_ADD_I4:
                case OP_ATOMIC_ADD_I8: {
                        int dreg = ins->dreg;
@@ -3724,57 +3908,34 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        int alloc_size, pos, max_offset, i, quad;
        guint8 *code;
        CallInfo *cinfo;
+       gint32 lmf_offset = cfg->arch.lmf_offset;
 
        cfg->code_size =  MAX (((MonoMethodNormal *)method)->header->code_size * 4, 512);
        code = cfg->native_code = g_malloc (cfg->code_size);
 
-       amd64_push_reg (code, AMD64_RBP);
-       amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
-
-       /* Stack alignment check */
-#if 0
-       {
-               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RSP, 8);
-               amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, 0xf);
-               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0);
-               x86_branch8 (code, X86_CC_EQ, 2, FALSE);
-               amd64_breakpoint (code);
-       }
-#endif
-
-       alloc_size = ALIGN_TO (cfg->stack_offset, MONO_ARCH_FRAME_ALIGNMENT);
+       /* Amount of stack space allocated by register saving code */
        pos = 0;
 
-       if (method->save_lmf) {
-               gint32 lmf_offset;
-
-               pos = ALIGN_TO (pos + sizeof (MonoLMF), 16);
-
-               amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, pos);
-
-               lmf_offset = - cfg->arch.lmf_offset;
+       /* 
+        * The prolog consists of the following parts:
+        * FP present:
+        * - push rbp, mov rbp, rsp
+        * - save callee saved regs using pushes
+        * - allocate frame
+        * - save lmf if needed
+        * FP not present:
+        * - allocate frame
+        * - save lmf if needed
+        * - save callee saved regs using moves
+        */
 
-               /* Save ip */
-               amd64_lea_membase (code, AMD64_R11, AMD64_RIP, 0);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), AMD64_R11, 8);
-               /* Save fp */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), AMD64_RBP, 8);
-               /* Save method */
-               /* FIXME: add a relocation for this */
-               if (IS_IMM32 (cfg->method))
-                       amd64_mov_membase_imm (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), (guint64)cfg->method, 8);
-               else {
-                       amd64_mov_reg_imm (code, AMD64_R11, cfg->method);
-                       amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8);
-               }
-               /* Save callee saved regs */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
-       } else {
+       if (!cfg->arch.omit_fp) {
+               amd64_push_reg (code, AMD64_RBP);
+               amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
+       }
 
+       /* Save callee saved registers */
+       if (!cfg->arch.omit_fp && !method->save_lmf) {
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_push_reg (code, i);
@@ -3782,8 +3943,21 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        }
        }
 
+       alloc_size = ALIGN_TO (cfg->stack_offset, MONO_ARCH_FRAME_ALIGNMENT);
+
        alloc_size -= pos;
 
+       if (cfg->arch.omit_fp)
+               /* 
+                * On enter, the stack is misaligned by the the pushing of the return
+                * address. It is either made aligned by the pushing of %rbp, or by
+                * this.
+                */
+               alloc_size += 8;
+
+       cfg->arch.stack_alloc_size = alloc_size;
+
+       /* Allocate stack frame */
        if (alloc_size) {
                /* See mono_emit_stack_alloc */
 #if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
@@ -3800,6 +3974,56 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 #endif
        }
 
+       /* Stack alignment check */
+#if 0
+       {
+               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RSP, 8);
+               amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, 0xf);
+               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0);
+               x86_branch8 (code, X86_CC_EQ, 2, FALSE);
+               amd64_breakpoint (code);
+       }
+#endif
+
+       /* Save LMF */
+       if (method->save_lmf) {
+               /* Save ip */
+               amd64_lea_membase (code, AMD64_R11, AMD64_RIP, 0);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), AMD64_R11, 8);
+               /* Save fp */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), AMD64_RBP, 8);
+               /* Save sp */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
+               /* Save method */
+               /* FIXME: add a relocation for this */
+               if (IS_IMM32 (cfg->method))
+                       amd64_mov_membase_imm (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), (guint64)cfg->method, 8);
+               else {
+                       amd64_mov_reg_imm (code, AMD64_R11, cfg->method);
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8);
+               }
+               /* Save callee saved regs */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
+       }
+
+       /* Save callee saved registers */
+       if (cfg->arch.omit_fp && !method->save_lmf) {
+               gint32 save_area_offset = 0;
+
+               /* Save callee saved registers after sp is adjusted */
+               /* The registers are saved at the bottom of the frame */
+               /* FIXME: Optimize this so the regs are saved at the end of the frame in increasing order */
+               for (i = 0; i < AMD64_NREG; ++i)
+                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                               amd64_mov_membase_reg (code, AMD64_RSP, save_area_offset, i, 8);
+                               save_area_offset += 8;
+                       }
+       }
+
        /* compute max_offset in order to use short forward jumps */
        max_offset = 0;
        if (cfg->opt & MONO_OPT_BRANCH) {
@@ -3915,8 +4139,6 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        }
 
        if (method->save_lmf) {
-               gint32 lmf_offset;
-
                if (lmf_tls_offset != -1) {
                        /* Load lmf quicky using the FS register */
                        x86_prefix (code, X86_FS_PREFIX);
@@ -3932,15 +4154,13 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                                                 (gpointer)"mono_get_lmf_addr");                
                }
 
-               lmf_offset = - cfg->arch.lmf_offset;
-
                /* Save lmf_addr */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
                /* Save previous_lmf */
                amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
                /* Set new lmf */
-               amd64_lea_membase (code, AMD64_R11, AMD64_RBP, lmf_offset);
+               amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
                amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8);
        }
 
@@ -3965,6 +4185,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        guint8 *code;
        int max_epilog_size = 16;
        CallInfo *cinfo;
+       gint32 lmf_offset = cfg->arch.lmf_offset;
        
        if (cfg->method->save_lmf)
                max_epilog_size += 256;
@@ -3992,51 +4213,63 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        pos = 0;
        
        if (method->save_lmf) {
-               gint32 lmf_offset = - cfg->arch.lmf_offset;
-
                /* Restore previous lmf */
-               amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
-               amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
+               amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
+               amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
                amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8);
 
                /* Restore callee saved regs */
+               if (cfg->used_int_regs & (1 << AMD64_RBP)) {
+                       amd64_mov_reg_membase (code, AMD64_RBP, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), 8);
+               }
                if (cfg->used_int_regs & (1 << AMD64_RBX)) {
-                       amd64_mov_reg_membase (code, AMD64_RBX, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), 8);
+                       amd64_mov_reg_membase (code, AMD64_RBX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R12)) {
-                       amd64_mov_reg_membase (code, AMD64_R12, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), 8);
+                       amd64_mov_reg_membase (code, AMD64_R12, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R13)) {
-                       amd64_mov_reg_membase (code, AMD64_R13, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), 8);
+                       amd64_mov_reg_membase (code, AMD64_R13, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R14)) {
-                       amd64_mov_reg_membase (code, AMD64_R14, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8);
+                       amd64_mov_reg_membase (code, AMD64_R14, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R15)) {
-                       amd64_mov_reg_membase (code, AMD64_R15, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
+                       amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
                }
        } else {
 
-               for (i = 0; i < AMD64_NREG; ++i)
-                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
-                               pos -= sizeof (gpointer);
+               if (cfg->arch.omit_fp) {
+                       gint32 save_area_offset = 0;
 
-               if (pos) {
-                       if (pos == - sizeof (gpointer)) {
-                               /* Only one register, so avoid lea */
-                               for (i = AMD64_NREG - 1; i > 0; --i)
-                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                               amd64_mov_reg_membase (code, i, AMD64_RBP, pos, 8);
-                                       }
-                       }
-                       else {
-                               amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+                       for (i = 0; i < AMD64_NREG; ++i)
+                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                       amd64_mov_reg_membase (code, i, AMD64_RSP, save_area_offset, 8);
+                                       save_area_offset += 8;
+                               }
+               }
+               else {
+                       for (i = 0; i < AMD64_NREG; ++i)
+                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
+                                       pos -= sizeof (gpointer);
 
-                               /* Pop registers in reverse order */
-                               for (i = AMD64_NREG - 1; i > 0; --i)
-                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                               amd64_pop_reg (code, i);
-                                       }
+                       if (pos) {
+                               if (pos == - sizeof (gpointer)) {
+                                       /* Only one register, so avoid lea */
+                                       for (i = AMD64_NREG - 1; i > 0; --i)
+                                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                                       amd64_mov_reg_membase (code, i, AMD64_RBP, pos, 8);
+                                               }
+                               }
+                               else {
+                                       amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+
+                                       /* Pop registers in reverse order */
+                                       for (i = AMD64_NREG - 1; i > 0; --i)
+                                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                                       amd64_pop_reg (code, i);
+                                               }
+                               }
                        }
                }
        }
@@ -4067,13 +4300,26 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        }
        g_free (cinfo);
 
-       amd64_leave (code);
+       if (cfg->arch.omit_fp) {
+               if (cfg->arch.stack_alloc_size)
+                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, cfg->arch.stack_alloc_size);
+       } else {
+               amd64_leave (code);
+       }
        amd64_ret (code);
 
        cfg->code_len = code - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
 
+       if (cfg->arch.omit_fp) {
+               /* 
+                * Encode the stack size into used_int_regs so the exception handler
+                * can access it.
+                */
+               g_assert (cfg->arch.stack_alloc_size < (1 << 16));
+               cfg->used_int_regs |= (1 << 31) | (cfg->arch.stack_alloc_size << 16);
+       }
 }
 
 void
@@ -4548,66 +4794,6 @@ mono_arch_get_delegate_method_ptr_addr (guint8* code, gpointer *regs)
 
 static gboolean tls_offset_inited = FALSE;
 
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-
-static void
-setup_stack (MonoJitTlsData *tls)
-{
-       pthread_t self = pthread_self();
-       pthread_attr_t attr;
-       size_t stsize = 0;
-       struct sigaltstack sa;
-       guint8 *staddr = NULL;
-       guint8 *current = (guint8*)&staddr;
-
-       if (mono_running_on_valgrind ())
-               return;
-
-       /* Determine stack boundaries */
-#ifdef HAVE_PTHREAD_GETATTR_NP
-       pthread_getattr_np( self, &attr );
-#else
-#ifdef HAVE_PTHREAD_ATTR_GET_NP
-       pthread_attr_get_np( self, &attr );
-#elif defined(sun)
-       pthread_attr_init( &attr );
-       pthread_attr_getstacksize( &attr, &stsize );
-#else
-#error "Not implemented"
-#endif
-#endif
-#ifndef sun
-       pthread_attr_getstack( &attr, (void**)&staddr, &stsize );
-#endif
-
-       g_assert (staddr);
-
-       g_assert ((current > staddr) && (current < staddr + stsize));
-
-       tls->end_of_stack = staddr + stsize;
-
-       /*
-        * threads created by nptl does not seem to have a guard page, and
-        * since the main thread is not created by us, we can't even set one.
-        * Increasing stsize fools the SIGSEGV signal handler into thinking this
-        * is a stack overflow exception.
-        */
-       tls->stack_size = stsize + getpagesize ();
-
-       /* Setup an alternate signal stack */
-       tls->signal_stack = mmap (0, SIGNAL_STACK_SIZE, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-       tls->signal_stack_size = SIGNAL_STACK_SIZE;
-
-       g_assert (tls->signal_stack);
-
-       sa.ss_sp = tls->signal_stack;
-       sa.ss_size = SIGNAL_STACK_SIZE;
-       sa.ss_flags = SS_ONSTACK;
-       sigaltstack (&sa, NULL);
-}
-
-#endif
-
 void
 mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
 {
@@ -4618,36 +4804,20 @@ mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
                lmf_tls_offset = mono_get_lmf_tls_offset ();
                thread_tls_offset = mono_thread_get_tls_offset ();
        }               
-
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-       setup_stack (tls);
-#endif
 }
 
 void
 mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
 {
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-       struct sigaltstack sa;
-
-       sa.ss_sp = tls->signal_stack;
-       sa.ss_size = SIGNAL_STACK_SIZE;
-       sa.ss_flags = SS_DISABLE;
-       sigaltstack  (&sa, NULL);
-
-       if (tls->signal_stack)
-               munmap (tls->signal_stack, SIGNAL_STACK_SIZE);
-#endif
 }
 
 void
 mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
 {
        MonoCallInst *call = (MonoCallInst*)inst;
-       int out_reg = param_regs [0];
+       CallInfo * cinfo = get_call_info (inst->signature, FALSE);
 
        if (vt_reg != -1) {
-               CallInfo * cinfo = get_call_info (inst->signature, FALSE);
                MonoInst *vtarg;
 
                if (cinfo->ret.storage == ArgValuetypeInReg) {
@@ -4664,30 +4834,28 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
                        MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
                }
                else {
-                       MONO_INST_NEW (cfg, vtarg, OP_SETREG);
+                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
                        vtarg->sreg1 = vt_reg;
                        vtarg->dreg = mono_regstate_next_int (cfg->rs);
                        mono_bblock_add_inst (cfg->cbb, vtarg);
 
-                       mono_call_inst_add_outarg_reg (call, vtarg->dreg, out_reg, FALSE);
-
-                       out_reg = param_regs [1];
+                       mono_call_inst_add_outarg_reg (call, vtarg->dreg, cinfo->ret.reg, FALSE);
                }
-
-               g_free (cinfo);
        }
 
        /* add the this argument */
        if (this_reg != -1) {
                MonoInst *this;
-               MONO_INST_NEW (cfg, this, OP_SETREG);
+               MONO_INST_NEW (cfg, this, OP_MOVE);
                this->type = this_type;
                this->sreg1 = this_reg;
                this->dreg = mono_regstate_next_int (cfg->rs);
                mono_bblock_add_inst (cfg->cbb, this);
 
-               mono_call_inst_add_outarg_reg (call, this->dreg, out_reg, FALSE);
+               mono_call_inst_add_outarg_reg (call, this->dreg, cinfo->args [0].reg, FALSE);
        }
+
+       g_free (cinfo);
 }
 
 MonoInst*
@@ -4727,6 +4895,9 @@ mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethod
                        ins->inst_i1 = args [1];
                }
 #endif
+       } else if (cmethod->klass == mono_defaults.thread_class &&
+                          strcmp (cmethod->name, "MemoryBarrier") == 0) {
+               MONO_INST_NEW (cfg, ins, OP_MEMORY_BARRIER);
        } else if(cmethod->klass->image == mono_defaults.corlib &&
                           (strcmp (cmethod->klass->name_space, "System.Threading") == 0) &&
                           (strcmp (cmethod->klass->name, "Interlocked") == 0)) {