2005-12-12 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-amd64.c
index 8bffc9e159173e91c1deb86814e796d4ba32dd11..b4a530e6c04326f327817a375981bb294409c11c 100644 (file)
 #include "mini.h"
 #include <string.h>
 #include <math.h>
-#include <unistd.h>
-#include <sys/mman.h>
 
 #include <mono/metadata/appdomain.h>
 #include <mono/metadata/debug-helpers.h>
 #include <mono/metadata/threads.h>
 #include <mono/metadata/profiler-private.h>
+#include <mono/metadata/mono-debug.h>
 #include <mono/utils/mono-math.h>
 
 #include "trace.h"
@@ -47,8 +46,6 @@ static const char*const * ins_spec = amd64_desc;
 #define CALLCONV_IS_STDCALL(call_conv) ((call_conv) == MONO_CALL_STDCALL)
 #endif
 
-#define SIGNAL_STACK_SIZE (64 * 1024)
-
 #define ARGS_OFFSET 16
 #define GP_SCRATCH_REG AMD64_R11
 
@@ -108,6 +105,41 @@ mono_arch_fregname (int reg)
                return "unknown";
 }
 
+G_GNUC_UNUSED static void
+break_count (void)
+{
+}
+
+G_GNUC_UNUSED static gboolean
+debug_count (void)
+{
+       static int count = 0;
+       count ++;
+
+       if (!getenv ("COUNT"))
+               return TRUE;
+
+       if (count == atoi (getenv ("COUNT"))) {
+               break_count ();
+       }
+
+       if (count > atoi (getenv ("COUNT"))) {
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
+static gboolean
+debug_omit_fp (void)
+{
+#if 0
+       return debug_count ();
+#else
+       return TRUE;
+#endif
+}
+
 static inline void 
 amd64_patch (unsigned char* code, gpointer target)
 {
@@ -230,7 +262,7 @@ typedef enum ArgumentClass {
 static ArgumentClass
 merge_argument_class_from_type (MonoType *type, ArgumentClass class1)
 {
-       ArgumentClass class2;
+       ArgumentClass class2 = ARG_CLASS_NO_CLASS;
        MonoType *ptype;
 
        ptype = mono_type_get_underlying_type (type);
@@ -749,11 +781,79 @@ mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
        return vars;
 }
 
+/**
+ * mono_arch_compute_omit_fp:
+ *
+ *   Determine whether the frame pointer can be eliminated.
+ */
+static void
+mono_arch_compute_omit_fp (MonoCompile *cfg)
+{
+       MonoMethodSignature *sig;
+       MonoMethodHeader *header;
+       int i;
+       CallInfo *cinfo;
+
+       if (cfg->arch.omit_fp_computed)
+               return;
+
+       header = mono_method_get_header (cfg->method);
+
+       sig = mono_method_signature (cfg->method);
+
+       cinfo = get_call_info (sig, FALSE);
+
+       /*
+        * FIXME: Remove some of the restrictions.
+        */
+       cfg->arch.omit_fp = TRUE;
+       cfg->arch.omit_fp_computed = TRUE;
+
+       /* Temporarily disable this when running in the debugger until we have support
+        * for this in the debugger. */
+       if (mono_debug_using_mono_debugger ())
+               cfg->arch.omit_fp = FALSE;
+
+       if (!debug_omit_fp ())
+               cfg->arch.omit_fp = FALSE;
+       /*
+       if (cfg->method->save_lmf)
+               cfg->arch.omit_fp = FALSE;
+       */
+       if (cfg->flags & MONO_CFG_HAS_ALLOCA)
+               cfg->arch.omit_fp = FALSE;
+       if (header->num_clauses)
+               cfg->arch.omit_fp = FALSE;
+       if (cfg->param_area)
+               cfg->arch.omit_fp = FALSE;
+       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG))
+               cfg->arch.omit_fp = FALSE;
+       if ((mono_jit_trace_calls != NULL && mono_trace_eval (cfg->method)) ||
+               (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE))
+               cfg->arch.omit_fp = FALSE;
+       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
+               ArgInfo *ainfo = &cinfo->args [i];
+
+               if (ainfo->storage == ArgOnStack) {
+                       /* 
+                        * The stack offset can only be determined when the frame
+                        * size is known.
+                        */
+                       cfg->arch.omit_fp = FALSE;
+               }
+       }
+}
+
 GList *
 mono_arch_get_global_int_regs (MonoCompile *cfg)
 {
        GList *regs = NULL;
 
+       mono_arch_compute_omit_fp (cfg);
+
+       if (cfg->arch.omit_fp)
+               regs = g_list_prepend (regs, (gpointer)AMD64_RBP);
+
        /* We use the callee saved registers for global allocation */
        regs = g_list_prepend (regs, (gpointer)AMD64_RBX);
        regs = g_list_prepend (regs, (gpointer)AMD64_R12);
@@ -786,7 +886,7 @@ mono_arch_regalloc_cost (MonoCompile *cfg, MonoMethodVar *vmv)
 }
  
 void
-mono_arch_allocate_vars (MonoCompile *m)
+mono_arch_allocate_vars (MonoCompile *cfg)
 {
        MonoMethodSignature *sig;
        MonoMethodHeader *header;
@@ -796,34 +896,53 @@ mono_arch_allocate_vars (MonoCompile *m)
        gint32 *offsets;
        CallInfo *cinfo;
 
-       header = mono_method_get_header (m->method);
+       header = mono_method_get_header (cfg->method);
 
-       sig = mono_method_signature (m->method);
+       sig = mono_method_signature (cfg->method);
 
        cinfo = get_call_info (sig, FALSE);
 
+       mono_arch_compute_omit_fp (cfg);
+
        /*
         * We use the ABI calling conventions for managed code as well.
         * Exception: valuetypes are never passed or returned in registers.
         */
 
-       /* Locals are allocated backwards from %fp */
-       m->frame_reg = AMD64_RBP;
-       offset = 0;
+       if (cfg->arch.omit_fp) {
+               cfg->flags |= MONO_CFG_HAS_SPILLUP;
+               cfg->frame_reg = AMD64_RSP;
+               offset = 0;
+       } else {
+               /* Locals are allocated backwards from %fp */
+               cfg->frame_reg = AMD64_RBP;
+               offset = 0;
+       }
+
+       cfg->arch.reg_save_area_offset = offset;
 
        /* Reserve space for caller saved registers */
        for (i = 0; i < AMD64_NREG; ++i)
-               if (AMD64_IS_CALLEE_SAVED_REG (i) && (m->used_int_regs & (1 << i))) {
+               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                        offset += sizeof (gpointer);
                }
 
-       if (m->method->save_lmf) {
+       if (cfg->method->save_lmf) {
                /* Reserve stack space for saving LMF + argument regs */
-               offset += sizeof (MonoLMF);
+               guint32 size = sizeof (MonoLMF);
+
                if (lmf_tls_offset == -1)
                        /* Need to save argument regs too */
-                       offset += (AMD64_NREG * 8) + (8 * 8);
-               m->arch.lmf_offset = offset;
+                       size += (AMD64_NREG * 8) + (8 * 8);
+
+               if (cfg->arch.omit_fp) {
+                       cfg->arch.lmf_offset = offset;
+                       offset += size;
+               }
+               else {
+                       offset += size;
+                       cfg->arch.lmf_offset = -offset;
+               }
        }
 
        if (sig->ret->type != MONO_TYPE_VOID) {
@@ -833,41 +952,50 @@ mono_arch_allocate_vars (MonoCompile *m)
                case ArgInDoubleSSEReg:
                        if ((MONO_TYPE_ISSTRUCT (sig->ret) && !mono_class_from_mono_type (sig->ret)->enumtype) || (sig->ret->type == MONO_TYPE_TYPEDBYREF)) {
                                /* The register is volatile */
-                               m->ret->opcode = OP_REGOFFSET;
-                               m->ret->inst_basereg = AMD64_RBP;
-                               offset += 8;
-                               m->ret->inst_offset = - offset;
+                               cfg->ret->opcode = OP_REGOFFSET;
+                               cfg->ret->inst_basereg = cfg->frame_reg;
+                               if (cfg->arch.omit_fp) {
+                                       cfg->ret->inst_offset = offset;
+                                       offset += 8;
+                               } else {
+                                       offset += 8;
+                                       cfg->ret->inst_offset = -offset;
+                               }
                        }
                        else {
-                               m->ret->opcode = OP_REGVAR;
-                               m->ret->inst_c0 = cinfo->ret.reg;
+                               cfg->ret->opcode = OP_REGVAR;
+                               cfg->ret->inst_c0 = cinfo->ret.reg;
                        }
                        break;
                case ArgValuetypeInReg:
                        /* Allocate a local to hold the result, the epilog will copy it to the correct place */
+                       g_assert (!cfg->arch.omit_fp);
                        offset += 16;
-                       m->ret->opcode = OP_REGOFFSET;
-                       m->ret->inst_basereg = AMD64_RBP;
-                       m->ret->inst_offset = - offset;
+                       cfg->ret->opcode = OP_REGOFFSET;
+                       cfg->ret->inst_basereg = cfg->frame_reg;
+                       cfg->ret->inst_offset = - offset;
                        break;
                default:
                        g_assert_not_reached ();
                }
-               m->ret->dreg = m->ret->inst_c0;
+               cfg->ret->dreg = cfg->ret->inst_c0;
        }
 
        /* Allocate locals */
-       offsets = mono_allocate_stack_slots (m, &locals_stack_size, &locals_stack_align);
+       offsets = mono_allocate_stack_slots_full (cfg, cfg->arch.omit_fp ? FALSE: TRUE, &locals_stack_size, &locals_stack_align);
        if (locals_stack_align) {
                offset += (locals_stack_align - 1);
                offset &= ~(locals_stack_align - 1);
        }
-       for (i = m->locals_start; i < m->num_varinfo; i++) {
+       for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
                if (offsets [i] != -1) {
-                       MonoInst *inst = m->varinfo [i];
+                       MonoInst *inst = cfg->varinfo [i];
                        inst->opcode = OP_REGOFFSET;
-                       inst->inst_basereg = AMD64_RBP;
-                       inst->inst_offset = - (offset + offsets [i]);
+                       inst->inst_basereg = cfg->frame_reg;
+                       if (cfg->arch.omit_fp)
+                               inst->inst_offset = (offset + offsets [i]);
+                       else
+                               inst->inst_offset = - (offset + offsets [i]);
                        //printf ("allocated local %d to ", i); mono_print_tree_nl (inst);
                }
        }
@@ -875,12 +1003,13 @@ mono_arch_allocate_vars (MonoCompile *m)
        offset += locals_stack_size;
 
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG)) {
+               g_assert (!cfg->arch.omit_fp);
                g_assert (cinfo->sig_cookie.storage == ArgOnStack);
-               m->sig_cookie = cinfo->sig_cookie.offset + ARGS_OFFSET;
+               cfg->sig_cookie = cinfo->sig_cookie.offset + ARGS_OFFSET;
        }
 
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               inst = m->varinfo [i];
+               inst = cfg->varinfo [i];
                if (inst->opcode != OP_REGVAR) {
                        ArgInfo *ainfo = &cinfo->args [i];
                        gboolean inreg = TRUE;
@@ -913,8 +1042,9 @@ mono_arch_allocate_vars (MonoCompile *m)
                                inst->dreg = ainfo->reg;
                                break;
                        case ArgOnStack:
+                               g_assert (!cfg->arch.omit_fp);
                                inst->opcode = OP_REGOFFSET;
-                               inst->inst_basereg = AMD64_RBP;
+                               inst->inst_basereg = cfg->frame_reg;
                                inst->inst_offset = ainfo->offset + ARGS_OFFSET;
                                break;
                        case ArgValuetypeInReg:
@@ -925,18 +1055,20 @@ mono_arch_allocate_vars (MonoCompile *m)
 
                        if (!inreg && (ainfo->storage != ArgOnStack)) {
                                inst->opcode = OP_REGOFFSET;
-                               inst->inst_basereg = AMD64_RBP;
+                               inst->inst_basereg = cfg->frame_reg;
                                /* These arguments are saved to the stack in the prolog */
-                               if (ainfo->storage == ArgValuetypeInReg)
-                                       offset += 2 * sizeof (gpointer);
-                               else
-                                       offset += sizeof (gpointer);
-                               inst->inst_offset = - offset;
+                               if (cfg->arch.omit_fp) {
+                                       inst->inst_offset = offset;
+                                       offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                               } else {
+                                       offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                                       inst->inst_offset = - offset;
+                               }
                        }
                }
        }
 
-       m->stack_offset = offset;
+       cfg->stack_offset = offset;
 
        g_free (cinfo);
 }
@@ -1090,8 +1222,15 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                else
                                if (sig->pinvoke)
                                        size = mono_type_native_stack_size (&in->klass->byval_arg, &align);
-                               else
-                                       size = mono_type_stack_size (&in->klass->byval_arg, &align);
+                               else {
+                                       /* 
+                                        * Other backends use mono_type_stack_size (), but that
+                                        * aligns the size to 8, which is larger than the size of
+                                        * the source, leading to reads of invalid memory if the
+                                        * source is at the end of address space.
+                                        */
+                                       size = mono_class_value_size (in->klass, &align);
+                               }
                                if (ainfo->storage == ArgValuetypeInReg) {
                                        if (ainfo->pair_storage [1] == ArgNone) {
                                                MonoInst *load;
@@ -1235,9 +1374,14 @@ if (ins->flags & MONO_INST_BRLABEL) { \
 /* emit an exception if condition is fail */
 #define EMIT_COND_SYSTEM_EXCEPTION(cond,signed,exc_name)            \
         do {                                                        \
-               mono_add_patch_info (cfg, code - cfg->native_code,   \
-                                   MONO_PATCH_INFO_EXC, exc_name);  \
-               x86_branch32 (code, cond, 0, signed);               \
+               MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
+               if (tins == NULL) {                                                                             \
+                       mono_add_patch_info (cfg, code - cfg->native_code,   \
+                                       MONO_PATCH_INFO_EXC, exc_name);  \
+                       x86_branch32 (code, cond, 0, signed);               \
+               } else {        \
+                       EMIT_COND_BRANCH (tins, cond, signed);  \
+               }                       \
        } while (0); 
 
 #define EMIT_FPCOMPARE(code) do { \
@@ -1258,7 +1402,7 @@ emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer dat
 {
        mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
 
-       if (mono_compile_aot) {
+       if (cfg->compile_aot) {
                amd64_call_membase (code, AMD64_RIP, 0);
        }
        else {
@@ -1335,7 +1479,7 @@ emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer dat
 }
 
 /* FIXME: Add more instructions */
-#define INST_IGNORES_CFLAGS(ins) (((ins)->opcode == CEE_BR) || ((ins)->opcode == OP_STORE_MEMBASE_IMM) || ((ins)->opcode == OP_STOREI8_MEMBASE_REG) || ((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_SETREG) || ((ins)->opcode == OP_ICONST) || ((ins)->opcode == OP_I8CONST) || ((ins)->opcode == OP_LOAD_MEMBASE))
+#define INST_IGNORES_CFLAGS(ins) (((ins)->opcode == CEE_BR) || ((ins)->opcode == OP_STORE_MEMBASE_IMM) || ((ins)->opcode == OP_STOREI8_MEMBASE_REG) || ((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_ICONST) || ((ins)->opcode == OP_I8CONST) || ((ins)->opcode == OP_LOAD_MEMBASE))
 
 static void
 peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
@@ -1526,7 +1670,6 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CONV_I4:
                case CEE_CONV_U4:
                case OP_MOVE:
-               case OP_SETREG:
                        /*
                         * Removes:
                         *
@@ -1652,7 +1795,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                                temp->inst_c0 = ins->inst_imm;
                                temp->dreg = mono_regstate_next_int (cfg->rs);
                                ins->opcode = OP_STOREI8_MEMBASE_REG;
-                               ins->sreg2 = temp->dreg;
+                               ins->sreg1 = temp->dreg;
                        }
                        break;
                default:
@@ -1941,6 +2084,12 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
        
        /* This is the opposite of the code in emit_prolog */
 
+       if (sig->ret->type != MONO_TYPE_VOID) {
+               if ((cinfo->ret.storage == ArgInIReg) && (cfg->ret->opcode != OP_REGVAR)) {
+                       amd64_mov_reg_membase (code, cinfo->ret.reg, cfg->ret->inst_basereg, cfg->ret->inst_offset, 8);
+               }
+       }
+
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = cinfo->args + i;
                MonoType *arg_type;
@@ -1970,6 +2119,11 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
                                break;
                        }
                }
+               else {
+                       g_assert (ainfo->storage == ArgInIReg);
+
+                       amd64_mov_reg_reg (code, ainfo->reg, inst->dreg, 8);
+               }
        }
 
        g_free (cinfo);
@@ -2029,7 +2183,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
        if (cfg->prof_options & MONO_PROFILE_COVERAGE) {
                MonoProfileCoverageInfo *cov = cfg->coverage_info;
-               g_assert (!mono_compile_aot);
+               g_assert (!cfg->compile_aot);
                cpos += 6;
 
                cov->data [bb->dfn].cil_code = bb->cil_code;
@@ -2252,8 +2406,67 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_MUL_IMM:
                case OP_LMUL_IMM:
-                       amd64_imul_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_imm);
+               case OP_IMUL_IMM: {
+                       guint32 size = (ins->opcode == OP_IMUL_IMM) ? 4 : 8;
+                       
+                       switch (ins->inst_imm) {
+                       case 2:
+                               /* MOV r1, r2 */
+                               /* ADD r1, r1 */
+                               if (ins->dreg != ins->sreg1)
+                                       amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, size);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 3:
+                               /* LEA r1, [r2 + r2*2] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               break;
+                       case 5:
+                               /* LEA r1, [r2 + r2*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               break;
+                       case 6:
+                               /* LEA r1, [r2 + r2*2] */
+                               /* ADD r1, r1          */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 9:
+                               /* LEA r1, [r2 + r2*8] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 3);
+                               break;
+                       case 10:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* ADD r1, r1          */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 12:
+                               /* LEA r1, [r2 + r2*2] */
+                               /* SHL r1, 2           */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               amd64_shift_reg_imm (code, X86_SHL, ins->dreg, 2);
+                               break;
+                       case 25:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* LEA r1, [r1 + r1*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_lea_memindex (code, ins->dreg, ins->dreg, 0, ins->dreg, 2);
+                               break;
+                       case 100:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* SHL r1, 2           */
+                               /* LEA r1, [r1 + r1*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_shift_reg_imm (code, X86_SHL, ins->dreg, 2);
+                               amd64_lea_memindex (code, ins->dreg, ins->dreg, 0, ins->dreg, 2);
+                               break;
+                       default:
+                               amd64_imul_reg_reg_imm_size (code, ins->dreg, ins->sreg1, ins->inst_imm, size);
+                               break;
+                       }
                        break;
+               }
                case CEE_DIV:
                case OP_LDIV:
                        amd64_cdq (code);
@@ -2409,9 +2622,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_IMUL:
                        amd64_imul_reg_reg_size (code, ins->sreg1, ins->sreg2, 4);
                        break;
-               case OP_IMUL_IMM:
-                       amd64_imul_reg_reg_imm_size (code, ins->dreg, ins->sreg1, ins->inst_imm, 4);
-                       break;
                case OP_IMUL_OVF:
                        amd64_imul_reg_reg_size (code, ins->sreg1, ins->sreg2, 4);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
@@ -2512,6 +2722,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_SEXT_I2:
                        amd64_widen_reg (code, ins->dreg, ins->sreg1, TRUE, TRUE);
                        break;
+               case OP_SEXT_I4:
+                       amd64_movsxd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
                case OP_ICONST:
                case OP_I8CONST:
                        if ((((guint64)ins->inst_c0) >> 32) == 0)
@@ -2526,7 +2739,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CONV_I4:
                case CEE_CONV_U4:
                case OP_MOVE:
-               case OP_SETREG:
                        amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof (gpointer));
                        break;
                case OP_AMD64_SET_XMMREG_R4: {
@@ -2567,23 +2779,36 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        code = emit_load_volatile_arguments (cfg, code);
 
-                       for (i = 0; i < AMD64_NREG; ++i)
-                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
-                                       pos -= sizeof (gpointer);
+                       if (cfg->arch.omit_fp) {
+                               guint32 save_offset = 0;
+                               /* Pop callee-saved registers */
+                               for (i = 0; i < AMD64_NREG; ++i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                               amd64_mov_reg_membase (code, i, AMD64_RSP, save_offset, 8);
+                                               save_offset += 8;
+                                       }
+                               amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, cfg->arch.stack_alloc_size);
+                       }
+                       else {
+                               for (i = 0; i < AMD64_NREG; ++i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
+                                               pos -= sizeof (gpointer);
                        
-                       if (pos)
-                               amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+                               if (pos)
+                                       amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
 
-                       /* Pop registers in reverse order */
-                       for (i = AMD64_NREG - 1; i > 0; --i)
-                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                       amd64_pop_reg (code, i);
-                               }
+                               /* Pop registers in reverse order */
+                               for (i = AMD64_NREG - 1; i > 0; --i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                               amd64_pop_reg (code, i);
+                                       }
+
+                               amd64_leave (code);
+                       }
 
-                       amd64_leave (code);
                        offset = code - cfg->native_code;
                        mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_METHOD_JUMP, ins->inst_p0);
-                       if (mono_compile_aot)
+                       if (cfg->compile_aot)
                                amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, 8);
                        else
                                amd64_set_reg_template (code, AMD64_R11);
@@ -2595,7 +2820,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_membase_imm (code, X86_CMP, ins->sreg1, 0, 0);
                        break;
                case OP_ARGLIST: {
-                       amd64_lea_membase (code, AMD64_R11, AMD64_RBP, cfg->sig_cookie);
+                       amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, cfg->sig_cookie);
                        amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, 8);
                        break;
                }
@@ -3006,41 +3231,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        break;
                }
-               case OP_LCONV_TO_OVF_I: {
-                       guint8 *br [3], *label [1];
-
-                       if (use_sse2)
-                               g_assert_not_reached ();
-
-                       /* 
-                        * Valid ints: 0xffffffff:8000000 to 00000000:0x7f000000
-                        */
-                       amd64_test_reg_reg (code, ins->sreg1, ins->sreg1);
-
-                       /* If the low word top bit is set, see if we are negative */
-                       br [0] = code; x86_branch8 (code, X86_CC_LT, 0, TRUE);
-                       /* We are not negative (no top bit set, check for our top word to be zero */
-                       amd64_test_reg_reg (code, ins->sreg2, ins->sreg2);
-                       br [1] = code; x86_branch8 (code, X86_CC_EQ, 0, TRUE);
-                       label [0] = code;
-
-                       /* throw exception */
-                       mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_EXC, "OverflowException");
-                       x86_jump32 (code, 0);
-       
-                       amd64_patch (br [0], code);
-                       /* our top bit is set, check that top word is 0xfffffff */
-                       amd64_alu_reg_imm (code, X86_CMP, ins->sreg2, 0xffffffff);
-               
-                       amd64_patch (br [1], code);
-                       /* nope, emit exception */
-                       br [2] = code; x86_branch8 (code, X86_CC_NE, 0, TRUE);
-                       amd64_patch (br [2], label [0]);
-
-                       if (ins->dreg != ins->sreg1)
-                               amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, 4);
-                       break;
-               }
                case CEE_CONV_OVF_U4:
                        amd64_alu_reg_imm (code, X86_CMP, ins->sreg1, 0);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_LT, TRUE, "OverflowException");
@@ -3545,6 +3735,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_mov_reg_mem (code, ins->dreg, ins->inst_offset, 8);
                        break;
                }
+               case OP_MEMORY_BARRIER: {
+                       /* Not needed on amd64 */
+                       break;
+               }
                case OP_ATOMIC_ADD_I4:
                case OP_ATOMIC_ADD_I8: {
                        int dreg = ins->dreg;
@@ -3662,6 +3856,7 @@ void
 mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, MonoJumpInfo *ji, gboolean run_cctors)
 {
        MonoJumpInfo *patch_info;
+       gboolean compile_aot = !run_cctors;
 
        for (patch_info = ji; patch_info; patch_info = patch_info->next) {
                unsigned char *ip = patch_info->ip.i + code;
@@ -3669,23 +3864,14 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono
 
                target = mono_resolve_patch_target (method, domain, code, patch_info, run_cctors);
 
-               if (mono_compile_aot) {
+               if (compile_aot) {
                        switch (patch_info->type) {
                        case MONO_PATCH_INFO_BB:
                        case MONO_PATCH_INFO_LABEL:
                                break;
-                       default: {
-                               /* Just to make code run at aot time work */
-                               const unsigned char **tmp;
-
-                               mono_domain_lock (domain);
-                               tmp = mono_code_manager_reserve (domain->code_mp, sizeof (gpointer));
-                               mono_domain_unlock (domain);
-
-                               *tmp = target;
-                               target = (const unsigned char*)(guint64)((guint8*)tmp - (guint8*)ip);
-                               break;
-                       }
+                       default:
+                               /* No need to patch these */
+                               continue;
                        }
                }
 
@@ -3695,11 +3881,7 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono
                case MONO_PATCH_INFO_CLASS_INIT: {
                        /* Might already been changed to a nop */
                        guint8* ip2 = ip;
-                       if (mono_compile_aot)
-                               amd64_call_membase (ip2, AMD64_RIP, 0);
-                       else {
-                               amd64_call_code (ip2, 0);
-                       }
+                       amd64_call_code (ip2, 0);
                        break;
                }
                case MONO_PATCH_INFO_METHOD_REL:
@@ -3726,57 +3908,34 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        int alloc_size, pos, max_offset, i, quad;
        guint8 *code;
        CallInfo *cinfo;
+       gint32 lmf_offset = cfg->arch.lmf_offset;
 
        cfg->code_size =  MAX (((MonoMethodNormal *)method)->header->code_size * 4, 512);
        code = cfg->native_code = g_malloc (cfg->code_size);
 
-       amd64_push_reg (code, AMD64_RBP);
-       amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
-
-       /* Stack alignment check */
-#if 0
-       {
-               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RSP, 8);
-               amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, 0xf);
-               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0);
-               x86_branch8 (code, X86_CC_EQ, 2, FALSE);
-               amd64_breakpoint (code);
-       }
-#endif
-
-       alloc_size = ALIGN_TO (cfg->stack_offset, MONO_ARCH_FRAME_ALIGNMENT);
+       /* Amount of stack space allocated by register saving code */
        pos = 0;
 
-       if (method->save_lmf) {
-               gint32 lmf_offset;
-
-               pos = ALIGN_TO (pos + sizeof (MonoLMF), 16);
-
-               amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, pos);
-
-               lmf_offset = - cfg->arch.lmf_offset;
+       /* 
+        * The prolog consists of the following parts:
+        * FP present:
+        * - push rbp, mov rbp, rsp
+        * - save callee saved regs using pushes
+        * - allocate frame
+        * - save lmf if needed
+        * FP not present:
+        * - allocate frame
+        * - save lmf if needed
+        * - save callee saved regs using moves
+        */
 
-               /* Save ip */
-               amd64_lea_membase (code, AMD64_R11, AMD64_RIP, 0);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), AMD64_R11, 8);
-               /* Save fp */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), AMD64_RBP, 8);
-               /* Save method */
-               /* FIXME: add a relocation for this */
-               if (IS_IMM32 (cfg->method))
-                       amd64_mov_membase_imm (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), (guint64)cfg->method, 8);
-               else {
-                       amd64_mov_reg_imm (code, AMD64_R11, cfg->method);
-                       amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8);
-               }
-               /* Save callee saved regs */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
-       } else {
+       if (!cfg->arch.omit_fp) {
+               amd64_push_reg (code, AMD64_RBP);
+               amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
+       }
 
+       /* Save callee saved registers */
+       if (!cfg->arch.omit_fp && !method->save_lmf) {
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_push_reg (code, i);
@@ -3784,8 +3943,21 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        }
        }
 
+       alloc_size = ALIGN_TO (cfg->stack_offset, MONO_ARCH_FRAME_ALIGNMENT);
+
        alloc_size -= pos;
 
+       if (cfg->arch.omit_fp)
+               /* 
+                * On enter, the stack is misaligned by the pushing of the return
+                * address. It is either made aligned by the pushing of %rbp, or by
+                * this.
+                */
+               alloc_size += 8;
+
+       cfg->arch.stack_alloc_size = alloc_size;
+
+       /* Allocate stack frame */
        if (alloc_size) {
                /* See mono_emit_stack_alloc */
 #if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
@@ -3802,6 +3974,56 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 #endif
        }
 
+       /* Stack alignment check */
+#if 0
+       {
+               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RSP, 8);
+               amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, 0xf);
+               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0);
+               x86_branch8 (code, X86_CC_EQ, 2, FALSE);
+               amd64_breakpoint (code);
+       }
+#endif
+
+       /* Save LMF */
+       if (method->save_lmf) {
+               /* Save ip */
+               amd64_lea_membase (code, AMD64_R11, AMD64_RIP, 0);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), AMD64_R11, 8);
+               /* Save fp */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), AMD64_RBP, 8);
+               /* Save sp */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
+               /* Save method */
+               /* FIXME: add a relocation for this */
+               if (IS_IMM32 (cfg->method))
+                       amd64_mov_membase_imm (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), (guint64)cfg->method, 8);
+               else {
+                       amd64_mov_reg_imm (code, AMD64_R11, cfg->method);
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8);
+               }
+               /* Save callee saved regs */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
+       }
+
+       /* Save callee saved registers */
+       if (cfg->arch.omit_fp && !method->save_lmf) {
+               gint32 save_area_offset = 0;
+
+               /* Save callee saved registers after sp is adjusted */
+               /* The registers are saved at the bottom of the frame */
+               /* FIXME: Optimize this so the regs are saved at the end of the frame in increasing order */
+               for (i = 0; i < AMD64_NREG; ++i)
+                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                               amd64_mov_membase_reg (code, AMD64_RSP, save_area_offset, i, 8);
+                               save_area_offset += 8;
+                       }
+       }
+
        /* compute max_offset in order to use short forward jumps */
        max_offset = 0;
        if (cfg->opt & MONO_OPT_BRANCH) {
@@ -3917,8 +4139,6 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        }
 
        if (method->save_lmf) {
-               gint32 lmf_offset;
-
                if (lmf_tls_offset != -1) {
                        /* Load lmf quicky using the FS register */
                        x86_prefix (code, X86_FS_PREFIX);
@@ -3934,15 +4154,13 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                                                 (gpointer)"mono_get_lmf_addr");                
                }
 
-               lmf_offset = - cfg->arch.lmf_offset;
-
                /* Save lmf_addr */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
                /* Save previous_lmf */
                amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
                /* Set new lmf */
-               amd64_lea_membase (code, AMD64_R11, AMD64_RBP, lmf_offset);
+               amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
                amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8);
        }
 
@@ -3967,6 +4185,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        guint8 *code;
        int max_epilog_size = 16;
        CallInfo *cinfo;
+       gint32 lmf_offset = cfg->arch.lmf_offset;
        
        if (cfg->method->save_lmf)
                max_epilog_size += 256;
@@ -3994,51 +4213,63 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        pos = 0;
        
        if (method->save_lmf) {
-               gint32 lmf_offset = - cfg->arch.lmf_offset;
-
                /* Restore previous lmf */
-               amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
-               amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
+               amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
+               amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
                amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8);
 
                /* Restore caller saved regs */
+               if (cfg->used_int_regs & (1 << AMD64_RBP)) {
+                       amd64_mov_reg_membase (code, AMD64_RBP, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), 8);
+               }
                if (cfg->used_int_regs & (1 << AMD64_RBX)) {
-                       amd64_mov_reg_membase (code, AMD64_RBX, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), 8);
+                       amd64_mov_reg_membase (code, AMD64_RBX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R12)) {
-                       amd64_mov_reg_membase (code, AMD64_R12, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), 8);
+                       amd64_mov_reg_membase (code, AMD64_R12, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R13)) {
-                       amd64_mov_reg_membase (code, AMD64_R13, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), 8);
+                       amd64_mov_reg_membase (code, AMD64_R13, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R14)) {
-                       amd64_mov_reg_membase (code, AMD64_R14, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8);
+                       amd64_mov_reg_membase (code, AMD64_R14, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R15)) {
-                       amd64_mov_reg_membase (code, AMD64_R15, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
+                       amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
                }
        } else {
 
-               for (i = 0; i < AMD64_NREG; ++i)
-                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
-                               pos -= sizeof (gpointer);
+               if (cfg->arch.omit_fp) {
+                       gint32 save_area_offset = 0;
 
-               if (pos) {
-                       if (pos == - sizeof (gpointer)) {
-                               /* Only one register, so avoid lea */
-                               for (i = AMD64_NREG - 1; i > 0; --i)
-                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                               amd64_mov_reg_membase (code, i, AMD64_RBP, pos, 8);
-                                       }
-                       }
-                       else {
-                               amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+                       for (i = 0; i < AMD64_NREG; ++i)
+                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                       amd64_mov_reg_membase (code, i, AMD64_RSP, save_area_offset, 8);
+                                       save_area_offset += 8;
+                               }
+               }
+               else {
+                       for (i = 0; i < AMD64_NREG; ++i)
+                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
+                                       pos -= sizeof (gpointer);
 
-                               /* Pop registers in reverse order */
-                               for (i = AMD64_NREG - 1; i > 0; --i)
-                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                               amd64_pop_reg (code, i);
-                                       }
+                       if (pos) {
+                               if (pos == - sizeof (gpointer)) {
+                                       /* Only one register, so avoid lea */
+                                       for (i = AMD64_NREG - 1; i > 0; --i)
+                                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                                       amd64_mov_reg_membase (code, i, AMD64_RBP, pos, 8);
+                                               }
+                               }
+                               else {
+                                       amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+
+                                       /* Pop registers in reverse order */
+                                       for (i = AMD64_NREG - 1; i > 0; --i)
+                                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                                       amd64_pop_reg (code, i);
+                                               }
+                               }
                        }
                }
        }
@@ -4069,13 +4300,26 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        }
        g_free (cinfo);
 
-       amd64_leave (code);
+       if (cfg->arch.omit_fp) {
+               if (cfg->arch.stack_alloc_size)
+                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, cfg->arch.stack_alloc_size);
+       } else {
+               amd64_leave (code);
+       }
        amd64_ret (code);
 
        cfg->code_len = code - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
 
+       if (cfg->arch.omit_fp) {
+               /* 
+                * Encode the stack size into used_int_regs so the exception handler
+                * can access it.
+                */
+               g_assert (cfg->arch.stack_alloc_size < (1 << 16));
+               cfg->used_int_regs |= (1 << 31) | (cfg->arch.stack_alloc_size << 16);
+       }
 }
 
 void
@@ -4146,7 +4390,7 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                                patch_info->type = MONO_PATCH_INFO_INTERNAL_METHOD;
                                patch_info->ip.i = code - cfg->native_code;
 
-                               if (mono_compile_aot) {
+                               if (cfg->compile_aot) {
                                        amd64_mov_reg_membase (code, GP_SCRATCH_REG, AMD64_RIP, 0, 8);
                                        amd64_call_reg (code, GP_SCRATCH_REG);
                                } else {
@@ -4239,7 +4483,7 @@ void*
 mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments)
 {
        guchar *code = p;
-       CallInfo *cinfo;
+       CallInfo *cinfo = NULL;
        MonoMethodSignature *sig;
        MonoInst *inst;
        int i, n, stack_area = 0;
@@ -4472,7 +4716,8 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
         */
        if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
                /* call OFFSET(%rip) */
-               return NULL;
+               disp = *(guint32*)(code + 3);
+               return (gpointer*)(code + disp + 7);
        }
        else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) {
                /* call *[reg+disp32] */
@@ -4549,149 +4794,30 @@ mono_arch_get_delegate_method_ptr_addr (guint8* code, gpointer *regs)
 
 static gboolean tls_offset_inited = FALSE;
 
-/* code should be simply return <tls var>; */
-static int 
-read_tls_offset_from_method (void* method)
-{
-       guint8 *code = (guint8*)method;
-
-       /* 
-        * Determine the offset of mono_lfm_addr inside the TLS structures
-        * by disassembling the function above.
-        */
-       /* This is generated by gcc 3.3.2 */
-       if ((code [0] == 0x55) && (code [1] == 0x48) && (code [2] == 0x89) &&
-               (code [3] == 0xe5) && (code [4] == 0x64) && (code [5] == 0x48) &&
-               (code [6] == 0x8b) && (code [7] == 0x04) && (code [8] == 0x25) &&
-               (code [9] == 0x00) && (code [10] == 0x00) && (code [11] == 0x00) &&
-               (code [12] == 0x0) && (code [13] == 0x48) && (code [14] == 0x8b) &&
-               (code [15] == 0x80)) {
-               return *(gint32*)&(code [16]);
-       } else if
-               /* This is generated by gcc-3.3.2 with -O=2 */
-               /* mov fs:0, %rax ; mov <offset>(%rax), %rax ; retq */
-               ((code [0] == 0x64) && (code [1] == 0x48) && (code [2] == 0x8b) &&
-                (code [3] == 0x04) && (code [4] == 0x25) &&
-                (code [9] == 0x48) && (code [10] == 0x8b) && (code [11] == 0x80) &&
-                (code [16] == 0xc3)) {
-                       return *(gint32*)&(code [12]);
-       } else if 
-               /* This is generated by gcc-3.4.1 */
-               ((code [0] == 0x55) && (code [1] == 0x48) && (code [2] == 0x89) &&
-                (code [3] == 0xe5) && (code [4] == 0x64) && (code [5] == 0x48) &&
-                (code [6] == 0x8b) && (code [7] == 0x04) && (code [8] == 0x25) &&
-                (code [13] == 0xc9) && (code [14] == 0xc3)) {
-                       return *(gint32*)&(code [9]);
-       } else if
-               /* This is generated by gcc-3.4.1 with -O=2 */
-               ((code [0] == 0x64) && (code [1] == 0x48) && (code [2] == 0x8b) &&
-                (code [3] == 0x04) && (code [4] == 0x25)) {
-               return *(gint32*)&(code [5]);
-       }
-
-       return -1;
-}
-
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-
-static void
-setup_stack (MonoJitTlsData *tls)
-{
-       pthread_t self = pthread_self();
-       pthread_attr_t attr;
-       size_t stsize = 0;
-       struct sigaltstack sa;
-       guint8 *staddr = NULL;
-       guint8 *current = (guint8*)&staddr;
-
-       if (mono_running_on_valgrind ())
-               return;
-
-       /* Determine stack boundaries */
-#ifdef HAVE_PTHREAD_GETATTR_NP
-       pthread_getattr_np( self, &attr );
-#else
-#ifdef HAVE_PTHREAD_ATTR_GET_NP
-       pthread_attr_get_np( self, &attr );
-#elif defined(sun)
-       pthread_attr_init( &attr );
-       pthread_attr_getstacksize( &attr, &stsize );
-#else
-#error "Not implemented"
-#endif
-#endif
-#ifndef sun
-       pthread_attr_getstack( &attr, (void**)&staddr, &stsize );
-#endif
-
-       g_assert (staddr);
-
-       g_assert ((current > staddr) && (current < staddr + stsize));
-
-       tls->end_of_stack = staddr + stsize;
-
-       /*
-        * threads created by nptl does not seem to have a guard page, and
-        * since the main thread is not created by us, we can't even set one.
-        * Increasing stsize fools the SIGSEGV signal handler into thinking this
-        * is a stack overflow exception.
-        */
-       tls->stack_size = stsize + getpagesize ();
-
-       /* Setup an alternate signal stack */
-       tls->signal_stack = mmap (0, SIGNAL_STACK_SIZE, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-       tls->signal_stack_size = SIGNAL_STACK_SIZE;
-
-       g_assert (tls->signal_stack);
-
-       sa.ss_sp = tls->signal_stack;
-       sa.ss_size = SIGNAL_STACK_SIZE;
-       sa.ss_flags = SS_ONSTACK;
-       sigaltstack (&sa, NULL);
-}
-
-#endif
-
 void
 mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
 {
        if (!tls_offset_inited) {
                tls_offset_inited = TRUE;
 
-               lmf_tls_offset = read_tls_offset_from_method (mono_get_lmf_addr);
-               appdomain_tls_offset = read_tls_offset_from_method (mono_domain_get);
-               thread_tls_offset = read_tls_offset_from_method (mono_thread_current);
+               appdomain_tls_offset = mono_domain_get_tls_offset ();
+               lmf_tls_offset = mono_get_lmf_tls_offset ();
+               thread_tls_offset = mono_thread_get_tls_offset ();
        }               
-
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-       setup_stack (tls);
-#endif
 }
 
 void
 mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
 {
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-       struct sigaltstack sa;
-
-       sa.ss_sp = tls->signal_stack;
-       sa.ss_size = SIGNAL_STACK_SIZE;
-       sa.ss_flags = SS_DISABLE;
-       sigaltstack  (&sa, NULL);
-
-       if (tls->signal_stack)
-               munmap (tls->signal_stack, SIGNAL_STACK_SIZE);
-#endif
 }
 
 void
 mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
 {
        MonoCallInst *call = (MonoCallInst*)inst;
-       int out_reg = param_regs [0];
+       CallInfo * cinfo = get_call_info (inst->signature, FALSE);
 
        if (vt_reg != -1) {
-               CallInfo * cinfo = get_call_info (inst->signature, FALSE);
                MonoInst *vtarg;
 
                if (cinfo->ret.storage == ArgValuetypeInReg) {
@@ -4708,30 +4834,28 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
                        MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
                }
                else {
-                       MONO_INST_NEW (cfg, vtarg, OP_SETREG);
+                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
                        vtarg->sreg1 = vt_reg;
                        vtarg->dreg = mono_regstate_next_int (cfg->rs);
                        mono_bblock_add_inst (cfg->cbb, vtarg);
 
-                       mono_call_inst_add_outarg_reg (call, vtarg->dreg, out_reg, FALSE);
-
-                       out_reg = param_regs [1];
+                       mono_call_inst_add_outarg_reg (call, vtarg->dreg, cinfo->ret.reg, FALSE);
                }
-
-               g_free (cinfo);
        }
 
        /* add the this argument */
        if (this_reg != -1) {
                MonoInst *this;
-               MONO_INST_NEW (cfg, this, OP_SETREG);
+               MONO_INST_NEW (cfg, this, OP_MOVE);
                this->type = this_type;
                this->sreg1 = this_reg;
                this->dreg = mono_regstate_next_int (cfg->rs);
                mono_bblock_add_inst (cfg->cbb, this);
 
-               mono_call_inst_add_outarg_reg (call, this->dreg, out_reg, FALSE);
+               mono_call_inst_add_outarg_reg (call, this->dreg, cinfo->args [0].reg, FALSE);
        }
+
+       g_free (cinfo);
 }
 
 MonoInst*
@@ -4771,6 +4895,9 @@ mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethod
                        ins->inst_i1 = args [1];
                }
 #endif
+       } else if (cmethod->klass == mono_defaults.thread_class &&
+                          strcmp (cmethod->name, "MemoryBarrier") == 0) {
+               MONO_INST_NEW (cfg, ins, OP_MEMORY_BARRIER);
        } else if(cmethod->klass->image == mono_defaults.corlib &&
                           (strcmp (cmethod->klass->name_space, "System.Threading") == 0) &&
                           (strcmp (cmethod->klass->name, "Interlocked") == 0)) {