2005-12-12 Zoltan Varga <vargaz@gmail.com>
diff --git a/mono/mini/mini-amd64.c b/mono/mini/mini-amd64.c
index 19fe4ef53dbaafe082672e8d3c52e91889ebaeda..b4a530e6c04326f327817a375981bb294409c11c 100644
--- a/mono/mini/mini-amd64.c
+++ b/mono/mini/mini-amd64.c
@@ -18,6 +18,7 @@
 #include <mono/metadata/debug-helpers.h>
 #include <mono/metadata/threads.h>
 #include <mono/metadata/profiler-private.h>
+#include <mono/metadata/mono-debug.h>
 #include <mono/utils/mono-math.h>
 
 #include "trace.h"
@@ -104,6 +105,41 @@ mono_arch_fregname (int reg)
                return "unknown";
 }
 
+G_GNUC_UNUSED static void
+break_count (void)
+{
+}
+
+G_GNUC_UNUSED static gboolean
+debug_count (void)
+{
+       static int count = 0;
+       count ++;
+
+       if (!getenv ("COUNT"))
+               return TRUE;
+
+       if (count == atoi (getenv ("COUNT"))) {
+               break_count ();
+       }
+
+       if (count > atoi (getenv ("COUNT"))) {
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
+static gboolean
+debug_omit_fp (void)
+{
+#if 0
+       return debug_count ();
+#else
+       return TRUE;
+#endif
+}
+
 static inline void 
 amd64_patch (unsigned char* code, gpointer target)
 {
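
The COUNT helpers above implement a bisection aid: an optimization guarded
by debug_count () is applied only to the first COUNT compiled methods, and
break_count () exists solely as a native breakpoint anchor for the moment
the limit is reached. A minimal standalone sketch of the idiom (illustrative
only, not part of the commit):

#include <stdio.h>
#include <stdlib.h>

static int count = 0;

/* Returns 1 while the static counter is at or below the limit given in the
 * COUNT environment variable, 0 afterwards; unlimited if COUNT is unset. */
static int
debug_count (void)
{
	const char *limit = getenv ("COUNT");

	count ++;
	if (!limit)
		return 1;
	return count <= atoi (limit);
}

int
main (void)
{
	/* Pretend to compile ten methods; rerunning with COUNT=5, COUNT=2, ...
	 * bisects to the first method the optimization breaks. */
	int i;

	for (i = 1; i <= 10; ++i)
		printf ("method %d: optimization %s\n", i,
			debug_count () ? "on" : "off");
	return 0;
}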
@@ -745,11 +781,79 @@ mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
        return vars;
 }
 
+/**
+ * mono_arch_compute_omit_fp:
+ *
+ *   Determine whether the frame pointer can be eliminated.
+ */
+static void
+mono_arch_compute_omit_fp (MonoCompile *cfg)
+{
+       MonoMethodSignature *sig;
+       MonoMethodHeader *header;
+       int i;
+       CallInfo *cinfo;
+
+       if (cfg->arch.omit_fp_computed)
+               return;
+
+       header = mono_method_get_header (cfg->method);
+
+       sig = mono_method_signature (cfg->method);
+
+       cinfo = get_call_info (sig, FALSE);
+
+       /*
+        * FIXME: Remove some of the restrictions.
+        */
+       cfg->arch.omit_fp = TRUE;
+       cfg->arch.omit_fp_computed = TRUE;
+
+       /* Temporarily disable FP elimination when running inside the debugger
+        * until the debugger gains support for it. */
+       if (mono_debug_using_mono_debugger ())
+               cfg->arch.omit_fp = FALSE;
+
+       if (!debug_omit_fp ())
+               cfg->arch.omit_fp = FALSE;
+       /*
+       if (cfg->method->save_lmf)
+               cfg->arch.omit_fp = FALSE;
+       */
+       if (cfg->flags & MONO_CFG_HAS_ALLOCA)
+               cfg->arch.omit_fp = FALSE;
+       if (header->num_clauses)
+               cfg->arch.omit_fp = FALSE;
+       if (cfg->param_area)
+               cfg->arch.omit_fp = FALSE;
+       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG))
+               cfg->arch.omit_fp = FALSE;
+       if ((mono_jit_trace_calls != NULL && mono_trace_eval (cfg->method)) ||
+               (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE))
+               cfg->arch.omit_fp = FALSE;
+       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
+               ArgInfo *ainfo = &cinfo->args [i];
+
+               if (ainfo->storage == ArgOnStack) {
+                       /* 
+                        * The stack offset can only be determined when the frame
+                        * size is known.
+                        */
+                       cfg->arch.omit_fp = FALSE;
+               }
+       }
+}
+
 GList *
 mono_arch_get_global_int_regs (MonoCompile *cfg)
 {
        GList *regs = NULL;
 
+       mono_arch_compute_omit_fp (cfg);
+
+       if (cfg->arch.omit_fp)
+               regs = g_list_prepend (regs, (gpointer)AMD64_RBP);
+
        /* We use the callee saved registers for global allocation */
        regs = g_list_prepend (regs, (gpointer)AMD64_RBX);
        regs = g_list_prepend (regs, (gpointer)AMD64_R12);
@@ -782,7 +886,7 @@ mono_arch_regalloc_cost (MonoCompile *cfg, MonoMethodVar *vmv)
 }
  
 void
-mono_arch_allocate_vars (MonoCompile *m)
+mono_arch_allocate_vars (MonoCompile *cfg)
 {
        MonoMethodSignature *sig;
        MonoMethodHeader *header;
@@ -792,34 +896,53 @@ mono_arch_allocate_vars (MonoCompile *m)
        gint32 *offsets;
        CallInfo *cinfo;
 
-       header = mono_method_get_header (m->method);
+       header = mono_method_get_header (cfg->method);
 
-       sig = mono_method_signature (m->method);
+       sig = mono_method_signature (cfg->method);
 
        cinfo = get_call_info (sig, FALSE);
 
+       mono_arch_compute_omit_fp (cfg);
+
        /*
         * We use the ABI calling conventions for managed code as well.
         * Exception: valuetypes are never passed or returned in registers.
         */
 
-       /* Locals are allocated backwards from %fp */
-       m->frame_reg = AMD64_RBP;
-       offset = 0;
+       if (cfg->arch.omit_fp) {
+               cfg->flags |= MONO_CFG_HAS_SPILLUP;
+               cfg->frame_reg = AMD64_RSP;
+               offset = 0;
+       } else {
+               /* Locals are allocated backwards from %fp */
+               cfg->frame_reg = AMD64_RBP;
+               offset = 0;
+       }
+
+       cfg->arch.reg_save_area_offset = offset;
 
        /* Reserve space for callee saved registers */
        for (i = 0; i < AMD64_NREG; ++i)
-               if (AMD64_IS_CALLEE_SAVED_REG (i) && (m->used_int_regs & (1 << i))) {
+               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                        offset += sizeof (gpointer);
                }
 
-       if (m->method->save_lmf) {
+       if (cfg->method->save_lmf) {
                /* Reserve stack space for saving LMF + argument regs */
-               offset += sizeof (MonoLMF);
+               guint32 size = sizeof (MonoLMF);
+
                if (lmf_tls_offset == -1)
                        /* Need to save argument regs too */
-                       offset += (AMD64_NREG * 8) + (8 * 8);
-               m->arch.lmf_offset = offset;
+                       size += (AMD64_NREG * 8) + (8 * 8);
+
+               if (cfg->arch.omit_fp) {
+                       cfg->arch.lmf_offset = offset;
+                       offset += size;
+               }
+               else {
+                       offset += size;
+                       cfg->arch.lmf_offset = -offset;
+               }
        }
 
        if (sig->ret->type != MONO_TYPE_VOID) {
@@ -829,41 +952,50 @@ mono_arch_allocate_vars (MonoCompile *m)
                case ArgInDoubleSSEReg:
                        if ((MONO_TYPE_ISSTRUCT (sig->ret) && !mono_class_from_mono_type (sig->ret)->enumtype) || (sig->ret->type == MONO_TYPE_TYPEDBYREF)) {
                                /* The register is volatile */
-                               m->ret->opcode = OP_REGOFFSET;
-                               m->ret->inst_basereg = AMD64_RBP;
-                               offset += 8;
-                               m->ret->inst_offset = - offset;
+                               cfg->ret->opcode = OP_REGOFFSET;
+                               cfg->ret->inst_basereg = cfg->frame_reg;
+                               if (cfg->arch.omit_fp) {
+                                       cfg->ret->inst_offset = offset;
+                                       offset += 8;
+                               } else {
+                                       offset += 8;
+                                       cfg->ret->inst_offset = -offset;
+                               }
                        }
                        else {
-                               m->ret->opcode = OP_REGVAR;
-                               m->ret->inst_c0 = cinfo->ret.reg;
+                               cfg->ret->opcode = OP_REGVAR;
+                               cfg->ret->inst_c0 = cinfo->ret.reg;
                        }
                        break;
                case ArgValuetypeInReg:
                        /* Allocate a local to hold the result, the epilog will copy it to the correct place */
+                       g_assert (!cfg->arch.omit_fp);
                        offset += 16;
-                       m->ret->opcode = OP_REGOFFSET;
-                       m->ret->inst_basereg = AMD64_RBP;
-                       m->ret->inst_offset = - offset;
+                       cfg->ret->opcode = OP_REGOFFSET;
+                       cfg->ret->inst_basereg = cfg->frame_reg;
+                       cfg->ret->inst_offset = - offset;
                        break;
                default:
                        g_assert_not_reached ();
                }
-               m->ret->dreg = m->ret->inst_c0;
+               cfg->ret->dreg = cfg->ret->inst_c0;
        }
 
        /* Allocate locals */
-       offsets = mono_allocate_stack_slots (m, &locals_stack_size, &locals_stack_align);
+       offsets = mono_allocate_stack_slots_full (cfg, cfg->arch.omit_fp ? FALSE : TRUE, &locals_stack_size, &locals_stack_align);
        if (locals_stack_align) {
                offset += (locals_stack_align - 1);
                offset &= ~(locals_stack_align - 1);
        }
-       for (i = m->locals_start; i < m->num_varinfo; i++) {
+       for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
                if (offsets [i] != -1) {
-                       MonoInst *inst = m->varinfo [i];
+                       MonoInst *inst = cfg->varinfo [i];
                        inst->opcode = OP_REGOFFSET;
-                       inst->inst_basereg = AMD64_RBP;
-                       inst->inst_offset = - (offset + offsets [i]);
+                       inst->inst_basereg = cfg->frame_reg;
+                       if (cfg->arch.omit_fp)
+                               inst->inst_offset = (offset + offsets [i]);
+                       else
+                               inst->inst_offset = - (offset + offsets [i]);
                        //printf ("allocated local %d to ", i); mono_print_tree_nl (inst);
                }
        }
@@ -871,12 +1003,13 @@ mono_arch_allocate_vars (MonoCompile *m)
        offset += locals_stack_size;
 
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG)) {
+               g_assert (!cfg->arch.omit_fp);
                g_assert (cinfo->sig_cookie.storage == ArgOnStack);
-               m->sig_cookie = cinfo->sig_cookie.offset + ARGS_OFFSET;
+               cfg->sig_cookie = cinfo->sig_cookie.offset + ARGS_OFFSET;
        }
 
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               inst = m->varinfo [i];
+               inst = cfg->varinfo [i];
                if (inst->opcode != OP_REGVAR) {
                        ArgInfo *ainfo = &cinfo->args [i];
                        gboolean inreg = TRUE;
@@ -909,8 +1042,9 @@ mono_arch_allocate_vars (MonoCompile *m)
                                inst->dreg = ainfo->reg;
                                break;
                        case ArgOnStack:
+                               g_assert (!cfg->arch.omit_fp);
                                inst->opcode = OP_REGOFFSET;
-                               inst->inst_basereg = AMD64_RBP;
+                               inst->inst_basereg = cfg->frame_reg;
                                inst->inst_offset = ainfo->offset + ARGS_OFFSET;
                                break;
                        case ArgValuetypeInReg:
@@ -921,18 +1055,20 @@ mono_arch_allocate_vars (MonoCompile *m)
 
                        if (!inreg && (ainfo->storage != ArgOnStack)) {
                                inst->opcode = OP_REGOFFSET;
-                               inst->inst_basereg = AMD64_RBP;
+                               inst->inst_basereg = cfg->frame_reg;
                                /* These arguments are saved to the stack in the prolog */
-                               if (ainfo->storage == ArgValuetypeInReg)
-                                       offset += 2 * sizeof (gpointer);
-                               else
-                                       offset += sizeof (gpointer);
-                               inst->inst_offset = - offset;
+                               if (cfg->arch.omit_fp) {
+                                       inst->inst_offset = offset;
+                                       offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                               } else {
+                                       offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                                       inst->inst_offset = - offset;
+                               }
                        }
                }
        }
 
-       m->stack_offset = offset;
+       cfg->stack_offset = offset;
 
        g_free (cinfo);
 }
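
mono_arch_allocate_vars now supports two frame layouts: with a frame pointer,
slots are reserved first and addressed at negative offsets from %rbp; with
omit_fp, the frame grows upwards from %rsp (MONO_CFG_HAS_SPILLUP), so a slot's
address is the current cursor and the cursor advances afterwards. A sketch of
the two allocation directions (illustrative only):

#include <stdio.h>

/* FP-based frame: reserve the slot first, address it below %rbp. */
static int
alloc_down (int *offset, int size)
{
	*offset += size;
	return - *offset;		/* e.g. -8(%rbp) */
}

/* FP-less frame: the current offset is the address, then advance. */
static int
alloc_up (int *offset, int size)
{
	int slot = *offset;		/* e.g. 8(%rsp) */
	*offset += size;
	return slot;
}

int
main (void)
{
	int down = 0, up = 0, i;

	for (i = 0; i < 3; ++i)
		printf ("slot %d: fp %d(%%rbp), no-fp %d(%%rsp)\n",
			i, alloc_down (&down, 8), alloc_up (&up, 8));
	return 0;
}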
@@ -1086,8 +1222,15 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                else
                                if (sig->pinvoke)
                                        size = mono_type_native_stack_size (&in->klass->byval_arg, &align);
-                               else
-                                       size = mono_type_stack_size (&in->klass->byval_arg, &align);
+                               else {
+                                       /* 
+                                        * Other backends use mono_type_stack_size (), but that
+                                        * aligns the size to 8, which is larger than the size of
+                                        * the source, leading to reads of invalid memory if the
+                                        * source is at the end of the address space.
+                                        */
+                                       size = mono_class_value_size (in->klass, &align);
+                               }
                                if (ainfo->storage == ArgValuetypeInReg) {
                                        if (ainfo->pair_storage [1] == ArgNone) {
                                                MonoInst *load;
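
The sizing comment above captures a subtle over-read: mono_type_stack_size ()
rounds the size up to 8 bytes, so copying a smaller valuetype with the padded
size can read past the end of the object. A hedged illustration (hypothetical
types, not from the commit):

#include <string.h>

typedef struct {
	char b[5];			/* hypothetical 5-byte valuetype */
} Five;

void
copy_arg (void *dst, const Five *src)
{
	/* Safe: copy exactly the value size, which is what the
	 * mono_class_value_size () based sizing permits. */
	memcpy (dst, src, sizeof (Five));

	/* Hazardous: copying the 8-aligned stack size reads 3 bytes past the
	 * object; if src sits at the very end of the last mapped page, the
	 * extra bytes fault. */
	/* memcpy (dst, src, 8); */
}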
@@ -1231,9 +1374,14 @@ if (ins->flags & MONO_INST_BRLABEL) { \
 /* emit an exception if condition is fail */
 #define EMIT_COND_SYSTEM_EXCEPTION(cond,signed,exc_name)            \
         do {                                                        \
-               mono_add_patch_info (cfg, code - cfg->native_code,   \
-                                   MONO_PATCH_INFO_EXC, exc_name);  \
-               x86_branch32 (code, cond, 0, signed);               \
+               MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
+               if (tins == NULL) {                                                                             \
+                       mono_add_patch_info (cfg, code - cfg->native_code,   \
+                                       MONO_PATCH_INFO_EXC, exc_name);  \
+                       x86_branch32 (code, cond, 0, signed);               \
+               } else {        \
+                       EMIT_COND_BRANCH (tins, cond, signed);  \
+               }                       \
        } while (0); 
 
 #define EMIT_FPCOMPARE(code) do { \
@@ -1936,6 +2084,12 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
        
        /* This is the opposite of the code in emit_prolog */
 
+       if (sig->ret->type != MONO_TYPE_VOID) {
+               if ((cinfo->ret.storage == ArgInIReg) && (cfg->ret->opcode != OP_REGVAR)) {
+                       amd64_mov_reg_membase (code, cinfo->ret.reg, cfg->ret->inst_basereg, cfg->ret->inst_offset, 8);
+               }
+       }
+
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = cinfo->args + i;
                MonoType *arg_type;
@@ -1965,6 +2119,11 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
                                break;
                        }
                }
+               else {
+                       g_assert (ainfo->storage == ArgInIReg);
+
+                       amd64_mov_reg_reg (code, ainfo->reg, inst->dreg, 8);
+               }
        }
 
        g_free (cinfo);
@@ -2247,8 +2406,67 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_MUL_IMM:
                case OP_LMUL_IMM:
-                       amd64_imul_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_imm);
+               case OP_IMUL_IMM: {
+                       guint32 size = (ins->opcode == OP_IMUL_IMM) ? 4 : 8;
+                       
+                       switch (ins->inst_imm) {
+                       case 2:
+                               /* MOV r1, r2 */
+                               /* ADD r1, r1 */
+                               if (ins->dreg != ins->sreg1)
+                                       amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, size);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 3:
+                               /* LEA r1, [r2 + r2*2] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               break;
+                       case 5:
+                               /* LEA r1, [r2 + r2*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               break;
+                       case 6:
+                               /* LEA r1, [r2 + r2*2] */
+                               /* ADD r1, r1          */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 9:
+                               /* LEA r1, [r2 + r2*8] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 3);
+                               break;
+                       case 10:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* ADD r1, r1          */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
+                               break;
+                       case 12:
+                               /* LEA r1, [r2 + r2*2] */
+                               /* SHL r1, 2           */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
+                               amd64_shift_reg_imm (code, X86_SHL, ins->dreg, 2);
+                               break;
+                       case 25:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* LEA r1, [r1 + r1*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_lea_memindex (code, ins->dreg, ins->dreg, 0, ins->dreg, 2);
+                               break;
+                       case 100:
+                               /* LEA r1, [r2 + r2*4] */
+                               /* SHL r1, 2           */
+                               /* LEA r1, [r1 + r1*4] */
+                               amd64_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
+                               amd64_shift_reg_imm (code, X86_SHL, ins->dreg, 2);
+                               amd64_lea_memindex (code, ins->dreg, ins->dreg, 0, ins->dreg, 2);
+                               break;
+                       default:
+                               amd64_imul_reg_reg_imm_size (code, ins->dreg, ins->sreg1, ins->inst_imm, size);
+                               break;
+                       }
                        break;
+               }
                case CEE_DIV:
                case OP_LDIV:
                        amd64_cdq (code);
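
The new OP_MUL_IMM cases above strength-reduce small constant multipliers
into LEA/ADD/SHL sequences instead of IMUL. The identities are easy to check;
a quick self-test, with plain arithmetic standing in for LEA (illustrative
only):

#include <assert.h>

int
main (void)
{
	long x, t;

	for (x = -1000; x <= 1000; ++x) {
		assert (x * 3 == x + x * 2);		/* LEA [r + r*2] */
		assert (x * 5 == x + x * 4);		/* LEA [r + r*4] */
		assert (x * 9 == x + x * 8);		/* LEA [r + r*8] */
		assert (x * 12 == (x + x * 2) * 4);	/* LEA, then SHL 2 */

		t = x + x * 4;				/* x * 5 */
		assert (x * 25 == t + t * 4);		/* LEA, LEA */

		t = (x + x * 4) * 4;			/* x * 20: LEA, SHL 2 */
		assert (x * 100 == t + t * 4);		/* ... then LEA */
	}
	return 0;
}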
@@ -2404,9 +2622,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_IMUL:
                        amd64_imul_reg_reg_size (code, ins->sreg1, ins->sreg2, 4);
                        break;
-               case OP_IMUL_IMM:
-                       amd64_imul_reg_reg_imm_size (code, ins->dreg, ins->sreg1, ins->inst_imm, 4);
-                       break;
                case OP_IMUL_OVF:
                        amd64_imul_reg_reg_size (code, ins->sreg1, ins->sreg2, 4);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
@@ -2564,20 +2779,33 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        code = emit_load_volatile_arguments (cfg, code);
 
-                       for (i = 0; i < AMD64_NREG; ++i)
-                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
-                                       pos -= sizeof (gpointer);
+                       if (cfg->arch.omit_fp) {
+                               guint32 save_offset = 0;
+                               /* Pop callee-saved registers */
+                               for (i = 0; i < AMD64_NREG; ++i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                               amd64_mov_reg_membase (code, i, AMD64_RSP, save_offset, 8);
+                                               save_offset += 8;
+                                       }
+                               amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, cfg->arch.stack_alloc_size);
+                       }
+                       else {
+                               for (i = 0; i < AMD64_NREG; ++i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
+                                               pos -= sizeof (gpointer);
                        
-                       if (pos)
-                               amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+                               if (pos)
+                                       amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
 
-                       /* Pop registers in reverse order */
-                       for (i = AMD64_NREG - 1; i > 0; --i)
-                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                       amd64_pop_reg (code, i);
-                               }
+                               /* Pop registers in reverse order */
+                               for (i = AMD64_NREG - 1; i > 0; --i)
+                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                               amd64_pop_reg (code, i);
+                                       }
+
+                               amd64_leave (code);
+                       }
 
-                       amd64_leave (code);
                        offset = code - cfg->native_code;
                        mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_METHOD_JUMP, ins->inst_p0);
                        if (cfg->compile_aot)
@@ -2592,7 +2820,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_membase_imm (code, X86_CMP, ins->sreg1, 0, 0);
                        break;
                case OP_ARGLIST: {
-                       amd64_lea_membase (code, AMD64_R11, AMD64_RBP, cfg->sig_cookie);
+                       amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, cfg->sig_cookie);
                        amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, 8);
                        break;
                }
@@ -3003,41 +3231,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        break;
                }
-               case OP_LCONV_TO_OVF_I: {
-                       guint8 *br [3], *label [1];
-
-                       if (use_sse2)
-                               g_assert_not_reached ();
-
-                       /* 
-                        * Valid ints: 0xffffffff:8000000 to 00000000:0x7f000000
-                        */
-                       amd64_test_reg_reg (code, ins->sreg1, ins->sreg1);
-
-                       /* If the low word top bit is set, see if we are negative */
-                       br [0] = code; x86_branch8 (code, X86_CC_LT, 0, TRUE);
-                       /* We are not negative (no top bit set, check for our top word to be zero */
-                       amd64_test_reg_reg (code, ins->sreg2, ins->sreg2);
-                       br [1] = code; x86_branch8 (code, X86_CC_EQ, 0, TRUE);
-                       label [0] = code;
-
-                       /* throw exception */
-                       mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_EXC, "OverflowException");
-                       x86_jump32 (code, 0);
-       
-                       amd64_patch (br [0], code);
-                       /* our top bit is set, check that top word is 0xfffffff */
-                       amd64_alu_reg_imm (code, X86_CMP, ins->sreg2, 0xffffffff);
-               
-                       amd64_patch (br [1], code);
-                       /* nope, emit exception */
-                       br [2] = code; x86_branch8 (code, X86_CC_NE, 0, TRUE);
-                       amd64_patch (br [2], label [0]);
-
-                       if (ins->dreg != ins->sreg1)
-                               amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, 4);
-                       break;
-               }
                case CEE_CONV_OVF_U4:
                        amd64_alu_reg_imm (code, X86_CMP, ins->sreg1, 0);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_LT, TRUE, "OverflowException");
@@ -3715,57 +3908,34 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        int alloc_size, pos, max_offset, i, quad;
        guint8 *code;
        CallInfo *cinfo;
+       gint32 lmf_offset = cfg->arch.lmf_offset;
 
        cfg->code_size =  MAX (((MonoMethodNormal *)method)->header->code_size * 4, 512);
        code = cfg->native_code = g_malloc (cfg->code_size);
 
-       amd64_push_reg (code, AMD64_RBP);
-       amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
-
-       /* Stack alignment check */
-#if 0
-       {
-               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RSP, 8);
-               amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, 0xf);
-               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0);
-               x86_branch8 (code, X86_CC_EQ, 2, FALSE);
-               amd64_breakpoint (code);
-       }
-#endif
-
-       alloc_size = ALIGN_TO (cfg->stack_offset, MONO_ARCH_FRAME_ALIGNMENT);
+       /* Amount of stack space allocated by register saving code */
        pos = 0;
 
-       if (method->save_lmf) {
-               gint32 lmf_offset;
-
-               pos = ALIGN_TO (pos + sizeof (MonoLMF), 16);
-
-               amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, pos);
-
-               lmf_offset = - cfg->arch.lmf_offset;
+       /* 
+        * The prolog consists of the following parts:
+        * FP present:
+        * - push rbp, mov rbp, rsp
+        * - save callee saved regs using pushes
+        * - allocate frame
+        * - save lmf if needed
+        * FP not present:
+        * - allocate frame
+        * - save lmf if needed
+        * - save callee saved regs using moves
+        */
 
-               /* Save ip */
-               amd64_lea_membase (code, AMD64_R11, AMD64_RIP, 0);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), AMD64_R11, 8);
-               /* Save fp */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), AMD64_RBP, 8);
-               /* Save method */
-               /* FIXME: add a relocation for this */
-               if (IS_IMM32 (cfg->method))
-                       amd64_mov_membase_imm (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), (guint64)cfg->method, 8);
-               else {
-                       amd64_mov_reg_imm (code, AMD64_R11, cfg->method);
-                       amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8);
-               }
-               /* Save callee saved regs */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
-       } else {
+       if (!cfg->arch.omit_fp) {
+               amd64_push_reg (code, AMD64_RBP);
+               amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
+       }
 
+       /* Save callee saved registers */
+       if (!cfg->arch.omit_fp && !method->save_lmf) {
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_push_reg (code, i);
@@ -3773,8 +3943,21 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        }
        }
 
+       alloc_size = ALIGN_TO (cfg->stack_offset, MONO_ARCH_FRAME_ALIGNMENT);
+
        alloc_size -= pos;
 
+       if (cfg->arch.omit_fp)
+               /* 
+                * On entry, the stack is misaligned by the pushing of the return
+                * address. It is realigned either by the push of %rbp, or by
+                * this extra allocation.
+                */
+               alloc_size += 8;
+
+       cfg->arch.stack_alloc_size = alloc_size;
+
+       /* Allocate stack frame */
        if (alloc_size) {
                /* See mono_emit_stack_alloc */
 #if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
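
With the frame pointer omitted, the 8 bytes normally consumed by push %rbp
must be folded into the frame allocation to keep %rsp 16-byte aligned at call
sites. A sketch of the arithmetic, assuming MONO_ARCH_FRAME_ALIGNMENT is 16
as on amd64 (illustrative only):

#include <stdio.h>

#define ALIGN_TO(val,align) (((val) + ((align) - 1)) & ~((align) - 1))

int
main (void)
{
	int stack_offset = 40;	/* hypothetical locals + spill area */
	int omit_fp = 1;
	int alloc_size = ALIGN_TO (stack_offset, 16);

	/* The CALL pushed an 8-byte return address, so %rsp == 8 (mod 16) on
	 * entry. push %rbp would restore alignment; without it, the extra 8
	 * bytes come from the frame allocation itself. */
	if (omit_fp)
		alloc_size += 8;

	printf ("sub $%d, %%rsp\n", alloc_size);	/* sub $56, %rsp */
	return 0;
}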
@@ -3791,6 +3974,56 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 #endif
        }
 
+       /* Stack alignment check */
+#if 0
+       {
+               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_RSP, 8);
+               amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, 0xf);
+               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0);
+               x86_branch8 (code, X86_CC_EQ, 2, FALSE);
+               amd64_breakpoint (code);
+       }
+#endif
+
+       /* Save LMF */
+       if (method->save_lmf) {
+               /* Save ip */
+               amd64_lea_membase (code, AMD64_R11, AMD64_RIP, 0);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), AMD64_R11, 8);
+               /* Save fp */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), AMD64_RBP, 8);
+               /* Save sp */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
+               /* Save method */
+               /* FIXME: add a relocation for this */
+               if (IS_IMM32 (cfg->method))
+                       amd64_mov_membase_imm (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), (guint64)cfg->method, 8);
+               else {
+                       amd64_mov_reg_imm (code, AMD64_R11, cfg->method);
+                       amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8);
+               }
+               /* Save callee saved regs */
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
+       }
+
+       /* Save callee saved registers */
+       if (cfg->arch.omit_fp && !method->save_lmf) {
+               gint32 save_area_offset = 0;
+
+               /* Save callee saved registers after sp is adjusted */
+               /* The registers are saved at the bottom of the frame */
+               /* FIXME: Optimize this so the regs are saved at the end of the frame in increasing order */
+               for (i = 0; i < AMD64_NREG; ++i)
+                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                               amd64_mov_membase_reg (code, AMD64_RSP, save_area_offset, i, 8);
+                               save_area_offset += 8;
+                       }
+       }
+
        /* compute max_offset in order to use short forward jumps */
        max_offset = 0;
        if (cfg->opt & MONO_OPT_BRANCH) {
@@ -3906,8 +4139,6 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        }
 
        if (method->save_lmf) {
-               gint32 lmf_offset;
-
                if (lmf_tls_offset != -1) {
                        /* Load lmf quicky using the FS register */
                        x86_prefix (code, X86_FS_PREFIX);
@@ -3923,15 +4154,13 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                                                 (gpointer)"mono_get_lmf_addr");                
                }
 
-               lmf_offset = - cfg->arch.lmf_offset;
-
                /* Save lmf_addr */
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
                /* Save previous_lmf */
                amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8);
-               amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
+               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
                /* Set new lmf */
-               amd64_lea_membase (code, AMD64_R11, AMD64_RBP, lmf_offset);
+               amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
                amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8);
        }
 
@@ -3956,6 +4185,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        guint8 *code;
        int max_epilog_size = 16;
        CallInfo *cinfo;
+       gint32 lmf_offset = cfg->arch.lmf_offset;
        
        if (cfg->method->save_lmf)
                max_epilog_size += 256;
@@ -3983,51 +4213,63 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        pos = 0;
        
        if (method->save_lmf) {
-               gint32 lmf_offset = - cfg->arch.lmf_offset;
-
                /* Restore previous lmf */
-               amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
-               amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
+               amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
+               amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
                amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8);
 
                /* Restore callee saved regs */
+               if (cfg->used_int_regs & (1 << AMD64_RBP)) {
+                       amd64_mov_reg_membase (code, AMD64_RBP, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), 8);
+               }
                if (cfg->used_int_regs & (1 << AMD64_RBX)) {
-                       amd64_mov_reg_membase (code, AMD64_RBX, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), 8);
+                       amd64_mov_reg_membase (code, AMD64_RBX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R12)) {
-                       amd64_mov_reg_membase (code, AMD64_R12, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), 8);
+                       amd64_mov_reg_membase (code, AMD64_R12, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R13)) {
-                       amd64_mov_reg_membase (code, AMD64_R13, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), 8);
+                       amd64_mov_reg_membase (code, AMD64_R13, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R14)) {
-                       amd64_mov_reg_membase (code, AMD64_R14, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8);
+                       amd64_mov_reg_membase (code, AMD64_R14, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8);
                }
                if (cfg->used_int_regs & (1 << AMD64_R15)) {
-                       amd64_mov_reg_membase (code, AMD64_R15, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
+                       amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
                }
        } else {
 
-               for (i = 0; i < AMD64_NREG; ++i)
-                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
-                               pos -= sizeof (gpointer);
+               if (cfg->arch.omit_fp) {
+                       gint32 save_area_offset = 0;
 
-               if (pos) {
-                       if (pos == - sizeof (gpointer)) {
-                               /* Only one register, so avoid lea */
-                               for (i = AMD64_NREG - 1; i > 0; --i)
-                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                               amd64_mov_reg_membase (code, i, AMD64_RBP, pos, 8);
-                                       }
-                       }
-                       else {
-                               amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+                       for (i = 0; i < AMD64_NREG; ++i)
+                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                       amd64_mov_reg_membase (code, i, AMD64_RSP, save_area_offset, 8);
+                                       save_area_offset += 8;
+                               }
+               }
+               else {
+                       for (i = 0; i < AMD64_NREG; ++i)
+                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
+                                       pos -= sizeof (gpointer);
 
-                               /* Pop registers in reverse order */
-                               for (i = AMD64_NREG - 1; i > 0; --i)
-                                       if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
-                                               amd64_pop_reg (code, i);
-                                       }
+                       if (pos) {
+                               if (pos == - sizeof (gpointer)) {
+                                       /* Only one register, so avoid lea */
+                                       for (i = AMD64_NREG - 1; i > 0; --i)
+                                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                                       amd64_mov_reg_membase (code, i, AMD64_RBP, pos, 8);
+                                               }
+                               }
+                               else {
+                                       amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+
+                                       /* Pop registers in reverse order */
+                                       for (i = AMD64_NREG - 1; i > 0; --i)
+                                               if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
+                                                       amd64_pop_reg (code, i);
+                                               }
+                               }
                        }
                }
        }
@@ -4058,13 +4300,26 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        }
        g_free (cinfo);
 
-       amd64_leave (code);
+       if (cfg->arch.omit_fp) {
+               if (cfg->arch.stack_alloc_size)
+                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, cfg->arch.stack_alloc_size);
+       } else {
+               amd64_leave (code);
+       }
        amd64_ret (code);
 
        cfg->code_len = code - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
 
+       if (cfg->arch.omit_fp) {
+               /* 
+                * Encode the stack size into used_int_regs so the exception handler
+                * can access it.
+                */
+               g_assert (cfg->arch.stack_alloc_size < (1 << 16));
+               cfg->used_int_regs |= (1 << 31) | (cfg->arch.stack_alloc_size << 16);
+       }
 }
 
 void
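
The final epilog step packs the frame size into cfg->used_int_regs so the
amd64 exception handling code can unwind FP-less frames: bit 31 flags an
FP-less frame and the stack allocation size is stored from bit 16 upwards.
A sketch of the encode/decode pair (the decode side is an assumption about
how the unwinder reads it back, not code from the commit):

#include <assert.h>

static unsigned int
encode_frame_info (unsigned int used_int_regs, unsigned int stack_alloc_size)
{
	/* Stay clear of the flag bit; the commit itself asserts < (1 << 16). */
	assert (stack_alloc_size < (1u << 15));
	return used_int_regs | (1u << 31) | (stack_alloc_size << 16);
}

static int
frame_omits_fp (unsigned int enc)
{
	return (enc >> 31) & 1;
}

static unsigned int
frame_stack_size (unsigned int enc)
{
	return (enc >> 16) & 0x7fff;
}

int
main (void)
{
	unsigned int enc = encode_frame_info (0x38 /* a register mask, say */, 136);

	assert (frame_omits_fp (enc));
	assert (frame_stack_size (enc) == 136);
	return 0;
}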