2010-02-15 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-amd64.c
index fc83e91e0c1e1f5cdc56e6096f6569bcc670b062..3b2033a7287b0ae8d27c6524afc1d25bc998436e 100644 (file)
 #include <mono/metadata/profiler-private.h>
 #include <mono/metadata/mono-debug.h>
 #include <mono/utils/mono-math.h>
+#include <mono/utils/mono-mmap.h>
 
 #include "trace.h"
+#include "ir-emit.h"
 #include "mini-amd64.h"
-#include "inssel.h"
 #include "cpu-amd64.h"
+#include "debugger-agent.h"
+
+/* 
+ * Can't define this in mini-amd64.h, because that would turn on the generic code in
+ * method-to-ir.c.
+ */
+#define MONO_ARCH_IMT_REG AMD64_R11
 
 static gint lmf_tls_offset = -1;
 static gint lmf_addr_tls_offset = -1;
 static gint appdomain_tls_offset = -1;
-static gint thread_tls_offset = -1;
 
 #ifdef MONO_XEN_OPT
 static gboolean optimize_for_xen = TRUE;
@@ -41,17 +48,15 @@ static gboolean optimize_for_xen = TRUE;
 #define optimize_for_xen 0
 #endif
 
-static gboolean use_sse2 = !MONO_ARCH_USE_FPSTACK;
-
 #define ALIGN_TO(val,align) ((((guint64)val) + ((align) - 1)) & ~((align) - 1))
 
 #define IS_IMM32(val) ((((guint64)val) >> 32) == 0)
 
 #define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f))
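/*
 * For illustration, how these helpers behave on concrete values:
 *
 *   ALIGN_TO (13, 8)          -> 16    (round up to the next 8-byte boundary)
 *   IS_IMM32 (0xffffffffULL)  -> TRUE  (fits in a 32-bit immediate)
 *   IS_IMM32 (0x100000000ULL) -> FALSE
 *   IS_REX (0x48)             -> TRUE  (0x40-0x4f is the REX prefix range)
 */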
 
-#ifdef PLATFORM_WIN32
-/* Under windows, the default pinvoke calling convention is stdcall */
-#define CALLCONV_IS_STDCALL(call_conv) (((call_conv) == MONO_CALL_STDCALL) || ((call_conv) == MONO_CALL_DEFAULT))
+#ifdef HOST_WIN32
+/* On Windows, the calling convention is never stdcall */
+#define CALLCONV_IS_STDCALL(call_conv) (FALSE)
 #else
 #define CALLCONV_IS_STDCALL(call_conv) ((call_conv) == MONO_CALL_STDCALL)
 #endif
@@ -64,7 +69,30 @@ static CRITICAL_SECTION mini_arch_mutex;
 MonoBreakpointInfo
 mono_breakpoint_info [MONO_BREAKPOINT_ARRAY_SIZE];
 
+/*
+ * The code generated for sequence points reads from this location, which is
+ * made unreadable when single stepping is enabled, so that the reads fault.
+ */
+static gpointer ss_trigger_page;
+
+/* Enabled breakpoints read from this trigger page */
+static gpointer bp_trigger_page;
+
+/* The size of the breakpoint sequence */
+static int breakpoint_size;
+
+/* The size of the breakpoint instruction causing the actual fault */
+static int breakpoint_fault_size;
+
+/* The size of the single step instruction causing the actual fault */
+static int single_step_fault_size;
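/*
 * A minimal sketch of how the single-step trigger page is meant to be used
 * (the helper names here are illustrative, not part of this patch): the
 * JITted code at every sequence point loads from ss_trigger_page, so
 * flipping the page protection toggles single stepping for all threads at
 * once, with the SIGSEGV handler turning each faulting load into a step event:
 *
 *   static void
 *   example_start_single_stepping (void)
 *   {
 *           mono_mprotect (ss_trigger_page, mono_pagesize (), 0);
 *   }
 *
 *   static void
 *   example_stop_single_stepping (void)
 *   {
 *           mono_mprotect (ss_trigger_page, mono_pagesize (), MONO_MMAP_READ);
 *   }
 */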
+
+#ifdef HOST_WIN32
+/* On Win64, the caller always reserves 32 bytes of shadow space for the first four arguments */
+#define ARGS_OFFSET 48
+#else
 #define ARGS_OFFSET 16
+#endif
 #define GP_SCRATCH_REG AMD64_R11
 
 /*
@@ -108,16 +136,30 @@ mono_arch_regname (int reg)
        return "unknown";
 }
 
-static const char * xmmregs [] = {
-       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8",
-       "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+static const char * packed_xmmregs [] = {
+       "p:xmm0", "p:xmm1", "p:xmm2", "p:xmm3", "p:xmm4", "p:xmm5", "p:xmm6", "p:xmm7", "p:xmm8",
+       "p:xmm9", "p:xmm10", "p:xmm11", "p:xmm12", "p:xmm13", "p:xmm14", "p:xmm15"
+};
+
+static const char * single_xmmregs [] = {
+       "s:xmm0", "s:xmm1", "s:xmm2", "s:xmm3", "s:xmm4", "s:xmm5", "s:xmm6", "s:xmm7", "s:xmm8",
+       "s:xmm9", "s:xmm10", "s:xmm11", "s:xmm12", "s:xmm13", "s:xmm14", "s:xmm15"
 };
 
 const char*
 mono_arch_fregname (int reg)
 {
        if (reg < AMD64_XMM_NREG)
-               return xmmregs [reg];
+               return single_xmmregs [reg];
+       else
+               return "unknown";
+}
+
+const char *
+mono_arch_xregname (int reg)
+{
+       if (reg < AMD64_XMM_NREG)
+               return packed_xmmregs [reg];
        else
                return "unknown";
 }
@@ -170,15 +212,19 @@ amd64_is_near_call (guint8 *code)
 static inline void 
 amd64_patch (unsigned char* code, gpointer target)
 {
+       guint8 rex = 0;
+
        /* Skip REX */
-       if ((code [0] >= 0x40) && (code [0] <= 0x4f))
+       if ((code [0] >= 0x40) && (code [0] <= 0x4f)) {
+               rex = code [0];
                code += 1;
+       }
 
        if ((code [0] & 0xf8) == 0xb8) {
                /* amd64_set_reg_template */
                *(guint64*)(code + 1) = (guint64)target;
        }
-       else if (code [0] == 0x8b) {
+       else if ((code [0] == 0x8b) && rex && x86_modrm_mod (code [1]) == 0 && x86_modrm_rm (code [1]) == 5) {
                /* mov 0(%rip), %dreg */
                *(guint32*)(code + 2) = (guint32)(guint64)target - 7;
        }
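/*
 * The tightened check above only patches the RIP-relative form of the load:
 * a REX prefix must be present and the ModRM byte must encode mod == 0,
 * rm == 5, which on amd64 means "mov disp32(%rip), %dreg". For reference,
 * the ModRM accessors extract the standard bit fields (a sketch; the real
 * macros live in the codegen headers):
 *
 *   mod = (modrm >> 6) & 0x3;    bits 7-6
 *   reg = (modrm >> 3) & 0x7;    bits 5-3
 *   rm  =  modrm       & 0x7;    bits 2-0
 */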
@@ -208,6 +254,7 @@ typedef enum {
        ArgInDoubleSSEReg,
        ArgOnStack,
        ArgValuetypeInReg,
+       ArgValuetypeAddrInIReg,
        ArgNone /* only in pair_storage */
 } ArgStorage;
 
@@ -227,6 +274,7 @@ typedef struct {
        guint32 reg_usage;
        guint32 freg_usage;
        gboolean need_stack_align;
+       gboolean vtype_retaddr;
        ArgInfo ret;
        ArgInfo sig_cookie;
        ArgInfo args [1];
@@ -234,14 +282,7 @@ typedef struct {
 
 #define DEBUG(a) if (cfg->verbose_level > 1) a
 
-#define NEW_ICONST(cfg,dest,val) do {  \
-               (dest) = mono_mempool_alloc0 ((cfg)->mempool, sizeof (MonoInst));       \
-               (dest)->opcode = OP_ICONST;     \
-               (dest)->inst_c0 = (val);        \
-               (dest)->type = STACK_I4;        \
-       } while (0)
-
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
 #define PARAM_REGS 4
 
 static AMD64_Reg_No param_regs [] = { AMD64_RCX, AMD64_RDX, AMD64_R8, AMD64_R9 };
@@ -271,7 +312,7 @@ add_general (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo)
     }
 }
 
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
 #define FLOAT_PARAM_REGS 4
 #else
 #define FLOAT_PARAM_REGS 8
@@ -310,7 +351,7 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1)
        ArgumentClass class2 = ARG_CLASS_NO_CLASS;
        MonoType *ptype;
 
-       ptype = mono_type_get_underlying_type (type);
+       ptype = mini_type_get_underlying_type (NULL, type);
        switch (ptype->type) {
        case MONO_TYPE_BOOLEAN:
        case MONO_TYPE_CHAR:
@@ -335,7 +376,11 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1)
                break;
        case MONO_TYPE_R4:
        case MONO_TYPE_R8:
+#ifdef HOST_WIN32
+               class2 = ARG_CLASS_INTEGER;
+#else
                class2 = ARG_CLASS_SSE;
+#endif
                break;
 
        case MONO_TYPE_TYPEDBYREF:
@@ -378,21 +423,39 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1)
 
 static void
 add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type,
-              gboolean is_return,
-              guint32 *gr, guint32 *fr, guint32 *stack_size)
+                          gboolean is_return,
+                          guint32 *gr, guint32 *fr, guint32 *stack_size)
 {
        guint32 size, quad, nquads, i;
        ArgumentClass args [2];
-       MonoMarshalType *info;
+       MonoMarshalType *info = NULL;
        MonoClass *klass;
+       MonoGenericSharingContext tmp_gsctx;
+       gboolean pass_on_stack = FALSE;
+       
+       /* 
+        * The gsctx currently contains no data; it is only used for checking whether
+        * open types are allowed. Some callers, like mono_arch_get_argument_info (),
+        * don't pass it to us, so work around that.
+        */
+       if (!gsctx)
+               gsctx = &tmp_gsctx;
 
        klass = mono_class_from_mono_type (type);
-       if (sig->pinvoke) 
-               size = mono_type_native_stack_size (&klass->byval_arg, NULL);
-       else 
-               size = mini_type_stack_size (gsctx, &klass->byval_arg, NULL);
+       size = mini_type_stack_size_full (gsctx, &klass->byval_arg, NULL, sig->pinvoke);
+#ifndef HOST_WIN32
+       if (!sig->pinvoke && !disable_vtypes_in_regs && ((is_return && (size == 8)) || (!is_return && (size <= 16)))) {
+               /* We pass and return vtypes of size 8 in a register */
+       } else if (!sig->pinvoke || (size == 0) || (size > 16)) {
+               pass_on_stack = TRUE;
+       }
+#else
+       if (!sig->pinvoke) {
+               pass_on_stack = TRUE;
+       }
+#endif
 
-       if (!sig->pinvoke || (size == 0) || (size > 16)) {
+       if (pass_on_stack) {
                /* Always pass in memory */
                ainfo->offset = *stack_size;
                *stack_size += ALIGN_TO (size, 8);
@@ -410,48 +473,92 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn
        else
                nquads = 1;
 
-       /*
-        * Implement the algorithm from section 3.2.3 of the X86_64 ABI.
-        * The X87 and SSEUP stuff is left out since there are no such types in
-        * the CLR.
-        */
-       info = mono_marshal_load_type_info (klass);
-       g_assert (info);
-       if (info->native_size > 16) {
-               ainfo->offset = *stack_size;
-               *stack_size += ALIGN_TO (info->native_size, 8);
-               ainfo->storage = ArgOnStack;
+       if (!sig->pinvoke) {
+               /* Always pass in 1 or 2 integer registers */
+               args [0] = ARG_CLASS_INTEGER;
+               args [1] = ARG_CLASS_INTEGER;
+               /* Only the simplest cases are supported */
+               if (is_return && nquads != 1) {
+                       args [0] = ARG_CLASS_MEMORY;
+                       args [1] = ARG_CLASS_MEMORY;
+               }
+       } else {
+               /*
+                * Implement the algorithm from section 3.2.3 of the X86_64 ABI.
+                * The X87 and SSEUP stuff is left out since there are no such types in
+                * the CLR.
+                */
+               info = mono_marshal_load_type_info (klass);
+               g_assert (info);
 
-               return;
-       }
+#ifndef HOST_WIN32
+               if (info->native_size > 16) {
+                       ainfo->offset = *stack_size;
+                       *stack_size += ALIGN_TO (info->native_size, 8);
+                       ainfo->storage = ArgOnStack;
 
-       args [0] = ARG_CLASS_NO_CLASS;
-       args [1] = ARG_CLASS_NO_CLASS;
-       for (quad = 0; quad < nquads; ++quad) {
-               int size;
-               guint32 align;
-               ArgumentClass class1;
-               
-               class1 = ARG_CLASS_NO_CLASS;
-               for (i = 0; i < info->num_fields; ++i) {
-                       size = mono_marshal_type_size (info->fields [i].field->type, 
-                                                                                  info->fields [i].mspec, 
-                                                                                  &align, TRUE, klass->unicode);
-                       if ((info->fields [i].offset < 8) && (info->fields [i].offset + size) > 8) {
-                               /* Unaligned field */
-                               NOT_IMPLEMENTED;
+                       return;
+               }
+#else
+               switch (info->native_size) {
+               case 1: case 2: case 4: case 8:
+                       break;
+               default:
+                       if (is_return) {
+                               ainfo->storage = ArgOnStack;
+                               ainfo->offset = *stack_size;
+                               *stack_size += ALIGN_TO (info->native_size, 8);
                        }
+                       else {
+                               ainfo->storage = ArgValuetypeAddrInIReg;
 
-                       /* Skip fields in other quad */
-                       if ((quad == 0) && (info->fields [i].offset >= 8))
-                               continue;
-                       if ((quad == 1) && (info->fields [i].offset < 8))
-                               continue;
+                               if (*gr < PARAM_REGS) {
+                                       ainfo->pair_storage [0] = ArgInIReg;
+                                       ainfo->pair_regs [0] = param_regs [*gr];
+                                       (*gr) ++;
+                               }
+                               else {
+                                       ainfo->pair_storage [0] = ArgOnStack;
+                                       ainfo->offset = *stack_size;
+                                       *stack_size += 8;
+                               }
+                       }
+
+                       return;
+               }
+#endif
+
+               args [0] = ARG_CLASS_NO_CLASS;
+               args [1] = ARG_CLASS_NO_CLASS;
+               for (quad = 0; quad < nquads; ++quad) {
+                       int size;
+                       guint32 align;
+                       ArgumentClass class1;
+               
+                       if (info->num_fields == 0)
+                               class1 = ARG_CLASS_MEMORY;
+                       else
+                               class1 = ARG_CLASS_NO_CLASS;
+                       for (i = 0; i < info->num_fields; ++i) {
+                               size = mono_marshal_type_size (info->fields [i].field->type, 
+                                                                                          info->fields [i].mspec, 
+                                                                                          &align, TRUE, klass->unicode);
+                               if ((info->fields [i].offset < 8) && (info->fields [i].offset + size) > 8) {
+                                       /* Unaligned field */
+                                       NOT_IMPLEMENTED;
+                               }
 
-                       class1 = merge_argument_class_from_type (info->fields [i].field->type, class1);
+                               /* Skip fields in other quad */
+                               if ((quad == 0) && (info->fields [i].offset >= 8))
+                                       continue;
+                               if ((quad == 1) && (info->fields [i].offset < 8))
+                                       continue;
+
+                               class1 = merge_argument_class_from_type (info->fields [i].field->type, class1);
+                       }
+                       g_assert (class1 != ARG_CLASS_NO_CLASS);
+                       args [quad] = class1;
                }
-               g_assert (class1 != ARG_CLASS_NO_CLASS);
-               args [quad] = class1;
        }
 
        /* Post merger cleanup */
@@ -501,7 +608,10 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn
                        *fr = orig_fr;
 
                        ainfo->offset = *stack_size;
-                       *stack_size += ALIGN_TO (info->native_size, 8);
+                       if (sig->pinvoke)
+                               *stack_size += ALIGN_TO (info->native_size, 8);
+                       else
+                               *stack_size += nquads * sizeof (gpointer);
                        ainfo->storage = ArgOnStack;
                }
        }
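/*
 * A worked example of the classification above (SysV, pinvoke path):
 *
 *   struct Pair { double x; gint64 y; };   16 bytes -> nquads == 2
 *
 * Quad 0 contains only 'x' and classifies as ARG_CLASS_SSE; quad 1 contains
 * only 'y' and classifies as ARG_CLASS_INTEGER, so the struct travels in one
 * XMM register and one integer register. On the managed (non-pinvoke) path
 * added above, both quads are instead forced to ARG_CLASS_INTEGER.
 */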
@@ -515,27 +625,27 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn
  * Draft Version 0.23" document for more information.
  */
 static CallInfo*
-get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke)
+get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke)
 {
        guint32 i, gr, fr;
        MonoType *ret_type;
        int n = sig->hasthis + sig->param_count;
        guint32 stack_size = 0;
        CallInfo *cinfo;
-       MonoGenericSharingContext *gsctx = cfg ? cfg->generic_sharing_context : NULL;
 
        if (mp)
                cinfo = mono_mempool_alloc0 (mp, sizeof (CallInfo) + (sizeof (ArgInfo) * n));
        else
                cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));
 
+       cinfo->nargs = n;
+
        gr = 0;
        fr = 0;
 
        /* return value */
        {
-               ret_type = mono_type_get_underlying_type (sig->ret);
-               ret_type = mini_get_basic_type_from_generic (gsctx, ret_type);
+               ret_type = mini_type_get_underlying_type (gsctx, sig->ret);
                switch (ret_type->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -571,7 +681,7 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                        cinfo->ret.reg = AMD64_XMM0;
                        break;
                case MONO_TYPE_GENERICINST:
-                       if (!mono_type_generic_inst_is_valuetype (sig->ret)) {
+                       if (!mono_type_generic_inst_is_valuetype (ret_type)) {
                                cinfo->ret.storage = ArgInIReg;
                                cinfo->ret.reg = AMD64_RAX;
                                break;
@@ -581,9 +691,11 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                        guint32 tmp_gr = 0, tmp_fr = 0, tmp_stacksize = 0;
 
                        add_valuetype (gsctx, sig, &cinfo->ret, sig->ret, TRUE, &tmp_gr, &tmp_fr, &tmp_stacksize);
-                       if (cinfo->ret.storage == ArgOnStack)
+                       if (cinfo->ret.storage == ArgOnStack) {
+                               cinfo->vtype_retaddr = TRUE;
                                /* The caller passes the address where the value is stored */
                                add_general (&gr, &stack_size, &cinfo->ret);
+                       }
                        break;
                }
                case MONO_TYPE_TYPEDBYREF:
@@ -614,6 +726,14 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                ArgInfo *ainfo = &cinfo->args [sig->hasthis + i];
                MonoType *ptype;
 
+#ifdef HOST_WIN32
+               /* On Windows x64, the float param registers and the other param registers must use the same index. */
+               if (gr > fr)
+                       fr = gr;
+               else if (fr > gr)
+                       gr = fr;
+#endif
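/*
 * For illustration, the Win64 rule this implements: argument N always burns
 * register slot N regardless of its type, e.g. for
 *
 *   void f (int a, double b, int c, double d);
 *
 * the arguments land in RCX, XMM1, R8 and XMM3, so after a float argument
 * the integer counter must be advanced past the skipped slot (and vice
 * versa), which is exactly what this gr/fr synchronization does.
 */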
+
                if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos)) {
                        /* We always pass the sig cookie on the stack for simplicity */
                        /* 
@@ -627,12 +747,7 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                        add_general (&gr, &stack_size, &cinfo->sig_cookie);
                }
 
-               if (sig->params [i]->byref) {
-                       add_general (&gr, &stack_size, ainfo);
-                       continue;
-               }
-               ptype = mono_type_get_underlying_type (sig->params [i]);
-               ptype = mini_get_basic_type_from_generic (gsctx, ptype);
+               ptype = mini_type_get_underlying_type (gsctx, sig->params [i]);
                switch (ptype->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -669,8 +784,12 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                        add_valuetype (gsctx, sig, ainfo, sig->params [i], FALSE, &gr, &fr, &stack_size);
                        break;
                case MONO_TYPE_TYPEDBYREF:
+#ifdef HOST_WIN32
+                       add_valuetype (gsctx, sig, ainfo, sig->params [i], FALSE, &gr, &fr, &stack_size);
+#else
                        stack_size += sizeof (MonoTypedRef);
                        ainfo->storage = ArgOnStack;
+#endif
                        break;
                case MONO_TYPE_U8:
                case MONO_TYPE_I8:
@@ -695,11 +814,9 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                add_general (&gr, &stack_size, &cinfo->sig_cookie);
        }
 
-#ifdef PLATFORM_WIN32
-       if (stack_size < 32) {
-               /* The Win64 ABI requires 32 bits  */
-               stack_size = 32;
-       }
+#ifdef HOST_WIN32
+       // 32 bytes are always reserved on the stack when making a call on Win64
+       stack_size += 0x20;
 #endif
 
        if (stack_size & 0x8) {
@@ -751,9 +868,18 @@ mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJit
 static int 
 cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx)
 {
+#ifndef _MSC_VER
        __asm__ __volatile__ ("cpuid"
                : "=a" (*p_eax), "=b" (*p_ebx), "=c" (*p_ecx), "=d" (*p_edx)
                : "a" (id));
+#else
+       int info[4];
+       __cpuid(info, id);
+       *p_eax = info[0];
+       *p_ebx = info[1];
+       *p_ecx = info[2];
+       *p_edx = info[3];
+#endif
        return 1;
 }
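/*
 * Example use (illustrative): leaf 0 returns the CPU vendor string in
 * EBX, EDX, ECX, in that order:
 *
 *   int eax, ebx, ecx, edx;
 *   char vendor [13];
 *
 *   cpuid (0, &eax, &ebx, &ecx, &edx);
 *   memcpy (vendor + 0, &ebx, 4);
 *   memcpy (vendor + 4, &edx, 4);
 *   memcpy (vendor + 8, &ecx, 4);
 *   vendor [12] = 0;   // "GenuineIntel", "AuthenticAMD", ...
 */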
 
@@ -773,7 +899,9 @@ mono_arch_cpu_init (void)
        __asm__  __volatile__ ("fldcw %0\n": : "m" (fpcw));
        __asm__  __volatile__ ("fnstcw %0\n": "=m" (fpcw));
 #else
-       _control87 (_PC_53, MCW_PC);
+       /* TODO: This crashes on Win64 right now:
+        * _control87 (_PC_53, MCW_PC);
+        */
 #endif
 }
 
@@ -783,7 +911,28 @@ mono_arch_cpu_init (void)
 void
 mono_arch_init (void)
 {
+       int flags;
+
        InitializeCriticalSection (&mini_arch_mutex);
+
+#ifdef MONO_ARCH_NOMAP32BIT
+       flags = MONO_MMAP_READ;
+       /* amd64_mov_reg_imm () + amd64_mov_reg_membase () */
+       breakpoint_size = 13;
+       breakpoint_fault_size = 3;
+       /* amd64_alu_membase_imm_size (code, X86_CMP, AMD64_R11, 0, 0, 4); */
+       single_step_fault_size = 5;
+#else
+       flags = MONO_MMAP_READ|MONO_MMAP_32BIT;
+       /* amd64_mov_reg_mem () */
+       breakpoint_size = 8;
+       breakpoint_fault_size = 8;
+       single_step_fault_size = 8;
+#endif
+
+       ss_trigger_page = mono_valloc (NULL, mono_pagesize (), flags);
+       bp_trigger_page = mono_valloc (NULL, mono_pagesize (), flags);
+       mono_mprotect (bp_trigger_page, mono_pagesize (), 0);
 }
 
 /*
@@ -818,13 +967,50 @@ mono_arch_cpu_optimizazions (guint32 *exclude_mask)
                } else
                        *exclude_mask |= MONO_OPT_CMOV;
        }
+
        return opts;
 }
 
-gboolean
-mono_amd64_is_sse2 (void)
+/*
+ * This function tests which SSE versions are supported.
+ *
+ * Returns a bitmask corresponding to all supported versions.
+ */
+guint32
+mono_arch_cpu_enumerate_simd_versions (void)
 {
-       return use_sse2;
+       int eax, ebx, ecx, edx;
+       guint32 sse_opts = 0;
+
+       if (cpuid (1, &eax, &ebx, &ecx, &edx)) {
+               if (edx & (1 << 25))
+                       sse_opts |= 1 << SIMD_VERSION_SSE1;
+               if (edx & (1 << 26))
+                       sse_opts |= 1 << SIMD_VERSION_SSE2;
+               if (ecx & (1 << 0))
+                       sse_opts |= 1 << SIMD_VERSION_SSE3;
+               if (ecx & (1 << 9))
+                       sse_opts |= 1 << SIMD_VERSION_SSSE3;
+               if (ecx & (1 << 19))
+                       sse_opts |= 1 << SIMD_VERSION_SSE41;
+               if (ecx & (1 << 20))
+                       sse_opts |= 1 << SIMD_VERSION_SSE42;
+       }
+
+       /* All of this is needed to check for SSE4a.
+          See: "AMD CPUID Specification"
+        */
+       if (cpuid (0x80000000, &eax, &ebx, &ecx, &edx)) {
+               /* eax greater than or equal to 0x80000001, ebx = 'htuA', ecx = 'DMAc', edx = 'itne' */
+               if ((((unsigned int) eax) >= 0x80000001) && (ebx == 0x68747541) && (ecx == 0x444D4163) && (edx == 0x69746E65)) {
+                       cpuid (0x80000001, &eax, &ebx, &ecx, &edx);
+                       if (ecx & (1 << 6))
+                               sse_opts |= 1 << SIMD_VERSION_SSE4a;
+               }
+       }
+
+       return sse_opts;        
 }
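/*
 * Example use (illustrative): callers test individual versions against the
 * returned bitmask, e.g.
 *
 *   guint32 simd = mono_arch_cpu_enumerate_simd_versions ();
 *   if (simd & (1 << SIMD_VERSION_SSE2))
 *           ; // SSE2 instructions may be emitted
 */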
 
 GList *
@@ -878,7 +1064,7 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
        sig = mono_method_signature (cfg->method);
 
        if (!cfg->arch.cinfo)
-               cfg->arch.cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+               cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
        cinfo = cfg->arch.cinfo;
 
        /*
@@ -887,9 +1073,7 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
        cfg->arch.omit_fp = TRUE;
        cfg->arch.omit_fp_computed = TRUE;
 
-       /* Temporarily disable this when running in the debugger until we have support
-        * for this in the debugger. */
-       if (mono_debug_using_mono_debugger ())
+       if (cfg->disable_omit_fp)
                cfg->arch.omit_fp = FALSE;
 
        if (!debug_omit_fp ())
@@ -928,11 +1112,6 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
 
                locals_size += mono_type_size (ins->inst_vtype, &ialign);
        }
-
-       if ((cfg->num_varinfo > 10000) || (locals_size >= (1 << 15))) {
-               /* Avoid hitting the stack_alloc_size < (1 << 16) assertion in emit_epilog () */
-               cfg->arch.omit_fp = FALSE;
-       }
 }
 
 GList *
@@ -942,19 +1121,104 @@ mono_arch_get_global_int_regs (MonoCompile *cfg)
 
        mono_arch_compute_omit_fp (cfg);
 
-       if (cfg->arch.omit_fp)
-               regs = g_list_prepend (regs, (gpointer)AMD64_RBP);
+       if (cfg->globalra) {
+               if (cfg->arch.omit_fp)
+                       regs = g_list_prepend (regs, (gpointer)AMD64_RBP);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RBX);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R12);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R13);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R14);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R15);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R10);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R9);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R8);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RDI);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RSI);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RDX);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RCX);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RAX);
+       } else {
+               if (cfg->arch.omit_fp)
+                       regs = g_list_prepend (regs, (gpointer)AMD64_RBP);
+
+               /* We use the callee saved registers for global allocation */
+               regs = g_list_prepend (regs, (gpointer)AMD64_RBX);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R12);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R13);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R14);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R15);
+#ifdef HOST_WIN32
+               regs = g_list_prepend (regs, (gpointer)AMD64_RDI);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RSI);
+#endif
+       }
+
+       return regs;
+}
+
+GList*
+mono_arch_get_global_fp_regs (MonoCompile *cfg)
+{
+       GList *regs = NULL;
+       int i;
 
-       /* We use the callee saved registers for global allocation */
-       regs = g_list_prepend (regs, (gpointer)AMD64_RBX);
-       regs = g_list_prepend (regs, (gpointer)AMD64_R12);
-       regs = g_list_prepend (regs, (gpointer)AMD64_R13);
-       regs = g_list_prepend (regs, (gpointer)AMD64_R14);
-       regs = g_list_prepend (regs, (gpointer)AMD64_R15);
+       /* All XMM registers */
+       for (i = 0; i < 16; ++i)
+               regs = g_list_prepend (regs, GINT_TO_POINTER (i));
 
        return regs;
 }
 
+GList*
+mono_arch_get_iregs_clobbered_by_call (MonoCallInst *call)
+{
+       static GList *r = NULL;
+
+       if (r == NULL) {
+               GList *regs = NULL;
+
+               regs = g_list_prepend (regs, (gpointer)AMD64_RBP);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RBX);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R12);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R13);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R14);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R15);
+
+               regs = g_list_prepend (regs, (gpointer)AMD64_R10);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R9);
+               regs = g_list_prepend (regs, (gpointer)AMD64_R8);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RDI);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RSI);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RDX);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RCX);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RAX);
+
+               InterlockedCompareExchangePointer ((gpointer*)&r, regs, NULL);
+       }
+
+       return r;
+}
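/*
 * The initialization above is lock-free: racing threads each build a
 * candidate list, but only the thread whose compare-exchange finds 'r'
 * still NULL publishes its list, e.g.
 *
 *   if (InterlockedCompareExchangePointer ((gpointer*)&r, regs, NULL) != NULL)
 *           ; // lost the race: 'r' already holds another thread's list
 *
 * A loser's list is simply leaked, which is acceptable for a one-time,
 * per-process allocation.
 */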
+
+GList*
+mono_arch_get_fregs_clobbered_by_call (MonoCallInst *call)
+{
+       int i;
+       static GList *r = NULL;
+
+       if (r == NULL) {
+               GList *regs = NULL;
+
+               for (i = 0; i < AMD64_XMM_NREG; ++i)
+                       regs = g_list_prepend (regs, GINT_TO_POINTER (MONO_MAX_IREGS + i));
+
+               InterlockedCompareExchangePointer ((gpointer*)&r, regs, NULL);
+       }
+
+       return r;
+}
+
 /*
  * mono_arch_regalloc_cost:
  *
@@ -975,13 +1239,97 @@ mono_arch_regalloc_cost (MonoCompile *cfg, MonoMethodVar *vmv)
                /* push+pop */
                return (ins->opcode == OP_ARG) ? 1 : 2;
 }
+
+/*
+ * mono_arch_fill_argument_info:
+ *
+ *   Populate cfg->args, cfg->ret and cfg->vret_addr with information about the arguments
+ * of the method.
+ */
+void
+mono_arch_fill_argument_info (MonoCompile *cfg)
+{
+       MonoMethodSignature *sig;
+       MonoMethodHeader *header;
+       MonoInst *ins;
+       int i;
+       CallInfo *cinfo;
+
+       header = mono_method_get_header (cfg->method);
+
+       sig = mono_method_signature (cfg->method);
+
+       cinfo = cfg->arch.cinfo;
+
+       /*
+        * Contrary to mono_arch_allocate_vars (), the information should describe
+        * where the arguments are at the beginning of the method, not where they can be 
+        * accessed during the execution of the method. The later makes no sense for the 
+        * global register allocator, since a variable can be in more than one location.
+        */
+       if (sig->ret->type != MONO_TYPE_VOID) {
+               switch (cinfo->ret.storage) {
+               case ArgInIReg:
+               case ArgInFloatSSEReg:
+               case ArgInDoubleSSEReg:
+                       if ((MONO_TYPE_ISSTRUCT (sig->ret) && !mono_class_from_mono_type (sig->ret)->enumtype) || (sig->ret->type == MONO_TYPE_TYPEDBYREF)) {
+                               cfg->vret_addr->opcode = OP_REGVAR;
+                               cfg->vret_addr->inst_c0 = cinfo->ret.reg;
+                       }
+                       else {
+                               cfg->ret->opcode = OP_REGVAR;
+                               cfg->ret->inst_c0 = cinfo->ret.reg;
+                       }
+                       break;
+               case ArgValuetypeInReg:
+                       cfg->ret->opcode = OP_REGOFFSET;
+                       cfg->ret->inst_basereg = -1;
+                       cfg->ret->inst_offset = -1;
+                       break;
+               default:
+                       g_assert_not_reached ();
+               }
+       }
+
+       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
+               ArgInfo *ainfo = &cinfo->args [i];
+               MonoType *arg_type;
+
+               ins = cfg->args [i];
+
+               if (sig->hasthis && (i == 0))
+                       arg_type = &mono_defaults.object_class->byval_arg;
+               else
+                       arg_type = sig->params [i - sig->hasthis];
+
+               switch (ainfo->storage) {
+               case ArgInIReg:
+               case ArgInFloatSSEReg:
+               case ArgInDoubleSSEReg:
+                       ins->opcode = OP_REGVAR;
+                       ins->inst_c0 = ainfo->reg;
+                       break;
+               case ArgOnStack:
+                       ins->opcode = OP_REGOFFSET;
+                       ins->inst_basereg = -1;
+                       ins->inst_offset = -1;
+                       break;
+               case ArgValuetypeInReg:
+                       /* Dummy */
+                       ins->opcode = OP_NOP;
+                       break;
+               default:
+                       g_assert_not_reached ();
+               }
+       }
+}
  
 void
 mono_arch_allocate_vars (MonoCompile *cfg)
 {
        MonoMethodSignature *sig;
        MonoMethodHeader *header;
-       MonoInst *inst;
+       MonoInst *ins;
        int i, offset;
        guint32 locals_stack_size, locals_stack_align;
        gint32 *offsets;
@@ -997,7 +1345,18 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 
        /*
         * We use the ABI calling conventions for managed code as well.
-        * Exception: valuetypes are never passed or returned in registers.
+        * Exception: valuetypes are only sometimes passed or returned in registers.
+        */
+
+       /*
+        * The stack looks like this:
+        * <incoming arguments passed on the stack>
+        * <return value>
+        * <lmf/caller saved registers>
+        * <locals>
+        * <spill area>
+        * <localloc area>  -> grows dynamically
+        * <params area>
         */
 
        if (cfg->arch.omit_fp) {
@@ -1012,8 +1371,6 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 
        if (cfg->method->save_lmf) {
                /* Reserve stack space for saving LMF */
-               /* mono_arch_find_jit_info () expects to find the LMF at a fixed offset */
-               g_assert (offset == 0);
                if (cfg->arch.omit_fp) {
                        cfg->arch.lmf_offset = offset;
                        offset += sizeof (MonoLMF);
@@ -1023,6 +1380,8 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                        cfg->arch.lmf_offset = -offset;
                }
        } else {
+               if (cfg->arch.omit_fp)
+                       cfg->arch.reg_save_area_offset = offset;
                /* Reserve space for caller saved registers */
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
@@ -1036,19 +1395,24 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                case ArgInFloatSSEReg:
                case ArgInDoubleSSEReg:
                        if ((MONO_TYPE_ISSTRUCT (sig->ret) && !mono_class_from_mono_type (sig->ret)->enumtype) || (sig->ret->type == MONO_TYPE_TYPEDBYREF)) {
-                               /* The register is volatile */
-                               cfg->vret_addr->opcode = OP_REGOFFSET;
-                               cfg->vret_addr->inst_basereg = cfg->frame_reg;
-                               if (cfg->arch.omit_fp) {
-                                       cfg->vret_addr->inst_offset = offset;
-                                       offset += 8;
+                               if (cfg->globalra) {
+                                       cfg->vret_addr->opcode = OP_REGVAR;
+                                       cfg->vret_addr->inst_c0 = cinfo->ret.reg;
                                } else {
-                                       offset += 8;
-                                       cfg->vret_addr->inst_offset = -offset;
-                               }
-                               if (G_UNLIKELY (cfg->verbose_level > 1)) {
-                                       printf ("vret_addr =");
-                                       mono_print_ins (cfg->vret_addr);
+                                       /* The register is volatile */
+                                       cfg->vret_addr->opcode = OP_REGOFFSET;
+                                       cfg->vret_addr->inst_basereg = cfg->frame_reg;
+                                       if (cfg->arch.omit_fp) {
+                                               cfg->vret_addr->inst_offset = offset;
+                                               offset += 8;
+                                       } else {
+                                               offset += 8;
+                                               cfg->vret_addr->inst_offset = -offset;
+                                       }
+                                       if (G_UNLIKELY (cfg->verbose_level > 1)) {
+                                               printf ("vret_addr =");
+                                               mono_print_ins (cfg->vret_addr);
+                                       }
                                }
                        }
                        else {
@@ -1071,28 +1435,47 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                default:
                        g_assert_not_reached ();
                }
-               cfg->ret->dreg = cfg->ret->inst_c0;
+               if (!cfg->globalra)
+                       cfg->ret->dreg = cfg->ret->inst_c0;
        }
 
        /* Allocate locals */
-       offsets = mono_allocate_stack_slots_full (cfg, cfg->arch.omit_fp ? FALSE: TRUE, &locals_stack_size, &locals_stack_align);
-       if (locals_stack_align) {
-               offset += (locals_stack_align - 1);
-               offset &= ~(locals_stack_align - 1);
-       }
-       for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
-               if (offsets [i] != -1) {
-                       MonoInst *inst = cfg->varinfo [i];
-                       inst->opcode = OP_REGOFFSET;
-                       inst->inst_basereg = cfg->frame_reg;
-                       if (cfg->arch.omit_fp)
-                               inst->inst_offset = (offset + offsets [i]);
-                       else
-                               inst->inst_offset = - (offset + offsets [i]);
-                       //printf ("allocated local %d to ", i); mono_print_tree_nl (inst);
+       if (!cfg->globalra) {
+               offsets = mono_allocate_stack_slots_full (cfg, cfg->arch.omit_fp ? FALSE: TRUE, &locals_stack_size, &locals_stack_align);
+               if (locals_stack_size > MONO_ARCH_MAX_FRAME_SIZE) {
+                       char *mname = mono_method_full_name (cfg->method, TRUE);
+                       cfg->exception_type = MONO_EXCEPTION_INVALID_PROGRAM;
+                       cfg->exception_message = g_strdup_printf ("Method %s stack is too big.", mname);
+                       g_free (mname);
+                       return;
+               }
+               
+               if (locals_stack_align) {
+                       offset += (locals_stack_align - 1);
+                       offset &= ~(locals_stack_align - 1);
+               }
+               if (cfg->arch.omit_fp) {
+                       cfg->locals_min_stack_offset = offset;
+                       cfg->locals_max_stack_offset = offset + locals_stack_size;
+               } else {
+                       cfg->locals_min_stack_offset = - (offset + locals_stack_size);
+                       cfg->locals_max_stack_offset = - offset;
+               }
+               
+               for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
+                       if (offsets [i] != -1) {
+                               MonoInst *ins = cfg->varinfo [i];
+                               ins->opcode = OP_REGOFFSET;
+                               ins->inst_basereg = cfg->frame_reg;
+                               if (cfg->arch.omit_fp)
+                                       ins->inst_offset = (offset + offsets [i]);
+                               else
+                                       ins->inst_offset = - (offset + offsets [i]);
+                               //printf ("allocated local %d to ", i); mono_print_tree_nl (ins);
+                       }
                }
+               offset += locals_stack_size;
        }
-       offset += locals_stack_size;
 
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG)) {
                g_assert (!cfg->arch.omit_fp);
@@ -1101,8 +1484,8 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        }
 
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               inst = cfg->args [i];
-               if (inst->opcode != OP_REGVAR) {
+               ins = cfg->args [i];
+               if (ins->opcode != OP_REGVAR) {
                        ArgInfo *ainfo = &cinfo->args [i];
                        gboolean inreg = TRUE;
                        MonoType *arg_type;
@@ -1112,8 +1495,43 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                        else
                                arg_type = sig->params [i - sig->hasthis];
 
+                       if (cfg->globalra) {
+                               /* The new allocator needs info about the original locations of the arguments */
+                               switch (ainfo->storage) {
+                               case ArgInIReg:
+                               case ArgInFloatSSEReg:
+                               case ArgInDoubleSSEReg:
+                                       ins->opcode = OP_REGVAR;
+                                       ins->inst_c0 = ainfo->reg;
+                                       break;
+                               case ArgOnStack:
+                                       g_assert (!cfg->arch.omit_fp);
+                                       ins->opcode = OP_REGOFFSET;
+                                       ins->inst_basereg = cfg->frame_reg;
+                                       ins->inst_offset = ainfo->offset + ARGS_OFFSET;
+                                       break;
+                               case ArgValuetypeInReg:
+                                       ins->opcode = OP_REGOFFSET;
+                                       ins->inst_basereg = cfg->frame_reg;
+                                       /* These arguments are saved to the stack in the prolog */
+                                       offset = ALIGN_TO (offset, sizeof (gpointer));
+                                       if (cfg->arch.omit_fp) {
+                                               ins->inst_offset = offset;
+                                               offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                                       } else {
+                                               offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                                               ins->inst_offset = - offset;
+                                       }
+                                       break;
+                               default:
+                                       g_assert_not_reached ();
+                               }
+
+                               continue;
+                       }
+
                        /* FIXME: Allocate volatile arguments to registers */
-                       if (inst->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT))
+                       if (ins->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT))
                                inreg = FALSE;
 
                        /* 
@@ -1124,38 +1542,65 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                        if ((ainfo->storage == ArgInIReg) || (ainfo->storage == ArgInFloatSSEReg) || (ainfo->storage == ArgInDoubleSSEReg) || (ainfo->storage == ArgValuetypeInReg))
                                inreg = FALSE;
 
-                       inst->opcode = OP_REGOFFSET;
+                       ins->opcode = OP_REGOFFSET;
 
                        switch (ainfo->storage) {
                        case ArgInIReg:
                        case ArgInFloatSSEReg:
                        case ArgInDoubleSSEReg:
-                               inst->opcode = OP_REGVAR;
-                               inst->dreg = ainfo->reg;
+                               if (inreg) {
+                                       ins->opcode = OP_REGVAR;
+                                       ins->dreg = ainfo->reg;
+                               }
                                break;
                        case ArgOnStack:
                                g_assert (!cfg->arch.omit_fp);
-                               inst->opcode = OP_REGOFFSET;
-                               inst->inst_basereg = cfg->frame_reg;
-                               inst->inst_offset = ainfo->offset + ARGS_OFFSET;
+                               ins->opcode = OP_REGOFFSET;
+                               ins->inst_basereg = cfg->frame_reg;
+                               ins->inst_offset = ainfo->offset + ARGS_OFFSET;
                                break;
                        case ArgValuetypeInReg:
                                break;
-                       default:
-                               NOT_IMPLEMENTED;
-                       }
-
-                       if (!inreg && (ainfo->storage != ArgOnStack)) {
-                               inst->opcode = OP_REGOFFSET;
-                               inst->inst_basereg = cfg->frame_reg;
+                       case ArgValuetypeAddrInIReg: {
+                               MonoInst *indir;
+                               g_assert (!cfg->arch.omit_fp);
+                               
+                               MONO_INST_NEW (cfg, indir, 0);
+                               indir->opcode = OP_REGOFFSET;
+                               if (ainfo->pair_storage [0] == ArgInIReg) {
+                                       indir->inst_basereg = cfg->frame_reg;
+                                       offset = ALIGN_TO (offset, sizeof (gpointer));
+                                       offset += (sizeof (gpointer));
+                                       indir->inst_offset = - offset;
+                               }
+                               else {
+                                       indir->inst_basereg = cfg->frame_reg;
+                                       indir->inst_offset = ainfo->offset + ARGS_OFFSET;
+                               }
+                               
+                               ins->opcode = OP_VTARG_ADDR;
+                               ins->inst_left = indir;
+                               
+                               break;
+                       }
+                       default:
+                               NOT_IMPLEMENTED;
+                       }
+
+                       if (!inreg && (ainfo->storage != ArgOnStack) && (ainfo->storage != ArgValuetypeAddrInIReg)) {
+                               ins->opcode = OP_REGOFFSET;
+                               ins->inst_basereg = cfg->frame_reg;
                                /* These arguments are saved to the stack in the prolog */
                                offset = ALIGN_TO (offset, sizeof (gpointer));
                                if (cfg->arch.omit_fp) {
-                                       inst->inst_offset = offset;
+                                       ins->inst_offset = offset;
                                        offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+                                       // Arguments are not yet supported by the stack map creation code
+                                       //cfg->locals_max_stack_offset = MAX (cfg->locals_max_stack_offset, offset);
                                } else {
                                        offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
-                                       inst->inst_offset = - offset;
+                                       ins->inst_offset = - offset;
+                                       //cfg->locals_min_stack_offset = MIN (cfg->locals_min_stack_offset, offset);
                                }
                        }
                }
@@ -1173,7 +1618,7 @@ mono_arch_create_vars (MonoCompile *cfg)
        sig = mono_method_signature (cfg->method);
 
        if (!cfg->arch.cinfo)
-               cfg->arch.cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+               cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
        cinfo = cfg->arch.cinfo;
 
        if (cinfo->ret.storage == ArgValuetypeInReg)
@@ -1186,49 +1631,74 @@ mono_arch_create_vars (MonoCompile *cfg)
                        mono_print_ins (cfg->vret_addr);
                }
        }
+
+       if (cfg->gen_seq_points) {
+               MonoInst *ins;
+
+               ins = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
+               ins->flags |= MONO_INST_VOLATILE;
+               cfg->arch.ss_trigger_page_var = ins;
+       }
+
+#ifdef MONO_AMD64_NO_PUSHES
+       /*
+        * When this is set, we pass arguments on the stack with moves into a
+        * larger, preallocated stack frame, instead of with pushes.
+        * Pushes complicate exception handling because the arguments on the stack have
+        * to be popped each time a frame is unwound. They also make fp elimination
+        * impossible.
+        * FIXME: This doesn't work inside filter/finally clauses, since those execute
+        * on a new frame which doesn't include a param area.
+        */
+       cfg->arch.no_pushes = TRUE;
+#endif
 }
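/*
 * A sketch (assumed pseudo-assembly) of the two outgoing-argument strategies:
 *
 *   pushes:                       no_pushes:
 *     push %rbx                     mov %rbx, 0x8(%rsp)
 *     push %rax                     mov %rax, 0x0(%rsp)
 *     call foo                      call foo
 *     add $0x10, %rsp
 *
 * With no_pushes the param area is part of the preallocated frame, %rsp
 * stays fixed inside the method body, and the unwinder never has to account
 * for pushed-but-not-yet-popped arguments.
 */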
 
 static void
-add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, MonoInst *arg, ArgStorage storage, int reg, MonoInst *tree)
+add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int reg, MonoInst *tree)
 {
+       MonoInst *ins;
+
        switch (storage) {
        case ArgInIReg:
-               arg->opcode = OP_OUTARG_REG;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
+               MONO_INST_NEW (cfg, ins, OP_MOVE);
+               ins->dreg = mono_alloc_ireg (cfg);
+               ins->sreg1 = tree->dreg;
+               MONO_ADD_INS (cfg->cbb, ins);
+               mono_call_inst_add_outarg_reg (cfg, call, ins->dreg, reg, FALSE);
                break;
        case ArgInFloatSSEReg:
-               arg->opcode = OP_AMD64_OUTARG_XMMREG_R4;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
+               MONO_INST_NEW (cfg, ins, OP_AMD64_SET_XMMREG_R4);
+               ins->dreg = mono_alloc_freg (cfg);
+               ins->sreg1 = tree->dreg;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               mono_call_inst_add_outarg_reg (cfg, call, ins->dreg, reg, TRUE);
                break;
        case ArgInDoubleSSEReg:
-               arg->opcode = OP_AMD64_OUTARG_XMMREG_R8;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
+               MONO_INST_NEW (cfg, ins, OP_FMOVE);
+               ins->dreg = mono_alloc_freg (cfg);
+               ins->sreg1 = tree->dreg;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               mono_call_inst_add_outarg_reg (cfg, call, ins->dreg, reg, TRUE);
+
                break;
        default:
                g_assert_not_reached ();
        }
 }
 
-/* Fixme: we need an alignment solution for enter_method and mono_arch_call_opcode,
- * currently alignment in mono_arch_call_opcode is computed without arch_get_argument_info 
- */
-
 static int
-arg_storage_to_ldind (ArgStorage storage)
+arg_storage_to_load_membase (ArgStorage storage)
 {
        switch (storage) {
        case ArgInIReg:
-               return CEE_LDIND_I;
+               return OP_LOAD_MEMBASE;
        case ArgInDoubleSSEReg:
-               return CEE_LDIND_R8;
+               return OP_LOADR8_MEMBASE;
        case ArgInFloatSSEReg:
-               return CEE_LDIND_R4;
+               return OP_LOADR4_MEMBASE;
        default:
                g_assert_not_reached ();
        }
@@ -1242,12 +1712,15 @@ emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
        MonoInst *arg;
        MonoMethodSignature *tmp_sig;
        MonoInst *sig_arg;
-                       
+
+       if (call->tail_call)
+               NOT_IMPLEMENTED;
+
        /* FIXME: Add support for signature tokens to AOT */
        cfg->disable_aot = TRUE;
 
        g_assert (cinfo->sig_cookie.storage == ArgOnStack);
-
+                       
        /*
         * mono_ArgIterator_Setup assumes the signature cookie is 
         * passed first and all the arguments which were before it are
@@ -1260,23 +1733,124 @@ emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
        memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));
 
        MONO_INST_NEW (cfg, sig_arg, OP_ICONST);
+       sig_arg->dreg = mono_alloc_ireg (cfg);
        sig_arg->inst_p0 = tmp_sig;
+       MONO_ADD_INS (cfg->cbb, sig_arg);
 
-       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-       arg->inst_left = sig_arg;
-       arg->type = STACK_PTR;
-       MONO_INST_LIST_ADD (&arg->node, &call->out_args);
+       if (cfg->arch.no_pushes) {
+               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, cinfo->sig_cookie.offset, sig_arg->dreg);
+       } else {
+               MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
+               arg->sreg1 = sig_arg->dreg;
+               MONO_ADD_INS (cfg->cbb, arg);
+       }
 }
 
-/* 
- * take the arguments and generate the arch-specific
- * instructions to properly call the function in call.
- * This includes pushing, moving arguments to the right register
- * etc.
- * Issue: who does the spilling if needed, and when?
- */
-MonoCallInst*
-mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call, int is_virtual) {
+static inline LLVMArgStorage
+arg_storage_to_llvm_arg_storage (MonoCompile *cfg, ArgStorage storage)
+{
+       switch (storage) {
+       case ArgInIReg:
+               return LLVMArgInIReg;
+       case ArgNone:
+               return LLVMArgNone;
+       default:
+               g_assert_not_reached ();
+               return LLVMArgNone;
+       }
+}
+
+#ifdef ENABLE_LLVM
+LLVMCallInfo*
+mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig)
+{
+       int i, n;
+       CallInfo *cinfo;
+       ArgInfo *ainfo;
+       int j;
+       LLVMCallInfo *linfo;
+
+       n = sig->param_count + sig->hasthis;
+
+       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
+
+       linfo = mono_mempool_alloc0 (cfg->mempool, sizeof (LLVMCallInfo) + (sizeof (LLVMArgInfo) * n));
+
+       /*
+        * LLVM always uses the native ABI while we use our own ABI; the
+        * only difference is the handling of vtypes:
+        * - we only pass/receive them in registers in some cases, and only 
+        *   in 1 or 2 integer registers.
+        */
+       if (cinfo->ret.storage == ArgValuetypeInReg) {
+               if (sig->pinvoke) {
+                       cfg->exception_message = g_strdup ("pinvoke + vtypes");
+                       cfg->disable_llvm = TRUE;
+                       return linfo;
+               }
+
+               linfo->ret.storage = LLVMArgVtypeInReg;
+               for (j = 0; j < 2; ++j)
+                       linfo->ret.pair_storage [j] = arg_storage_to_llvm_arg_storage (cfg, cinfo->ret.pair_storage [j]);
+       }
+
+       if (MONO_TYPE_ISSTRUCT (sig->ret) && cinfo->ret.storage == ArgInIReg) {
+               /* Vtype returned using a hidden argument */
+               linfo->ret.storage = LLVMArgVtypeRetAddr;
+       }
+
+       for (i = 0; i < n; ++i) {
+               ainfo = cinfo->args + i;
+
+               linfo->args [i].storage = LLVMArgNone;
+
+               switch (ainfo->storage) {
+               case ArgInIReg:
+                       linfo->args [i].storage = LLVMArgInIReg;
+                       break;
+               case ArgInDoubleSSEReg:
+               case ArgInFloatSSEReg:
+                       linfo->args [i].storage = LLVMArgInFPReg;
+                       break;
+               case ArgOnStack:
+                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT (sig->params [i - sig->hasthis]))) {
+                               linfo->args [i].storage = LLVMArgVtypeByVal;
+                       } else {
+                               linfo->args [i].storage = LLVMArgInIReg;
+                               if (!sig->params [i - sig->hasthis]->byref) {
+                                       if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4) {
+                                               linfo->args [i].storage = LLVMArgInFPReg;
+                                       } else if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8) {
+                                               linfo->args [i].storage = LLVMArgInFPReg;
+                                       }
+                               }
+                       }
+                       break;
+               case ArgValuetypeInReg:
+                       if (sig->pinvoke) {
+                               cfg->exception_message = g_strdup ("pinvoke + vtypes");
+                               cfg->disable_llvm = TRUE;
+                               return linfo;
+                       }
+
+                       linfo->args [i].storage = LLVMArgVtypeInReg;
+                       for (j = 0; j < 2; ++j)
+                               linfo->args [i].pair_storage [j] = arg_storage_to_llvm_arg_storage (cfg, ainfo->pair_storage [j]);
+                       break;
+               default:
+                       cfg->exception_message = g_strdup ("ainfo->storage");
+                       cfg->disable_llvm = TRUE;
+                       break;
+               }
+       }
+
+       return linfo;
+}
+#endif
+
+void
+mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
+{
        MonoInst *arg, *in;
        MonoMethodSignature *sig;
        int i, n, stack_size;
@@ -1288,32 +1862,84 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
        sig = call->signature;
        n = sig->param_count + sig->hasthis;
 
-       cinfo = get_call_info (cfg, cfg->mempool, sig, sig->pinvoke);
+       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
 
-       for (i = 0; i < n; ++i) {
-               ainfo = cinfo->args + i;
+       if (COMPILE_LLVM (cfg)) {
+               /* We shouldn't be called in the llvm case */
+               cfg->disable_llvm = TRUE;
+               return;
+       }
 
-               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos)) {
-                       /* Emit the signature cookie just before the implicit arguments */
-                       emit_sig_cookie (cfg, call, cinfo);
-               }
+       if (cinfo->need_stack_align) {
+               if (!cfg->arch.no_pushes)
+                       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
+       }
+
+       /* 
+        * Emit all arguments which are passed on the stack to prevent register
+        * allocation problems.
+        */
+       if (cfg->arch.no_pushes) {
+               for (i = 0; i < n; ++i) {
+                       MonoType *t;
+                       ainfo = cinfo->args + i;
 
-               if (is_virtual && i == 0) {
-                       /* the argument will be attached to the call instruction */
-                       in = call->args [i];
-               } else {
-                       MONO_INST_NEW (cfg, arg, OP_OUTARG);
                        in = call->args [i];
-                       arg->cil_code = in->cil_code;
-                       arg->inst_left = in;
-                       arg->type = in->type;
-                       if (!cinfo->stack_usage)
-                               /* Keep the assignments to the arg registers in order if possible */
-                               MONO_INST_LIST_ADD_TAIL (&arg->node, &call->out_args);
+
+                       if (sig->hasthis && i == 0)
+                               t = &mono_defaults.object_class->byval_arg;
                        else
-                               MONO_INST_LIST_ADD (&arg->node, &call->out_args);
+                               t = sig->params [i - sig->hasthis];
+
+                       if (ainfo->storage == ArgOnStack && !MONO_TYPE_ISSTRUCT (t) && !call->tail_call) {
+                               if (!t->byref) {
+                                       if (t->type == MONO_TYPE_R4)
+                                               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORER4_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg);
+                                       else if (t->type == MONO_TYPE_R8)
+                                               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORER8_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg);
+                                       else
+                                               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg);
+                               } else {
+                                       MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg);
+                               }
+                       }
+               }
+       }
 
-                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
+       /*
+        * Emit the parameters passed in integer registers in forward (non-reverse)
+        * order, for better readability and to help the optimization in emit_prolog ().
+        */
+       for (i = 0; i < n; ++i) {
+               ainfo = cinfo->args + i;
+
+               in = call->args [i];
+
+               if (ainfo->storage == ArgInIReg)
+                       add_outarg_reg (cfg, call, ainfo->storage, ainfo->reg, in);
+       }
+
+       for (i = n - 1; i >= 0; --i) {
+               ainfo = cinfo->args + i;
+
+               in = call->args [i];
+
+               switch (ainfo->storage) {
+               case ArgInIReg:
+                       /* Already done */
+                       break;
+               case ArgInFloatSSEReg:
+               case ArgInDoubleSSEReg:
+                       add_outarg_reg (cfg, call, ainfo->storage, ainfo->reg, in);
+                       break;
+               case ArgOnStack:
+               case ArgValuetypeInReg:
+               case ArgValuetypeAddrInIReg:
+                       if (ainfo->storage == ArgOnStack && call->tail_call) {
+                               MonoInst *call_inst = (MonoInst*)call;
+                               cfg->args [i]->flags |= MONO_INST_VOLATILE;
+                               EMIT_NEW_ARGSTORE (cfg, call_inst, i, in);
+                       } else if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
                                guint32 align;
                                guint32 size;
 
@@ -1321,180 +1947,568 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                        size = sizeof (MonoTypedRef);
                                        align = sizeof (gpointer);
                                }
-                               else
-                               if (sig->pinvoke)
-                                       size = mono_type_native_stack_size (&in->klass->byval_arg, &align);
                                else {
-                                       /* 
-                                        * Other backends use mini_type_stack_size (), but that
-                                        * aligns the size to 8, which is larger than the size of
-                                        * the source, leading to reads of invalid memory if the
-                                        * source is at the end of address space.
-                                        */
-                                       size = mono_class_value_size (in->klass, &align);
-                               }
-                               if (ainfo->storage == ArgValuetypeInReg) {
-                                       if (ainfo->pair_storage [1] == ArgNone) {
-                                               MonoInst *load;
-
-                                               /* Simpler case */
-
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [0]));
-                                               load->inst_left = in;
-
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [0], ainfo->pair_regs [0], load);
-                                       }
+                                       if (sig->pinvoke)
+                                               size = mono_type_native_stack_size (&in->klass->byval_arg, &align);
                                        else {
-                                               /* Trees can't be shared so make a copy */
-                                               MonoInst *vtaddr = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
-                                               MonoInst *load, *load2, *offset_ins;
-
-                                               /* Reg1 */
-                                               MONO_INST_NEW (cfg, load, CEE_LDIND_I);
-                                               load->ssa_op = MONO_SSA_LOAD;
-                                               load->inst_i0 = (cfg)->varinfo [vtaddr->inst_c0];
-
-                                               NEW_ICONST (cfg, offset_ins, 0);
-                                               MONO_INST_NEW (cfg, load2, CEE_ADD);
-                                               load2->inst_left = load;
-                                               load2->inst_right = offset_ins;
-
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [0]));
-                                               load->inst_left = load2;
-
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [0], ainfo->pair_regs [0], load);
-
-                                               /* Reg2 */
-                                               MONO_INST_NEW (cfg, load, CEE_LDIND_I);
-                                               load->ssa_op = MONO_SSA_LOAD;
-                                               load->inst_i0 = (cfg)->varinfo [vtaddr->inst_c0];
-
-                                               NEW_ICONST (cfg, offset_ins, 8);
-                                               MONO_INST_NEW (cfg, load2, CEE_ADD);
-                                               load2->inst_left = load;
-                                               load2->inst_right = offset_ins;
-
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [1]));
-                                               load->inst_left = load2;
-
-                                               MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                                               arg->cil_code = in->cil_code;
-                                               arg->type = in->type;
-                                               MONO_INST_LIST_ADD (&arg->node, &call->out_args);
-
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [1], ainfo->pair_regs [1], load);
-
-                                               /* Prepend a copy inst */
-                                               MONO_INST_NEW (cfg, arg, CEE_STIND_I);
-                                               arg->cil_code = in->cil_code;
-                                               arg->ssa_op = MONO_SSA_STORE;
-                                               arg->inst_left = vtaddr;
-                                               arg->inst_right = in;
-                                               arg->type = in->type;
-
-                                               MONO_INST_LIST_ADD (&arg->node, &call->out_args);
+                                               /* 
+                                                * Other backends use mono_type_stack_size (), but that
+                                                * aligns the size to 8, which is larger than the size of
+                                                * the source, leading to reads of invalid memory if the
+                                                * source is at the end of address space.
+                                                */
+                                               size = mono_class_value_size (in->klass, &align);
                                        }
                                }
-                               else {
-                                       arg->opcode = OP_OUTARG_VT;
+                               g_assert (in->klass);
+
+                               if (size > 0) {
+                                       MONO_INST_NEW (cfg, arg, OP_OUTARG_VT);
+                                       arg->sreg1 = in->dreg;
                                        arg->klass = in->klass;
-                                       arg->backend.is_pinvoke = sig->pinvoke;
-                                       arg->inst_imm = size;
+                                       arg->backend.size = size;
+                                       arg->inst_p0 = call;
+                                       arg->inst_p1 = mono_mempool_alloc (cfg->mempool, sizeof (ArgInfo));
+                                       memcpy (arg->inst_p1, ainfo, sizeof (ArgInfo));
+
+                                       MONO_ADD_INS (cfg->cbb, arg);
                                }
-                       }
-                       else {
-                               switch (ainfo->storage) {
-                               case ArgInIReg:
-                                       add_outarg_reg (cfg, call, arg, ainfo->storage, ainfo->reg, in);
-                                       break;
-                               case ArgInFloatSSEReg:
-                               case ArgInDoubleSSEReg:
-                                       add_outarg_reg (cfg, call, arg, ainfo->storage, ainfo->reg, in);
-                                       break;
-                               case ArgOnStack:
-                                       arg->opcode = OP_OUTARG;
+                       } else {
+                               if (cfg->arch.no_pushes) {
+                                       /* Already done */
+                               } else {
+                                       MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
+                                       arg->sreg1 = in->dreg;
                                        if (!sig->params [i - sig->hasthis]->byref) {
-                                               if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4)
-                                                       arg->opcode = OP_OUTARG_R4;
-                                               else
-                                                       if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8)
-                                                               arg->opcode = OP_OUTARG_R8;
+                                               if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4) {
+                                                       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
+                                                       arg->opcode = OP_STORER4_MEMBASE_REG;
+                                                       arg->inst_destbasereg = X86_ESP;
+                                                       arg->inst_offset = 0;
+                                               } else if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8) {
+                                                       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
+                                                       arg->opcode = OP_STORER8_MEMBASE_REG;
+                                                       arg->inst_destbasereg = X86_ESP;
+                                                       arg->inst_offset = 0;
+                                               }
                                        }
-                                       break;
-                               default:
-                                       g_assert_not_reached ();
+                                       MONO_ADD_INS (cfg->cbb, arg);
                                }
                        }
+                       break;
+               default:
+                       g_assert_not_reached ();
                }
+
+               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos))
+                       /* Emit the signature cookie just before the implicit arguments */
+                       emit_sig_cookie (cfg, call, cinfo);
        }
 
        /* Handle the case where there are no implicit arguments */
-       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos)) {
+       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos))
                emit_sig_cookie (cfg, call, cinfo);
+
+       if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret)) {
+               MonoInst *vtarg;
+
+               if (cinfo->ret.storage == ArgValuetypeInReg) {
+                       if (cinfo->ret.pair_storage [0] == ArgInIReg && cinfo->ret.pair_storage [1] == ArgNone) {
+                               /*
+                                * Tell the JIT to use a more efficient calling convention: call using
+                                * OP_CALL, compute the result location after the call, and save the 
+                                * result there.
+                                */
+                               call->vret_in_reg = TRUE;
+                               /* 
+                                * Nullify the instruction computing the vret addr to enable 
+                                * future optimizations.
+                                */
+                               if (call->vret_var)
+                                       NULLIFY_INS (call->vret_var);
+                       } else {
+                               if (call->tail_call)
+                                       NOT_IMPLEMENTED;
+                               /*
+                                * The valuetype is in RAX:RDX after the call and needs to be copied
+                                * to the stack. Store the address here, so the call instruction can
+                                * access it.
+                                */
+                               if (!cfg->arch.vret_addr_loc) {
+                                       cfg->arch.vret_addr_loc = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
+                                       /* Prevent it from being register allocated or optimized away */
+                                       ((MonoInst*)cfg->arch.vret_addr_loc)->flags |= MONO_INST_VOLATILE;
+                               }
+
+                               MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, ((MonoInst*)cfg->arch.vret_addr_loc)->dreg, call->vret_var->dreg);
+                       }
+               }
+               else {
+                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
+                       vtarg->sreg1 = call->vret_var->dreg;
+                       vtarg->dreg = mono_alloc_preg (cfg);
+                       MONO_ADD_INS (cfg->cbb, vtarg);
+
+                       mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
+               }
        }
 
-       if (cinfo->need_stack_align) {
-               MONO_INST_NEW (cfg, arg, OP_AMD64_OUTARG_ALIGN_STACK);
-               MONO_INST_LIST_ADD (&arg->node, &call->out_args);
+#ifdef HOST_WIN32
+       /* Reserve the 32 byte register parameter (shadow) area required by the Win64 ABI */
+       if (call->inst.opcode != OP_JMP && call->inst.opcode != OP_TAILCALL) {
+               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 0x20);
        }
+#endif
 
        if (cfg->method->save_lmf) {
                MONO_INST_NEW (cfg, arg, OP_AMD64_SAVE_SP_TO_LMF);
-               MONO_INST_LIST_ADD_TAIL (&arg->node, &call->out_args);
+               MONO_ADD_INS (cfg->cbb, arg);
        }
 
        call->stack_usage = cinfo->stack_usage;
-       cfg->param_area = MAX (cfg->param_area, call->stack_usage);
-       cfg->flags |= MONO_CFG_HAS_CALLS;
+}
+
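+/*
+ * mono_arch_emit_outarg_vt:
+ *
+ *   Emit the IR to pass the vtype in SRC as the argument described by the
+ * ArgInfo attached to INS: in a register pair, through a temporary copy whose
+ * address is passed in a register, or on the stack.
+ */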
+void
+mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
+{
+       MonoInst *arg;
+       MonoCallInst *call = (MonoCallInst*)ins->inst_p0;
+       ArgInfo *ainfo = (ArgInfo*)ins->inst_p1;
+       int size = ins->backend.size;
+
+       if (ainfo->storage == ArgValuetypeInReg) {
+               MonoInst *load;
+               int part;
 
-       return call;
+               for (part = 0; part < 2; ++part) {
+                       if (ainfo->pair_storage [part] == ArgNone)
+                               continue;
+
+                       MONO_INST_NEW (cfg, load, arg_storage_to_load_membase (ainfo->pair_storage [part]));
+                       load->inst_basereg = src->dreg;
+                       load->inst_offset = part * sizeof (gpointer);
+
+                       switch (ainfo->pair_storage [part]) {
+                       case ArgInIReg:
+                               load->dreg = mono_alloc_ireg (cfg);
+                               break;
+                       case ArgInDoubleSSEReg:
+                       case ArgInFloatSSEReg:
+                               load->dreg = mono_alloc_freg (cfg);
+                               break;
+                       default:
+                               g_assert_not_reached ();
+                       }
+                       MONO_ADD_INS (cfg->cbb, load);
+
+                       add_outarg_reg (cfg, call, ainfo->pair_storage [part], ainfo->pair_regs [part], load);
+               }
+       } else if (ainfo->storage == ArgValuetypeAddrInIReg) {
+               MonoInst *vtaddr, *load;
+               vtaddr = mono_compile_create_var (cfg, &ins->klass->byval_arg, OP_LOCAL);
+               
+               g_assert (!cfg->arch.no_pushes);
+
+               MONO_INST_NEW (cfg, load, OP_LDADDR);
+               load->inst_p0 = vtaddr;
+               vtaddr->flags |= MONO_INST_INDIRECT;
+               load->type = STACK_MP;
+               load->klass = vtaddr->klass;
+               load->dreg = mono_alloc_ireg (cfg);
+               MONO_ADD_INS (cfg->cbb, load);
+               mini_emit_memcpy (cfg, load->dreg, 0, src->dreg, 0, size, 4);
+
+               if (ainfo->pair_storage [0] == ArgInIReg) {
+                       MONO_INST_NEW (cfg, arg, OP_X86_LEA_MEMBASE);
+                       arg->dreg = mono_alloc_ireg (cfg);
+                       arg->sreg1 = load->dreg;
+                       arg->inst_imm = 0;
+                       MONO_ADD_INS (cfg->cbb, arg);
+                       mono_call_inst_add_outarg_reg (cfg, call, arg->dreg, ainfo->pair_regs [0], FALSE);
+               } else {
+                       MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
+                       arg->sreg1 = load->dreg;
+                       MONO_ADD_INS (cfg->cbb, arg);
+               }
+       } else {
+               if (size == 8) {
+                       if (cfg->arch.no_pushes) {
+                               int dreg = mono_alloc_ireg (cfg);
+
+                               MONO_EMIT_NEW_LOAD_MEMBASE (cfg, dreg, src->dreg, 0);
+                               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, ainfo->offset, dreg);
+                       } else {
+                               /* Can't use this for < 8 since it does an 8 byte memory load */
+                               MONO_INST_NEW (cfg, arg, OP_X86_PUSH_MEMBASE);
+                               arg->inst_basereg = src->dreg;
+                               arg->inst_offset = 0;
+                               MONO_ADD_INS (cfg->cbb, arg);
+                       }
+               } else if (size <= 40) {
+                       if (cfg->arch.no_pushes) {
+                               mini_emit_memcpy (cfg, AMD64_RSP, ainfo->offset, src->dreg, 0, size, 4);
+                       } else {
+                               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, ALIGN_TO (size, 8));
+                               mini_emit_memcpy (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
+                       }
+               } else {
+                       if (cfg->arch.no_pushes) {
+                               // FIXME: Code growth
+                               mini_emit_memcpy (cfg, AMD64_RSP, ainfo->offset, src->dreg, 0, size, 4);
+                       } else {
+                               MONO_INST_NEW (cfg, arg, OP_X86_PUSH_OBJ);
+                               arg->inst_basereg = src->dreg;
+                               arg->inst_offset = 0;
+                               arg->inst_imm = size;
+                               MONO_ADD_INS (cfg->cbb, arg);
+                       }
+               }
+       }
+}
+
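+/*
+ * mono_arch_emit_setret:
+ *
+ *   Emit the IR to move VAL into the return register of METHOD. R4 values
+ * are converted from the internal R8 representation, except under LLVM,
+ * which handles R4 natively.
+ */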
+void
+mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val)
+{
+       MonoType *ret = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret);
+
+       if (ret->type == MONO_TYPE_R4) {
+               if (COMPILE_LLVM (cfg))
+                       MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
+               else
+                       MONO_EMIT_NEW_UNALU (cfg, OP_AMD64_SET_XMMREG_R4, cfg->ret->dreg, val->dreg);
+               return;
+       } else if (ret->type == MONO_TYPE_R8) {
+               MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
+               return;
+       }
+                       
+       MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, cfg->ret->dreg, val->dreg);
 }
 
 #define EMIT_COND_BRANCH(ins,cond,sign) \
-if (ins->flags & MONO_INST_BRLABEL) { \
-        if (ins->inst_i0->inst_c0) { \
-               x86_branch (code, cond, cfg->native_code + ins->inst_i0->inst_c0, sign); \
-        } else { \
-               mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_LABEL, ins->inst_i0); \
-               if ((cfg->opt & MONO_OPT_BRANCH) && \
-                    x86_is_imm8 (ins->inst_i0->inst_c1 - cpos)) \
-                       x86_branch8 (code, cond, 0, sign); \
-                else \
-                       x86_branch32 (code, cond, 0, sign); \
-        } \
-} else { \
         if (ins->inst_true_bb->native_offset) { \
                x86_branch (code, cond, cfg->native_code + ins->inst_true_bb->native_offset, sign); \
         } else { \
                mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_true_bb); \
                if ((cfg->opt & MONO_OPT_BRANCH) && \
-                    x86_is_imm8 (ins->inst_true_bb->max_offset - cpos)) \
+            x86_is_imm8 (ins->inst_true_bb->max_offset - offset)) \
                        x86_branch8 (code, cond, 0, sign); \
                 else \
                        x86_branch32 (code, cond, 0, sign); \
-        } \
 }
 
-/* emit an exception if condition is fail */
-#define EMIT_COND_SYSTEM_EXCEPTION(cond,signed,exc_name)            \
-        do {                                                        \
-               MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
-               if (tins == NULL) {                                                                             \
-                       mono_add_patch_info (cfg, code - cfg->native_code,   \
-                                       MONO_PATCH_INFO_EXC, exc_name);  \
-                       x86_branch32 (code, cond, 0, signed);               \
-               } else {        \
-                       EMIT_COND_BRANCH (tins, cond, signed);  \
-               }                       \
-       } while (0); 
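+/* Per-signature data computed by mono_arch_dyn_call_prepare () */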
+typedef struct {
+       MonoMethodSignature *sig;
+       CallInfo *cinfo;
+} ArchDynCallInfo;
 
-#define EMIT_FPCOMPARE(code) do { \
-       amd64_fcompp (code); \
-       amd64_fnstsw (code); \
-} while (0); 
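+/*
+ * The argument area passed to the OP_DYN_CALL opcode: the values to load into
+ * the integer parameter registers, the raw result register, and a pointer to
+ * the return value buffer.
+ */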
+typedef struct {
+       mgreg_t regs [PARAM_REGS];
+       mgreg_t res;
+       guint8 *ret;
+} DynCallArgs;
 
-#define EMIT_SSE2_FPFUNC(code, op, dreg, sreg1) do { \
-    amd64_movsd_membase_reg (code, AMD64_RSP, -8, (sreg1)); \
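+/*
+ * dyn_call_supported:
+ *
+ *   Return whether OP_DYN_CALL can handle SIG/CINFO, i.e. whether every
+ * argument and the return value fit into integer registers. Never supported
+ * on Win64.
+ */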
+static gboolean
+dyn_call_supported (MonoMethodSignature *sig, CallInfo *cinfo)
+{
+       int i;
+
+#ifdef HOST_WIN32
+       return FALSE;
+#endif
+
+       switch (cinfo->ret.storage) {
+       case ArgNone:
+       case ArgInIReg:
+               break;
+       case ArgValuetypeInReg: {
+               ArgInfo *ainfo = &cinfo->ret;
+
+               if (ainfo->pair_storage [0] != ArgNone && ainfo->pair_storage [0] != ArgInIReg)
+                       return FALSE;
+               if (ainfo->pair_storage [1] != ArgNone && ainfo->pair_storage [1] != ArgInIReg)
+                       return FALSE;
+               break;
+       }
+       default:
+               return FALSE;
+       }
+
+       for (i = 0; i < cinfo->nargs; ++i) {
+               ArgInfo *ainfo = &cinfo->args [i];
+               switch (ainfo->storage) {
+               case ArgInIReg:
+                       break;
+               case ArgValuetypeInReg:
+                       if (ainfo->pair_storage [0] != ArgNone && ainfo->pair_storage [0] != ArgInIReg)
+                               return FALSE;
+                       if (ainfo->pair_storage [1] != ArgNone && ainfo->pair_storage [1] != ArgInIReg)
+                               return FALSE;
+                       break;
+               default:
+                       return FALSE;
+               }
+       }
+
+       return TRUE;
+}
+
+/*
+ * mono_arch_dyn_call_prepare:
+ *
+ *   Return a pointer to an arch-specific structure which contains information 
+ * needed by mono_arch_start_dyn_call (). Return NULL if OP_DYN_CALL is not
+ * supported for SIG.
+ * This function is equivalent to ffi_prep_cif in libffi.
+ */
+MonoDynCallInfo*
+mono_arch_dyn_call_prepare (MonoMethodSignature *sig)
+{
+       ArchDynCallInfo *info;
+       CallInfo *cinfo;
+
+       cinfo = get_call_info (NULL, NULL, sig, FALSE);
+
+       if (!dyn_call_supported (sig, cinfo)) {
+               g_free (cinfo);
+               return NULL;
+       }
+
+       info = g_new0 (ArchDynCallInfo, 1);
+       // FIXME: Preprocess the info to speed up get_dyn_call_args ().
+       info->sig = sig;
+       info->cinfo = cinfo;
+       
+       return (MonoDynCallInfo*)info;
+}
+
+/*
+ * mono_arch_dyn_call_free:
+ *
+ *   Free a MonoDynCallInfo structure.
+ */
+void
+mono_arch_dyn_call_free (MonoDynCallInfo *info)
+{
+       ArchDynCallInfo *ainfo = (ArchDynCallInfo*)info;
+
+       g_free (ainfo->cinfo);
+       g_free (ainfo);
+}
+
+/*
+ * mono_arch_start_dyn_call:
+ *
+ *   Convert the arguments ARGS to a format which can be passed to OP_DYN_CALL, and
+ * store the result into BUF.
+ * ARGS should be an array of pointers pointing to the arguments.
+ * RET should point to a memory buffer large enough to hold the result of the
+ * call.
+ * This function should be as fast as possible; any work which does not depend
+ * on the actual values of the arguments should be done in
+ * mono_arch_dyn_call_prepare ().
+ * start_dyn_call + OP_DYN_CALL + finish_dyn_call is equivalent to ffi_call in
+ * libffi.
+ */
+void
+mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, guint8 *buf, int buf_len)
+{
+       ArchDynCallInfo *dinfo = (ArchDynCallInfo*)info;
+       DynCallArgs *p = (DynCallArgs*)buf;
+       int arg_index, greg, i;
+       MonoMethodSignature *sig = dinfo->sig;
+
+       g_assert (buf_len >= sizeof (DynCallArgs));
+
+       p->res = 0;
+       p->ret = ret;
+
+       arg_index = 0;
+       greg = 0;
+
+       if (dinfo->cinfo->vtype_retaddr)
+               p->regs [greg ++] = (mgreg_t)ret;
+
+       if (sig->hasthis) {
+               p->regs [greg ++] = (mgreg_t)*(args [arg_index ++]);
+       }
+
+       for (i = 0; i < sig->param_count; i++) {
+               MonoType *t = mono_type_get_underlying_type (sig->params [i]);
+               gpointer *arg = args [arg_index ++];
+
+               if (t->byref) {
+                       p->regs [greg ++] = (mgreg_t)*(arg);
+                       continue;
+               }
+
+               switch (t->type) {
+               case MONO_TYPE_STRING:
+               case MONO_TYPE_CLASS:  
+               case MONO_TYPE_ARRAY:
+               case MONO_TYPE_SZARRAY:
+               case MONO_TYPE_OBJECT:
+               case MONO_TYPE_PTR:
+               case MONO_TYPE_I:
+               case MONO_TYPE_U:
+               case MONO_TYPE_I8:
+               case MONO_TYPE_U8:
+                       g_assert (dinfo->cinfo->args [i + sig->hasthis].reg == param_regs [greg]);
+                       p->regs [greg ++] = (mgreg_t)*(arg);
+                       break;
+               case MONO_TYPE_BOOLEAN:
+               case MONO_TYPE_U1:
+                       p->regs [greg ++] = *(guint8*)(arg);
+                       break;
+               case MONO_TYPE_I1:
+                       p->regs [greg ++] = *(gint8*)(arg);
+                       break;
+               case MONO_TYPE_I2:
+                       p->regs [greg ++] = *(gint16*)(arg);
+                       break;
+               case MONO_TYPE_U2:
+               case MONO_TYPE_CHAR:
+                       p->regs [greg ++] = *(guint16*)(arg);
+                       break;
+               case MONO_TYPE_I4:
+                       p->regs [greg ++] = *(gint32*)(arg);
+                       break;
+               case MONO_TYPE_U4:
+                       p->regs [greg ++] = *(guint32*)(arg);
+                       break;
+               case MONO_TYPE_GENERICINST:
+                       if (MONO_TYPE_IS_REFERENCE (t)) {
+                               p->regs [greg ++] = (mgreg_t)*(arg);
+                               break;
+                       } else {
+                               /* Fall through */
+                       }
+               case MONO_TYPE_VALUETYPE: {
+                       ArgInfo *ainfo = &dinfo->cinfo->args [i + sig->hasthis];
+
+                       g_assert (ainfo->storage == ArgValuetypeInReg);
+                       if (ainfo->pair_storage [0] != ArgNone) {
+                               g_assert (ainfo->pair_storage [0] == ArgInIReg);
+                               p->regs [greg ++] = ((mgreg_t*)(arg))[0];
+                       }
+                       if (ainfo->pair_storage [1] != ArgNone) {
+                               g_assert (ainfo->pair_storage [1] == ArgInIReg);
+                               p->regs [greg ++] = ((mgreg_t*)(arg))[1];
+                       }
+                       break;
+               }
+               default:
+                       g_assert_not_reached ();
+               }
+       }
+
+       g_assert (greg <= PARAM_REGS);
+}
+
+/*
+ * mono_arch_finish_dyn_call:
+ *
+ *   Store the result of a dyn call into the return value buffer passed to
+ * start_dyn_call ().
+ * This function should be as fast as possible; any work which does not depend
+ * on the actual values of the arguments should be done in
+ * mono_arch_dyn_call_prepare ().
+ */
+void
+mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf)
+{
+       ArchDynCallInfo *dinfo = (ArchDynCallInfo*)info;
+       MonoMethodSignature *sig = dinfo->sig;
+       guint8 *ret = ((DynCallArgs*)buf)->ret;
+       mgreg_t res = ((DynCallArgs*)buf)->res;
+
+       switch (mono_type_get_underlying_type (sig->ret)->type) {
+       case MONO_TYPE_VOID:
+               *(gpointer*)ret = NULL;
+               break;
+       case MONO_TYPE_STRING:
+       case MONO_TYPE_CLASS:  
+       case MONO_TYPE_ARRAY:
+       case MONO_TYPE_SZARRAY:
+       case MONO_TYPE_OBJECT:
+       case MONO_TYPE_I:
+       case MONO_TYPE_U:
+       case MONO_TYPE_PTR:
+               *(gpointer*)ret = (gpointer)res;
+               break;
+       case MONO_TYPE_I1:
+               *(gint8*)ret = res;
+               break;
+       case MONO_TYPE_U1:
+       case MONO_TYPE_BOOLEAN:
+               *(guint8*)ret = res;
+               break;
+       case MONO_TYPE_I2:
+               *(gint16*)ret = res;
+               break;
+       case MONO_TYPE_U2:
+       case MONO_TYPE_CHAR:
+               *(guint16*)ret = res;
+               break;
+       case MONO_TYPE_I4:
+               *(gint32*)ret = res;
+               break;
+       case MONO_TYPE_U4:
+               *(guint32*)ret = res;
+               break;
+       case MONO_TYPE_I8:
+               *(gint64*)ret = res;
+               break;
+       case MONO_TYPE_U8:
+               *(guint64*)ret = res;
+               break;
+       case MONO_TYPE_GENERICINST:
+               if (MONO_TYPE_IS_REFERENCE (sig->ret)) {
+                       *(gpointer*)ret = (gpointer)res;
+                       break;
+               } else {
+                       /* Fall through */
+               }
+       case MONO_TYPE_VALUETYPE:
+               if (dinfo->cinfo->vtype_retaddr) {
+                       /* Nothing to do */
+               } else {
+                       ArgInfo *ainfo = &dinfo->cinfo->ret;
+
+                       g_assert (ainfo->storage == ArgValuetypeInReg);
+
+                       if (ainfo->pair_storage [0] != ArgNone) {
+                               g_assert (ainfo->pair_storage [0] == ArgInIReg);
+                               ((mgreg_t*)ret)[0] = res;
+                       }
+
+                       g_assert (ainfo->pair_storage [1] == ArgNone);
+               }
+               break;
+       default:
+               g_assert_not_reached ();
+       }
+}
+
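+/*
+ * A sketch of the intended dyn call sequence (the caller below is
+ * hypothetical and for illustration only):
+ *
+ *   MonoDynCallInfo *info = mono_arch_dyn_call_prepare (sig);
+ *   if (info) {
+ *           guint8 buf [256];   (must be >= sizeof (DynCallArgs))
+ *           guint8 ret [16];    (large enough for the return value)
+ *
+ *           mono_arch_start_dyn_call (info, args, ret, buf, sizeof (buf));
+ *           ... invoke OP_DYN_CALL on buf ...
+ *           mono_arch_finish_dyn_call (info, buf);
+ *           mono_arch_dyn_call_free (info);
+ *   }
+ */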
+/* Emit an exception if the condition fails */
+#define EMIT_COND_SYSTEM_EXCEPTION(cond,signed,exc_name)            \
+        do {                                                        \
+               MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
+               if (tins == NULL) {                                                                             \
+                       mono_add_patch_info (cfg, code - cfg->native_code,   \
+                                       MONO_PATCH_INFO_EXC, exc_name);  \
+                       x86_branch32 (code, cond, 0, signed);               \
+               } else {        \
+                       EMIT_COND_BRANCH (tins, cond, signed);  \
+               }                       \
+       } while (0); 
+
+#define EMIT_FPCOMPARE(code) do { \
+       amd64_fcompp (code); \
+       amd64_fnstsw (code); \
+} while (0); 
+
+#define EMIT_SSE2_FPFUNC(code, op, dreg, sreg1) do { \
+    amd64_movsd_membase_reg (code, AMD64_RSP, -8, (sreg1)); \
        amd64_fld_membase (code, AMD64_RSP, -8, TRUE); \
        amd64_ ##op (code); \
        amd64_fst_membase (code, AMD64_RSP, -8, TRUE, TRUE); \
@@ -1504,7 +2518,7 @@ if (ins->flags & MONO_INST_BRLABEL) { \
 static guint8*
 emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data)
 {
-       mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
+       gboolean no_patch = FALSE;
 
        /* 
         * FIXME: Add support for thunks
@@ -1523,7 +2537,7 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                        near_call = TRUE;
 
                        if ((patch_type == MONO_PATCH_INFO_METHOD) || (patch_type == MONO_PATCH_INFO_METHOD_JUMP)) {
-                               if (((MonoMethod*)data)->klass->image->assembly->aot_module)
+                               if (((MonoMethod*)data)->klass->image->aot_module)
                                        /* The callee might be an AOT method */
                                        near_call = FALSE;
                                if (((MonoMethod*)data)->dynamic)
@@ -1545,9 +2559,13 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                        }
                }
                else {
-                       if (mono_find_class_init_trampoline_by_addr (data))
+                       if (cfg->abs_patches && g_hash_table_lookup (cfg->abs_patches, data)) {
+                               /* 
+                                * This is not really an optimization, but required because the
+                                * generic class init trampolines use R11 to pass the vtable.
+                                */
                                near_call = TRUE;
-                       else {
+                       } else {
                                MonoJitICallInfo *info = mono_find_jit_icall_by_addr (data);
                                if (info) {
                                        if ((cfg->method->wrapper_type == MONO_WRAPPER_MANAGED_TO_NATIVE) && 
@@ -1555,6 +2573,7 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                                                /* A call to the wrapped function */
                                                if ((((guint64)data) >> 32) == 0)
                                                        near_call = TRUE;
+                                               no_patch = TRUE;
                                        }
                                        else if (info->func == info->wrapper) {
                                                /* No wrapper */
@@ -1567,8 +2586,10 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                                                        near_call = TRUE;
                                        }
                                }
-                               else if ((((guint64)data) >> 32) == 0)
+                               else if ((((guint64)data) >> 32) == 0) {
                                        near_call = TRUE;
+                                       no_patch = TRUE;
+                               }
                        }
                }
 
@@ -1576,17 +2597,32 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                        /* These methods are allocated using malloc */
                        near_call = FALSE;
 
-               if (cfg->compile_aot)
-                       near_call = TRUE;
-
 #ifdef MONO_ARCH_NOMAP32BIT
                near_call = FALSE;
 #endif
 
+               /* The 64bit XEN kernel does not honour the MAP_32BIT flag. (#522894) */
+               if (optimize_for_xen)
+                       near_call = FALSE;
+
+               if (cfg->compile_aot) {
+                       near_call = TRUE;
+                       no_patch = TRUE;
+               }
+
                if (near_call) {
+                       /* 
+                        * Align the call displacement to an address divisible by 4 so it does
+                        * not span cache lines. This is required for code patching to work on SMP
+                        * systems.
+                        */
+                       if (!no_patch && ((guint32)(code + 1 - cfg->native_code) % 4) != 0)
+                               amd64_padding (code, 4 - ((guint32)(code + 1 - cfg->native_code) % 4));
+                       mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
                        amd64_call_code (code, 0);
                }
                else {
+                       mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
                        amd64_set_reg_template (code, GP_SCRATCH_REG);
                        amd64_call_reg (code, GP_SCRATCH_REG);
                }
@@ -1596,11 +2632,19 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
 }
 
 static inline guint8*
-emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data)
+emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data, gboolean win64_adjust_stack)
 {
-       mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
-
-       return emit_call_body (cfg, code, patch_type, data);
+#ifdef HOST_WIN32
+       if (win64_adjust_stack)
+               amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 32);
+#endif
+       code = emit_call_body (cfg, code, patch_type, data);
+#ifdef HOST_WIN32
+       if (win64_adjust_stack)
+               amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 32);
+#endif 
+       
+       return code;
 }
 
 static inline int
@@ -1631,7 +2675,7 @@ mono_arch_peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
        MonoInst *ins, *n;
 
        MONO_BB_FOR_EACH_INS_SAFE (bb, n, ins) {
-               MonoInst *last_ins = mono_inst_list_prev (&ins->node, &bb->ins_list);
+               MonoInst *last_ins = ins->prev;
 
                switch (ins->opcode) {
                case OP_ADD_IMM:
@@ -1658,8 +2702,7 @@ mono_arch_peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
                                 * propagation). These instruction sequences are very common
                                 * in the initlocals bblock.
                                 */
-                               for (ins2 = mono_inst_list_next (&ins->node, &bb->ins_list); ins2;
-                                               ins2 = mono_inst_list_next (&ins2->node, &bb->ins_list)) {
+                               for (ins2 = ins->next; ins2; ins2 = ins2->next) {
                                        if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM) || (ins2->opcode == OP_STORE_MEMBASE_IMM)) && (ins2->inst_imm == 0)) {
                                                ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode);
                                                ins2->sreg1 = ins->dreg;
@@ -1724,13 +2767,9 @@ mono_arch_peephole_pass_2 (MonoCompile *cfg, MonoBasicBlock *bb)
                switch (ins->opcode) {
                case OP_ICONST:
                case OP_I8CONST: {
-                       MonoInst *next;
-
                        /* reg = 0 -> XOR (reg, reg) */
                        /* XOR sets cflags on x86, so we can't do it always */
-                       next = mono_inst_list_next (&ins->node, &bb->ins_list);
-                       if (ins->inst_c0 == 0 && (!next ||
-                                       (next && INST_IGNORES_CFLAGS (next->opcode)))) {
+                       if (ins->inst_c0 == 0 && (!ins->next || INST_IGNORES_CFLAGS (ins->next->opcode))) {
                                ins->opcode = OP_LXOR;
                                ins->sreg1 = ins->dreg;
                                ins->sreg2 = ins->dreg;
@@ -1758,12 +2797,11 @@ mono_arch_peephole_pass_2 (MonoCompile *cfg, MonoBasicBlock *bb)
                                 * propagation). These instruction sequences are very common
                                 * in the initlocals bblock.
                                 */
-                               for (ins2 = mono_inst_list_next (&ins->node, &bb->ins_list); ins2;
-                                               ins2 = mono_inst_list_next (&ins2->node, &bb->ins_list)) {
+                               for (ins2 = ins->next; ins2; ins2 = ins2->next) {
                                        if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM) || (ins2->opcode == OP_STORE_MEMBASE_IMM)) && (ins2->inst_imm == 0)) {
                                                ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode);
                                                ins2->sreg1 = ins->dreg;
-                                       } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_REG) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG)) {
+                                       } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_REG) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG) || (ins2->opcode == OP_LIVERANGE_START)) {
                                                /* Continue */
                                        } else if (((ins2->opcode == OP_ICONST) || (ins2->opcode == OP_I8CONST)) && (ins2->dreg == ins->dreg) && (ins2->inst_c0 == 0)) {
                                                NULLIFY_INS (ins2);
@@ -1790,7 +2828,8 @@ mono_arch_peephole_pass_2 (MonoCompile *cfg, MonoBasicBlock *bb)
 
 #define NEW_INS(cfg,ins,dest,op) do {  \
                MONO_INST_NEW ((cfg), (dest), (op)); \
-               MONO_INST_LIST_ADD_TAIL (&(dest)->node, &(ins)->node); \
+        (dest)->cil_code = (ins)->cil_code; \
+        mono_bblock_insert_before_ins (bb, ins, (dest)); \
        } while (0)
 
 /*
@@ -1804,9 +2843,6 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 {
        MonoInst *ins, *n, *temp;
 
-       if (bb->max_vreg > cfg->rs->next_vreg)
-               cfg->rs->next_vreg = bb->max_vreg;
-
        /*
         * FIXME: Need to add more instructions, but the current machine 
         * description can't model some parts of the composite instructions like
@@ -1817,17 +2853,21 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_DIV_IMM:
                case OP_REM_IMM:
                case OP_IDIV_IMM:
-               case OP_IREM_IMM:
                case OP_IDIV_UN_IMM:
                case OP_IREM_UN_IMM:
-                       mono_decompose_op_imm (cfg, ins);
+                       mono_decompose_op_imm (cfg, bb, ins);
+                       break;
+               case OP_IREM_IMM:
+                       /* Keep the opcode if we can implement it efficiently */
+                       if (!((ins->inst_imm > 0) && (mono_is_power_of_two (ins->inst_imm) != -1)))
+                               mono_decompose_op_imm (cfg, bb, ins);
                        break;
                case OP_COMPARE_IMM:
                case OP_LCOMPARE_IMM:
                        if (!amd64_is_imm32 (ins->inst_imm)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_imm;
-                               temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_COMPARE;
                                ins->sreg2 = temp->dreg;
                        }
@@ -1837,7 +2877,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (!amd64_is_imm32 (ins->inst_offset)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_offset;
-                               temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_AMD64_LOADI8_MEMINDEX;
                                ins->inst_indexreg = temp->dreg;
                        }
@@ -1847,17 +2887,41 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (!amd64_is_imm32 (ins->inst_imm)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_imm;
-                               temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_STOREI8_MEMBASE_REG;
                                ins->sreg1 = temp->dreg;
                        }
                        break;
+#ifdef MONO_ARCH_SIMD_INTRINSICS
+               case OP_EXPAND_I1: {
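+                               /*
+                                * There is no byte-sized expand: zero-extend the low byte,
+                                * duplicate it into bits 8-15 with a shift + or, then expand
+                                * the resulting 16 bit value with OP_EXPAND_I2.
+                                */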
+                               int temp_reg1 = mono_alloc_ireg (cfg);
+                               int temp_reg2 = mono_alloc_ireg (cfg);
+                               int original_reg = ins->sreg1;
+
+                               NEW_INS (cfg, ins, temp, OP_ICONV_TO_U1);
+                               temp->sreg1 = original_reg;
+                               temp->dreg = temp_reg1;
+
+                               NEW_INS (cfg, ins, temp, OP_SHL_IMM);
+                               temp->sreg1 = temp_reg1;
+                               temp->dreg = temp_reg2;
+                               temp->inst_imm = 8;
+
+                               NEW_INS (cfg, ins, temp, OP_LOR);
+                               temp->sreg1 = temp->dreg = temp_reg2;
+                               temp->sreg2 = temp_reg1;
+
+                               ins->opcode = OP_EXPAND_I2;
+                               ins->sreg1 = temp_reg2;
+                       }
+                       break;
+#endif
                default:
                        break;
                }
        }
 
-       bb->max_vreg = cfg->rs->next_vreg;
+       bb->max_vreg = cfg->next_vreg;
 }
 
 static const int 
@@ -1885,22 +2949,7 @@ cc_signed_table [] = {
 static unsigned char*
 emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int sreg, int size, gboolean is_signed)
 {
-       if (use_sse2) {
-               amd64_sse_cvttsd2si_reg_reg (code, dreg, sreg);
-       }
-       else {
-               amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 16);
-               x86_fnstcw_membase(code, AMD64_RSP, 0);
-               amd64_mov_reg_membase (code, dreg, AMD64_RSP, 0, 2);
-               amd64_alu_reg_imm (code, X86_OR, dreg, 0xc00);
-               amd64_mov_membase_reg (code, AMD64_RSP, 2, dreg, 2);
-               amd64_fldcw_membase (code, AMD64_RSP, 2);
-               amd64_push_reg (code, AMD64_RAX); // SP = SP - 8
-               amd64_fist_pop_membase (code, AMD64_RSP, 0, size == 8);
-               amd64_pop_reg (code, dreg);
-               amd64_fldcw_membase (code, AMD64_RSP, 0);
-               amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 16);
-       }
+       amd64_sse_cvttsd2si_reg_reg (code, dreg, sreg);
 
        if (size == 1)
                amd64_widen_reg (code, dreg, dreg, is_signed, FALSE);
@@ -1910,12 +2959,12 @@ emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int sreg, int size,
 }
 
 static unsigned char*
-mono_emit_stack_alloc (guchar *code, MonoInst* tree)
+mono_emit_stack_alloc (MonoCompile *cfg, guchar *code, MonoInst* tree)
 {
        int sreg = tree->sreg1;
        int need_touch = FALSE;
 
-#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
+#if defined(HOST_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
       if (!(tree->flags & MONO_INST_INIT))
                need_touch = TRUE;
 #endif
@@ -1982,6 +3031,8 @@ mono_emit_stack_alloc (guchar *code, MonoInst* tree)
                amd64_alu_reg_reg (code, X86_XOR, AMD64_RAX, AMD64_RAX);
                                
                amd64_lea_membase (code, AMD64_RDI, AMD64_RSP, offset);
+               if (cfg->param_area && cfg->arch.no_pushes)
+                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RDI, cfg->param_area);
                amd64_cld (code);
                amd64_prefix (code, X86_REP_PREFIX);
                amd64_stosl (code);
@@ -2017,35 +3068,27 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
        case OP_FCALL_REG:
        case OP_FCALL_MEMBASE:
                if (((MonoCallInst*)ins)->signature->ret->type == MONO_TYPE_R4) {
-                       if (use_sse2)
-                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, AMD64_XMM0);
-                       else {
-                               /* FIXME: optimize this */
-                               amd64_movss_membase_reg (code, AMD64_RSP, -8, AMD64_XMM0);
-                               amd64_fld_membase (code, AMD64_RSP, -8, FALSE);
-                       }
+                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, AMD64_XMM0);
                }
                else {
-                       if (use_sse2) {
-                               if (ins->dreg != AMD64_XMM0)
-                                       amd64_sse_movsd_reg_reg (code, ins->dreg, AMD64_XMM0);
-                       }
-                       else {
-                               /* FIXME: optimize this */
-                               amd64_movsd_membase_reg (code, AMD64_RSP, -8, AMD64_XMM0);
-                               amd64_fld_membase (code, AMD64_RSP, -8, TRUE);
-                       }
+                       if (ins->dreg != AMD64_XMM0)
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, AMD64_XMM0);
                }
                break;
        case OP_VCALL:
        case OP_VCALL_REG:
        case OP_VCALL_MEMBASE:
-               cinfo = get_call_info (cfg, cfg->mempool, ((MonoCallInst*)ins)->signature, FALSE);
+       case OP_VCALL2:
+       case OP_VCALL2_REG:
+       case OP_VCALL2_MEMBASE:
+               cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, ((MonoCallInst*)ins)->signature, FALSE);
                if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       /* Pop the destination address from the stack */
-                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
-                       amd64_pop_reg (code, AMD64_RCX);
-                       
+                       MonoInst *loc = cfg->arch.vret_addr_loc;
+
+                       /* Load the destination address */
+                       g_assert (loc->opcode == OP_REGOFFSET);
+                       amd64_mov_reg_membase (code, AMD64_RCX, loc->inst_basereg, loc->inst_offset, 8);
+
                        for (quad = 0; quad < 2; quad ++) {
                                switch (cinfo->ret.pair_storage [quad]) {
                                case ArgInIReg:
@@ -2071,19 +3114,25 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
 }
 
 /*
- * emit_tls_get:
+ * mono_amd64_emit_tls_get:
  * @code: buffer to store code to
  * @dreg: hard register where to place the result
  * @tls_offset: offset info
  *
- * emit_tls_get emits in @code the native code that puts in the dreg register
- * the item in the thread local storage identified by tls_offset.
+ * mono_amd64_emit_tls_get emits into @code the native code that loads
+ * into the dreg register the item in thread local storage identified
+ * by tls_offset.
  *
  * Returns: a pointer to the end of the stored code
  */
-static guint8*
-emit_tls_get (guint8* code, int dreg, int tls_offset)
+guint8*
+mono_amd64_emit_tls_get (guint8* code, int dreg, int tls_offset)
 {
+#ifdef HOST_WIN32
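+       /* gs points at the TEB on Win64; the TlsSlots array of 64 pointer-sized slots starts at offset 0x1480, hence the assert below */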
+       g_assert (tls_offset < 64);
+       x86_prefix (code, X86_GS_PREFIX);
+       amd64_mov_reg_mem (code, dreg, (tls_offset * 8) + 0x1480, 8);
+#else
        if (optimize_for_xen) {
                x86_prefix (code, X86_FS_PREFIX);
                amd64_mov_reg_mem (code, dreg, 0, 8);
@@ -2092,110 +3141,30 @@ emit_tls_get (guint8* code, int dreg, int tls_offset)
                x86_prefix (code, X86_FS_PREFIX);
                amd64_mov_reg_mem (code, dreg, tls_offset, 8);
        }
+#endif
        return code;
 }
 
-/*
- * emit_load_volatile_arguments:
- *
- *  Load volatile arguments from the stack to the original input registers.
- * Required before a tail call.
- */
-static guint8*
-emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
-{
-       MonoMethod *method = cfg->method;
-       MonoMethodSignature *sig;
-       MonoInst *ins;
-       CallInfo *cinfo;
-       guint32 i, quad;
+#define REAL_PRINT_REG(text,reg) \
+mono_assert (reg >= 0); \
+amd64_push_reg (code, AMD64_RAX); \
+amd64_push_reg (code, AMD64_RDX); \
+amd64_push_reg (code, AMD64_RCX); \
+amd64_push_reg (code, reg); \
+amd64_push_imm (code, reg); \
+amd64_push_imm (code, text " %d %p\n"); \
+amd64_mov_reg_imm (code, AMD64_RAX, printf); \
+amd64_call_reg (code, AMD64_RAX); \
+amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 3*4); \
+amd64_pop_reg (code, AMD64_RCX); \
+amd64_pop_reg (code, AMD64_RDX); \
+amd64_pop_reg (code, AMD64_RAX);
 
-       /* FIXME: Generate intermediate code instead */
+/* benchmark and set based on cpu */
+#define LOOP_ALIGNMENT 8
+#define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
 
-       sig = mono_method_signature (method);
-
-       cinfo = cfg->arch.cinfo;
-       
-       /* This is the opposite of the code in emit_prolog */
-       if (sig->ret->type != MONO_TYPE_VOID) {
-               if (cfg->vret_addr && (cfg->vret_addr->opcode != OP_REGVAR))
-                       amd64_mov_reg_membase (code, cinfo->ret.reg, cfg->vret_addr->inst_basereg, cfg->vret_addr->inst_offset, 8);
-       }
-
-       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               ArgInfo *ainfo = cinfo->args + i;
-               MonoType *arg_type;
-               ins = cfg->args [i];
-
-               if (sig->hasthis && (i == 0))
-                       arg_type = &mono_defaults.object_class->byval_arg;
-               else
-                       arg_type = sig->params [i - sig->hasthis];
-
-               if (ins->opcode != OP_REGVAR) {
-                       switch (ainfo->storage) {
-                       case ArgInIReg: {
-                               guint32 size = 8;
-
-                               /* FIXME: I1 etc */
-                               amd64_mov_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset, size);
-                               break;
-                       }
-                       case ArgInFloatSSEReg:
-                               amd64_movss_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset);
-                               break;
-                       case ArgInDoubleSSEReg:
-                               amd64_movsd_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset);
-                               break;
-                       case ArgValuetypeInReg:
-                               for (quad = 0; quad < 2; quad ++) {
-                                       switch (ainfo->pair_storage [quad]) {
-                                       case ArgInIReg:
-                                               amd64_mov_reg_membase (code, ainfo->pair_regs [quad], ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), sizeof (gpointer));
-                                               break;
-                                       case ArgInFloatSSEReg:
-                                       case ArgInDoubleSSEReg:
-                                               g_assert_not_reached ();
-                                               break;
-                                       case ArgNone:
-                                               break;
-                                       default:
-                                               g_assert_not_reached ();
-                                       }
-                               }
-                               break;
-                       default:
-                               break;
-                       }
-               }
-               else {
-                       g_assert (ainfo->storage == ArgInIReg);
-
-                       amd64_mov_reg_reg (code, ainfo->reg, ins->dreg, 8);
-               }
-       }
-
-       return code;
-}
-
-#define REAL_PRINT_REG(text,reg) \
-mono_assert (reg >= 0); \
-amd64_push_reg (code, AMD64_RAX); \
-amd64_push_reg (code, AMD64_RDX); \
-amd64_push_reg (code, AMD64_RCX); \
-amd64_push_reg (code, reg); \
-amd64_push_imm (code, reg); \
-amd64_push_imm (code, text " %d %p\n"); \
-amd64_mov_reg_imm (code, AMD64_RAX, printf); \
-amd64_call_reg (code, AMD64_RAX); \
-amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 3*4); \
-amd64_pop_reg (code, AMD64_RCX); \
-amd64_pop_reg (code, AMD64_RDX); \
-amd64_pop_reg (code, AMD64_RAX);
-
-/* benchmark and set based on cpu */
-#define LOOP_ALIGNMENT 8
-#define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
+#ifndef DISABLE_JIT
 
 void
 mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
@@ -2204,8 +3173,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        MonoCallInst *call;
        guint offset;
        guint8 *code = cfg->native_code + cfg->code_len;
+       MonoInst *last_ins = NULL;
        guint last_offset = 0;
-       int max_len, cpos;
+       int max_len;
+
+       /* Fix max_offset estimate for each successor bb */
+       if (cfg->opt & MONO_OPT_BRANCH) {
+               int current_offset = cfg->code_len;
+               MonoBasicBlock *current_bb;
+               for (current_bb = bb; current_bb != NULL; current_bb = current_bb->next_bb) {
+                       current_bb->max_offset = current_offset;
+                       current_offset += current_bb->max_length;
+               }
+       }
 
        if (cfg->opt & MONO_OPT_LOOP) {
                int pad, align = LOOP_ALIGNMENT;
@@ -2222,12 +3202,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        if (cfg->verbose_level > 2)
                g_print ("Basic block %d starting at offset 0x%x\n", bb->block_num, bb->native_offset);
 
-       cpos = bb->max_offset;
-
        if (cfg->prof_options & MONO_PROFILE_COVERAGE) {
                MonoProfileCoverageInfo *cov = cfg->coverage_info;
                g_assert (!cfg->compile_aot);
-               cpos += 6;
 
                cov->data [bb->dfn].cil_code = bb->cil_code;
                amd64_mov_reg_imm (code, AMD64_R11, (guint64)&cov->data [bb->dfn].count);
@@ -2239,6 +3216,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
        mono_debug_open_block (cfg, bb, offset);
 
+       if (mono_break_at_bb_method && mono_method_desc_full_match (mono_break_at_bb_method, cfg->method) && bb->block_num == mono_break_at_bb_bb_num)
+               x86_breakpoint (code);
+
        MONO_BB_FOR_EACH_INS (bb, ins) {
                offset = code - cfg->native_code;
 
@@ -2306,8 +3286,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_movsxd_reg_membase (code, ins->dreg, ins->dreg, 0);
                        break;
                case OP_LOADU4_MEM:
-                       amd64_mov_reg_imm (code, ins->dreg, ins->inst_p0);
-                       amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 4);
+                       // FIXME: Decompose this earlier
+                       if (amd64_is_imm32 (ins->inst_imm))
+                               amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
+                       else {
+                               amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
+                               amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 4);
+                       }
                        break;
                case OP_LOADU1_MEM:
                        amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
@@ -2329,13 +3314,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, 4);
                        break;
                case OP_LOADU1_MEMBASE:
-                       amd64_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, FALSE, FALSE);
+                       /* The cpu zero extends the result into 64 bits */
+                       amd64_widen_membase_size (code, ins->dreg, ins->inst_basereg, ins->inst_offset, FALSE, FALSE, 4);
                        break;
                case OP_LOADI1_MEMBASE:
                        amd64_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, TRUE, FALSE);
                        break;
                case OP_LOADU2_MEMBASE:
-                       amd64_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, FALSE, TRUE);
+                       /* The cpu zero extends the result into 64 bits */
+                       amd64_widen_membase_size (code, ins->dreg, ins->inst_basereg, ins->inst_offset, FALSE, TRUE, 4);
                        break;
                case OP_LOADI2_MEMBASE:
                        amd64_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, TRUE, TRUE);
@@ -2451,6 +3438,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_dec_reg_size (code, ins->dreg, 4);
                        break;
                case OP_X86_MUL_REG_MEMBASE:
+               case OP_X86_MUL_MEMBASE_REG:
                        amd64_imul_reg_membase_size (code, ins->sreg1, ins->sreg2, ins->inst_offset, 4);
                        break;
                case OP_AMD64_ICOMPARE_MEMBASE_REG:
@@ -2532,12 +3520,57 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_BREAK:
                        amd64_breakpoint (code);
                        break;
+               case OP_RELAXED_NOP:
+                       x86_prefix (code, X86_REP_PREFIX);
+                       x86_nop (code);
+                       break;
+               case OP_HARD_NOP:
+                       x86_nop (code);
+                       break;
                case OP_NOP:
                case OP_DUMMY_USE:
                case OP_DUMMY_STORE:
                case OP_NOT_REACHED:
                case OP_NOT_NULL:
                        break;
+               case OP_SEQ_POINT: {
+                       int i;
+
+                       if (cfg->compile_aot)
+                               NOT_IMPLEMENTED;
+
+                       /* 
+                        * Read from the single stepping trigger page. This will cause a
+                        * SIGSEGV when single stepping is enabled.
+                        * We do this _before_ the breakpoint, so that single stepping after
+                        * a breakpoint is hit steps to the next IL offset.
+                        */
+                       if (ins->flags & MONO_INST_SINGLE_STEP_LOC) {
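+                               /* A trigger page in the low 4GB can be read through an absolute 32 bit address; otherwise its address is kept in a local variable */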
+                               if (((guint64)ss_trigger_page >> 32) == 0)
+                                       amd64_mov_reg_mem (code, AMD64_R11, (guint64)ss_trigger_page, 4);
+                               else {
+                                       MonoInst *var = cfg->arch.ss_trigger_page_var;
+
+                                       amd64_mov_reg_membase (code, AMD64_R11, var->inst_basereg, var->inst_offset, 8);
+                                       amd64_alu_membase_imm_size (code, X86_CMP, AMD64_R11, 0, 0, 4);
+                               }
+                       }
+
+                       /* 
+                        * This is the address saved in seq points;
+                        * get_ip_for_single_step () / get_ip_for_breakpoint () need to compute it
+                        * from the address of the instruction causing the fault.
+                        */
+                       mono_add_seq_point (cfg, bb, ins, code - cfg->native_code);
+
+                       /* 
+                        * A placeholder for a possible breakpoint inserted by
+                        * mono_arch_set_breakpoint ().
+                        */
+                       for (i = 0; i < breakpoint_size; ++i)
+                               x86_nop (code);
+                       break;
+               }
                case OP_ADDCC:
                case OP_LADD:
                        amd64_alu_reg_reg (code, X86_ADD, ins->sreg1, ins->sreg2);
@@ -2689,6 +3722,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_div_reg_size (code, ins->sreg2, FALSE, 4);
                        }
                        break;
+               case OP_IREM_IMM: {
+                       int power = mono_is_power_of_two (ins->inst_imm);
+
+                       g_assert (ins->sreg1 == X86_EAX);
+                       g_assert (ins->dreg == X86_EAX);
+                       g_assert (power >= 0);
+
+                       if (power == 0) {
+                               amd64_mov_reg_imm (code, ins->dreg, 0);
+                               break;
+                       }
+
+                       /* Based on gcc code */
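+                       /* For divisor 2^power: r = ((n + bias) & (2^power - 1)) - bias, where bias is 2^power - 1 for negative n and 0 otherwise */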
+
+                       /* Add compensation for negative dividends */
+                       amd64_mov_reg_reg_size (code, AMD64_RDX, AMD64_RAX, 4);
+                       if (power > 1)
+                               amd64_shift_reg_imm_size (code, X86_SAR, AMD64_RDX, 31, 4);
+                       amd64_shift_reg_imm_size (code, X86_SHR, AMD64_RDX, 32 - power, 4);
+                       amd64_alu_reg_reg_size (code, X86_ADD, AMD64_RAX, AMD64_RDX, 4);
+                       /* Compute remainder */
+                       amd64_alu_reg_imm_size (code, X86_AND, AMD64_RAX, (1 << power) - 1, 4);
+                       /* Remove compensation */
+                       amd64_alu_reg_reg_size (code, X86_SUB, AMD64_RAX, AMD64_RDX, 4);
+                       break;
+               }
                case OP_LMUL_OVF:
                        amd64_imul_reg_reg (code, ins->sreg1, ins->sreg2);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
@@ -2893,6 +3952,31 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        break;
 
+               case OP_CMOV_IEQ:
+               case OP_CMOV_IGE:
+               case OP_CMOV_IGT:
+               case OP_CMOV_ILE:
+               case OP_CMOV_ILT:
+               case OP_CMOV_INE_UN:
+               case OP_CMOV_IGE_UN:
+               case OP_CMOV_IGT_UN:
+               case OP_CMOV_ILE_UN:
+               case OP_CMOV_ILT_UN:
+               case OP_CMOV_LEQ:
+               case OP_CMOV_LGE:
+               case OP_CMOV_LGT:
+               case OP_CMOV_LLE:
+               case OP_CMOV_LLT:
+               case OP_CMOV_LNE_UN:
+               case OP_CMOV_LGE_UN:
+               case OP_CMOV_LGT_UN:
+               case OP_CMOV_LLE_UN:
+               case OP_CMOV_LLT_UN:
+                       g_assert (ins->dreg == ins->sreg1);
+                       /* This needs to operate on 64 bit values */
+                       amd64_cmov_reg (code, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)], ins->dreg, ins->sreg2);
+                       break;
+
                case OP_LNOT:
                        amd64_not_reg (code, ins->sreg1);
                        break;
@@ -2911,33 +3995,23 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0);
                        amd64_mov_reg_membase (code, ins->dreg, AMD64_RIP, 0, 8);
                        break;
+               case OP_JUMP_TABLE:
+                       mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0);
+                       amd64_mov_reg_imm_size (code, ins->dreg, 0, 8);
+                       break;
                case OP_MOVE:
                        amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof (gpointer));
                        break;
                case OP_AMD64_SET_XMMREG_R4: {
-                       if (use_sse2) {
-                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1);
-                       }
-                       else {
-                               amd64_fst_membase (code, AMD64_RSP, -8, FALSE, TRUE);
-                               /* ins->dreg is set to -1 by the reg allocator */
-                               amd64_movss_reg_membase (code, ins->backend.reg3, AMD64_RSP, -8);
-                       }
+                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1);
                        break;
                }
                case OP_AMD64_SET_XMMREG_R8: {
-                       if (use_sse2) {
-                               if (ins->dreg != ins->sreg1)
-                                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
-                       }
-                       else {
-                               amd64_fst_membase (code, AMD64_RSP, -8, TRUE, TRUE);
-                               /* ins->dreg is set to -1 by the reg allocator */
-                               amd64_movsd_reg_membase (code, ins->backend.reg3, AMD64_RSP, -8);
-                       }
+                       if (ins->dreg != ins->sreg1)
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
                        break;
                }
-               case OP_JMP: {
+               case OP_TAILCALL: {
                        /*
                         * Note: this 'frame destruction' logic is useful for tail calls, too.
                         * Keep in sync with the code in emit_epilog.
@@ -2946,12 +4020,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        /* FIXME: no tracing support... */
                        if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
-                               code = mono_arch_instrument_epilog (cfg, mono_profiler_method_leave, code, FALSE);
+                               code = mono_arch_instrument_epilog_full (cfg, mono_profiler_method_leave, code, FALSE, FALSE);
 
                        g_assert (!cfg->method->save_lmf);
 
-                       code = emit_load_volatile_arguments (cfg, code);
-
                        if (cfg->arch.omit_fp) {
                                guint32 save_offset = 0;
                                /* Pop callee-saved registers */
@@ -2997,11 +4069,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, 8);
                        break;
                }
+               case OP_CALL:
                case OP_FCALL:
                case OP_LCALL:
                case OP_VCALL:
+               case OP_VCALL2:
                case OP_VOIDCALL:
-               case OP_CALL:
                        call = (MonoCallInst*)ins;
                        /*
                         * The AMD64 ABI forces callers to know about varargs.
@@ -3026,16 +4099,17 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
 
                        if (ins->flags & MONO_INST_HAS_METHOD)
-                               code = emit_call (cfg, code, MONO_PATCH_INFO_METHOD, call->method);
+                               code = emit_call (cfg, code, MONO_PATCH_INFO_METHOD, call->method, FALSE);
                        else
-                               code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, call->fptr);
-                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention))
+                               code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, call->fptr, FALSE);
+                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes)
                                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage);
                        code = emit_move_return_value (cfg, ins, code);
                        break;
                case OP_FCALL_REG:
                case OP_LCALL_REG:
                case OP_VCALL_REG:
+               case OP_VCALL2_REG:
                case OP_VOIDCALL_REG:
                case OP_CALL_REG:
                        call = (MonoCallInst*)ins;
@@ -3054,15 +4128,36 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                        ins->sreg1 = AMD64_R11;
                                }
                                amd64_alu_reg_reg (code, X86_XOR, AMD64_RAX, AMD64_RAX);
+                       } else if ((cfg->method->wrapper_type == MONO_WRAPPER_MANAGED_TO_NATIVE) && (cfg->method->klass->image != mono_defaults.corlib)) {
+                               /* 
+                                * Since the unmanaged calling convention doesn't contain a 
+                                * 'vararg' entry, we have to treat every pinvoke call as a
+                                * potential vararg call.
+                                */
+                               guint32 nregs, i;
+                               nregs = 0;
+                               for (i = 0; i < AMD64_XMM_NREG; ++i)
+                                       if (call->used_fregs & (1 << i))
+                                               nregs ++;
+                               if (ins->sreg1 == AMD64_RAX) {
+                                       amd64_mov_reg_reg (code, AMD64_R11, AMD64_RAX, 8);
+                                       ins->sreg1 = AMD64_R11;
+                               }
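+                               /* The SysV varargs convention expects an upper bound on the number of vector registers used in %al */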
+                               if (!nregs)
+                                       amd64_alu_reg_reg (code, X86_XOR, AMD64_RAX, AMD64_RAX);
+                               else
+                                       amd64_mov_reg_imm (code, AMD64_RAX, nregs);
                        }
+
                        amd64_call_reg (code, ins->sreg1);
-                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention))
+                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes)
                                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage);
                        code = emit_move_return_value (cfg, ins, code);
                        break;
                case OP_FCALL_MEMBASE:
                case OP_LCALL_MEMBASE:
                case OP_VCALL_MEMBASE:
+               case OP_VCALL2_MEMBASE:
                case OP_VOIDCALL_MEMBASE:
                case OP_CALL_MEMBASE:
                        call = (MonoCallInst*)ins;
@@ -3076,27 +4171,66 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                ins->sreg1 = AMD64_RAX;
                        }
 
+                       /* 
+                        * Emit a few nops to simplify get_vcall_slot ().
+                        */
+                       amd64_nop (code);
+                       amd64_nop (code);
+                       amd64_nop (code);
+
                        amd64_call_membase (code, ins->sreg1, ins->inst_offset);
-                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention))
+                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes)
                                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage);
                        code = emit_move_return_value (cfg, ins, code);
                        break;
+               case OP_DYN_CALL: {
+                       int i;
+                       MonoInst *var = cfg->dyn_call_var;
+
+                       g_assert (var->opcode == OP_REGOFFSET);
+
+                       /* r11 = args buffer filled by mono_arch_get_dyn_call_args () */
+                       amd64_mov_reg_reg (code, AMD64_R11, ins->sreg1, 8);
+                       /* r10 = ftn */
+                       amd64_mov_reg_reg (code, AMD64_R10, ins->sreg2, 8);
+
+                       /* Save args buffer */
+                       amd64_mov_membase_reg (code, var->inst_basereg, var->inst_offset, AMD64_R11, 8);
+
+                       /* Set argument registers */
+                       for (i = 0; i < PARAM_REGS; ++i)
+                               amd64_mov_reg_membase (code, param_regs [i], AMD64_R11, i * sizeof (gpointer), 8);
+                       
+                       /* Make the call */
+                       amd64_call_reg (code, AMD64_R10);
+
+                       /* Save result */
+                       amd64_mov_reg_membase (code, AMD64_R11, var->inst_basereg, var->inst_offset, 8);
+                       amd64_mov_membase_reg (code, AMD64_R11, G_STRUCT_OFFSET (DynCallArgs, res), AMD64_RAX, 8);
+                       break;
+               }
                case OP_AMD64_SAVE_SP_TO_LMF:
                        amd64_mov_membase_reg (code, cfg->frame_reg, cfg->arch.lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
                        break;
-               case OP_OUTARG:
                case OP_X86_PUSH:
+                       g_assert (!cfg->arch.no_pushes);
                        amd64_push_reg (code, ins->sreg1);
                        break;
                case OP_X86_PUSH_IMM:
+                       g_assert (!cfg->arch.no_pushes);
                        g_assert (amd64_is_imm32 (ins->inst_imm));
                        amd64_push_imm (code, ins->inst_imm);
                        break;
                case OP_X86_PUSH_MEMBASE:
+                       g_assert (!cfg->arch.no_pushes);
                        amd64_push_membase (code, ins->inst_basereg, ins->inst_offset);
                        break;
-               case OP_X86_PUSH_OBJ: 
-                       amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, ins->inst_imm);
+               case OP_X86_PUSH_OBJ: {
+                       int size = ALIGN_TO (ins->inst_imm, 8);
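+                       /* Aligning to 8 lets the rep movs below copy the whole object as size >> 3 quadwords */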
+
+                       g_assert (!cfg->arch.no_pushes);
+
+                       amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, size);
                        amd64_push_reg (code, AMD64_RDI);
                        amd64_push_reg (code, AMD64_RSI);
                        amd64_push_reg (code, AMD64_RCX);
@@ -3104,8 +4238,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_lea_membase (code, AMD64_RSI, ins->inst_basereg, ins->inst_offset);
                        else
                                amd64_mov_reg_reg (code, AMD64_RSI, ins->inst_basereg, 8);
-                       amd64_lea_membase (code, AMD64_RDI, AMD64_RSP, 3 * 8);
-                       amd64_mov_reg_imm (code, AMD64_RCX, (ins->inst_imm >> 3));
+                       amd64_lea_membase (code, AMD64_RDI, AMD64_RSP, (3 * 8));
+                       amd64_mov_reg_imm (code, AMD64_RCX, (size >> 3));
                        amd64_cld (code);
                        amd64_prefix (code, X86_REP_PREFIX);
                        amd64_movsd (code);
@@ -3113,6 +4247,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_pop_reg (code, AMD64_RSI);
                        amd64_pop_reg (code, AMD64_RDI);
                        break;
+               }
                case OP_X86_LEA:
                        amd64_lea_memindex (code, ins->dreg, ins->sreg1, ins->inst_imm, ins->sreg2, ins->backend.shift_amount);
                        break;
@@ -3126,36 +4261,50 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        /* keep alignment */
                        amd64_alu_reg_imm (code, X86_ADD, ins->sreg1, MONO_ARCH_FRAME_ALIGNMENT - 1);
                        amd64_alu_reg_imm (code, X86_AND, ins->sreg1, ~(MONO_ARCH_FRAME_ALIGNMENT - 1));
-                       code = mono_emit_stack_alloc (code, ins);
+                       code = mono_emit_stack_alloc (cfg, code, ins);
                        amd64_mov_reg_reg (code, ins->dreg, AMD64_RSP, 8);
+                       if (cfg->param_area && cfg->arch.no_pushes)
+                               amd64_alu_reg_imm (code, X86_ADD, ins->dreg, cfg->param_area);
                        break;
                case OP_LOCALLOC_IMM: {
                        guint32 size = ins->inst_imm;
                        size = (size + (MONO_ARCH_FRAME_ALIGNMENT - 1)) & ~ (MONO_ARCH_FRAME_ALIGNMENT - 1);
 
                        if (ins->flags & MONO_INST_INIT) {
-                               /* FIXME: Optimize this */
-                               amd64_mov_reg_imm (code, ins->dreg, size);
-                               ins->sreg1 = ins->dreg;
+                               if (size < 64) {
+                                       int i;
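+                                       /* Small blocks are zeroed inline instead of going through mono_emit_stack_alloc () */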
 
-                               code = mono_emit_stack_alloc (code, ins);
-                               amd64_mov_reg_reg (code, ins->dreg, AMD64_RSP, 8);
+                                       amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, size);
+                                       amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
+
+                                       for (i = 0; i < size; i += 8)
+                                               amd64_mov_membase_reg (code, AMD64_RSP, i, ins->dreg, 8);
+                                       amd64_mov_reg_reg (code, ins->dreg, AMD64_RSP, 8);                                      
+                               } else {
+                                       amd64_mov_reg_imm (code, ins->dreg, size);
+                                       ins->sreg1 = ins->dreg;
+
+                                       code = mono_emit_stack_alloc (cfg, code, ins);
+                                       amd64_mov_reg_reg (code, ins->dreg, AMD64_RSP, 8);
+                               }
                        } else {
                                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, size);
                                amd64_mov_reg_reg (code, ins->dreg, AMD64_RSP, 8);
                        }
+                       if (cfg->param_area && cfg->arch.no_pushes)
+                               amd64_alu_reg_imm (code, X86_ADD, ins->dreg, cfg->param_area);
                        break;
                }
                case OP_THROW: {
                        amd64_mov_reg_reg (code, AMD64_ARG_REG1, ins->sreg1, 8);
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
-                                            (gpointer)"mono_arch_throw_exception");
+                                            (gpointer)"mono_arch_throw_exception", FALSE);
                        break;
                }
                case OP_RETHROW: {
                        amd64_mov_reg_reg (code, AMD64_ARG_REG1, ins->sreg1, 8);
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
-                                            (gpointer)"mono_arch_rethrow_exception");
+                                            (gpointer)"mono_arch_rethrow_exception", FALSE);
                        break;
                }
                case OP_CALL_HANDLER: 
@@ -3169,6 +4318,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_START_HANDLER: {
                        MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
                        amd64_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, AMD64_RSP, 8);
+
+                       if (MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FINALLY) &&
+                               cfg->param_area && cfg->arch.no_pushes) {
+                               amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, ALIGN_TO (cfg->param_area, MONO_ARCH_FRAME_ALIGNMENT));
+                       }
                        break;
                }
                case OP_ENDFINALLY: {
@@ -3189,28 +4344,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        ins->inst_c0 = code - cfg->native_code;
                        break;
                case OP_BR:
-                       if (ins->flags & MONO_INST_BRLABEL) {
-                               if (ins->inst_i0->inst_c0) {
-                                       amd64_jump_code (code, cfg->native_code + ins->inst_i0->inst_c0);
-                               } else {
-                                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_LABEL, ins->inst_i0);
-                                       if ((cfg->opt & MONO_OPT_BRANCH) &&
-                                           x86_is_imm8 (ins->inst_i0->inst_c1 - cpos))
-                                               x86_jump8 (code, 0);
-                                       else 
-                                               x86_jump32 (code, 0);
-                               }
-                       } else {
+                       //g_print ("target: %p, next: %p, curr: %p, last: %p\n", ins->inst_target_bb, bb->next_bb, ins, bb->last_ins);
+                       //if ((ins->inst_target_bb == bb->next_bb) && ins == bb->last_ins)
+                       //break;
                                if (ins->inst_target_bb->native_offset) {
                                        amd64_jump_code (code, cfg->native_code + ins->inst_target_bb->native_offset); 
                                } else {
                                        mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb);
                                        if ((cfg->opt & MONO_OPT_BRANCH) &&
-                                           x86_is_imm8 (ins->inst_target_bb->max_offset - cpos))
+                                           x86_is_imm8 (ins->inst_target_bb->max_offset - offset))
                                                x86_jump8 (code, 0);
                                        else 
                                                x86_jump32 (code, 0);
-                               } 
                        }
                        break;
                case OP_BR_REG:
@@ -3275,112 +4420,50 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_R8CONST: {
                        double d = *(double *)ins->inst_p0;
 
-                       if (use_sse2) {
-                               if ((d == 0.0) && (mono_signbit (d) == 0)) {
-                                       amd64_sse_xorpd_reg_reg (code, ins->dreg, ins->dreg);
-                               }
-                               else {
-                                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, ins->inst_p0);
-                                       amd64_sse_movsd_reg_membase (code, ins->dreg, AMD64_RIP, 0);
-                               }
+                       if ((d == 0.0) && (mono_signbit (d) == 0)) {
+                               amd64_sse_xorpd_reg_reg (code, ins->dreg, ins->dreg);
                        }
-                       else if ((d == 0.0) && (mono_signbit (d) == 0)) {
-                               amd64_fldz (code);
-                       } else if (d == 1.0) {
-                               x86_fld1 (code);
-                       } else {
+                       else {
                                mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, ins->inst_p0);
-                               amd64_fld_membase (code, AMD64_RIP, 0, TRUE);
+                               amd64_sse_movsd_reg_membase (code, ins->dreg, AMD64_RIP, 0);
                        }
                        break;
                }
                case OP_R4CONST: {
                        float f = *(float *)ins->inst_p0;
 
-                       if (use_sse2) {
-                               if ((f == 0.0) && (mono_signbit (f) == 0)) {
-                                       amd64_sse_xorpd_reg_reg (code, ins->dreg, ins->dreg);
-                               }
-                               else {
-                                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4, ins->inst_p0);
-                                       amd64_sse_movss_reg_membase (code, ins->dreg, AMD64_RIP, 0);
-                                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
-                               }
+                       if ((f == 0.0) && (mono_signbit (f) == 0)) {
+                               amd64_sse_xorpd_reg_reg (code, ins->dreg, ins->dreg);
                        }
-                       else if ((f == 0.0) && (mono_signbit (f) == 0)) {
-                               amd64_fldz (code);
-                       } else if (f == 1.0) {
-                               x86_fld1 (code);
-                       } else {
+                       else {
                                mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4, ins->inst_p0);
-                               amd64_fld_membase (code, AMD64_RIP, 0, FALSE);
+                               amd64_sse_movss_reg_membase (code, ins->dreg, AMD64_RIP, 0);
+                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
                        }
                        break;
                }
                case OP_STORER8_MEMBASE_REG:
-                       if (use_sse2)
-                               amd64_sse_movsd_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1);
-                       else
-                               amd64_fst_membase (code, ins->inst_destbasereg, ins->inst_offset, TRUE, TRUE);
-                       break;
-               case OP_LOADR8_SPILL_MEMBASE:
-                       if (use_sse2)
-                               g_assert_not_reached ();
-                       amd64_fld_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
-                       amd64_fxch (code, 1);
+                       amd64_sse_movsd_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1);
                        break;
                case OP_LOADR8_MEMBASE:
-                       if (use_sse2)
-                               amd64_sse_movsd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
-                       else
-                               amd64_fld_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
+                       amd64_sse_movsd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
                        break;
                case OP_STORER4_MEMBASE_REG:
-                       if (use_sse2) {
-                               /* This requires a double->single conversion */
-                               amd64_sse_cvtsd2ss_reg_reg (code, AMD64_XMM15, ins->sreg1);
-                               amd64_sse_movss_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, AMD64_XMM15);
-                       }
-                       else
-                               amd64_fst_membase (code, ins->inst_destbasereg, ins->inst_offset, FALSE, TRUE);
+                       /* This requires a double->single conversion */
+                       amd64_sse_cvtsd2ss_reg_reg (code, AMD64_XMM15, ins->sreg1);
+                       amd64_sse_movss_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, AMD64_XMM15);
                        break;
                case OP_LOADR4_MEMBASE:
-                       if (use_sse2) {
-                               amd64_sse_movss_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
-                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
-                       }
-                       else
-                               amd64_fld_membase (code, ins->inst_basereg, ins->inst_offset, FALSE);
+                       amd64_sse_movss_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
                        break;
                case OP_ICONV_TO_R4: /* FIXME: change precision */
                case OP_ICONV_TO_R8:
-                       if (use_sse2)
-                               amd64_sse_cvtsi2sd_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
-                       else {
-                               amd64_push_reg (code, ins->sreg1);
-                               amd64_fild_membase (code, AMD64_RSP, 0, FALSE);
-                               amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
-                       }
+                       amd64_sse_cvtsi2sd_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
                        break;
                case OP_LCONV_TO_R4: /* FIXME: change precision */
                case OP_LCONV_TO_R8:
-                       if (use_sse2)
-                               amd64_sse_cvtsi2sd_reg_reg (code, ins->dreg, ins->sreg1);
-                       else {
-                               amd64_push_reg (code, ins->sreg1);
-                               amd64_fild_membase (code, AMD64_RSP, 0, TRUE);
-                               amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
-                       }
-                       break;
-               case OP_X86_FP_LOAD_I8:
-                       if (use_sse2)
-                               g_assert_not_reached ();
-                       amd64_fild_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
-                       break;
-               case OP_X86_FP_LOAD_I4:
-                       if (use_sse2)
-                               g_assert_not_reached ();
-                       amd64_fild_membase (code, ins->inst_basereg, ins->inst_offset, FALSE);
+                       amd64_sse_cvtsi2sd_reg_reg (code, ins->dreg, ins->sreg1);
                        break;
                case OP_FCONV_TO_R4:
                        /* FIXME: nothing to do ?? */
@@ -3408,61 +4491,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 8, TRUE);
                        break;
                case OP_LCONV_TO_R_UN: { 
-                       static guint8 mn[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x40 };
                        guint8 *br [2];
 
-                       if (use_sse2) {
-                               /* Based on gcc code */
-                               amd64_test_reg_reg (code, ins->sreg1, ins->sreg1);
-                               br [0] = code; x86_branch8 (code, X86_CC_S, 0, TRUE);
-
-                               /* Positive case */
-                               amd64_sse_cvtsi2sd_reg_reg (code, ins->dreg, ins->sreg1);
-                               br [1] = code; x86_jump8 (code, 0);
-                               amd64_patch (br [0], code);
-
-                               /* Negative case */
-                               /* Save to the red zone */
-                               amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RAX, 8);
-                               amd64_mov_membase_reg (code, AMD64_RSP, -16, AMD64_RCX, 8);
-                               amd64_mov_reg_reg (code, AMD64_RCX, ins->sreg1, 8);
-                               amd64_mov_reg_reg (code, AMD64_RAX, ins->sreg1, 8);
-                               amd64_alu_reg_imm (code, X86_AND, AMD64_RCX, 1);
-                               amd64_shift_reg_imm (code, X86_SHR, AMD64_RAX, 1);
-                               amd64_alu_reg_imm (code, X86_OR, AMD64_RAX, AMD64_RCX);
-                               amd64_sse_cvtsi2sd_reg_reg (code, ins->dreg, AMD64_RAX);
-                               amd64_sse_addsd_reg_reg (code, ins->dreg, ins->dreg);
-                               /* Restore */
-                               amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RSP, -16, 8);
-                               amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, -8, 8);
-                               amd64_patch (br [1], code);
-
-                               break;
-                       }
-
-                       /* load 64bit integer to FP stack */
-                       amd64_push_imm (code, 0);
-                       amd64_push_reg (code, ins->sreg2);
-                       amd64_push_reg (code, ins->sreg1);
-                       amd64_fild_membase (code, AMD64_RSP, 0, TRUE);
-                       /* store as 80bit FP value */
-                       x86_fst80_membase (code, AMD64_RSP, 0);
-                       
-                       /* test if lreg is negative */
-                       amd64_test_reg_reg (code, ins->sreg2, ins->sreg2);
-                       br [0] = code; x86_branch8 (code, X86_CC_GEZ, 0, TRUE);
-       
-                       /* add correction constant mn */
-                       x86_fld80_mem (code, (gssize)mn);
-                       x86_fld80_membase (code, AMD64_RSP, 0);
-                       amd64_fp_op_reg (code, X86_FADD, 1, TRUE);
-                       x86_fst80_membase (code, AMD64_RSP, 0);
+                       /* Based on gcc code */
+                       amd64_test_reg_reg (code, ins->sreg1, ins->sreg1);
+                       br [0] = code; x86_branch8 (code, X86_CC_S, 0, TRUE);
 
+                       /* Positive case */
+                       amd64_sse_cvtsi2sd_reg_reg (code, ins->dreg, ins->sreg1);
+                       br [1] = code; x86_jump8 (code, 0);
                        amd64_patch (br [0], code);
 
-                       x86_fld80_membase (code, AMD64_RSP, 0);
-                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 12);
-
+                       /* Negative case */
+                       /* Save to the red zone */
+                       amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RAX, 8);
+                       amd64_mov_membase_reg (code, AMD64_RSP, -16, AMD64_RCX, 8);
+                       amd64_mov_reg_reg (code, AMD64_RCX, ins->sreg1, 8);
+                       amd64_mov_reg_reg (code, AMD64_RAX, ins->sreg1, 8);
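+                       /* Halve the value keeping the low bit sticky to preserve rounding, convert, then double the result */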
+                       amd64_alu_reg_imm (code, X86_AND, AMD64_RCX, 1);
+                       amd64_shift_reg_imm (code, X86_SHR, AMD64_RAX, 1);
+                       amd64_alu_reg_imm (code, X86_OR, AMD64_RAX, AMD64_RCX);
+                       amd64_sse_cvtsi2sd_reg_reg (code, ins->dreg, AMD64_RAX);
+                       amd64_sse_addsd_reg_reg (code, ins->dreg, ins->dreg);
+                       /* Restore */
+                       amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RSP, -16, 8);
+                       amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, -8, 8);
+                       amd64_patch (br [1], code);
                        break;
                }
                case OP_LCONV_TO_OVF_U4:
@@ -3476,123 +4530,47 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, 8);
                        break;
                case OP_FMOVE:
-                       if (use_sse2 && (ins->dreg != ins->sreg1))
+                       if (ins->dreg != ins->sreg1)
                                amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
                        break;
                case OP_FADD:
-                       if (use_sse2)
-                               amd64_sse_addsd_reg_reg (code, ins->dreg, ins->sreg2);
-                       else
-                               amd64_fp_op_reg (code, X86_FADD, 1, TRUE);
+                       amd64_sse_addsd_reg_reg (code, ins->dreg, ins->sreg2);
                        break;
                case OP_FSUB:
-                       if (use_sse2)
-                               amd64_sse_subsd_reg_reg (code, ins->dreg, ins->sreg2);
-                       else
-                               amd64_fp_op_reg (code, X86_FSUB, 1, TRUE);
+                       amd64_sse_subsd_reg_reg (code, ins->dreg, ins->sreg2);
                        break;          
                case OP_FMUL:
-                       if (use_sse2)
-                               amd64_sse_mulsd_reg_reg (code, ins->dreg, ins->sreg2);
-                       else
-                               amd64_fp_op_reg (code, X86_FMUL, 1, TRUE);
+                       amd64_sse_mulsd_reg_reg (code, ins->dreg, ins->sreg2);
                        break;          
                case OP_FDIV:
-                       if (use_sse2)
-                               amd64_sse_divsd_reg_reg (code, ins->dreg, ins->sreg2);
-                       else
-                               amd64_fp_op_reg (code, X86_FDIV, 1, TRUE);
+                       amd64_sse_divsd_reg_reg (code, ins->dreg, ins->sreg2);
                        break;          
-               case OP_FNEG:
-                       if (use_sse2) {
-                               static double r8_0 = -0.0;
+               case OP_FNEG: {
+                       static double r8_0 = -0.0;
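+                       /* XORing with -0.0 flips just the sign bit of the double */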
 
-                               g_assert (ins->sreg1 == ins->dreg);
+                       g_assert (ins->sreg1 == ins->dreg);
                                        
-                               mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, &r8_0);
-                               amd64_sse_xorpd_reg_membase (code, ins->dreg, AMD64_RIP, 0);
-                       }
-                       else
-                               amd64_fchs (code);
-                       break;          
+                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, &r8_0);
+                       amd64_sse_xorpd_reg_membase (code, ins->dreg, AMD64_RIP, 0);
+                       break;
+               }
                case OP_SIN:
-                       if (use_sse2) {
-                               EMIT_SSE2_FPFUNC (code, fsin, ins->dreg, ins->sreg1);
-                       }
-                       else {
-                               amd64_fsin (code);
-                               amd64_fldz (code);
-                               amd64_fp_op_reg (code, X86_FADD, 1, TRUE);
-                       }
+                       EMIT_SSE2_FPFUNC (code, fsin, ins->dreg, ins->sreg1);
                        break;          
                case OP_COS:
-                       if (use_sse2) {
-                               EMIT_SSE2_FPFUNC (code, fcos, ins->dreg, ins->sreg1);
-                       }
-                       else {
-                               amd64_fcos (code);
-                               amd64_fldz (code);
-                               amd64_fp_op_reg (code, X86_FADD, 1, TRUE);
-                       }
+                       EMIT_SSE2_FPFUNC (code, fcos, ins->dreg, ins->sreg1);
                        break;          
-               case OP_ABS:
-                       if (use_sse2) {
-                               EMIT_SSE2_FPFUNC (code, fabs, ins->dreg, ins->sreg1);
-                       }
-                       else
-                               amd64_fabs (code);
+               case OP_ABS: {
+                       static guint64 d = 0x7fffffffffffffffUL;
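+                       /* ANDing with this mask clears the IEEE 754 sign bit */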
+
+                       g_assert (ins->sreg1 == ins->dreg);
+                                       
+                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, &d);
+                       amd64_sse_andpd_reg_membase (code, ins->dreg, AMD64_RIP, 0);
                        break;          
-               case OP_TAN: {
-                       /* 
-                        * it really doesn't make sense to inline all this code,
-                        * it's here just to show that things may not be as simple 
-                        * as they appear.
-                        */
-                       guchar *check_pos, *end_tan, *pop_jump;
-                       if (use_sse2)
-                               g_assert_not_reached ();
-                       amd64_push_reg (code, AMD64_RAX);
-                       amd64_fptan (code);
-                       amd64_fnstsw (code);
-                       amd64_test_reg_imm (code, AMD64_RAX, X86_FP_C2);
-                       check_pos = code;
-                       x86_branch8 (code, X86_CC_NE, 0, FALSE);
-                       amd64_fstp (code, 0); /* pop the 1.0 */
-                       end_tan = code;
-                       x86_jump8 (code, 0);
-                       amd64_fldpi (code);
-                       amd64_fp_op (code, X86_FADD, 0);
-                       amd64_fxch (code, 1);
-                       x86_fprem1 (code);
-                       amd64_fstsw (code);
-                       amd64_test_reg_imm (code, AMD64_RAX, X86_FP_C2);
-                       pop_jump = code;
-                       x86_branch8 (code, X86_CC_NE, 0, FALSE);
-                       amd64_fstp (code, 1);
-                       amd64_fptan (code);
-                       amd64_patch (pop_jump, code);
-                       amd64_fstp (code, 0); /* pop the 1.0 */
-                       amd64_patch (check_pos, code);
-                       amd64_patch (end_tan, code);
-                       amd64_fldz (code);
-                       amd64_fp_op_reg (code, X86_FADD, 1, TRUE);
-                       amd64_pop_reg (code, AMD64_RAX);
-                       break;
                }
-               case OP_ATAN:
-                       if (use_sse2)
-                               g_assert_not_reached ();
-                       x86_fld1 (code);
-                       amd64_fpatan (code);
-                       amd64_fldz (code);
-                       amd64_fp_op_reg (code, X86_FADD, 1, TRUE);
-                       break;          
                case OP_SQRT:
-                       if (use_sse2) {
-                               EMIT_SSE2_FPFUNC (code, fsqrt, ins->dreg, ins->sreg1);
-                       }
-                       else
-                               amd64_fsqrt (code);
+                       EMIT_SSE2_FPFUNC (code, fsqrt, ins->dreg, ins->sreg1);
                        break;
                case OP_IMIN:
                        g_assert (cfg->opt & MONO_OPT_CMOV);
@@ -3600,201 +4578,109 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_reg_reg_size (code, X86_CMP, ins->sreg1, ins->sreg2, 4);
                        amd64_cmov_reg_size (code, X86_CC_GT, TRUE, ins->dreg, ins->sreg2, 4);
                        break;
+               case OP_IMIN_UN:
+                       g_assert (cfg->opt & MONO_OPT_CMOV);
+                       g_assert (ins->dreg == ins->sreg1);
+                       amd64_alu_reg_reg_size (code, X86_CMP, ins->sreg1, ins->sreg2, 4);
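+                       /* is_signed=FALSE selects the unsigned "above" form of the cmov (cmova);
+                        * the same applies to the other _UN min/max cases below */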
+                       amd64_cmov_reg_size (code, X86_CC_GT, FALSE, ins->dreg, ins->sreg2, 4);
+                       break;
                case OP_IMAX:
                        g_assert (cfg->opt & MONO_OPT_CMOV);
                        g_assert (ins->dreg == ins->sreg1);
                        amd64_alu_reg_reg_size (code, X86_CMP, ins->sreg1, ins->sreg2, 4);
                        amd64_cmov_reg_size (code, X86_CC_LT, TRUE, ins->dreg, ins->sreg2, 4);
                        break;
+               case OP_IMAX_UN:
+                       g_assert (cfg->opt & MONO_OPT_CMOV);
+                       g_assert (ins->dreg == ins->sreg1);
+                       amd64_alu_reg_reg_size (code, X86_CMP, ins->sreg1, ins->sreg2, 4);
+                       amd64_cmov_reg_size (code, X86_CC_LT, FALSE, ins->dreg, ins->sreg2, 4);
+                       break;
                case OP_LMIN:
                        g_assert (cfg->opt & MONO_OPT_CMOV);
                        g_assert (ins->dreg == ins->sreg1);
                        amd64_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
                        amd64_cmov_reg (code, X86_CC_GT, TRUE, ins->dreg, ins->sreg2);
                        break;
+               case OP_LMIN_UN:
+                       g_assert (cfg->opt & MONO_OPT_CMOV);
+                       g_assert (ins->dreg == ins->sreg1);
+                       amd64_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
+                       amd64_cmov_reg (code, X86_CC_GT, FALSE, ins->dreg, ins->sreg2);
+                       break;
                case OP_LMAX:
                        g_assert (cfg->opt & MONO_OPT_CMOV);
                        g_assert (ins->dreg == ins->sreg1);
                        amd64_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
                        amd64_cmov_reg (code, X86_CC_LT, TRUE, ins->dreg, ins->sreg2);
-                       break;  
+                       break;
+               case OP_LMAX_UN:
+                       g_assert (cfg->opt & MONO_OPT_CMOV);
+                       g_assert (ins->dreg == ins->sreg1);
+                       amd64_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
+                       amd64_cmov_reg (code, X86_CC_LT, FALSE, ins->dreg, ins->sreg2);
+                       break;
                case OP_X86_FPOP:
-                       if (!use_sse2)
-                               amd64_fstp (code, 0);
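+                       /* nothing left to do: FP values live in SSE registers now, so there is no x87 stack to pop */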
                        break;          
-               case OP_FREM: {
-                       guint8 *l1, *l2;
-
-                       if (use_sse2)
-                               g_assert_not_reached ();
-                       amd64_push_reg (code, AMD64_RAX);
-                       /* we need to exchange ST(0) with ST(1) */
-                       amd64_fxch (code, 1);
-
-                       /* this requires a loop, because fprem somtimes 
-                        * returns a partial remainder */
-                       l1 = code;
-                       /* looks like MS is using fprem instead of the IEEE compatible fprem1 */
-                       /* x86_fprem1 (code); */
-                       amd64_fprem (code);
-                       amd64_fnstsw (code);
-                       amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, X86_FP_C2);
-                       l2 = code + 2;
-                       x86_branch8 (code, X86_CC_NE, l1 - l2, FALSE);
-
-                       /* pop result */
-                       amd64_fstp (code, 1);
-
-                       amd64_pop_reg (code, AMD64_RAX);
-                       break;
-               }
                case OP_FCOMPARE:
-                       if (use_sse2) {
-                               /* 
-                                * The two arguments are swapped because the fbranch instructions
-                                * depend on this for the non-sse case to work.
-                                */
-                               amd64_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1);
-                               break;
-                       }
-                       if (cfg->opt & MONO_OPT_FCMOV) {
-                               amd64_fcomip (code, 1);
-                               amd64_fstp (code, 0);
-                               break;
-                       }
-                       /* this overwrites EAX */
-                       EMIT_FPCOMPARE(code);
-                       amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, X86_FP_CC_MASK);
-                       break;
-               case OP_FCEQ:
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               /* zeroing the register at the start results in 
-                                * shorter and faster code (we can also remove the widening op)
-                                */
-                               guchar *unordered_check;
-                               amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
-                               
-                               if (use_sse2)
-                                       amd64_sse_comisd_reg_reg (code, ins->sreg1, ins->sreg2);
-                               else {
-                                       amd64_fcomip (code, 1);
-                                       amd64_fstp (code, 0);
-                               }
-                               unordered_check = code;
-                               x86_branch8 (code, X86_CC_P, 0, FALSE);
-                               amd64_set_reg (code, X86_CC_EQ, ins->dreg, FALSE);
-                               amd64_patch (unordered_check, code);
-                               break;
-                       }
-                       if (ins->dreg != AMD64_RAX) 
-                               amd64_push_reg (code, AMD64_RAX);
-
-                       EMIT_FPCOMPARE(code);
-                       amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, X86_FP_CC_MASK);
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0x4000);
-                       amd64_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-
-                       if (ins->dreg != AMD64_RAX) 
-                               amd64_pop_reg (code, AMD64_RAX);
+                       /* 
+                        * The two arguments are swapped: the fbranch opcodes below were
+                        * written for the x87 ordering and still expect the swapped
+                        * condition codes.
+                        */
+                       amd64_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1);
+                       break;
+               case OP_FCEQ: {
+                       /* zeroing the register at the start results in 
+                        * shorter and faster code (we can also remove the widening op)
+                        */
+                       guchar *unordered_check;
+                       amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
+                       amd64_sse_comisd_reg_reg (code, ins->sreg1, ins->sreg2);
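+                       /* comisd sets PF on an unordered result (NaN operand); the branch
+                        * below then skips the sete so dreg stays 0 */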
+                       unordered_check = code;
+                       x86_branch8 (code, X86_CC_P, 0, FALSE);
+                       amd64_set_reg (code, X86_CC_EQ, ins->dreg, FALSE);
+                       amd64_patch (unordered_check, code);
                        break;
+               }
                case OP_FCLT:
                case OP_FCLT_UN:
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               /* zeroing the register at the start results in 
-                                * shorter and faster code (we can also remove the widening op)
-                                */
-                               amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
-                               if (use_sse2)
-                                       amd64_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1);
-                               else {
-                                       amd64_fcomip (code, 1);
-                                       amd64_fstp (code, 0);
-                               }
-                               if (ins->opcode == OP_FCLT_UN) {
-                                       guchar *unordered_check = code;
-                                       guchar *jump_to_end;
-                                       x86_branch8 (code, X86_CC_P, 0, FALSE);
-                                       amd64_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
-                                       jump_to_end = code;
-                                       x86_jump8 (code, 0);
-                                       amd64_patch (unordered_check, code);
-                                       amd64_inc_reg (code, ins->dreg);
-                                       amd64_patch (jump_to_end, code);
-                               } else {
-                                       amd64_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
-                               }
-                               break;
-                       }
-                       if (ins->dreg != AMD64_RAX) 
-                               amd64_push_reg (code, AMD64_RAX);
-
-                       EMIT_FPCOMPARE(code);
-                       amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, X86_FP_CC_MASK);
+                       /* zeroing the register at the start results in 
+                        * shorter and faster code (we can also remove the widening op)
+                        */
+                       amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
+                       amd64_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1);
                        if (ins->opcode == OP_FCLT_UN) {
-                               guchar *is_not_zero_check, *end_jump;
-                               is_not_zero_check = code;
-                               x86_branch8 (code, X86_CC_NZ, 0, TRUE);
-                               end_jump = code;
+                               guchar *unordered_check = code;
+                               guchar *jump_to_end;
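+                               /* PF set means unordered (NaN operand); the _UN compare must
+                                * yield TRUE in that case, handled by the inc below */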
+                               x86_branch8 (code, X86_CC_P, 0, FALSE);
+                               amd64_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
+                               jump_to_end = code;
                                x86_jump8 (code, 0);
-                               amd64_patch (is_not_zero_check, code);
-                               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_CC_MASK);
-
-                               amd64_patch (end_jump, code);
+                               amd64_patch (unordered_check, code);
+                               amd64_inc_reg (code, ins->dreg);
+                               amd64_patch (jump_to_end, code);
+                       } else {
+                               amd64_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
                        }
-                       amd64_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-
-                       if (ins->dreg != AMD64_RAX) 
-                               amd64_pop_reg (code, AMD64_RAX);
                        break;
                case OP_FCGT:
-               case OP_FCGT_UN:
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               /* zeroing the register at the start results in 
-                                * shorter and faster code (we can also remove the widening op)
-                                */
-                               guchar *unordered_check;
-                               amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
-                               if (use_sse2)
-                                       amd64_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1);
-                               else {
-                                       amd64_fcomip (code, 1);
-                                       amd64_fstp (code, 0);
-                               }
-                               if (ins->opcode == OP_FCGT) {
-                                       unordered_check = code;
-                                       x86_branch8 (code, X86_CC_P, 0, FALSE);
-                                       amd64_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
-                                       amd64_patch (unordered_check, code);
-                               } else {
-                                       amd64_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
-                               }
-                               break;
-                       }
-                       if (ins->dreg != AMD64_RAX) 
-                               amd64_push_reg (code, AMD64_RAX);
-
-                       EMIT_FPCOMPARE(code);
-                       amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, X86_FP_CC_MASK);
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
-                       if (ins->opcode == OP_FCGT_UN) {
-                               guchar *is_not_zero_check, *end_jump;
-                               is_not_zero_check = code;
-                               x86_branch8 (code, X86_CC_NZ, 0, TRUE);
-                               end_jump = code;
-                               x86_jump8 (code, 0);
-                               amd64_patch (is_not_zero_check, code);
-                               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_CC_MASK);
-
-                               amd64_patch (end_jump, code);
+               case OP_FCGT_UN: {
+                       /* zeroing the register at the start results in 
+                        * shorter and faster code (we can also remove the widening op)
+                        */
+                       guchar *unordered_check;
+                       amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
+                       amd64_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1);
+                       if (ins->opcode == OP_FCGT) {
+                               unordered_check = code;
+                               x86_branch8 (code, X86_CC_P, 0, FALSE);
+                               amd64_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
+                               amd64_patch (unordered_check, code);
+                       } else {
+                               amd64_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
                        }
-                       amd64_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
-                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
-
-                       if (ins->dreg != AMD64_RAX) 
-                               amd64_pop_reg (code, AMD64_RAX);
                        break;
+               }
                case OP_FCLT_MEMBASE:
                case OP_FCGT_MEMBASE:
                case OP_FCLT_UN_MEMBASE:
@@ -3802,7 +4688,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_FCEQ_MEMBASE: {
                        guchar *unordered_check, *jump_to_end;
                        int x86_cond;
-                       g_assert (use_sse2);
 
                        amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
                        amd64_sse_comisd_reg_membase (code, ins->sreg1, ins->sreg2, ins->inst_offset);
@@ -3846,162 +4731,94 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                }
-               case OP_FBEQ:
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               guchar *jump = code;
-                               x86_branch8 (code, X86_CC_P, 0, TRUE);
-                               EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
-                               amd64_patch (jump, code);
-                               break;
-                       }
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0x4000);
-                       EMIT_COND_BRANCH (ins, X86_CC_EQ, TRUE);
+               case OP_FBEQ: {
+                       guchar *jump = code;
+                       x86_branch8 (code, X86_CC_P, 0, TRUE);
+                       EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
+                       amd64_patch (jump, code);
                        break;
+               }
                case OP_FBNE_UN:
                        /* Branch if C013 != 100 */
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               /* branch if !ZF or (PF|CF) */
-                               EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
-                               EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
-                               EMIT_COND_BRANCH (ins, X86_CC_B, FALSE);
-                               break;
-                       }
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C3);
+                       /* branch if !ZF or (PF|CF) */
                        EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
+                       EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
+                       EMIT_COND_BRANCH (ins, X86_CC_B, FALSE);
                        break;
                case OP_FBLT:
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE);
-                               break;
-                       }
-                       EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
+                       EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE);
                        break;
                case OP_FBLT_UN:
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
-                               EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE);
-                               break;
-                       }
-                       if (ins->opcode == OP_FBLT_UN) {
-                               guchar *is_not_zero_check, *end_jump;
-                               is_not_zero_check = code;
-                               x86_branch8 (code, X86_CC_NZ, 0, TRUE);
-                               end_jump = code;
-                               x86_jump8 (code, 0);
-                               amd64_patch (is_not_zero_check, code);
-                               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_CC_MASK);
-
-                               amd64_patch (end_jump, code);
-                       }
-                       EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
+                       EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
+                       EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE);
                        break;
                case OP_FBGT:
                case OP_FBGT_UN:
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               if (ins->opcode == OP_FBGT) {
-                                       guchar *br1;
-
-                                       /* skip branch if C1=1 */
-                                       br1 = code;
-                                       x86_branch8 (code, X86_CC_P, 0, FALSE);
-                                       /* branch if (C0 | C3) = 1 */
-                                       EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
-                                       amd64_patch (br1, code);
-                                       break;
-                               } else {
-                                       EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
-                               }
-                               break;
-                       }
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
-                       if (ins->opcode == OP_FBGT_UN) {
-                               guchar *is_not_zero_check, *end_jump;
-                               is_not_zero_check = code;
-                               x86_branch8 (code, X86_CC_NZ, 0, TRUE);
-                               end_jump = code;
-                               x86_jump8 (code, 0);
-                               amd64_patch (is_not_zero_check, code);
-                               amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_CC_MASK);
-
-                               amd64_patch (end_jump, code);
-                       }
-                       EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
-                       break;
-               case OP_FBGE:
-                       /* Branch if C013 == 100 or 001 */
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
+                       if (ins->opcode == OP_FBGT) {
                                guchar *br1;
 
                                /* skip branch if C1=1 */
                                br1 = code;
                                x86_branch8 (code, X86_CC_P, 0, FALSE);
                                /* branch if (C0 | C3) = 1 */
-                               EMIT_COND_BRANCH (ins, X86_CC_BE, FALSE);
+                               EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
                                amd64_patch (br1, code);
                                break;
+                       } else {
+                               EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
                        }
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
-                       EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C3);
-                       EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
                        break;
+               case OP_FBGE: {
+                       /* Branch if C013 == 100 or 001 */
+                       guchar *br1;
+
+                       /* skip branch if C1=1 */
+                       br1 = code;
+                       x86_branch8 (code, X86_CC_P, 0, FALSE);
+                       /* branch if (C0 | C3) = 1 */
+                       EMIT_COND_BRANCH (ins, X86_CC_BE, FALSE);
+                       amd64_patch (br1, code);
+                       break;
+               }
                case OP_FBGE_UN:
                        /* Branch if C013 == 000 */
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               EMIT_COND_BRANCH (ins, X86_CC_LE, FALSE);
-                               break;
-                       }
-                       EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
+                       EMIT_COND_BRANCH (ins, X86_CC_LE, FALSE);
                        break;
-               case OP_FBLE:
+               case OP_FBLE: {
                        /* Branch if C013=000 or 100 */
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               guchar *br1;
+                       guchar *br1;
 
-                               /* skip branch if C1=1 */
-                               br1 = code;
-                               x86_branch8 (code, X86_CC_P, 0, FALSE);
-                               /* branch if C0=0 */
-                               EMIT_COND_BRANCH (ins, X86_CC_NB, FALSE);
-                               amd64_patch (br1, code);
-                               break;
-                       }
-                       amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, (X86_FP_C0|X86_FP_C1));
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, 0);
-                       EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
+                       /* skip branch if C1=1 */
+                       br1 = code;
+                       x86_branch8 (code, X86_CC_P, 0, FALSE);
+                       /* branch if C0=0 */
+                       EMIT_COND_BRANCH (ins, X86_CC_NB, FALSE);
+                       amd64_patch (br1, code);
                        break;
+               }
                case OP_FBLE_UN:
                        /* Branch if C013 != 001 */
-                       if (use_sse2 || (cfg->opt & MONO_OPT_FCMOV)) {
-                               EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
-                               EMIT_COND_BRANCH (ins, X86_CC_GE, FALSE);
-                               break;
-                       }
-                       amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
-                       EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
+                       EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
+                       EMIT_COND_BRANCH (ins, X86_CC_GE, FALSE);
                        break;
                case OP_CKFINITE:
-                       if (use_sse2) {
-                               /* Transfer value to the fp stack */
-                               amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 16);
-                               amd64_movsd_membase_reg (code, AMD64_RSP, 0, ins->sreg1);
-                               amd64_fld_membase (code, AMD64_RSP, 0, TRUE);
-                       }
+                       /* Transfer value to the fp stack */
+                       amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 16);
+                       amd64_movsd_membase_reg (code, AMD64_RSP, 0, ins->sreg1);
+                       amd64_fld_membase (code, AMD64_RSP, 0, TRUE);
+
                        amd64_push_reg (code, AMD64_RAX);
                        amd64_fxam (code);
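+                       /* fxam sets C0 for NaN and infinity (with C3 clear); masking C3|C0 (0x4100)
+                        * and comparing against C0 alone thus detects the non-finite cases */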
                        amd64_fnstsw (code);
                        amd64_alu_reg_imm (code, X86_AND, AMD64_RAX, 0x4100);
                        amd64_alu_reg_imm (code, X86_CMP, AMD64_RAX, X86_FP_C0);
                        amd64_pop_reg (code, AMD64_RAX);
-                       if (use_sse2) {
-                               amd64_fstp (code, 0);
-                       }                               
+                       amd64_fstp (code, 0);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_EQ, FALSE, "ArithmeticException");
-                       if (use_sse2)
-                               amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 16);
+                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 16);
                        break;
                case OP_TLS_GET: {
-                       code = emit_tls_get (code, ins->dreg, ins->inst_offset);
+                       code = mono_amd64_emit_tls_get (code, ins->dreg, ins->inst_offset);
                        break;
                }
                case OP_MEMORY_BARRIER: {
@@ -4051,10 +4868,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        guchar *br[2];
                        int sreg2 = ins->sreg2;
                        int breg = ins->inst_basereg;
-                       guint32 size = (ins->opcode == OP_ATOMIC_EXCHANGE_I4) ? 4 : 8;
+                       guint32 size;
+                       gboolean need_push = FALSE, rdx_pushed = FALSE;
+
+                       if (ins->opcode == OP_ATOMIC_EXCHANGE_I8)
+                               size = 8;
+                       else
+                               size = 4;
 
                        /* 
-                        * See http://msdn.microsoft.com/msdnmag/issues/0700/Win32/ for
+                        * See http://msdn.microsoft.com/en-us/magazine/cc302329.aspx for
                         * an explanation of how this works.
                         */
 
@@ -4062,37 +4885,651 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                         * hack to overcome limits in x86 reg allocator 
                         * (req: dreg == eax and sreg2 != eax and breg != eax) 
                         */
+                       g_assert (ins->dreg == AMD64_RAX);
+
+                       if (breg == AMD64_RAX && ins->sreg2 == AMD64_RAX)
+                               /* Highly unlikely, but possible */
+                               need_push = TRUE;
+
                        /* The pushes invalidate rsp */
-                       if ((breg == AMD64_RAX) || (breg == AMD64_RSP)) {
+                       if ((breg == AMD64_RAX) || need_push) {
                                amd64_mov_reg_reg (code, AMD64_R11, breg, 8);
                                breg = AMD64_R11;
                        }
 
-                       if (ins->dreg != AMD64_RAX)
-                               amd64_push_reg (code, AMD64_RAX);
-                       
-                       /* We need the EAX reg for the cmpxchg */
+                       /* We need the EAX reg for the comparand */
                        if (ins->sreg2 == AMD64_RAX) {
-                               amd64_push_reg (code, AMD64_RDX);
-                               amd64_mov_reg_reg (code, AMD64_RDX, AMD64_RAX, size);
-                               sreg2 = AMD64_RDX;
+                               if (breg != AMD64_R11) {
+                                       amd64_mov_reg_reg (code, AMD64_R11, AMD64_RAX, 8);
+                                       sreg2 = AMD64_R11;
+                               } else {
+                                       g_assert (need_push);
+                                       amd64_push_reg (code, AMD64_RDX);
+                                       amd64_mov_reg_reg (code, AMD64_RDX, AMD64_RAX, size);
+                                       sreg2 = AMD64_RDX;
+                                       rdx_pushed = TRUE;
+                               }
                        }
 
                        amd64_mov_reg_membase (code, AMD64_RAX, breg, ins->inst_offset, size);
 
-                       br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
-                       amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
-                       br [1] = code; amd64_branch8 (code, X86_CC_NE, -1, FALSE);
-                       amd64_patch (br [1], br [0]);
+                       br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
+                       amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
+                       br [1] = code; amd64_branch8 (code, X86_CC_NE, -1, FALSE);
+                       amd64_patch (br [1], br [0]);
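+                       /* lock cmpxchg stores sreg2 only if the memory word still equals RAX;
+                        * on failure RAX is reloaded with the current value and the branch retries */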
+
+                       if (rdx_pushed)
+                               amd64_pop_reg (code, AMD64_RDX);
+
+                       break;
+               }
+               case OP_ATOMIC_CAS_I4:
+               case OP_ATOMIC_CAS_I8: {
+                       guint32 size;
+
+                       if (ins->opcode == OP_ATOMIC_CAS_I8)
+                               size = 8;
+                       else
+                               size = 4;
+
+                       /* 
+                        * See http://msdn.microsoft.com/en-us/magazine/cc302329.aspx for
+                        * an explanation of how this works.
+                        */
+                       g_assert (ins->sreg3 == AMD64_RAX);
+                       g_assert (ins->sreg1 != AMD64_RAX);
+                       g_assert (ins->sreg1 != ins->sreg2);
+
+                       amd64_prefix (code, X86_LOCK_PREFIX);
+                       amd64_cmpxchg_membase_reg_size (code, ins->sreg1, ins->inst_offset, ins->sreg2, size);
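+                       /* on success sreg2 is stored; either way RAX ends up holding the value
+                        * that was originally in memory, which is the CAS result */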
+
+                       if (ins->dreg != AMD64_RAX)
+                               amd64_mov_reg_reg (code, ins->dreg, AMD64_RAX, size);
+                       break;
+               }
+#ifdef MONO_ARCH_SIMD_INTRINSICS
+               /* TODO: Some of these IR opcodes are marked as no-clobber when they in fact do clobber. */
+               case OP_ADDPS:
+                       amd64_sse_addps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DIVPS:
+                       amd64_sse_divps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MULPS:
+                       amd64_sse_mulps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SUBPS:
+                       amd64_sse_subps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MAXPS:
+                       amd64_sse_maxps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MINPS:
+                       amd64_sse_minps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_COMPPS:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
+                       amd64_sse_cmpps_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_ANDPS:
+                       amd64_sse_andps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ANDNPS:
+                       amd64_sse_andnps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ORPS:
+                       amd64_sse_orps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_XORPS:
+                       amd64_sse_xorps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SQRTPS:
+                       amd64_sse_sqrtps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_RSQRTPS:
+                       amd64_sse_rsqrtps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_RCPPS:
+                       amd64_sse_rcpps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_ADDSUBPS:
+                       amd64_sse_addsubps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HADDPS:
+                       amd64_sse_haddps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HSUBPS:
+                       amd64_sse_hsubps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DUPPS_HIGH:
+                       amd64_sse_movshdup_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_DUPPS_LOW:
+                       amd64_sse_movsldup_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+
+               case OP_PSHUFLEW_HIGH:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       amd64_sse_pshufhw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_PSHUFLEW_LOW:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       amd64_sse_pshuflw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_PSHUFLED:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+
+               case OP_ADDPD:
+                       amd64_sse_addpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DIVPD:
+                       amd64_sse_divpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MULPD:
+                       amd64_sse_mulpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SUBPD:
+                       amd64_sse_subpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MAXPD:
+                       amd64_sse_maxpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MINPD:
+                       amd64_sse_minpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_COMPPD:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
+                       amd64_sse_cmppd_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_ANDPD:
+                       amd64_sse_andpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ANDNPD:
+                       amd64_sse_andnpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ORPD:
+                       amd64_sse_orpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_XORPD:
+                       amd64_sse_xorpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SQRTPD:
+                       amd64_sse_sqrtpd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_ADDSUBPD:
+                       amd64_sse_addsubpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HADDPD:
+                       amd64_sse_haddpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HSUBPD:
+                       amd64_sse_hsubpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DUPPD:
+                       amd64_sse_movddup_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+
+               case OP_EXTRACT_MASK:
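+                       /* pmovmskb collects the top bit of each of the 16 bytes into the low bits of dreg */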
+                       amd64_sse_pmovmskb_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+
+               case OP_PAND:
+                       amd64_sse_pand_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_POR:
+                       amd64_sse_por_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PXOR:
+                       amd64_sse_pxor_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB:
+                       amd64_sse_paddb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW:
+                       amd64_sse_paddw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDD:
+                       amd64_sse_paddd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDQ:
+                       amd64_sse_paddq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSUBB:
+                       amd64_sse_psubb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW:
+                       amd64_sse_psubw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBD:
+                       amd64_sse_psubd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBQ:
+                       amd64_sse_psubq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMAXB_UN:
+                       amd64_sse_pmaxub_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXW_UN:
+                       amd64_sse_pmaxuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXD_UN:
+                       amd64_sse_pmaxud_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               
+               case OP_PMAXB:
+                       amd64_sse_pmaxsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXW:
+                       amd64_sse_pmaxsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXD:
+                       amd64_sse_pmaxsd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PAVGB_UN:
+                       amd64_sse_pavgb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PAVGW_UN:
+                       amd64_sse_pavgw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMINB_UN:
+                       amd64_sse_pminub_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMINW_UN:
+                       amd64_sse_pminuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMIND_UN:
+                       amd64_sse_pminud_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMINB:
+                       amd64_sse_pminsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMINW:
+                       amd64_sse_pminsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMIND:
+                       amd64_sse_pminsd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PCMPEQB:
+                       amd64_sse_pcmpeqb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQW:
+                       amd64_sse_pcmpeqw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQD:
+                       amd64_sse_pcmpeqd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQQ:
+                       amd64_sse_pcmpeqq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PCMPGTB:
+                       amd64_sse_pcmpgtb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTW:
+                       amd64_sse_pcmpgtw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTD:
+                       amd64_sse_pcmpgtd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTQ:
+                       amd64_sse_pcmpgtq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSUM_ABS_DIFF:
+                       amd64_sse_psadbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_UNPACK_LOWB:
+                       amd64_sse_punpcklbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWW:
+                       amd64_sse_punpcklwd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWD:
+                       amd64_sse_punpckldq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWQ:
+                       amd64_sse_punpcklqdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWPS:
+                       amd64_sse_unpcklps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWPD:
+                       amd64_sse_unpcklpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_UNPACK_HIGHB:
+                       amd64_sse_punpckhbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHW:
+                       amd64_sse_punpckhwd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHD:
+                       amd64_sse_punpckhdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHQ:
+                       amd64_sse_punpckhqdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHPS:
+                       amd64_sse_unpckhps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHPD:
+                       amd64_sse_unpckhpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PACKW:
+                       amd64_sse_packsswb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKD:
+                       amd64_sse_packssdw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKW_UN:
+                       amd64_sse_packuswb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKD_UN:
+                       amd64_sse_packusdw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB_SAT_UN:
+                       amd64_sse_paddusb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBB_SAT_UN:
+                       amd64_sse_psubusb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW_SAT_UN:
+                       amd64_sse_paddusw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW_SAT_UN:
+                       amd64_sse_psubusw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB_SAT:
+                       amd64_sse_paddsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBB_SAT:
+                       amd64_sse_psubsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW_SAT:
+                       amd64_sse_paddsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW_SAT:
+                       amd64_sse_psubsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+                       
+               case OP_PMULW:
+                       amd64_sse_pmullw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULD:
+                       amd64_sse_pmulld_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULQ:
+                       amd64_sse_pmuludq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULW_HIGH_UN:
+                       amd64_sse_pmulhuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULW_HIGH:
+                       amd64_sse_pmulhw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSHRW:
+                       amd64_sse_psrlw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRW_REG:
+                       amd64_sse_psrlw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSARW:
+                       amd64_sse_psraw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARW_REG:
+                       amd64_sse_psraw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLW:
+                       amd64_sse_psllw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLW_REG:
+                       amd64_sse_psllw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHRD:
+                       amd64_sse_psrld_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRD_REG:
+                       amd64_sse_psrld_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSARD:
+                       amd64_sse_psrad_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARD_REG:
+                       amd64_sse_psrad_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLD:
+                       amd64_sse_pslld_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLD_REG:
+                       amd64_sse_pslld_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHRQ:
+                       amd64_sse_psrlq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRQ_REG:
+                       amd64_sse_psrlq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               
+               /*TODO: These are part of the SSE spec but not yet added
+               case OP_PSARQ:
+                       amd64_sse_psraq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARQ_REG:
+                       amd64_sse_psraq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;  
+               */
+       
+               case OP_PSHLQ:
+                       amd64_sse_psllq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLQ_REG:
+                       amd64_sse_psllq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;  
+
+               case OP_ICONV_TO_X:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       break;
+               case OP_EXTRACT_I4:
+                       amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       break;
+               case OP_EXTRACT_I8:
+                       if (ins->inst_c0) {
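+                               /* movhlps copies the high quadword of sreg1 into the low half of the scratch xmm */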
+                               amd64_movhlps_reg_reg (code, AMD64_XMM15, ins->sreg1);
+                               amd64_movd_reg_xreg_size (code, ins->dreg, AMD64_XMM15, 8);
+                       } else {
+                               amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 8);
+                       }
+                       break;
+               case OP_EXTRACT_I1:
+               case OP_EXTRACT_U1:
+                       amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
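+                       /* shift the selected byte down to bits 0-7, then sign- or zero-extend it */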
+                       if (ins->inst_c0)
+                               amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8);
+                       amd64_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE);
+                       break;
+               case OP_EXTRACT_I2:
+               case OP_EXTRACT_U2:
+                       /*amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       if (ins->inst_c0)
+                               amd64_shift_reg_imm_size (code, X86_SHR, ins->dreg, 16, 4);*/
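+                       /* pextrw extracts the word lane selected by inst_c0 directly, so the movd+shift above is not needed */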
+                       amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE, 4);
+                       break;
+               case OP_EXTRACT_R8:
+                       if (ins->inst_c0)
+                               amd64_movhlps_reg_reg (code, ins->dreg, ins->sreg1);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_INSERT_I2:
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_EXTRACTX_U2:
+                       amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_INSERTX_U1_SLOW:
+                       /* sreg1 is the extracted ireg (scratch)
+                        * sreg2 is the ireg to be inserted (scratch)
+                        * dreg is the xreg that receives the value */
+
+                       /*clear the bits from the extracted word*/
+                       amd64_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_c0 & 1 ? 0x00FF : 0xFF00);
+                       /*shift the value to insert if needed*/
+                       if (ins->inst_c0 & 1)
+                               amd64_shift_reg_imm_size (code, X86_SHL, ins->sreg2, 8, 4);
+                       /*join them together*/
+                       amd64_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0 / 2);
+                       break;
+               case OP_INSERTX_I4_SLOW:
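+                       /* pinsrw only handles 16 bits at a time, so insert the 32-bit value as two halves */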
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2);
+                       amd64_shift_reg_imm (code, X86_SHR, ins->sreg2, 16);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1);
+                       break;
+               case OP_INSERTX_I8_SLOW:
+                       amd64_movd_xreg_reg_size(code, AMD64_XMM15, ins->sreg2, 8);
+                       if (ins->inst_c0)
+                               amd64_movlhps_reg_reg (code, ins->dreg, AMD64_XMM15);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, AMD64_XMM15);
+                       break;
+
+               case OP_INSERTX_R4_SLOW:
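+                       /* cvtsd2ss only writes lane 0, so for lanes 1-3 the target lane is
+                        * shuffled into position 0 first, converted, then shuffled back */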
+                       switch (ins->inst_c0) {
+                       case 0:
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               break;
+                       case 1:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
+                               break;
+                       case 2:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
+                               break;
+                       case 3:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
+                               break;
+                       }
+                       break;
+               case OP_INSERTX_R8_SLOW:
+                       if (ins->inst_c0)
+                               amd64_movlhps_reg_reg (code, ins->dreg, ins->sreg2);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               case OP_STOREX_MEMBASE_REG:
+               case OP_STOREX_MEMBASE:
+                       amd64_sse_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_LOADX_MEMBASE:
+                       amd64_sse_movups_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_LOADX_ALIGNED_MEMBASE:
+                       amd64_sse_movaps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_STOREX_ALIGNED_MEMBASE_REG:
+                       amd64_sse_movaps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_STOREX_NTA_MEMBASE_REG:
+                       amd64_sse_movntps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_PREFETCH_MEMBASE:
+                       amd64_sse_prefetch_reg_membase (code, ins->backend.arg_info, ins->sreg1, ins->inst_offset);
+                       break;
+
+               case OP_XMOVE:
+                       /* FIXME: the peephole pass should have killed this */
+                       if (ins->dreg != ins->sreg1)
+                               amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;          
+               case OP_XZERO:
+                       amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg);
+                       break;
+               case OP_ICONV_TO_R8_RAW:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       break;
 
-                       if (ins->dreg != AMD64_RAX) {
-                               amd64_mov_reg_reg (code, ins->dreg, AMD64_RAX, size);
-                               amd64_pop_reg (code, AMD64_RAX);
-                       }
+               case OP_FCONV_TO_R8_X:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
 
-                       if (ins->sreg2 != sreg2)
-                               amd64_pop_reg (code, AMD64_RDX);
+               case OP_XCONV_R8_TO_I4:
+                       amd64_sse_cvttsd2si_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       switch (ins->backend.source_opcode) {
+                       case OP_FCONV_TO_I1:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE);
+                               break;
+                       case OP_FCONV_TO_U1:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
+                               break;
+                       case OP_FCONV_TO_I2:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE);
+                               break;
+                       case OP_FCONV_TO_U2:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE);
+                               break;
+                       }                       
+                       break;
 
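+               /*
+                * The EXPAND opcodes broadcast a scalar into every vector lane:
+                * the value is moved into lane 0 of the xmm register, then
+                * replicated with PSHUFD (mask 0 copies dword 0 everywhere,
+                * mask 0x44 duplicates the low qword).
+                */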
+               case OP_EXPAND_I2:
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, 0);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, 1);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I4:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I8:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 8);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
+                       break;
+               case OP_EXPAND_R4:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->dreg);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R8:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
+                       break;
+#endif
+               case OP_LIVERANGE_START: {
+                       if (cfg->verbose_level > 1)
+                               printf ("R%d START=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code));
+                       MONO_VARINFO (cfg, ins->inst_c0)->live_range_start = code - cfg->native_code;
+                       break;
+               }
+               case OP_LIVERANGE_END: {
+                       if (cfg->verbose_level > 1)
+                               printf ("R%d END=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code));
+                       MONO_VARINFO (cfg, ins->inst_c0)->live_range_end = code - cfg->native_code;
                        break;
                }
                default:
@@ -4106,17 +5543,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        g_assert_not_reached ();
                }
               
-               cpos += max_len;
-
+               last_ins = ins;
                last_offset = offset;
        }
 
        cfg->code_len = code - cfg->native_code;
 }
 
+#endif /* DISABLE_JIT */
+
 void
 mono_arch_register_lowlevel_calls (void)
 {
+       /* The signature doesn't matter */
+       mono_register_jit_icall (mono_amd64_throw_exception, "mono_amd64_throw_exception", mono_create_icall_signature ("void"), TRUE);
 }
 
 void
@@ -4183,6 +5623,25 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono
        }
 }
 
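+/*
+ * get_max_epilog_size:
+ *
+ *   Return a conservative upper bound on the size of the epilog, used to
+ * grow the code buffer before emitting it and when computing basic block
+ * lengths in the prolog.
+ */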
+static int
+get_max_epilog_size (MonoCompile *cfg)
+{
+       int max_epilog_size = 16;
+       
+       if (cfg->method->save_lmf)
+               max_epilog_size += 256;
+       
+       if (mono_jit_trace_calls != NULL)
+               max_epilog_size += 50;
+
+       if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
+               max_epilog_size += 50;
+
+       max_epilog_size += (AMD64_NREG * 2);
+
+       return max_epilog_size;
+}
+
 /*
  * This macro is used for testing whether the unwinder works correctly at every point
  * where an async exception can happen.
@@ -4203,7 +5662,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        MonoBasicBlock *bb;
        MonoMethodSignature *sig;
        MonoInst *ins;
-       int alloc_size, pos, max_offset, i, quad;
+       int alloc_size, pos, i, cfa_offset, quad, max_epilog_size;
        guint8 *code;
        CallInfo *cinfo;
        gint32 lmf_offset = cfg->arch.lmf_offset;
@@ -4220,6 +5679,9 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        /* Amount of stack space allocated by register saving code */
        pos = 0;
 
+       /* Offset between RSP and the CFA */
+       cfa_offset = 0;
+
        /* 
         * The prolog consists of the following parts:
         * FP present:
@@ -4235,25 +5697,54 @@ mono_arch_emit_prolog (MonoCompile *cfg)
         * - save callee saved regs using moves
         */
 
+       // CFA = sp + 8
+       cfa_offset = 8;
+       mono_emit_unwind_op_def_cfa (cfg, code, AMD64_RSP, 8);
+       // IP saved at CFA - 8
+       mono_emit_unwind_op_offset (cfg, code, AMD64_RIP, -cfa_offset);
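+       /*
+        * From here on, cfa_offset must track every rsp adjustment so that
+        * CFA == rsp + cfa_offset holds at each emitted instruction; e.g.
+        * after the "push %rbp" below, cfa_offset becomes 16 and rbp is
+        * recorded as saved at CFA - 16.
+        */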
        async_exc_point (code);
 
        if (!cfg->arch.omit_fp) {
                amd64_push_reg (code, AMD64_RBP);
+               cfa_offset += 8;
+               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+               mono_emit_unwind_op_offset (cfg, code, AMD64_RBP, - cfa_offset);
                async_exc_point (code);
+#ifdef HOST_WIN32
+               mono_arch_unwindinfo_add_push_nonvol (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP);
+#endif
+               
                amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
+               mono_emit_unwind_op_def_cfa_reg (cfg, code, AMD64_RBP);
                async_exc_point (code);
+#ifdef HOST_WIN32
+               mono_arch_unwindinfo_add_set_fpreg (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP);
+#endif
        }
 
        /* Save callee saved registers */
        if (!cfg->arch.omit_fp && !method->save_lmf) {
+               int offset = cfa_offset;
+
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_push_reg (code, i);
                                pos += sizeof (gpointer);
+                               offset += 8;
+                               mono_emit_unwind_op_offset (cfg, code, i, - offset);
                                async_exc_point (code);
                        }
        }
 
+       /* The param area is always at offset 0 from sp */
+       /* This needs to be allocated here, since it has to come after the spill area */
+       if (cfg->arch.no_pushes && cfg->param_area) {
+               if (cfg->arch.omit_fp)
+                       // FIXME:
+                       g_assert_not_reached ();
+               cfg->stack_offset += ALIGN_TO (cfg->param_area, sizeof (gpointer));
+       }
+
        if (cfg->arch.omit_fp) {
                /* 
                 * On enter, the stack is misaligned by the pushing of the return
@@ -4274,21 +5765,53 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        /* Allocate stack frame */
        if (alloc_size) {
                /* See mono_emit_stack_alloc */
-#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
+#if defined(HOST_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
                guint32 remaining_size = alloc_size;
+               /* FIXME: handle unbounded code expansion; we should use a loop in case of more than X iterations */
+               guint32 required_code_size = ((remaining_size / 0x1000) + 1) * 10; /* 10 is the max size of amd64_alu_reg_imm + amd64_test_membase_reg */
+               guint32 offset = code - cfg->native_code;
+               if (G_UNLIKELY (required_code_size >= (cfg->code_size - offset))) {
+                       while (required_code_size >= (cfg->code_size - offset))
+                               cfg->code_size *= 2;
+                       cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
+                       code = cfg->native_code + offset;
+                       mono_jit_stats.code_reallocs++;
+               }
+
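+               /*
+                * Probe the stack one page at a time: each 0x1000 SUB is
+                * followed by a touch of the new page so the OS guard page is
+                * hit in order and the stack can grow; a single large SUB
+                * could jump past the guard page.
+                */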
                while (remaining_size >= 0x1000) {
                        amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 0x1000);
+                       if (cfg->arch.omit_fp) {
+                               cfa_offset += 0x1000;
+                               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                       }
                        async_exc_point (code);
+#ifdef HOST_WIN32
+                       if (cfg->arch.omit_fp) 
+                               mono_arch_unwindinfo_add_alloc_stack (&cfg->arch.unwindinfo, cfg->native_code, code, 0x1000);
+#endif
+
                        amd64_test_membase_reg (code, AMD64_RSP, 0, AMD64_RSP);
                        remaining_size -= 0x1000;
                }
                if (remaining_size) {
                        amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, remaining_size);
-                       async_exc_point (code);
+                       if (cfg->arch.omit_fp) {
+                               cfa_offset += remaining_size;
+                               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                               async_exc_point (code);
+                       }
+#ifdef HOST_WIN32
+                       if (cfg->arch.omit_fp) 
+                               mono_arch_unwindinfo_add_alloc_stack (&cfg->arch.unwindinfo, cfg->native_code, code, remaining_size);
+#endif
                }
 #else
                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, alloc_size);
-               async_exc_point (code);
+               if (cfg->arch.omit_fp) {
+                       cfa_offset += alloc_size;
+                       mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                       async_exc_point (code);
+               }
 #endif
        }
 
@@ -4303,25 +5826,69 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        }
 #endif
 
+#ifndef TARGET_WIN32
+       if (mini_get_debug_options ()->init_stacks) {
+               /* Fill the stack frame with a dummy value to force deterministic behavior */
+       
+               /* Save registers to the red zone */
+               amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RDI, 8);
+               amd64_mov_membase_reg (code, AMD64_RSP, -16, AMD64_RCX, 8);
+
+               amd64_mov_reg_imm (code, AMD64_RAX, 0x2a2a2a2a2a2a2a2a);
+               amd64_mov_reg_imm (code, AMD64_RCX, alloc_size / 8);
+               amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RSP, 8);
+
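+               /*
+                * rep stos stores the 0x2a pattern in RAX to [RDI], RCX times;
+                * CLD makes the fill walk upwards from the bottom of the
+                * freshly allocated frame.
+                */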
+               amd64_cld (code);
+               amd64_prefix (code, X86_REP_PREFIX);
+               amd64_stosl (code);
+
+               amd64_mov_reg_membase (code, AMD64_RDI, AMD64_RSP, -8, 8);
+               amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RSP, -16, 8);
+       }
+#endif 
+
        /* Save LMF */
        if (method->save_lmf) {
                /* 
                 * The ip field is not set; the exception handling code will obtain it from the stack location pointed to by the sp field.
                 */
-               /* sp is saved right before calls */
+               /* 
+                * sp is saved right before calls, but we need to save it here too
+                * so that async stack walks work.
+                */
+               amd64_mov_membase_reg (code, cfg->frame_reg, cfg->arch.lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
                /* Skip method (only needed for trampoline LMF frames) */
                /* Save callee saved regs */
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbp), AMD64_RBP, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
+               for (i = 0; i < MONO_MAX_IREGS; ++i) {
+                       int offset;
+
+                       switch (i) {
+                       case AMD64_RBX: offset = G_STRUCT_OFFSET (MonoLMF, rbx); break;
+                       case AMD64_RBP: offset = G_STRUCT_OFFSET (MonoLMF, rbp); break;
+                       case AMD64_R12: offset = G_STRUCT_OFFSET (MonoLMF, r12); break;
+                       case AMD64_R13: offset = G_STRUCT_OFFSET (MonoLMF, r13); break;
+                       case AMD64_R14: offset = G_STRUCT_OFFSET (MonoLMF, r14); break;
+                       case AMD64_R15: offset = G_STRUCT_OFFSET (MonoLMF, r15); break;
+#ifdef HOST_WIN32
+                       case AMD64_RDI: offset = G_STRUCT_OFFSET (MonoLMF, rdi); break;
+                       case AMD64_RSI: offset = G_STRUCT_OFFSET (MonoLMF, rsi); break;
+#endif
+                       default:
+                               offset = -1;
+                               break;
+                       }
+
+                       if (offset != -1) {
+                               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + offset, i, 8);
+                               if (cfg->arch.omit_fp || (i != AMD64_RBP))
+                                       mono_emit_unwind_op_offset (cfg, code, i, - (cfa_offset - (lmf_offset + offset)));
+                       }
+               }
        }
 
        /* Save callee saved registers */
        if (cfg->arch.omit_fp && !method->save_lmf) {
-               gint32 save_area_offset = 0;
+               gint32 save_area_offset = cfg->arch.reg_save_area_offset;
 
                /* Save caller saved registers after sp is adjusted */
                /* The registers are saved at the bottom of the frame */
@@ -4329,6 +5896,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_mov_membase_reg (code, AMD64_RSP, save_area_offset, i, 8);
+                               mono_emit_unwind_op_offset (cfg, code, i, - (cfa_offset - save_area_offset));
                                save_area_offset += 8;
                                async_exc_point (code);
                        }
@@ -4342,24 +5910,28 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                amd64_mov_membase_reg (code, cfg->rgctx_var->inst_basereg, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, 8);
        }
 
-       /* compute max_offset in order to use short forward jumps */
-       max_offset = 0;
+       /* compute max_length in order to use short forward jumps */
+       max_epilog_size = get_max_epilog_size (cfg);
        if (cfg->opt & MONO_OPT_BRANCH) {
                for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
-                       bb->max_offset = max_offset;
+                       MonoInst *ins;
+                       int max_length = 0;
 
                        if (cfg->prof_options & MONO_PROFILE_COVERAGE)
-                               max_offset += 6;
+                               max_length += 6;
                        /* max alignment for loops */
                        if ((cfg->opt & MONO_OPT_LOOP) && bb_is_loop_start (bb))
-                               max_offset += LOOP_ALIGNMENT;
+                               max_length += LOOP_ALIGNMENT;
 
                        MONO_BB_FOR_EACH_INS (bb, ins) {
-                               if (ins->opcode == OP_LABEL)
-                                       ins->inst_c1 = max_offset;
-                               
-                               max_offset += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
+                               max_length += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
                        }
+
+                       /* Take prolog and epilog instrumentation into account */
+                       if (bb == cfg->bb_entry || bb == cfg->bb_exit)
+                               max_length += max_epilog_size;
+                       
+                       bb->max_length = max_length;
                }
        }
 
@@ -4393,6 +5965,38 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 
                stack_offset = ainfo->offset + ARGS_OFFSET;
 
+               if (cfg->globalra) {
+                       /* All the other moves are done by the register allocator */
+                       switch (ainfo->storage) {
+                       case ArgInFloatSSEReg:
+                               amd64_sse_cvtss2sd_reg_reg (code, ainfo->reg, ainfo->reg);
+                               break;
+                       case ArgValuetypeInReg:
+                               for (quad = 0; quad < 2; quad ++) {
+                                       switch (ainfo->pair_storage [quad]) {
+                                       case ArgInIReg:
+                                               amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad], sizeof (gpointer));
+                                               break;
+                                       case ArgInFloatSSEReg:
+                                               amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]);
+                                               break;
+                                       case ArgInDoubleSSEReg:
+                                               amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]);
+                                               break;
+                                       case ArgNone:
+                                               break;
+                                       default:
+                                               g_assert_not_reached ();
+                                       }
+                               }
+                               break;
+                       default:
+                               break;
+                       }
+
+                       continue;
+               }
+
                /* Save volatile arguments to the stack */
                if (ins->opcode != OP_REGVAR) {
                        switch (ainfo->storage) {
@@ -4438,6 +6042,10 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                        }
                                }
                                break;
+                       case ArgValuetypeAddrInIReg:
+                               if (ainfo->pair_storage [0] == ArgInIReg)
+                                       amd64_mov_membase_reg (code, ins->inst_left->inst_basereg, ins->inst_left->inst_offset, ainfo->pair_regs [0], sizeof (gpointer));
+                               break;
                        default:
                                break;
                        }
@@ -4469,36 +6077,53 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                if (appdomain_tls_offset != -1 && lmf_tls_offset != -1) {
                        guint8 *buf, *no_domain_branch;
 
-                       code = emit_tls_get (code, AMD64_RAX, appdomain_tls_offset);
-                       if ((domain >> 32) == 0)
-                               amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
-                       else
-                               amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 8);
+                       code = mono_amd64_emit_tls_get (code, AMD64_RAX, appdomain_tls_offset);
+                       if (cfg->compile_aot) {
+                               /* AOT code is only used in the root domain */
+                               amd64_mov_reg_imm (code, AMD64_ARG_REG1, 0);
+                       } else {
+                               if ((domain >> 32) == 0)
+                                       amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
+                               else
+                                       amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 8);
+                       }
                        amd64_alu_reg_reg (code, X86_CMP, AMD64_RAX, AMD64_ARG_REG1);
                        no_domain_branch = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
-                       code = emit_tls_get ( code, AMD64_RAX, lmf_addr_tls_offset);
+                       code = mono_amd64_emit_tls_get ( code, AMD64_RAX, lmf_addr_tls_offset);
                        amd64_test_reg_reg (code, AMD64_RAX, AMD64_RAX);
                        buf = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
                        amd64_patch (no_domain_branch, code);
-                       code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_jit_thread_attach");
+                       code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
+                                         (gpointer)"mono_jit_thread_attach", TRUE);
                        amd64_patch (buf, code);
+#ifdef HOST_WIN32
+                       /* The TLS key actually contains a pointer to the MonoJitTlsData structure */
+                       /* FIXME: Add a separate key for LMF to avoid this */
+                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf));
+#endif
                } else {
                        g_assert (!cfg->compile_aot);
-                       if ((domain >> 32) == 0)
-                               amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
-                       else
-                               amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 8);
-                       code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_jit_thread_attach");
+                       if (cfg->compile_aot) {
+                               /* AOT code is only used in the root domain */
+                               amd64_mov_reg_imm (code, AMD64_ARG_REG1, 0);
+                       } else {
+                               if ((domain >> 32) == 0)
+                                       amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
+                               else
+                                       amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 8);
+                       }
+                       code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD,
+                                         (gpointer)"mono_jit_thread_attach", TRUE);
                }
        }
 
        if (method->save_lmf) {
                if ((lmf_tls_offset != -1) && !optimize_for_xen) {
                        /*
-                        * Optimized version which uses the mono_lmf TLS variable instead of indirection
-                        * through the mono_lmf_addr TLS variable.
+                        * Optimized version which uses the mono_lmf TLS variable instead of 
+                        * indirection through the mono_lmf_addr TLS variable.
                         */
                        /* %rax = previous_lmf */
                        x86_prefix (code, X86_FS_PREFIX);
@@ -4518,7 +6143,12 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                } else {
                        if (lmf_addr_tls_offset != -1) {
                                /* Load lmf quickly using the FS register */
-                               code = emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
+                               code = mono_amd64_emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
+#ifdef HOST_WIN32
+                               /* The TLS key actually contains a pointer to the MonoJitTlsData structure */
+                               /* FIXME: Add a separate key for LMF to avoid this */
+                               amd64_alu_reg_imm (code, X86_ADD, AMD64_RAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf));
+#endif
                        }
                        else {
                                /* 
@@ -4527,7 +6157,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                 */
                                args_clobbered = TRUE;
                                code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
-                                                                 (gpointer)"mono_get_lmf_addr");               
+                                                                 (gpointer)"mono_get_lmf_addr", TRUE);         
                        }
 
                        /* Save lmf_addr */
@@ -4559,10 +6189,10 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                MonoBasicBlock *first_bb = cfg->bb_entry;
                MonoInst *next;
 
-               next = mono_inst_list_first (&first_bb->ins_list);
+               next = mono_bb_first_ins (first_bb);
                if (!next && first_bb->next_bb) {
                        first_bb = first_bb->next_bb;
-                       next = mono_inst_list_first (&first_bb->ins_list);
+                       next = mono_bb_first_ins (first_bb);
                }
 
                if (first_bb->in_count > 1)
@@ -4608,13 +6238,25 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        }
 
                        if (match) {
-                               next = mono_inst_list_next (&next->node, &first_bb->ins_list);
+                               next = next->next;
                                if (!next)
                                        break;
                        }
                }
        }
 
+       /* Initialize ss_trigger_page_var */
+       if (cfg->arch.ss_trigger_page_var) {
+               MonoInst *var = cfg->arch.ss_trigger_page_var;
+
+               g_assert (!cfg->compile_aot);
+               g_assert (var->opcode == OP_REGOFFSET);
+
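+               /* The variable mirrors the global trigger page address so the
+                * sequence point code can load it with a plain memory load; the
+                * embedded absolute address is why this can't be AOTed. */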
+               amd64_mov_reg_imm (code, AMD64_R11, (guint64)ss_trigger_page);
+               amd64_mov_membase_reg (code, var->inst_basereg, var->inst_offset, AMD64_R11, 8);
+       }
+
        cfg->code_len = code - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
@@ -4628,20 +6270,11 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        MonoMethod *method = cfg->method;
        int quad, pos, i;
        guint8 *code;
-       int max_epilog_size = 16;
+       int max_epilog_size;
        CallInfo *cinfo;
        gint32 lmf_offset = cfg->arch.lmf_offset;
        
-       if (cfg->method->save_lmf)
-               max_epilog_size += 256;
-       
-       if (mono_jit_trace_calls != NULL)
-               max_epilog_size += 50;
-
-       if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
-               max_epilog_size += 50;
-
-       max_epilog_size += (AMD64_NREG * 2);
+       max_epilog_size = get_max_epilog_size (cfg);
 
        while (cfg->code_len + max_epilog_size > (cfg->code_size - 16)) {
                cfg->code_size *= 2;
@@ -4658,6 +6291,23 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        pos = 0;
        
        if (method->save_lmf) {
+               /* check if we need to restore protection of the stack after a stack overflow */
+               if (mono_get_jit_tls_offset () != -1) {
+                       guint8 *patch;
+                       code = mono_amd64_emit_tls_get (code, X86_ECX, mono_get_jit_tls_offset ());
+                       /* we load the value in a separate instruction: this mechanism may be
+                        * used later as a safer way to do thread interruption
+                        */
+                       amd64_mov_reg_membase (code, X86_ECX, X86_ECX, G_STRUCT_OFFSET (MonoJitTlsData, restore_stack_prot), 8);
+                       x86_alu_reg_imm (code, X86_CMP, X86_ECX, 0);
+                       patch = code;
+                       x86_branch8 (code, X86_CC_Z, 0, FALSE);
+                       /* note that the call trampoline will preserve eax/edx */
+                       x86_call_reg (code, X86_ECX);
+                       x86_patch (patch, code);
+               } else {
+                       /* FIXME: maybe save the jit tls in the prolog */
+               }
                if ((lmf_tls_offset != -1) && !optimize_for_xen) {
                        /*
                         * Optimized version which uses the mono_lmf TLS variable instead of indirection
@@ -4693,10 +6343,18 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                if (cfg->used_int_regs & (1 << AMD64_R15)) {
                        amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
                }
+#ifdef HOST_WIN32
+               if (cfg->used_int_regs & (1 << AMD64_RDI)) {
+                       amd64_mov_reg_membase (code, AMD64_RDI, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rdi), 8);
+               }
+               if (cfg->used_int_regs & (1 << AMD64_RSI)) {
+                       amd64_mov_reg_membase (code, AMD64_RSI, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsi), 8);
+               }
+#endif
        } else {
 
                if (cfg->arch.omit_fp) {
-                       gint32 save_area_offset = 0;
+                       gint32 save_area_offset = cfg->arch.reg_save_area_offset;
 
                        for (i = 0; i < AMD64_NREG; ++i)
                                if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
@@ -4767,15 +6425,6 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        cfg->code_len = code - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
-
-       if (cfg->arch.omit_fp) {
-               /* 
-                * Encode the stack size into used_int_regs so the exception handler
-                * can access it.
-                */
-               g_assert (cfg->arch.stack_alloc_size < (1 << 16));
-               cfg->used_int_regs |= (1 << 31) | (cfg->arch.stack_alloc_size << 16);
-       }
 }
 
 void
@@ -4841,11 +6490,10 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                                        exc_throw_start [nthrows] = code;
                                }
                                amd64_mov_reg_imm (code, AMD64_ARG_REG1, exc_class->type_token);
-                               patch_info->data.name = "mono_arch_throw_corlib_exception";
-                               patch_info->type = MONO_PATCH_INFO_INTERNAL_METHOD;
-                               patch_info->ip.i = code - cfg->native_code;
 
-                               code = emit_call_body (cfg, code, patch_info->type, patch_info->data.name);
+                               patch_info->type = MONO_PATCH_INFO_NONE;
+
+                               code = emit_call_body (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, "mono_arch_throw_corlib_exception");
 
                                amd64_mov_reg_imm (buf, AMD64_ARG_REG2, (code - cfg->native_code) - throw_ip);
                                while (buf < buf2)
@@ -4873,24 +6521,15 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                case MONO_PATCH_INFO_R4: {
                        guint8 *pos;
 
-                       if (use_sse2) {
-                               /* The SSE opcodes require a 16 byte alignment */
-                               code = (guint8*)ALIGN_TO (code, 16);
-                       } else {
-                               code = (guint8*)ALIGN_TO (code, 8);
-                       }
+                       /* The SSE opcodes require a 16 byte alignment */
+                       code = (guint8*)ALIGN_TO (code, 16);
 
                        pos = cfg->native_code + patch_info->ip.i;
 
-
-                       if (use_sse2) {
-                               if (IS_REX (pos [1]))
-                                       *(guint32*)(pos + 5) = (guint8*)code - pos - 9;
-                               else
-                                       *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
-                       } else {
-                               *(guint32*)(pos + 3) = (guint8*)code - pos - 7;
-                       }
+                       if (IS_REX (pos [1]))
+                               *(guint32*)(pos + 5) = (guint8*)code - pos - 9;
+                       else
+                               *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
 
                        if (patch_info->type == MONO_PATCH_INFO_R8) {
                                *(double*)code = *(double*)patch_info->data.target;
@@ -4941,7 +6580,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
                /* Allocate a new area on the stack and save arguments there */
                sig = mono_method_signature (cfg->method);
 
-               cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+               cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
 
                n = sig->param_count + sig->hasthis;
 
@@ -4964,7 +6603,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
        mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_METHODCONST, cfg->method);
        amd64_set_reg_template (code, AMD64_ARG_REG1);
        amd64_mov_reg_reg (code, AMD64_ARG_REG2, AMD64_RSP, 8);
-       code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func);
+       code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func, TRUE);
 
        if (enable_arguments)
                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, stack_area);
@@ -4981,14 +6620,14 @@ enum {
 };
 
 void*
-mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments)
+mono_arch_instrument_epilog_full (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments, gboolean preserve_argument_registers)
 {
        guchar *code = p;
        int save_mode = SAVE_NONE;
        MonoMethod *method = cfg->method;
-       int rtype = mono_type_get_underlying_type (mono_method_signature (method)->ret)->type;
+       MonoType *ret_type = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret);
        
-       switch (rtype) {
+       switch (ret_type->type) {
        case MONO_TYPE_VOID:
                /* special case string .ctor icall */
                if (strcmp (".ctor", method->name) && method->klass == mono_defaults.string_class)
@@ -5005,7 +6644,7 @@ mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean ena
                save_mode = SAVE_XMM;
                break;
        case MONO_TYPE_GENERICINST:
-               if (!mono_type_generic_inst_is_valuetype (mono_method_signature (method)->ret)) {
+               if (!mono_type_generic_inst_is_valuetype (ret_type)) {
                        save_mode = SAVE_EAX;
                        break;
                }
@@ -5054,9 +6693,19 @@ mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean ena
        else
                amd64_mov_reg_imm (code, AMD64_RAX, 0);
 
+       if (preserve_argument_registers) {
+               amd64_push_reg (code, MONO_AMD64_ARG_REG1);
+               amd64_push_reg (code, MONO_AMD64_ARG_REG2);
+       }
+
        mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_METHODCONST, method);
        amd64_set_reg_template (code, AMD64_ARG_REG1);
-       code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func);
+       code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func, TRUE);
+
+       if (preserve_argument_registers) {
+               amd64_pop_reg (code, MONO_AMD64_ARG_REG2);
+               amd64_pop_reg (code, MONO_AMD64_ARG_REG1);
+       }
 
        /* Restore result */
        switch (save_mode) {
@@ -5224,65 +6873,51 @@ mono_breakpoint_clean_code (guint8 *method_start, guint8 *code, int offset, guin
 }
 
 gpointer
-mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
+mono_arch_get_vcall_slot (guint8 *code, mgreg_t *regs, int *displacement)
 {
        guint8 buf [10];
        guint32 reg;
        gint32 disp;
        guint8 rex = 0;
+       MonoJitInfo *ji = NULL;
+
+#ifdef ENABLE_LLVM
+       /* code - 9 might be before the start of the method */
+       /* FIXME: Avoid this expensive call somehow */
+       ji = mono_jit_info_table_find (mono_domain_get (), (char*)code);
+#endif
 
-       mono_breakpoint_clean_code (NULL, code, 9, buf, sizeof (buf));
+       mono_breakpoint_clean_code (ji ? ji->code_start : NULL, code, 9, buf, sizeof (buf));
        code = buf + 9;
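+       /*
+        * 'code' now points just past a cleaned copy of the bytes preceding
+        * the return address: mono_breakpoint_clean_code undoes any debugger
+        * breakpoint opcodes so the pattern matching below sees the original
+        * instruction stream.
+        */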
 
        *displacement = 0;
 
-       /* go to the start of the call instruction
-        *
-        * address_byte = (m << 6) | (o << 3) | reg
-        * call opcode: 0xff address_byte displacement
-        * 0xff m=1,o=2 imm8
-        * 0xff m=2,o=2 imm32
-        */
        code -= 7;
 
        /* 
         * A given byte sequence can match more than case here, so we have to be
         * really careful about the ordering of the cases. Longer sequences
         * come first.
+        * There are two types of calls:
+        * - direct calls: 0xff address_byte with an 8/32 bit displacement
+        * - indirect calls: nop nop nop <call>
+        * The nops make sure we don't confuse the instruction preceding an indirect
+        * call with a direct call.
         */
-#ifdef MONO_ARCH_HAVE_IMT
-       if ((code [-2] == 0x41) && (code [-1] == 0xbb) && (code [4] == 0xff) && (x86_modrm_mod (code [5]) == 1) && (x86_modrm_reg (code [5]) == 2) && ((signed char)code [6] < 0)) {
-               /* IMT-based interface calls: with MONO_ARCH_IMT_REG == r11
-                * 41 bb 14 f8 28 08       mov    $0x828f814,%r11d
-                * ff 50 fc                call   *0xfffffffc(%rax)
-                */
-               reg = amd64_modrm_rm (code [5]);
-               disp = (signed char)code [6];
-               /* R10 is clobbered by the IMT thunk code */
-               g_assert (reg != AMD64_R10);
-       }
-#else
-       if (0) {
-       }
-#endif
-       else if ((code [-1] == 0x8b) && (amd64_modrm_mod (code [0]) == 0x2) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
-                       /*
-                        * This is a interface call
-                        * 48 8b 80 f0 e8 ff ff   mov    0xffffffffffffe8f0(%rax),%rax
-                        * ff 10                  callq  *(%rax)
-                        */
-               if (IS_REX (code [4]))
-                       rex = code [4];
-               reg = amd64_modrm_rm (code [6]);
-               disp = 0;
-               /* R10 is clobbered by the IMT thunk code */
-               g_assert (reg != AMD64_R10);
-       } else if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
+       if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
                /* call OFFSET(%rip) */
                disp = *(guint32*)(code + 3);
                return (gpointer*)(code + disp + 7);
-       }
-       else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) {
+       } else if ((code [0] == 0xff) && (amd64_modrm_reg (code [1]) == 0x2) && (amd64_modrm_mod (code [1]) == 0x2) && (amd64_sib_index (code [2]) == 4) && (amd64_sib_scale (code [2]) == 0)) {
+               /* call *[reg+disp32] using indexed addressing */
+               /* The LLVM JIT emits this, and we emit it too for %r12 */
+               if (IS_REX (code [-1])) {
+                       rex = code [-1];
+                       g_assert (amd64_rex_x (rex) == 0);
+               }                       
+               reg = amd64_sib_base (code [2]);
+               disp = *(gint32*)(code + 3);
+       } else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) {
                /* call *[reg+disp32] */
                if (IS_REX (code [0]))
                        rex = code [0];
@@ -5290,16 +6925,19 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
                disp = *(gint32*)(code + 3);
                /* R10 is clobbered by the IMT thunk code */
                g_assert (reg != AMD64_R10);
-       }
-       else if (code [2] == 0xe8) {
+       } else if (code [2] == 0xe8) {
                /* call <ADDR> */
                return NULL;
-       }
-       else if (IS_REX (code [4]) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x3)) {
+       } else if ((code [3] == 0xff) && (amd64_modrm_reg (code [4]) == 0x2) && (amd64_modrm_mod (code [4]) == 0x1) && (amd64_sib_index (code [5]) == 4) && (amd64_sib_scale (code [5]) == 0)) {
+               /* call *[r12+disp8] using indexed addressing */
+               if (IS_REX (code [2]))
+                       rex = code [2];
+               reg = amd64_sib_base (code [5]);
+               disp = *(gint8*)(code + 6);
+       } else if (IS_REX (code [4]) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x3)) {
                /* call *%reg */
                return NULL;
-       }
-       else if ((code [4] == 0xff) && (amd64_modrm_reg (code [5]) == 0x2) && (amd64_modrm_mod (code [5]) == 0x1)) {
+       } else if ((code [4] == 0xff) && (amd64_modrm_reg (code [5]) == 0x2) && (amd64_modrm_mod (code [5]) == 0x1)) {
                /* call *[reg+disp8] */
                if (IS_REX (code [3]))
                        rex = code [3];
@@ -5308,63 +6946,125 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
                //printf ("B: [%%r%d+0x%x]\n", reg, disp);
        }
        else if ((code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
-                       /*
-                        * This is a interface call: should check the above code can't catch it earlier 
-                        * 8b 40 30   mov    0x30(%eax),%eax
-                        * ff 10      call   *(%eax)
-                        */
+               /* call *%reg */
                if (IS_REX (code [4]))
                        rex = code [4];
                reg = amd64_modrm_rm (code [6]);
                disp = 0;
        }
-       else
-               g_assert_not_reached ();
+       else
+               g_assert_not_reached ();
+
+       reg += amd64_rex_b (rex);
+
+       /* R11 is clobbered by the trampoline code */
+       g_assert (reg != AMD64_R11);
+
+       *displacement = disp;
+       return (gpointer)regs [reg];
+}
+
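+/*
+ * mono_arch_get_this_arg_reg:
+ *
+ *   Return the register holding the managed 'this' argument: normally
+ * ARG_REG1, but ARG_REG2 when a valuetype return is passed through a
+ * hidden pointer occupying ARG_REG1.
+ */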
+int
+mono_arch_get_this_arg_reg (MonoMethodSignature *sig, MonoGenericSharingContext *gsctx, guint8 *code)
+{
+       int this_reg = AMD64_ARG_REG1;
+
+       if (MONO_TYPE_ISSTRUCT (sig->ret)) {
+               CallInfo *cinfo;
+
+               if (!gsctx && code)
+                       gsctx = mono_get_generic_context_from_code (code);
+
+               cinfo = get_call_info (gsctx, NULL, sig, FALSE);
+               
+               if (cinfo->ret.storage != ArgValuetypeInReg)
+                       this_reg = AMD64_ARG_REG2;
+               g_free (cinfo);
+       }
+
+       return this_reg;
+}
+
+gpointer
+mono_arch_get_this_arg_from_call (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, mgreg_t *regs, guint8 *code)
+{
+       return (gpointer)regs [mono_arch_get_this_arg_reg (sig, gsctx, code)];
+}
+
+#define MAX_ARCH_DELEGATE_PARAMS 10
+
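+/*
+ * get_delegate_invoke_impl:
+ *
+ *   Emit a small trampoline which either replaces the delegate 'this'
+ * argument with delegate->target (closed delegates) or shifts the argument
+ * registers left by one (open delegates), then tail jumps to
+ * delegate->method_ptr.
+ */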
+static gpointer
+get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *code_len)
+{
+       guint8 *code, *start;
+       int i;
+
+       if (has_target) {
+               start = code = mono_global_codeman_reserve (64);
+
+               /* Replace the this argument with the target */
+               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
+               amd64_mov_reg_membase (code, AMD64_ARG_REG1, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, target), 8);
+               amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+
+               g_assert ((code - start) < 64);
+       } else {
+               start = code = mono_global_codeman_reserve (64);
+
+               if (param_count == 0) {
+                       amd64_jump_membase (code, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               } else {
+                       /* We have to shift the arguments left */
+                       amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
+                       for (i = 0; i < param_count; ++i) {
+#ifdef HOST_WIN32
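+                               /* Win64 passes only four args in registers;
+                                * the spilled fifth arg is read back from the
+                                * first stack slot above the 32 byte shadow
+                                * area (rsp + 0x28 at this point). */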
+                               if (i < 3)
+                                       amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
+                               else
+                                       amd64_mov_reg_membase (code, param_regs [i], AMD64_RSP, 0x28, 8);
+#else
+                               amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
+#endif
+                       }
+
+                       amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               }
+               g_assert ((code - start) < 64);
+       }
 
-       reg += amd64_rex_b (rex);
+       mono_debug_add_delegate_trampoline (start, code - start);
 
-       /* R11 is clobbered by the trampoline code */
-       g_assert (reg != AMD64_R11);
+       if (code_len)
+               *code_len = code - start;
 
-       *displacement = disp;
-       return regs [reg];
+       return start;
 }
 
-gpointer*
-mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
+/*
+ * mono_arch_get_delegate_invoke_impls:
+ *
+ *   Return a list of MonoAotTrampInfo structures for the delegate invoke impl
+ * trampolines.
+ */
+GSList*
+mono_arch_get_delegate_invoke_impls (void)
 {
-       gpointer vt;
-       int displacement;
-       vt = mono_arch_get_vcall_slot (code, regs, &displacement);
-       if (!vt)
-               return NULL;
-       return (gpointer*)((char*)vt + displacement);
-}
+       GSList *res = NULL;
+       guint8 *code;
+       guint32 code_len;
+       int i;
 
-int
-mono_arch_get_this_arg_reg (MonoMethodSignature *sig)
-{
-       int this_reg = AMD64_ARG_REG1;
+       code = get_delegate_invoke_impl (TRUE, 0, &code_len);
+       res = g_slist_prepend (res, mono_aot_tramp_info_create (g_strdup ("delegate_invoke_impl_has_target"), code, code_len));
 
-       if (MONO_TYPE_ISSTRUCT (sig->ret)) {
-               CallInfo *cinfo = get_call_info (NULL, NULL, sig, FALSE);
-               
-               if (cinfo->ret.storage != ArgValuetypeInReg)
-                       this_reg = AMD64_ARG_REG2;
-               g_free (cinfo);
+       for (i = 0; i < MAX_ARCH_DELEGATE_PARAMS; ++i) {
+               code = get_delegate_invoke_impl (FALSE, i, &code_len);
+               res = g_slist_prepend (res, mono_aot_tramp_info_create (g_strdup_printf ("delegate_invoke_impl_target_%d", i), code, code_len));
        }
 
-       return this_reg;
-}
-
-gpointer
-mono_arch_get_this_arg_from_call (MonoMethodSignature *sig, gssize *regs, guint8 *code)
-{
-       return (gpointer)regs [mono_arch_get_this_arg_reg (sig)];
+       return res;
 }
 
-#define MAX_ARCH_DELEGATE_PARAMS 10
-
 gpointer
 mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_target)
 {
@@ -5380,24 +7080,18 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
 
        if (has_target) {
                static guint8* cached = NULL;
-               mono_mini_arch_lock ();
-               if (cached) {
-                       mono_mini_arch_unlock ();
-                       return cached;
-               }
 
-               start = code = mono_global_codeman_reserve (64);
+               if (cached)
+                       return cached;
 
-               /* Replace the this argument with the target */
-               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
-               amd64_mov_reg_membase (code, AMD64_ARG_REG1, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, target), 8);
-               amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               if (mono_aot_only)
+                       start = mono_aot_get_named_code ("delegate_invoke_impl_has_target");
+               else
+                       start = get_delegate_invoke_impl (TRUE, 0, NULL);
 
-               g_assert ((code - start) < 64);
+               mono_memory_barrier ();
 
                cached = start;
-               mono_debug_add_delegate_trampoline (start, code - start);
-               mono_mini_arch_unlock ();
        } else {
                static guint8* cache [MAX_ARCH_DELEGATE_PARAMS + 1] = {NULL};
                for (i = 0; i < sig->param_count; ++i)
@@ -5406,31 +7100,21 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
                if (sig->param_count > 4)
                        return NULL;
 
-               mono_mini_arch_lock ();
                code = cache [sig->param_count];
-               if (code) {
-                       mono_mini_arch_unlock ();
+               if (code)
                        return code;
-               }
-
-               start = code = mono_global_codeman_reserve (64);
 
-               if (sig->param_count == 0) {
-                       amd64_jump_membase (code, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               if (mono_aot_only) {
+                       char *name = g_strdup_printf ("delegate_invoke_impl_target_%d", sig->param_count);
+                       start = mono_aot_get_named_code (name);
+                       g_free (name);
                } else {
-                       /* We have to shift the arguments left */
-                       amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
-                       for (i = 0; i < sig->param_count; ++i)
-                               amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
-
-                       amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+                       start = get_delegate_invoke_impl (FALSE, sig->param_count, NULL);
                }
-               g_assert ((code - start) < 64);
+
+               mono_memory_barrier ();
 
                cache [sig->param_count] = start;
-               
-               mono_debug_add_delegate_trampoline (start, code - start);
-               mono_mini_arch_unlock ();
        }
 
        return start;
@@ -5447,6 +7131,21 @@ void
 mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
 {
        if (!tls_offset_inited) {
+#ifdef HOST_WIN32
+               /* 
+                * We need to init this multiple times, since when we are first called, the key might not
+                * be initialized yet.
+                */
+               appdomain_tls_offset = mono_domain_get_tls_key ();
+               lmf_tls_offset = mono_get_jit_tls_key ();
+               lmf_addr_tls_offset = mono_get_jit_tls_key ();
+
+               /* Only 64 tls entries can be accessed using inline code */
+               if (appdomain_tls_offset >= 64)
+                       appdomain_tls_offset = -1;
+               if (lmf_tls_offset >= 64)
+                       lmf_tls_offset = -1;
+#else
                tls_offset_inited = TRUE;
 #ifdef MONO_XEN_OPT
                optimize_for_xen = access ("/proc/xen", F_OK) == 0;
@@ -5454,7 +7153,7 @@ mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
                appdomain_tls_offset = mono_domain_get_tls_offset ();
                lmf_tls_offset = mono_get_lmf_tls_offset ();
                lmf_addr_tls_offset = mono_get_lmf_addr_tls_offset ();
-               thread_tls_offset = mono_thread_get_tls_offset ();
+#endif
        }               
 }
 
@@ -5463,51 +7162,6 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
 {
 }
 
-void
-mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
-{
-       MonoCallInst *call = (MonoCallInst*)inst;
-       CallInfo * cinfo = get_call_info (cfg, cfg->mempool, inst->signature, FALSE);
-
-       if (vt_reg != -1) {
-               MonoInst *vtarg;
-
-               if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       /*
-                        * The valuetype is in RAX:RDX after the call, need to be copied to
-                        * the stack. Push the address here, so the call instruction can
-                        * access it.
-                        */
-                       MONO_INST_NEW (cfg, vtarg, OP_X86_PUSH);
-                       vtarg->sreg1 = vt_reg;
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-
-                       /* Align stack */
-                       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
-               }
-               else {
-                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
-                       vtarg->sreg1 = vt_reg;
-                       vtarg->dreg = mono_regstate_next_int (cfg->rs);
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-
-                       mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
-               }
-       }
-
-       /* add the this argument */
-       if (this_reg != -1) {
-               MonoInst *this;
-               MONO_INST_NEW (cfg, this, OP_MOVE);
-               this->type = this_type;
-               this->sreg1 = this_reg;
-               this->dreg = mono_regstate_next_int (cfg->rs);
-               mono_bblock_add_inst (cfg->cbb, this);
-
-               mono_call_inst_add_outarg_reg (cfg, call, this->dreg, cinfo->args [0].reg, FALSE);
-       }
-}
-
 #ifdef MONO_ARCH_HAVE_IMT
 
 #define CMP_SIZE (6 + 1)
@@ -5531,7 +7185,8 @@ imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target)
  * LOCKING: called with the domain lock held
  */
 gpointer
-mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count)
+mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count,
+       gpointer fail_tramp)
 {
        int i;
        int size = 0;
@@ -5543,28 +7198,37 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                if (item->is_equals) {
                        if (item->check_target_idx) {
                                if (!item->compare_done) {
-                                       if (amd64_is_imm32 (item->method))
+                                       if (amd64_is_imm32 (item->key))
                                                item->chunk_size += CMP_SIZE;
                                        else
                                                item->chunk_size += MOV_REG_IMM_SIZE + CMP_REG_REG_SIZE;
                                }
-                               if (vtable_is_32bit)
-                                       item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
-                               else
+                               if (item->has_target_code) {
                                        item->chunk_size += MOV_REG_IMM_SIZE;
+                               } else {
+                                       if (vtable_is_32bit)
+                                               item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
+                                       else
+                                               item->chunk_size += MOV_REG_IMM_SIZE;
+                               }
                                item->chunk_size += BR_SMALL_SIZE + JUMP_REG_SIZE;
                        } else {
-                               if (vtable_is_32bit)
-                                       item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
-                               else
-                                       item->chunk_size += MOV_REG_IMM_SIZE;
-                               item->chunk_size += JUMP_REG_SIZE;
-                               /* with assert below:
-                                * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
-                                */
+                               if (fail_tramp) {
+                                       item->chunk_size += MOV_REG_IMM_SIZE * 3 + CMP_REG_REG_SIZE +
+                                               BR_SMALL_SIZE + JUMP_REG_SIZE * 2;
+                               } else {
+                                       if (vtable_is_32bit)
+                                               item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
+                                       else
+                                               item->chunk_size += MOV_REG_IMM_SIZE;
+                                       item->chunk_size += JUMP_REG_SIZE;
+                                       /* with assert below:
+                                        * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
+                                        */
+                               }
                        }
                } else {
-                       if (amd64_is_imm32 (item->method))
+                       if (amd64_is_imm32 (item->key))
                                item->chunk_size += CMP_SIZE;
                        else
                                item->chunk_size += MOV_REG_IMM_SIZE + CMP_REG_REG_SIZE;
@@ -5573,51 +7237,75 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                }
                size += item->chunk_size;
        }
-       code = mono_code_manager_reserve (domain->code_mp, size);
+       if (fail_tramp)
+               code = mono_method_alloc_generic_virtual_thunk (domain, size);
+       else
+               code = mono_domain_code_reserve (domain, size);
        start = code;
        for (i = 0; i < count; ++i) {
                MonoIMTCheckItem *item = imt_entries [i];
                item->code_target = code;
                if (item->is_equals) {
-                       if (item->check_target_idx) {
-                               if (!item->compare_done) {
-                                       if (amd64_is_imm32 (item->method))
-                                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                       gboolean fail_case = !item->check_target_idx && fail_tramp;
+
+                       if (item->check_target_idx || fail_case) {
+                               if (!item->compare_done || fail_case) {
+                                       if (amd64_is_imm32 (item->key))
+                                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                                        else {
-                                               amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                                               amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                                amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                                        }
                                }
                                item->jmp_code = code;
                                amd64_branch8 (code, X86_CC_NE, 0, FALSE);
-                               amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->vtable_slot]));
-                               amd64_jump_membase (code, AMD64_R11, 0);
+                               /* See the comment below about R10 */
+                               if (item->has_target_code) {
+                                       amd64_mov_reg_imm (code, AMD64_R10, item->value.target_code);
+                                       amd64_jump_reg (code, AMD64_R10);
+                               } else {
+                                       amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
+                                       amd64_jump_membase (code, AMD64_R10, 0);
+                               }
+
+                               if (fail_case) {
+                                       amd64_patch (item->jmp_code, code);
+                                       amd64_mov_reg_imm (code, AMD64_R10, fail_tramp);
+                                       amd64_jump_reg (code, AMD64_R10);
+                                       item->jmp_code = NULL;
+                               }
                        } else {
                                /* enable the commented code to assert on wrong method */
 #if 0
-                               if (amd64_is_imm32 (item->method))
-                                       amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                               if (amd64_is_imm32 (item->key))
+                                       amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                                else {
-                                       amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                                       amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                        amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                                }
                                item->jmp_code = code;
                                amd64_branch8 (code, X86_CC_NE, 0, FALSE);
-                               amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->vtable_slot]));
-                               amd64_jump_membase (code, AMD64_R11, 0);
+                               /* See the comment below about R10 */
+                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
+                               amd64_jump_membase (code, AMD64_R10, 0);
                                amd64_patch (item->jmp_code, code);
                                amd64_breakpoint (code);
                                item->jmp_code = NULL;
 #else
-                               amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->vtable_slot]));
-                               amd64_jump_membase (code, AMD64_R11, 0);
+                               /* We're using R10 here because R11
+                                  needs to be preserved.  R10 needs
+                                  to be preserved for calls which
+                                  require a runtime generic context,
+                                  but interface calls don't. */
+                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
+                               amd64_jump_membase (code, AMD64_R10, 0);
 #endif
                        }
                } else {
-                       if (amd64_is_imm32 (item->method))
-                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                       if (amd64_is_imm32 (item->key))
+                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                        else {
-                               amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                               amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                        }
                        item->jmp_code = code;
@@ -5637,125 +7325,83 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                        }
                }
        }
-               
-       mono_stats.imt_thunks_size += code - start;
+
+       if (!fail_tramp)
+               mono_stats.imt_thunks_size += code - start;
        g_assert (code - start <= size);
 
        return start;
 }
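
For orientation, the cascade emitted above can be modelled in plain C. The sketch below is editorial, not part of the patch: it flattens the branch-tree layout into a linear scan, ignores chunk sizing and the unconditional final case, and uses illustrative names (ImtEntryModel, imt_thunk_model) rather than mono API. It does capture the behavioural point of the new fail_tramp parameter: when no key matches, control transfers there instead of asserting.

#include <stdio.h>
#include <stddef.h>

typedef struct {
        void *key;          /* item->key, compared against MONO_ARCH_IMT_REG */
        void **vtable_slot; /* &vtable->vtable [item->value.vtable_slot] */
        void *target_code;  /* item->value.target_code when has_target_code is set */
} ImtEntryModel;

static void *
imt_thunk_model (const ImtEntryModel *entries, int count, void *key, void *fail_tramp)
{
        int i;

        for (i = 0; i < count; ++i) {
                if (entries [i].key == key)
                        /* Either jump straight to known code, or load the
                         * current vtable slot and jump through it. */
                        return entries [i].target_code ? entries [i].target_code : *entries [i].vtable_slot;
        }
        /* Added by this patch: no match, fall through to the fail trampoline. */
        return fail_tramp;
}

int
main (void)
{
        void *slot = (void *)0xabc; /* pretend vtable slot contents */
        ImtEntryModel entries [1] = { { (void *)0x1, &slot, NULL } };

        printf ("%p\n", imt_thunk_model (entries, 1, (void *)0x1, NULL));           /* 0xabc */
        printf ("%p\n", imt_thunk_model (entries, 1, (void *)0x2, (void *)0xdead)); /* fail tramp */
        return 0;
}
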
 
 MonoMethod*
-mono_arch_find_imt_method (gpointer *regs, guint8 *code)
-{
-       /* 
-        * R11 is clobbered by the trampoline code, so we have to retrieve the method 
-        * from the code.
-        * 41 bb c0 f7 89 00     mov    $0x89f7c0,%r11d
-        * ff 90 68 ff ff ff     callq  *0xffffffffffffff68(%rax)
-        */
-       /* Similar to get_vcall_slot_addr () */
-
-       /* Find the start of the call instruction */
-       code -= 7;
-       if ((code [-2] == 0x41) && (code [-1] == 0xbb) && (code [4] == 0xff) && (x86_modrm_mod (code [5]) == 1) && (x86_modrm_reg (code [5]) == 2) && ((signed char)code [6] < 0)) {
-               /* IMT-based interface calls
-                * 41 bb 14 f8 28 08       mov    $0x828f814,%r11
-                * ff 50 fc                call   *0xfffffffc(%rax)
-                */
-               code += 4;
-       } else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) {
-               /* call *[reg+disp32] */
-               code += 1;
-       } else if ((code [4] == 0xff) && (amd64_modrm_reg (code [5]) == 0x2) && (amd64_modrm_mod (code [5]) == 0x1)) {
-               /* call *[reg+disp8] */
-               code += 4;
-       } else
-               g_assert_not_reached ();
-
-       /* Find the start of the mov instruction */
-       code -= 10;
-       if (code [0] == 0x49 && code [1] == 0xbb) {
-               return (MonoMethod*)*(gssize*)(code + 2);
-       } else if (code [3] == 0x4d && code [4] == 0x8b && code [5] == 0x1d) {
-               /* mov    <OFFSET>(%rip),%r11 */
-               return (MonoMethod*)*(gssize*)(code + 10 + *(guint32*)(code + 6));
-       } else if (code [4] == 0x41 && code [5] == 0xbb) {
-               return (MonoMethod*)(gssize)*(guint32*)(code + 6);
-       } else {
-               int i;
-
-               printf ("Unknown call sequence: ");
-               for (i = -10; i < 20; ++i)
-                       printf ("%x ", code [i]);
-               g_assert_not_reached ();
-               return NULL;
-       }
-}
-
-MonoObject*
-mono_arch_find_this_argument (gpointer *regs, MonoMethod *method, MonoGenericSharingContext *gsctx)
+mono_arch_find_imt_method (mgreg_t *regs, guint8 *code)
 {
-       return mono_arch_get_this_arg_from_call (mono_method_signature (method), (gssize*)regs, NULL);
+       return (MonoMethod*)regs [MONO_ARCH_IMT_REG];
 }
 #endif
 
-MonoRuntimeGenericContext*
-mono_arch_find_static_call_rgctx (gpointer *regs, guint8 *code)
+MonoVTable*
+mono_arch_find_static_call_vtable (mgreg_t *regs, guint8 *code)
 {
-       return (MonoRuntimeGenericContext*) regs [MONO_ARCH_RGCTX_REG];
+       return (MonoVTable*) regs [MONO_ARCH_RGCTX_REG];
 }
 
 MonoInst*
-mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
+mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
 {
        MonoInst *ins = NULL;
+       int opcode = 0;
 
        if (cmethod->klass == mono_defaults.math_class) {
                if (strcmp (cmethod->name, "Sin") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SIN);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_SIN;
                } else if (strcmp (cmethod->name, "Cos") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_COS);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Tan") == 0) {
-                       if (use_sse2)
-                               return ins;
-                       MONO_INST_NEW (cfg, ins, OP_TAN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Atan") == 0) {
-                       if (use_sse2)
-                               return ins;
-                       MONO_INST_NEW (cfg, ins, OP_ATAN);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_COS;
                } else if (strcmp (cmethod->name, "Sqrt") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SQRT);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_SQRT;
                } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
-                       MONO_INST_NEW (cfg, ins, OP_ABS);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_ABS;
+               }
+               
+               if (opcode) {
+                       MONO_INST_NEW (cfg, ins, opcode);
+                       ins->type = STACK_R8;
+                       ins->dreg = mono_alloc_freg (cfg);
+                       ins->sreg1 = args [0]->dreg;
+                       MONO_ADD_INS (cfg->cbb, ins);
                }
 
+               opcode = 0;
                if (cfg->opt & MONO_OPT_CMOV) {
-                       int opcode = 0;
-
                        if (strcmp (cmethod->name, "Min") == 0) {
                                if (fsig->params [0]->type == MONO_TYPE_I4)
                                        opcode = OP_IMIN;
+                               else if (fsig->params [0]->type == MONO_TYPE_U4)
+                                       opcode = OP_IMIN_UN;
                                else if (fsig->params [0]->type == MONO_TYPE_I8)
                                        opcode = OP_LMIN;
+                               else if (fsig->params [0]->type == MONO_TYPE_U8)
+                                       opcode = OP_LMIN_UN;
                        } else if (strcmp (cmethod->name, "Max") == 0) {
                                if (fsig->params [0]->type == MONO_TYPE_I4)
                                        opcode = OP_IMAX;
+                               else if (fsig->params [0]->type == MONO_TYPE_U4)
+                                       opcode = OP_IMAX_UN;
                                else if (fsig->params [0]->type == MONO_TYPE_I8)
                                        opcode = OP_LMAX;
-                       }               
-
-                       if (opcode) {
-                               MONO_INST_NEW (cfg, ins, opcode);
-                               ins->inst_i0 = args [0];
-                               ins->inst_i1 = args [1];
+                               else if (fsig->params [0]->type == MONO_TYPE_U8)
+                                       opcode = OP_LMAX_UN;
                        }
                }
+               
+               if (opcode) {
+                       MONO_INST_NEW (cfg, ins, opcode);
+                       ins->type = (fsig->params [0]->type == MONO_TYPE_I4 ||
+                               fsig->params [0]->type == MONO_TYPE_U4) ? STACK_I4 : STACK_I8;
+                       ins->dreg = mono_alloc_ireg (cfg);
+                       ins->sreg1 = args [0]->dreg;
+                       ins->sreg2 = args [1]->dreg;
+                       MONO_ADD_INS (cfg->cbb, ins);
+               }
 
 #if 0
                /* OP_FREM is not IEEE compatible */
@@ -5765,15 +7411,13 @@ mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethod
                        ins->inst_i1 = args [1];
                }
 #endif
-       } else if(cmethod->klass->image == mono_defaults.corlib &&
-                          (strcmp (cmethod->klass->name_space, "System.Threading") == 0) &&
-                          (strcmp (cmethod->klass->name, "Interlocked") == 0)) {
-               /* 
-                * Can't implement CompareExchange methods this way since they have
-                * three arguments.
-                */
        }
 
+       /* 
+        * Can't implement CompareExchange methods this way since they have
+        * three arguments.
+        */
+
        return ins;
 }
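
One reason the patch adds the unsigned opcodes at all: the same bit pattern orders differently under signed and unsigned comparison, so Math.Min/Max over uint/ulong cannot reuse OP_IMIN/OP_LMIN. A standalone illustration (editorial, not part of the patch):

#include <stdio.h>

int
main (void)
{
        int sa = -1, sb = 1;
        unsigned int ua = 0xffffffffu, ub = 1u; /* same bits as sa, sb */

        printf ("signed min   = %d\n", sa < sb ? sa : sb); /* -1: OP_IMIN semantics */
        printf ("unsigned min = %u\n", ua < ub ? ua : ub); /*  1: OP_IMIN_UN semantics */
        return 0;
}
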
 
@@ -5795,18 +7439,6 @@ MonoInst* mono_arch_get_domain_intrinsic (MonoCompile* cfg)
        return ins;
 }
 
-MonoInst* mono_arch_get_thread_intrinsic (MonoCompile* cfg)
-{
-       MonoInst* ins;
-       
-       if (thread_tls_offset == -1)
-               return NULL;
-       
-       MONO_INST_NEW (cfg, ins, OP_TLS_GET);
-       ins->inst_offset = thread_tls_offset;
-       return ins;
-}
-
 #define _CTX_REG(ctx,fld,i) ((gpointer)((&ctx->fld)[i]))
 
 gpointer
@@ -5827,3 +7459,179 @@ mono_arch_context_get_int_reg (MonoContext *ctx, int reg)
                        g_assert_not_reached ();
        }
 }
+
+/* Soft Debug support */
+#ifdef MONO_ARCH_SOFT_DEBUG_SUPPORTED
+
+/*
+ * mono_arch_set_breakpoint:
+ *
+ *   Set a breakpoint in the JITted method JI at the native address IP.
+ * The location should contain code emitted by OP_SEQ_POINT.
+ */
+void
+mono_arch_set_breakpoint (MonoJitInfo *ji, guint8 *ip)
+{
+       guint8 *code = ip;
+       guint8 *orig_code = code;
+
+       /*
+        * Eventually this should use int3 (which requires fixing the instruction
+        * size in the cpu-amd64.md machine description), but int3 confuses gdb,
+        * so during development we trigger a SIGSEGV by reading from the
+        * breakpoint trigger page instead.
+        */
+       g_assert (code [0] == 0x90);
+       if (breakpoint_size == 8) {
+               /* bp_trigger_page is addressable with a 32 bit displacement,
+                * so a single absolute load suffices */
+               amd64_mov_reg_mem (code, AMD64_R11, (guint64)bp_trigger_page, 4);
+       } else {
+               /* Load the 64 bit page address into R11, then read through it */
+               amd64_mov_reg_imm_size (code, AMD64_R11, (guint64)bp_trigger_page, 8);
+               amd64_mov_reg_membase (code, AMD64_R11, AMD64_R11, 0, 4);
+       }
+
+       g_assert (code - orig_code == breakpoint_size);
+}
+
+/*
+ * mono_arch_clear_breakpoint:
+ *
+ *   Clear the breakpoint at IP.
+ */
+void
+mono_arch_clear_breakpoint (MonoJitInfo *ji, guint8 *ip)
+{
+       guint8 *code = ip;
+       int i;
+
+       for (i = 0; i < breakpoint_size; ++i)
+               x86_nop (code);
+}
+
+gboolean
+mono_arch_is_breakpoint_event (void *info, void *sigctx)
+{
+#ifdef HOST_WIN32
+       EXCEPTION_RECORD* einfo = (EXCEPTION_RECORD*)info;
+       return FALSE;
+#else
+       siginfo_t* sinfo = (siginfo_t*) info;
+       /* The faulting address is sometimes off by a few bytes, so accept a small range */
+       if (sinfo->si_addr >= bp_trigger_page && (guint8*)sinfo->si_addr <= (guint8*)bp_trigger_page + 128)
+               return TRUE;
+       else
+               return FALSE;
+#endif
+}
+
+/*
+ * mono_arch_get_ip_for_breakpoint:
+ *
+ *   Convert the ip in CTX to the address where a breakpoint was placed.
+ */
+guint8*
+mono_arch_get_ip_for_breakpoint (MonoJitInfo *ji, MonoContext *ctx)
+{
+       guint8 *ip = MONO_CONTEXT_GET_IP (ctx);
+
+       /* ip points to the instruction causing the fault */
+       ip -= (breakpoint_size - breakpoint_fault_size);
+
+       return ip;
+}
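+
The subtraction above is easy to misread, so here is a worked example using the long breakpoint encoding emitted by mono_arch_set_breakpoint (a 10-byte movabs followed by a 3-byte load). The real breakpoint_size / breakpoint_fault_size values are assigned in mono_arch_init, outside this hunk, so the numbers are illustrative only:

#include <assert.h>

int
main (void)
{
        const unsigned long bp_start = 0x1000;        /* start of the OP_SEQ_POINT code */
        const int breakpoint_size = 13;               /* movabs (10) + load (3) */
        const int breakpoint_fault_size = 3;          /* only the final load faults */
        const unsigned long fault_ip = bp_start + 10; /* ip seen in the signal context */

        /* ip -= (breakpoint_size - breakpoint_fault_size) recovers the start */
        assert (fault_ip - (breakpoint_size - breakpoint_fault_size) == bp_start);
        return 0;
}
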
+
+/*
+ * mono_arch_skip_breakpoint:
+ *
+ *   Modify CTX so the ip is placed after the breakpoint instruction, so when
+ * we resume, the instruction is not executed again.
+ */
+void
+mono_arch_skip_breakpoint (MonoContext *ctx)
+{
+       MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + breakpoint_fault_size);
+}
+       
+/*
+ * mono_arch_start_single_stepping:
+ *
+ *   Start single stepping.
+ */
+void
+mono_arch_start_single_stepping (void)
+{
+       mono_mprotect (ss_trigger_page, mono_pagesize (), 0);
+}
+       
+/*
+ * mono_arch_stop_single_stepping:
+ *
+ *   Stop single stepping.
+ */
+void
+mono_arch_stop_single_stepping (void)
+{
+       mono_mprotect (ss_trigger_page, mono_pagesize (), MONO_MMAP_READ);
+}
+
+/*
+ * mono_arch_is_single_step_event:
+ *
+ *   Return whether the machine state in SIGCTX corresponds to a single
+ * step event.
+ */
+gboolean
+mono_arch_is_single_step_event (void *info, void *sigctx)
+{
+#ifdef HOST_WIN32
+       EXCEPTION_RECORD* einfo = (EXCEPTION_RECORD*)info;
+       return FALSE;
+#else
+       siginfo_t* sinfo = (siginfo_t*) info;
+       /* The faulting address is sometimes off by a few bytes, so accept a small range */
+       if (sinfo->si_addr >= ss_trigger_page && (guint8*)sinfo->si_addr <= (guint8*)ss_trigger_page + 128)
+               return TRUE;
+       else
+               return FALSE;
+#endif
+}
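+
The whole trigger-page protocol (mono_arch_start/stop_single_stepping plus the si_addr test above) can be exercised in isolation. Below is a minimal POSIX sketch, assuming Linux-style mmap/sigaction behaviour and using hypothetical names (trigger_page, segv_handler); error checking is omitted. Returning from the SIGSEGV handler after re-protecting the page re-executes the faulting load, which is the same resume technique the runtime relies on:

#include <signal.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static void *trigger_page;
static long page_size;

static void
segv_handler (int sig, siginfo_t *info, void *ctx)
{
        (void)sig; (void)ctx;
        /* Same test as mono_arch_is_single_step_event: did the fault come
         * from inside the trigger page? */
        if ((char *)info->si_addr >= (char *)trigger_page &&
            (char *)info->si_addr < (char *)trigger_page + page_size) {
                write (1, "single-step event\n", 18);
                /* Equivalent of mono_arch_stop_single_stepping: make the page
                 * readable again so the retried load succeeds on return. */
                mprotect (trigger_page, page_size, PROT_READ);
        }
}

int
main (void)
{
        struct sigaction sa;

        memset (&sa, 0, sizeof (sa));
        sa.sa_sigaction = segv_handler;
        sa.sa_flags = SA_SIGINFO;
        sigaction (SIGSEGV, &sa, NULL);

        page_size = sysconf (_SC_PAGESIZE);
        trigger_page = mmap (NULL, page_size, PROT_READ,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        volatile int x = *(volatile int *)trigger_page; /* readable: no fault */
        mprotect (trigger_page, page_size, PROT_NONE);  /* "start single stepping" */
        x = *(volatile int *)trigger_page;              /* faults, handler fires */
        (void)x;
        return 0;
}
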
+
+/*
+ * mono_arch_get_ip_for_single_step:
+ *
+ *   Convert the ip in CTX to the address stored in seq_points.
+ */
+guint8*
+mono_arch_get_ip_for_single_step (MonoJitInfo *ji, MonoContext *ctx)
+{
+       guint8 *ip = MONO_CONTEXT_GET_IP (ctx);
+
+       ip += single_step_fault_size;
+
+       return ip;
+}
+
+/*
+ * mono_arch_skip_single_step:
+ *
+ *   Modify CTX so the ip is placed after the single step trigger instruction,
+ * so when we resume, the instruction is not executed again.
+ */
+void
+mono_arch_skip_single_step (MonoContext *ctx)
+{
+       MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + single_step_fault_size);
+}
+
+/*
+ * mono_arch_create_seq_point_info:
+ *
+ *   Return a pointer to a data structure which is used by the sequence
+ * point implementation in AOTed code.
+ */
+gpointer
+mono_arch_get_seq_point_info (MonoDomain *domain, guint8 *code)
+{
+       NOT_IMPLEMENTED;
+       return NULL;
+}
+
+#endif