#include <mono/metadata/threads.h>
#include <mono/metadata/profiler-private.h>
#include <mono/metadata/mono-debug.h>
+#include <mono/metadata/gc-internal.h>
#include <mono/utils/mono-math.h>
+#include <mono/utils/mono-mmap.h>
#include "trace.h"
#include "ir-emit.h"
#include "mini-amd64.h"
#include "cpu-amd64.h"
-
-/*
- * Can't define this in mini-amd64.h cause that would turn on the generic code in
- * method-to-ir.c.
- */
-#define MONO_ARCH_IMT_REG AMD64_R11
+#include "debugger-agent.h"
static gint lmf_tls_offset = -1;
static gint lmf_addr_tls_offset = -1;
static gint appdomain_tls_offset = -1;
-static gint thread_tls_offset = -1;
#ifdef MONO_XEN_OPT
static gboolean optimize_for_xen = TRUE;
#define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f))
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
/* Under windows, the calling convention is never stdcall */
#define CALLCONV_IS_STDCALL(call_conv) (FALSE)
#else
MonoBreakpointInfo
mono_breakpoint_info [MONO_BREAKPOINT_ARRAY_SIZE];
-#ifdef PLATFORM_WIN32
+/*
+ * The code generated for sequence points reads from this location, which is
+ * made unreadable when single stepping is enabled, so that the read faults.
+ */
+static gpointer ss_trigger_page;
+
+/* Enabled breakpoints read from this trigger page */
+static gpointer bp_trigger_page;
+
+/* The size of the breakpoint sequence */
+static int breakpoint_size;
+
+/* The size of the breakpoint instruction causing the actual fault */
+static int breakpoint_fault_size;
+
+/* The size of the single step instruction causing the actual fault */
+static int single_step_fault_size;
+
+#ifdef HOST_WIN32
/* On Win64 always reserve first 32 bytes for first four arguments */
#define ARGS_OFFSET 48
#else
return "unknown";
}
-static const char * xmmregs [] = {
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8",
- "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+static const char * packed_xmmregs [] = {
+ "p:xmm0", "p:xmm1", "p:xmm2", "p:xmm3", "p:xmm4", "p:xmm5", "p:xmm6", "p:xmm7", "p:xmm8",
+ "p:xmm9", "p:xmm10", "p:xmm11", "p:xmm12", "p:xmm13", "p:xmm14", "p:xmm15"
+};
+
+static const char * single_xmmregs [] = {
+ "s:xmm0", "s:xmm1", "s:xmm2", "s:xmm3", "s:xmm4", "s:xmm5", "s:xmm6", "s:xmm7", "s:xmm8",
+ "s:xmm9", "s:xmm10", "s:xmm11", "s:xmm12", "s:xmm13", "s:xmm14", "s:xmm15"
};
const char*
mono_arch_fregname (int reg)
{
if (reg < AMD64_XMM_NREG)
- return xmmregs [reg];
+ return single_xmmregs [reg];
+ else
+ return "unknown";
+}
+
+const char *
+mono_arch_xregname (int reg)
+{
+ if (reg < AMD64_XMM_NREG)
+ return packed_xmmregs [reg];
else
return "unknown";
}
return code [0] == 0xe8;
}
+#ifdef __native_client_codegen__
+
+/* Keep track of instruction "depth", that is, the level of sub-instruction */
+/* for any given instruction. For instance, amd64_call_reg resolves to */
+/* amd64_call_reg_internal, which uses amd64_alu_* macros, etc. */
+/* We only want to force bundle alignment for the top level instruction, */
+/* so NaCl pseudo-instructions can be implemented with sub instructions. */
+static guint32 nacl_instruction_depth;
+
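+/* TLS keys recording where the most recent REX prefix and legacy prefix bytes */
+/* were emitted; consumed by the membase/instruction-post handlers below.      */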
+static guint32 nacl_rex_tag;
+static guint32 nacl_legacy_prefix_tag;
+
+void
+amd64_nacl_clear_legacy_prefix_tag ()
+{
+ TlsSetValue (nacl_legacy_prefix_tag, NULL);
+}
+
+void
+amd64_nacl_tag_legacy_prefix (guint8* code)
+{
+ if (TlsGetValue (nacl_legacy_prefix_tag) == NULL)
+ TlsSetValue (nacl_legacy_prefix_tag, code);
+}
+
+void
+amd64_nacl_tag_rex (guint8* code)
+{
+ TlsSetValue (nacl_rex_tag, code);
+}
+
+guint8*
+amd64_nacl_get_legacy_prefix_tag ()
+{
+ return (guint8*)TlsGetValue (nacl_legacy_prefix_tag);
+}
+
+guint8*
+amd64_nacl_get_rex_tag ()
+{
+ return (guint8*)TlsGetValue (nacl_rex_tag);
+}
+
+/* Increment the instruction "depth" described above */
+void
+amd64_nacl_instruction_pre ()
+{
+ intptr_t depth = (intptr_t) TlsGetValue (nacl_instruction_depth);
+ depth++;
+ TlsSetValue (nacl_instruction_depth, (gpointer)depth);
+}
+
+/* amd64_nacl_instruction_post: Decrement instruction "depth", force bundle */
+/* alignment if depth == 0 (top level instruction) */
+/* IN: start, end pointers to instruction beginning and end */
+/* OUT: start, end pointers to beginning and end after possible alignment */
+/* GLOBALS: nacl_instruction_depth defined above */
+void
+amd64_nacl_instruction_post (guint8 **start, guint8 **end)
+{
+ intptr_t depth = (intptr_t) TlsGetValue(nacl_instruction_depth);
+ depth--;
+ TlsSetValue (nacl_instruction_depth, (void*)depth);
+
+ g_assert ( depth >= 0 );
+ if (depth == 0) {
+ uintptr_t space_in_block;
+ uintptr_t instlen;
+ guint8 *prefix = amd64_nacl_get_legacy_prefix_tag ();
+ /* if legacy prefix is present, and if it was emitted before */
+ /* the start of the instruction sequence, adjust the start */
+ if (prefix != NULL && prefix < *start) {
+ g_assert (*start - prefix <= 3);/* only 3 are allowed */
+ *start = prefix;
+ }
+ space_in_block = kNaClAlignment - ((uintptr_t)(*start) & kNaClAlignmentMask);
+ instlen = (uintptr_t)(*end - *start);
+ /* Only check for instructions which are less than */
+ /* kNaClAlignment. The only instructions that should ever */
+ /* be that long are call sequences, which are already */
+ /* padded out to align the return to the next bundle. */
+ if (instlen > space_in_block && instlen < kNaClAlignment) {
+ const size_t MAX_NACL_INST_LENGTH = kNaClAlignment;
+ guint8 copy_of_instruction[MAX_NACL_INST_LENGTH];
+ const size_t length = (size_t)((*end)-(*start));
+ g_assert (length < MAX_NACL_INST_LENGTH);
+
+ memcpy (copy_of_instruction, *start, length);
+ *start = mono_arch_nacl_pad (*start, space_in_block);
+ memcpy (*start, copy_of_instruction, length);
+ *end = *start + length;
+ }
+ amd64_nacl_clear_legacy_prefix_tag ();
+ amd64_nacl_tag_rex (NULL);
+ }
+}
+
+/* amd64_nacl_membase_handler: ensure all access to memory of the form */
+/* OFFSET(%rXX) is sandboxed. For allowable base registers %rip, %rbp, */
+/* %rsp, and %r15, emit the membase as usual. For all other registers, */
+/* make sure the upper 32-bits are cleared, and use that register in the */
+/* index field of a new address of this form: OFFSET(%r15,%eXX,1) */
+/* IN: code */
+/* pointer to current instruction stream (in the */
+/* middle of an instruction, after opcode is emitted) */
+/* basereg/offset/dreg */
+/* operands of normal membase address */
+/* OUT: code */
+/* pointer to the end of the membase/memindex emit */
+/* GLOBALS: nacl_rex_tag */
+/* position in instruction stream that rex prefix was emitted */
+/* nacl_legacy_prefix_tag */
+/* (possibly NULL) position in instruction of legacy x86 prefix */
+void
+amd64_nacl_membase_handler (guint8** code, gint8 basereg, gint32 offset, gint8 dreg)
+{
+ gint8 true_basereg = basereg;
+
+ /* Cache these values, they might change */
+ /* as new instructions are emitted below. */
+ guint8* rex_tag = amd64_nacl_get_rex_tag ();
+ guint8* legacy_prefix_tag = amd64_nacl_get_legacy_prefix_tag ();
+
+ /* 'basereg' is given masked to 0x7 at this point, so check */
+ /* the rex prefix to see if this is an extended register. */
+ if ((rex_tag != NULL) && IS_REX(*rex_tag) && (*rex_tag & AMD64_REX_B)) {
+ true_basereg |= 0x8;
+ }
+
+#define X86_LEA_OPCODE (0x8D)
+
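+	/* LEA only computes an address and never dereferences it, so it is */
+	/* exempt from the sandboxing below.                                 */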
+ if (!amd64_is_valid_nacl_base (true_basereg) && (*(*code-1) != X86_LEA_OPCODE)) {
+ guint8* old_instruction_start;
+
+ /* This will hold the 'mov %eXX, %eXX' that clears the upper */
+ /* 32-bits of the old base register (new index register) */
+ guint8 buf[32];
+ guint8* buf_ptr = buf;
+ size_t insert_len;
+
+ g_assert (rex_tag != NULL);
+
+ if (IS_REX(*rex_tag)) {
+ /* The old rex.B should be the new rex.X */
+ if (*rex_tag & AMD64_REX_B) {
+ *rex_tag |= AMD64_REX_X;
+ }
+ /* Since our new base is %r15 set rex.B */
+ *rex_tag |= AMD64_REX_B;
+ } else {
+ /* Shift the instruction by one byte */
+ /* so we can insert a rex prefix */
+ memmove (rex_tag + 1, rex_tag, (size_t)(*code - rex_tag));
+ *code += 1;
+ /* New rex prefix only needs rex.B for %r15 base */
+ *rex_tag = AMD64_REX(AMD64_REX_B);
+ }
+
+ if (legacy_prefix_tag) {
+ old_instruction_start = legacy_prefix_tag;
+ } else {
+ old_instruction_start = rex_tag;
+ }
+
+ /* Clears the upper 32-bits of the previous base register */
+ amd64_mov_reg_reg_size (buf_ptr, true_basereg, true_basereg, 4);
+ insert_len = buf_ptr - buf;
+
+ /* Move the old instruction forward to make */
+ /* room for 'mov' stored in 'buf_ptr' */
+ memmove (old_instruction_start + insert_len, old_instruction_start, (size_t)(*code - old_instruction_start));
+ *code += insert_len;
+ memcpy (old_instruction_start, buf, insert_len);
+
+ /* Sandboxed replacement for the normal membase_emit */
+ x86_memindex_emit (*code, dreg, AMD64_R15, offset, basereg, 0);
+
+ } else {
+ /* Normal default behavior, emit membase memory location */
+ x86_membase_emit_body (*code, dreg, basereg, offset);
+ }
+}
+
+
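+/* Skip over any run of the canonical 1- to 8-byte nop encodings */
+/* (0x90, 0x66 0x90 and the 0x0f 0x1f forms) used for padding.   */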
+static inline unsigned char*
+amd64_skip_nops (unsigned char* code)
+{
+ guint8 in_nop;
+ do {
+ in_nop = 0;
+ if ( code[0] == 0x90) {
+ in_nop = 1;
+ code += 1;
+ }
+ if ( code[0] == 0x66 && code[1] == 0x90) {
+ in_nop = 1;
+ code += 2;
+ }
+ if (code[0] == 0x0f && code[1] == 0x1f
+ && code[2] == 0x00) {
+ in_nop = 1;
+ code += 3;
+ }
+ if (code[0] == 0x0f && code[1] == 0x1f
+ && code[2] == 0x40 && code[3] == 0x00) {
+ in_nop = 1;
+ code += 4;
+ }
+ if (code[0] == 0x0f && code[1] == 0x1f
+ && code[2] == 0x44 && code[3] == 0x00
+ && code[4] == 0x00) {
+ in_nop = 1;
+ code += 5;
+ }
+ if (code[0] == 0x66 && code[1] == 0x0f
+ && code[2] == 0x1f && code[3] == 0x44
+ && code[4] == 0x00 && code[5] == 0x00) {
+ in_nop = 1;
+ code += 6;
+ }
+ if (code[0] == 0x0f && code[1] == 0x1f
+ && code[2] == 0x80 && code[3] == 0x00
+ && code[4] == 0x00 && code[5] == 0x00
+ && code[6] == 0x00) {
+ in_nop = 1;
+ code += 7;
+ }
+ if (code[0] == 0x0f && code[1] == 0x1f
+ && code[2] == 0x84 && code[3] == 0x00
+ && code[4] == 0x00 && code[5] == 0x00
+ && code[6] == 0x00 && code[7] == 0x00) {
+ in_nop = 1;
+ code += 8;
+ }
+ } while ( in_nop );
+ return code;
+}
+
+guint8*
+mono_arch_nacl_skip_nops (guint8* code)
+{
+ return amd64_skip_nops(code);
+}
+
+#endif /*__native_client_codegen__*/
+
static inline void
amd64_patch (unsigned char* code, gpointer target)
{
guint8 rex = 0;
+#ifdef __native_client_codegen__
+ code = amd64_skip_nops (code);
+#endif
+#if defined(__native_client_codegen__) && defined(__native_client__)
+ if (nacl_is_code_address (code)) {
+ /* For tail calls, code is patched after being installed */
+ /* but not through the normal "patch callsite" method. */
+ unsigned char buf[kNaClAlignment];
+		unsigned char *aligned_code = (unsigned char *)((uintptr_t)code & ~kNaClAlignmentMask);
+ int ret;
+ memcpy (buf, aligned_code, kNaClAlignment);
+ /* Patch a temp buffer of bundle size, */
+ /* then install to actual location. */
+ amd64_patch (buf + ((uintptr_t)code - (uintptr_t)aligned_code), target);
+ ret = nacl_dyncode_modify (aligned_code, buf, kNaClAlignment);
+ g_assert (ret == 0);
+ return;
+ }
+ target = nacl_modify_patch_target (target);
+#endif
+
/* Skip REX */
if ((code [0] >= 0x40) && (code [0] <= 0x4f)) {
rex = code [0];
guint32 reg_usage;
guint32 freg_usage;
gboolean need_stack_align;
+ gboolean vtype_retaddr;
+ /* The index of the vret arg in the argument list */
+ int vret_arg_index;
ArgInfo ret;
ArgInfo sig_cookie;
ArgInfo args [1];
#define DEBUG(a) if (cfg->verbose_level > 1) a
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
#define PARAM_REGS 4
static AMD64_Reg_No param_regs [] = { AMD64_RCX, AMD64_RDX, AMD64_R8, AMD64_R9 };
if (*gr >= PARAM_REGS) {
ainfo->storage = ArgOnStack;
- (*stack_size) += sizeof (gpointer);
+ /* Since the same stack slot size is used for all arg */
+ /* types, it needs to be big enough to hold them all */
+ (*stack_size) += sizeof(mgreg_t);
}
else {
ainfo->storage = ArgInIReg;
}
}
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
#define FLOAT_PARAM_REGS 4
#else
#define FLOAT_PARAM_REGS 8
if (*gr >= FLOAT_PARAM_REGS) {
ainfo->storage = ArgOnStack;
- (*stack_size) += sizeof (gpointer);
+ /* Since the same stack slot size is used for both float */
+ /* types, it needs to be big enough to hold them both */
+ (*stack_size) += sizeof(mgreg_t);
}
else {
/* A double register */
break;
case MONO_TYPE_R4:
case MONO_TYPE_R8:
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
class2 = ARG_CLASS_INTEGER;
#else
class2 = ARG_CLASS_SSE;
return class1;
}
+#ifdef __native_client_codegen__
+const guint kNaClAlignment = kNaClAlignmentAMD64;
+const guint kNaClAlignmentMask = kNaClAlignmentMaskAMD64;
+
+/* Default alignment for Native Client is 32-byte. */
+gint8 nacl_align_byte = -32; /* signed version of 0xe0 */
+
+/* mono_arch_nacl_pad: Add 'pad' bytes of alignment (nop) instructions at 'code'. */
+/* Asserts that the padding does not cross a bundle (alignment) boundary. */
+guint8*
+mono_arch_nacl_pad(guint8 *code, int pad)
+{
+ const int kMaxPadding = 8; /* see amd64-codegen.h:amd64_padding_size() */
+
+ if (pad == 0) return code;
+ /* assertion: alignment cannot cross a block boundary */
+ g_assert (((uintptr_t)code & (~kNaClAlignmentMask)) ==
+ (((uintptr_t)code + pad - 1) & (~kNaClAlignmentMask)));
+ while (pad >= kMaxPadding) {
+ amd64_padding (code, kMaxPadding);
+ pad -= kMaxPadding;
+ }
+ if (pad != 0) amd64_padding (code, pad);
+ return code;
+}
+#endif
static void
add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type,
guint32 *gr, guint32 *fr, guint32 *stack_size)
{
guint32 size, quad, nquads, i;
+ /* Keep track of the size used in each quad so we can */
+ /* use the right size when copying args/return vars. */
+ guint32 quadsize [2] = {8, 8};
ArgumentClass args [2];
MonoMarshalType *info = NULL;
MonoClass *klass;
klass = mono_class_from_mono_type (type);
size = mini_type_stack_size_full (gsctx, &klass->byval_arg, NULL, sig->pinvoke);
-#ifndef PLATFORM_WIN32
+#ifndef HOST_WIN32
if (!sig->pinvoke && !disable_vtypes_in_regs && ((is_return && (size == 8)) || (!is_return && (size <= 16)))) {
/* We pass and return vtypes of size 8 in a register */
} else if (!sig->pinvoke || (size == 0) || (size > 16)) {
}
#endif
+ /* If this struct can't be split up naturally into 8-byte */
+ /* chunks (registers), pass it on the stack. */
+	if (sig->pinvoke && !pass_on_stack) {
+		guint32 align;
+		guint32 field_size;
+
+		info = mono_marshal_load_type_info (klass);
+		g_assert (info);
+		for (i = 0; i < info->num_fields; ++i) {
+ field_size = mono_marshal_type_size (info->fields [i].field->type,
+ info->fields [i].mspec,
+ &align, TRUE, klass->unicode);
+ if ((info->fields [i].offset < 8) && (info->fields [i].offset + field_size) > 8) {
+ pass_on_stack = TRUE;
+ break;
+ }
+ }
+ }
+
if (pass_on_stack) {
		/* Always pass in memory */
ainfo->offset = *stack_size;
info = mono_marshal_load_type_info (klass);
g_assert (info);
-#ifndef PLATFORM_WIN32
+#ifndef HOST_WIN32
if (info->native_size > 16) {
ainfo->offset = *stack_size;
*stack_size += ALIGN_TO (info->native_size, 8);
if ((quad == 1) && (info->fields [i].offset < 8))
continue;
+ /* How far into this quad this data extends.*/
+ /* (8 is size of quad) */
+ quadsize [quad] = info->fields [i].offset + size - (quad * 8);
+
class1 = merge_argument_class_from_type (info->fields [i].field->type, class1);
}
g_assert (class1 != ARG_CLASS_NO_CLASS);
if (*fr >= FLOAT_PARAM_REGS)
args [quad] = ARG_CLASS_MEMORY;
else {
- ainfo->pair_storage [quad] = ArgInDoubleSSEReg;
+ if (quadsize[quad] <= 4)
+ ainfo->pair_storage [quad] = ArgInFloatSSEReg;
+ else ainfo->pair_storage [quad] = ArgInDoubleSSEReg;
ainfo->pair_regs [quad] = *fr;
(*fr) ++;
}
if (sig->pinvoke)
*stack_size += ALIGN_TO (info->native_size, 8);
else
- *stack_size += nquads * sizeof (gpointer);
+ *stack_size += nquads * sizeof(mgreg_t);
ainfo->storage = ArgOnStack;
}
}
* Draft Version 0.23" document for more information.
*/
static CallInfo*
-get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke)
+get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSignature *sig)
{
- guint32 i, gr, fr;
+ guint32 i, gr, fr, pstart;
MonoType *ret_type;
int n = sig->hasthis + sig->param_count;
guint32 stack_size = 0;
CallInfo *cinfo;
+ gboolean is_pinvoke = sig->pinvoke;
if (mp)
cinfo = mono_mempool_alloc0 (mp, sizeof (CallInfo) + (sizeof (ArgInfo) * n));
else
cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));
+ cinfo->nargs = n;
+
gr = 0;
fr = 0;
cinfo->ret.reg = AMD64_XMM0;
break;
case MONO_TYPE_GENERICINST:
- if (!mono_type_generic_inst_is_valuetype (sig->ret)) {
+ if (!mono_type_generic_inst_is_valuetype (ret_type)) {
cinfo->ret.storage = ArgInIReg;
cinfo->ret.reg = AMD64_RAX;
break;
guint32 tmp_gr = 0, tmp_fr = 0, tmp_stacksize = 0;
add_valuetype (gsctx, sig, &cinfo->ret, sig->ret, TRUE, &tmp_gr, &tmp_fr, &tmp_stacksize);
- if (cinfo->ret.storage == ArgOnStack)
+ if (cinfo->ret.storage == ArgOnStack) {
+ cinfo->vtype_retaddr = TRUE;
/* The caller passes the address where the value is stored */
- add_general (&gr, &stack_size, &cinfo->ret);
+ }
break;
}
case MONO_TYPE_TYPEDBYREF:
/* Same as a valuetype with size 24 */
- add_general (&gr, &stack_size, &cinfo->ret);
- ;
+ cinfo->vtype_retaddr = TRUE;
break;
case MONO_TYPE_VOID:
break;
}
}
- /* this */
- if (sig->hasthis)
- add_general (&gr, &stack_size, cinfo->args + 0);
+ pstart = 0;
+ /*
+ * To simplify get_this_arg_reg () and LLVM integration, emit the vret arg after
+ * the first argument, allowing 'this' to be always passed in the first arg reg.
+ * Also do this if the first argument is a reference type, since virtual calls
+ * are sometimes made using calli without sig->hasthis set, like in the delegate
+ * invoke wrappers.
+ */
+ if (cinfo->vtype_retaddr && !is_pinvoke && (sig->hasthis || (sig->param_count > 0 && MONO_TYPE_IS_REFERENCE (mini_type_get_underlying_type (gsctx, sig->params [0]))))) {
+ if (sig->hasthis) {
+ add_general (&gr, &stack_size, cinfo->args + 0);
+ } else {
+ add_general (&gr, &stack_size, &cinfo->args [sig->hasthis + 0]);
+ pstart = 1;
+ }
+ add_general (&gr, &stack_size, &cinfo->ret);
+ cinfo->vret_arg_index = 1;
+ } else {
+ /* this */
+ if (sig->hasthis)
+ add_general (&gr, &stack_size, cinfo->args + 0);
+
+ if (cinfo->vtype_retaddr)
+ add_general (&gr, &stack_size, &cinfo->ret);
+ }
if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == 0)) {
gr = PARAM_REGS;
add_general (&gr, &stack_size, &cinfo->sig_cookie);
}
- for (i = 0; i < sig->param_count; ++i) {
+ for (i = pstart; i < sig->param_count; ++i) {
ArgInfo *ainfo = &cinfo->args [sig->hasthis + i];
MonoType *ptype;
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
/* The float param registers and other param registers must be the same index on Windows x64.*/
if (gr > fr)
fr = gr;
add_general (&gr, &stack_size, &cinfo->sig_cookie);
}
- if (sig->params [i]->byref) {
- add_general (&gr, &stack_size, ainfo);
- continue;
- }
ptype = mini_type_get_underlying_type (gsctx, sig->params [i]);
switch (ptype->type) {
case MONO_TYPE_BOOLEAN:
add_valuetype (gsctx, sig, ainfo, sig->params [i], FALSE, &gr, &fr, &stack_size);
break;
case MONO_TYPE_TYPEDBYREF:
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
add_valuetype (gsctx, sig, ainfo, sig->params [i], FALSE, &gr, &fr, &stack_size);
#else
stack_size += sizeof (MonoTypedRef);
add_general (&gr, &stack_size, &cinfo->sig_cookie);
}
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
	// There are always 32 bytes reserved on the stack when calling on Win64
stack_size += 0x20;
#endif
mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJitArgumentInfo *arg_info)
{
int k;
- CallInfo *cinfo = get_call_info (NULL, NULL, csig, FALSE);
+ CallInfo *cinfo = get_call_info (NULL, NULL, csig);
guint32 args_size = cinfo->stack_usage;
/* The arguments are saved to a stack area in mono_arch_instrument_prolog */
return args_size;
}
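+/*
+ * mono_amd64_tail_call_supported:
+ *
+ *   A tail call is only possible if the callee needs no more stack space for its
+ * arguments than the caller, and if the callee does not return a vtype through a
+ * hidden address on the stack.
+ */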
+gboolean
+mono_amd64_tail_call_supported (MonoMethodSignature *caller_sig, MonoMethodSignature *callee_sig)
+{
+ CallInfo *c1, *c2;
+ gboolean res;
+
+ c1 = get_call_info (NULL, NULL, caller_sig);
+ c2 = get_call_info (NULL, NULL, callee_sig);
+ res = c1->stack_usage >= c2->stack_usage;
+ if (callee_sig->ret && MONO_TYPE_ISSTRUCT (callee_sig->ret) && c2->ret.storage != ArgValuetypeInReg)
+ /* An address on the callee's stack is passed as the first argument */
+ res = FALSE;
+
+ g_free (c1);
+ g_free (c2);
+
+ return res;
+}
+
static int
cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx)
{
+#if defined(MONO_CROSS_COMPILE)
+ return 0;
+#else
#ifndef _MSC_VER
__asm__ __volatile__ ("cpuid"
: "=a" (*p_eax), "=b" (*p_ebx), "=c" (*p_ecx), "=d" (*p_edx)
*p_edx = info[3];
#endif
return 1;
+#endif
}
/*
void
mono_arch_init (void)
{
+ int flags;
+
InitializeCriticalSection (&mini_arch_mutex);
+#if defined(__native_client_codegen__)
+ nacl_instruction_depth = TlsAlloc ();
+ TlsSetValue (nacl_instruction_depth, (gpointer)0);
+ nacl_rex_tag = TlsAlloc ();
+ nacl_legacy_prefix_tag = TlsAlloc ();
+#endif
+
+#ifdef MONO_ARCH_NOMAP32BIT
+ flags = MONO_MMAP_READ;
+ /* amd64_mov_reg_imm () + amd64_mov_reg_membase () */
+ breakpoint_size = 13;
+ breakpoint_fault_size = 3;
+ /* amd64_alu_membase_imm_size (code, X86_CMP, AMD64_R11, 0, 0, 4); */
+ single_step_fault_size = 5;
+#else
+ flags = MONO_MMAP_READ|MONO_MMAP_32BIT;
+ /* amd64_mov_reg_mem () */
+ breakpoint_size = 8;
+ breakpoint_fault_size = 8;
+ single_step_fault_size = 8;
+#endif
+
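+	/* bp_trigger_page is made inaccessible right away so enabled breakpoints    */
+	/* fault; ss_trigger_page is only protected while single stepping is active. */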
+ ss_trigger_page = mono_valloc (NULL, mono_pagesize (), flags);
+ bp_trigger_page = mono_valloc (NULL, mono_pagesize (), flags);
+ mono_mprotect (bp_trigger_page, mono_pagesize (), 0);
+
+ mono_aot_register_jit_icall ("mono_amd64_throw_exception", mono_amd64_throw_exception);
+ mono_aot_register_jit_icall ("mono_amd64_throw_corlib_exception", mono_amd64_throw_corlib_exception);
+ mono_aot_register_jit_icall ("mono_amd64_get_original_ip", mono_amd64_get_original_ip);
}
/*
mono_arch_cleanup (void)
{
DeleteCriticalSection (&mini_arch_mutex);
+#if defined(__native_client_codegen__)
+ TlsFree (nacl_instruction_depth);
+ TlsFree (nacl_rex_tag);
+ TlsFree (nacl_legacy_prefix_tag);
+#endif
}
/*
int eax, ebx, ecx, edx;
guint32 opts = 0;
- /* FIXME: AMD64 */
-
*exclude_mask = 0;
/* Feature Flags function, flags returned in EDX. */
if (cpuid (1, &eax, &ebx, &ecx, &edx)) {
*
* Returns a bitmask corresponding to all supported versions.
*
- * TODO detect other versions like SSE4a.
*/
guint32
mono_arch_cpu_enumerate_simd_versions (void)
if (cpuid (1, &eax, &ebx, &ecx, &edx)) {
if (edx & (1 << 25))
- sse_opts |= 1 << SIMD_VERSION_SSE1;
+ sse_opts |= SIMD_VERSION_SSE1;
if (edx & (1 << 26))
- sse_opts |= 1 << SIMD_VERSION_SSE2;
+ sse_opts |= SIMD_VERSION_SSE2;
if (ecx & (1 << 0))
- sse_opts |= 1 << SIMD_VERSION_SSE3;
+ sse_opts |= SIMD_VERSION_SSE3;
if (ecx & (1 << 9))
- sse_opts |= 1 << SIMD_VERSION_SSSE3;
+ sse_opts |= SIMD_VERSION_SSSE3;
if (ecx & (1 << 19))
- sse_opts |= 1 << SIMD_VERSION_SSE41;
+ sse_opts |= SIMD_VERSION_SSE41;
if (ecx & (1 << 20))
- sse_opts |= 1 << SIMD_VERSION_SSE42;
+ sse_opts |= SIMD_VERSION_SSE42;
+ }
+
+ /* Yes, all this needs to be done to check for sse4a.
+	   See: "AMD CPUID Specification"
+ */
+ if (cpuid (0x80000000, &eax, &ebx, &ecx, &edx)) {
+		/* eax greater than or equal to 0x80000001, ebx = 'htuA', ecx = 'DMAc', edx = 'itne' */
+ if ((((unsigned int) eax) >= 0x80000001) && (ebx == 0x68747541) && (ecx == 0x444D4163) && (edx == 0x69746E65)) {
+ cpuid (0x80000001, &eax, &ebx, &ecx, &edx);
+ if (ecx & (1 << 6))
+ sse_opts |= SIMD_VERSION_SSE4a;
+ }
}
+
return sse_opts;
}
+#ifndef DISABLE_JIT
+
GList *
mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
{
if (cfg->arch.omit_fp_computed)
return;
- header = mono_method_get_header (cfg->method);
+ header = cfg->header;
sig = mono_method_signature (cfg->method);
if (!cfg->arch.cinfo)
- cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
+ cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig);
cinfo = cfg->arch.cinfo;
/*
cfg->arch.omit_fp = TRUE;
cfg->arch.omit_fp_computed = TRUE;
+#ifdef __native_client_codegen__
+ /* NaCl modules may not change the value of RBP, so it cannot be */
+ /* used as a normal register, but it can be used as a frame pointer*/
+ cfg->disable_omit_fp = TRUE;
+ cfg->arch.omit_fp = FALSE;
+#endif
+
if (cfg->disable_omit_fp)
cfg->arch.omit_fp = FALSE;
regs = g_list_prepend (regs, (gpointer)AMD64_R12);
regs = g_list_prepend (regs, (gpointer)AMD64_R13);
regs = g_list_prepend (regs, (gpointer)AMD64_R14);
+#ifndef __native_client_codegen__
regs = g_list_prepend (regs, (gpointer)AMD64_R15);
+#endif
regs = g_list_prepend (regs, (gpointer)AMD64_R10);
regs = g_list_prepend (regs, (gpointer)AMD64_R9);
regs = g_list_prepend (regs, (gpointer)AMD64_R12);
regs = g_list_prepend (regs, (gpointer)AMD64_R13);
regs = g_list_prepend (regs, (gpointer)AMD64_R14);
+#ifndef __native_client_codegen__
regs = g_list_prepend (regs, (gpointer)AMD64_R15);
-#ifdef PLATFORM_WIN32
+#endif
+#ifdef HOST_WIN32
regs = g_list_prepend (regs, (gpointer)AMD64_RDI);
regs = g_list_prepend (regs, (gpointer)AMD64_RSI);
#endif
regs = g_list_prepend (regs, (gpointer)AMD64_R12);
regs = g_list_prepend (regs, (gpointer)AMD64_R13);
regs = g_list_prepend (regs, (gpointer)AMD64_R14);
+#ifndef __native_client_codegen__
regs = g_list_prepend (regs, (gpointer)AMD64_R15);
+#endif
regs = g_list_prepend (regs, (gpointer)AMD64_R10);
regs = g_list_prepend (regs, (gpointer)AMD64_R9);
int i;
CallInfo *cinfo;
- header = mono_method_get_header (cfg->method);
+ header = cfg->header;
sig = mono_method_signature (cfg->method);
gint32 *offsets;
CallInfo *cinfo;
- header = mono_method_get_header (cfg->method);
+ header = cfg->header;
sig = mono_method_signature (cfg->method);
/* Reserve space for caller saved registers */
for (i = 0; i < AMD64_NREG; ++i)
if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
- offset += sizeof (gpointer);
+ offset += sizeof(mgreg_t);
}
}
/* Allocate locals */
if (!cfg->globalra) {
offsets = mono_allocate_stack_slots_full (cfg, cfg->arch.omit_fp ? FALSE: TRUE, &locals_stack_size, &locals_stack_align);
+ if (locals_stack_size > MONO_ARCH_MAX_FRAME_SIZE) {
+ char *mname = mono_method_full_name (cfg->method, TRUE);
+ cfg->exception_type = MONO_EXCEPTION_INVALID_PROGRAM;
+ cfg->exception_message = g_strdup_printf ("Method %s stack is too big.", mname);
+ g_free (mname);
+ return;
+ }
+
if (locals_stack_align) {
offset += (locals_stack_align - 1);
offset &= ~(locals_stack_align - 1);
ins->opcode = OP_REGOFFSET;
ins->inst_basereg = cfg->frame_reg;
/* These arguments are saved to the stack in the prolog */
- offset = ALIGN_TO (offset, sizeof (gpointer));
+ offset = ALIGN_TO (offset, sizeof(mgreg_t));
if (cfg->arch.omit_fp) {
ins->inst_offset = offset;
- offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+ offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof(mgreg_t) : sizeof(mgreg_t);
} else {
- offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+ offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof(mgreg_t) : sizeof(mgreg_t);
ins->inst_offset = - offset;
}
break;
ins->opcode = OP_REGOFFSET;
ins->inst_basereg = cfg->frame_reg;
/* These arguments are saved to the stack in the prolog */
- offset = ALIGN_TO (offset, sizeof (gpointer));
+ offset = ALIGN_TO (offset, sizeof(mgreg_t));
if (cfg->arch.omit_fp) {
ins->inst_offset = offset;
- offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+ offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof(mgreg_t) : sizeof(mgreg_t);
+			// Arguments are not yet supported by the stack map creation code
+ //cfg->locals_max_stack_offset = MAX (cfg->locals_max_stack_offset, offset);
} else {
- offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+ offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof(mgreg_t) : sizeof(mgreg_t);
ins->inst_offset = - offset;
+ //cfg->locals_min_stack_offset = MIN (cfg->locals_min_stack_offset, offset);
}
}
}
sig = mono_method_signature (cfg->method);
if (!cfg->arch.cinfo)
- cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
+ cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig);
cinfo = cfg->arch.cinfo;
if (cinfo->ret.storage == ArgValuetypeInReg)
}
}
+ if (cfg->gen_seq_points) {
+ MonoInst *ins;
+
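+		/* Keep the address of the single step trigger page in a local, since    */
+		/* it may not be addressable with a 32 bit immediate (see OP_SEQ_POINT). */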
+ ins = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
+ ins->flags |= MONO_INST_VOLATILE;
+ cfg->arch.ss_trigger_page_var = ins;
+ }
+
#ifdef MONO_AMD64_NO_PUSHES
/*
* When this is set, we pass arguments on the stack by moves, and by allocating
{
switch (storage) {
case ArgInIReg:
+#if defined(__mono_ilp32__)
+ return OP_LOADI8_MEMBASE;
+#else
return OP_LOAD_MEMBASE;
+#endif
case ArgInDoubleSSEReg:
return OP_LOADR8_MEMBASE;
case ArgInFloatSSEReg:
* passed on the stack after the signature. So compensate by
* passing a different signature.
*/
- tmp_sig = mono_metadata_signature_dup (call->signature);
+ tmp_sig = mono_metadata_signature_dup_full (cfg->method->klass->image, call->signature);
tmp_sig->param_count -= call->signature->sentinelpos;
tmp_sig->sentinelpos = 0;
memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));
ArgInfo *ainfo;
int j;
LLVMCallInfo *linfo;
+ MonoType *t;
n = sig->param_count + sig->hasthis;
- cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
+ cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig);
linfo = mono_mempool_alloc0 (cfg->mempool, sizeof (LLVMCallInfo) + (sizeof (LLVMArgInfo) * n));
if (MONO_TYPE_ISSTRUCT (sig->ret) && cinfo->ret.storage == ArgInIReg) {
/* Vtype returned using a hidden argument */
linfo->ret.storage = LLVMArgVtypeRetAddr;
+ linfo->vret_arg_index = cinfo->vret_arg_index;
}
for (i = 0; i < n; ++i) {
ainfo = cinfo->args + i;
+ if (i >= sig->hasthis)
+ t = sig->params [i - sig->hasthis];
+ else
+ t = &mono_defaults.int_class->byval_arg;
+
linfo->args [i].storage = LLVMArgNone;
switch (ainfo->storage) {
linfo->args [i].storage = LLVMArgInFPReg;
break;
case ArgOnStack:
- if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
+ if (MONO_TYPE_ISSTRUCT (t)) {
linfo->args [i].storage = LLVMArgVtypeByVal;
} else {
linfo->args [i].storage = LLVMArgInIReg;
- if (!sig->params [i - sig->hasthis]->byref) {
- if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4) {
+ if (!t->byref) {
+ if (t->type == MONO_TYPE_R4)
linfo->args [i].storage = LLVMArgInFPReg;
- } else if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8) {
+ else if (t->type == MONO_TYPE_R8)
linfo->args [i].storage = LLVMArgInFPReg;
- }
}
}
break;
sig = call->signature;
n = sig->param_count + sig->hasthis;
- cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
+ cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig);
if (COMPILE_LLVM (cfg)) {
/* We shouldn't be called in the llvm case */
}
g_assert (in->klass);
+ if (ainfo->storage == ArgOnStack && size >= 10000) {
+ /* Avoid asserts in emit_memcpy () */
+ cfg->exception_type = MONO_EXCEPTION_INVALID_PROGRAM;
+ cfg->exception_message = g_strdup_printf ("Passing an argument of size '%d'.", size);
+ /* Continue normally */
+ }
+
if (size > 0) {
MONO_INST_NEW (cfg, arg, OP_OUTARG_VT);
arg->sreg1 = in->dreg;
}
}
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
if (call->inst.opcode != OP_JMP && OP_TAILCALL != call->inst.opcode) {
MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 0x20);
}
MONO_INST_NEW (cfg, load, arg_storage_to_load_membase (ainfo->pair_storage [part]));
load->inst_basereg = src->dreg;
- load->inst_offset = part * sizeof (gpointer);
+ load->inst_offset = part * sizeof(mgreg_t);
switch (ainfo->pair_storage [part]) {
case ArgInIReg:
{
MonoType *ret = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret);
- if (!ret->byref) {
- if (ret->type == MONO_TYPE_R4) {
- if (COMPILE_LLVM (cfg))
- MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
- else
- MONO_EMIT_NEW_UNALU (cfg, OP_AMD64_SET_XMMREG_R4, cfg->ret->dreg, val->dreg);
- return;
- } else if (ret->type == MONO_TYPE_R8) {
+ if (ret->type == MONO_TYPE_R4) {
+ if (COMPILE_LLVM (cfg))
MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
- return;
- }
+ else
+ MONO_EMIT_NEW_UNALU (cfg, OP_AMD64_SET_XMMREG_R4, cfg->ret->dreg, val->dreg);
+ return;
+ } else if (ret->type == MONO_TYPE_R8) {
+ MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
+ return;
}
MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, cfg->ret->dreg, val->dreg);
}
+#endif /* DISABLE_JIT */
+
#define EMIT_COND_BRANCH(ins,cond,sign) \
if (ins->inst_true_bb->native_offset) { \
x86_branch (code, cond, cfg->native_code + ins->inst_true_bb->native_offset, sign); \
x86_branch32 (code, cond, 0, sign); \
}
-/* emit an exception if condition is fail */
-#define EMIT_COND_SYSTEM_EXCEPTION(cond,signed,exc_name) \
- do { \
- MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
- if (tins == NULL) { \
- mono_add_patch_info (cfg, code - cfg->native_code, \
- MONO_PATCH_INFO_EXC, exc_name); \
- x86_branch32 (code, cond, 0, signed); \
- } else { \
- EMIT_COND_BRANCH (tins, cond, signed); \
- } \
- } while (0);
-
-#define EMIT_FPCOMPARE(code) do { \
- amd64_fcompp (code); \
- amd64_fnstsw (code); \
-} while (0);
+typedef struct {
+ MonoMethodSignature *sig;
+ CallInfo *cinfo;
+} ArchDynCallInfo;
-#define EMIT_SSE2_FPFUNC(code, op, dreg, sreg1) do { \
- amd64_movsd_membase_reg (code, AMD64_RSP, -8, (sreg1)); \
- amd64_fld_membase (code, AMD64_RSP, -8, TRUE); \
- amd64_ ##op (code); \
- amd64_fst_membase (code, AMD64_RSP, -8, TRUE, TRUE); \
- amd64_movsd_reg_membase (code, (dreg), AMD64_RSP, -8); \
-} while (0);
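+/* Layout of the buffer passed to OP_DYN_CALL: 'regs' holds the values loaded */
+/* into the argument registers, 'res' receives RAX after the call, and 'ret'  */
+/* points to the caller supplied return value buffer.                         */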
+typedef struct {
+ mgreg_t regs [PARAM_REGS];
+ mgreg_t res;
+ guint8 *ret;
+} DynCallArgs;
-static guint8*
-emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data)
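+/* Return whether OP_DYN_CALL can handle SIG: only arguments and return values */
+/* passed in integer registers (or as vtypes split across them) are supported. */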
+static gboolean
+dyn_call_supported (MonoMethodSignature *sig, CallInfo *cinfo)
{
- gboolean no_patch = FALSE;
+ int i;
- /*
- * FIXME: Add support for thunks
- */
- {
- gboolean near_call = FALSE;
+#ifdef HOST_WIN32
+ return FALSE;
+#endif
- /*
- * Indirect calls are expensive so try to make a near call if possible.
- * The caller memory is allocated by the code manager so it is
- * guaranteed to be at a 32 bit offset.
- */
+ switch (cinfo->ret.storage) {
+ case ArgNone:
+ case ArgInIReg:
+ break;
+ case ArgValuetypeInReg: {
+ ArgInfo *ainfo = &cinfo->ret;
- if (patch_type != MONO_PATCH_INFO_ABS) {
- /* The target is in memory allocated using the code manager */
- near_call = TRUE;
+ if (ainfo->pair_storage [0] != ArgNone && ainfo->pair_storage [0] != ArgInIReg)
+ return FALSE;
+ if (ainfo->pair_storage [1] != ArgNone && ainfo->pair_storage [1] != ArgInIReg)
+ return FALSE;
+ break;
+ }
+ default:
+ return FALSE;
+ }
- if ((patch_type == MONO_PATCH_INFO_METHOD) || (patch_type == MONO_PATCH_INFO_METHOD_JUMP)) {
- if (((MonoMethod*)data)->klass->image->aot_module)
- /* The callee might be an AOT method */
- near_call = FALSE;
- if (((MonoMethod*)data)->dynamic)
- /* The target is in malloc-ed memory */
- near_call = FALSE;
- }
+ for (i = 0; i < cinfo->nargs; ++i) {
+ ArgInfo *ainfo = &cinfo->args [i];
+ switch (ainfo->storage) {
+ case ArgInIReg:
+ break;
+ case ArgValuetypeInReg:
+ if (ainfo->pair_storage [0] != ArgNone && ainfo->pair_storage [0] != ArgInIReg)
+ return FALSE;
+ if (ainfo->pair_storage [1] != ArgNone && ainfo->pair_storage [1] != ArgInIReg)
+ return FALSE;
+ break;
+ default:
+ return FALSE;
+ }
+ }
- if (patch_type == MONO_PATCH_INFO_INTERNAL_METHOD) {
- /*
- * The call might go directly to a native function without
- * the wrapper.
- */
- MonoJitICallInfo *mi = mono_find_jit_icall_by_name (data);
+ return TRUE;
+}
+
+/*
+ * mono_arch_dyn_call_prepare:
+ *
+ * Return a pointer to an arch-specific structure which contains information
+ * needed by mono_arch_get_dyn_call_args (). Return NULL if OP_DYN_CALL is not
+ * supported for SIG.
+ * This function is equivalent to ffi_prep_cif in libffi.
+ */
+MonoDynCallInfo*
+mono_arch_dyn_call_prepare (MonoMethodSignature *sig)
+{
+ ArchDynCallInfo *info;
+ CallInfo *cinfo;
+
+ cinfo = get_call_info (NULL, NULL, sig);
+
+ if (!dyn_call_supported (sig, cinfo)) {
+ g_free (cinfo);
+ return NULL;
+ }
+
+ info = g_new0 (ArchDynCallInfo, 1);
+ // FIXME: Preprocess the info to speed up get_dyn_call_args ().
+ info->sig = sig;
+ info->cinfo = cinfo;
+
+ return (MonoDynCallInfo*)info;
+}
+
+/*
+ * mono_arch_dyn_call_free:
+ *
+ * Free a MonoDynCallInfo structure.
+ */
+void
+mono_arch_dyn_call_free (MonoDynCallInfo *info)
+{
+ ArchDynCallInfo *ainfo = (ArchDynCallInfo*)info;
+
+ g_free (ainfo->cinfo);
+ g_free (ainfo);
+}
+
+#if !defined(__native_client__)
+#define PTR_TO_GREG(ptr) (mgreg_t)(ptr)
+#define GREG_TO_PTR(greg) (gpointer)(greg)
+#else
+/* Correctly handle casts to/from 32-bit pointers without compiler warnings */
+#define PTR_TO_GREG(ptr) (mgreg_t)(uintptr_t)(ptr)
+#define GREG_TO_PTR(greg) (gpointer)(guint32)(greg)
+#endif
+
+/*
+ * mono_arch_start_dyn_call:
+ *
+ * Convert the arguments ARGS to a format which can be passed to OP_DYN_CALL, and
+ * store the result into BUF.
+ * ARGS should be an array of pointers pointing to the arguments.
+ * RET should point to a memory buffer large enough to hold the result of the
+ * call.
+ * This function should be as fast as possible, any work which does not depend
+ * on the actual values of the arguments should be done in
+ * mono_arch_dyn_call_prepare ().
+ * start_dyn_call + OP_DYN_CALL + finish_dyn_call is equivalent to ffi_call in
+ * libffi.
+ */
+void
+mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, guint8 *buf, int buf_len)
+{
+ ArchDynCallInfo *dinfo = (ArchDynCallInfo*)info;
+ DynCallArgs *p = (DynCallArgs*)buf;
+ int arg_index, greg, i, pindex;
+ MonoMethodSignature *sig = dinfo->sig;
+
+ g_assert (buf_len >= sizeof (DynCallArgs));
+
+ p->res = 0;
+ p->ret = ret;
+
+ arg_index = 0;
+ greg = 0;
+ pindex = 0;
+
+ if (sig->hasthis || dinfo->cinfo->vret_arg_index == 1) {
+ p->regs [greg ++] = PTR_TO_GREG(*(args [arg_index ++]));
+ if (!sig->hasthis)
+ pindex = 1;
+ }
+
+ if (dinfo->cinfo->vtype_retaddr)
+ p->regs [greg ++] = PTR_TO_GREG(ret);
+
+ for (i = pindex; i < sig->param_count; i++) {
+ MonoType *t = mono_type_get_underlying_type (sig->params [i]);
+ gpointer *arg = args [arg_index ++];
+
+ if (t->byref) {
+ p->regs [greg ++] = PTR_TO_GREG(*(arg));
+ continue;
+ }
+
+ switch (t->type) {
+ case MONO_TYPE_STRING:
+ case MONO_TYPE_CLASS:
+ case MONO_TYPE_ARRAY:
+ case MONO_TYPE_SZARRAY:
+ case MONO_TYPE_OBJECT:
+ case MONO_TYPE_PTR:
+ case MONO_TYPE_I:
+ case MONO_TYPE_U:
+#if !defined(__mono_ilp32__)
+ case MONO_TYPE_I8:
+ case MONO_TYPE_U8:
+#endif
+ g_assert (dinfo->cinfo->args [i + sig->hasthis].reg == param_regs [greg]);
+ p->regs [greg ++] = PTR_TO_GREG(*(arg));
+ break;
+#if defined(__mono_ilp32__)
+ case MONO_TYPE_I8:
+ case MONO_TYPE_U8:
+ g_assert (dinfo->cinfo->args [i + sig->hasthis].reg == param_regs [greg]);
+ p->regs [greg ++] = *(guint64*)(arg);
+ break;
+#endif
+ case MONO_TYPE_BOOLEAN:
+ case MONO_TYPE_U1:
+ p->regs [greg ++] = *(guint8*)(arg);
+ break;
+ case MONO_TYPE_I1:
+ p->regs [greg ++] = *(gint8*)(arg);
+ break;
+ case MONO_TYPE_I2:
+ p->regs [greg ++] = *(gint16*)(arg);
+ break;
+ case MONO_TYPE_U2:
+ case MONO_TYPE_CHAR:
+ p->regs [greg ++] = *(guint16*)(arg);
+ break;
+ case MONO_TYPE_I4:
+ p->regs [greg ++] = *(gint32*)(arg);
+ break;
+ case MONO_TYPE_U4:
+ p->regs [greg ++] = *(guint32*)(arg);
+ break;
+ case MONO_TYPE_GENERICINST:
+ if (MONO_TYPE_IS_REFERENCE (t)) {
+ p->regs [greg ++] = PTR_TO_GREG(*(arg));
+ break;
+ } else {
+ /* Fall through */
+ }
+ case MONO_TYPE_VALUETYPE: {
+ ArgInfo *ainfo = &dinfo->cinfo->args [i + sig->hasthis];
+
+ g_assert (ainfo->storage == ArgValuetypeInReg);
+ if (ainfo->pair_storage [0] != ArgNone) {
+ g_assert (ainfo->pair_storage [0] == ArgInIReg);
+ p->regs [greg ++] = ((mgreg_t*)(arg))[0];
+ }
+ if (ainfo->pair_storage [1] != ArgNone) {
+ g_assert (ainfo->pair_storage [1] == ArgInIReg);
+ p->regs [greg ++] = ((mgreg_t*)(arg))[1];
+ }
+ break;
+ }
+ default:
+ g_assert_not_reached ();
+ }
+ }
+
+ g_assert (greg <= PARAM_REGS);
+}
+
+/*
+ * mono_arch_finish_dyn_call:
+ *
+ * Store the result of a dyn call into the return value buffer passed to
+ * start_dyn_call ().
+ * This function should be as fast as possible, any work which does not depend
+ * on the actual values of the arguments should be done in
+ * mono_arch_dyn_call_prepare ().
+ */
+void
+mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf)
+{
+ ArchDynCallInfo *dinfo = (ArchDynCallInfo*)info;
+ MonoMethodSignature *sig = dinfo->sig;
+ guint8 *ret = ((DynCallArgs*)buf)->ret;
+ mgreg_t res = ((DynCallArgs*)buf)->res;
+
+ switch (mono_type_get_underlying_type (sig->ret)->type) {
+ case MONO_TYPE_VOID:
+ *(gpointer*)ret = NULL;
+ break;
+ case MONO_TYPE_STRING:
+ case MONO_TYPE_CLASS:
+ case MONO_TYPE_ARRAY:
+ case MONO_TYPE_SZARRAY:
+ case MONO_TYPE_OBJECT:
+ case MONO_TYPE_I:
+ case MONO_TYPE_U:
+ case MONO_TYPE_PTR:
+ *(gpointer*)ret = GREG_TO_PTR(res);
+ break;
+ case MONO_TYPE_I1:
+ *(gint8*)ret = res;
+ break;
+ case MONO_TYPE_U1:
+ case MONO_TYPE_BOOLEAN:
+ *(guint8*)ret = res;
+ break;
+ case MONO_TYPE_I2:
+ *(gint16*)ret = res;
+ break;
+ case MONO_TYPE_U2:
+ case MONO_TYPE_CHAR:
+ *(guint16*)ret = res;
+ break;
+ case MONO_TYPE_I4:
+ *(gint32*)ret = res;
+ break;
+ case MONO_TYPE_U4:
+ *(guint32*)ret = res;
+ break;
+ case MONO_TYPE_I8:
+ *(gint64*)ret = res;
+ break;
+ case MONO_TYPE_U8:
+ *(guint64*)ret = res;
+ break;
+ case MONO_TYPE_GENERICINST:
+ if (MONO_TYPE_IS_REFERENCE (sig->ret)) {
+ *(gpointer*)ret = GREG_TO_PTR(res);
+ break;
+ } else {
+ /* Fall through */
+ }
+ case MONO_TYPE_VALUETYPE:
+ if (dinfo->cinfo->vtype_retaddr) {
+ /* Nothing to do */
+ } else {
+ ArgInfo *ainfo = &dinfo->cinfo->ret;
+
+ g_assert (ainfo->storage == ArgValuetypeInReg);
+
+ if (ainfo->pair_storage [0] != ArgNone) {
+ g_assert (ainfo->pair_storage [0] == ArgInIReg);
+ ((mgreg_t*)ret)[0] = res;
+ }
+
+ g_assert (ainfo->pair_storage [1] == ArgNone);
+ }
+ break;
+ default:
+ g_assert_not_reached ();
+ }
+}
+
+/* emit an exception if condition is fail */
+#define EMIT_COND_SYSTEM_EXCEPTION(cond,signed,exc_name) \
+ do { \
+ MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
+ if (tins == NULL) { \
+ mono_add_patch_info (cfg, code - cfg->native_code, \
+ MONO_PATCH_INFO_EXC, exc_name); \
+ x86_branch32 (code, cond, 0, signed); \
+ } else { \
+ EMIT_COND_BRANCH (tins, cond, signed); \
+ } \
+ } while (0);
+
+#define EMIT_FPCOMPARE(code) do { \
+ amd64_fcompp (code); \
+ amd64_fnstsw (code); \
+} while (0);
+
+#define EMIT_SSE2_FPFUNC(code, op, dreg, sreg1) do { \
+ amd64_movsd_membase_reg (code, AMD64_RSP, -8, (sreg1)); \
+ amd64_fld_membase (code, AMD64_RSP, -8, TRUE); \
+ amd64_ ##op (code); \
+ amd64_fst_membase (code, AMD64_RSP, -8, TRUE, TRUE); \
+ amd64_movsd_reg_membase (code, (dreg), AMD64_RSP, -8); \
+} while (0);
+
+static guint8*
+emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data)
+{
+ gboolean no_patch = FALSE;
+
+ /*
+ * FIXME: Add support for thunks
+ */
+ {
+ gboolean near_call = FALSE;
+
+ /*
+ * Indirect calls are expensive so try to make a near call if possible.
+ * The caller memory is allocated by the code manager so it is
+ * guaranteed to be at a 32 bit offset.
+ */
+
+ if (patch_type != MONO_PATCH_INFO_ABS) {
+ /* The target is in memory allocated using the code manager */
+ near_call = TRUE;
+
+ if ((patch_type == MONO_PATCH_INFO_METHOD) || (patch_type == MONO_PATCH_INFO_METHOD_JUMP)) {
+ if (((MonoMethod*)data)->klass->image->aot_module)
+ /* The callee might be an AOT method */
+ near_call = FALSE;
+ if (((MonoMethod*)data)->dynamic)
+ /* The target is in malloc-ed memory */
+ near_call = FALSE;
+ }
+
+ if (patch_type == MONO_PATCH_INFO_INTERNAL_METHOD) {
+ /*
+ * The call might go directly to a native function without
+ * the wrapper.
+ */
+ MonoJitICallInfo *mi = mono_find_jit_icall_by_name (data);
if (mi) {
gconstpointer target = mono_icall_get_wrapper (mi);
if ((((guint64)target) >> 32) != 0)
/* These methods are allocated using malloc */
near_call = FALSE;
+#ifdef MONO_ARCH_NOMAP32BIT
+ near_call = FALSE;
+#endif
+
+ /* The 64bit XEN kernel does not honour the MAP_32BIT flag. (#522894) */
+ if (optimize_for_xen)
+ near_call = FALSE;
+
if (cfg->compile_aot) {
near_call = TRUE;
no_patch = TRUE;
}
-#ifdef MONO_ARCH_NOMAP32BIT
- near_call = FALSE;
-#endif
-
if (near_call) {
/*
* Align the call displacement to an address divisible by 4 so it does
* not span cache lines. This is required for code patching to work on SMP
* systems.
*/
- if (!no_patch && ((guint32)(code + 1 - cfg->native_code) % 4) != 0)
- amd64_padding (code, 4 - ((guint32)(code + 1 - cfg->native_code) % 4));
+ if (!no_patch && ((guint32)(code + 1 - cfg->native_code) % 4) != 0) {
+ guint32 pad_size = 4 - ((guint32)(code + 1 - cfg->native_code) % 4);
+ amd64_padding (code, pad_size);
+ }
mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
amd64_call_code (code, 0);
}
static inline guint8*
emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data, gboolean win64_adjust_stack)
{
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
if (win64_adjust_stack)
amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 32);
#endif
code = emit_call_body (cfg, code, patch_type, data);
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
if (win64_adjust_stack)
amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 32);
#endif
return -1;
}
+#ifndef DISABLE_JIT
+
#define INST_IGNORES_CFLAGS(opcode) (!(((opcode) == OP_ADC) || ((opcode) == OP_ADC_IMM) || ((opcode) == OP_IADC) || ((opcode) == OP_IADC_IMM) || ((opcode) == OP_SBB) || ((opcode) == OP_SBB_IMM) || ((opcode) == OP_ISBB) || ((opcode) == OP_ISBB_IMM)))
/*
if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM) || (ins2->opcode == OP_STORE_MEMBASE_IMM)) && (ins2->inst_imm == 0)) {
ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode);
ins2->sreg1 = ins->dreg;
- } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_REG) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG)) {
+ } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_REG) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG) || (ins2->opcode == OP_LIVERANGE_START)) {
/* Continue */
} else if (((ins2->opcode == OP_ICONST) || (ins2->opcode == OP_I8CONST)) && (ins2->dreg == ins->dreg) && (ins2->inst_c0 == 0)) {
NULLIFY_INS (ins2);
ins->sreg2 = temp->dreg;
}
break;
+#ifndef __mono_ilp32__
case OP_LOAD_MEMBASE:
+#endif
case OP_LOADI8_MEMBASE:
+#ifndef __native_client_codegen__
+ /* Don't generate memindex opcodes (to simplify */
+ /* read sandboxing) */
if (!amd64_is_imm32 (ins->inst_offset)) {
NEW_INS (cfg, ins, temp, OP_I8CONST);
temp->inst_c0 = ins->inst_offset;
ins->opcode = OP_AMD64_LOADI8_MEMINDEX;
ins->inst_indexreg = temp->dreg;
}
+#endif
break;
+#ifndef __mono_ilp32__
case OP_STORE_MEMBASE_IMM:
+#endif
case OP_STOREI8_MEMBASE_IMM:
if (!amd64_is_imm32 (ins->inst_imm)) {
NEW_INS (cfg, ins, temp, OP_I8CONST);
ins->sreg1 = temp->dreg;
}
break;
+#ifdef MONO_ARCH_SIMD_INTRINSICS
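+	/* There is no convenient way to broadcast a single byte, so lower   */
+	/* OP_EXPAND_I1 to OP_EXPAND_I2: zero extend the byte, duplicate it  */
+	/* into both halves of a 16 bit value, then expand that.             */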
+ case OP_EXPAND_I1: {
+ int temp_reg1 = mono_alloc_ireg (cfg);
+ int temp_reg2 = mono_alloc_ireg (cfg);
+ int original_reg = ins->sreg1;
+
+ NEW_INS (cfg, ins, temp, OP_ICONV_TO_U1);
+ temp->sreg1 = original_reg;
+ temp->dreg = temp_reg1;
+
+ NEW_INS (cfg, ins, temp, OP_SHL_IMM);
+ temp->sreg1 = temp_reg1;
+ temp->dreg = temp_reg2;
+ temp->inst_imm = 8;
+
+ NEW_INS (cfg, ins, temp, OP_LOR);
+ temp->sreg1 = temp->dreg = temp_reg2;
+ temp->sreg2 = temp_reg1;
+
+ ins->opcode = OP_EXPAND_I2;
+ ins->sreg1 = temp_reg2;
+ }
+ break;
+#endif
default:
break;
}
int sreg = tree->sreg1;
int need_touch = FALSE;
-#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
+#if defined(HOST_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
	if (!(tree->flags & MONO_INST_INIT))
need_touch = TRUE;
#endif
if (cfg->param_area && cfg->arch.no_pushes)
amd64_alu_reg_imm (code, X86_ADD, AMD64_RDI, cfg->param_area);
amd64_cld (code);
+#if defined(__default_codegen__)
amd64_prefix (code, X86_REP_PREFIX);
amd64_stosl (code);
+#elif defined(__native_client_codegen__)
+ /* NaCl stos pseudo-instruction */
+ amd64_codegen_pre(code);
+ /* First, clear the upper 32 bits of RDI (mov %edi, %edi) */
+ amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RDI, 4);
+ /* Add %r15 to %rdi using lea, condition flags unaffected. */
+ amd64_lea_memindex_size (code, AMD64_RDI, AMD64_R15, 0, AMD64_RDI, 0, 8);
+ amd64_prefix (code, X86_REP_PREFIX);
+ amd64_stosl (code);
+ amd64_codegen_post(code);
+#endif /* __native_client_codegen__ */
if (tree->dreg != AMD64_RDI && sreg != AMD64_RDI)
amd64_pop_reg (code, AMD64_RDI);
case OP_VCALL2:
case OP_VCALL2_REG:
case OP_VCALL2_MEMBASE:
- cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, ((MonoCallInst*)ins)->signature, FALSE);
+ cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, ((MonoCallInst*)ins)->signature);
if (cinfo->ret.storage == ArgValuetypeInReg) {
MonoInst *loc = cfg->arch.vret_addr_loc;
/* Load the destination address */
g_assert (loc->opcode == OP_REGOFFSET);
- amd64_mov_reg_membase (code, AMD64_RCX, loc->inst_basereg, loc->inst_offset, 8);
+ amd64_mov_reg_membase (code, AMD64_RCX, loc->inst_basereg, loc->inst_offset, sizeof(gpointer));
for (quad = 0; quad < 2; quad ++) {
switch (cinfo->ret.pair_storage [quad]) {
case ArgInIReg:
- amd64_mov_membase_reg (code, AMD64_RCX, (quad * 8), cinfo->ret.pair_regs [quad], 8);
+ amd64_mov_membase_reg (code, AMD64_RCX, (quad * sizeof(mgreg_t)), cinfo->ret.pair_regs [quad], sizeof(mgreg_t));
break;
case ArgInFloatSSEReg:
amd64_movss_membase_reg (code, AMD64_RCX, (quad * 8), cinfo->ret.pair_regs [quad]);
return code;
}
+#endif /* DISABLE_JIT */
+
/*
* mono_amd64_emit_tls_get:
* @code: buffer to store code to
guint8*
mono_amd64_emit_tls_get (guint8* code, int dreg, int tls_offset)
{
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
g_assert (tls_offset < 64);
x86_prefix (code, X86_GS_PREFIX);
amd64_mov_reg_mem (code, dreg, (tls_offset * 8) + 0x1480, 8);
#ifndef DISABLE_JIT
+#if defined(__native_client__) || defined(__native_client_codegen__)
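+/* Give the Native Client GC a chance to suspend this thread if a collection */
+/* is pending.                                                                */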
+void mono_nacl_gc()
+{
+#ifdef __native_client_gc__
+ __nacl_suspend_thread_if_needed();
+#endif
+}
+#endif
+
void
mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
{
}
}
+#if defined(__native_client_codegen__)
+ /* For Native Client, all indirect call/jump targets must be */
+ /* 32-byte aligned. Exception handler blocks are jumped to */
+ /* indirectly as well. */
+ gboolean bb_needs_alignment = (bb->flags & BB_INDIRECT_JUMP_TARGET) ||
+ (bb->flags & BB_EXCEPTION_HANDLER);
+
+ if ( bb_needs_alignment && ((cfg->code_len & kNaClAlignmentMask) != 0)) {
+ int pad = kNaClAlignment - (cfg->code_len & kNaClAlignmentMask);
+ if (pad != kNaClAlignment) code = mono_arch_nacl_pad(code, pad);
+ cfg->code_len += pad;
+ bb->native_offset = cfg->code_len;
+ }
+#endif /*__native_client_codegen__*/
+
if (cfg->verbose_level > 2)
g_print ("Basic block %d starting at offset 0x%x\n", bb->block_num, bb->native_offset);
max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
- if (G_UNLIKELY (offset > (cfg->code_size - max_len - 16))) {
+#define EXTRA_CODE_SPACE (NACL_SIZE (16, 16 + kNaClAlignment))
+
+ if (G_UNLIKELY (offset > (cfg->code_size - max_len - EXTRA_CODE_SPACE))) {
cfg->code_size *= 2;
- cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
+ cfg->native_code = mono_realloc_native_code(cfg);
code = cfg->native_code + offset;
mono_jit_stats.code_reallocs++;
}
case OP_STOREI2_MEMBASE_REG:
amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 2);
break;
+ /* In AMD64 NaCl, pointers are 4 bytes, */
+ /* so STORE_* != STOREI8_*. Likewise below. */
case OP_STORE_MEMBASE_REG:
+ amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, sizeof(gpointer));
+ break;
case OP_STOREI8_MEMBASE_REG:
amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 8);
break;
amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 4);
break;
case OP_STORE_MEMBASE_IMM:
+#ifndef __native_client_codegen__
+ /* In NaCl, this could be a PCONST type, which could */
+ /* mean a pointer type was copied directly into the */
+ /* lower 32-bits of inst_imm, so for InvalidPtr==-1 */
+ /* the value would be 0x00000000FFFFFFFF which is */
+ /* not proper for an imm32 unless you cast it. */
+ g_assert (amd64_is_imm32 (ins->inst_imm));
+#endif
+ amd64_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, (gint32)ins->inst_imm, sizeof(gpointer));
+ break;
case OP_STOREI8_MEMBASE_IMM:
g_assert (amd64_is_imm32 (ins->inst_imm));
amd64_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, ins->inst_imm, 8);
break;
case OP_LOAD_MEM:
+#ifdef __mono_ilp32__
+ /* In ILP32, pointers are 4 bytes, so separate these */
+ /* cases, use literal 8 below where we really want 8 */
+ amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
+ amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, sizeof(gpointer));
+ break;
+#endif
case OP_LOADI8_MEM:
// FIXME: Decompose this earlier
if (amd64_is_imm32 (ins->inst_imm))
- amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, sizeof (gpointer));
+ amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, 8);
else {
amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 8);
amd64_widen_membase (code, ins->dreg, ins->dreg, 0, FALSE, FALSE);
break;
case OP_LOADU2_MEM:
+ /* For NaCl, pointers are 4 bytes, so separate these */
+ /* cases, use literal 8 below where we really want 8 */
amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
amd64_widen_membase (code, ins->dreg, ins->dreg, 0, FALSE, TRUE);
break;
case OP_LOAD_MEMBASE:
+ g_assert (amd64_is_imm32 (ins->inst_offset));
+ amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, sizeof(gpointer));
+ break;
case OP_LOADI8_MEMBASE:
+ /* Use literal 8 instead of sizeof pointer or */
+ /* register, we really want 8 for this opcode */
g_assert (amd64_is_imm32 (ins->inst_offset));
- amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, sizeof (gpointer));
+ amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, 8);
break;
case OP_LOADI4_MEMBASE:
amd64_movsxd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
case OP_NOT_REACHED:
case OP_NOT_NULL:
break;
+ case OP_SEQ_POINT: {
+ int i;
+
+ if (cfg->compile_aot)
+ NOT_IMPLEMENTED;
+
+ /*
+ * Read from the single stepping trigger page. This will cause a
+ * SIGSEGV when single stepping is enabled.
+ * We do this _before_ the breakpoint, so single stepping after
+ * a breakpoint is hit will step to the next IL offset.
+ */
+ if (ins->flags & MONO_INST_SINGLE_STEP_LOC) {
+ if (((guint64)ss_trigger_page >> 32) == 0)
+ amd64_mov_reg_mem (code, AMD64_R11, (guint64)ss_trigger_page, 4);
+ else {
+ MonoInst *var = cfg->arch.ss_trigger_page_var;
+
+ amd64_mov_reg_membase (code, AMD64_R11, var->inst_basereg, var->inst_offset, 8);
+ amd64_alu_membase_imm_size (code, X86_CMP, AMD64_R11, 0, 0, 4);
+ }
+ }
+
+ /*
+ * This is the address which is saved in seq points,
+ * get_ip_for_single_step () / get_ip_for_breakpoint () needs to compute this
+ * from the address of the instruction causing the fault.
+ */
+ mono_add_seq_point (cfg, bb, ins, code - cfg->native_code);
+
+ /*
+ * A placeholder for a possible breakpoint inserted by
+ * mono_arch_set_breakpoint ().
+ */
+ for (i = 0; i < breakpoint_size; ++i)
+ x86_nop (code);
+ break;
+ }
case OP_ADDCC:
case OP_LADD:
amd64_alu_reg_reg (code, X86_ADD, ins->sreg1, ins->sreg2);
break;
case OP_AOTCONST:
mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0);
- amd64_mov_reg_membase (code, ins->dreg, AMD64_RIP, 0, 8);
+ amd64_mov_reg_membase (code, ins->dreg, AMD64_RIP, 0, sizeof(gpointer));
break;
case OP_JUMP_TABLE:
mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0);
amd64_mov_reg_imm_size (code, ins->dreg, 0, 8);
break;
case OP_MOVE:
- amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof (gpointer));
+ amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof(mgreg_t));
break;
case OP_AMD64_SET_XMMREG_R4: {
amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1);
break;
}
case OP_TAILCALL: {
- /*
- * Note: this 'frame destruction' logic is useful for tail calls, too.
- * Keep in sync with the code in emit_epilog.
- */
+ MonoCallInst *call = (MonoCallInst*)ins;
int pos = 0, i;
/* FIXME: no tracing support... */
save_offset += 8;
}
amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, cfg->arch.stack_alloc_size);
+
+ // FIXME:
+ if (call->stack_usage)
+ NOT_IMPLEMENTED;
}
else {
for (i = 0; i < AMD64_NREG; ++i)
if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
- pos -= sizeof (gpointer);
-
- if (pos)
- amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
+ pos -= sizeof(mgreg_t);
- /* Pop registers in reverse order */
- for (i = AMD64_NREG - 1; i > 0; --i)
+ /* Restore callee-saved registers */
+ for (i = AMD64_NREG - 1; i > 0; --i) {
if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
- amd64_pop_reg (code, i);
+ amd64_mov_reg_membase (code, i, AMD64_RBP, pos, sizeof(mgreg_t));
+ pos += sizeof(mgreg_t);
}
+ }
+
+ /* Copy arguments on the stack to our argument area */
+ for (i = 0; i < call->stack_usage; i += sizeof(mgreg_t)) {
+ amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, i, sizeof(mgreg_t));
+ amd64_mov_membase_reg (code, AMD64_RBP, 16 + i, AMD64_RAX, sizeof(mgreg_t));
+ }
+
+ if (pos)
+ amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos);
amd64_leave (code);
}
break;
case OP_ARGLIST: {
amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, cfg->sig_cookie);
- amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, 8);
+ amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, sizeof(gpointer));
break;
}
case OP_CALL:
case OP_CALL_MEMBASE:
call = (MonoCallInst*)ins;
- if (AMD64_IS_ARGUMENT_REG (ins->sreg1)) {
- /*
- * Can't use R11 because it is clobbered by the trampoline
- * code, and the reg value is needed by get_vcall_slot_addr.
- */
- amd64_mov_reg_reg (code, AMD64_RAX, ins->sreg1, 8);
- ins->sreg1 = AMD64_RAX;
- }
-
- /*
- * Emit a few nops to simplify get_vcall_slot ().
- */
- amd64_nop (code);
- amd64_nop (code);
- amd64_nop (code);
-
amd64_call_membase (code, ins->sreg1, ins->inst_offset);
if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes)
amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage);
code = emit_move_return_value (cfg, ins, code);
break;
+ case OP_DYN_CALL: {
+ int i;
+ MonoInst *var = cfg->dyn_call_var;
+
+ g_assert (var->opcode == OP_REGOFFSET);
+
+ /* r11 = args buffer filled by mono_arch_get_dyn_call_args () */
+ amd64_mov_reg_reg (code, AMD64_R11, ins->sreg1, 8);
+ /* r10 = ftn */
+ amd64_mov_reg_reg (code, AMD64_R10, ins->sreg2, 8);
+
+ /* Save args buffer */
+ amd64_mov_membase_reg (code, var->inst_basereg, var->inst_offset, AMD64_R11, 8);
+
+ /* Set argument registers */
+ for (i = 0; i < PARAM_REGS; ++i)
+ amd64_mov_reg_membase (code, param_regs [i], AMD64_R11, i * sizeof(mgreg_t), sizeof(mgreg_t));
+
+ /* Make the call */
+ amd64_call_reg (code, AMD64_R10);
+
+ /* Save result */
+ amd64_mov_reg_membase (code, AMD64_R11, var->inst_basereg, var->inst_offset, 8);
+ amd64_mov_membase_reg (code, AMD64_R11, G_STRUCT_OFFSET (DynCallArgs, res), AMD64_RAX, 8);
+ break;
+ }
case OP_AMD64_SAVE_SP_TO_LMF:
amd64_mov_membase_reg (code, cfg->frame_reg, cfg->arch.lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
break;
amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 8);
mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_target_bb);
amd64_call_imm (code, 0);
+ mono_cfg_add_try_hole (cfg, ins->inst_eh_block, code, bb);
/* Restore stack alignment */
amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8);
break;
case OP_START_HANDLER: {
+ /* Even though we're saving RSP, use sizeof */
+ /* gpointer because spvar is of type IntPtr */
+ /* see: mono_create_spvar_for_region */
MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
- amd64_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, AMD64_RSP, 8);
+ amd64_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, AMD64_RSP, sizeof(gpointer));
if ((MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FINALLY) ||
MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FINALLY)) &&
}
case OP_ENDFINALLY: {
MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
- amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, 8);
+ amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, sizeof(gpointer));
amd64_ret (code);
break;
}
case OP_ENDFILTER: {
MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
- amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, 8);
+ amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, sizeof(gpointer));
/* The local allocator will put the result into RAX */
amd64_ret (code);
break;
case OP_STORER8_MEMBASE_REG:
amd64_sse_movsd_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1);
break;
- case OP_LOADR8_SPILL_MEMBASE:
- g_assert_not_reached ();
- break;
case OP_LOADR8_MEMBASE:
amd64_sse_movsd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
break;
break;
}
case OP_MEMORY_BARRIER: {
- /* Not needed on amd64 */
+ /* http://blogs.sun.com/dave/resource/NHM-Pipeline-Blog-V2.txt */
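+ /*
+ * A locked add of 0 to the top of the stack leaves [rsp] unchanged but
+ * acts as a full fence; the locked read-modify-write is generally cheaper
+ * than mfence on current cores.
+ */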
+ x86_prefix (code, X86_LOCK_PREFIX);
+ amd64_alu_membase_imm (code, X86_ADD, AMD64_RSP, 0, 0);
break;
}
case OP_ATOMIC_ADD_I4:
amd64_mov_reg_reg (code, ins->dreg, AMD64_RAX, size);
break;
}
+ case OP_CARD_TABLE_WBARRIER: {
+ int ptr = ins->sreg1;
+ int value = ins->sreg2;
+ guchar *br;
+ int nursery_shift, card_table_shift;
+ gpointer card_table_mask;
+ size_t nursery_size;
+
+ gpointer card_table = mono_gc_get_card_table (&card_table_shift, &card_table_mask);
+ guint64 nursery_start = (guint64)mono_gc_get_nursery (&nursery_shift, &nursery_size);
+
+ /* If either points to the stack we can simply avoid the WB. This happens due to
+ * optimizations revealing a stack store that was not visible when op_cardtable was emitted.
+ */
+ if (ins->sreg1 == AMD64_RSP || ins->sreg2 == AMD64_RSP)
+ continue;
+
+ /*
+ * We need one register we can clobber: we choose RDX, and make sreg1
+ * fixed to RAX to work around limitations in the local register allocator.
+ * sreg2 might get allocated to RDX, but that is not a problem since
+ * we use it before clobbering RDX.
+ */
+ g_assert (ins->sreg1 == AMD64_RAX);
+
+ /*
+ * This is the code we produce:
+ *
+ * edx = value
+ * edx >>= nursery_shift
+ * cmp edx, (nursery_start >> nursery_shift)
+ * jne done
+ * edx = ptr
+ * edx >>= card_table_shift
+ * edx += cardtable
+ * [edx] = 1
+ * done:
+ */
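+ /*
+ * Roughly the C equivalent of the code emitted below, assuming sgen's
+ * card table layout:
+ *
+ * if (((mword)value >> nursery_shift) == (nursery_start >> nursery_shift))
+ * card_table [((mword)ptr >> card_table_shift) & (mword)card_table_mask] = 1;
+ */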
+
+ if (value != AMD64_RDX)
+ amd64_mov_reg_reg (code, AMD64_RDX, value, 8);
+ amd64_shift_reg_imm (code, X86_SHR, AMD64_RDX, nursery_shift);
+ amd64_alu_reg_imm (code, X86_CMP, AMD64_RDX, nursery_start >> nursery_shift);
+ br = code; x86_branch8 (code, X86_CC_NE, -1, FALSE);
+ amd64_mov_reg_reg (code, AMD64_RDX, ptr, 8);
+ amd64_shift_reg_imm (code, X86_SHR, AMD64_RDX, card_table_shift);
+ if (card_table_mask)
+ amd64_alu_reg_imm (code, X86_AND, AMD64_RDX, (guint32)(guint64)card_table_mask);
+
+ mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_GC_CARD_TABLE_ADDR, card_table);
+ amd64_alu_reg_membase (code, X86_ADD, AMD64_RDX, AMD64_RIP, 0);
+
+ amd64_mov_membase_imm (code, AMD64_RDX, 0, 1, 1);
+ x86_patch (br, code);
+ break;
+ }
#ifdef MONO_ARCH_SIMD_INTRINSICS
+ /* TODO: Some of these IR opcodes are marked as no-clobber even though they do clobber. */
case OP_ADDPS:
amd64_sse_addps_reg_reg (code, ins->sreg1, ins->sreg2);
break;
case OP_XORPD:
amd64_sse_xorpd_reg_reg (code, ins->sreg1, ins->sreg2);
break;
- /* TODO: This op is in the AMD64 manual but has not been implemented.
case OP_SQRTPD:
amd64_sse_sqrtpd_reg_reg (code, ins->dreg, ins->sreg1);
break;
- */
case OP_ADDSUBPD:
amd64_sse_addsubpd_reg_reg (code, ins->sreg1, ins->sreg2);
break;
case OP_PADDQ:
amd64_sse_paddq_reg_reg (code, ins->sreg1, ins->sreg2);
break;
+
+ case OP_PSUBB:
+ amd64_sse_psubb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBW:
+ amd64_sse_psubw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBD:
+ amd64_sse_psubd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBQ:
+ amd64_sse_psubq_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PMAXB_UN:
+ amd64_sse_pmaxub_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMAXW_UN:
+ amd64_sse_pmaxuw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMAXD_UN:
+ amd64_sse_pmaxud_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PMAXB:
+ amd64_sse_pmaxsb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMAXW:
+ amd64_sse_pmaxsw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMAXD:
+ amd64_sse_pmaxsd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PAVGB_UN:
+ amd64_sse_pavgb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PAVGW_UN:
+ amd64_sse_pavgw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PMINB_UN:
+ amd64_sse_pminub_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMINW_UN:
+ amd64_sse_pminuw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMIND_UN:
+ amd64_sse_pminud_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PMINB:
+ amd64_sse_pminsb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMINW:
+ amd64_sse_pminsw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMIND:
+ amd64_sse_pminsd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PCMPEQB:
+ amd64_sse_pcmpeqb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PCMPEQW:
+ amd64_sse_pcmpeqw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PCMPEQD:
+ amd64_sse_pcmpeqd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PCMPEQQ:
+ amd64_sse_pcmpeqq_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PCMPGTB:
+ amd64_sse_pcmpgtb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PCMPGTW:
+ amd64_sse_pcmpgtw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PCMPGTD:
+ amd64_sse_pcmpgtd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PCMPGTQ:
+ amd64_sse_pcmpgtq_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PSUM_ABS_DIFF:
+ amd64_sse_psadbw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_UNPACK_LOWB:
+ amd64_sse_punpcklbw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_LOWW:
+ amd64_sse_punpcklwd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_LOWD:
+ amd64_sse_punpckldq_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_LOWQ:
+ amd64_sse_punpcklqdq_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_LOWPS:
+ amd64_sse_unpcklps_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_LOWPD:
+ amd64_sse_unpcklpd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_UNPACK_HIGHB:
+ amd64_sse_punpckhbw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_HIGHW:
+ amd64_sse_punpckhwd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_HIGHD:
+ amd64_sse_punpckhdq_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_HIGHQ:
+ amd64_sse_punpckhqdq_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_HIGHPS:
+ amd64_sse_unpckhps_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_HIGHPD:
+ amd64_sse_unpckhpd_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PACKW:
+ amd64_sse_packsswb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PACKD:
+ amd64_sse_packssdw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PACKW_UN:
+ amd64_sse_packuswb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PACKD_UN:
+ amd64_sse_packusdw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PADDB_SAT_UN:
+ amd64_sse_paddusb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBB_SAT_UN:
+ amd64_sse_psubusb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PADDW_SAT_UN:
+ amd64_sse_paddusw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBW_SAT_UN:
+ amd64_sse_psubusw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PADDB_SAT:
+ amd64_sse_paddsb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBB_SAT:
+ amd64_sse_psubsb_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PADDW_SAT:
+ amd64_sse_paddsw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBW_SAT:
+ amd64_sse_psubsw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PMULW:
+ amd64_sse_pmullw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMULD:
+ amd64_sse_pmulld_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMULQ:
+ amd64_sse_pmuludq_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMULW_HIGH_UN:
+ amd64_sse_pmulhuw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PMULW_HIGH:
+ amd64_sse_pmulhw_reg_reg (code, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PSHRW:
+ amd64_sse_psrlw_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSHRW_REG:
+ amd64_sse_psrlw_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_PSARW:
+ amd64_sse_psraw_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSARW_REG:
+ amd64_sse_psraw_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_PSHLW:
+ amd64_sse_psllw_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSHLW_REG:
+ amd64_sse_psllw_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_PSHRD:
+ amd64_sse_psrld_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSHRD_REG:
+ amd64_sse_psrld_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_PSARD:
+ amd64_sse_psrad_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSARD_REG:
+ amd64_sse_psrad_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_PSHLD:
+ amd64_sse_pslld_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSHLD_REG:
+ amd64_sse_pslld_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_PSHRQ:
+ amd64_sse_psrlq_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSHRQ_REG:
+ amd64_sse_psrlq_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+
+ /* TODO: This is part of the SSE spec but not added
+ case OP_PSARQ:
+ amd64_sse_psraq_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSARQ_REG:
+ amd64_sse_psraq_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+ */
+
+ case OP_PSHLQ:
+ amd64_sse_psllq_reg_imm (code, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSHLQ_REG:
+ amd64_sse_psllq_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_ICONV_TO_X:
+ amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+ break;
+ case OP_EXTRACT_I4:
+ amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+ break;
+ case OP_EXTRACT_I8:
+ if (ins->inst_c0) {
+ amd64_movhlps_reg_reg (code, AMD64_XMM15, ins->sreg1);
+ amd64_movd_reg_xreg_size (code, ins->dreg, AMD64_XMM15, 8);
+ } else {
+ amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 8);
+ }
+ break;
+ case OP_EXTRACT_I1:
+ case OP_EXTRACT_U1:
+ amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+ if (ins->inst_c0)
+ amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8);
+ amd64_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE);
+ break;
+ case OP_EXTRACT_I2:
+ case OP_EXTRACT_U2:
+ /*amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+ if (ins->inst_c0)
+ amd64_shift_reg_imm_size (code, X86_SHR, ins->dreg, 16, 4);*/
+ amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+ amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE, 4);
+ break;
+ case OP_EXTRACT_R8:
+ if (ins->inst_c0)
+ amd64_movhlps_reg_reg (code, ins->dreg, ins->sreg1);
+ else
+ amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+ break;
+ case OP_INSERT_I2:
+ amd64_sse_pinsrw_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0);
+ break;
+ case OP_EXTRACTX_U2:
+ amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+ break;
+ case OP_INSERTX_U1_SLOW:
+ /* sreg1 is the extracted ireg (scratch),
+ sreg2 is the ireg to be inserted (scratch),
+ dreg is the xreg to receive the value */
+
+ /*clear the bits from the extracted word*/
+ amd64_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_c0 & 1 ? 0x00FF : 0xFF00);
+ /*shift the value to insert if needed*/
+ if (ins->inst_c0 & 1)
+ amd64_shift_reg_imm_size (code, X86_SHL, ins->sreg2, 8, 4);
+ /*join them together*/
+ amd64_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
+ amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0 / 2);
+ break;
+ case OP_INSERTX_I4_SLOW:
+ amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2);
+ amd64_shift_reg_imm (code, X86_SHR, ins->sreg2, 16);
+ amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1);
+ break;
+ case OP_INSERTX_I8_SLOW:
+ amd64_movd_xreg_reg_size(code, AMD64_XMM15, ins->sreg2, 8);
+ if (ins->inst_c0)
+ amd64_movlhps_reg_reg (code, ins->dreg, AMD64_XMM15);
+ else
+ amd64_sse_movsd_reg_reg (code, ins->dreg, AMD64_XMM15);
+ break;
+
+ case OP_INSERTX_R4_SLOW:
+ switch (ins->inst_c0) {
+ case 0:
+ amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+ case 1:
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
+ amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
+ break;
+ case 2:
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
+ amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
+ break;
+ case 3:
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
+ amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
+ break;
+ }
+ break;
+ case OP_INSERTX_R8_SLOW:
+ if (ins->inst_c0)
+ amd64_movlhps_reg_reg (code, ins->dreg, ins->sreg2);
+ else
+ amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg2);
+ break;
+ case OP_STOREX_MEMBASE_REG:
+ case OP_STOREX_MEMBASE:
+ amd64_sse_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+ break;
+ case OP_LOADX_MEMBASE:
+ amd64_sse_movups_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+ break;
+ case OP_LOADX_ALIGNED_MEMBASE:
+ amd64_sse_movaps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+ break;
+ case OP_STOREX_ALIGNED_MEMBASE_REG:
+ amd64_sse_movaps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+ break;
+ case OP_STOREX_NTA_MEMBASE_REG:
+ amd64_sse_movntps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+ break;
+ case OP_PREFETCH_MEMBASE:
+ amd64_sse_prefetch_reg_membase (code, ins->backend.arg_info, ins->sreg1, ins->inst_offset);
+ break;
+
+ case OP_XMOVE:
+ /*FIXME the peephole pass should have killed this*/
+ if (ins->dreg != ins->sreg1)
+ amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1);
+ break;
+ case OP_XZERO:
+ amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg);
+ break;
+ case OP_ICONV_TO_R8_RAW:
+ amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+ amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+ break;
+
+ case OP_FCONV_TO_R8_X:
+ amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+ break;
+
+ case OP_XCONV_R8_TO_I4:
+ amd64_sse_cvttsd2si_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+ switch (ins->backend.source_opcode) {
+ case OP_FCONV_TO_I1:
+ amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE);
+ break;
+ case OP_FCONV_TO_U1:
+ amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
+ break;
+ case OP_FCONV_TO_I2:
+ amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE);
+ break;
+ case OP_FCONV_TO_U2:
+ amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE);
+ break;
+ }
+ break;
+
+ case OP_EXPAND_I2:
+ amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, 0);
+ amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, 1);
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+ break;
+ case OP_EXPAND_I4:
+ amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+ break;
+ case OP_EXPAND_I8:
+ amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 8);
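+ /* 0x44 selects source dwords (0,1,0,1), duplicating the low quadword into both halves */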
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
+ break;
+ case OP_EXPAND_R4:
+ amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+ amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->dreg);
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+ break;
+ case OP_EXPAND_R8:
+ amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+ amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
+ break;
#endif
case OP_LIVERANGE_START: {
if (cfg->verbose_level > 1)
MONO_VARINFO (cfg, ins->inst_c0)->live_range_end = code - cfg->native_code;
break;
}
+ case OP_NACL_GC_SAFE_POINT: {
+#if defined(__native_client_codegen__)
+ code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)mono_nacl_gc, TRUE);
+#endif
+ break;
+ }
default:
g_warning ("unknown opcode %s in %s()\n", mono_inst_name (ins->opcode), __FUNCTION__);
g_assert_not_reached ();
}
if ((code - cfg->native_code - offset) > max_len) {
+#if !defined(__native_client_codegen__)
g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %ld)",
mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset);
g_assert_not_reached ();
+#endif
}
last_ins = ins;
}
}
+#ifndef DISABLE_JIT
+
static int
get_max_epilog_size (MonoCompile *cfg)
{
gint32 lmf_offset = cfg->arch.lmf_offset;
gboolean args_clobbered = FALSE;
gboolean trace = FALSE;
+#ifdef __native_client_codegen__
+ guint alignment_check;
+#endif
- cfg->code_size = MAX (((MonoMethodNormal *)method)->header->code_size * 4, 10240);
+ cfg->code_size = MAX (cfg->header->code_size * 4, 10240);
+#if defined(__default_codegen__)
code = cfg->native_code = g_malloc (cfg->code_size);
+#elif defined(__native_client_codegen__)
+ /* native_code_alloc is not 32-byte aligned, native_code is. */
+ cfg->native_code_alloc = g_malloc (cfg->code_size + kNaClAlignment);
+
+ /* Align native_code to the next kNaClAlignment boundary. */
+ cfg->native_code = (gpointer)((uintptr_t)cfg->native_code_alloc + kNaClAlignment);
+ cfg->native_code = (gpointer)((uintptr_t)cfg->native_code & ~kNaClAlignmentMask);
+
+ code = cfg->native_code;
+
+ alignment_check = (guint)(uintptr_t)cfg->native_code & kNaClAlignmentMask;
+ g_assert (alignment_check == 0);
+#endif
if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
trace = TRUE;
mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
mono_emit_unwind_op_offset (cfg, code, AMD64_RBP, - cfa_offset);
async_exc_point (code);
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
mono_arch_unwindinfo_add_push_nonvol (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP);
#endif
- amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
+ amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof(mgreg_t));
mono_emit_unwind_op_def_cfa_reg (cfg, code, AMD64_RBP);
async_exc_point (code);
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
mono_arch_unwindinfo_add_set_fpreg (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP);
#endif
}
for (i = 0; i < AMD64_NREG; ++i)
if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
amd64_push_reg (code, i);
- pos += sizeof (gpointer);
+ pos += 8; /* an AMD64 push always adjusts RSP by 8 bytes; there is no way to change that */
offset += 8;
mono_emit_unwind_op_offset (cfg, code, i, - offset);
async_exc_point (code);
if (cfg->arch.omit_fp)
// FIXME:
g_assert_not_reached ();
- cfg->stack_offset += ALIGN_TO (cfg->param_area, sizeof (gpointer));
+ cfg->stack_offset += ALIGN_TO (cfg->param_area, sizeof(mgreg_t));
}
if (cfg->arch.omit_fp) {
/* Allocate stack frame */
if (alloc_size) {
/* See mono_emit_stack_alloc */
-#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
+#if defined(HOST_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
guint32 remaining_size = alloc_size;
+ /* FIXME: handle unbounded code expansion; we should use a loop in case of more than X iterations */
+ guint32 required_code_size = ((remaining_size / 0x1000) + 1) * 10; /*10 is the max size of amd64_alu_reg_imm + amd64_test_membase_reg*/
+ guint32 offset = code - cfg->native_code;
+ if (G_UNLIKELY (required_code_size >= (cfg->code_size - offset))) {
+ while (required_code_size >= (cfg->code_size - offset))
+ cfg->code_size *= 2;
+ cfg->native_code = mono_realloc_native_code (cfg);
+ code = cfg->native_code + offset;
+ mono_jit_stats.code_reallocs++;
+ }
+
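+ /* Touch the newly allocated stack one page at a time so the OS guard page */
+ /* is hit in order and the stack can actually grow. */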
while (remaining_size >= 0x1000) {
amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 0x1000);
if (cfg->arch.omit_fp) {
mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
}
async_exc_point (code);
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
if (cfg->arch.omit_fp)
mono_arch_unwindinfo_add_alloc_stack (&cfg->arch.unwindinfo, cfg->native_code, code, 0x1000);
#endif
mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
async_exc_point (code);
}
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
if (cfg->arch.omit_fp)
mono_arch_unwindinfo_add_alloc_stack (&cfg->arch.unwindinfo, cfg->native_code, code, remaining_size);
#endif
}
#endif
+#ifndef TARGET_WIN32
+ if (mini_get_debug_options ()->init_stacks) {
+ /* Fill the stack frame with a dummy value to force deterministic behavior */
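+ /* The rep stos below is effectively memset (rsp, 0x2a, alloc_size) done in */
+ /* 8-byte stores; rdi/rcx are clobbered by it, so they are saved to the red zone first. */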
+
+ /* Save registers to the red zone */
+ amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RDI, 8);
+ amd64_mov_membase_reg (code, AMD64_RSP, -16, AMD64_RCX, 8);
+
+ amd64_mov_reg_imm (code, AMD64_RAX, 0x2a2a2a2a2a2a2a2a);
+ amd64_mov_reg_imm (code, AMD64_RCX, alloc_size / 8);
+ amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RSP, 8);
+
+ amd64_cld (code);
+#if defined(__default_codegen__)
+ amd64_prefix (code, X86_REP_PREFIX);
+ amd64_stosl (code);
+#elif defined(__native_client_codegen__)
+ /* NaCl stos pseudo-instruction */
+ amd64_codegen_pre (code);
+ /* First, clear the upper 32 bits of RDI (mov %edi, %edi) */
+ amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RDI, 4);
+ /* Add %r15 to %rdi using lea, condition flags unaffected. */
+ amd64_lea_memindex_size (code, AMD64_RDI, AMD64_R15, 0, AMD64_RDI, 0, 8);
+ amd64_prefix (code, X86_REP_PREFIX);
+ amd64_stosl (code);
+ amd64_codegen_post (code);
+#endif /* __native_client_codegen__ */
+
+ amd64_mov_reg_membase (code, AMD64_RDI, AMD64_RSP, -8, 8);
+ amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RSP, -16, 8);
+ }
+#endif
+
/* Save LMF */
if (method->save_lmf) {
/*
* The ip field is not set, the exception handling code will obtain it from the stack location pointed to by the sp field.
*/
- /* sp is saved right before calls */
+ /*
+ * sp is saved right before calls, but we need to save it here too so
+ * that async stack walks work.
+ */
+ amd64_mov_membase_reg (code, cfg->frame_reg, cfg->arch.lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
/* Skip method (only needed for trampoline LMF frames) */
/* Save callee saved regs */
for (i = 0; i < MONO_MAX_IREGS; ++i) {
case AMD64_R12: offset = G_STRUCT_OFFSET (MonoLMF, r12); break;
case AMD64_R13: offset = G_STRUCT_OFFSET (MonoLMF, r13); break;
case AMD64_R14: offset = G_STRUCT_OFFSET (MonoLMF, r14); break;
+#ifndef __native_client_codegen__
case AMD64_R15: offset = G_STRUCT_OFFSET (MonoLMF, r15); break;
-#ifdef PLATFORM_WIN32
+#endif
+#ifdef HOST_WIN32
case AMD64_RDI: offset = G_STRUCT_OFFSET (MonoLMF, rdi); break;
case AMD64_RSI: offset = G_STRUCT_OFFSET (MonoLMF, rsi); break;
#endif
g_assert (cfg->rgctx_var->opcode == OP_REGOFFSET &&
(cfg->rgctx_var->inst_basereg == AMD64_RBP || cfg->rgctx_var->inst_basereg == AMD64_RSP));
- amd64_mov_membase_reg (code, cfg->rgctx_var->inst_basereg, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, 8);
+ amd64_mov_membase_reg (code, cfg->rgctx_var->inst_basereg, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, sizeof(gpointer));
}
/* compute max_length in order to use short forward jumps */
/* max alignment for loops */
if ((cfg->opt & MONO_OPT_LOOP) && bb_is_loop_start (bb))
max_length += LOOP_ALIGNMENT;
+#ifdef __native_client_codegen__
+ /* max alignment for native client */
+ max_length += kNaClAlignment;
+#endif
MONO_BB_FOR_EACH_INS (bb, ins) {
+#ifdef __native_client_codegen__
+ {
+ int space_in_block = kNaClAlignment -
+ ((max_length + cfg->code_len) & kNaClAlignmentMask);
+ int max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
+ if (space_in_block < max_len && max_len < kNaClAlignment) {
+ max_length += space_in_block;
+ }
+ }
+#endif /*__native_client_codegen__*/
max_length += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
}
for (quad = 0; quad < 2; quad ++) {
switch (ainfo->pair_storage [quad]) {
case ArgInIReg:
- amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad], sizeof (gpointer));
+ amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad], sizeof(mgreg_t));
break;
case ArgInFloatSSEReg:
- amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]);
+ amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]);
break;
case ArgInDoubleSSEReg:
- amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]);
+ amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]);
break;
case ArgNone:
break;
for (quad = 0; quad < 2; quad ++) {
switch (ainfo->pair_storage [quad]) {
case ArgInIReg:
- amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad], sizeof (gpointer));
+ amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad], sizeof(mgreg_t));
break;
case ArgInFloatSSEReg:
- amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]);
+ amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]);
break;
case ArgInDoubleSSEReg:
- amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]);
+ amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]);
break;
case ArgNone:
break;
code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD,
(gpointer)"mono_jit_thread_attach", TRUE);
amd64_patch (buf, code);
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
/* The TLS key actually contains a pointer to the MonoJitTlsData structure */
/* FIXME: Add a separate key for LMF to avoid this */
amd64_alu_reg_imm (code, X86_ADD, AMD64_RAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf));
if (lmf_addr_tls_offset != -1) {
/* Load lmf quicky using the FS register */
code = mono_amd64_emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
/* The TLS key actually contains a pointer to the MonoJitTlsData structure */
/* FIXME: Add a separate key for LMF to avoid this */
amd64_alu_reg_imm (code, X86_ADD, AMD64_RAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf));
}
/* Save lmf_addr */
- amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8);
+ amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, sizeof(gpointer));
/* Save previous_lmf */
- amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8);
- amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8);
+ amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, sizeof(gpointer));
+ amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, sizeof(gpointer));
/* Set new lmf */
amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset);
- amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8);
+ amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, sizeof(gpointer));
}
}
}
}
+ /* Initialize ss_trigger_page_var */
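+ /* (OP_SEQ_POINT loads the trigger page address from this variable when it does not fit in 32 bits.) */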
+ if (cfg->arch.ss_trigger_page_var) {
+ MonoInst *var = cfg->arch.ss_trigger_page_var;
+
+ g_assert (!cfg->compile_aot);
+ g_assert (var->opcode == OP_REGOFFSET);
+
+ amd64_mov_reg_imm (code, AMD64_R11, (guint64)ss_trigger_page);
+ amd64_mov_membase_reg (code, var->inst_basereg, var->inst_offset, AMD64_R11, 8);
+ }
+
cfg->code_len = code - cfg->native_code;
g_assert (cfg->code_len < cfg->code_size);
while (cfg->code_len + max_epilog_size > (cfg->code_size - 16)) {
cfg->code_size *= 2;
- cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
+ cfg->native_code = mono_realloc_native_code (cfg);
mono_jit_stats.code_reallocs++;
}
* through the mono_lmf_addr TLS variable.
*/
/* reg = previous_lmf */
- amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
+ amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), sizeof(gpointer));
x86_prefix (code, X86_FS_PREFIX);
amd64_mov_mem_reg (code, lmf_tls_offset, AMD64_R11, 8);
} else {
/* Restore previous lmf */
- amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8);
- amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8);
- amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8);
+ amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), sizeof(gpointer));
+ amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), sizeof(gpointer));
+ amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, sizeof(gpointer));
}
/* Restore caller saved regs */
amd64_mov_reg_membase (code, AMD64_R14, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8);
}
if (cfg->used_int_regs & (1 << AMD64_R15)) {
+#if defined(__default_codegen__)
amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
+#elif defined(__native_client_codegen__)
+ g_assert_not_reached();
+#endif
}
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
if (cfg->used_int_regs & (1 << AMD64_RDI)) {
amd64_mov_reg_membase (code, AMD64_RDI, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rdi), 8);
}
else {
for (i = 0; i < AMD64_NREG; ++i)
if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i)))
- pos -= sizeof (gpointer);
+ pos -= sizeof(mgreg_t);
if (pos) {
- if (pos == - sizeof (gpointer)) {
+ if (pos == - sizeof(mgreg_t)) {
/* Only one register, so avoid lea */
for (i = AMD64_NREG - 1; i > 0; --i)
if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
for (quad = 0; quad < 2; quad ++) {
switch (ainfo->pair_storage [quad]) {
case ArgInIReg:
- amd64_mov_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer)), sizeof (gpointer));
+ amd64_mov_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t)), sizeof(mgreg_t));
break;
case ArgInFloatSSEReg:
- amd64_movss_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer)));
+ amd64_movss_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t)));
break;
case ArgInDoubleSSEReg:
- amd64_movsd_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer)));
+ amd64_movsd_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t)));
break;
case ArgNone:
break;
code_size += 8 + 15; /* sizeof (double) + alignment */
if (patch_info->type == MONO_PATCH_INFO_R4)
code_size += 4 + 15; /* sizeof (float) + alignment */
+ if (patch_info->type == MONO_PATCH_INFO_GC_CARD_TABLE_ADDR)
+ code_size += 8 + 7; /*sizeof (void*) + alignment */
}
+#ifdef __native_client_codegen__
+ /* Give us extra room on Native Client. This could be */
+ /* more carefully calculated, but bundle alignment makes */
+ /* it much trickier, so we simply double it, as elsewhere. */
+ code_size *= 2;
+#endif
+
while (cfg->code_len + code_size > (cfg->code_size - 16)) {
cfg->code_size *= 2;
- cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
+ cfg->native_code = mono_realloc_native_code (cfg);
mono_jit_stats.code_reallocs++;
}
exc_classes [nthrows] = exc_class;
exc_throw_start [nthrows] = code;
}
- amd64_mov_reg_imm (code, AMD64_ARG_REG1, exc_class->type_token);
+ amd64_mov_reg_imm (code, AMD64_ARG_REG1, exc_class->type_token - MONO_TOKEN_TYPE_DEF);
patch_info->type = MONO_PATCH_INFO_NONE;
/* do nothing */
break;
}
+ g_assert (code < cfg->native_code + cfg->code_size);
}
/* Handle relocations with RIP relative addressing */
for (patch_info = cfg->patch_info; patch_info; patch_info = patch_info->next) {
gboolean remove = FALSE;
+ guint8 *orig_code = code;
switch (patch_info->type) {
case MONO_PATCH_INFO_R8:
case MONO_PATCH_INFO_R4: {
- guint8 *pos;
+ guint8 *pos, *patch_pos;
+ guint32 target_pos;
/* The SSE opcodes require a 16 byte alignment */
+#if defined(__default_codegen__)
code = (guint8*)ALIGN_TO (code, 16);
+#elif defined(__native_client_codegen__)
+ {
+ /* Pad this out with HLT instructions */
+ /* or we can get garbage bytes emitted */
+ /* which will fail validation */
+ guint8 *aligned_code;
+ /* extra align to make room for */
+ /* mov/push below */
+ int extra_align = patch_info->type == MONO_PATCH_INFO_R8 ? 2 : 1;
+ aligned_code = (guint8*)ALIGN_TO (code + extra_align, 16);
+ /* The technique of hiding data in an */
+ /* instruction has a problem here: we */
+ /* need the data aligned to a 16-byte */
+ /* boundary but the instruction cannot */
+ /* cross the bundle boundary. so only */
+ /* odd multiples of 16 can be used */
+ if ((intptr_t)aligned_code % kNaClAlignment == 0) {
+ aligned_code += 16;
+ }
+ while (code < aligned_code) {
+ *(code++) = 0xf4; /* hlt */
+ }
+ }
+#endif
pos = cfg->native_code + patch_info->ip.i;
-
- if (IS_REX (pos [1]))
- *(guint32*)(pos + 5) = (guint8*)code - pos - 9;
- else
- *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
+ if (IS_REX (pos [1])) {
+ patch_pos = pos + 5;
+ target_pos = code - pos - 9;
+ }
+ else {
+ patch_pos = pos + 4;
+ target_pos = code - pos - 8;
+ }
if (patch_info->type == MONO_PATCH_INFO_R8) {
+#ifdef __native_client_codegen__
+ /* Hide 64-bit data in a */
+ /* "mov imm64, r11" instruction. */
+ /* write it before the start of */
+ /* the data*/
+ *(code-2) = 0x49; /* prefix */
+ *(code-1) = 0xbb; /* mov X, %r11 */
+#endif
*(double*)code = *(double*)patch_info->data.target;
code += sizeof (double);
} else {
+#ifdef __native_client_codegen__
+ /* Hide 32-bit data in a */
+ /* "push imm32" instruction. */
+ *(code-1) = 0x68; /* push */
+#endif
*(float*)code = *(float*)patch_info->data.target;
code += sizeof (float);
}
+ *(guint32*)(patch_pos) = target_pos;
+
+ remove = TRUE;
+ break;
+ }
+ case MONO_PATCH_INFO_GC_CARD_TABLE_ADDR: {
+ guint8 *pos;
+
+ if (cfg->compile_aot)
+ continue;
+
+ /* Loads are faster from aligned addresses. */
+ code = (guint8*)ALIGN_TO (code, 8);
+ memset (orig_code, 0, code - orig_code);
+
+ pos = cfg->native_code + patch_info->ip.i;
+
+ /*alu_op [rex] modr/m imm32 - 7 or 8 bytes */
+ if (IS_REX (pos [1]))
+ *(guint32*)(pos + 4) = (guint8*)code - pos - 8;
+ else
+ *(guint32*)(pos + 3) = (guint8*)code - pos - 7;
+
+ *(gpointer*)code = (gpointer)patch_info->data.target;
+ code += sizeof (gpointer);
+
remove = TRUE;
break;
}
tmp->next = patch_info->next;
}
}
+ g_assert (code < cfg->native_code + cfg->code_size);
}
cfg->code_len = code - cfg->native_code;
}
+#endif /* DISABLE_JIT */
+
void*
mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments)
{
/* Allocate a new area on the stack and save arguments there */
sig = mono_method_signature (cfg->method);
- cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
+ cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig);
n = sig->param_count + sig->hasthis;
guchar *code = p;
int save_mode = SAVE_NONE;
MonoMethod *method = cfg->method;
- int rtype = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret)->type;
+ MonoType *ret_type = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret);
- switch (rtype) {
+ switch (ret_type->type) {
case MONO_TYPE_VOID:
/* special case string .ctor icall */
if (strcmp (".ctor", method->name) == 0 && method->klass == mono_defaults.string_class)
save_mode = SAVE_XMM;
break;
case MONO_TYPE_GENERICINST:
- if (!mono_type_generic_inst_is_valuetype (mono_method_signature (method)->ret)) {
+ if (!mono_type_generic_inst_is_valuetype (ret_type)) {
save_mode = SAVE_EAX;
break;
}
return can_write;
}
-gpointer
-mono_arch_get_vcall_slot (guint8 *code, mgreg_t *regs, int *displacement)
-{
- guint8 buf [10];
- guint32 reg;
- gint32 disp;
- guint8 rex = 0;
-
- mono_breakpoint_clean_code (NULL, code, 9, buf, sizeof (buf));
- code = buf + 9;
-
- *displacement = 0;
-
- code -= 7;
-
- /*
- * A given byte sequence can match more than case here, so we have to be
- * really careful about the ordering of the cases. Longer sequences
- * come first.
- * There are two types of calls:
- * - direct calls: 0xff address_byte 8/32 bits displacement
- * - indirect calls: nop nop nop <call>
- * The nops make sure we don't confuse the instruction preceeding an indirect
- * call with a direct call.
- */
- if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
- /* call OFFSET(%rip) */
- disp = *(guint32*)(code + 3);
- return (gpointer*)(code + disp + 7);
- } else if ((code [0] == 0xff) && (amd64_modrm_reg (code [1]) == 0x2) && (amd64_modrm_mod (code [1]) == 0x2) && (amd64_sib_index (code [2]) == 4) && (amd64_sib_scale (code [2]) == 0)) {
- /* call *[reg+disp32] using indexed addressing */
- /* The LLVM JIT emits this, and we emit it too for %r12 */
- if (IS_REX (code [-1])) {
- rex = code [-1];
- g_assert (amd64_rex_x (rex) == 0);
- }
- reg = amd64_sib_base (code [2]);
- disp = *(gint32*)(code + 3);
- } else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) {
- /* call *[reg+disp32] */
- if (IS_REX (code [0]))
- rex = code [0];
- reg = amd64_modrm_rm (code [2]);
- disp = *(gint32*)(code + 3);
- /* R10 is clobbered by the IMT thunk code */
- g_assert (reg != AMD64_R10);
- } else if (code [2] == 0xe8) {
- /* call <ADDR> */
- return NULL;
- } else if ((code [3] == 0xff) && (amd64_modrm_reg (code [4]) == 0x2) && (amd64_modrm_mod (code [4]) == 0x1) && (amd64_sib_index (code [5]) == 4) && (amd64_sib_scale (code [5]) == 0)) {
- /* call *[r12+disp8] using indexed addressing */
- if (IS_REX (code [2]))
- rex = code [2];
- reg = amd64_sib_base (code [5]);
- disp = *(gint8*)(code + 6);
- } else if (IS_REX (code [4]) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x3)) {
- /* call *%reg */
- return NULL;
- } else if ((code [4] == 0xff) && (amd64_modrm_reg (code [5]) == 0x2) && (amd64_modrm_mod (code [5]) == 0x1)) {
- /* call *[reg+disp8] */
- if (IS_REX (code [3]))
- rex = code [3];
- reg = amd64_modrm_rm (code [5]);
- disp = *(gint8*)(code + 6);
- //printf ("B: [%%r%d+0x%x]\n", reg, disp);
- }
- else if ((code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
- /* call *%reg */
- if (IS_REX (code [4]))
- rex = code [4];
- reg = amd64_modrm_rm (code [6]);
- disp = 0;
- }
- else
- g_assert_not_reached ();
+#if defined(__native_client_codegen__)
+/* For membase calls, we want the base register. For Native Client, */
+/* all indirect calls have the following sequence with the given sizes: */
+/* mov %eXX,%eXX [2-3] */
+/* mov disp(%r15,%rXX,scale),%r11d [4-8] */
+/* and $0xffffffffffffffe0,%r11d [4] */
+/* add %r15,%r11 [3] */
+/* callq *%r11 [3] */
- reg += amd64_rex_b (rex);
- /* R11 is clobbered by the trampoline code */
- g_assert (reg != AMD64_R11);
+/* Determine if code points to a NaCl call-through-register sequence, */
+/* (i.e., the last 3 instructions listed above) */
+int
+is_nacl_call_reg_sequence(guint8* code)
+{
+ const char *sequence = "\x41\x83\xe3\xe0" /* and */
+ "\x4d\x03\xdf" /* add */
+ "\x41\xff\xd3"; /* call */
+ return memcmp(code, sequence, 10) == 0;
+}
- *displacement = disp;
- return (gpointer)regs [reg];
+/* Determine if code points to the first opcode of the mov membase component */
+/* of an indirect call sequence (i.e. the first 2 instructions listed above) */
+/* (there could be a REX prefix before the opcode but it is ignored) */
+static int
+is_nacl_indirect_call_membase_sequence(guint8* code)
+{
+ /* Check for mov opcode, reg-reg addressing mode (mod = 3), */
+ return code[0] == 0x8b && amd64_modrm_mod(code[1]) == 3 &&
+ /* and that src reg = dest reg */
+ amd64_modrm_reg(code[1]) == amd64_modrm_rm(code[1]) &&
+ /* Check that next inst is mov, uses SIB byte (rm = 4), */
+ IS_REX(code[2]) &&
+ code[3] == 0x8b && amd64_modrm_rm(code[4]) == 4 &&
+ /* and has dst of r11 and base of r15 */
+ (amd64_modrm_reg(code[4]) + amd64_rex_r(code[2])) == AMD64_R11 &&
+ (amd64_sib_base(code[5]) + amd64_rex_b(code[2])) == AMD64_R15;
}
+#endif /* __native_client_codegen__ */
int
-mono_arch_get_this_arg_reg (MonoMethodSignature *sig, MonoGenericSharingContext *gsctx, guint8 *code)
+mono_arch_get_this_arg_reg (guint8 *code)
{
- int this_reg = AMD64_ARG_REG1;
-
- if (MONO_TYPE_ISSTRUCT (sig->ret)) {
- CallInfo *cinfo;
-
- if (!gsctx && code)
- gsctx = mono_get_generic_context_from_code (code);
-
- cinfo = get_call_info (gsctx, NULL, sig, FALSE);
-
- if (cinfo->ret.storage != ArgValuetypeInReg)
- this_reg = AMD64_ARG_REG2;
- g_free (cinfo);
- }
-
- return this_reg;
+ return AMD64_ARG_REG1;
}
gpointer
-mono_arch_get_this_arg_from_call (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, mgreg_t *regs, guint8 *code)
+mono_arch_get_this_arg_from_call (mgreg_t *regs, guint8 *code)
{
- return (gpointer)regs [mono_arch_get_this_arg_reg (sig, gsctx, code)];
+ return (gpointer)regs [mono_arch_get_this_arg_reg (code)];
}
#define MAX_ARCH_DELEGATE_PARAMS 10
/* We have to shift the arguments left */
amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
for (i = 0; i < param_count; ++i) {
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
if (i < 3)
amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
else
g_assert ((code - start) < 64);
}
+ nacl_global_codeman_validate (&start, 64, &code);
+
mono_debug_add_delegate_trampoline (start, code - start);
if (code_len)
*code_len = code - start;
+
+ if (mono_jit_map_is_enabled ()) {
+ char *buff;
+ if (has_target)
+ buff = (char*)"delegate_invoke_has_target";
+ else
+ buff = g_strdup_printf ("delegate_invoke_no_target_%d", param_count);
+ mono_emit_jit_tramp (start, code - start, buff);
+ if (!has_target)
+ g_free (buff);
+ }
+
return start;
}
/*
* mono_arch_get_delegate_invoke_impls:
*
- * Return a list of MonoAotTrampInfo structures for the delegate invoke impl
+ * Return a list of MonoTrampInfo structures for the delegate invoke impl
* trampolines.
*/
GSList*
int i;
code = get_delegate_invoke_impl (TRUE, 0, &code_len);
- res = g_slist_prepend (res, mono_aot_tramp_info_create (g_strdup ("delegate_invoke_impl_has_target"), code, code_len));
+ res = g_slist_prepend (res, mono_tramp_info_create (g_strdup ("delegate_invoke_impl_has_target"), code, code_len, NULL, NULL));
for (i = 0; i < MAX_ARCH_DELEGATE_PARAMS; ++i) {
code = get_delegate_invoke_impl (FALSE, i, &code_len);
- res = g_slist_prepend (res, mono_aot_tramp_info_create (g_strdup_printf ("delegate_invoke_impl_target_%d", i), code, code_len));
+ res = g_slist_prepend (res, mono_tramp_info_create (g_strdup_printf ("delegate_invoke_impl_target_%d", i), code, code_len, NULL, NULL));
}
return res;
return cached;
if (mono_aot_only)
- start = mono_aot_get_named_code ("delegate_invoke_impl_has_target");
+ start = mono_aot_get_trampoline ("delegate_invoke_impl_has_target");
else
start = get_delegate_invoke_impl (TRUE, 0, NULL);
if (mono_aot_only) {
char *name = g_strdup_printf ("delegate_invoke_impl_target_%d", sig->param_count);
- start = mono_aot_get_named_code (name);
+ start = mono_aot_get_trampoline (name);
g_free (name);
} else {
start = get_delegate_invoke_impl (FALSE, sig->param_count, NULL);
mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
{
if (!tls_offset_inited) {
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
/*
* We need to init this multiple times, since when we are first called, the key might not
* be initialized yet.
*/
appdomain_tls_offset = mono_domain_get_tls_key ();
lmf_tls_offset = mono_get_jit_tls_key ();
- thread_tls_offset = mono_thread_get_tls_key ();
lmf_addr_tls_offset = mono_get_jit_tls_key ();
/* Only 64 tls entries can be accessed using inline code */
appdomain_tls_offset = -1;
if (lmf_tls_offset >= 64)
lmf_tls_offset = -1;
- if (thread_tls_offset >= 64)
- thread_tls_offset = -1;
#else
tls_offset_inited = TRUE;
#ifdef MONO_XEN_OPT
appdomain_tls_offset = mono_domain_get_tls_offset ();
lmf_tls_offset = mono_get_lmf_tls_offset ();
lmf_addr_tls_offset = mono_get_lmf_addr_tls_offset ();
- thread_tls_offset = mono_thread_get_tls_offset ();
#endif
}
}
#ifdef MONO_ARCH_HAVE_IMT
+#if defined(__default_codegen__)
#define CMP_SIZE (6 + 1)
#define CMP_REG_REG_SIZE (4 + 1)
#define BR_SMALL_SIZE 2
#define MOV_REG_IMM_SIZE 10
#define MOV_REG_IMM_32BIT_SIZE 6
#define JUMP_REG_SIZE (2 + 1)
+#elif defined(__native_client_codegen__)
+/* NaCl N-byte instructions can be padded up to N-1 bytes */
+#define CMP_SIZE ((6 + 1) * 2 - 1)
+#define CMP_REG_REG_SIZE ((4 + 1) * 2 - 1)
+#define BR_SMALL_SIZE (2 * 2 - 1)
+#define BR_LARGE_SIZE (6 * 2 - 1)
+#define MOV_REG_IMM_SIZE (10 * 2 - 1)
+#define MOV_REG_IMM_32BIT_SIZE (6 * 2 - 1)
+/* Jump reg for NaCl adds a mask (+4) and add (+3) */
+#define JUMP_REG_SIZE ((2 + 1 + 4 + 3) * 2 - 1)
+/* Jump membase's size is large and unpredictable */
+/* in native client, just pad it out a whole bundle. */
+#define JUMP_MEMBASE_SIZE (kNaClAlignment)
+#endif
static int
imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target)
item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
else
item->chunk_size += MOV_REG_IMM_SIZE;
+#ifdef __native_client_codegen__
+ item->chunk_size += JUMP_MEMBASE_SIZE;
+#endif
}
item->chunk_size += BR_SMALL_SIZE + JUMP_REG_SIZE;
} else {
/* with assert below:
* item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
*/
+#ifdef __native_client_codegen__
+ item->chunk_size += JUMP_MEMBASE_SIZE;
+#endif
}
}
} else {
}
size += item->chunk_size;
}
+#if defined(__native_client__) && defined(__native_client_codegen__)
+ /* In Native Client, we don't re-use thunks, allocate from the */
+ /* normal code manager paths. */
+ code = mono_domain_code_reserve (domain, size);
+#else
if (fail_tramp)
code = mono_method_alloc_generic_virtual_thunk (domain, size);
else
code = mono_domain_code_reserve (domain, size);
+#endif
start = code;
for (i = 0; i < count; ++i) {
MonoIMTCheckItem *item = imt_entries [i];
if (amd64_is_imm32 (item->key))
amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
else {
- amd64_mov_reg_imm (code, AMD64_R10, item->key);
- amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
+ amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key);
+ amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG);
}
}
item->jmp_code = code;
amd64_branch8 (code, X86_CC_NE, 0, FALSE);
- /* See the comment below about R10 */
if (item->has_target_code) {
- amd64_mov_reg_imm (code, AMD64_R10, item->value.target_code);
- amd64_jump_reg (code, AMD64_R10);
+ amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->value.target_code);
+ amd64_jump_reg (code, MONO_ARCH_IMT_SCRATCH_REG);
} else {
- amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
- amd64_jump_membase (code, AMD64_R10, 0);
+ amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, & (vtable->vtable [item->value.vtable_slot]));
+ amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0);
}
if (fail_case) {
amd64_patch (item->jmp_code, code);
- amd64_mov_reg_imm (code, AMD64_R10, fail_tramp);
- amd64_jump_reg (code, AMD64_R10);
+ amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, fail_tramp);
+ amd64_jump_reg (code, MONO_ARCH_IMT_SCRATCH_REG);
item->jmp_code = NULL;
}
} else {
if (amd64_is_imm32 (item->key))
amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
else {
- amd64_mov_reg_imm (code, AMD64_R10, item->key);
- amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
+ amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key);
+ amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG);
}
item->jmp_code = code;
amd64_branch8 (code, X86_CC_NE, 0, FALSE);
/* See the comment below about R10 */
- amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
- amd64_jump_membase (code, AMD64_R10, 0);
+ amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, & (vtable->vtable [item->value.vtable_slot]));
+ amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0);
amd64_patch (item->jmp_code, code);
amd64_breakpoint (code);
item->jmp_code = NULL;
#else
- /* We're using R10 here because R11
+ /* We're using R10 (MONO_ARCH_IMT_SCRATCH_REG) here because R11 (MONO_ARCH_IMT_REG)
needs to be preserved. R10 needs
to be preserved for calls which
require a runtime generic context,
but interface calls don't. */
- amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
- amd64_jump_membase (code, AMD64_R10, 0);
+ amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, & (vtable->vtable [item->value.vtable_slot]));
+ amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0);
#endif
}
} else {
if (amd64_is_imm32 (item->key))
amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
else {
- amd64_mov_reg_imm (code, AMD64_R10, item->key);
- amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
+ amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key);
+ amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG);
}
item->jmp_code = code;
if (x86_is_imm8 (imt_branch_distance (imt_entries, i, item->check_target_idx)))
mono_stats.imt_thunks_size += code - start;
g_assert (code - start <= size);
+ nacl_domain_code_validate(domain, &start, size, &code);
+
return start;
}
{
return (MonoMethod*)regs [MONO_ARCH_IMT_REG];
}
-
-MonoObject*
-mono_arch_find_this_argument (mgreg_t *regs, MonoMethod *method, MonoGenericSharingContext *gsctx)
-{
- return mono_arch_get_this_arg_from_call (gsctx, mono_method_signature (method), regs, NULL);
-}
#endif
MonoVTable*
return (MonoVTable*) regs [MONO_ARCH_RGCTX_REG];
}
+GSList*
+mono_arch_get_cie_program (void)
+{
+ GSList *l = NULL;
+
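+ /* Initial unwind state on amd64: the CFA is rsp + 8 and the return address is stored at CFA - 8. */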
+ mono_add_unwind_op_def_cfa (l, (guint8*)NULL, (guint8*)NULL, AMD64_RSP, 8);
+ mono_add_unwind_op_offset (l, (guint8*)NULL, (guint8*)NULL, AMD64_RIP, -8);
+
+ return l;
+}
+
MonoInst*
mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
return ins;
}
-MonoInst* mono_arch_get_thread_intrinsic (MonoCompile* cfg)
-{
- MonoInst* ins;
-
- if (thread_tls_offset == -1)
- return NULL;
-
- MONO_INST_NEW (cfg, ins, OP_TLS_GET);
- ins->inst_offset = thread_tls_offset;
- return ins;
-}
-
#define _CTX_REG(ctx,fld,i) ((gpointer)((&ctx->fld)[i]))
gpointer
g_assert_not_reached ();
}
}
+
+/*MONO_ARCH_HAVE_HANDLER_BLOCK_GUARD*/
+gpointer
+mono_arch_install_handler_block_guard (MonoJitInfo *ji, MonoJitExceptionInfo *clause, MonoContext *ctx, gpointer new_value)
+{
+ int offset;
+ gpointer *sp, old_value;
+ char *bp;
+ const unsigned char *handler;
+
+ /* Decode the first instruction to figure out where we stored the spvar. */
+ /* Our JIT MUST generate the following:
+ mov %rsp, ?(%rbp)
+
+ which is encoded as: REX.W 0x89 mod_rm
+ where mod_rm encodes (rsp, rbp, imm) and imm is never zero:
+ mod (reg + imm8):  01 reg(rsp): 100 rm(rbp): 101 -> 01100101 (0x65)
+ mod (reg + imm32): 10 reg(rsp): 100 rm(rbp): 101 -> 10100101 (0xA5)
+
+ FIXME: can we generate frameless methods in this case?
+
+ */
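+ /* For example (illustrative values only): if the handler prologue stored the
+ spvar with "mov %rsp, -0x10(%rbp)", the bytes would be 48 89 65 F0
+ (REX.W, 0x89, mod_rm 0x65, disp8 0xF0) and the code below would recover
+ offset = -16. */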
+ handler = clause->handler_start;
+
+ /*REX.W*/
+ if (*handler != 0x48)
+ return NULL;
+ ++handler;
+
+ /*mov r, r/m */
+ if (*handler != 0x89)
+ return NULL;
+ ++handler;
+
+ if (*handler == 0x65)
+ offset = *(signed char*)(handler + 1);
+ else if (*handler == 0xA5)
+ offset = *(int*)(handler + 1);
+ else
+ return NULL;
+
+ /*Load the spvar*/
+ bp = MONO_CONTEXT_GET_BP (ctx);
+ sp = *(gpointer*)(bp + offset);
+
+ old_value = *sp;
+ if (old_value < ji->code_start || (char*)old_value > ((char*)ji->code_start + ji->code_size))
+ return old_value;
+
+ *sp = new_value;
+
+ return old_value;
+}
+
+/*
+ * mono_arch_emit_load_aotconst:
+ *
+ * Emit code to load the contents of the GOT slot identified by TRAMP_TYPE and
+ * TARGET from the mscorlib GOT in full-aot code.
+ * On AMD64, the result is placed into R11.
+ */
+guint8*
+mono_arch_emit_load_aotconst (guint8 *start, guint8 *code, MonoJumpInfo **ji, int tramp_type, gconstpointer target)
+{
+ *ji = mono_patch_info_list_prepend (*ji, code - start, tramp_type, target);
+ amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, 8);
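+ /* The line above emits a RIP-relative load, roughly "mov 0(%rip), %r11";
+ the MonoJumpInfo recorded before it lets the AOT compiler patch the
+ displacement so that it points at the right mscorlib GOT slot. */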
+
+ return code;
+}
+
+/*
+ * mono_arch_get_trampolines:
+ *
+ * Return a list of MonoTrampInfo structures describing the arch-specific trampolines
+ * for AOT.
+ */
+GSList *
+mono_arch_get_trampolines (gboolean aot)
+{
+ return mono_amd64_get_exception_trampolines (aot);
+}
+
+/* Soft Debug support */
+#ifdef MONO_ARCH_SOFT_DEBUG_SUPPORTED
+
+/*
+ * mono_arch_set_breakpoint:
+ *
+ * Set a breakpoint at IP, which should point into the native code of JI.
+ * The location should contain code emitted by OP_SEQ_POINT.
+ */
+void
+mono_arch_set_breakpoint (MonoJitInfo *ji, guint8 *ip)
+{
+ guint8 *code = ip;
+ guint8 *orig_code = code;
+
+ /*
+ * In production we will use int3 (which requires fixing the instruction size
+ * in the machine description file), but that could confuse gdb, so during
+ * development we emit a faulting memory load (SIGSEGV) instead.
+ */
+ g_assert (code [0] == 0x90);
+ if (breakpoint_size == 8) {
+ amd64_mov_reg_mem (code, AMD64_R11, (guint64)bp_trigger_page, 4);
+ } else {
+ amd64_mov_reg_imm_size (code, AMD64_R11, (guint64)bp_trigger_page, 8);
+ amd64_mov_reg_membase (code, AMD64_R11, AMD64_R11, 0, 4);
+ }
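+
+ /* Both sequences end with a 4 byte load from bp_trigger_page; while that
+ page is inaccessible the load faults, and mono_arch_is_breakpoint_event ()
+ below recognizes the faulting address as belonging to the trigger page. */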
+
+ g_assert (code - orig_code == breakpoint_size);
+}
+
+/*
+ * mono_arch_clear_breakpoint:
+ *
+ * Clear the breakpoint at IP.
+ */
+void
+mono_arch_clear_breakpoint (MonoJitInfo *ji, guint8 *ip)
+{
+ guint8 *code = ip;
+ int i;
+
+ for (i = 0; i < breakpoint_size; ++i)
+ x86_nop (code);
+}
+
+gboolean
+mono_arch_is_breakpoint_event (void *info, void *sigctx)
+{
+#ifdef HOST_WIN32
+ EXCEPTION_RECORD* einfo = (EXCEPTION_RECORD*)info;
+ return FALSE;
+#else
+ siginfo_t* sinfo = (siginfo_t*) info;
+ /* Sometimes the address is off by 4 */
+ if (sinfo->si_addr >= bp_trigger_page && (guint8*)sinfo->si_addr <= (guint8*)bp_trigger_page + 128)
+ return TRUE;
+ else
+ return FALSE;
+#endif
+}
+
+/*
+ * mono_arch_get_ip_for_breakpoint:
+ *
+ * Convert the ip in CTX to the address where a breakpoint was placed.
+ */
+guint8*
+mono_arch_get_ip_for_breakpoint (MonoJitInfo *ji, MonoContext *ctx)
+{
+ guint8 *ip = MONO_CONTEXT_GET_IP (ctx);
+
+ /* ip points to the instruction causing the fault */
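+ /* Stepping back by (breakpoint_size - breakpoint_fault_size) yields the
+ address where mono_arch_set_breakpoint () started patching, i.e. the
+ address registered for the breakpoint. */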
+ ip -= (breakpoint_size - breakpoint_fault_size);
+
+ return ip;
+}
+
+/*
+ * mono_arch_skip_breakpoint:
+ *
+ * Modify CTX so the ip is placed after the breakpoint instruction, so when
+ * we resume, the instruction is not executed again.
+ */
+void
+mono_arch_skip_breakpoint (MonoContext *ctx)
+{
+ MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + breakpoint_fault_size);
+}
+
+/*
+ * mono_arch_start_single_stepping:
+ *
+ * Start single stepping.
+ */
+void
+mono_arch_start_single_stepping (void)
+{
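+ /* The loads emitted by OP_SEQ_POINT read from ss_trigger_page; removing all
+ access here makes them fault at every sequence point, while
+ mono_arch_stop_single_stepping () restores read access to make them
+ harmless again. */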
+ mono_mprotect (ss_trigger_page, mono_pagesize (), 0);
+}
+
+/*
+ * mono_arch_stop_single_stepping:
+ *
+ * Stop single stepping.
+ */
+void
+mono_arch_stop_single_stepping (void)
+{
+ mono_mprotect (ss_trigger_page, mono_pagesize (), MONO_MMAP_READ);
+}
+
+/*
+ * mono_arch_is_single_step_event:
+ *
+ * Return whether the machine state in SIGCTX corresponds to a single
+ * step event.
+ */
+gboolean
+mono_arch_is_single_step_event (void *info, void *sigctx)
+{
+#ifdef HOST_WIN32
+ EXCEPTION_RECORD* einfo = (EXCEPTION_RECORD*)info;
+ return FALSE;
+#else
+ siginfo_t* sinfo = (siginfo_t*) info;
+ /* Sometimes the address is off by 4 */
+ if (sinfo->si_addr >= ss_trigger_page && (guint8*)sinfo->si_addr <= (guint8*)ss_trigger_page + 128)
+ return TRUE;
+ else
+ return FALSE;
+#endif
+}
+
+/*
+ * mono_arch_get_ip_for_single_step:
+ *
+ * Convert the ip in CTX to the address stored in seq_points.
+ */
+guint8*
+mono_arch_get_ip_for_single_step (MonoJitInfo *ji, MonoContext *ctx)
+{
+ guint8 *ip = MONO_CONTEXT_GET_IP (ctx);
+
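+ /* The fault leaves ip at the trigger load itself, while seq_points record
+ the address of the instruction following it, so skip over the faulting
+ instruction. */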
+ ip += single_step_fault_size;
+
+ return ip;
+}
+
+/*
+ * mono_arch_skip_single_step:
+ *
+ * Modify CTX so the ip is placed after the single step trigger instruction,
+ * so when we resume, the instruction is not executed again.
+ */
+void
+mono_arch_skip_single_step (MonoContext *ctx)
+{
+ MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + single_step_fault_size);
+}
+
+/*
+ * mono_arch_get_seq_point_info:
+ *
+ * Return a pointer to a data structure which is used by the sequence
+ * point implementation in AOTed code.
+ */
+gpointer
+mono_arch_get_seq_point_info (MonoDomain *domain, guint8 *code)
+{
+ NOT_IMPLEMENTED;
+ return NULL;
+}
+
+#endif