diff --git a/mono/mini/mini-amd64.c b/mono/mini/mini-amd64.c
index 77486b45261..49c3a53c715 100644
--- a/mono/mini/mini-amd64.c
+++ b/mono/mini/mini-amd64.c
@@ -23,18 +23,16 @@
 #include 
 #include 
 #include 
+#include 
 #include 
+#include 
 #include "trace.h"
 #include "ir-emit.h"
 #include "mini-amd64.h"
 #include "cpu-amd64.h"
-
-/*
- * Can't define this in mini-amd64.h cause that would turn on the generic code in
- * method-to-ir.c.
- */
-#define MONO_ARCH_IMT_REG AMD64_R11
+#include "debugger-agent.h"
+#include "mini-gc.h"
 
 static gint lmf_tls_offset = -1;
 static gint lmf_addr_tls_offset = -1;
@@ -52,7 +50,7 @@ static gboolean optimize_for_xen = TRUE;
 
 #define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f))
 
-#ifdef PLATFORM_WIN32
+#ifdef HOST_WIN32
 /* Under windows, the calling convention is never stdcall */
 #define CALLCONV_IS_STDCALL(call_conv) (FALSE)
 #else
@@ -67,7 +65,25 @@ static CRITICAL_SECTION mini_arch_mutex;
 
 MonoBreakpointInfo
 mono_breakpoint_info [MONO_BREAKPOINT_ARRAY_SIZE];
 
-#ifdef PLATFORM_WIN32
+/*
+ * The code generated for sequence points reads from this location, which is
+ * made read-only when single stepping is enabled.
+ */
+static gpointer ss_trigger_page;
+
+/* Enabled breakpoints read from this trigger page */
+static gpointer bp_trigger_page;
+
+/* The size of the breakpoint sequence */
+static int breakpoint_size;
+
+/* The size of the breakpoint instruction causing the actual fault */
+static int breakpoint_fault_size;
+
+/* The size of the single step instruction causing the actual fault */
+static int single_step_fault_size;
+
+#ifdef HOST_WIN32
 /* On Win64 always reserve first 32 bytes for first four arguments */
 #define ARGS_OFFSET 48
 #else
@@ -189,11 +205,278 @@ amd64_is_near_call (guint8 *code)
 	return code [0] == 0xe8;
 }
 
+#ifdef __native_client_codegen__
+
+/* Keep track of instruction "depth", that is, the level of sub-instruction */
+/* for any given instruction. For instance, amd64_call_reg resolves to */
+/* amd64_call_reg_internal, which uses amd64_alu_* macros, etc. */
+/* We only want to force bundle alignment for the top level instruction, */
+/* so NaCl pseudo-instructions can be implemented with sub instructions. 
*/ +static guint32 nacl_instruction_depth; + +static guint32 nacl_rex_tag; +static guint32 nacl_legacy_prefix_tag; + +void +amd64_nacl_clear_legacy_prefix_tag () +{ + TlsSetValue (nacl_legacy_prefix_tag, NULL); +} + +void +amd64_nacl_tag_legacy_prefix (guint8* code) +{ + if (TlsGetValue (nacl_legacy_prefix_tag) == NULL) + TlsSetValue (nacl_legacy_prefix_tag, code); +} + +void +amd64_nacl_tag_rex (guint8* code) +{ + TlsSetValue (nacl_rex_tag, code); +} + +guint8* +amd64_nacl_get_legacy_prefix_tag () +{ + return (guint8*)TlsGetValue (nacl_legacy_prefix_tag); +} + +guint8* +amd64_nacl_get_rex_tag () +{ + return (guint8*)TlsGetValue (nacl_rex_tag); +} + +/* Increment the instruction "depth" described above */ +void +amd64_nacl_instruction_pre () +{ + intptr_t depth = (intptr_t) TlsGetValue (nacl_instruction_depth); + depth++; + TlsSetValue (nacl_instruction_depth, (gpointer)depth); +} + +/* amd64_nacl_instruction_post: Decrement instruction "depth", force bundle */ +/* alignment if depth == 0 (top level instruction) */ +/* IN: start, end pointers to instruction beginning and end */ +/* OUT: start, end pointers to beginning and end after possible alignment */ +/* GLOBALS: nacl_instruction_depth defined above */ +void +amd64_nacl_instruction_post (guint8 **start, guint8 **end) +{ + intptr_t depth = (intptr_t) TlsGetValue(nacl_instruction_depth); + depth--; + TlsSetValue (nacl_instruction_depth, (void*)depth); + + g_assert ( depth >= 0 ); + if (depth == 0) { + uintptr_t space_in_block; + uintptr_t instlen; + guint8 *prefix = amd64_nacl_get_legacy_prefix_tag (); + /* if legacy prefix is present, and if it was emitted before */ + /* the start of the instruction sequence, adjust the start */ + if (prefix != NULL && prefix < *start) { + g_assert (*start - prefix <= 3);/* only 3 are allowed */ + *start = prefix; + } + space_in_block = kNaClAlignment - ((uintptr_t)(*start) & kNaClAlignmentMask); + instlen = (uintptr_t)(*end - *start); + /* Only check for instructions which are less than */ + /* kNaClAlignment. The only instructions that should ever */ + /* be that long are call sequences, which are already */ + /* padded out to align the return to the next bundle. */ + if (instlen > space_in_block && instlen < kNaClAlignment) { + const size_t MAX_NACL_INST_LENGTH = kNaClAlignment; + guint8 copy_of_instruction[MAX_NACL_INST_LENGTH]; + const size_t length = (size_t)((*end)-(*start)); + g_assert (length < MAX_NACL_INST_LENGTH); + + memcpy (copy_of_instruction, *start, length); + *start = mono_arch_nacl_pad (*start, space_in_block); + memcpy (*start, copy_of_instruction, length); + *end = *start + length; + } + amd64_nacl_clear_legacy_prefix_tag (); + amd64_nacl_tag_rex (NULL); + } +} + +/* amd64_nacl_membase_handler: ensure all access to memory of the form */ +/* OFFSET(%rXX) is sandboxed. For allowable base registers %rip, %rbp, */ +/* %rsp, and %r15, emit the membase as usual. 
For all other registers, */ +/* make sure the upper 32-bits are cleared, and use that register in the */ +/* index field of a new address of this form: OFFSET(%r15,%eXX,1) */ +/* IN: code */ +/* pointer to current instruction stream (in the */ +/* middle of an instruction, after opcode is emitted) */ +/* basereg/offset/dreg */ +/* operands of normal membase address */ +/* OUT: code */ +/* pointer to the end of the membase/memindex emit */ +/* GLOBALS: nacl_rex_tag */ +/* position in instruction stream that rex prefix was emitted */ +/* nacl_legacy_prefix_tag */ +/* (possibly NULL) position in instruction of legacy x86 prefix */ +void +amd64_nacl_membase_handler (guint8** code, gint8 basereg, gint32 offset, gint8 dreg) +{ + gint8 true_basereg = basereg; + + /* Cache these values, they might change */ + /* as new instructions are emitted below. */ + guint8* rex_tag = amd64_nacl_get_rex_tag (); + guint8* legacy_prefix_tag = amd64_nacl_get_legacy_prefix_tag (); + + /* 'basereg' is given masked to 0x7 at this point, so check */ + /* the rex prefix to see if this is an extended register. */ + if ((rex_tag != NULL) && IS_REX(*rex_tag) && (*rex_tag & AMD64_REX_B)) { + true_basereg |= 0x8; + } + +#define X86_LEA_OPCODE (0x8D) + + if (!amd64_is_valid_nacl_base (true_basereg) && (*(*code-1) != X86_LEA_OPCODE)) { + guint8* old_instruction_start; + + /* This will hold the 'mov %eXX, %eXX' that clears the upper */ + /* 32-bits of the old base register (new index register) */ + guint8 buf[32]; + guint8* buf_ptr = buf; + size_t insert_len; + + g_assert (rex_tag != NULL); + + if (IS_REX(*rex_tag)) { + /* The old rex.B should be the new rex.X */ + if (*rex_tag & AMD64_REX_B) { + *rex_tag |= AMD64_REX_X; + } + /* Since our new base is %r15 set rex.B */ + *rex_tag |= AMD64_REX_B; + } else { + /* Shift the instruction by one byte */ + /* so we can insert a rex prefix */ + memmove (rex_tag + 1, rex_tag, (size_t)(*code - rex_tag)); + *code += 1; + /* New rex prefix only needs rex.B for %r15 base */ + *rex_tag = AMD64_REX(AMD64_REX_B); + } + + if (legacy_prefix_tag) { + old_instruction_start = legacy_prefix_tag; + } else { + old_instruction_start = rex_tag; + } + + /* Clears the upper 32-bits of the previous base register */ + amd64_mov_reg_reg_size (buf_ptr, true_basereg, true_basereg, 4); + insert_len = buf_ptr - buf; + + /* Move the old instruction forward to make */ + /* room for 'mov' stored in 'buf_ptr' */ + memmove (old_instruction_start + insert_len, old_instruction_start, (size_t)(*code - old_instruction_start)); + *code += insert_len; + memcpy (old_instruction_start, buf, insert_len); + + /* Sandboxed replacement for the normal membase_emit */ + x86_memindex_emit (*code, dreg, AMD64_R15, offset, basereg, 0); + + } else { + /* Normal default behavior, emit membase memory location */ + x86_membase_emit_body (*code, dreg, basereg, offset); + } +} + + +static inline unsigned char* +amd64_skip_nops (unsigned char* code) +{ + guint8 in_nop; + do { + in_nop = 0; + if ( code[0] == 0x90) { + in_nop = 1; + code += 1; + } + if ( code[0] == 0x66 && code[1] == 0x90) { + in_nop = 1; + code += 2; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x00) { + in_nop = 1; + code += 3; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x40 && code[3] == 0x00) { + in_nop = 1; + code += 4; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x44 && code[3] == 0x00 + && code[4] == 0x00) { + in_nop = 1; + code += 5; + } + if (code[0] == 0x66 && code[1] == 0x0f + && code[2] == 0x1f && code[3] == 
0x44 + && code[4] == 0x00 && code[5] == 0x00) { + in_nop = 1; + code += 6; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x80 && code[3] == 0x00 + && code[4] == 0x00 && code[5] == 0x00 + && code[6] == 0x00) { + in_nop = 1; + code += 7; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x84 && code[3] == 0x00 + && code[4] == 0x00 && code[5] == 0x00 + && code[6] == 0x00 && code[7] == 0x00) { + in_nop = 1; + code += 8; + } + } while ( in_nop ); + return code; +} + +guint8* +mono_arch_nacl_skip_nops (guint8* code) +{ + return amd64_skip_nops(code); +} + +#endif /*__native_client_codegen__*/ + static inline void amd64_patch (unsigned char* code, gpointer target) { guint8 rex = 0; +#ifdef __native_client_codegen__ + code = amd64_skip_nops (code); +#endif +#if defined(__native_client_codegen__) && defined(__native_client__) + if (nacl_is_code_address (code)) { + /* For tail calls, code is patched after being installed */ + /* but not through the normal "patch callsite" method. */ + unsigned char buf[kNaClAlignment]; + unsigned char *aligned_code = (uintptr_t)code & ~kNaClAlignmentMask; + int ret; + memcpy (buf, aligned_code, kNaClAlignment); + /* Patch a temp buffer of bundle size, */ + /* then install to actual location. */ + amd64_patch (buf + ((uintptr_t)code - (uintptr_t)aligned_code), target); + ret = nacl_dyncode_modify (aligned_code, buf, kNaClAlignment); + g_assert (ret == 0); + return; + } + target = nacl_modify_patch_target (target); +#endif + /* Skip REX */ if ((code [0] >= 0x40) && (code [0] <= 0x4f)) { rex = code [0]; @@ -246,6 +529,7 @@ typedef struct { /* Only if storage == ArgValuetypeInReg */ ArgStorage pair_storage [2]; gint8 pair_regs [2]; + int nregs; } ArgInfo; typedef struct { @@ -255,6 +539,8 @@ typedef struct { guint32 freg_usage; gboolean need_stack_align; gboolean vtype_retaddr; + /* The index of the vret arg in the argument list */ + int vret_arg_index; ArgInfo ret; ArgInfo sig_cookie; ArgInfo args [1]; @@ -262,7 +548,7 @@ typedef struct { #define DEBUG(a) if (cfg->verbose_level > 1) a -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 #define PARAM_REGS 4 static AMD64_Reg_No param_regs [] = { AMD64_RCX, AMD64_RDX, AMD64_R8, AMD64_R9 }; @@ -283,7 +569,9 @@ add_general (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo) if (*gr >= PARAM_REGS) { ainfo->storage = ArgOnStack; - (*stack_size) += sizeof (gpointer); + /* Since the same stack slot size is used for all arg */ + /* types, it needs to be big enough to hold them all */ + (*stack_size) += sizeof(mgreg_t); } else { ainfo->storage = ArgInIReg; @@ -292,7 +580,7 @@ add_general (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo) } } -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 #define FLOAT_PARAM_REGS 4 #else #define FLOAT_PARAM_REGS 8 @@ -305,7 +593,9 @@ add_float (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo, gboolean is_double) if (*gr >= FLOAT_PARAM_REGS) { ainfo->storage = ArgOnStack; - (*stack_size) += sizeof (gpointer); + /* Since the same stack slot size is used for both float */ + /* types, it needs to be big enough to hold them both */ + (*stack_size) += sizeof(mgreg_t); } else { /* A double register */ @@ -356,7 +646,7 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1) break; case MONO_TYPE_R4: case MONO_TYPE_R8: -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 class2 = ARG_CLASS_INTEGER; #else class2 = ARG_CLASS_SSE; @@ -400,6 +690,32 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1) return class1; } +#ifdef __native_client_codegen__ +const guint 
kNaClAlignment = kNaClAlignmentAMD64; +const guint kNaClAlignmentMask = kNaClAlignmentMaskAMD64; + +/* Default alignment for Native Client is 32-byte. */ +gint8 nacl_align_byte = -32; /* signed version of 0xe0 */ + +/* mono_arch_nacl_pad: Add pad bytes of alignment instructions at code, */ +/* Check that alignment doesn't cross an alignment boundary. */ +guint8* +mono_arch_nacl_pad(guint8 *code, int pad) +{ + const int kMaxPadding = 8; /* see amd64-codegen.h:amd64_padding_size() */ + + if (pad == 0) return code; + /* assertion: alignment cannot cross a block boundary */ + g_assert (((uintptr_t)code & (~kNaClAlignmentMask)) == + (((uintptr_t)code + pad - 1) & (~kNaClAlignmentMask))); + while (pad >= kMaxPadding) { + amd64_padding (code, kMaxPadding); + pad -= kMaxPadding; + } + if (pad != 0) amd64_padding (code, pad); + return code; +} +#endif static void add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type, @@ -407,6 +723,9 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn guint32 *gr, guint32 *fr, guint32 *stack_size) { guint32 size, quad, nquads, i; + /* Keep track of the size used in each quad so we can */ + /* use the right size when copying args/return vars. */ + guint32 quadsize [2] = {8, 8}; ArgumentClass args [2]; MonoMarshalType *info = NULL; MonoClass *klass; @@ -423,7 +742,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn klass = mono_class_from_mono_type (type); size = mini_type_stack_size_full (gsctx, &klass->byval_arg, NULL, sig->pinvoke); -#ifndef PLATFORM_WIN32 +#ifndef HOST_WIN32 if (!sig->pinvoke && !disable_vtypes_in_regs && ((is_return && (size == 8)) || (!is_return && (size <= 16)))) { /* We pass and return vtypes of size 8 in a register */ } else if (!sig->pinvoke || (size == 0) || (size > 16)) { @@ -435,6 +754,24 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn } #endif + /* If this struct can't be split up naturally into 8-byte */ + /* chunks (registers), pass it on the stack. 
*/ + if (sig->pinvoke && !pass_on_stack) { + info = mono_marshal_load_type_info (klass); + g_assert(info); + guint32 align; + guint32 field_size; + for (i = 0; i < info->num_fields; ++i) { + field_size = mono_marshal_type_size (info->fields [i].field->type, + info->fields [i].mspec, + &align, TRUE, klass->unicode); + if ((info->fields [i].offset < 8) && (info->fields [i].offset + field_size) > 8) { + pass_on_stack = TRUE; + break; + } + } + } + if (pass_on_stack) { /* Allways pass in memory */ ainfo->offset = *stack_size; @@ -471,7 +808,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn info = mono_marshal_load_type_info (klass); g_assert (info); -#ifndef PLATFORM_WIN32 +#ifndef HOST_WIN32 if (info->native_size > 16) { ainfo->offset = *stack_size; *stack_size += ALIGN_TO (info->native_size, 8); @@ -534,6 +871,10 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn if ((quad == 1) && (info->fields [i].offset < 8)) continue; + /* How far into this quad this data extends.*/ + /* (8 is size of quad) */ + quadsize [quad] = info->fields [i].offset + size - (quad * 8); + class1 = merge_argument_class_from_type (info->fields [i].field->type, class1); } g_assert (class1 != ARG_CLASS_NO_CLASS); @@ -552,6 +893,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn ainfo->storage = ArgValuetypeInReg; ainfo->pair_storage [0] = ainfo->pair_storage [1] = ArgNone; + ainfo->nregs = nquads; for (quad = 0; quad < nquads; ++quad) { switch (args [quad]) { case ARG_CLASS_INTEGER: @@ -570,7 +912,9 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn if (*fr >= FLOAT_PARAM_REGS) args [quad] = ARG_CLASS_MEMORY; else { - ainfo->pair_storage [quad] = ArgInDoubleSSEReg; + if (quadsize[quad] <= 4) + ainfo->pair_storage [quad] = ArgInFloatSSEReg; + else ainfo->pair_storage [quad] = ArgInDoubleSSEReg; ainfo->pair_regs [quad] = *fr; (*fr) ++; } @@ -591,7 +935,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn if (sig->pinvoke) *stack_size += ALIGN_TO (info->native_size, 8); else - *stack_size += nquads * sizeof (gpointer); + *stack_size += nquads * sizeof(mgreg_t); ainfo->storage = ArgOnStack; } } @@ -605,13 +949,14 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn * Draft Version 0.23" document for more information. 
*/ static CallInfo* -get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke) +get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSignature *sig) { - guint32 i, gr, fr; + guint32 i, gr, fr, pstart; MonoType *ret_type; int n = sig->hasthis + sig->param_count; guint32 stack_size = 0; CallInfo *cinfo; + gboolean is_pinvoke = sig->pinvoke; if (mp) cinfo = mono_mempool_alloc0 (mp, sizeof (CallInfo) + (sizeof (ArgInfo) * n)); @@ -661,7 +1006,7 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign cinfo->ret.reg = AMD64_XMM0; break; case MONO_TYPE_GENERICINST: - if (!mono_type_generic_inst_is_valuetype (sig->ret)) { + if (!mono_type_generic_inst_is_valuetype (ret_type)) { cinfo->ret.storage = ArgInIReg; cinfo->ret.reg = AMD64_RAX; break; @@ -674,14 +1019,12 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign if (cinfo->ret.storage == ArgOnStack) { cinfo->vtype_retaddr = TRUE; /* The caller passes the address where the value is stored */ - add_general (&gr, &stack_size, &cinfo->ret); } break; } case MONO_TYPE_TYPEDBYREF: /* Same as a valuetype with size 24 */ - add_general (&gr, &stack_size, &cinfo->ret); - ; + cinfo->vtype_retaddr = TRUE; break; case MONO_TYPE_VOID: break; @@ -690,9 +1033,31 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign } } - /* this */ - if (sig->hasthis) - add_general (&gr, &stack_size, cinfo->args + 0); + pstart = 0; + /* + * To simplify get_this_arg_reg () and LLVM integration, emit the vret arg after + * the first argument, allowing 'this' to be always passed in the first arg reg. + * Also do this if the first argument is a reference type, since virtual calls + * are sometimes made using calli without sig->hasthis set, like in the delegate + * invoke wrappers. 
+ */ + if (cinfo->vtype_retaddr && !is_pinvoke && (sig->hasthis || (sig->param_count > 0 && MONO_TYPE_IS_REFERENCE (mini_type_get_underlying_type (gsctx, sig->params [0]))))) { + if (sig->hasthis) { + add_general (&gr, &stack_size, cinfo->args + 0); + } else { + add_general (&gr, &stack_size, &cinfo->args [sig->hasthis + 0]); + pstart = 1; + } + add_general (&gr, &stack_size, &cinfo->ret); + cinfo->vret_arg_index = 1; + } else { + /* this */ + if (sig->hasthis) + add_general (&gr, &stack_size, cinfo->args + 0); + + if (cinfo->vtype_retaddr) + add_general (&gr, &stack_size, &cinfo->ret); + } if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == 0)) { gr = PARAM_REGS; @@ -702,11 +1067,11 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign add_general (&gr, &stack_size, &cinfo->sig_cookie); } - for (i = 0; i < sig->param_count; ++i) { + for (i = pstart; i < sig->param_count; ++i) { ArgInfo *ainfo = &cinfo->args [sig->hasthis + i]; MonoType *ptype; -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 /* The float param registers and other param registers must be the same index on Windows x64.*/ if (gr > fr) fr = gr; @@ -727,10 +1092,6 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign add_general (&gr, &stack_size, &cinfo->sig_cookie); } - if (sig->params [i]->byref) { - add_general (&gr, &stack_size, ainfo); - continue; - } ptype = mini_type_get_underlying_type (gsctx, sig->params [i]); switch (ptype->type) { case MONO_TYPE_BOOLEAN: @@ -768,7 +1129,7 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign add_valuetype (gsctx, sig, ainfo, sig->params [i], FALSE, &gr, &fr, &stack_size); break; case MONO_TYPE_TYPEDBYREF: -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 add_valuetype (gsctx, sig, ainfo, sig->params [i], FALSE, &gr, &fr, &stack_size); #else stack_size += sizeof (MonoTypedRef); @@ -798,16 +1159,18 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign add_general (&gr, &stack_size, &cinfo->sig_cookie); } -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 // There always is 32 bytes reserved on the stack when calling on Winx64 stack_size += 0x20; #endif +#ifndef MONO_AMD64_NO_PUSHES if (stack_size & 0x8) { /* The AMD64 ABI requires each stack frame to be 16 byte aligned */ cinfo->need_stack_align = TRUE; stack_size += 8; } +#endif cinfo->stack_usage = stack_size; cinfo->reg_usage = gr; @@ -830,7 +1193,7 @@ int mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJitArgumentInfo *arg_info) { int k; - CallInfo *cinfo = get_call_info (NULL, NULL, csig, FALSE); + CallInfo *cinfo = get_call_info (NULL, NULL, csig); guint32 args_size = cinfo->stack_usage; /* The arguments are saved to a stack area in mono_arch_instrument_prolog */ @@ -849,9 +1212,31 @@ mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJit return args_size; } +gboolean +mono_amd64_tail_call_supported (MonoMethodSignature *caller_sig, MonoMethodSignature *callee_sig) +{ + CallInfo *c1, *c2; + gboolean res; + + c1 = get_call_info (NULL, NULL, caller_sig); + c2 = get_call_info (NULL, NULL, callee_sig); + res = c1->stack_usage >= c2->stack_usage; + if (callee_sig->ret && MONO_TYPE_ISSTRUCT (callee_sig->ret) && c2->ret.storage != ArgValuetypeInReg) + /* An address on the callee's stack is passed as the first argument */ + res = FALSE; + + g_free (c1); + g_free (c2); + + return res; +} + static int cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx) { 
+#if defined(MONO_CROSS_COMPILE) + return 0; +#else #ifndef _MSC_VER __asm__ __volatile__ ("cpuid" : "=a" (*p_eax), "=b" (*p_ebx), "=c" (*p_ecx), "=d" (*p_edx) @@ -865,6 +1250,7 @@ cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx) *p_edx = info[3]; #endif return 1; +#endif } /* @@ -895,7 +1281,38 @@ mono_arch_cpu_init (void) void mono_arch_init (void) { + int flags; + InitializeCriticalSection (&mini_arch_mutex); +#if defined(__native_client_codegen__) + nacl_instruction_depth = TlsAlloc (); + TlsSetValue (nacl_instruction_depth, (gpointer)0); + nacl_rex_tag = TlsAlloc (); + nacl_legacy_prefix_tag = TlsAlloc (); +#endif + +#ifdef MONO_ARCH_NOMAP32BIT + flags = MONO_MMAP_READ; + /* amd64_mov_reg_imm () + amd64_mov_reg_membase () */ + breakpoint_size = 13; + breakpoint_fault_size = 3; + /* amd64_alu_membase_imm_size (code, X86_CMP, AMD64_R11, 0, 0, 4); */ + single_step_fault_size = 5; +#else + flags = MONO_MMAP_READ|MONO_MMAP_32BIT; + /* amd64_mov_reg_mem () */ + breakpoint_size = 8; + breakpoint_fault_size = 8; + single_step_fault_size = 8; +#endif + + ss_trigger_page = mono_valloc (NULL, mono_pagesize (), flags); + bp_trigger_page = mono_valloc (NULL, mono_pagesize (), flags); + mono_mprotect (bp_trigger_page, mono_pagesize (), 0); + + mono_aot_register_jit_icall ("mono_amd64_throw_exception", mono_amd64_throw_exception); + mono_aot_register_jit_icall ("mono_amd64_throw_corlib_exception", mono_amd64_throw_corlib_exception); + mono_aot_register_jit_icall ("mono_amd64_get_original_ip", mono_amd64_get_original_ip); } /* @@ -905,6 +1322,11 @@ void mono_arch_cleanup (void) { DeleteCriticalSection (&mini_arch_mutex); +#if defined(__native_client_codegen__) + TlsFree (nacl_instruction_depth); + TlsFree (nacl_rex_tag); + TlsFree (nacl_legacy_prefix_tag); +#endif } /* @@ -916,8 +1338,6 @@ mono_arch_cpu_optimizazions (guint32 *exclude_mask) int eax, ebx, ecx, edx; guint32 opts = 0; - /* FIXME: AMD64 */ - *exclude_mask = 0; /* Feature Flags function, flags returned in EDX. */ if (cpuid (1, &eax, &ebx, &ecx, &edx)) { @@ -939,7 +1359,6 @@ mono_arch_cpu_optimizazions (guint32 *exclude_mask) * * Returns a bitmask corresponding to all supported versions. * - * TODO detect other versions like SSE4a. */ guint32 mono_arch_cpu_enumerate_simd_versions (void) @@ -949,21 +1368,36 @@ mono_arch_cpu_enumerate_simd_versions (void) if (cpuid (1, &eax, &ebx, &ecx, &edx)) { if (edx & (1 << 25)) - sse_opts |= 1 << SIMD_VERSION_SSE1; + sse_opts |= SIMD_VERSION_SSE1; if (edx & (1 << 26)) - sse_opts |= 1 << SIMD_VERSION_SSE2; + sse_opts |= SIMD_VERSION_SSE2; if (ecx & (1 << 0)) - sse_opts |= 1 << SIMD_VERSION_SSE3; + sse_opts |= SIMD_VERSION_SSE3; if (ecx & (1 << 9)) - sse_opts |= 1 << SIMD_VERSION_SSSE3; + sse_opts |= SIMD_VERSION_SSSE3; if (ecx & (1 << 19)) - sse_opts |= 1 << SIMD_VERSION_SSE41; + sse_opts |= SIMD_VERSION_SSE41; if (ecx & (1 << 20)) - sse_opts |= 1 << SIMD_VERSION_SSE42; + sse_opts |= SIMD_VERSION_SSE42; + } + + /* Yes, all this needs to be done to check for sse4a. 
+ See: "Amd: CPUID Specification" + */ + if (cpuid (0x80000000, &eax, &ebx, &ecx, &edx)) { + /* eax greater or equal than 0x80000001, ebx = 'htuA', ecx = DMAc', edx = 'itne'*/ + if ((((unsigned int) eax) >= 0x80000001) && (ebx == 0x68747541) && (ecx == 0x444D4163) && (edx == 0x69746E65)) { + cpuid (0x80000001, &eax, &ebx, &ecx, &edx); + if (ecx & (1 << 6)) + sse_opts |= SIMD_VERSION_SSE4a; + } } + return sse_opts; } +#ifndef DISABLE_JIT + GList * mono_arch_get_allocatable_int_vars (MonoCompile *cfg) { @@ -1010,12 +1444,12 @@ mono_arch_compute_omit_fp (MonoCompile *cfg) if (cfg->arch.omit_fp_computed) return; - header = mono_method_get_header (cfg->method); + header = cfg->header; sig = mono_method_signature (cfg->method); if (!cfg->arch.cinfo) - cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE); + cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig); cinfo = cfg->arch.cinfo; /* @@ -1024,6 +1458,13 @@ mono_arch_compute_omit_fp (MonoCompile *cfg) cfg->arch.omit_fp = TRUE; cfg->arch.omit_fp_computed = TRUE; +#ifdef __native_client_codegen__ + /* NaCl modules may not change the value of RBP, so it cannot be */ + /* used as a normal register, but it can be used as a frame pointer*/ + cfg->disable_omit_fp = TRUE; + cfg->arch.omit_fp = FALSE; +#endif + if (cfg->disable_omit_fp) cfg->arch.omit_fp = FALSE; @@ -1080,7 +1521,9 @@ mono_arch_get_global_int_regs (MonoCompile *cfg) regs = g_list_prepend (regs, (gpointer)AMD64_R12); regs = g_list_prepend (regs, (gpointer)AMD64_R13); regs = g_list_prepend (regs, (gpointer)AMD64_R14); +#ifndef __native_client_codegen__ regs = g_list_prepend (regs, (gpointer)AMD64_R15); +#endif regs = g_list_prepend (regs, (gpointer)AMD64_R10); regs = g_list_prepend (regs, (gpointer)AMD64_R9); @@ -1099,8 +1542,10 @@ mono_arch_get_global_int_regs (MonoCompile *cfg) regs = g_list_prepend (regs, (gpointer)AMD64_R12); regs = g_list_prepend (regs, (gpointer)AMD64_R13); regs = g_list_prepend (regs, (gpointer)AMD64_R14); +#ifndef __native_client_codegen__ regs = g_list_prepend (regs, (gpointer)AMD64_R15); -#ifdef PLATFORM_WIN32 +#endif +#ifdef HOST_WIN32 regs = g_list_prepend (regs, (gpointer)AMD64_RDI); regs = g_list_prepend (regs, (gpointer)AMD64_RSI); #endif @@ -1135,7 +1580,9 @@ mono_arch_get_iregs_clobbered_by_call (MonoCallInst *call) regs = g_list_prepend (regs, (gpointer)AMD64_R12); regs = g_list_prepend (regs, (gpointer)AMD64_R13); regs = g_list_prepend (regs, (gpointer)AMD64_R14); +#ifndef __native_client_codegen__ regs = g_list_prepend (regs, (gpointer)AMD64_R15); +#endif regs = g_list_prepend (regs, (gpointer)AMD64_R10); regs = g_list_prepend (regs, (gpointer)AMD64_R9); @@ -1206,7 +1653,7 @@ mono_arch_fill_argument_info (MonoCompile *cfg) int i; CallInfo *cinfo; - header = mono_method_get_header (cfg->method); + header = cfg->header; sig = mono_method_signature (cfg->method); @@ -1286,7 +1733,7 @@ mono_arch_allocate_vars (MonoCompile *cfg) gint32 *offsets; CallInfo *cinfo; - header = mono_method_get_header (cfg->method); + header = cfg->header; sig = mono_method_signature (cfg->method); @@ -1336,7 +1783,7 @@ mono_arch_allocate_vars (MonoCompile *cfg) /* Reserve space for caller saved registers */ for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { - offset += sizeof (gpointer); + offset += sizeof(mgreg_t); } } @@ -1393,6 +1840,14 @@ mono_arch_allocate_vars (MonoCompile *cfg) /* Allocate locals */ if (!cfg->globalra) { offsets = 
mono_allocate_stack_slots_full (cfg, cfg->arch.omit_fp ? FALSE: TRUE, &locals_stack_size, &locals_stack_align);
+		if (locals_stack_size > MONO_ARCH_MAX_FRAME_SIZE) {
+			char *mname = mono_method_full_name (cfg->method, TRUE);
+			cfg->exception_type = MONO_EXCEPTION_INVALID_PROGRAM;
+			cfg->exception_message = g_strdup_printf ("Method %s stack is too big.", mname);
+			g_free (mname);
+			return;
+		}
+
 		if (locals_stack_align) {
 			offset += (locals_stack_align - 1);
 			offset &= ~(locals_stack_align - 1);
@@ -1457,12 +1912,12 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 			ins->opcode = OP_REGOFFSET;
 			ins->inst_basereg = cfg->frame_reg;
 			/* These arguments are saved to the stack in the prolog */
-			offset = ALIGN_TO (offset, sizeof (gpointer));
+			offset = ALIGN_TO (offset, sizeof(mgreg_t));
 			if (cfg->arch.omit_fp) {
 				ins->inst_offset = offset;
-				offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+				offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (mgreg_t) : sizeof (mgreg_t);
 			} else {
-				offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+				offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (mgreg_t) : sizeof (mgreg_t);
 				ins->inst_offset = - offset;
 			}
 			break;
@@ -1534,13 +1989,16 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 			ins->opcode = OP_REGOFFSET;
 			ins->inst_basereg = cfg->frame_reg;
 			/* These arguments are saved to the stack in the prolog */
-			offset = ALIGN_TO (offset, sizeof (gpointer));
+			offset = ALIGN_TO (offset, sizeof(mgreg_t));
 			if (cfg->arch.omit_fp) {
 				ins->inst_offset = offset;
-				offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+				offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (mgreg_t) : sizeof (mgreg_t);
+				// Arguments are not yet supported by the stack map creation code
+				//cfg->locals_max_stack_offset = MAX (cfg->locals_max_stack_offset, offset);
 			} else {
-				offset += (ainfo->storage == ArgValuetypeInReg) ? 2 * sizeof (gpointer) : sizeof (gpointer);
+				offset += (ainfo->storage == ArgValuetypeInReg) ? 
ainfo->nregs * sizeof (mgreg_t) : sizeof (mgreg_t); ins->inst_offset = - offset; + //cfg->locals_min_stack_offset = MIN (cfg->locals_min_stack_offset, offset); } } } @@ -1558,7 +2016,7 @@ mono_arch_create_vars (MonoCompile *cfg) sig = mono_method_signature (cfg->method); if (!cfg->arch.cinfo) - cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE); + cfg->arch.cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig); cinfo = cfg->arch.cinfo; if (cinfo->ret.storage == ArgValuetypeInReg) @@ -1572,6 +2030,14 @@ mono_arch_create_vars (MonoCompile *cfg) } } + if (cfg->gen_seq_points) { + MonoInst *ins; + + ins = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL); + ins->flags |= MONO_INST_VOLATILE; + cfg->arch.ss_trigger_page_var = ins; + } + #ifdef MONO_AMD64_NO_PUSHES /* * When this is set, we pass arguments on the stack by moves, and by allocating @@ -1594,7 +2060,7 @@ add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int re switch (storage) { case ArgInIReg: MONO_INST_NEW (cfg, ins, OP_MOVE); - ins->dreg = mono_alloc_ireg (cfg); + ins->dreg = mono_alloc_ireg_copy (cfg, tree->dreg); ins->sreg1 = tree->dreg; MONO_ADD_INS (cfg->cbb, ins); mono_call_inst_add_outarg_reg (cfg, call, ins->dreg, reg, FALSE); @@ -1626,7 +2092,11 @@ arg_storage_to_load_membase (ArgStorage storage) { switch (storage) { case ArgInIReg: +#if defined(__mono_ilp32__) + return OP_LOADI8_MEMBASE; +#else return OP_LOAD_MEMBASE; +#endif case ArgInDoubleSSEReg: return OP_LOADR8_MEMBASE; case ArgInFloatSSEReg: @@ -1659,7 +2129,7 @@ emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo) * passed on the stack after the signature. So compensate by * passing a different signature. 
*/ - tmp_sig = mono_metadata_signature_dup (call->signature); + tmp_sig = mono_metadata_signature_dup_full (cfg->method->klass->image, call->signature); tmp_sig->param_count -= call->signature->sentinelpos; tmp_sig->sentinelpos = 0; memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*)); @@ -1701,10 +2171,11 @@ mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig) ArgInfo *ainfo; int j; LLVMCallInfo *linfo; + MonoType *t; n = sig->param_count + sig->hasthis; - cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke); + cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig); linfo = mono_mempool_alloc0 (cfg->mempool, sizeof (LLVMCallInfo) + (sizeof (LLVMArgInfo) * n)); @@ -1729,11 +2200,17 @@ mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig) if (MONO_TYPE_ISSTRUCT (sig->ret) && cinfo->ret.storage == ArgInIReg) { /* Vtype returned using a hidden argument */ linfo->ret.storage = LLVMArgVtypeRetAddr; + linfo->vret_arg_index = cinfo->vret_arg_index; } for (i = 0; i < n; ++i) { ainfo = cinfo->args + i; + if (i >= sig->hasthis) + t = sig->params [i - sig->hasthis]; + else + t = &mono_defaults.int_class->byval_arg; + linfo->args [i].storage = LLVMArgNone; switch (ainfo->storage) { @@ -1745,16 +2222,15 @@ mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig) linfo->args [i].storage = LLVMArgInFPReg; break; case ArgOnStack: - if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) { + if (MONO_TYPE_ISSTRUCT (t)) { linfo->args [i].storage = LLVMArgVtypeByVal; } else { linfo->args [i].storage = LLVMArgInIReg; - if (!sig->params [i - sig->hasthis]->byref) { - if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4) { + if (!t->byref) { + if (t->type == MONO_TYPE_R4) linfo->args [i].storage = LLVMArgInFPReg; - } else if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8) { + else if (t->type == MONO_TYPE_R8) linfo->args [i].storage = LLVMArgInFPReg; - } } } break; @@ -1794,7 +2270,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call) sig = call->signature; n = sig->param_count + sig->hasthis; - cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke); + cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig); if (COMPILE_LLVM (cfg)) { /* We shouldn't be called in the llvm case */ @@ -1834,6 +2310,11 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call) } else { MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg); } + if (cfg->compute_gc_maps) { + MonoInst *def; + + EMIT_NEW_GC_PARAM_SLOT_LIVENESS_DEF (cfg, def, ainfo->offset, t); + } } } } @@ -1894,6 +2375,13 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call) } g_assert (in->klass); + if (ainfo->storage == ArgOnStack && size >= 10000) { + /* Avoid asserts in emit_memcpy () */ + cfg->exception_type = MONO_EXCEPTION_INVALID_PROGRAM; + cfg->exception_message = g_strdup_printf ("Passing an argument of size '%d'.", size); + /* Continue normally */ + } + if (size > 0) { MONO_INST_NEW (cfg, arg, OP_OUTARG_VT); arg->sreg1 = in->dreg; @@ -1985,7 +2473,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call) } } -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 if (call->inst.opcode != OP_JMP && OP_TAILCALL != call->inst.opcode) { MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 0x20); } @@ -2017,7 +2505,7 @@ mono_arch_emit_outarg_vt 
(MonoCompile *cfg, MonoInst *ins, MonoInst *src) MONO_INST_NEW (cfg, load, arg_storage_to_load_membase (ainfo->pair_storage [part])); load->inst_basereg = src->dreg; - load->inst_offset = part * sizeof (gpointer); + load->inst_offset = part * sizeof(mgreg_t); switch (ainfo->pair_storage [part]) { case ArgInIReg: @@ -2094,6 +2582,11 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src) MONO_ADD_INS (cfg->cbb, arg); } } + + if (cfg->compute_gc_maps) { + MonoInst *def; + EMIT_NEW_GC_PARAM_SLOT_LIVENESS_DEF (cfg, def, ainfo->offset, &ins->klass->byval_arg); + } } } @@ -2102,22 +2595,22 @@ mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val) { MonoType *ret = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret); - if (!ret->byref) { - if (ret->type == MONO_TYPE_R4) { - if (COMPILE_LLVM (cfg)) - MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg); - else - MONO_EMIT_NEW_UNALU (cfg, OP_AMD64_SET_XMMREG_R4, cfg->ret->dreg, val->dreg); - return; - } else if (ret->type == MONO_TYPE_R8) { + if (ret->type == MONO_TYPE_R4) { + if (COMPILE_LLVM (cfg)) MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg); - return; - } + else + MONO_EMIT_NEW_UNALU (cfg, OP_AMD64_SET_XMMREG_R4, cfg->ret->dreg, val->dreg); + return; + } else if (ret->type == MONO_TYPE_R8) { + MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg); + return; } MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, cfg->ret->dreg, val->dreg); } +#endif /* DISABLE_JIT */ + #define EMIT_COND_BRANCH(ins,cond,sign) \ if (ins->inst_true_bb->native_offset) { \ x86_branch (code, cond, cfg->native_code + ins->inst_true_bb->native_offset, sign); \ @@ -2146,7 +2639,7 @@ dyn_call_supported (MonoMethodSignature *sig, CallInfo *cinfo) { int i; -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 return FALSE; #endif @@ -2200,7 +2693,7 @@ mono_arch_dyn_call_prepare (MonoMethodSignature *sig) ArchDynCallInfo *info; CallInfo *cinfo; - cinfo = get_call_info (NULL, NULL, sig, FALSE); + cinfo = get_call_info (NULL, NULL, sig); if (!dyn_call_supported (sig, cinfo)) { g_free (cinfo); @@ -2229,6 +2722,15 @@ mono_arch_dyn_call_free (MonoDynCallInfo *info) g_free (ainfo); } +#if !defined(__native_client__) +#define PTR_TO_GREG(ptr) (mgreg_t)(ptr) +#define GREG_TO_PTR(greg) (gpointer)(greg) +#else +/* Correctly handle casts to/from 32-bit pointers without compiler warnings */ +#define PTR_TO_GREG(ptr) (mgreg_t)(uintptr_t)(ptr) +#define GREG_TO_PTR(greg) (gpointer)(guint32)(greg) +#endif + /* * mono_arch_get_start_dyn_call: * @@ -2248,7 +2750,7 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g { ArchDynCallInfo *dinfo = (ArchDynCallInfo*)info; DynCallArgs *p = (DynCallArgs*)buf; - int arg_index, greg, i; + int arg_index, greg, i, pindex; MonoMethodSignature *sig = dinfo->sig; g_assert (buf_len >= sizeof (DynCallArgs)); @@ -2258,20 +2760,23 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g arg_index = 0; greg = 0; + pindex = 0; - if (dinfo->cinfo->vtype_retaddr) - p->regs [greg ++] = (mgreg_t)ret; - - if (sig->hasthis) { - p->regs [greg ++] = (mgreg_t)*(args [arg_index ++]); + if (sig->hasthis || dinfo->cinfo->vret_arg_index == 1) { + p->regs [greg ++] = PTR_TO_GREG(*(args [arg_index ++])); + if (!sig->hasthis) + pindex = 1; } - for (i = 0; i < sig->param_count; i++) { + if (dinfo->cinfo->vtype_retaddr) + p->regs [greg ++] = PTR_TO_GREG(ret); + + for (i = pindex; i < sig->param_count; i++) { MonoType *t = mono_type_get_underlying_type 
(sig->params [i]); gpointer *arg = args [arg_index ++]; if (t->byref) { - p->regs [greg ++] = (mgreg_t)*(arg); + p->regs [greg ++] = PTR_TO_GREG(*(arg)); continue; } @@ -2284,11 +2789,20 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g case MONO_TYPE_PTR: case MONO_TYPE_I: case MONO_TYPE_U: +#if !defined(__mono_ilp32__) + case MONO_TYPE_I8: + case MONO_TYPE_U8: +#endif + g_assert (dinfo->cinfo->args [i + sig->hasthis].reg == param_regs [greg]); + p->regs [greg ++] = PTR_TO_GREG(*(arg)); + break; +#if defined(__mono_ilp32__) case MONO_TYPE_I8: case MONO_TYPE_U8: g_assert (dinfo->cinfo->args [i + sig->hasthis].reg == param_regs [greg]); - p->regs [greg ++] = (mgreg_t)*(arg); + p->regs [greg ++] = *(guint64*)(arg); break; +#endif case MONO_TYPE_BOOLEAN: case MONO_TYPE_U1: p->regs [greg ++] = *(guint8*)(arg); @@ -2311,7 +2825,7 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g break; case MONO_TYPE_GENERICINST: if (MONO_TYPE_IS_REFERENCE (t)) { - p->regs [greg ++] = (mgreg_t)*(arg); + p->regs [greg ++] = PTR_TO_GREG(*(arg)); break; } else { /* Fall through */ @@ -2367,7 +2881,7 @@ mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf) case MONO_TYPE_I: case MONO_TYPE_U: case MONO_TYPE_PTR: - *(gpointer*)ret = (gpointer)res; + *(gpointer*)ret = GREG_TO_PTR(res); break; case MONO_TYPE_I1: *(gint8*)ret = res; @@ -2397,7 +2911,7 @@ mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf) break; case MONO_TYPE_GENERICINST: if (MONO_TYPE_IS_REFERENCE (sig->ret)) { - *(gpointer*)ret = (gpointer)res; + *(gpointer*)ret = GREG_TO_PTR(res); break; } else { /* Fall through */ @@ -2531,23 +3045,29 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe /* These methods are allocated using malloc */ near_call = FALSE; +#ifdef MONO_ARCH_NOMAP32BIT + near_call = FALSE; +#endif + + /* The 64bit XEN kernel does not honour the MAP_32BIT flag. (#522894) */ + if (optimize_for_xen) + near_call = FALSE; + if (cfg->compile_aot) { near_call = TRUE; no_patch = TRUE; } -#ifdef MONO_ARCH_NOMAP32BIT - near_call = FALSE; -#endif - if (near_call) { /* * Align the call displacement to an address divisible by 4 so it does * not span cache lines. This is required for code patching to work on SMP * systems. 
*/ - if (!no_patch && ((guint32)(code + 1 - cfg->native_code) % 4) != 0) - amd64_padding (code, 4 - ((guint32)(code + 1 - cfg->native_code) % 4)); + if (!no_patch && ((guint32)(code + 1 - cfg->native_code) % 4) != 0) { + guint32 pad_size = 4 - ((guint32)(code + 1 - cfg->native_code) % 4); + amd64_padding (code, pad_size); + } mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data); amd64_call_code (code, 0); } @@ -2564,12 +3084,12 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe static inline guint8* emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data, gboolean win64_adjust_stack) { -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 if (win64_adjust_stack) amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 32); #endif code = emit_call_body (cfg, code, patch_type, data); -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 if (win64_adjust_stack) amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 32); #endif @@ -2592,6 +3112,8 @@ store_membase_imm_to_store_membase_reg (int opcode) return -1; } +#ifndef DISABLE_JIT + #define INST_IGNORES_CFLAGS(opcode) (!(((opcode) == OP_ADC) || ((opcode) == OP_ADC_IMM) || ((opcode) == OP_IADC) || ((opcode) == OP_IADC_IMM) || ((opcode) == OP_SBB) || ((opcode) == OP_SBB_IMM) || ((opcode) == OP_ISBB) || ((opcode) == OP_ISBB_IMM))) /* @@ -2731,7 +3253,7 @@ mono_arch_peephole_pass_2 (MonoCompile *cfg, MonoBasicBlock *bb) if (((ins2->opcode == OP_STORE_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_IMM) || (ins2->opcode == OP_STOREI8_MEMBASE_IMM) || (ins2->opcode == OP_STORE_MEMBASE_IMM)) && (ins2->inst_imm == 0)) { ins2->opcode = store_membase_imm_to_store_membase_reg (ins2->opcode); ins2->sreg1 = ins->dreg; - } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_REG) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG)) { + } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM) || (ins2->opcode == OP_STOREI4_MEMBASE_REG) || (ins2->opcode == OP_STOREI8_MEMBASE_REG) || (ins2->opcode == OP_STORE_MEMBASE_REG) || (ins2->opcode == OP_LIVERANGE_START) || (ins2->opcode == OP_GC_LIVENESS_DEF) || (ins2->opcode == OP_GC_LIVENESS_USE)) { /* Continue */ } else if (((ins2->opcode == OP_ICONST) || (ins2->opcode == OP_I8CONST)) && (ins2->dreg == ins->dreg) && (ins2->inst_c0 == 0)) { NULLIFY_INS (ins2); @@ -2802,8 +3324,13 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb) ins->sreg2 = temp->dreg; } break; +#ifndef __mono_ilp32__ case OP_LOAD_MEMBASE: +#endif case OP_LOADI8_MEMBASE: +#ifndef __native_client_codegen__ + /* Don't generate memindex opcodes (to simplify */ + /* read sandboxing) */ if (!amd64_is_imm32 (ins->inst_offset)) { NEW_INS (cfg, ins, temp, OP_I8CONST); temp->inst_c0 = ins->inst_offset; @@ -2811,8 +3338,11 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb) ins->opcode = OP_AMD64_LOADI8_MEMINDEX; ins->inst_indexreg = temp->dreg; } +#endif break; +#ifndef __mono_ilp32__ case OP_STORE_MEMBASE_IMM: +#endif case OP_STOREI8_MEMBASE_IMM: if (!amd64_is_imm32 (ins->inst_imm)) { NEW_INS (cfg, ins, temp, OP_I8CONST); @@ -2894,7 +3424,7 @@ mono_emit_stack_alloc (MonoCompile *cfg, guchar *code, MonoInst* tree) int sreg = tree->sreg1; int need_touch = FALSE; -#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK) +#if defined(HOST_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK) if (!tree->flags & MONO_INST_INIT) need_touch = 
TRUE; #endif @@ -2964,8 +3494,20 @@ mono_emit_stack_alloc (MonoCompile *cfg, guchar *code, MonoInst* tree) if (cfg->param_area && cfg->arch.no_pushes) amd64_alu_reg_imm (code, X86_ADD, AMD64_RDI, cfg->param_area); amd64_cld (code); +#if defined(__default_codegen__) amd64_prefix (code, X86_REP_PREFIX); amd64_stosl (code); +#elif defined(__native_client_codegen__) + /* NaCl stos pseudo-instruction */ + amd64_codegen_pre(code); + /* First, clear the upper 32 bits of RDI (mov %edi, %edi) */ + amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RDI, 4); + /* Add %r15 to %rdi using lea, condition flags unaffected. */ + amd64_lea_memindex_size (code, AMD64_RDI, AMD64_R15, 0, AMD64_RDI, 0, 8); + amd64_prefix (code, X86_REP_PREFIX); + amd64_stosl (code); + amd64_codegen_post(code); +#endif /* __native_client_codegen__ */ if (tree->dreg != AMD64_RDI && sreg != AMD64_RDI) amd64_pop_reg (code, AMD64_RDI); @@ -3011,18 +3553,18 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code) case OP_VCALL2: case OP_VCALL2_REG: case OP_VCALL2_MEMBASE: - cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, ((MonoCallInst*)ins)->signature, FALSE); + cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, ((MonoCallInst*)ins)->signature); if (cinfo->ret.storage == ArgValuetypeInReg) { MonoInst *loc = cfg->arch.vret_addr_loc; /* Load the destination address */ g_assert (loc->opcode == OP_REGOFFSET); - amd64_mov_reg_membase (code, AMD64_RCX, loc->inst_basereg, loc->inst_offset, 8); + amd64_mov_reg_membase (code, AMD64_RCX, loc->inst_basereg, loc->inst_offset, sizeof(gpointer)); for (quad = 0; quad < 2; quad ++) { switch (cinfo->ret.pair_storage [quad]) { case ArgInIReg: - amd64_mov_membase_reg (code, AMD64_RCX, (quad * 8), cinfo->ret.pair_regs [quad], 8); + amd64_mov_membase_reg (code, AMD64_RCX, (quad * sizeof(mgreg_t)), cinfo->ret.pair_regs [quad], sizeof(mgreg_t)); break; case ArgInFloatSSEReg: amd64_movss_membase_reg (code, AMD64_RCX, (quad * 8), cinfo->ret.pair_regs [quad]); @@ -3043,6 +3585,8 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code) return code; } +#endif /* DISABLE_JIT */ + /* * mono_amd64_emit_tls_get: * @code: buffer to store code to @@ -3058,7 +3602,7 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code) guint8* mono_amd64_emit_tls_get (guint8* code, int dreg, int tls_offset) { -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 g_assert (tls_offset < 64); x86_prefix (code, X86_GS_PREFIX); amd64_mov_reg_mem (code, dreg, (tls_offset * 8) + 0x1480, 8); @@ -3096,6 +3640,15 @@ amd64_pop_reg (code, AMD64_RAX); #ifndef DISABLE_JIT +#if defined(__native_client__) || defined(__native_client_codegen__) +void mono_nacl_gc() +{ +#ifdef __native_client_gc__ + __nacl_suspend_thread_if_needed(); +#endif +} +#endif + void mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) { @@ -3129,6 +3682,21 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } } +#if defined(__native_client_codegen__) + /* For Native Client, all indirect call/jump targets must be */ + /* 32-byte aligned. Exception handler blocks are jumped to */ + /* indirectly as well. 
*/ + gboolean bb_needs_alignment = (bb->flags & BB_INDIRECT_JUMP_TARGET) || + (bb->flags & BB_EXCEPTION_HANDLER); + + if ( bb_needs_alignment && ((cfg->code_len & kNaClAlignmentMask) != 0)) { + int pad = kNaClAlignment - (cfg->code_len & kNaClAlignmentMask); + if (pad != kNaClAlignment) code = mono_arch_nacl_pad(code, pad); + cfg->code_len += pad; + bb->native_offset = cfg->code_len; + } +#endif /*__native_client_codegen__*/ + if (cfg->verbose_level > 2) g_print ("Basic block %d starting at offset 0x%x\n", bb->block_num, bb->native_offset); @@ -3154,9 +3722,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN]; - if (G_UNLIKELY (offset > (cfg->code_size - max_len - 16))) { +#define EXTRA_CODE_SPACE (NACL_SIZE (16, 16 + kNaClAlignment)) + + if (G_UNLIKELY (offset > (cfg->code_size - max_len - EXTRA_CODE_SPACE))) { cfg->code_size *= 2; - cfg->native_code = g_realloc (cfg->native_code, cfg->code_size); + cfg->native_code = mono_realloc_native_code(cfg); code = cfg->native_code + offset; mono_jit_stats.code_reallocs++; } @@ -3189,7 +3759,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_STOREI2_MEMBASE_REG: amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 2); break; + /* In AMD64 NaCl, pointers are 4 bytes, */ + /* so STORE_* != STOREI8_*. Likewise below. */ case OP_STORE_MEMBASE_REG: + amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, sizeof(gpointer)); + break; case OP_STOREI8_MEMBASE_REG: amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 8); break; @@ -3197,15 +3771,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 4); break; case OP_STORE_MEMBASE_IMM: +#ifndef __native_client_codegen__ + /* In NaCl, this could be a PCONST type, which could */ + /* mean a pointer type was copied directly into the */ + /* lower 32-bits of inst_imm, so for InvalidPtr==-1 */ + /* the value would be 0x00000000FFFFFFFF which is */ + /* not proper for an imm32 unless you cast it. 
*/ + g_assert (amd64_is_imm32 (ins->inst_imm)); +#endif + amd64_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, (gint32)ins->inst_imm, sizeof(gpointer)); + break; case OP_STOREI8_MEMBASE_IMM: g_assert (amd64_is_imm32 (ins->inst_imm)); amd64_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, ins->inst_imm, 8); break; case OP_LOAD_MEM: +#ifdef __mono_ilp32__ + /* In ILP32, pointers are 4 bytes, so separate these */ + /* cases, use literal 8 below where we really want 8 */ + amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm); + amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, sizeof(gpointer)); + break; +#endif case OP_LOADI8_MEM: // FIXME: Decompose this earlier if (amd64_is_imm32 (ins->inst_imm)) - amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, sizeof (gpointer)); + amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, 8); else { amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm); amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 8); @@ -3229,13 +3820,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_widen_membase (code, ins->dreg, ins->dreg, 0, FALSE, FALSE); break; case OP_LOADU2_MEM: + /* For NaCl, pointers are 4 bytes, so separate these */ + /* cases, use literal 8 below where we really want 8 */ amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm); amd64_widen_membase (code, ins->dreg, ins->dreg, 0, FALSE, TRUE); break; case OP_LOAD_MEMBASE: + g_assert (amd64_is_imm32 (ins->inst_offset)); + amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, sizeof(gpointer)); + break; case OP_LOADI8_MEMBASE: + /* Use literal 8 instead of sizeof pointer or */ + /* register, we really want 8 for this opcode */ g_assert (amd64_is_imm32 (ins->inst_offset)); - amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, sizeof (gpointer)); + amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, 8); break; case OP_LOADI4_MEMBASE: amd64_movsxd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset); @@ -3463,6 +4061,44 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_NOT_REACHED: case OP_NOT_NULL: break; + case OP_SEQ_POINT: { + int i; + + if (cfg->compile_aot) + NOT_IMPLEMENTED; + + /* + * Read from the single stepping trigger page. This will cause a + * SIGSEGV when single stepping is enabled. + * We do this _before_ the breakpoint, so single stepping after + * a breakpoint is hit will step to the next IL offset. + */ + if (ins->flags & MONO_INST_SINGLE_STEP_LOC) { + if (((guint64)ss_trigger_page >> 32) == 0) + amd64_mov_reg_mem (code, AMD64_R11, (guint64)ss_trigger_page, 4); + else { + MonoInst *var = cfg->arch.ss_trigger_page_var; + + amd64_mov_reg_membase (code, AMD64_R11, var->inst_basereg, var->inst_offset, 8); + amd64_alu_membase_imm_size (code, X86_CMP, AMD64_R11, 0, 0, 4); + } + } + + /* + * This is the address which is saved in seq points, + * get_ip_for_single_step () / get_ip_for_breakpoint () needs to compute this + * from the address of the instruction causing the fault. + */ + mono_add_seq_point (cfg, bb, ins, code - cfg->native_code); + + /* + * A placeholder for a possible breakpoint inserted by + * mono_arch_set_breakpoint (). 
+ */ + for (i = 0; i < breakpoint_size; ++i) + x86_nop (code); + break; + } case OP_ADDCC: case OP_LADD: amd64_alu_reg_reg (code, X86_ADD, ins->sreg1, ins->sreg2); @@ -3885,14 +4521,14 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; case OP_AOTCONST: mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0); - amd64_mov_reg_membase (code, ins->dreg, AMD64_RIP, 0, 8); + amd64_mov_reg_membase (code, ins->dreg, AMD64_RIP, 0, sizeof(gpointer)); break; case OP_JUMP_TABLE: mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0); amd64_mov_reg_imm_size (code, ins->dreg, 0, 8); break; case OP_MOVE: - amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof (gpointer)); + amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof(mgreg_t)); break; case OP_AMD64_SET_XMMREG_R4: { amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1); @@ -3904,10 +4540,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; } case OP_TAILCALL: { - /* - * Note: this 'frame destruction' logic is useful for tail calls, too. - * Keep in sync with the code in emit_epilog. - */ + MonoCallInst *call = (MonoCallInst*)ins; int pos = 0, i; /* FIXME: no tracing support... */ @@ -3925,20 +4558,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) save_offset += 8; } amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, cfg->arch.stack_alloc_size); + + // FIXME: + if (call->stack_usage) + NOT_IMPLEMENTED; } else { for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) - pos -= sizeof (gpointer); - - if (pos) - amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos); + pos -= sizeof(mgreg_t); - /* Pop registers in reverse order */ - for (i = AMD64_NREG - 1; i > 0; --i) + /* Restore callee-saved registers */ + for (i = AMD64_NREG - 1; i > 0; --i) { if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { - amd64_pop_reg (code, i); + amd64_mov_reg_membase (code, i, AMD64_RBP, pos, sizeof(mgreg_t)); + pos += sizeof(mgreg_t); } + } + + /* Copy arguments on the stack to our argument area */ + for (i = 0; i < call->stack_usage; i += sizeof(mgreg_t)) { + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, i, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, 16 + i, AMD64_RAX, sizeof(mgreg_t)); + } + + if (pos) + amd64_lea_membase (code, AMD64_RSP, AMD64_RBP, pos); amd64_leave (code); } @@ -3950,6 +4595,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) else amd64_set_reg_template (code, AMD64_R11); amd64_jump_reg (code, AMD64_R11); + ins->flags |= MONO_INST_GC_CALLSITE; + ins->backend.pc_offset = code - cfg->native_code; break; } case OP_CHECK_THIS: @@ -3958,7 +4605,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; case OP_ARGLIST: { amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, cfg->sig_cookie); - amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, 8); + amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, sizeof(gpointer)); break; } case OP_CALL: @@ -3994,6 +4641,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) code = emit_call (cfg, code, MONO_PATCH_INFO_METHOD, call->method, FALSE); else code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, call->fptr, FALSE); + ins->flags |= MONO_INST_GC_CALLSITE; + ins->backend.pc_offset = code - cfg->native_code; if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes) amd64_alu_reg_imm (code, 
X86_ADD, AMD64_RSP, call->stack_usage); code = emit_move_return_value (cfg, ins, code); @@ -4042,6 +4691,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } amd64_call_reg (code, ins->sreg1); + ins->flags |= MONO_INST_GC_CALLSITE; + ins->backend.pc_offset = code - cfg->native_code; if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes) amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage); code = emit_move_return_value (cfg, ins, code); @@ -4054,23 +4705,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_CALL_MEMBASE: call = (MonoCallInst*)ins; - if (AMD64_IS_ARGUMENT_REG (ins->sreg1)) { - /* - * Can't use R11 because it is clobbered by the trampoline - * code, and the reg value is needed by get_vcall_slot_addr. - */ - amd64_mov_reg_reg (code, AMD64_RAX, ins->sreg1, 8); - ins->sreg1 = AMD64_RAX; - } - - /* - * Emit a few nops to simplify get_vcall_slot (). - */ - amd64_nop (code); - amd64_nop (code); - amd64_nop (code); - amd64_call_membase (code, ins->sreg1, ins->inst_offset); + ins->flags |= MONO_INST_GC_CALLSITE; + ins->backend.pc_offset = code - cfg->native_code; if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes) amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage); code = emit_move_return_value (cfg, ins, code); @@ -4091,11 +4728,14 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) /* Set argument registers */ for (i = 0; i < PARAM_REGS; ++i) - amd64_mov_reg_membase (code, param_regs [i], AMD64_R11, i * sizeof (gpointer), 8); + amd64_mov_reg_membase (code, param_regs [i], AMD64_R11, i * sizeof(mgreg_t), sizeof(mgreg_t)); /* Make the call */ amd64_call_reg (code, AMD64_R10); + ins->flags |= MONO_INST_GC_CALLSITE; + ins->backend.pc_offset = code - cfg->native_code; + /* Save result */ amd64_mov_reg_membase (code, AMD64_R11, var->inst_basereg, var->inst_offset, 8); amd64_mov_membase_reg (code, AMD64_R11, G_STRUCT_OFFSET (DynCallArgs, res), AMD64_RAX, 8); @@ -4191,12 +4831,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_mov_reg_reg (code, AMD64_ARG_REG1, ins->sreg1, 8); code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_arch_throw_exception", FALSE); + ins->flags |= MONO_INST_GC_CALLSITE; + ins->backend.pc_offset = code - cfg->native_code; break; } case OP_RETHROW: { amd64_mov_reg_reg (code, AMD64_ARG_REG1, ins->sreg1, 8); code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_arch_rethrow_exception", FALSE); + ins->flags |= MONO_INST_GC_CALLSITE; + ins->backend.pc_offset = code - cfg->native_code; break; } case OP_CALL_HANDLER: @@ -4204,12 +4848,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 8); mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_target_bb); amd64_call_imm (code, 0); + mono_cfg_add_try_hole (cfg, ins->inst_eh_block, code, bb); /* Restore stack alignment */ amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8); break; case OP_START_HANDLER: { + /* Even though we're saving RSP, use sizeof */ + /* gpointer because spvar is of type IntPtr */ + /* see: mono_create_spvar_for_region */ MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region); - amd64_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, AMD64_RSP, 8); + amd64_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, 
AMD64_RSP, sizeof(gpointer)); if ((MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FINALLY) || MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FILTER)) && @@ -4220,13 +4868,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } case OP_ENDFINALLY: { MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region); - amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, 8); + amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, sizeof(gpointer)); amd64_ret (code); break; } case OP_ENDFILTER: { MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region); - amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, 8); + amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, sizeof(gpointer)); /* The local allocator will put the result into RAX */ amd64_ret (code); break; @@ -4714,7 +5362,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; } case OP_MEMORY_BARRIER: { - /* Not needed on amd64 */ + /* http://blogs.sun.com/dave/resource/NHM-Pipeline-Blog-V2.txt */ + x86_prefix (code, X86_LOCK_PREFIX); + amd64_alu_membase_imm (code, X86_ADD, AMD64_RSP, 0, 0); break; } case OP_ATOMIC_ADD_I4: @@ -4839,6 +5489,62 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_mov_reg_reg (code, ins->dreg, AMD64_RAX, size); break; } + case OP_CARD_TABLE_WBARRIER: { + int ptr = ins->sreg1; + int value = ins->sreg2; + guchar *br; + int nursery_shift, card_table_shift; + gpointer card_table_mask; + size_t nursery_size; + + gpointer card_table = mono_gc_get_card_table (&card_table_shift, &card_table_mask); + guint64 nursery_start = (guint64)mono_gc_get_nursery (&nursery_shift, &nursery_size); + + /* If either points to the stack we can simply avoid the WB. This happens due to + * optimizations revealing a stack store that was not visible when op_cardtable was emitted. + */ + if (ins->sreg1 == AMD64_RSP || ins->sreg2 == AMD64_RSP) + continue; + + /* + * We need one register we can clobber, so we choose RDX and make sreg1 + * fixed to RAX to work around limitations in the local register allocator. + * sreg2 might get allocated to RDX, but that is not a problem since + * we use it before clobbering RDX. + */ + g_assert (ins->sreg1 == AMD64_RAX); + + /* + * This is the code we produce: + * + * rdx = value + * rdx >>= nursery_shift + * cmp rdx, (nursery_start >> nursery_shift) + * jne done + * rdx = ptr + * rdx >>= card_table_shift + * rdx += card_table + * [rdx] = 1 + * done: + */ + + if (value != AMD64_RDX) + amd64_mov_reg_reg (code, AMD64_RDX, value, 8); + amd64_shift_reg_imm (code, X86_SHR, AMD64_RDX, nursery_shift); + amd64_alu_reg_imm (code, X86_CMP, AMD64_RDX, nursery_start >> nursery_shift); + br = code; x86_branch8 (code, X86_CC_NE, -1, FALSE); + amd64_mov_reg_reg (code, AMD64_RDX, ptr, 8); + amd64_shift_reg_imm (code, X86_SHR, AMD64_RDX, card_table_shift); + if (card_table_mask) + amd64_alu_reg_imm (code, X86_AND, AMD64_RDX, (guint32)(guint64)card_table_mask); + + mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_GC_CARD_TABLE_ADDR, card_table); + amd64_alu_reg_membase (code, X86_ADD, AMD64_RDX, AMD64_RIP, 0); + + amd64_mov_membase_imm (code, AMD64_RDX, 0, 1, 1); + x86_patch (br, code); + break; + } #ifdef MONO_ARCH_SIMD_INTRINSICS /* TODO: Some of these IR opcodes are marked as no-clobber when they indeed do clobber. 
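 *
 * (Aside on OP_CARD_TABLE_WBARRIER above: a rough C equivalent of the
 * emitted barrier, assuming mword is a pointer-sized unsigned integer;
 * this is an illustration of the logic, not runtime code:
 *
 *   if (((mword)value >> nursery_shift) == (nursery_start >> nursery_shift)) {
 *       mword card = (mword)ptr >> card_table_shift;
 *       if (card_table_mask)
 *           card &= (mword)card_table_mask;
 *       ((guint8 *)card_table) [card] = 1;  // mark the card dirty
 *   }
 *
 * i.e. the store is recorded only when the stored value points into the
 * nursery, which is exactly the path guarded by the jne above.)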
*/ case OP_ADDPS: @@ -5424,15 +6130,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) MONO_VARINFO (cfg, ins->inst_c0)->live_range_end = code - cfg->native_code; break; } + case OP_NACL_GC_SAFE_POINT: { +#if defined(__native_client_codegen__) + code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)mono_nacl_gc, TRUE); +#endif + break; + } + case OP_GC_LIVENESS_DEF: + case OP_GC_LIVENESS_USE: + case OP_GC_PARAM_SLOT_LIVENESS_DEF: + ins->backend.pc_offset = code - cfg->native_code; + break; + case OP_GC_SPILL_SLOT_LIVENESS_DEF: + ins->backend.pc_offset = code - cfg->native_code; + bb->spill_slot_defs = g_slist_prepend_mempool (cfg->mempool, bb->spill_slot_defs, ins); + break; default: g_warning ("unknown opcode %s in %s()\n", mono_inst_name (ins->opcode), __FUNCTION__); g_assert_not_reached (); } if ((code - cfg->native_code - offset) > max_len) { +#if !defined(__native_client_codegen__) g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %ld)", mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset); g_assert_not_reached (); +#endif } last_ins = ins; @@ -5515,6 +6238,8 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono } } +#ifndef DISABLE_JIT + static int get_max_epilog_size (MonoCompile *cfg) { @@ -5560,10 +6285,27 @@ mono_arch_emit_prolog (MonoCompile *cfg) gint32 lmf_offset = cfg->arch.lmf_offset; gboolean args_clobbered = FALSE; gboolean trace = FALSE; +#ifdef __native_client_codegen__ + guint alignment_check; +#endif - cfg->code_size = MAX (((MonoMethodNormal *)method)->header->code_size * 4, 10240); + cfg->code_size = MAX (cfg->header->code_size * 4, 10240); +#if defined(__default_codegen__) code = cfg->native_code = g_malloc (cfg->code_size); +#elif defined(__native_client_codegen__) + /* native_code_alloc is not 32-byte aligned, native_code is. */ + cfg->native_code_alloc = g_malloc (cfg->code_size + kNaClAlignment); + + /* Align native_code up to the next kNaClAlignment boundary. 
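+ * A worked example of the round-up, assuming kNaClAlignment is 32 (so
+ * kNaClAlignmentMask is 31): for a raw allocation at 0x1007,
+ *
+ *   (0x1007 + 32) & ~31  ==  0x1027 & ~31  ==  0x1020
+ *
+ * The extra kNaClAlignment bytes requested from g_malloc above are what
+ * make this unconditional bump-then-mask safe. 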
*/ + cfg->native_code = (uintptr_t)cfg->native_code_alloc + kNaClAlignment; + cfg->native_code = (uintptr_t)cfg->native_code & ~kNaClAlignmentMask; + + code = cfg->native_code; + + alignment_check = (guint)cfg->native_code & kNaClAlignmentMask; + g_assert (alignment_check == 0); +#endif if (mono_jit_trace_calls != NULL && mono_trace_eval (method)) trace = TRUE; @@ -5595,6 +6337,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) // IP saved at CFA - 8 mono_emit_unwind_op_offset (cfg, code, AMD64_RIP, -cfa_offset); async_exc_point (code); + mini_gc_set_slot_type_from_cfa (cfg, -cfa_offset, SLOT_NOREF); if (!cfg->arch.omit_fp) { amd64_push_reg (code, AMD64_RBP); @@ -5602,14 +6345,16 @@ mono_arch_emit_prolog (MonoCompile *cfg) mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset); mono_emit_unwind_op_offset (cfg, code, AMD64_RBP, - cfa_offset); async_exc_point (code); -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 mono_arch_unwindinfo_add_push_nonvol (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP); #endif + /* These are handled automatically by the stack marking code */ + mini_gc_set_slot_type_from_cfa (cfg, -cfa_offset, SLOT_NOREF); - amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer)); + amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof(mgreg_t)); mono_emit_unwind_op_def_cfa_reg (cfg, code, AMD64_RBP); async_exc_point (code); -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 mono_arch_unwindinfo_add_set_fpreg (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP); #endif } @@ -5621,10 +6366,13 @@ mono_arch_emit_prolog (MonoCompile *cfg) for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { amd64_push_reg (code, i); - pos += sizeof (gpointer); + pos += 8; /* AMD64 push inst is always 8 bytes, no way to change it */ offset += 8; mono_emit_unwind_op_offset (cfg, code, i, - offset); async_exc_point (code); + + /* These are handled automatically by the stack marking code */ + mini_gc_set_slot_type_from_cfa (cfg, - offset, SLOT_NOREF); } } @@ -5634,21 +6382,28 @@ mono_arch_emit_prolog (MonoCompile *cfg) if (cfg->arch.omit_fp) // FIXME: g_assert_not_reached (); - cfg->stack_offset += ALIGN_TO (cfg->param_area, sizeof (gpointer)); + cfg->stack_offset += ALIGN_TO (cfg->param_area, sizeof(mgreg_t)); } if (cfg->arch.omit_fp) { /* - * On enter, the stack is misaligned by the the pushing of the return + * On enter, the stack is misaligned by the pushing of the return * address. It is either made aligned by the pushing of %rbp, or by * this. 
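 *
 * Concretely: a call instruction leaves RSP = 16n - 8 on entry, so the
 * frame is entered misaligned,
 *
 *   RSP_entry = 16n - 8;  push %rbp  ->  RSP = 16n - 16 (aligned again)
 *
 * In the omit-fp case below there is no such push, so when the
 * 8-byte-aligned alloc_size happens to be a multiple of 16, 8 more bytes
 * are added to restore the 16-byte alignment the ABI requires at call
 * sites.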
*/ alloc_size = ALIGN_TO (cfg->stack_offset, 8); - if ((alloc_size % 16) == 0) + if ((alloc_size % 16) == 0) { alloc_size += 8; + /* Mark the padding slot as NOREF */ + mini_gc_set_slot_type_from_cfa (cfg, -cfa_offset - sizeof (mgreg_t), SLOT_NOREF); + } } else { alloc_size = ALIGN_TO (cfg->stack_offset, MONO_ARCH_FRAME_ALIGNMENT); - + if (cfg->stack_offset != alloc_size) { + /* Mark the padding slot as NOREF */ + mini_gc_set_slot_type_from_fp (cfg, -alloc_size + cfg->param_area, SLOT_NOREF); + } + cfg->arch.sp_fp_offset = alloc_size; alloc_size -= pos; } @@ -5657,8 +6412,19 @@ mono_arch_emit_prolog (MonoCompile *cfg) /* Allocate stack frame */ if (alloc_size) { /* See mono_emit_stack_alloc */ -#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK) +#if defined(HOST_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK) guint32 remaining_size = alloc_size; + /* FIXME: handle unbounded code expansion; we should use a loop in case of more than X iterations */ + guint32 required_code_size = ((remaining_size / 0x1000) + 1) * 10; /* 10 is the max size of amd64_alu_reg_imm + amd64_test_membase_reg */ + guint32 offset = code - cfg->native_code; + if (G_UNLIKELY (required_code_size >= (cfg->code_size - offset))) { + while (required_code_size >= (cfg->code_size - offset)) + cfg->code_size *= 2; + cfg->native_code = mono_realloc_native_code (cfg); + code = cfg->native_code + offset; + mono_jit_stats.code_reallocs++; + } + while (remaining_size >= 0x1000) { amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 0x1000); if (cfg->arch.omit_fp) { @@ -5666,7 +6432,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset); } async_exc_point (code); -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 if (cfg->arch.omit_fp) mono_arch_unwindinfo_add_alloc_stack (&cfg->arch.unwindinfo, cfg->native_code, code, 0x1000); #endif @@ -5681,7 +6447,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset); async_exc_point (code); } -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 if (cfg->arch.omit_fp) mono_arch_unwindinfo_add_alloc_stack (&cfg->arch.unwindinfo, cfg->native_code, code, remaining_size); #endif @@ -5707,12 +6473,49 @@ mono_arch_emit_prolog (MonoCompile *cfg) } #endif +#ifndef TARGET_WIN32 + if (mini_get_debug_options ()->init_stacks) { + /* Fill the stack frame with a dummy value to force deterministic behavior */ + + /* Save registers to the red zone */ + amd64_mov_membase_reg (code, AMD64_RSP, -8, AMD64_RDI, 8); + amd64_mov_membase_reg (code, AMD64_RSP, -16, AMD64_RCX, 8); + + amd64_mov_reg_imm (code, AMD64_RAX, 0x2a2a2a2a2a2a2a2a); + amd64_mov_reg_imm (code, AMD64_RCX, alloc_size / 8); + amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RSP, 8); + + amd64_cld (code); +#if defined(__default_codegen__) + amd64_prefix (code, X86_REP_PREFIX); + amd64_stosl (code); +#elif defined(__native_client_codegen__) + /* NaCl stos pseudo-instruction */ + amd64_codegen_pre (code); + /* First, clear the upper 32 bits of RDI (mov %edi, %edi) */ + amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RDI, 4); + /* Add %r15 to %rdi using lea, condition flags unaffected. 
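+ *
+ * (Background, assuming the standard NaCl x86-64 sandbox model in which
+ * %r15 permanently holds the sandbox base and every effective address is
+ * base + zero-extended 32-bit offset: the two instructions here compute
+ *
+ *   rdi = r15 + (guint32)rdi;
+ *
+ * 'mov %edi, %edi' performs the zero-extension, and lea performs the add
+ * without modifying any flags, which a plain add would do.)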
*/ + amd64_lea_memindex_size (code, AMD64_RDI, AMD64_R15, 0, AMD64_RDI, 0, 8); + amd64_prefix (code, X86_REP_PREFIX); + amd64_stosl (code); + amd64_codegen_post (code); +#endif /* __native_client_codegen__ */ + + amd64_mov_reg_membase (code, AMD64_RDI, AMD64_RSP, -8, 8); + amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RSP, -16, 8); + } +#endif + /* Save LMF */ if (method->save_lmf) { /* * The ip field is not set; the exception handling code will obtain it from the stack location pointed to by the sp field. */ - /* sp is saved right before calls */ + /* + * sp is saved right before calls, but we need to save it here too so + * that async stack walks work. + */ + amd64_mov_membase_reg (code, cfg->frame_reg, cfg->arch.lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8); /* Skip method (only needed for trampoline LMF frames) */ /* Save callee saved regs */ for (i = 0; i < MONO_MAX_IREGS; ++i) { @@ -5724,8 +6527,10 @@ mono_arch_emit_prolog (MonoCompile *cfg) case AMD64_R12: offset = G_STRUCT_OFFSET (MonoLMF, r12); break; case AMD64_R13: offset = G_STRUCT_OFFSET (MonoLMF, r13); break; case AMD64_R14: offset = G_STRUCT_OFFSET (MonoLMF, r14); break; +#ifndef __native_client_codegen__ case AMD64_R15: offset = G_STRUCT_OFFSET (MonoLMF, r15); break; -#ifdef PLATFORM_WIN32 +#endif +#ifdef HOST_WIN32 case AMD64_RDI: offset = G_STRUCT_OFFSET (MonoLMF, rdi); break; case AMD64_RSI: offset = G_STRUCT_OFFSET (MonoLMF, rsi); break; #endif @@ -5740,6 +6545,26 @@ mono_arch_emit_prolog (MonoCompile *cfg) mono_emit_unwind_op_offset (cfg, code, i, - (cfa_offset - (lmf_offset + offset))); } } + + /* These can't contain refs */ + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), SLOT_NOREF); + + /* These are handled automatically by the stack marking code */ + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbp), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), SLOT_NOREF); +#ifdef HOST_WIN32 + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rdi), SLOT_NOREF); + mini_gc_set_slot_type_from_fp (cfg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsi), SLOT_NOREF); +#endif + } /* Save callee saved registers */ @@ -5753,6 +6578,10 @@ mono_arch_emit_prolog (MonoCompile *cfg) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { amd64_mov_membase_reg (code, AMD64_RSP, save_area_offset, i, 8); mono_emit_unwind_op_offset (cfg, code, i, - (cfa_offset - save_area_offset)); + + /* These are handled automatically by the stack marking code */ + mini_gc_set_slot_type_from_cfa (cfg, - (cfa_offset - save_area_offset), SLOT_NOREF); + save_area_offset += 8; async_exc_point (code); } @@ -5763,7 +6592,7 @@ 
mono_arch_emit_prolog (MonoCompile *cfg) g_assert (cfg->rgctx_var->opcode == OP_REGOFFSET && (cfg->rgctx_var->inst_basereg == AMD64_RBP || cfg->rgctx_var->inst_basereg == AMD64_RSP)); - amd64_mov_membase_reg (code, cfg->rgctx_var->inst_basereg, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, 8); + amd64_mov_membase_reg (code, cfg->rgctx_var->inst_basereg, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, sizeof(gpointer)); } /* compute max_length in order to use short forward jumps */ @@ -5778,8 +6607,22 @@ mono_arch_emit_prolog (MonoCompile *cfg) /* max alignment for loops */ if ((cfg->opt & MONO_OPT_LOOP) && bb_is_loop_start (bb)) max_length += LOOP_ALIGNMENT; +#ifdef __native_client_codegen__ + /* max alignment for native client */ + max_length += kNaClAlignment; +#endif MONO_BB_FOR_EACH_INS (bb, ins) { +#ifdef __native_client_codegen__ + { + int space_in_block = kNaClAlignment - + ((max_length + cfg->code_len) & kNaClAlignmentMask); + int max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN]; + if (space_in_block < max_len && max_len < kNaClAlignment) { + max_length += space_in_block; + } + } +#endif /*__native_client_codegen__*/ max_length += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN]; } @@ -5831,13 +6674,13 @@ mono_arch_emit_prolog (MonoCompile *cfg) for (quad = 0; quad < 2; quad ++) { switch (ainfo->pair_storage [quad]) { case ArgInIReg: - amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad], sizeof (gpointer)); + amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad], sizeof(mgreg_t)); break; case ArgInFloatSSEReg: - amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]); + amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]); break; case ArgInDoubleSSEReg: - amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]); + amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]); break; case ArgNone: break; @@ -5883,13 +6726,13 @@ mono_arch_emit_prolog (MonoCompile *cfg) for (quad = 0; quad < 2; quad ++) { switch (ainfo->pair_storage [quad]) { case ArgInIReg: - amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad], sizeof (gpointer)); + amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad], sizeof(mgreg_t)); break; case ArgInFloatSSEReg: - amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]); + amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]); break; case ArgInDoubleSSEReg: - amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]); + amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]); break; case ArgNone: break; @@ -5954,7 +6797,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_jit_thread_attach", TRUE); amd64_patch (buf, code); -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 /* The TLS key actually contains a pointer 
to the MonoJitTlsData structure */ /* FIXME: Add a separate key for LMF to avoid this */ amd64_alu_reg_imm (code, X86_ADD, AMD64_RAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf)); @@ -6000,7 +6843,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) if (lmf_addr_tls_offset != -1) { /* Load lmf quickly using the FS register */ code = mono_amd64_emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset); -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 /* The TLS key actually contains a pointer to the MonoJitTlsData structure */ /* FIXME: Add a separate key for LMF to avoid this */ amd64_alu_reg_imm (code, X86_ADD, AMD64_RAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf)); @@ -6017,13 +6860,13 @@ mono_arch_emit_prolog (MonoCompile *cfg) } /* Save lmf_addr */ - amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8); + amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, sizeof(gpointer)); /* Save previous_lmf */ - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8); - amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, sizeof(gpointer)); + amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, sizeof(gpointer)); /* Set new lmf */ amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset); - amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8); + amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, sizeof(gpointer)); } } @@ -6102,6 +6945,17 @@ mono_arch_emit_prolog (MonoCompile *cfg) } } + /* Initialize ss_trigger_page_var */ + if (cfg->arch.ss_trigger_page_var) { + MonoInst *var = cfg->arch.ss_trigger_page_var; + + g_assert (!cfg->compile_aot); + g_assert (var->opcode == OP_REGOFFSET); + + amd64_mov_reg_imm (code, AMD64_R11, (guint64)ss_trigger_page); + amd64_mov_membase_reg (code, var->inst_basereg, var->inst_offset, AMD64_R11, 8); + } + cfg->code_len = code - cfg->native_code; g_assert (cfg->code_len < cfg->code_size); @@ -6123,7 +6977,7 @@ mono_arch_emit_epilog (MonoCompile *cfg) while (cfg->code_len + max_epilog_size > (cfg->code_size - 16)) { cfg->code_size *= 2; - cfg->native_code = g_realloc (cfg->native_code, cfg->code_size); + cfg->native_code = mono_realloc_native_code (cfg); mono_jit_stats.code_reallocs++; } @@ -6159,14 +7013,14 @@ mono_arch_emit_epilog (MonoCompile *cfg) * through the mono_lmf_addr TLS variable. 
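 *
 * Conceptually the LMF is a per-thread singly linked list of frames, so
 * the prolog/epilog pair behaves like this sketch (illustrative only;
 * lmf_addr is what mono_get_lmf_addr () returns for the current thread):
 *
 *   lmf->previous_lmf = *lmf_addr;  // prolog: push this frame
 *   *lmf_addr = lmf;
 *   ...
 *   *lmf_addr = lmf->previous_lmf;  // epilog: pop it again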
*/ /* reg = previous_lmf */ - amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8); + amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), sizeof(gpointer)); x86_prefix (code, X86_FS_PREFIX); amd64_mov_mem_reg (code, lmf_tls_offset, AMD64_R11, 8); } else { /* Restore previous lmf */ - amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8); - amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8); - amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8); + amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), sizeof(gpointer)); + amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), sizeof(gpointer)); + amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, sizeof(gpointer)); } /* Restore callee saved regs */ @@ -6186,9 +7040,13 @@ mono_arch_emit_epilog (MonoCompile *cfg) amd64_mov_reg_membase (code, AMD64_R14, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8); } if (cfg->used_int_regs & (1 << AMD64_R15)) { +#if defined(__default_codegen__) amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8); +#elif defined(__native_client_codegen__) + g_assert_not_reached(); +#endif } -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 if (cfg->used_int_regs & (1 << AMD64_RDI)) { amd64_mov_reg_membase (code, AMD64_RDI, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rdi), 8); } @@ -6210,10 +7068,10 @@ mono_arch_emit_epilog (MonoCompile *cfg) else { for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) - pos -= sizeof (gpointer); + pos -= sizeof(mgreg_t); if (pos) { - if (pos == - sizeof (gpointer)) { + if (pos == - sizeof(mgreg_t)) { /* Only one register, so avoid lea */ for (i = AMD64_NREG - 1; i > 0; --i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { @@ -6242,13 +7100,13 @@ mono_arch_emit_epilog (MonoCompile *cfg) for (quad = 0; quad < 2; quad ++) { switch (ainfo->pair_storage [quad]) { case ArgInIReg: - amd64_mov_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer)), sizeof (gpointer)); + amd64_mov_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t)), sizeof(mgreg_t)); break; case ArgInFloatSSEReg: - amd64_movss_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer))); + amd64_movss_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t))); break; case ArgInDoubleSSEReg: - amd64_movsd_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer))); + amd64_movsd_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t))); break; case ArgNone: break; @@ -6290,11 +7148,20 @@ mono_arch_emit_exceptions (MonoCompile *cfg) code_size += 8 + 15; /* sizeof (double) + alignment */ if (patch_info->type == MONO_PATCH_INFO_R4) code_size += 4 + 15; /* sizeof (float) + alignment */ + if (patch_info->type == MONO_PATCH_INFO_GC_CARD_TABLE_ADDR) + code_size += 8 + 7; /* sizeof (void*) + alignment */ } +#ifdef __native_client_codegen__ + /* 
Give us extra room on Native Client. This could be */ + /* more carefully calculated, but bundle alignment makes */ + /* it much trickier, so *2 like other places is good. */ + code_size *= 2; +#endif + while (cfg->code_len + code_size > (cfg->code_size - 16)) { cfg->code_size *= 2; - cfg->native_code = g_realloc (cfg->native_code, cfg->code_size); + cfg->native_code = mono_realloc_native_code (cfg); mono_jit_stats.code_reallocs++; } @@ -6334,7 +7201,7 @@ mono_arch_emit_exceptions (MonoCompile *cfg) exc_classes [nthrows] = exc_class; exc_throw_start [nthrows] = code; } - amd64_mov_reg_imm (code, AMD64_ARG_REG1, exc_class->type_token); + amd64_mov_reg_imm (code, AMD64_ARG_REG1, exc_class->type_token - MONO_TOKEN_TYPE_DEF); patch_info->type = MONO_PATCH_INFO_NONE; @@ -6355,35 +7222,104 @@ mono_arch_emit_exceptions (MonoCompile *cfg) /* do nothing */ break; } + g_assert(code < cfg->native_code + cfg->code_size); } /* Handle relocations with RIP relative addressing */ for (patch_info = cfg->patch_info; patch_info; patch_info = patch_info->next) { gboolean remove = FALSE; + guint8 *orig_code = code; switch (patch_info->type) { case MONO_PATCH_INFO_R8: case MONO_PATCH_INFO_R4: { - guint8 *pos; + guint8 *pos, *patch_pos; + guint32 target_pos; /* The SSE opcodes require a 16 byte alignment */ +#if defined(__default_codegen__) code = (guint8*)ALIGN_TO (code, 16); +#elif defined(__native_client_codegen__) + { + /* Pad this out with HLT instructions */ + /* or we can get garbage bytes emitted */ + /* which will fail validation */ + guint8 *aligned_code; + /* extra align to make room for */ + /* mov/push below */ + int extra_align = patch_info->type == MONO_PATCH_INFO_R8 ? 2 : 1; + aligned_code = (guint8*)ALIGN_TO (code + extra_align, 16); + /* The technique of hiding data in an */ + /* instruction has a problem here: we */ + /* need the data aligned to a 16-byte */ + /* boundary but the instruction cannot */ + /* cross the bundle boundary. So only */ + /* odd multiples of 16 can be used. */ + if ((intptr_t)aligned_code % kNaClAlignment == 0) { + aligned_code += 16; + } + while (code < aligned_code) { + *(code++) = 0xf4; /* hlt */ + } + } +#endif pos = cfg->native_code + patch_info->ip.i; - - if (IS_REX (pos [1])) - *(guint32*)(pos + 5) = (guint8*)code - pos - 9; - else - *(guint32*)(pos + 4) = (guint8*)code - pos - 8; + if (IS_REX (pos [1])) { + patch_pos = pos + 5; + target_pos = code - pos - 9; + } + else { + patch_pos = pos + 4; + target_pos = code - pos - 8; + } if (patch_info->type == MONO_PATCH_INFO_R8) { +#ifdef __native_client_codegen__ + /* Hide 64-bit data in a */ + /* "mov imm64, r11" instruction. */ + /* write it before the start of */ + /* the data */ + *(code-2) = 0x49; /* prefix */ + *(code-1) = 0xbb; /* mov X, %r11 */ +#endif *(double*)code = *(double*)patch_info->data.target; code += sizeof (double); } else { +#ifdef __native_client_codegen__ + /* Hide 32-bit data in a */ + /* "push imm32" instruction. 
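+ *
+ * (Displacement math used by both branches below: the emitted SSE load
+ * has the form 'movsd disp32(%rip), %xmmN', and disp32 is relative to
+ * the end of the instruction. With a REX prefix the instruction is 9
+ * bytes starting at 'pos', so the constant stored at 'code' needs
+ *
+ *   disp32 = code - (pos + 9);
+ *
+ * which is the 'code - pos - 9' computed below, and 'code - pos - 8'
+ * for the 8-byte REX-less form.)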
*/ + *(code-1) = 0x68; /* push */ +#endif *(float*)code = *(float*)patch_info->data.target; code += sizeof (float); } + *(guint32*)(patch_pos) = target_pos; + + remove = TRUE; + break; + } + case MONO_PATCH_INFO_GC_CARD_TABLE_ADDR: { + guint8 *pos; + + if (cfg->compile_aot) + continue; + + /*loading is faster against aligned addresses.*/ + code = (guint8*)ALIGN_TO (code, 8); + memset (orig_code, 0, code - orig_code); + + pos = cfg->native_code + patch_info->ip.i; + + /*alu_op [rex] modr/m imm32 - 7 or 8 bytes */ + if (IS_REX (pos [1])) + *(guint32*)(pos + 4) = (guint8*)code - pos - 8; + else + *(guint32*)(pos + 3) = (guint8*)code - pos - 7; + + *(gpointer*)code = (gpointer)patch_info->data.target; + code += sizeof (gpointer); + remove = TRUE; break; } @@ -6402,6 +7338,7 @@ mono_arch_emit_exceptions (MonoCompile *cfg) tmp->next = patch_info->next; } } + g_assert (code < cfg->native_code + cfg->code_size); } cfg->code_len = code - cfg->native_code; @@ -6410,6 +7347,8 @@ mono_arch_emit_exceptions (MonoCompile *cfg) } +#endif /* DISABLE_JIT */ + void* mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments) { @@ -6425,7 +7364,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena /* Allocate a new area on the stack and save arguments there */ sig = mono_method_signature (cfg->method); - cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE); + cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig); n = sig->param_count + sig->hasthis; @@ -6470,9 +7409,9 @@ mono_arch_instrument_epilog_full (MonoCompile *cfg, void *func, void *p, gboolea guchar *code = p; int save_mode = SAVE_NONE; MonoMethod *method = cfg->method; - int rtype = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret)->type; + MonoType *ret_type = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret); - switch (rtype) { + switch (ret_type->type) { case MONO_TYPE_VOID: /* special case string .ctor icall */ if (strcmp (".ctor", method->name) && method->klass == mono_defaults.string_class) @@ -6489,7 +7428,7 @@ mono_arch_instrument_epilog_full (MonoCompile *cfg, void *func, void *p, gboolea save_mode = SAVE_XMM; break; case MONO_TYPE_GENERICINST: - if (!mono_type_generic_inst_is_valuetype (mono_method_signature (method)->ret)) { + if (!mono_type_generic_inst_is_valuetype (ret_type)) { save_mode = SAVE_EAX; break; } @@ -6717,123 +7656,56 @@ mono_breakpoint_clean_code (guint8 *method_start, guint8 *code, int offset, guin return can_write; } -gpointer -mono_arch_get_vcall_slot (guint8 *code, mgreg_t *regs, int *displacement) -{ - guint8 buf [10]; - guint32 reg; - gint32 disp; - guint8 rex = 0; - MonoJitInfo *ji = NULL; - -#ifdef ENABLE_LLVM - /* code - 9 might be before the start of the method */ - /* FIXME: Avoid this expensive call somehow */ - ji = mono_jit_info_table_find (mono_domain_get (), (char*)code); -#endif - - mono_breakpoint_clean_code (ji ? ji->code_start : NULL, code, 9, buf, sizeof (buf)); - code = buf + 9; +#if defined(__native_client_codegen__) +/* For membase calls, we want the base register. 
for Native Client, */ +/* all indirect calls have the following sequence with the given sizes: */ +/* mov %eXX,%eXX [2-3] */ +/* mov disp(%r15,%rXX,scale),%r11d [4-8] */ +/* and $0xffffffffffffffe0,%r11d [4] */ +/* add %r15,%r11 [3] */ +/* callq *%r11 [3] */ - *displacement = 0; - code -= 7; - - /* - * A given byte sequence can match more than case here, so we have to be - * really careful about the ordering of the cases. Longer sequences - * come first. - * There are two types of calls: - * - direct calls: 0xff address_byte 8/32 bits displacement - * - indirect calls: nop nop nop - * The nops make sure we don't confuse the instruction preceeding an indirect - * call with a direct call. - */ - if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) { - /* call OFFSET(%rip) */ - disp = *(guint32*)(code + 3); - return (gpointer*)(code + disp + 7); - } else if ((code [0] == 0xff) && (amd64_modrm_reg (code [1]) == 0x2) && (amd64_modrm_mod (code [1]) == 0x2) && (amd64_sib_index (code [2]) == 4) && (amd64_sib_scale (code [2]) == 0)) { - /* call *[reg+disp32] using indexed addressing */ - /* The LLVM JIT emits this, and we emit it too for %r12 */ - if (IS_REX (code [-1])) { - rex = code [-1]; - g_assert (amd64_rex_x (rex) == 0); - } - reg = amd64_sib_base (code [2]); - disp = *(gint32*)(code + 3); - } else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) { - /* call *[reg+disp32] */ - if (IS_REX (code [0])) - rex = code [0]; - reg = amd64_modrm_rm (code [2]); - disp = *(gint32*)(code + 3); - /* R10 is clobbered by the IMT thunk code */ - g_assert (reg != AMD64_R10); - } else if (code [2] == 0xe8) { - /* call */ - return NULL; - } else if ((code [3] == 0xff) && (amd64_modrm_reg (code [4]) == 0x2) && (amd64_modrm_mod (code [4]) == 0x1) && (amd64_sib_index (code [5]) == 4) && (amd64_sib_scale (code [5]) == 0)) { - /* call *[r12+disp8] using indexed addressing */ - if (IS_REX (code [2])) - rex = code [2]; - reg = amd64_sib_base (code [5]); - disp = *(gint8*)(code + 6); - } else if (IS_REX (code [4]) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x3)) { - /* call *%reg */ - return NULL; - } else if ((code [4] == 0xff) && (amd64_modrm_reg (code [5]) == 0x2) && (amd64_modrm_mod (code [5]) == 0x1)) { - /* call *[reg+disp8] */ - if (IS_REX (code [3])) - rex = code [3]; - reg = amd64_modrm_rm (code [5]); - disp = *(gint8*)(code + 6); - //printf ("B: [%%r%d+0x%x]\n", reg, disp); - } - else if ((code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) { - /* call *%reg */ - if (IS_REX (code [4])) - rex = code [4]; - reg = amd64_modrm_rm (code [6]); - disp = 0; - } - else - g_assert_not_reached (); - - reg += amd64_rex_b (rex); - - /* R11 is clobbered by the trampoline code */ - g_assert (reg != AMD64_R11); +/* Determine if code points to a NaCl call-through-register sequence, */ +/* (i.e., the last 3 instructions listed above) */ +int +is_nacl_call_reg_sequence(guint8* code) +{ + const char *sequence = "\x41\x83\xe3\xe0" /* and */ + "\x4d\x03\xdf" /* add */ + "\x41\xff\xd3"; /* call */ + return memcmp(code, sequence, 10) == 0; +} - *displacement = disp; - return (gpointer)regs [reg]; +/* Determine if code points to the first opcode of the mov membase component */ +/* of an indirect call sequence (i.e. 
the first 2 instructions listed above) */ +/* (there could be a REX prefix before the opcode but it is ignored) */ +static int +is_nacl_indirect_call_membase_sequence(guint8* code) +{ + /* Check for mov opcode, reg-reg addressing mode (mod = 3), */ + return code[0] == 0x8b && amd64_modrm_mod(code[1]) == 3 && + /* and that src reg = dest reg */ + amd64_modrm_reg(code[1]) == amd64_modrm_rm(code[1]) && + /* Check that next inst is mov, uses SIB byte (rm = 4), */ + IS_REX(code[2]) && + code[3] == 0x8b && amd64_modrm_rm(code[4]) == 4 && + /* and has dst of r11 and base of r15 */ + (amd64_modrm_reg(code[4]) + amd64_rex_r(code[2])) == AMD64_R11 && + (amd64_sib_base(code[5]) + amd64_rex_b(code[2])) == AMD64_R15; } +#endif /* __native_client_codegen__ */ int -mono_arch_get_this_arg_reg (MonoMethodSignature *sig, MonoGenericSharingContext *gsctx, guint8 *code) +mono_arch_get_this_arg_reg (guint8 *code) { - int this_reg = AMD64_ARG_REG1; - - if (MONO_TYPE_ISSTRUCT (sig->ret)) { - CallInfo *cinfo; - - if (!gsctx && code) - gsctx = mono_get_generic_context_from_code (code); - - cinfo = get_call_info (gsctx, NULL, sig, FALSE); - - if (cinfo->ret.storage != ArgValuetypeInReg) - this_reg = AMD64_ARG_REG2; - g_free (cinfo); - } - - return this_reg; + return AMD64_ARG_REG1; } gpointer -mono_arch_get_this_arg_from_call (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, mgreg_t *regs, guint8 *code) +mono_arch_get_this_arg_from_call (mgreg_t *regs, guint8 *code) { - return (gpointer)regs [mono_arch_get_this_arg_reg (sig, gsctx, code)]; + return (gpointer)regs [mono_arch_get_this_arg_reg (code)]; } #define MAX_ARCH_DELEGATE_PARAMS 10 @@ -6862,7 +7734,7 @@ get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *cod /* We have to shift the arguments left */ amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8); for (i = 0; i < param_count; ++i) { -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 if (i < 3) amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8); else @@ -6877,18 +7749,32 @@ get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *cod g_assert ((code - start) < 64); } + nacl_global_codeman_validate(&start, 64, &code); + mono_debug_add_delegate_trampoline (start, code - start); if (code_len) *code_len = code - start; + + if (mono_jit_map_is_enabled ()) { + char *buff; + if (has_target) + buff = (char*)"delegate_invoke_has_target"; + else + buff = g_strdup_printf ("delegate_invoke_no_target_%d", param_count); + mono_emit_jit_tramp (start, code - start, buff); + if (!has_target) + g_free (buff); + } + return start; } /* * mono_arch_get_delegate_invoke_impls: * - * Return a list of MonoAotTrampInfo structures for the delegate invoke impl + * Return a list of MonoTrampInfo structures for the delegate invoke impl * trampolines. 
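 *
 * For reference, the no-target variant produced by get_delegate_invoke_impl
 * above just shifts every argument register down by one slot before
 * jumping to the method, roughly (System V argument order assumed):
 *
 *   rdi <- rsi, rsi <- rdx, rdx <- rcx, rcx <- r8, r8 <- r9
 *
 * so that 'del.Invoke (a, b)' with the delegate object in %rdi reaches a
 * static target expecting 'a' in %rdi and 'b' in %rsi.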
*/ GSList* @@ -6900,11 +7786,11 @@ mono_arch_get_delegate_invoke_impls (void) { GSList *res = NULL; guint8 *code; guint32 code_len; int i; code = get_delegate_invoke_impl (TRUE, 0, &code_len); - res = g_slist_prepend (res, mono_aot_tramp_info_create (g_strdup ("delegate_invoke_impl_has_target"), code, code_len)); + res = g_slist_prepend (res, mono_tramp_info_create (g_strdup ("delegate_invoke_impl_has_target"), code, code_len, NULL, NULL)); for (i = 0; i < MAX_ARCH_DELEGATE_PARAMS; ++i) { code = get_delegate_invoke_impl (FALSE, i, &code_len); - res = g_slist_prepend (res, mono_aot_tramp_info_create (g_strdup_printf ("delegate_invoke_impl_target_%d", i), code, code_len)); + res = g_slist_prepend (res, mono_tramp_info_create (g_strdup_printf ("delegate_invoke_impl_target_%d", i), code, code_len, NULL, NULL)); } return res; @@ -6930,7 +7816,7 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe return cached; if (mono_aot_only) - start = mono_aot_get_named_code ("delegate_invoke_impl_has_target"); + start = mono_aot_get_trampoline ("delegate_invoke_impl_has_target"); else start = get_delegate_invoke_impl (TRUE, 0, NULL); @@ -6951,7 +7837,7 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe if (mono_aot_only) { char *name = g_strdup_printf ("delegate_invoke_impl_target_%d", sig->param_count); - start = mono_aot_get_named_code (name); + start = mono_aot_get_trampoline (name); g_free (name); } else { start = get_delegate_invoke_impl (FALSE, sig->param_count, NULL); @@ -6976,7 +7862,7 @@ void mono_arch_setup_jit_tls_data (MonoJitTlsData *tls) { if (!tls_offset_inited) { -#ifdef PLATFORM_WIN32 +#ifdef HOST_WIN32 /* * We need to init this multiple times, since when we are first called, the key might not * be initialized yet. @@ -7009,6 +7895,7 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls) #ifdef MONO_ARCH_HAVE_IMT +#if defined(__default_codegen__) #define CMP_SIZE (6 + 1) #define CMP_REG_REG_SIZE (4 + 1) #define BR_SMALL_SIZE 2 #define BR_LARGE_SIZE 6 #define MOV_REG_IMM_SIZE 10 #define MOV_REG_IMM_32BIT_SIZE 6 #define JUMP_REG_SIZE (2 + 1) +#elif defined(__native_client_codegen__) +/* NaCl N-byte instructions can be padded up to N-1 bytes */ +#define CMP_SIZE ((6 + 1) * 2 - 1) +#define CMP_REG_REG_SIZE ((4 + 1) * 2 - 1) +#define BR_SMALL_SIZE (2 * 2 - 1) +#define BR_LARGE_SIZE (6 * 2 - 1) +#define MOV_REG_IMM_SIZE (10 * 2 - 1) +#define MOV_REG_IMM_32BIT_SIZE (6 * 2 - 1) +/* Jump reg for NaCl adds a mask (+4) and add (+3) */ +#define JUMP_REG_SIZE ((2 + 1 + 4 + 3) * 2 - 1) +/* Jump membase's size is large and unpredictable */ +/* in Native Client, so just pad it out to a whole bundle. 
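+ *
+ * (These byte counts feed the size estimate in mono_arch_build_imt_thunk
+ * below; the thunk it emits is, in outline, a chain of the form
+ *
+ *   cmp MONO_ARCH_IMT_REG, key0 ; jne next0 ; jmp [vtable_slot0]
+ * next0:
+ *   cmp MONO_ARCH_IMT_REG, key1 ; ...
+ *
+ * with a binary-search split once there are enough entries, so the
+ * estimate just sums the worst-case encoding of each compare, branch
+ * and jump.)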
*/ +#define JUMP_MEMBASE_SIZE (kNaClAlignment) +#endif static int imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target) @@ -7055,6 +7956,9 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI item->chunk_size += MOV_REG_IMM_32BIT_SIZE; else item->chunk_size += MOV_REG_IMM_SIZE; +#ifdef __native_client_codegen__ + item->chunk_size += JUMP_MEMBASE_SIZE; +#endif } item->chunk_size += BR_SMALL_SIZE + JUMP_REG_SIZE; } else { @@ -7070,6 +7974,9 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI /* with assert below: * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1; */ +#ifdef __native_client_codegen__ + item->chunk_size += JUMP_MEMBASE_SIZE; +#endif } } } else { @@ -7082,10 +7989,16 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI } size += item->chunk_size; } +#if defined(__native_client__) && defined(__native_client_codegen__) + /* In Native Client, we don't re-use thunks, allocate from the */ + /* normal code manager paths. */ + code = mono_domain_code_reserve (domain, size); +#else if (fail_tramp) code = mono_method_alloc_generic_virtual_thunk (domain, size); else code = mono_domain_code_reserve (domain, size); +#endif start = code; for (i = 0; i < count; ++i) { MonoIMTCheckItem *item = imt_entries [i]; @@ -7098,25 +8011,24 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI if (amd64_is_imm32 (item->key)) amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key); else { - amd64_mov_reg_imm (code, AMD64_R10, item->key); - amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key); + amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG); } } item->jmp_code = code; amd64_branch8 (code, X86_CC_NE, 0, FALSE); - /* See the comment below about R10 */ if (item->has_target_code) { - amd64_mov_reg_imm (code, AMD64_R10, item->value.target_code); - amd64_jump_reg (code, AMD64_R10); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->value.target_code); + amd64_jump_reg (code, MONO_ARCH_IMT_SCRATCH_REG); } else { - amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot])); - amd64_jump_membase (code, AMD64_R10, 0); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, & (vtable->vtable [item->value.vtable_slot])); + amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0); } if (fail_case) { amd64_patch (item->jmp_code, code); - amd64_mov_reg_imm (code, AMD64_R10, fail_tramp); - amd64_jump_reg (code, AMD64_R10); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, fail_tramp); + amd64_jump_reg (code, MONO_ARCH_IMT_SCRATCH_REG); item->jmp_code = NULL; } } else { @@ -7125,33 +8037,33 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI if (amd64_is_imm32 (item->key)) amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key); else { - amd64_mov_reg_imm (code, AMD64_R10, item->key); - amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key); + amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG); } item->jmp_code = code; amd64_branch8 (code, X86_CC_NE, 0, FALSE); /* See the comment below about R10 */ - amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot])); - amd64_jump_membase (code, AMD64_R10, 0); + amd64_mov_reg_imm (code, 
MONO_ARCH_IMT_SCRATCH_REG, & (vtable->vtable [item->value.vtable_slot])); + amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0); amd64_patch (item->jmp_code, code); amd64_breakpoint (code); item->jmp_code = NULL; #else - /* We're using R10 here because R11 + /* We're using R10 (MONO_ARCH_IMT_SCRATCH_REG) here because R11 (MONO_ARCH_IMT_REG) needs to be preserved. R10 needs to be preserved for calls which require a runtime generic context, but interface calls don't. */ - amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot])); - amd64_jump_membase (code, AMD64_R10, 0); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, & (vtable->vtable [item->value.vtable_slot])); + amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0); #endif } } else { if (amd64_is_imm32 (item->key)) amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key); else { - amd64_mov_reg_imm (code, AMD64_R10, item->key); - amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key); + amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG); } item->jmp_code = code; if (x86_is_imm8 (imt_branch_distance (imt_entries, i, item->check_target_idx))) @@ -7175,6 +8087,8 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI mono_stats.imt_thunks_size += code - start; g_assert (code - start <= size); + nacl_domain_code_validate(domain, &start, size, &code); + return start; } @@ -7183,12 +8097,6 @@ mono_arch_find_imt_method (mgreg_t *regs, guint8 *code) { return (MonoMethod*)regs [MONO_ARCH_IMT_REG]; } - -MonoObject* -mono_arch_find_this_argument (mgreg_t *regs, MonoMethod *method, MonoGenericSharingContext *gsctx) -{ - return mono_arch_get_this_arg_from_call (gsctx, mono_method_signature (method), regs, NULL); -} #endif MonoVTable* @@ -7197,6 +8105,17 @@ mono_arch_find_static_call_vtable (mgreg_t *regs, guint8 *code) { return (MonoVTable*) regs [MONO_ARCH_RGCTX_REG]; } +GSList* +mono_arch_get_cie_program (void) +{ + GSList *l = NULL; + + mono_add_unwind_op_def_cfa (l, (guint8*)NULL, (guint8*)NULL, AMD64_RSP, 8); + mono_add_unwind_op_offset (l, (guint8*)NULL, (guint8*)NULL, AMD64_RIP, -8); + + return l; +} + MonoInst* mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args) { @@ -7310,3 +8229,260 @@ mono_arch_context_get_int_reg (MonoContext *ctx, int reg) g_assert_not_reached (); } } + +/*MONO_ARCH_HAVE_HANDLER_BLOCK_GUARD*/ +gpointer +mono_arch_install_handler_block_guard (MonoJitInfo *ji, MonoJitExceptionInfo *clause, MonoContext *ctx, gpointer new_value) +{ + int offset; + gpointer *sp, old_value; + char *bp; + const unsigned char *handler; + + /* Decode the first instruction to figure out where we stored the spvar */ + /* Our jit MUST generate the following: + mov %rsp, ?(%rbp) + + Which is encoded as: REX.W 0x89 mod_rm + mod_rm (rsp, rbp, imm) which can be: (imm will never be zero) + mod (reg + imm8): 01 reg(rsp): 100 rm(rbp): 101 -> 01100101 (0x65) + mod (reg + imm32): 10 reg(rsp): 100 rm(rbp): 101 -> 10100101 (0xA5) + + FIXME: can we generate frameless methods in this case? 
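+
+	 A worked decode under those rules: the byte sequence 48 89 65 F0 is
+	 REX.W (0x48), opcode 0x89, mod_rm 0x65 = 01 100 101 (mod = 1 -> disp8,
+	 reg = 100 -> %rsp, rm = 101 -> %rbp) and the displacement byte
+	 0xF0 = -16, i.e. 'mov %rsp, -0x10(%rbp)', so the code below would
+	 recover offset = -16.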
+ */ + handler = clause->handler_start; + + /* REX.W */ + if (*handler != 0x48) + return NULL; + ++handler; + + /* mov r, r/m */ + if (*handler != 0x89) + return NULL; + ++handler; + + if (*handler == 0x65) + offset = *(signed char*)(handler + 1); + else if (*handler == 0xA5) + offset = *(int*)(handler + 1); + else + return NULL; + + /* Load the spvar */ + bp = MONO_CONTEXT_GET_BP (ctx); + sp = *(gpointer*)(bp + offset); + + old_value = *sp; + if (old_value < ji->code_start || (char*)old_value > ((char*)ji->code_start + ji->code_size)) + return old_value; + + *sp = new_value; + + return old_value; +} + +/* + * mono_arch_emit_load_aotconst: + * + * Emit code to load the contents of the GOT slot identified by TRAMP_TYPE and + * TARGET from the mscorlib GOT in full-aot code. + * On AMD64, the result is placed into R11. + */ +guint8* +mono_arch_emit_load_aotconst (guint8 *start, guint8 *code, MonoJumpInfo **ji, int tramp_type, gconstpointer target) +{ + *ji = mono_patch_info_list_prepend (*ji, code - start, tramp_type, target); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, 8); + + return code; +} + +/* + * mono_arch_get_trampolines: + * + * Return a list of MonoTrampInfo structures describing arch specific trampolines + * for AOT. + */ +GSList * +mono_arch_get_trampolines (gboolean aot) +{ + return mono_amd64_get_exception_trampolines (aot); +} + +/* Soft Debug support */ +#ifdef MONO_ARCH_SOFT_DEBUG_SUPPORTED + +/* + * mono_arch_set_breakpoint: + * + * Set a breakpoint at the native code corresponding to JI at NATIVE_OFFSET. + * The location should contain code emitted by OP_SEQ_POINT. + */ +void +mono_arch_set_breakpoint (MonoJitInfo *ji, guint8 *ip) +{ + guint8 *code = ip; + guint8 *orig_code = code; + + /* + * In production we will use int3 (the instruction sizes in the .md file + * have to be fixed first). But that could confuse gdb, so during + * development we emit a SIGSEGV-triggering read instead. + */ + g_assert (code [0] == 0x90); + if (breakpoint_size == 8) { + amd64_mov_reg_mem (code, AMD64_R11, (guint64)bp_trigger_page, 4); + } else { + amd64_mov_reg_imm_size (code, AMD64_R11, (guint64)bp_trigger_page, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_R11, 0, 4); + } + + g_assert (code - orig_code == breakpoint_size); +} + +/* + * mono_arch_clear_breakpoint: + * + * Clear the breakpoint at IP. + */ +void +mono_arch_clear_breakpoint (MonoJitInfo *ji, guint8 *ip) +{ + guint8 *code = ip; + int i; + + for (i = 0; i < breakpoint_size; ++i) + x86_nop (code); +} + +gboolean +mono_arch_is_breakpoint_event (void *info, void *sigctx) +{ +#ifdef HOST_WIN32 + EXCEPTION_RECORD* einfo = (EXCEPTION_RECORD*)info; + return FALSE; +#else + siginfo_t* sinfo = (siginfo_t*) info; + /* Sometimes the address is off by 4 */ + if (sinfo->si_addr >= bp_trigger_page && (guint8*)sinfo->si_addr <= (guint8*)bp_trigger_page + 128) + return TRUE; + else + return FALSE; +#endif +} + +/* + * mono_arch_get_ip_for_breakpoint: + * + * Convert the ip in CTX to the address where a breakpoint was placed. + */ +guint8* +mono_arch_get_ip_for_breakpoint (MonoJitInfo *ji, MonoContext *ctx) +{ + guint8 *ip = MONO_CONTEXT_GET_IP (ctx); + + /* ip points to the instruction causing the fault */ + ip -= (breakpoint_size - breakpoint_fault_size); + + return ip; +} + +/* + * mono_arch_skip_breakpoint: + * + * Modify CTX so the ip is placed after the breakpoint instruction, so when + * we resume, the instruction is not executed again. 
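+ *
+ * The arithmetic mirrors mono_arch_get_ip_for_breakpoint above: the
+ * faulting load forms the last breakpoint_fault_size bytes of the
+ * breakpoint_size-byte sequence, so (sketch, using the globals declared
+ * at the top of this file)
+ *
+ *   seq_point_ip = fault_ip - (breakpoint_size - breakpoint_fault_size);
+ *   resume_ip    = fault_ip + breakpoint_fault_size;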
+ */ +void +mono_arch_skip_breakpoint (MonoContext *ctx) +{ + MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + breakpoint_fault_size); +} + +/* + * mono_arch_start_single_stepping: + * + * Start single stepping. + */ +void +mono_arch_start_single_stepping (void) +{ + mono_mprotect (ss_trigger_page, mono_pagesize (), 0); +} + +/* + * mono_arch_stop_single_stepping: + * + * Stop single stepping. + */ +void +mono_arch_stop_single_stepping (void) +{ + mono_mprotect (ss_trigger_page, mono_pagesize (), MONO_MMAP_READ); +} + +/* + * mono_arch_is_single_step_event: + * + * Return whether the machine state in SIGCTX corresponds to a single + * step event. + */ +gboolean +mono_arch_is_single_step_event (void *info, void *sigctx) +{ +#ifdef HOST_WIN32 + EXCEPTION_RECORD* einfo = (EXCEPTION_RECORD*)info; + return FALSE; +#else + siginfo_t* sinfo = (siginfo_t*) info; + /* Sometimes the address is off by 4 */ + if (sinfo->si_addr >= ss_trigger_page && (guint8*)sinfo->si_addr <= (guint8*)ss_trigger_page + 128) + return TRUE; + else + return FALSE; +#endif +} + +/* + * mono_arch_get_ip_for_single_step: + * + * Convert the ip in CTX to the address stored in seq_points. + */ +guint8* +mono_arch_get_ip_for_single_step (MonoJitInfo *ji, MonoContext *ctx) +{ + guint8 *ip = MONO_CONTEXT_GET_IP (ctx); + + ip += single_step_fault_size; + + return ip; +} + +/* + * mono_arch_skip_single_step: + * + * Modify CTX so the ip is placed after the single step trigger instruction, + * so when we resume, the instruction is not executed again. + */ +void +mono_arch_skip_single_step (MonoContext *ctx) +{ + MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + single_step_fault_size); +} + +/* + * mono_arch_get_seq_point_info: + * + * Return a pointer to a data structure which is used by the sequence + * point implementation in AOTed code. + */ +gpointer +mono_arch_get_seq_point_info (MonoDomain *domain, guint8 *code) +{ + NOT_IMPLEMENTED; + return NULL; +} + +#endif