From: Elijah Taylor Date: Thu, 6 Jan 2011 00:02:57 +0000 (-0800) Subject: Merge remote branch 'upstream/master' X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=commitdiff_plain;h=5b558abeeb255a3179d4ca6a85617e051c6abd38;hp=45890e8b46ac438d2b8ccc1bd0d74eea31870de1;p=mono.git Merge remote branch 'upstream/master' --- diff --git a/configure.in b/configure.in index 540f0309b69..ac9717d2a31 100644 --- a/configure.in +++ b/configure.in @@ -196,6 +196,7 @@ case "$host" in use_sigposix=yes ikvm_native=no AC_DEFINE(DISABLE_SOCKETS,1,[Disable sockets support]) + AC_DEFINE(DISABLE_ATTACH, 1, [Disable agent attach support]) ;; *-*-hpux*) host_win32=no @@ -611,6 +612,10 @@ if test "x$enable_shared" = "xno"; then with_shared_mono=no fi +case $host in +*nacl* ) with_shared_mono=yes;; +esac + if test "x$host_win32" = "xyes"; then # Boehm GC requires the runtime to be in its own dll with_static_mono=no @@ -1980,6 +1985,7 @@ dnl *** NaCl *** dnl ************** AC_ARG_ENABLE(nacl_codegen, [ --enable-nacl-codegen Enable Native Client code generation], enable_nacl_codegen=$enableval, enable_nacl_codegen=no) +AC_ARG_ENABLE(nacl_gc, [ --enable-nacl-gc Enable Native Client garbage collection], enable_nacl_gc=$enableval, enable_nacl_gc=no) AM_CONDITIONAL(NACL_CODEGEN, test x$enable_nacl_codegen != xno) if test "x$enable_nacl_codegen" = "xyes"; then @@ -1988,6 +1994,10 @@ if test "x$enable_nacl_codegen" = "xyes"; then AC_DEFINE(TARGET_NACL, 1, [...]) else MONO_NACL_ALIGN_MASK_OFF=0 + CPPFLAGS="$CPPFLAGS -D__default_codegen__" +fi +if test "x$enable_nacl_gc" = "xyes"; then + CPPFLAGS="$CPPFLAGS -finstrument-for-thread-suspension -D__native_client_gc__" fi AC_SUBST(MONO_NACL_ALIGN_MASK_OFF) @@ -2140,6 +2150,12 @@ case "$host" in sgen_supported=true ;; esac + case "$host" in + x86_64-*-nacl*) + AC_DEFINE(__mono_ilp32__, 1, [64 bit mode with 4 byte longs and pointers]) + sizeof_register=8 + ;; + esac ;; ia64-*-*) TARGET=IA64 @@ -2291,6 +2307,14 @@ if test "x$host" != "x$target"; then sizeof_register=8 target_byte_order=G_BIG_ENDIAN ;; + x86_64-*-nacl) + TARGET=AMD64 + arch_target=amd64 + AC_DEFINE(TARGET_AMD64, 1, [...]) + AC_DEFINE(MONO_CROSS_COMPILE,1,[The runtime is compiled for cross-compiling mode]) + AC_DEFINE(__mono_ilp32__, 1, [64 bit mode with 4 byte longs and pointers]) + sizeof_register=8 + ;; *) AC_MSG_WARN([Cross compiling is only supported for targets matching 'powerpc64-{ps3,xbox360}-linux-gnu']) esac @@ -2548,6 +2572,10 @@ case "x$gc" in if test x$TARGET = xSPARC -o x$TARGET = xSPARC64; then LIBGC_CPPFLAGS=`echo $LIBGC_CPPFLAGS | sed -e 's/-D_FILE_OFFSET_BITS=64//g'` fi + # Don't pass -finstrument-for-thread-suspension in, + # if these are instrumented it will be very bad news + # (infinite recursion, undefined parking behavior, etc) + LIBGC_CPPFLAGS=`echo $LIBGC_CPPFLAGS | sed -e 's/-finstrument-for-thread-suspension//g'` ac_configure_args="$ac_configure_args --disable-embed-check --with-libgc-threads=$libgc_threads $libgc_configure_args \"CPPFLAGS_FOR_LIBGC=$LIBGC_CPPFLAGS\" \"CFLAGS_FOR_LIBGC=$CFLAGS_FOR_LIBGC\"" AC_CONFIG_SUBDIRS(libgc) ;; diff --git a/libgc/configure.in b/libgc/configure.in index c7ce110c6c2..e68d7382883 100644 --- a/libgc/configure.in +++ b/libgc/configure.in @@ -94,7 +94,7 @@ case "$THREADS" in fi AC_DEFINE(THREAD_LOCAL_ALLOC) ;; - *-*-linux*) + *-*-linux* | *-*-nacl*) AC_DEFINE(GC_LINUX_THREADS) AC_DEFINE(_REENTRANT) ;; @@ -340,6 +340,9 @@ case "$host" in machdep="mach_dep.lo ia64_save_regs_in_stack.lo" target_ia64=true ;; + *-*-nacl*) + AC_DEFINE(NO_EXECUTE_PERMISSION) + ;; 
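(Illustration, not part of the patch.) The --enable-nacl-gc block above adds -finstrument-for-thread-suspension and -D__native_client_gc__ to CPPFLAGS for the runtime, while the LIBGC_CPPFLAGS sed above strips the flag again, since instrumenting the collector itself would recurse into its own stop-the-world code. A minimal sketch of what instrumented runtime code is expected to do around a blocking call, using only the hooks this patch defines in libgc/pthread_stop_world.c; the wrapper function below is hypothetical:

#include <unistd.h>

/* Hooks added by this patch in libgc/pthread_stop_world.c. */
void nacl_pre_syscall_hook (void);
void nacl_post_syscall_hook (void);

/* Hypothetical wrapper: code built with -finstrument-for-thread-suspension */
/* is expected to bracket blocking operations roughly like this, so the     */
/* collector can treat the thread as parked while it blocks.                */
static ssize_t
read_with_gc_parking (int fd, void *buf, size_t len)
{
        ssize_t r;
        nacl_pre_syscall_hook ();   /* snapshot registers/stack, mark thread parked */
        r = read (fd, buf, len);    /* the GC may stop the world while we block here */
        nacl_post_syscall_hook ();  /* suspend if a collection is pending, then unpark */
        return r;
}
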
esac if test x"$machdep" = x; then AC_MSG_RESULT($machdep) diff --git a/libgc/gc_dlopen.c b/libgc/gc_dlopen.c index 4c690edcfe4..6ca9e996fb2 100644 --- a/libgc/gc_dlopen.c +++ b/libgc/gc_dlopen.c @@ -25,7 +25,7 @@ #include "private/gc_priv.h" -# if (defined(GC_PTHREADS) && !defined(GC_DARWIN_THREADS)) \ +# if defined(DYNAMIC_LOADING) && (defined(GC_PTHREADS) && !defined(GC_DARWIN_THREADS)) \ || defined(GC_SOLARIS_THREADS) # if defined(dlopen) && !defined(GC_USE_LD_WRAP) diff --git a/libgc/include/gc_pthread_redirects.h b/libgc/include/gc_pthread_redirects.h index bedcc26ac7e..520a36bf66c 100644 --- a/libgc/include/gc_pthread_redirects.h +++ b/libgc/include/gc_pthread_redirects.h @@ -59,6 +59,11 @@ #endif int GC_pthread_join(pthread_t thread, void **retval); int GC_pthread_detach(pthread_t thread); +#if defined(__native_client__) || defined(NACL) + void GC_pthread_exit(void *status); +# undef pthread_exit +# define pthread_exit GC_pthread_exit +#endif #if defined(GC_OSF1_THREADS) \ && defined(_PTHREAD_USE_MANGLED_NAMES_) && !defined(_PTHREAD_USE_PTDNAM_) diff --git a/libgc/include/private/gc_priv.h b/libgc/include/private/gc_priv.h index 2efb4732f79..5c4c5146ba2 100644 --- a/libgc/include/private/gc_priv.h +++ b/libgc/include/private/gc_priv.h @@ -1962,6 +1962,8 @@ void GC_err_puts GC_PROTO((GC_CONST char *s)); /* SPARC/Linux doesn't properly define SIGPWR in . * It is aliased to SIGLOST in asm/signal.h, though. */ # define SIG_SUSPEND SIGLOST +# elif defined(NACL) +# define SIG_SUSPEND 0 # else /* Linuxthreads itself uses SIGUSR1 and SIGUSR2. */ # define SIG_SUSPEND SIGPWR diff --git a/libgc/include/private/gcconfig.h b/libgc/include/private/gcconfig.h index d1c46944bfe..8276079d91e 100644 --- a/libgc/include/private/gcconfig.h +++ b/libgc/include/private/gcconfig.h @@ -65,6 +65,11 @@ # endif /* Determine the machine type: */ +# if defined(__native_client__) +# define NACL +# define I386 +# define mach_type_known +# endif # if defined(__arm__) || defined(__thumb__) # define ARM32 # if !defined(LINUX) && !defined(NETBSD) && !defined(DARWIN) @@ -1086,13 +1091,19 @@ # endif # ifdef I386 -# define MACH_TYPE "I386" -# if defined(__LP64__) || defined(_WIN64) -# define CPP_WORDSZ 64 -# define ALIGNMENT 8 -# else +# if defined( NACL ) +# define MACH_TYPE "NACL" # define CPP_WORDSZ 32 # define ALIGNMENT 4 +# else +# define MACH_TYPE "I386" +# if defined(__LP64__) || defined(_WIN64) +# define CPP_WORDSZ 64 +# define ALIGNMENT 8 +# else +# define CPP_WORDSZ 32 +# define ALIGNMENT 4 +# endif /* Appears to hold for all "32 bit" compilers */ /* except Borland. The -a4 option fixes */ /* Borland. 
*/ @@ -1188,7 +1199,32 @@ # define HEAP_START DATAEND # endif /* USE_MMAP */ # endif /* DGUX */ - +# ifdef NACL +# define OS_TYPE "NACL" + extern int etext[]; +# define DATASTART ((ptr_t)((((word) (etext)) + 0xfff) & ~0xfff)) + extern int _end[]; +# define DATAEND (_end) +# ifdef STACK_GRAN +# undef STACK_GRAN +# endif /* STACK_GRAN */ +# define STACK_GRAN 0x10000 +# define HEURISTIC1 +# ifdef USE_MMAP +# undef USE_MMAP +# endif +# ifdef USE_MUNMAP +# undef USE_MUNMAP +# endif +# ifdef USE_MMAP_ANON +# undef USE_MMAP_ANON +# endif +# ifdef USE_MMAP_FIXED +# undef USE_MMAP_FIXED +# endif +# define GETPAGESIZE() 65536 +# define MAX_NACL_GC_THREADS 1024 +# endif # ifdef LINUX # ifndef __GNUC__ /* The Intel compiler doesn't like inline assembly */ @@ -2271,7 +2307,7 @@ # if defined(GC_IRIX_THREADS) && !defined(IRIX5) --> inconsistent configuration # endif -# if defined(GC_LINUX_THREADS) && !defined(LINUX) +# if defined(GC_LINUX_THREADS) && !(defined(LINUX) || defined(NACL)) --> inconsistent configuration # endif # if defined(GC_SOLARIS_THREADS) && !defined(SUNOS5) diff --git a/libgc/include/private/pthread_stop_world.h b/libgc/include/private/pthread_stop_world.h index 054c7a0eacd..bd72739f580 100644 --- a/libgc/include/private/pthread_stop_world.h +++ b/libgc/include/private/pthread_stop_world.h @@ -7,6 +7,15 @@ struct thread_stop_info { /* last successfully handled a suspend */ /* signal. */ ptr_t stack_ptr; /* Valid only when stopped. */ +#ifdef NACL +/* Grab NACL_GC_REG_STORAGE_SIZE pointers off the stack when going into */ +/* a syscall. 20 is more than we need, but it's an overestimate in case*/ +/* the instrumented function uses any callee saved registers, they may */ +/* be pushed to the stack much earlier. Also, on amd64 'push' puts 8 */ +/* bytes on the stack even though our pointers are 4 bytes. */ +#define NACL_GC_REG_STORAGE_SIZE 20 + ptr_t reg_storage[NACL_GC_REG_STORAGE_SIZE]; +#endif }; #endif diff --git a/libgc/include/private/pthread_support.h b/libgc/include/private/pthread_support.h index 852d3815bc6..dbd6dbcde89 100644 --- a/libgc/include/private/pthread_support.h +++ b/libgc/include/private/pthread_support.h @@ -92,6 +92,9 @@ typedef struct GC_Thread_Rep { # define THREAD_TABLE_SZ 128 /* Must be power of 2 */ extern volatile GC_thread GC_threads[THREAD_TABLE_SZ]; +#ifdef NACL +extern __thread GC_thread gc_thread_self; +#endif extern GC_bool GC_thr_initialized; diff --git a/libgc/os_dep.c b/libgc/os_dep.c index ee2e409ad42..ecaa27c7d82 100644 --- a/libgc/os_dep.c +++ b/libgc/os_dep.c @@ -133,7 +133,7 @@ # include #endif -#ifdef UNIX_LIKE +#if defined( UNIX_LIKE ) || defined(NACL) # include #endif @@ -618,6 +618,12 @@ void GC_enable_signals(void) /* longjmp implementations. Most systems appear not to have */ /* a signal 32. */ # define SIGSETMASK(old, new) (old) = sigsetmask(new) +# elif defined(NACL) + /* We don't use signals in NaCl. 
*/ +# define SIGSET_T int +# define SIG_DEL(set, signal) +# define SIG_FILL(set) +# define SIGSETMASK(old, new) # else /* Use POSIX/SYSV interface */ # define SIGSET_T sigset_t @@ -2067,8 +2073,21 @@ void GC_remap(ptr_t start, word bytes) int result; if (0 == start_addr) return; +#ifdef NACL + { + /* NaCl doesn't expose mprotect, but mmap should work fine */ + void * mmap_result; + mmap_result = mmap(start_addr, len, PROT_READ | PROT_WRITE | OPT_PROT_EXEC, + MAP_PRIVATE | MAP_FIXED | OPT_MAP_ANON, + zero_fd, 0/* offset */); + if (mmap_result != (void *)start_addr) ABORT("mmap as mprotect failed"); + /* Fake the return value as if mprotect succeeded. */ + result = 0; + } +#else /* NACL */ result = mprotect(start_addr, len, PROT_READ | PROT_WRITE | OPT_PROT_EXEC); +#endif /* NACL */ if (result != 0) { GC_err_printf3( "Mprotect failed at 0x%lx (length %ld) with errno %ld\n", diff --git a/libgc/pthread_stop_world.c b/libgc/pthread_stop_world.c index bf2faafe3d6..b1f98099d1e 100644 --- a/libgc/pthread_stop_world.c +++ b/libgc/pthread_stop_world.c @@ -9,6 +9,7 @@ #include #include #include +#include /* work around a dlopen issue (bug #75390), undefs to avoid warnings with redefinitions */ #undef PACKAGE_BUGREPORT @@ -22,6 +23,19 @@ #include "include/libgc-mono-debugger.h" #endif +#ifdef NACL +int nacl_park_threads_now = 0; +pthread_t nacl_thread_parker = -1; + +int nacl_thread_parked[MAX_NACL_GC_THREADS]; +int nacl_thread_used[MAX_NACL_GC_THREADS]; +int nacl_thread_parking_inited = 0; +int nacl_num_gc_threads = 0; +pthread_mutex_t nacl_thread_alloc_lock = PTHREAD_MUTEX_INITIALIZER; +__thread int nacl_thread_idx = -1; +__thread GC_thread nacl_gc_thread_self = NULL; +#endif + #if DEBUG_THREADS #ifndef NSIG @@ -36,6 +50,7 @@ # endif #endif +#ifndef NACL void GC_print_sig_mask() { sigset_t blocked; @@ -49,7 +64,7 @@ void GC_print_sig_mask() } GC_printf0("\n"); } - +#endif /* NACL */ #endif /* Remove the signals that we want to allow in thread stopping */ @@ -116,6 +131,7 @@ sem_t GC_suspend_ack_sem; static void _GC_suspend_handler(int sig) { +#ifndef NACL int dummy; pthread_t my_thread = pthread_self(); GC_thread me; @@ -185,6 +201,8 @@ static void _GC_suspend_handler(int sig) #if DEBUG_THREADS GC_printf1("Continuing 0x%lx\n", my_thread); #endif + +#endif /* NACL */ } void GC_suspend_handler(int sig) @@ -278,6 +296,10 @@ static void pthread_push_all_stacks() # else GC_push_all_stack(lo, hi); # endif +# ifdef NACL + /* Push reg_storage as roots, this will cover the reg context */ + GC_push_all_stack(p -> stop_info.reg_storage, p -> stop_info.reg_storage + NACL_GC_REG_STORAGE_SIZE); +# endif # ifdef IA64 # if DEBUG_THREADS GC_printf3("Reg stack for thread 0x%lx = [%lx,%lx)\n", @@ -337,6 +359,7 @@ int android_thread_kill(pid_t tid, int sig) /* were sent. */ int GC_suspend_all() { +#ifndef NACL int n_live_threads = 0; int i; GC_thread p; @@ -375,11 +398,15 @@ int GC_suspend_all() } } return n_live_threads; +#else /* NACL */ + return 0; +#endif } /* Caller holds allocation lock. 
*/ static void pthread_stop_world() { +#ifndef NACL int i; int n_live_threads; int code; @@ -431,8 +458,128 @@ static void pthread_stop_world() GC_printf1("World stopped from 0x%lx\n", pthread_self()); #endif GC_stopping_thread = 0; /* debugging only */ +#else /* NACL */ + GC_thread p; + int i; + + #if DEBUG_THREADS + GC_printf1("pthread_stop_world: num_threads %d\n", nacl_num_gc_threads - 1); + #endif + nacl_thread_parker = pthread_self(); + nacl_park_threads_now = 1; + + while (1) { + #define NACL_PARK_WAIT_NANOSECONDS 100000 + int num_threads_parked = 0; + struct timespec ts; + int num_used = 0; + /* Check the 'parked' flag for each thread the GC knows about */ + for (i = 0; i < MAX_NACL_GC_THREADS && num_used < nacl_num_gc_threads; i++) { + if (nacl_thread_used[i] == 1) { + num_used++; + if (nacl_thread_parked[i] == 1) { + num_threads_parked++; + } + } + } + /* -1 for the current thread */ + if (num_threads_parked >= nacl_num_gc_threads - 1) + break; + ts.tv_sec = 0; + ts.tv_nsec = NACL_PARK_WAIT_NANOSECONDS; + #if DEBUG_THREADS + GC_printf1("sleeping waiting for %d threads to park...\n", nacl_num_gc_threads - num_threads_parked - 1); + #endif + nanosleep(&ts, 0); + } + +#endif /* NACL */ } + +#ifdef NACL + +#if __x86_64__ + +#define NACL_STORE_REGS() \ + do { \ + asm("push %rbx");\ + asm("push %rbp");\ + asm("push %r12");\ + asm("push %r13");\ + asm("push %r14");\ + asm("push %r15");\ + asm("mov %%esp, %0" : "=m" (nacl_gc_thread_self->stop_info.stack_ptr));\ + memcpy(nacl_gc_thread_self->stop_info.reg_storage, nacl_gc_thread_self->stop_info.stack_ptr, NACL_GC_REG_STORAGE_SIZE * sizeof(ptr_t));\ + asm("add $48, %esp");\ + asm("add %r15, %rsp");\ + } while (0) + +#elif __i386__ + +#define NACL_STORE_REGS() \ + do { \ + asm("push %ebx");\ + asm("push %ebp");\ + asm("push %esi");\ + asm("push %edi");\ + asm("mov %%esp, %0" : "=m" (nacl_gc_thread_self->stop_info.stack_ptr));\ + memcpy(nacl_gc_thread_self->stop_info.reg_storage, nacl_gc_thread_self->stop_info.stack_ptr, NACL_GC_REG_STORAGE_SIZE * sizeof(ptr_t));\ + asm("add $16, %esp");\ + } while (0) + +#endif + +void nacl_pre_syscall_hook() +{ + int local_dummy = 0; + if (nacl_thread_idx != -1) { + NACL_STORE_REGS(); + nacl_gc_thread_self->stop_info.stack_ptr = (ptr_t)(&local_dummy); + nacl_thread_parked[nacl_thread_idx] = 1; + } +} + +void nacl_post_syscall_hook() +{ + /* Calling __nacl_suspend_thread_if_needed() right away should guarantee we don't mutate the GC set. */ + __nacl_suspend_thread_if_needed(); + if (nacl_thread_idx != -1) { + nacl_thread_parked[nacl_thread_idx] = 0; + } +} + +void __nacl_suspend_thread_if_needed() { + if (nacl_park_threads_now) { + pthread_t self = pthread_self(); + int local_dummy = 0; + /* Don't try to park the thread parker. */ + if (nacl_thread_parker == self) + return; + + /* This can happen when a thread is created */ + /* outside of the GC system (wthread mostly). */ + if (nacl_thread_idx < 0) + return; + + /* If it was already 'parked', we're returning from a syscall, */ + /* so don't bother storing registers again, the GC has a set. */ + if (!nacl_thread_parked[nacl_thread_idx]) { + NACL_STORE_REGS(); + nacl_gc_thread_self->stop_info.stack_ptr = (ptr_t)(&local_dummy); + } + nacl_thread_parked[nacl_thread_idx] = 1; + while (nacl_park_threads_now) + ; /* spin */ + nacl_thread_parked[nacl_thread_idx] = 0; + + /* Clear out the reg storage for next suspend. 
*/ + memset(nacl_gc_thread_self->stop_info.reg_storage, 0, NACL_GC_REG_STORAGE_SIZE * sizeof(ptr_t)); + } +} + +#endif /* NACL */ + /* Caller holds allocation lock. */ void GC_stop_world() { @@ -465,6 +612,7 @@ void GC_stop_world() /* the world stopped. */ static void pthread_start_world() { +#ifndef NACL pthread_t my_thread = pthread_self(); register int i; register GC_thread p; @@ -525,6 +673,12 @@ static void pthread_start_world() #if DEBUG_THREADS GC_printf0("World started\n"); #endif +#else /* NACL */ +# if DEBUG_THREADS + GC_printf0("World starting\n"); +# endif + nacl_park_threads_now = 0; +#endif /* NACL */ } void GC_start_world() @@ -538,6 +692,7 @@ void GC_start_world() } static void pthread_stop_init() { +#ifndef NACL struct sigaction act; if (sem_init(&GC_suspend_ack_sem, 0, 0) != 0) @@ -578,6 +733,7 @@ static void pthread_stop_init() { GC_printf0("Will retry suspend signal if necessary.\n"); } # endif +#endif /* NACL */ } /* We hold the allocation lock. */ diff --git a/libgc/pthread_support.c b/libgc/pthread_support.c index c307ac0eec5..3e588ace211 100644 --- a/libgc/pthread_support.c +++ b/libgc/pthread_support.c @@ -164,6 +164,9 @@ # endif # undef pthread_join # undef pthread_detach +# if defined(NACL) +# undef pthread_exit +# endif # if defined(GC_OSF1_THREADS) && defined(_PTHREAD_USE_MANGLED_NAMES_) \ && !defined(_PTHREAD_USE_PTDNAM_) /* Restore the original mangled names on Tru64 UNIX. */ @@ -676,6 +679,52 @@ void GC_mark_thread_local_free_lists(void) static struct GC_Thread_Rep first_thread; +#ifdef NACL +extern int nacl_thread_parked[MAX_NACL_GC_THREADS]; +extern int nacl_thread_used[MAX_NACL_GC_THREADS]; +extern int nacl_thread_parking_inited; +extern int nacl_num_gc_threads; +extern pthread_mutex_t nacl_thread_alloc_lock; +extern __thread int nacl_thread_idx; +extern __thread GC_thread nacl_gc_thread_self; + +void nacl_initialize_gc_thread() +{ + int i; + pthread_mutex_lock(&nacl_thread_alloc_lock); + if (!nacl_thread_parking_inited) + { + for (i = 0; i < MAX_NACL_GC_THREADS; i++) { + nacl_thread_used[i] = 0; + nacl_thread_parked[i] = 0; + } + nacl_thread_parking_inited = 1; + } + GC_ASSERT(nacl_num_gc_threads <= MAX_NACL_GC_THREADS); + for (i = 0; i < MAX_NACL_GC_THREADS; i++) { + if (nacl_thread_used[i] == 0) { + nacl_thread_used[i] = 1; + nacl_thread_idx = i; + nacl_num_gc_threads++; + break; + } + } + pthread_mutex_unlock(&nacl_thread_alloc_lock); +} + +void nacl_shutdown_gc_thread() +{ + pthread_mutex_lock(&nacl_thread_alloc_lock); + GC_ASSERT(nacl_thread_idx >= 0 && nacl_thread_idx < MAX_NACL_GC_THREADS); + GC_ASSERT(nacl_thread_used[nacl_thread_idx] != 0); + nacl_thread_used[nacl_thread_idx] = 0; + nacl_thread_idx = -1; + nacl_num_gc_threads--; + pthread_mutex_unlock(&nacl_thread_alloc_lock); +} + +#endif /* NACL */ + /* Add a thread to GC_threads. We assume it wasn't already there. */ /* Caller holds allocation lock. 
*/ GC_thread GC_new_thread(pthread_t id) @@ -698,6 +747,10 @@ GC_thread GC_new_thread(pthread_t id) #endif result -> next = GC_threads[hv]; GC_threads[hv] = result; +#ifdef NACL + nacl_gc_thread_self = result; + nacl_initialize_gc_thread(); +#endif GC_ASSERT(result -> flags == 0 && result -> thread_blocked == 0); return(result); } @@ -711,6 +764,11 @@ void GC_delete_thread(pthread_t id) register GC_thread p = GC_threads[hv]; register GC_thread prev = 0; +#ifdef NACL + nacl_shutdown_gc_thread(); + nacl_gc_thread_self = NULL; +#endif + while (!pthread_equal(p -> id, id)) { prev = p; p = p -> next; @@ -1118,6 +1176,7 @@ void GC_init_parallel() #if !defined(GC_DARWIN_THREADS) && !defined(GC_OPENBSD_THREADS) +#ifndef NACL int WRAP_FUNC(pthread_sigmask)(int how, const sigset_t *set, sigset_t *oset) { sigset_t fudged_set; @@ -1129,6 +1188,7 @@ int WRAP_FUNC(pthread_sigmask)(int how, const sigset_t *set, sigset_t *oset) } return(REAL_FUNC(pthread_sigmask)(how, set, oset)); } +#endif #endif /* !GC_DARWIN_THREADS */ /* Wrappers for functions that are likely to block for an appreciable */ @@ -1259,6 +1319,17 @@ int WRAP_FUNC(pthread_join)(pthread_t thread, void **retval) return result; } +#ifdef NACL +/* Native Client doesn't support pthread cleanup functions, */ +/* so wrap pthread_exit and manually cleanup the thread. */ +void +WRAP_FUNC(pthread_exit)(void *status) +{ + GC_thread_exit_proc(0); + REAL_FUNC(pthread_exit)(status); +} +#endif + int WRAP_FUNC(pthread_detach)(pthread_t thread) { diff --git a/mono/arch/amd64/amd64-codegen.h b/mono/arch/amd64/amd64-codegen.h index 7ca557d6d21..8684a5c8656 100644 --- a/mono/arch/amd64/amd64-codegen.h +++ b/mono/arch/amd64/amd64-codegen.h @@ -67,6 +67,32 @@ typedef enum AMD64_REX_W = 8 /* Opeartion is 64-bits instead of 32 (default) or 16 (with 0x66 prefix) */ } AMD64_REX_Bits; +#if defined(__default_codegen__) + +#define amd64_codegen_pre(inst) +#define amd64_codegen_post(inst) + +#elif defined(__native_client_codegen__) + +#define amd64_codegen_pre(inst) guint8* _codegen_start = (inst); amd64_nacl_instruction_pre(); +#define amd64_codegen_post(inst) (amd64_nacl_instruction_post(&_codegen_start, &(inst)), _codegen_start); + +/* Because of rex prefixes, etc, call sequences are not constant size. */ +/* These pre- and post-sequence hooks remedy this by aligning the call */ +/* sequence after we emit it, since we will know the exact size then. */ +#define amd64_call_sequence_pre(inst) guint8* _code_start = (inst); +#define amd64_call_sequence_post(inst) \ + (mono_nacl_align_call(&_code_start, &(inst)), _code_start); + +/* Native client can load/store using one of the following registers */ +/* as a base: rip, r15, rbp, rsp. Any other base register needs to have */ +/* its upper 32 bits cleared and reference memory using r15 as the base. */ +#define amd64_is_valid_nacl_base(reg) \ + ((reg) == AMD64_RIP || (reg) == AMD64_R15 || \ + (reg) == AMD64_RBP || (reg) == AMD64_RSP) + +#endif /*__native_client_codegen__*/ + #ifdef TARGET_WIN32 #define AMD64_ARG_REG1 AMD64_RCX #define AMD64_ARG_REG2 AMD64_RDX @@ -88,6 +114,16 @@ typedef enum #define AMD64_CALLEE_SAVED_REGS ((1< 7) ? AMD64_REX_B : 0); \ if ((_amd64_rex_bits != 0) || (((width) == 1))) *(inst)++ = AMD64_REX(_amd64_rex_bits); \ } while (0) +#elif defined(__native_client_codegen__) +#define amd64_emit_rex(inst, width, reg_modrm, reg_index, reg_rm_base_opcode) do \ + { \ + unsigned char _amd64_rex_bits = \ + (((width) > 4) ? AMD64_REX_W : 0) | \ + (((reg_modrm) > 7) ? AMD64_REX_R : 0) | \ + (((reg_index) > 7) ? 
AMD64_REX_X : 0) | \ + (((reg_rm_base_opcode) > 7) ? AMD64_REX_B : 0); \ + amd64_nacl_tag_rex((inst)); \ + if ((_amd64_rex_bits != 0) || (((width) == 1))) *(inst)++ = AMD64_REX(_amd64_rex_bits); \ + } while (0) +#endif typedef union { - gsize val; + guint64 val; unsigned char b [8]; } amd64_imm_buf; @@ -138,7 +187,7 @@ typedef union { #define x86_imm_emit64(inst,imm) \ do { \ amd64_imm_buf imb; \ - imb.val = (gsize) (imm); \ + imb.val = (guint64) (imm); \ *(inst)++ = imb.b [0]; \ *(inst)++ = imb.b [1]; \ *(inst)++ = imb.b [2]; \ @@ -158,7 +207,7 @@ typedef union { x86_membase_emit ((inst),(reg)&0x7, (basereg)&0x7, (disp)); \ } while (0) -#define amd64_alu_reg_imm_size(inst,opc,reg,imm,size) \ +#define amd64_alu_reg_imm_size_body(inst,opc,reg,imm,size) \ do { \ if (x86_is_imm8((imm))) { \ amd64_emit_rex(inst, size, 0, 0, (reg)); \ @@ -177,29 +226,67 @@ typedef union { } \ } while (0) -#define amd64_alu_reg_imm(inst,opc,reg,imm) amd64_alu_reg_imm_size((inst),(opc),(reg),(imm),8) - -#define amd64_alu_reg_reg_size(inst,opc,dreg,reg,size) \ +#define amd64_alu_reg_reg_size_body(inst,opc,dreg,reg,size) \ do { \ amd64_emit_rex(inst, size, (dreg), 0, (reg)); \ *(inst)++ = (((unsigned char)(opc)) << 3) + 3; \ x86_reg_emit ((inst), (dreg), (reg)); \ } while (0) -#define amd64_alu_reg_reg(inst,opc,dreg,reg) amd64_alu_reg_reg_size ((inst),(opc),(dreg),(reg),8) +#if defined(__default_codegen__) + +#define amd64_alu_reg_imm_size(inst,opc,reg,imm,size) \ + amd64_alu_reg_imm_size_body((inst), (opc), (reg), (imm), (size)) -#define amd64_alu_reg_membase_size(inst,opc,reg,basereg,disp,size) \ +#define amd64_alu_reg_reg_size(inst,opc,dreg,reg,size) \ + amd64_alu_reg_reg_size_body((inst), (opc), (dreg), (reg), (size)) + +#elif defined(__native_client_codegen__) +/* NaCl modules may not directly update RSP or RBP other than direct copies */ +/* between them. 
Instead the lower 4 bytes are updated and then added to R15 */ +#define amd64_is_nacl_stack_reg(reg) (((reg) == AMD64_RSP) || ((reg) == AMD64_RBP)) + +#define amd64_alu_reg_imm_size(inst,opc,reg,imm,size) \ + do{ \ + amd64_codegen_pre(inst); \ + if (amd64_is_nacl_stack_reg(reg)) { \ + if (((opc) != X86_ADD) && ((opc) != X86_SUB)) \ + g_assert_not_reached(); \ + amd64_alu_reg_imm_size_body((inst), (opc), (reg), (imm), 4); \ + /* Use LEA instead of ADD to preserve flags */ \ + amd64_lea_memindex_size((inst), (reg), (reg), 0, AMD64_R15, 0, 8); \ + } else { \ + amd64_alu_reg_imm_size_body((inst), (opc), (reg), (imm), (size)); \ + } \ + amd64_codegen_post(inst); \ + } while(0) + +#define amd64_alu_reg_reg_size(inst,opc,dreg,reg,size) \ do { \ - amd64_emit_rex ((inst),(size),(reg),0,(basereg)); \ - *(inst)++ = (((unsigned char)(opc)) << 3) + 3; \ - amd64_membase_emit (inst, reg, basereg, disp); \ -} while (0) + amd64_codegen_pre(inst); \ + if (amd64_is_nacl_stack_reg((dreg)) && ((reg) != AMD64_R15)) { \ + if (((opc) != X86_ADD && (opc) != X86_SUB)) \ + g_assert_not_reached(); \ + amd64_alu_reg_reg_size_body((inst), (opc), (dreg), (reg), 4); \ + /* Use LEA instead of ADD to preserve flags */ \ + amd64_lea_memindex_size((inst), (dreg), (dreg), 0, AMD64_R15, 0, 8); \ + } else { \ + amd64_alu_reg_reg_size_body((inst), (opc), (dreg), (reg), (size)); \ + } \ + amd64_codegen_post(inst); \ + } while (0) +#endif /*__native_client_codegen__*/ + +#define amd64_alu_reg_imm(inst,opc,reg,imm) amd64_alu_reg_imm_size((inst),(opc),(reg),(imm),8) + +#define amd64_alu_reg_reg(inst,opc,dreg,reg) amd64_alu_reg_reg_size ((inst),(opc),(dreg),(reg),8) #define amd64_mov_regp_reg(inst,regp,reg,size) \ do { \ + amd64_codegen_pre(inst); \ if ((size) == 2) \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ amd64_emit_rex(inst, (size), (reg), 0, (regp)); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x88; break; \ @@ -207,12 +294,14 @@ typedef union { default: assert (0); \ } \ x86_regp_emit ((inst), (reg), (regp)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_mov_membase_reg(inst,basereg,disp,reg,size) \ do { \ + amd64_codegen_pre(inst); \ if ((size) == 2) \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ amd64_emit_rex(inst, (size), (reg), 0, (basereg)); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x88; break; \ @@ -220,27 +309,31 @@ typedef union { default: assert (0); \ } \ x86_membase_emit ((inst), ((reg)&0x7), ((basereg)&0x7), (disp)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_mov_mem_reg(inst,mem,reg,size) \ do { \ + amd64_codegen_pre(inst); \ if ((size) == 2) \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ amd64_emit_rex(inst, (size), (reg), 0, 0); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x88; break; \ case 2: case 4: case 8: *(inst)++ = (unsigned char)0x89; break; \ default: assert (0); \ } \ - x86_address_byte ((inst), 0, (reg), 4); \ - x86_address_byte ((inst), 0, 4, 5); \ - x86_imm_emit32 ((inst), (mem)); \ + x86_address_byte ((inst), 0, (reg), 4); \ + x86_address_byte ((inst), 0, 4, 5); \ + x86_imm_emit32 ((inst), (mem)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_mov_reg_reg(inst,dreg,reg,size) \ do { \ + amd64_codegen_pre(inst); \ if ((size) == 2) \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ amd64_emit_rex(inst, (size), (dreg), 0, (reg)); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned 
char)0x8a; break; \ @@ -248,27 +341,43 @@ typedef union { default: assert (0); \ } \ x86_reg_emit ((inst), (dreg), (reg)); \ + amd64_codegen_post(inst); \ } while (0) -#define amd64_mov_reg_mem(inst,reg,mem,size) \ +#define amd64_mov_reg_mem_body(inst,reg,mem,size) \ do { \ + amd64_codegen_pre(inst); \ if ((size) == 2) \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ amd64_emit_rex(inst, (size), (reg), 0, 0); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x8a; break; \ case 2: case 4: case 8: *(inst)++ = (unsigned char)0x8b; break; \ default: assert (0); \ } \ - x86_address_byte ((inst), 0, (reg), 4); \ - x86_address_byte ((inst), 0, 4, 5); \ - x86_imm_emit32 ((inst), (mem)); \ + x86_address_byte ((inst), 0, (reg), 4); \ + x86_address_byte ((inst), 0, 4, 5); \ + x86_imm_emit32 ((inst), (mem)); \ + amd64_codegen_post(inst); \ } while (0) -#define amd64_mov_reg_membase(inst,reg,basereg,disp,size) \ +#if defined(__default_codegen__) +#define amd64_mov_reg_mem(inst,reg,mem,size) \ + do { \ + amd64_mov_reg_mem_body((inst),(reg),(mem),(size)); \ + } while (0) +#elif defined(__native_client_codegen__) +/* We have to re-base memory reads because memory isn't zero based. */ +#define amd64_mov_reg_mem(inst,reg,mem,size) \ + do { \ + amd64_mov_reg_membase((inst),(reg),AMD64_R15,(mem),(size)); \ + } while (0) +#endif /* __native_client_codegen__ */ + +#define amd64_mov_reg_membase_body(inst,reg,basereg,disp,size) \ do { \ if ((size) == 2) \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ amd64_emit_rex(inst, (size), (reg), 0, (basereg)); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x8a; break; \ @@ -278,8 +387,56 @@ typedef union { amd64_membase_emit ((inst), (reg), (basereg), (disp)); \ } while (0) +#define amd64_mov_reg_memindex_size_body(inst,reg,basereg,disp,indexreg,shift,size) \ + do { \ + amd64_emit_rex ((inst),(size),(reg),(indexreg),(basereg)); \ + x86_mov_reg_memindex((inst),((reg)&0x7),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),(size) == 8 ? 
4 : (size)); \ + } while (0) + +#if defined(__default_codegen__) + +#define amd64_mov_reg_memindex_size(inst,reg,basereg,disp,indexreg,shift,size) \ + amd64_mov_reg_memindex_size_body((inst),(reg),(basereg),(disp),(indexreg),(shift),(size)) +#define amd64_mov_reg_membase(inst,reg,basereg,disp,size) \ + do { \ + amd64_mov_reg_membase_body((inst), (reg), (basereg), (disp), (size)); \ + } while (0) + +#elif defined(__native_client_codegen__) + +#define amd64_mov_reg_memindex_size(inst,reg,basereg,disp,indexreg,shift,size) \ + do { \ + amd64_codegen_pre(inst); \ + if (amd64_is_nacl_stack_reg((reg))) { \ + /* Clear upper 32 bits with mov of size 4 */ \ + amd64_mov_reg_memindex_size_body((inst), (reg), (basereg), (disp), (indexreg), (shift), 4); \ + /* Add %r15 using LEA to preserve flags */ \ + amd64_lea_memindex_size((inst), (reg), (reg), 0, AMD64_R15, 0, 8); \ + } else { \ + amd64_mov_reg_memindex_size_body((inst), (reg), (basereg), (disp), (indexreg), (shift), (size)); \ + } \ + amd64_codegen_post(inst); \ + } while(0) + +#define amd64_mov_reg_membase(inst,reg,basereg,disp,size) \ + do { \ + amd64_codegen_pre(inst); \ + if (amd64_is_nacl_stack_reg((reg))) { \ + /* Clear upper 32 bits with mov of size 4 */ \ + amd64_mov_reg_membase_body((inst), (reg), (basereg), (disp), 4); \ + /* Add %r15 */ \ + amd64_lea_memindex_size((inst), (reg), (reg), 0, AMD64_R15, 0, 8); \ + } else { \ + amd64_mov_reg_membase_body((inst), (reg), (basereg), (disp), (size)); \ + } \ + amd64_codegen_post(inst); \ + } while (0) + +#endif /*__native_client_codegen__*/ + #define amd64_movzx_reg_membase(inst,reg,basereg,disp,size) \ do { \ + amd64_codegen_pre(inst); \ amd64_emit_rex(inst, (size), (reg), 0, (basereg)); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x0f; *(inst)++ = (unsigned char)0xb6; break; \ @@ -288,27 +445,34 @@ typedef union { default: assert (0); \ } \ x86_membase_emit ((inst), ((reg)&0x7), ((basereg)&0x7), (disp)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movsxd_reg_mem(inst,reg,mem) \ do { \ - amd64_emit_rex(inst,8,(reg),0,0); \ - *(inst)++ = (unsigned char)0x63; \ - x86_mem_emit ((inst), ((reg)&0x7), (mem)); \ + amd64_codegen_pre(inst); \ + amd64_emit_rex(inst,8,(reg),0,0); \ + *(inst)++ = (unsigned char)0x63; \ + x86_mem_emit ((inst), ((reg)&0x7), (mem)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movsxd_reg_membase(inst,reg,basereg,disp) \ do { \ - amd64_emit_rex(inst,8,(reg),0,(basereg)); \ - *(inst)++ = (unsigned char)0x63; \ - x86_membase_emit ((inst), ((reg)&0x7), ((basereg)&0x7), (disp)); \ + amd64_codegen_pre(inst); \ + amd64_emit_rex(inst,8,(reg),0,(basereg)); \ + *(inst)++ = (unsigned char)0x63; \ + x86_membase_emit ((inst), ((reg)&0x7), ((basereg)&0x7), (disp)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movsxd_reg_reg(inst,dreg,reg) \ do { \ - amd64_emit_rex(inst,8,(dreg),0,(reg)); \ - *(inst)++ = (unsigned char)0x63; \ - x86_reg_emit ((inst), (dreg), (reg)); \ + amd64_codegen_pre(inst); \ + amd64_emit_rex(inst,8,(dreg),0,(reg)); \ + *(inst)++ = (unsigned char)0x63; \ + x86_reg_emit ((inst), (dreg), (reg)); \ + amd64_codegen_post(inst); \ } while (0) /* Pretty much the only instruction that supports a 64-bit immediate. 
Optimize for common case of @@ -316,18 +480,22 @@ typedef union { */ #define amd64_mov_reg_imm_size(inst,reg,imm,size) \ do { \ + amd64_codegen_pre(inst); \ amd64_emit_rex(inst, (size), 0, 0, (reg)); \ *(inst)++ = (unsigned char)0xb8 + ((reg) & 0x7); \ if ((size) == 8) \ - x86_imm_emit64 ((inst), (gsize)(imm)); \ + x86_imm_emit64 ((inst), (guint64)(imm)); \ else \ - x86_imm_emit32 ((inst), (int)(gsize)(imm)); \ + x86_imm_emit32 ((inst), (int)(guint64)(imm)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_mov_reg_imm(inst,reg,imm) \ do { \ - int _amd64_width_temp = ((gsize)(imm) == (gsize)(int)(gsize)(imm)); \ - amd64_mov_reg_imm_size ((inst), (reg), (imm), (_amd64_width_temp ? 4 : 8)); \ + int _amd64_width_temp = ((guint64)(imm) == (guint64)(int)(guint64)(imm)); \ + amd64_codegen_pre(inst); \ + amd64_mov_reg_imm_size ((inst), (reg), (imm), (_amd64_width_temp ? 4 : 8)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_set_reg_template(inst,reg) amd64_mov_reg_imm_size ((inst),(reg), 0, 8) @@ -336,8 +504,9 @@ typedef union { #define amd64_mov_membase_imm(inst,basereg,disp,imm,size) \ do { \ + amd64_codegen_pre(inst); \ if ((size) == 2) \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ amd64_emit_rex(inst, (size) == 1 ? 0 : (size), 0, 0, (basereg)); \ if ((size) == 1) { \ *(inst)++ = (unsigned char)0xc6; \ @@ -352,36 +521,69 @@ typedef union { x86_membase_emit ((inst), 0, (basereg) & 0x7, (disp)); \ x86_imm_emit32 ((inst), (imm)); \ } \ + amd64_codegen_post(inst); \ } while (0) -#define amd64_lea_membase(inst,reg,basereg,disp) \ + +#define amd64_lea_membase_body(inst,reg,basereg,disp) \ do { \ amd64_emit_rex(inst, 8, (reg), 0, (basereg)); \ *(inst)++ = (unsigned char)0x8d; \ amd64_membase_emit ((inst), (reg), (basereg), (disp)); \ } while (0) +#if defined(__default_codegen__) +#define amd64_lea_membase(inst,reg,basereg,disp) \ + amd64_lea_membase_body((inst), (reg), (basereg), (disp)) +#elif defined(__native_client_codegen__) +/* NaCl modules may not write directly into RSP/RBP. Instead, use a */ +/* 32-bit LEA and add R15 to the effective address */ +#define amd64_lea_membase(inst,reg,basereg,disp) \ + do { \ + amd64_codegen_pre(inst); \ + if (amd64_is_nacl_stack_reg(reg)) { \ + /* 32-bit LEA */ \ + amd64_emit_rex((inst), 4, (reg), 0, (basereg)); \ + *(inst)++ = (unsigned char)0x8d; \ + amd64_membase_emit((inst), (reg), (basereg), (disp)); \ + /* Use a 64-bit LEA instead of an ADD to preserve flags */ \ + amd64_lea_memindex_size((inst), (reg), (reg), 0, AMD64_R15, 0, 8); \ + } else { \ + amd64_lea_membase_body((inst), (reg), (basereg), (disp)); \ + } \ + amd64_codegen_post(inst); \ + } while (0) +#endif /*__native_client_codegen__*/ + /* Instruction are implicitly 64-bits so don't generate REX for just the size. */ #define amd64_push_reg(inst,reg) \ do { \ + amd64_codegen_pre(inst); \ amd64_emit_rex(inst, 0, 0, 0, (reg)); \ *(inst)++ = (unsigned char)0x50 + ((reg) & 0x7); \ + amd64_codegen_post(inst); \ } while (0) /* Instruction is implicitly 64-bits so don't generate REX for just the size. 
*/ #define amd64_push_membase(inst,basereg,disp) \ do { \ + amd64_codegen_pre(inst); \ amd64_emit_rex(inst, 0, 0, 0, (basereg)); \ *(inst)++ = (unsigned char)0xff; \ x86_membase_emit ((inst), 6, (basereg) & 0x7, (disp)); \ + amd64_codegen_post(inst); \ } while (0) -#define amd64_pop_reg(inst,reg) \ +#define amd64_pop_reg_body(inst,reg) \ do { \ + amd64_codegen_pre(inst); \ amd64_emit_rex(inst, 0, 0, 0, (reg)); \ *(inst)++ = (unsigned char)0x58 + ((reg) & 0x7); \ + amd64_codegen_post(inst); \ } while (0) +#if defined(__default_codegen__) + #define amd64_call_reg(inst,reg) \ do { \ amd64_emit_rex(inst, 0, 0, 0, (reg)); \ @@ -389,94 +591,203 @@ typedef union { x86_reg_emit ((inst), 2, ((reg) & 0x7)); \ } while (0) + #define amd64_ret(inst) do { *(inst)++ = (unsigned char)0xc3; } while (0) #define amd64_leave(inst) do { *(inst)++ = (unsigned char)0xc9; } while (0) + +#define amd64_pop_reg(inst,reg) amd64_pop_reg_body((inst), (reg)) + +#elif defined(__native_client_codegen__) + +/* Size is ignored for Native Client jumps, we restrict jumping to 32-bits */ +#define amd64_jump_reg_size(inst,reg,size) \ + do { \ + amd64_codegen_pre((inst)); \ + amd64_alu_reg_imm_size((inst), X86_AND, (reg), (nacl_align_byte), 4); \ + amd64_alu_reg_reg_size((inst), X86_ADD, (reg), AMD64_R15, 8); \ + amd64_emit_rex ((inst),0,0,0,(reg)); \ + x86_jump_reg((inst),((reg)&0x7)); \ + amd64_codegen_post((inst)); \ + } while (0) + +/* Size is ignored for Native Client jumps, we restrict jumping to 32-bits */ +#define amd64_jump_mem_size(inst,mem,size) \ + do { \ + amd64_codegen_pre((inst)); \ + amd64_mov_reg_mem((inst), (mem), AMD64_R11, 4); \ + amd64_jump_reg_size((inst), AMD64_R11, 4); \ + amd64_codegen_post((inst)); \ + } while (0) + +#define amd64_call_reg_internal(inst,reg) \ + do { \ + amd64_codegen_pre((inst)); \ + amd64_alu_reg_imm_size((inst), X86_AND, (reg), (nacl_align_byte), 4); \ + amd64_alu_reg_reg_size((inst), X86_ADD, (reg), AMD64_R15, 8); \ + amd64_emit_rex((inst), 0, 0, 0, (reg)); \ + x86_call_reg((inst), ((reg) & 0x7)); \ + amd64_codegen_post((inst)); \ + } while (0) + +#define amd64_call_reg(inst,reg) \ + do { \ + amd64_codegen_pre((inst)); \ + amd64_call_sequence_pre(inst); \ + amd64_call_reg_internal((inst), (reg)); \ + amd64_call_sequence_post(inst); \ + amd64_codegen_post((inst)); \ + } while (0) + + +#define amd64_ret(inst) \ + do { \ + amd64_codegen_pre(inst); \ + amd64_pop_reg_body((inst), AMD64_R11); \ + amd64_jump_reg_size((inst), AMD64_R11, 8); \ + amd64_codegen_post(inst); \ + } while (0) + +#define amd64_leave(inst) \ + do { \ + amd64_codegen_pre(inst); \ + amd64_mov_reg_reg((inst), AMD64_RSP, AMD64_RBP, 8); \ + amd64_pop_reg_body((inst), AMD64_R11); \ + amd64_mov_reg_reg_size((inst), AMD64_RBP, AMD64_R11, 4); \ + amd64_alu_reg_reg_size((inst), X86_ADD, AMD64_RBP, AMD64_R15, 8); \ + amd64_codegen_post(inst); \ + } while (0) + +#define amd64_pop_reg(inst,reg) \ + do { \ + amd64_codegen_pre(inst); \ + if (amd64_is_nacl_stack_reg((reg))) { \ + amd64_pop_reg_body((inst), AMD64_R11); \ + amd64_mov_reg_reg_size((inst), (reg), AMD64_R11, 4); \ + amd64_alu_reg_reg_size((inst), X86_ADD, (reg), AMD64_R15, 8); \ + } else { \ + amd64_pop_reg_body((inst), (reg)); \ + } \ + amd64_codegen_post(inst); \ + } while (0) + +#endif /*__native_client_codegen__*/ + #define amd64_movsd_reg_regp(inst,reg,regp) \ do { \ - *(inst)++ = (unsigned char)0xf2; \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), 0xf2); \ amd64_emit_rex(inst, 0, (reg), 0, (regp)); \ *(inst)++ = (unsigned char)0x0f; \ *(inst)++ = (unsigned 
char)0x10; \ x86_regp_emit ((inst), (reg) & 0x7, (regp) & 0x7); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movsd_regp_reg(inst,regp,reg) \ do { \ - *(inst)++ = (unsigned char)0xf2; \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), 0xf2); \ amd64_emit_rex(inst, 0, (reg), 0, (regp)); \ *(inst)++ = (unsigned char)0x0f; \ *(inst)++ = (unsigned char)0x11; \ x86_regp_emit ((inst), (reg) & 0x7, (regp) & 0x7); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movss_reg_regp(inst,reg,regp) \ do { \ - *(inst)++ = (unsigned char)0xf3; \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), 0xf3); \ amd64_emit_rex(inst, 0, (reg), 0, (regp)); \ *(inst)++ = (unsigned char)0x0f; \ *(inst)++ = (unsigned char)0x10; \ x86_regp_emit ((inst), (reg) & 0x7, (regp) & 0x7); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movss_regp_reg(inst,regp,reg) \ do { \ - *(inst)++ = (unsigned char)0xf3; \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), 0xf3); \ amd64_emit_rex(inst, 0, (reg), 0, (regp)); \ *(inst)++ = (unsigned char)0x0f; \ *(inst)++ = (unsigned char)0x11; \ x86_regp_emit ((inst), (reg) & 0x7, (regp) & 0x7); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movsd_reg_membase(inst,reg,basereg,disp) \ do { \ - *(inst)++ = (unsigned char)0xf2; \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), 0xf2); \ amd64_emit_rex(inst, 0, (reg), 0, (basereg)); \ *(inst)++ = (unsigned char)0x0f; \ *(inst)++ = (unsigned char)0x10; \ x86_membase_emit ((inst), (reg) & 0x7, (basereg) & 0x7, (disp)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movss_reg_membase(inst,reg,basereg,disp) \ do { \ - *(inst)++ = (unsigned char)0xf3; \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), 0xf3); \ amd64_emit_rex(inst, 0, (reg), 0, (basereg)); \ *(inst)++ = (unsigned char)0x0f; \ *(inst)++ = (unsigned char)0x10; \ x86_membase_emit ((inst), (reg) & 0x7, (basereg) & 0x7, (disp)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movsd_membase_reg(inst,basereg,disp,reg) \ do { \ - *(inst)++ = (unsigned char)0xf2; \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), 0xf2); \ amd64_emit_rex(inst, 0, (reg), 0, (basereg)); \ *(inst)++ = (unsigned char)0x0f; \ *(inst)++ = (unsigned char)0x11; \ x86_membase_emit ((inst), (reg) & 0x7, (basereg) & 0x7, (disp)); \ + amd64_codegen_post(inst); \ } while (0) #define amd64_movss_membase_reg(inst,basereg,disp,reg) \ do { \ - *(inst)++ = (unsigned char)0xf3; \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), 0xf3); \ amd64_emit_rex(inst, 0, (reg), 0, (basereg)); \ *(inst)++ = (unsigned char)0x0f; \ *(inst)++ = (unsigned char)0x11; \ x86_membase_emit ((inst), (reg) & 0x7, (basereg) & 0x7, (disp)); \ + amd64_codegen_post(inst); \ } while (0) /* The original inc_reg opcode is used as the REX prefix */ #define amd64_inc_reg_size(inst,reg,size) \ - do { \ - amd64_emit_rex ((inst),(size),0,0,(reg)); \ - *(inst)++ = (unsigned char)0xff; \ - x86_reg_emit ((inst),0,(reg) & 0x7); \ - } while (0) + do { \ + amd64_codegen_pre(inst); \ + amd64_emit_rex ((inst),(size),0,0,(reg)); \ + *(inst)++ = (unsigned char)0xff; \ + x86_reg_emit ((inst),0,(reg) & 0x7); \ + amd64_codegen_post(inst); \ + } while (0) #define amd64_dec_reg_size(inst,reg,size) \ - do { \ - amd64_emit_rex ((inst),(size),0,0,(reg)); \ - *(inst)++ = (unsigned char)0xff; \ - x86_reg_emit ((inst),1,(reg) & 0x7); \ - } while (0) + do { \ + amd64_codegen_pre(inst); \ + amd64_emit_rex ((inst),(size),0,0,(reg)); \ + *(inst)++ = (unsigned char)0xff; \ + x86_reg_emit ((inst),1,(reg) & 0x7); \ + 
amd64_codegen_post(inst); \ + } while (0) + +#define amd64_fld_membase_size(inst,basereg,disp,is_double,size) do { \ + amd64_codegen_pre(inst); \ + amd64_emit_rex ((inst),0,0,0,(basereg)); \ + *(inst)++ = (is_double) ? (unsigned char)0xdd : (unsigned char)0xd9; \ + amd64_membase_emit ((inst), 0, (basereg), (disp)); \ + amd64_codegen_post(inst); \ +} while (0) + +#if defined (__default_codegen__) /* From the AMD64 Software Optimization Manual */ #define amd64_padding_size(inst,size) \ @@ -489,12 +800,6 @@ typedef union { }; \ } while (0) -#define amd64_fld_membase_size(inst,basereg,disp,is_double,size) do { \ - amd64_emit_rex ((inst),0,0,0,(basereg)); \ - *(inst)++ = (is_double) ? (unsigned char)0xdd : (unsigned char)0xd9; \ - amd64_membase_emit ((inst), 0, (basereg), (disp)); \ -} while (0) - #define amd64_call_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); *(inst)++ = (unsigned char)0xff; amd64_membase_emit ((inst),2, (basereg),(disp)); } while (0) #define amd64_jump_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); *(inst)++ = (unsigned char)0xff; amd64_membase_emit ((inst), 4, (basereg), (disp)); } while (0) @@ -508,6 +813,98 @@ typedef union { } \ } while (0) +#elif defined(__native_client_codegen__) + +/* The 3-7 byte NOP sequences in amd64_padding_size below are all illegal in */ +/* 64-bit Native Client because they load into rSP/rBP or use duplicate */ +/* prefixes. Instead we use the NOPs recommended in Section 3.5.1.8 of the */ +/* Intel64 and IA-32 Architectures Optimization Reference Manual and */ +/* Section 4.13 of AMD Software Optimization Guide for Family 10h Processors. */ + +#define amd64_padding_size(inst,size) \ + do { \ + unsigned char *code_start = (inst); \ + switch ((size)) { \ + /* xchg %eax,%eax, recognized by hardware as a NOP */ \ + case 1: *(inst)++ = 0x90; break; \ + /* xchg %ax,%ax */ \ + case 2: *(inst)++ = 0x66; *(inst)++ = 0x90; \ + break; \ + /* nop (%rax) */ \ + case 3: *(inst)++ = 0x0f; *(inst)++ = 0x1f; \ + *(inst)++ = 0x00; \ + break; \ + /* nop 0x0(%rax) */ \ + case 4: *(inst)++ = 0x0f; *(inst)++ = 0x1f; \ + x86_address_byte ((inst), 1, 0, AMD64_RAX); \ + x86_imm_emit8 ((inst), 0); \ + break; \ + /* nop 0x0(%rax,%rax) */ \ + case 5: *(inst)++ = 0x0f; *(inst)++ = 0x1f; \ + x86_address_byte ((inst), 1, 0, 4); \ + x86_address_byte ((inst), 0, AMD64_RAX, AMD64_RAX); \ + x86_imm_emit8 ((inst), 0); \ + break; \ + /* nopw 0x0(%rax,%rax) */ \ + case 6: *(inst)++ = 0x66; *(inst)++ = 0x0f; \ + *(inst)++ = 0x1f; \ + x86_address_byte ((inst), 1, 0, 4); \ + x86_address_byte ((inst), 0, AMD64_RAX, AMD64_RAX); \ + x86_imm_emit8 ((inst), 0); \ + break; \ + /* nop 0x0(%rax) (32-bit displacement) */ \ + case 7: *(inst)++ = 0x0f; *(inst)++ = 0x1f; \ + x86_address_byte ((inst), 2, 0, AMD64_RAX); \ + x86_imm_emit32((inst), 0); \ + break; \ + /* nop 0x0(%rax,%rax) (32-bit displacement) */ \ + case 8: *(inst)++ = 0x0f; *(inst)++ = 0x1f; \ + x86_address_byte ((inst), 2, 0, 4); \ + x86_address_byte ((inst), 0, AMD64_RAX, AMD64_RAX); \ + x86_imm_emit32 ((inst), 0); \ + break; \ + default: \ + g_assert_not_reached(); \ + } \ + g_assert(code_start + (size) == (unsigned char *)(inst)); \ + } while (0) + + +/* Size is ignored for Native Client calls, we restrict jumping to 32-bits */ +#define amd64_call_membase_size(inst,basereg,disp,size) \ + do { \ + amd64_codegen_pre((inst)); \ + amd64_call_sequence_pre(inst); \ + amd64_mov_reg_membase((inst), AMD64_R11, (basereg), (disp), 4); \ + 
amd64_call_reg_internal((inst), AMD64_R11); \ + amd64_call_sequence_post(inst); \ + amd64_codegen_post((inst)); \ + } while (0) + +/* Size is ignored for Native Client jumps, we restrict jumping to 32-bits */ +#define amd64_jump_membase_size(inst,basereg,disp,size) \ + do { \ + amd64_mov_reg_membase((inst), AMD64_R11, (basereg), (disp), 4); \ + amd64_jump_reg_size((inst), AMD64_R11, 4); \ + } while (0) + +/* On Native Client we can't jump more than INT_MAX in either direction */ +#define amd64_jump_code_size(inst,target,size) \ + do { \ + /* x86_jump_code used twice in case of */ \ + /* relocation by amd64_codegen_post */ \ + guint8* jump_start; \ + amd64_codegen_pre(inst); \ + assert(amd64_is_imm32 ((gint64)(target) - (gint64)(inst))); \ + x86_jump_code((inst),(target)); \ + inst = amd64_codegen_post(inst); \ + jump_start = (inst); \ + x86_jump_code((inst),(target)); \ + mono_amd64_patch(jump_start, (target)); \ +} while (0) + +#endif /*__native_client_codegen__*/ + /* * SSE */ @@ -517,31 +914,39 @@ typedef union { /* Two opcode SSE defines */ #define emit_sse_reg_reg_op2_size(inst,dreg,reg,op1,op2,size) do { \ + amd64_codegen_pre(inst); \ amd64_emit_rex ((inst), size, (dreg), 0, (reg)); \ *(inst)++ = (unsigned char)(op1); \ *(inst)++ = (unsigned char)(op2); \ x86_reg_emit ((inst), (dreg), (reg)); \ + amd64_codegen_post(inst); \ } while (0) #define emit_sse_reg_reg_op2(inst,dreg,reg,op1,op2) emit_sse_reg_reg_op2_size ((inst), (dreg), (reg), (op1), (op2), 0) #define emit_sse_reg_reg_op2_imm(inst,dreg,reg,op1,op2,imm) do { \ + amd64_codegen_pre(inst); \ emit_sse_reg_reg_op2 ((inst), (dreg), (reg), (op1), (op2)); \ x86_imm_emit8 ((inst), (imm)); \ + amd64_codegen_post(inst); \ } while (0) #define emit_sse_membase_reg_op2(inst,basereg,disp,reg,op1,op2) do { \ + amd64_codegen_pre(inst); \ amd64_emit_rex ((inst), 0, (reg), 0, (basereg)); \ *(inst)++ = (unsigned char)(op1); \ *(inst)++ = (unsigned char)(op2); \ amd64_membase_emit ((inst), (reg), (basereg), (disp)); \ + amd64_codegen_post(inst); \ } while (0) #define emit_sse_reg_membase_op2(inst,dreg,basereg,disp,op1,op2) do { \ + amd64_codegen_pre(inst); \ amd64_emit_rex ((inst), 0, (dreg), 0, (basereg) == AMD64_RIP ? 
0 : (basereg)); \ *(inst)++ = (unsigned char)(op1); \ *(inst)++ = (unsigned char)(op2); \ amd64_membase_emit ((inst), (dreg), (basereg), (disp)); \ + amd64_codegen_post(inst); \ } while (0) /* Three opcode SSE defines */ @@ -553,45 +958,55 @@ typedef union { } while (0) #define emit_sse_reg_reg_size(inst,dreg,reg,op1,op2,op3,size) do { \ + amd64_codegen_pre(inst); \ *(inst)++ = (unsigned char)(op1); \ amd64_emit_rex ((inst), size, (dreg), 0, (reg)); \ *(inst)++ = (unsigned char)(op2); \ *(inst)++ = (unsigned char)(op3); \ x86_reg_emit ((inst), (dreg), (reg)); \ + amd64_codegen_post(inst); \ } while (0) #define emit_sse_reg_reg(inst,dreg,reg,op1,op2,op3) emit_sse_reg_reg_size ((inst), (dreg), (reg), (op1), (op2), (op3), 0) #define emit_sse_reg_reg_imm(inst,dreg,reg,op1,op2,op3,imm) do { \ + amd64_codegen_pre(inst); \ emit_sse_reg_reg ((inst), (dreg), (reg), (op1), (op2), (op3)); \ x86_imm_emit8 ((inst), (imm)); \ + amd64_codegen_post(inst); \ } while (0) #define emit_sse_membase_reg(inst,basereg,disp,reg,op1,op2,op3) do { \ - *(inst)++ = (unsigned char)(op1); \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), (unsigned char)(op1)); \ amd64_emit_rex ((inst), 0, (reg), 0, (basereg)); \ *(inst)++ = (unsigned char)(op2); \ *(inst)++ = (unsigned char)(op3); \ amd64_membase_emit ((inst), (reg), (basereg), (disp)); \ + amd64_codegen_post(inst); \ } while (0) #define emit_sse_reg_membase(inst,dreg,basereg,disp,op1,op2,op3) do { \ - *(inst)++ = (unsigned char)(op1); \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), (unsigned char)(op1)); \ amd64_emit_rex ((inst), 0, (dreg), 0, (basereg) == AMD64_RIP ? 0 : (basereg)); \ *(inst)++ = (unsigned char)(op2); \ *(inst)++ = (unsigned char)(op3); \ amd64_membase_emit ((inst), (dreg), (basereg), (disp)); \ + amd64_codegen_post(inst); \ } while (0) /* Four opcode SSE defines */ #define emit_sse_reg_reg_op4_size(inst,dreg,reg,op1,op2,op3,op4,size) do { \ - *(inst)++ = (unsigned char)(op1); \ - amd64_emit_rex ((inst), size, (dreg), 0, (reg)); \ + amd64_codegen_pre(inst); \ + x86_prefix((inst), (unsigned char)(op1)); \ + amd64_emit_rex ((inst), size, (dreg), 0, (reg)); \ *(inst)++ = (unsigned char)(op2); \ *(inst)++ = (unsigned char)(op3); \ *(inst)++ = (unsigned char)(op4); \ x86_reg_emit ((inst), (dreg), (reg)); \ + amd64_codegen_post(inst); \ } while (0) #define emit_sse_reg_reg_op4(inst,dreg,reg,op1,op2,op3,op4) emit_sse_reg_reg_op4_size ((inst), (dreg), (reg), (op1), (op2), (op3), (op4), 0) @@ -954,189 +1369,244 @@ typedef union { /* Generated from x86-codegen.h */ #define amd64_breakpoint_size(inst,size) do { x86_breakpoint(inst); } while (0) -#define amd64_cld_size(inst,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_cld(inst); } while (0) -#define amd64_stosb_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_stosb(inst); } while (0) -#define amd64_stosl_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_stosl(inst); } while (0) -#define amd64_stosd_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_stosd(inst); } while (0) -#define amd64_movsb_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_movsb(inst); } while (0) -#define amd64_movsl_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_movsl(inst); } while (0) -#define amd64_movsd_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_movsd(inst); } while (0) -#define amd64_prefix_size(inst,p,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_prefix((inst), p); } while (0) -#define amd64_rdtsc_size(inst,size) do { amd64_emit_rex 
((inst),(size),0,0,0); x86_rdtsc(inst); } while (0) -#define amd64_cmpxchg_reg_reg_size(inst,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_cmpxchg_reg_reg((inst),((dreg)&0x7),((reg)&0x7)); } while (0) -#define amd64_cmpxchg_mem_reg_size(inst,mem,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_cmpxchg_mem_reg((inst),(mem),((reg)&0x7)); } while (0) -#define amd64_cmpxchg_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_cmpxchg_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7)); } while (0) -#define amd64_xchg_reg_reg_size(inst,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_xchg_reg_reg((inst),((dreg)&0x7),((reg)&0x7),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_xchg_mem_reg_size(inst,mem,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_xchg_mem_reg((inst),(mem),((reg)&0x7),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_xchg_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_xchg_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_inc_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_inc_mem((inst),(mem)); } while (0) -#define amd64_inc_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_inc_membase((inst),((basereg)&0x7),(disp)); } while (0) -//#define amd64_inc_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_inc_reg((inst),((reg)&0x7)); } while (0) -#define amd64_dec_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_dec_mem((inst),(mem)); } while (0) -#define amd64_dec_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_dec_membase((inst),((basereg)&0x7),(disp)); } while (0) -//#define amd64_dec_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_dec_reg((inst),((reg)&0x7)); } while (0) -#define amd64_not_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_not_mem((inst),(mem)); } while (0) -#define amd64_not_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_not_membase((inst),((basereg)&0x7),(disp)); } while (0) -#define amd64_not_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_not_reg((inst),((reg)&0x7)); } while (0) -#define amd64_neg_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_neg_mem((inst),(mem)); } while (0) -#define amd64_neg_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_neg_membase((inst),((basereg)&0x7),(disp)); } while (0) -#define amd64_neg_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_neg_reg((inst),((reg)&0x7)); } while (0) -#define amd64_nop_size(inst,size) do { x86_nop(inst); } while (0) -//#define amd64_alu_reg_imm_size(inst,opc,reg,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_alu_reg_imm((inst),(opc),((reg)&0x7),(imm)); } while (0) -#define amd64_alu_mem_imm_size(inst,opc,mem,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_alu_mem_imm((inst),(opc),(mem),(imm)); } while (0) -#define amd64_alu_membase_imm_size(inst,opc,basereg,disp,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_alu_membase_imm((inst),(opc),((basereg)&0x7),(disp),(imm)); } while (0) -#define amd64_alu_membase8_imm_size(inst,opc,basereg,disp,imm,size) do { 
amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_alu_membase8_imm((inst),(opc),((basereg)&0x7),(disp),(imm)); } while (0) -#define amd64_alu_mem_reg_size(inst,opc,mem,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_alu_mem_reg((inst),(opc),(mem),((reg)&0x7)); } while (0) -#define amd64_alu_membase_reg_size(inst,opc,basereg,disp,reg,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_alu_membase_reg((inst),(opc),((basereg)&0x7),(disp),((reg)&0x7)); } while (0) -//#define amd64_alu_reg_reg_size(inst,opc,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_alu_reg_reg((inst),(opc),((dreg)&0x7),((reg)&0x7)); } while (0) -#define amd64_alu_reg8_reg8_size(inst,opc,dreg,reg,is_dreg_h,is_reg_h,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_alu_reg8_reg8((inst),(opc),((dreg)&0x7),((reg)&0x7),(is_dreg_h),(is_reg_h)); } while (0) -#define amd64_alu_reg_mem_size(inst,opc,reg,mem,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_alu_reg_mem((inst),(opc),((reg)&0x7),(mem)); } while (0) -//#define amd64_alu_reg_membase_size(inst,opc,reg,basereg,disp,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_alu_reg_membase((inst),(opc),((reg)&0x7),((basereg)&0x7),(disp)); } while (0) -#define amd64_test_reg_imm_size(inst,reg,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_test_reg_imm((inst),((reg)&0x7),(imm)); } while (0) -#define amd64_test_mem_imm_size(inst,mem,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_test_mem_imm((inst),(mem),(imm)); } while (0) -#define amd64_test_membase_imm_size(inst,basereg,disp,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_test_membase_imm((inst),((basereg)&0x7),(disp),(imm)); } while (0) -#define amd64_test_reg_reg_size(inst,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_test_reg_reg((inst),((dreg)&0x7),((reg)&0x7)); } while (0) -#define amd64_test_mem_reg_size(inst,mem,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_test_mem_reg((inst),(mem),((reg)&0x7)); } while (0) -#define amd64_test_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_test_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7)); } while (0) -#define amd64_shift_reg_imm_size(inst,opc,reg,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_shift_reg_imm((inst),(opc),((reg)&0x7),(imm)); } while (0) -#define amd64_shift_mem_imm_size(inst,opc,mem,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_shift_mem_imm((inst),(opc),(mem),(imm)); } while (0) -#define amd64_shift_membase_imm_size(inst,opc,basereg,disp,imm,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_shift_membase_imm((inst),(opc),((basereg)&0x7),(disp),(imm)); } while (0) -#define amd64_shift_reg_size(inst,opc,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_shift_reg((inst),(opc),((reg)&0x7)); } while (0) -#define amd64_shift_mem_size(inst,opc,mem,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_shift_mem((inst),(opc),(mem)); } while (0) -#define amd64_shift_membase_size(inst,opc,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_shift_membase((inst),(opc),((basereg)&0x7),(disp)); } while (0) -#define amd64_shrd_reg_size(inst,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_shrd_reg((inst),((dreg)&0x7),((reg)&0x7)); } while (0) -#define amd64_shrd_reg_imm_size(inst,dreg,reg,shamt,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); 
x86_shrd_reg_imm((inst),((dreg)&0x7),((reg)&0x7),(shamt)); } while (0) -#define amd64_shld_reg_size(inst,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_shld_reg((inst),((dreg)&0x7),((reg)&0x7)); } while (0) -#define amd64_shld_reg_imm_size(inst,dreg,reg,shamt,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_shld_reg_imm((inst),((dreg)&0x7),((reg)&0x7),(shamt)); } while (0) -#define amd64_mul_reg_size(inst,reg,is_signed,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_mul_reg((inst),((reg)&0x7),(is_signed)); } while (0) -#define amd64_mul_mem_size(inst,mem,is_signed,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_mul_mem((inst),(mem),(is_signed)); } while (0) -#define amd64_mul_membase_size(inst,basereg,disp,is_signed,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_mul_membase((inst),((basereg)&0x7),(disp),(is_signed)); } while (0) -#define amd64_imul_reg_reg_size(inst,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_imul_reg_reg((inst),((dreg)&0x7),((reg)&0x7)); } while (0) -#define amd64_imul_reg_mem_size(inst,reg,mem,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_imul_reg_mem((inst),((reg)&0x7),(mem)); } while (0) -#define amd64_imul_reg_membase_size(inst,reg,basereg,disp,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_imul_reg_membase((inst),((reg)&0x7),((basereg)&0x7),(disp)); } while (0) -#define amd64_imul_reg_reg_imm_size(inst,dreg,reg,imm,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_imul_reg_reg_imm((inst),((dreg)&0x7),((reg)&0x7),(imm)); } while (0) -#define amd64_imul_reg_mem_imm_size(inst,reg,mem,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_imul_reg_mem_imm((inst),((reg)&0x7),(mem),(imm)); } while (0) -#define amd64_imul_reg_membase_imm_size(inst,reg,basereg,disp,imm,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_imul_reg_membase_imm((inst),((reg)&0x7),((basereg)&0x7),(disp),(imm)); } while (0) -#define amd64_div_reg_size(inst,reg,is_signed,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_div_reg((inst),((reg)&0x7),(is_signed)); } while (0) -#define amd64_div_mem_size(inst,mem,is_signed,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_div_mem((inst),(mem),(is_signed)); } while (0) -#define amd64_div_membase_size(inst,basereg,disp,is_signed,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_div_membase((inst),((basereg)&0x7),(disp),(is_signed)); } while (0) -#define amd64_mov_mem_reg_size(inst,mem,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_mov_mem_reg((inst),(mem),((reg)&0x7),(size) == 8 ? 4 : (size)); } while (0) -//#define amd64_mov_regp_reg_size(inst,regp,reg,size) do { amd64_emit_rex ((inst),(size),(regp),0,(reg)); x86_mov_regp_reg((inst),(regp),((reg)&0x7),(size) == 8 ? 4 : (size)); } while (0) -//#define amd64_mov_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_mov_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_mov_memindex_reg_size(inst,basereg,disp,indexreg,shift,reg,size) do { amd64_emit_rex ((inst),(size),(reg),(indexreg),(basereg)); x86_mov_memindex_reg((inst),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),((reg)&0x7),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_mov_reg_reg_size(inst,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_mov_reg_reg((inst),((dreg)&0x7),((reg)&0x7),(size) == 8 ? 
4 : (size)); } while (0) -//#define amd64_mov_reg_mem_size(inst,reg,mem,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_mov_reg_mem((inst),((reg)&0x7),(mem),(size) == 8 ? 4 : (size)); } while (0) -//#define amd64_mov_reg_membase_size(inst,reg,basereg,disp,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_mov_reg_membase((inst),((reg)&0x7),((basereg)&0x7),(disp),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_mov_reg_memindex_size(inst,reg,basereg,disp,indexreg,shift,size) do { amd64_emit_rex ((inst),(size),(reg),(indexreg),(basereg)); x86_mov_reg_memindex((inst),((reg)&0x7),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_clear_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_clear_reg((inst),((reg)&0x7)); } while (0) -//#define amd64_mov_reg_imm_size(inst,reg,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_mov_reg_imm((inst),((reg)&0x7),(imm)); } while (0) -#define amd64_mov_mem_imm_size(inst,mem,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_mov_mem_imm((inst),(mem),(imm),(size) == 8 ? 4 : (size)); } while (0) -//#define amd64_mov_membase_imm_size(inst,basereg,disp,imm,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_mov_membase_imm((inst),((basereg)&0x7),(disp),(imm),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_mov_memindex_imm_size(inst,basereg,disp,indexreg,shift,imm,size) do { amd64_emit_rex ((inst),(size),0,(indexreg),(basereg)); x86_mov_memindex_imm((inst),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),(imm),(size) == 8 ? 4 : (size)); } while (0) -#define amd64_lea_mem_size(inst,reg,mem,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_lea_mem((inst),((reg)&0x7),(mem)); } while (0) -//#define amd64_lea_membase_size(inst,reg,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_lea_membase((inst),((reg)&0x7),((basereg)&0x7),(disp)); } while (0) -#define amd64_lea_memindex_size(inst,reg,basereg,disp,indexreg,shift,size) do { amd64_emit_rex ((inst),(size),(reg),(indexreg),(basereg)); x86_lea_memindex((inst),((reg)&0x7),((basereg)&0x7),(disp),((indexreg)&0x7),(shift)); } while (0) -#define amd64_widen_reg_size(inst,dreg,reg,is_signed,is_half,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_widen_reg((inst),((dreg)&0x7),((reg)&0x7),(is_signed),(is_half)); } while (0) -#define amd64_widen_mem_size(inst,dreg,mem,is_signed,is_half,size) do { amd64_emit_rex ((inst),(size),(dreg),0,0); x86_widen_mem((inst),((dreg)&0x7),(mem),(is_signed),(is_half)); } while (0) -#define amd64_widen_membase_size(inst,dreg,basereg,disp,is_signed,is_half,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(basereg)); x86_widen_membase((inst),((dreg)&0x7),((basereg)&0x7),(disp),(is_signed),(is_half)); } while (0) -#define amd64_widen_memindex_size(inst,dreg,basereg,disp,indexreg,shift,is_signed,is_half,size) do { amd64_emit_rex ((inst),(size),(dreg),(indexreg),(basereg)); x86_widen_memindex((inst),((dreg)&0x7),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),(is_signed),(is_half)); } while (0) -#define amd64_cdq_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_cdq(inst); } while (0) -#define amd64_wait_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_wait(inst); } while (0) -#define amd64_fp_op_mem_size(inst,opc,mem,is_double,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fp_op_mem((inst),(opc),(mem),(is_double)); } while (0) -#define amd64_fp_op_membase_size(inst,opc,basereg,disp,is_double,size) 
do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fp_op_membase((inst),(opc),((basereg)&0x7),(disp),(is_double)); } while (0) -#define amd64_fp_op_size(inst,opc,index,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fp_op((inst),(opc),(index)); } while (0) -#define amd64_fp_op_reg_size(inst,opc,index,pop_stack,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fp_op_reg((inst),(opc),(index),(pop_stack)); } while (0) -#define amd64_fp_int_op_membase_size(inst,opc,basereg,disp,is_int,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fp_int_op_membase((inst),(opc),((basereg)&0x7),(disp),(is_int)); } while (0) -#define amd64_fstp_size(inst,index,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fstp((inst),(index)); } while (0) -#define amd64_fcompp_size(inst,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fcompp(inst); } while (0) -#define amd64_fucompp_size(inst,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fucompp(inst); } while (0) -#define amd64_fnstsw_size(inst,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fnstsw(inst); } while (0) -#define amd64_fnstcw_size(inst,mem,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fnstcw((inst),(mem)); } while (0) -#define amd64_fnstcw_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_fnstcw_membase((inst),((basereg)&0x7),(disp)); } while (0) -#define amd64_fldcw_size(inst,mem,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fldcw((inst),(mem)); } while (0) -#define amd64_fldcw_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fldcw_membase((inst),((basereg)&0x7),(disp)); } while (0) -#define amd64_fchs_size(inst,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fchs(inst); } while (0) -#define amd64_frem_size(inst,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_frem(inst); } while (0) -#define amd64_fxch_size(inst,index,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fxch((inst),(index)); } while (0) -#define amd64_fcomi_size(inst,index,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fcomi((inst),(index)); } while (0) -#define amd64_fcomip_size(inst,index,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fcomip((inst),(index)); } while (0) -#define amd64_fucomi_size(inst,index,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fucomi((inst),(index)); } while (0) -#define amd64_fucomip_size(inst,index,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fucomip((inst),(index)); } while (0) -#define amd64_fld_size(inst,mem,is_double,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fld((inst),(mem),(is_double)); } while (0) -//#define amd64_fld_membase_size(inst,basereg,disp,is_double,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fld_membase((inst),((basereg)&0x7),(disp),(is_double)); } while (0) -#define amd64_fld80_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fld80_mem((inst),(mem)); } while (0) -#define amd64_fld80_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_fld80_membase((inst),((basereg)&0x7),(disp)); } while (0) -#define amd64_fild_size(inst,mem,is_long,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fild((inst),(mem),(is_long)); } while (0) -#define amd64_fild_membase_size(inst,basereg,disp,is_long,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fild_membase((inst),((basereg)&0x7),(disp),(is_long)); } while (0) -#define amd64_fld_reg_size(inst,index,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fld_reg((inst),(index)); } while (0) -#define amd64_fldz_size(inst,size) do { amd64_emit_rex 
((inst),0,0,0,0); x86_fldz(inst); } while (0) -#define amd64_fld1_size(inst,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fld1(inst); } while (0) -#define amd64_fldpi_size(inst,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fldpi(inst); } while (0) -#define amd64_fst_size(inst,mem,is_double,pop_stack,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fst((inst),(mem),(is_double),(pop_stack)); } while (0) -#define amd64_fst_membase_size(inst,basereg,disp,is_double,pop_stack,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fst_membase((inst),((basereg)&0x7),(disp),(is_double),(pop_stack)); } while (0) -#define amd64_fst80_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fst80_mem((inst),(mem)); } while (0) -#define amd64_fst80_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fst80_membase((inst),((basereg)&0x7),(disp)); } while (0) -#define amd64_fist_pop_size(inst,mem,is_long,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_fist_pop((inst),(mem),(is_long)); } while (0) -#define amd64_fist_pop_membase_size(inst,basereg,disp,is_long,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fist_pop_membase((inst),((basereg)&0x7),(disp),(is_long)); } while (0) -#define amd64_fstsw_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_fstsw(inst); } while (0) -#define amd64_fist_membase_size(inst,basereg,disp,is_int,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fist_membase((inst),((basereg)&0x7),(disp),(is_int)); } while (0) -//#define amd64_push_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_push_reg((inst),((reg)&0x7)); } while (0) -#define amd64_push_regp_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_push_regp((inst),((reg)&0x7)); } while (0) -#define amd64_push_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_push_mem((inst),(mem)); } while (0) -//#define amd64_push_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_push_membase((inst),((basereg)&0x7),(disp)); } while (0) -#define amd64_push_memindex_size(inst,basereg,disp,indexreg,shift,size) do { amd64_emit_rex ((inst),(size),0,(indexreg),(basereg)); x86_push_memindex((inst),((basereg)&0x7),(disp),((indexreg)&0x7),(shift)); } while (0) -#define amd64_push_imm_size(inst,imm,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_push_imm((inst),(imm)); } while (0) -//#define amd64_pop_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_pop_reg((inst),((reg)&0x7)); } while (0) -#define amd64_pop_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_pop_mem((inst),(mem)); } while (0) -#define amd64_pop_membase_size(inst,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_pop_membase((inst),((basereg)&0x7),(disp)); } while (0) -#define amd64_pushad_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_pushad(inst); } while (0) -#define amd64_pushfd_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_pushfd(inst); } while (0) -#define amd64_popad_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_popad(inst); } while (0) -#define amd64_popfd_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_popfd(inst); } while (0) -#define amd64_loop_size(inst,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_loop((inst),(imm)); } while (0) -#define amd64_loope_size(inst,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_loope((inst),(imm)); } while (0) 
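
The `+` macros that follow differ from the `-` macros being removed only in that every instruction emission is now bracketed by amd64_codegen_pre/amd64_codegen_post, so that under the Native Client (NaCl) code generator the emitted bytes can be kept inside one alignment bundle (and relocated if the sandboxing pass has to move them), while the default build is unaffected. The real definitions of the pre/post macros are added earlier in amd64-codegen.h and are not part of this hunk; the sketch below is only an illustration of the pattern, and the helper name amd64_nacl_instruction_post and its signature are assumptions, not code from this patch.

/* Illustrative sketch only -- not the definitions added by this patch. */
typedef unsigned char guint8;            /* normally provided by glib */

/* Assumed NaCl helper: re-bundles the bytes emitted between *start and *end
   and updates both pointers if the instruction had to be moved. */
void amd64_nacl_instruction_post (guint8 **start, guint8 **end);

#if defined(__native_client_codegen__)
/* Remember where this instruction's bytes begin... */
#define amd64_codegen_pre(inst) guint8 *_codegen_start = (inst);
/* ...then let the NaCl pass realign/relocate them; the expression yields the
   (possibly moved) start address, which is why some macros below can write
   inst = amd64_codegen_post(inst); */
#define amd64_codegen_post(inst) \
        (amd64_nacl_instruction_post (&_codegen_start, &(inst)), _codegen_start)
#else
/* Default build: the brackets compile away to nothing. */
#define amd64_codegen_pre(inst)
#define amd64_codegen_post(inst) (inst)
#endif

Under this reading, a wrapper such as amd64_inc_mem_size simply becomes
"pre; emit REX; emit the x86 opcode; post", which is exactly the shape of
each rewritten macro in the `+` lines below.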
-#define amd64_loopne_size(inst,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_loopne((inst),(imm)); } while (0) -#define amd64_jump32_size(inst,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_jump32((inst),(imm)); } while (0) -#define amd64_jump8_size(inst,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_jump8((inst),(imm)); } while (0) +#define amd64_cld_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_cld(inst); amd64_codegen_post(inst); } while (0) +#define amd64_stosb_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_stosb(inst); amd64_codegen_post(inst); } while (0) +#define amd64_stosl_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_stosl(inst); amd64_codegen_post(inst); } while (0) +#define amd64_stosd_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_stosd(inst); amd64_codegen_post(inst); } while (0) +#define amd64_movsb_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsb(inst); amd64_codegen_post(inst); } while (0) +#define amd64_movsl_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsl(inst); amd64_codegen_post(inst); } while (0) +#define amd64_movsd_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsd(inst); amd64_codegen_post(inst); } while (0) +#define amd64_prefix_size(inst,p,size) do { x86_prefix((inst), p); } while (0) +#define amd64_rdtsc_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_rdtsc(inst); amd64_codegen_post(inst); } while (0) +#define amd64_cmpxchg_reg_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_cmpxchg_reg_reg((inst),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_cmpxchg_mem_reg_size(inst,mem,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_cmpxchg_mem_reg((inst),(mem),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_cmpxchg_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_cmpxchg_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_xchg_reg_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_xchg_reg_reg((inst),((dreg)&0x7),((reg)&0x7),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +#define amd64_xchg_mem_reg_size(inst,mem,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_xchg_mem_reg((inst),(mem),((reg)&0x7),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +#define amd64_xchg_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_xchg_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7),(size) == 8 ? 
4 : (size)); amd64_codegen_post(inst); } while (0) +#define amd64_inc_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_inc_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_inc_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_inc_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +//#define amd64_inc_reg_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_inc_reg((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_dec_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_dec_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_dec_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_dec_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +//#define amd64_dec_reg_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_dec_reg((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_not_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_not_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_not_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_not_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_not_reg_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_not_reg((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_neg_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_neg_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_neg_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_neg_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_neg_reg_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_neg_reg((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_nop_size(inst,size) do { amd64_codegen_pre(inst); x86_nop(inst); amd64_codegen_post(inst); } while (0) +//#define amd64_alu_reg_imm_size(inst,opc,reg,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_alu_reg_imm((inst),(opc),((reg)&0x7),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_alu_mem_imm_size(inst,opc,mem,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_alu_mem_imm((inst),(opc),(mem),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_alu_membase_imm_size(inst,opc,basereg,disp,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_alu_membase_imm((inst),(opc),((basereg)&0x7),(disp),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_alu_membase8_imm_size(inst,opc,basereg,disp,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_alu_membase8_imm((inst),(opc),((basereg)&0x7),(disp),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_alu_mem_reg_size(inst,opc,mem,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); 
x86_alu_mem_reg((inst),(opc),(mem),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_alu_membase_reg_size(inst,opc,basereg,disp,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_alu_membase_reg((inst),(opc),((basereg)&0x7),(disp),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +//#define amd64_alu_reg_reg_size(inst,opc,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_alu_reg_reg((inst),(opc),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_alu_reg8_reg8_size(inst,opc,dreg,reg,is_dreg_h,is_reg_h,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_alu_reg8_reg8((inst),(opc),((dreg)&0x7),((reg)&0x7),(is_dreg_h),(is_reg_h)); amd64_codegen_post(inst); } while (0) +#define amd64_alu_reg_mem_size(inst,opc,reg,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_alu_reg_mem((inst),(opc),((reg)&0x7),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_alu_reg_membase_size(inst,opc,reg,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_alu_reg_membase((inst),(opc),((reg)&0x7),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_test_reg_imm_size(inst,reg,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_test_reg_imm((inst),((reg)&0x7),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_test_mem_imm_size(inst,mem,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_test_mem_imm((inst),(mem),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_test_membase_imm_size(inst,basereg,disp,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_test_membase_imm((inst),((basereg)&0x7),(disp),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_test_reg_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_test_reg_reg((inst),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_test_mem_reg_size(inst,mem,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_test_mem_reg((inst),(mem),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_test_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_test_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_shift_reg_imm_size(inst,opc,reg,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_shift_reg_imm((inst),(opc),((reg)&0x7),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_shift_mem_imm_size(inst,opc,mem,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_shift_mem_imm((inst),(opc),(mem),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_shift_membase_imm_size(inst,opc,basereg,disp,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_shift_membase_imm((inst),(opc),((basereg)&0x7),(disp),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_shift_reg_size(inst,opc,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_shift_reg((inst),(opc),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define 
amd64_shift_mem_size(inst,opc,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_shift_mem((inst),(opc),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_shift_membase_size(inst,opc,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_shift_membase((inst),(opc),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_shrd_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_shrd_reg((inst),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_shrd_reg_imm_size(inst,dreg,reg,shamt,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_shrd_reg_imm((inst),((dreg)&0x7),((reg)&0x7),(shamt)); amd64_codegen_post(inst); } while (0) +#define amd64_shld_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_shld_reg((inst),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_shld_reg_imm_size(inst,dreg,reg,shamt,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_shld_reg_imm((inst),((dreg)&0x7),((reg)&0x7),(shamt)); amd64_codegen_post(inst); } while (0) +#define amd64_mul_reg_size(inst,reg,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_mul_reg((inst),((reg)&0x7),(is_signed)); amd64_codegen_post(inst); } while (0) +#define amd64_mul_mem_size(inst,mem,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_mul_mem((inst),(mem),(is_signed)); amd64_codegen_post(inst); } while (0) +#define amd64_mul_membase_size(inst,basereg,disp,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_mul_membase((inst),((basereg)&0x7),(disp),(is_signed)); amd64_codegen_post(inst); } while (0) +#define amd64_imul_reg_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_imul_reg_reg((inst),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_imul_reg_mem_size(inst,reg,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_imul_reg_mem((inst),((reg)&0x7),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_imul_reg_membase_size(inst,reg,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_imul_reg_membase((inst),((reg)&0x7),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_imul_reg_reg_imm_size(inst,dreg,reg,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_imul_reg_reg_imm((inst),((dreg)&0x7),((reg)&0x7),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_imul_reg_mem_imm_size(inst,reg,mem,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_imul_reg_mem_imm((inst),((reg)&0x7),(mem),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_imul_reg_membase_imm_size(inst,reg,basereg,disp,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_imul_reg_membase_imm((inst),((reg)&0x7),((basereg)&0x7),(disp),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_div_reg_size(inst,reg,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_div_reg((inst),((reg)&0x7),(is_signed)); amd64_codegen_post(inst); } while 
(0) +#define amd64_div_mem_size(inst,mem,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_div_mem((inst),(mem),(is_signed)); amd64_codegen_post(inst); } while (0) +#define amd64_div_membase_size(inst,basereg,disp,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_div_membase((inst),((basereg)&0x7),(disp),(is_signed)); amd64_codegen_post(inst); } while (0) +#define amd64_mov_mem_reg_size(inst,mem,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_mov_mem_reg((inst),(mem),((reg)&0x7),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +//#define amd64_mov_regp_reg_size(inst,regp,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(regp),0,(reg)); x86_mov_regp_reg((inst),(regp),((reg)&0x7),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +//#define amd64_mov_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_mov_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +#define amd64_mov_memindex_reg_size(inst,basereg,disp,indexreg,shift,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),(indexreg),(basereg)); x86_mov_memindex_reg((inst),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),((reg)&0x7),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +#define amd64_mov_reg_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_mov_reg_reg((inst),((dreg)&0x7),((reg)&0x7),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +//#define amd64_mov_reg_mem_size(inst,reg,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_mov_reg_mem((inst),((reg)&0x7),(mem),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +//#define amd64_mov_reg_membase_size(inst,reg,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_mov_reg_membase((inst),((reg)&0x7),((basereg)&0x7),(disp),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +//#define amd64_mov_reg_memindex_size(inst,reg,basereg,disp,indexreg,shift,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),(indexreg),(basereg)); x86_mov_reg_memindex((inst),((reg)&0x7),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +#define amd64_clear_reg_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_clear_reg((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +//#define amd64_mov_reg_imm_size(inst,reg,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_mov_reg_imm((inst),((reg)&0x7),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_mov_mem_imm_size(inst,mem,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_mov_mem_imm((inst),(mem),(imm),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +//#define amd64_mov_membase_imm_size(inst,basereg,disp,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_mov_membase_imm((inst),((basereg)&0x7),(disp),(imm),(size) == 8 ? 
4 : (size)); amd64_codegen_post(inst); } while (0) +#define amd64_mov_memindex_imm_size(inst,basereg,disp,indexreg,shift,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,(indexreg),(basereg)); x86_mov_memindex_imm((inst),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),(imm),(size) == 8 ? 4 : (size)); amd64_codegen_post(inst); } while (0) +#define amd64_lea_mem_size(inst,reg,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_lea_mem((inst),((reg)&0x7),(mem)); amd64_codegen_post(inst); } while (0) +//#define amd64_lea_membase_size(inst,reg,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_lea_membase((inst),((reg)&0x7),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_lea_memindex_size(inst,reg,basereg,disp,indexreg,shift,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),(indexreg),(basereg)); x86_lea_memindex((inst),((reg)&0x7),((basereg)&0x7),(disp),((indexreg)&0x7),(shift)); amd64_codegen_post(inst); } while (0) +#define amd64_widen_reg_size(inst,dreg,reg,is_signed,is_half,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_widen_reg((inst),((dreg)&0x7),((reg)&0x7),(is_signed),(is_half)); amd64_codegen_post(inst); } while (0) +#define amd64_widen_mem_size(inst,dreg,mem,is_signed,is_half,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,0); x86_widen_mem((inst),((dreg)&0x7),(mem),(is_signed),(is_half)); amd64_codegen_post(inst); } while (0) +#define amd64_widen_membase_size(inst,dreg,basereg,disp,is_signed,is_half,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(basereg)); x86_widen_membase((inst),((dreg)&0x7),((basereg)&0x7),(disp),(is_signed),(is_half)); amd64_codegen_post(inst); } while (0) +#define amd64_widen_memindex_size(inst,dreg,basereg,disp,indexreg,shift,is_signed,is_half,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),(indexreg),(basereg)); x86_widen_memindex((inst),((dreg)&0x7),((basereg)&0x7),(disp),((indexreg)&0x7),(shift),(is_signed),(is_half)); amd64_codegen_post(inst); } while (0) +#define amd64_cdq_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_cdq(inst); amd64_codegen_post(inst); } while (0) +#define amd64_wait_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_wait(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fp_op_mem_size(inst,opc,mem,is_double,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fp_op_mem((inst),(opc),(mem),(is_double)); amd64_codegen_post(inst); } while (0) +#define amd64_fp_op_membase_size(inst,opc,basereg,disp,is_double,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fp_op_membase((inst),(opc),((basereg)&0x7),(disp),(is_double)); amd64_codegen_post(inst); } while (0) +#define amd64_fp_op_size(inst,opc,index,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fp_op((inst),(opc),(index)); amd64_codegen_post(inst); } while (0) +#define amd64_fp_op_reg_size(inst,opc,index,pop_stack,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fp_op_reg((inst),(opc),(index),(pop_stack)); amd64_codegen_post(inst); } while (0) +#define amd64_fp_int_op_membase_size(inst,opc,basereg,disp,is_int,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); 
x86_fp_int_op_membase((inst),(opc),((basereg)&0x7),(disp),(is_int)); amd64_codegen_post(inst); } while (0) +#define amd64_fstp_size(inst,index,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fstp((inst),(index)); amd64_codegen_post(inst); } while (0) +#define amd64_fcompp_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fcompp(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fucompp_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fucompp(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fnstsw_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fnstsw(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fnstcw_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fnstcw((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_fnstcw_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_fnstcw_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_fldcw_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fldcw((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_fldcw_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fldcw_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_fchs_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fchs(inst); amd64_codegen_post(inst); } while (0) +#define amd64_frem_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_frem(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fxch_size(inst,index,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fxch((inst),(index)); amd64_codegen_post(inst); } while (0) +#define amd64_fcomi_size(inst,index,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fcomi((inst),(index)); amd64_codegen_post(inst); } while (0) +#define amd64_fcomip_size(inst,index,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fcomip((inst),(index)); amd64_codegen_post(inst); } while (0) +#define amd64_fucomi_size(inst,index,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fucomi((inst),(index)); amd64_codegen_post(inst); } while (0) +#define amd64_fucomip_size(inst,index,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fucomip((inst),(index)); amd64_codegen_post(inst); } while (0) +#define amd64_fld_size(inst,mem,is_double,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fld((inst),(mem),(is_double)); amd64_codegen_post(inst); } while (0) +//#define amd64_fld_membase_size(inst,basereg,disp,is_double,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fld_membase((inst),((basereg)&0x7),(disp),(is_double)); amd64_codegen_post(inst); } while (0) +#define amd64_fld80_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fld80_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_fld80_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_fld80_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define 
amd64_fild_size(inst,mem,is_long,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fild((inst),(mem),(is_long)); amd64_codegen_post(inst); } while (0) +#define amd64_fild_membase_size(inst,basereg,disp,is_long,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fild_membase((inst),((basereg)&0x7),(disp),(is_long)); amd64_codegen_post(inst); } while (0) +#define amd64_fld_reg_size(inst,index,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fld_reg((inst),(index)); amd64_codegen_post(inst); } while (0) +#define amd64_fldz_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fldz(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fld1_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fld1(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fldpi_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fldpi(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fst_size(inst,mem,is_double,pop_stack,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fst((inst),(mem),(is_double),(pop_stack)); amd64_codegen_post(inst); } while (0) +#define amd64_fst_membase_size(inst,basereg,disp,is_double,pop_stack,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fst_membase((inst),((basereg)&0x7),(disp),(is_double),(pop_stack)); amd64_codegen_post(inst); } while (0) +#define amd64_fst80_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fst80_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_fst80_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fst80_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_fist_pop_size(inst,mem,is_long,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_fist_pop((inst),(mem),(is_long)); amd64_codegen_post(inst); } while (0) +#define amd64_fist_pop_membase_size(inst,basereg,disp,is_long,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fist_pop_membase((inst),((basereg)&0x7),(disp),(is_long)); amd64_codegen_post(inst); } while (0) +#define amd64_fstsw_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_fstsw(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fist_membase_size(inst,basereg,disp,is_int,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_fist_membase((inst),((basereg)&0x7),(disp),(is_int)); amd64_codegen_post(inst); } while (0) +//#define amd64_push_reg_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_push_reg((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_push_regp_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_push_regp((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_push_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_push_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) +//#define amd64_push_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_push_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define 
amd64_push_memindex_size(inst,basereg,disp,indexreg,shift,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,(indexreg),(basereg)); x86_push_memindex((inst),((basereg)&0x7),(disp),((indexreg)&0x7),(shift)); amd64_codegen_post(inst); } while (0) +#define amd64_push_imm_size(inst,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_push_imm((inst),(imm)); amd64_codegen_post(inst); } while (0) +//#define amd64_pop_reg_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_pop_reg((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_pop_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_pop_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_pop_membase_size(inst,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_pop_membase((inst),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_pushad_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_pushad(inst); amd64_codegen_post(inst); } while (0) +#define amd64_pushfd_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_pushfd(inst); amd64_codegen_post(inst); } while (0) +#define amd64_popad_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_popad(inst); amd64_codegen_post(inst); } while (0) +#define amd64_popfd_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_popfd(inst); amd64_codegen_post(inst); } while (0) +#define amd64_loop_size(inst,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_loop((inst),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_loope_size(inst,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_loope((inst),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_loopne_size(inst,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_loopne((inst),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_jump32_size(inst,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_jump32((inst),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_jump8_size(inst,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_jump8((inst),(imm)); amd64_codegen_post(inst); } while (0) +#if !defined( __native_client_codegen__ ) +/* Defined above for Native Client, so they can be used in other macros */ #define amd64_jump_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),0,0,0,(reg)); x86_jump_reg((inst),((reg)&0x7)); } while (0) #define amd64_jump_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_jump_mem((inst),(mem)); } while (0) -#define amd64_jump_disp_size(inst,disp,size) do { amd64_emit_rex ((inst),0,0,0,0); x86_jump_disp((inst),(disp)); } while (0) +#endif +#define amd64_jump_disp_size(inst,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,0); x86_jump_disp((inst),(disp)); amd64_codegen_post(inst); } while (0) #define amd64_branch8_size(inst,cond,imm,is_signed,size) do { x86_branch8((inst),(cond),(imm),(is_signed)); } while (0) #define amd64_branch32_size(inst,cond,imm,is_signed,size) do { x86_branch32((inst),(cond),(imm),(is_signed)); } while (0) -#define amd64_branch_size(inst,cond,target,is_signed,size) do { amd64_emit_rex ((inst),(size),0,0,0); 
x86_branch((inst),(cond),(target),(is_signed)); } while (0) -#define amd64_branch_disp_size(inst,cond,disp,is_signed,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_branch_disp((inst),(cond),(disp),(is_signed)); } while (0) -#define amd64_set_reg_size(inst,cond,reg,is_signed,size) do { amd64_emit_rex((inst),1,0,0,(reg)); x86_set_reg((inst),(cond),((reg)&0x7),(is_signed)); } while (0) -#define amd64_set_mem_size(inst,cond,mem,is_signed,size) do { x86_set_mem((inst),(cond),(mem),(is_signed)); } while (0) -#define amd64_set_membase_size(inst,cond,basereg,disp,is_signed,size) do { amd64_emit_rex ((inst),0,0,0,(basereg)); x86_set_membase((inst),(cond),((basereg)&0x7),(disp),(is_signed)); } while (0) +#define amd64_branch_size_body(inst,cond,target,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_branch((inst),(cond),(target),(is_signed)); amd64_codegen_post(inst); } while (0) +#if defined(__default_codegen__) +#define amd64_branch_size(inst,cond,target,is_signed,size) do { amd64_branch_size_body((inst),(cond),(target),(is_signed),(size)); } while (0) +#elif defined(__native_client_codegen__) +#define amd64_branch_size(inst,cond,target,is_signed,size) \ + do { \ + /* amd64_branch_size_body used twice in */ \ + /* case of relocation by amd64_codegen_post */ \ + guint8* branch_start; \ + amd64_codegen_pre(inst); \ + amd64_branch_size_body((inst),(cond),(target),(is_signed),(size)); \ + inst = amd64_codegen_post(inst); \ + branch_start = inst; \ + amd64_branch_size_body((inst),(cond),(target),(is_signed),(size)); \ + mono_amd64_patch(branch_start, (target)); \ + } while (0) +#endif + +#define amd64_branch_disp_size(inst,cond,disp,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_branch_disp((inst),(cond),(disp),(is_signed)); amd64_codegen_post(inst); } while (0) +#define amd64_set_reg_size(inst,cond,reg,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex((inst),1,0,0,(reg)); x86_set_reg((inst),(cond),((reg)&0x7),(is_signed)); amd64_codegen_post(inst); } while (0) +#define amd64_set_mem_size(inst,cond,mem,is_signed,size) do { amd64_codegen_pre(inst); x86_set_mem((inst),(cond),(mem),(is_signed)); amd64_codegen_post(inst); } while (0) +#define amd64_set_membase_size(inst,cond,basereg,disp,is_signed,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),0,0,0,(basereg)); x86_set_membase((inst),(cond),((basereg)&0x7),(disp),(is_signed)); amd64_codegen_post(inst); } while (0) +//#define amd64_call_reg_size(inst,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_call_reg((inst),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_call_mem_size(inst,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_call_mem((inst),(mem)); amd64_codegen_post(inst); } while (0) + +#if defined(__default_codegen__) + #define amd64_call_imm_size(inst,disp,size) do { x86_call_imm((inst),(disp)); } while (0) -//#define amd64_call_reg_size(inst,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_call_reg((inst),((reg)&0x7)); } while (0) -#define amd64_call_mem_size(inst,mem,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_call_mem((inst),(mem)); } while (0) #define amd64_call_code_size(inst,target,size) do { x86_call_code((inst),(target)); } while (0) -//#define amd64_ret_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_ret(inst); } while (0) -#define amd64_ret_imm_size(inst,imm,size) do { amd64_emit_rex ((inst),(size),0,0,0); 
x86_ret_imm((inst),(imm)); } while (0) -#define amd64_cmov_reg_size(inst,cond,is_signed,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_cmov_reg((inst),(cond),(is_signed),((dreg)&0x7),((reg)&0x7)); } while (0) -#define amd64_cmov_mem_size(inst,cond,is_signed,reg,mem,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_cmov_mem((inst),(cond),(is_signed),((reg)&0x7),(mem)); } while (0) -#define amd64_cmov_membase_size(inst,cond,is_signed,reg,basereg,disp,size) do { amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_cmov_membase((inst),(cond),(is_signed),((reg)&0x7),((basereg)&0x7),(disp)); } while (0) -#define amd64_enter_size(inst,framesize) do { amd64_emit_rex ((inst),(size),0,0,0); x86_enter((inst),(framesize)); } while (0) -//#define amd64_leave_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_leave(inst); } while (0) -#define amd64_sahf_size(inst,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_sahf(inst); } while (0) -#define amd64_fsin_size(inst,size) do { x86_fsin(inst); } while (0) -#define amd64_fcos_size(inst,size) do { x86_fcos(inst); } while (0) -#define amd64_fabs_size(inst,size) do { x86_fabs(inst); } while (0) -#define amd64_ftst_size(inst,size) do { x86_ftst(inst); } while (0) -#define amd64_fxam_size(inst,size) do { x86_fxam(inst); } while (0) -#define amd64_fpatan_size(inst,size) do { x86_fpatan(inst); } while (0) -#define amd64_fprem_size(inst,size) do { x86_fprem(inst); } while (0) -#define amd64_fprem1_size(inst,size) do { x86_fprem1(inst); } while (0) -#define amd64_frndint_size(inst,size) do { x86_frndint(inst); } while (0) -#define amd64_fsqrt_size(inst,size) do { x86_fsqrt(inst); } while (0) -#define amd64_fptan_size(inst,size) do { x86_fptan(inst); } while (0) -//#define amd64_padding_size(inst,size) do { x86_padding((inst),(size)); } while (0) -#define amd64_prolog_size(inst,frame_size,reg_mask,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_prolog((inst),(frame_size),(reg_mask)); } while (0) -#define amd64_epilog_size(inst,reg_mask,size) do { amd64_emit_rex ((inst),(size),0,0,0); x86_epilog((inst),(reg_mask)); } while (0) -#define amd64_xadd_reg_reg_size(inst,dreg,reg,size) do { amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_xadd_reg_reg ((inst), (dreg), (reg), (size)); } while (0) -#define amd64_xadd_mem_reg_size(inst,mem,reg,size) do { amd64_emit_rex ((inst),(size),0,0,(reg)); x86_xadd_mem_reg((inst),(mem),((reg)&0x7), (size)); } while (0) -#define amd64_xadd_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_xadd_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7),(size)); } while (0) + +#elif defined(__native_client_codegen__) +/* Size is ignored for Native Client calls, we restrict jumping to 32-bits */ +#define amd64_call_imm_size(inst,disp,size) \ + do { \ + amd64_codegen_pre((inst)); \ + amd64_call_sequence_pre((inst)); \ + x86_call_imm((inst),(disp)); \ + amd64_call_sequence_post((inst)); \ + amd64_codegen_post((inst)); \ + } while (0) + +/* x86_call_code is called twice below, first so we can get the size of the */ +/* call sequence, and again so the exact offset from "inst" is used, since */ +/* the sequence could have moved from amd64_call_sequence_post. 
*/ +/* Size is ignored for Native Client jumps, we restrict jumping to 32-bits */ +#define amd64_call_code_size(inst,target,size) \ + do { \ + amd64_codegen_pre((inst)); \ + guint8* adjusted_start; \ + guint8* call_start; \ + amd64_call_sequence_pre((inst)); \ + x86_call_code((inst),(target)); \ + adjusted_start = amd64_call_sequence_post((inst)); \ + call_start = adjusted_start; \ + x86_call_code(adjusted_start, (target)); \ + amd64_codegen_post((inst)); \ + mono_amd64_patch(call_start, (target)); \ + } while (0) + +#endif /*__native_client_codegen__*/ + +//#define amd64_ret_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_ret(inst); amd64_codegen_post(inst); } while (0) +#define amd64_ret_imm_size(inst,imm,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_ret_imm((inst),(imm)); amd64_codegen_post(inst); } while (0) +#define amd64_cmov_reg_size(inst,cond,is_signed,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_cmov_reg((inst),(cond),(is_signed),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0) +#define amd64_cmov_mem_size(inst,cond,is_signed,reg,mem,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_cmov_mem((inst),(cond),(is_signed),((reg)&0x7),(mem)); amd64_codegen_post(inst); } while (0) +#define amd64_cmov_membase_size(inst,cond,is_signed,reg,basereg,disp,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(basereg)); x86_cmov_membase((inst),(cond),(is_signed),((reg)&0x7),((basereg)&0x7),(disp)); amd64_codegen_post(inst); } while (0) +#define amd64_enter_size(inst,framesize) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_enter((inst),(framesize)); amd64_codegen_post(inst); } while (0) +//#define amd64_leave_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_leave(inst); amd64_codegen_post(inst); } while (0) +#define amd64_sahf_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_sahf(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fsin_size(inst,size) do { amd64_codegen_pre(inst); x86_fsin(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fcos_size(inst,size) do { amd64_codegen_pre(inst); x86_fcos(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fabs_size(inst,size) do { amd64_codegen_pre(inst); x86_fabs(inst); amd64_codegen_post(inst); } while (0) +#define amd64_ftst_size(inst,size) do { amd64_codegen_pre(inst); x86_ftst(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fxam_size(inst,size) do { amd64_codegen_pre(inst); x86_fxam(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fpatan_size(inst,size) do { amd64_codegen_pre(inst); x86_fpatan(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fprem_size(inst,size) do { amd64_codegen_pre(inst); x86_fprem(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fprem1_size(inst,size) do { amd64_codegen_pre(inst); x86_fprem1(inst); amd64_codegen_post(inst); } while (0) +#define amd64_frndint_size(inst,size) do { amd64_codegen_pre(inst); x86_frndint(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fsqrt_size(inst,size) do { amd64_codegen_pre(inst); x86_fsqrt(inst); amd64_codegen_post(inst); } while (0) +#define amd64_fptan_size(inst,size) do { amd64_codegen_pre(inst); x86_fptan(inst); amd64_codegen_post(inst); } while (0) +//#define amd64_padding_size(inst,size) do { amd64_codegen_pre(inst); 
x86_padding((inst),(size)); amd64_codegen_post(inst); } while (0) +#define amd64_prolog_size(inst,frame_size,reg_mask,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_prolog((inst),(frame_size),(reg_mask)); amd64_codegen_post(inst); } while (0) +#define amd64_epilog_size(inst,reg_mask,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_epilog((inst),(reg_mask)); amd64_codegen_post(inst); } while (0) +#define amd64_xadd_reg_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_xadd_reg_reg ((inst), (dreg), (reg), (size)); amd64_codegen_post(inst); } while (0) +#define amd64_xadd_mem_reg_size(inst,mem,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,(reg)); x86_xadd_mem_reg((inst),(mem),((reg)&0x7), (size)); amd64_codegen_post(inst); } while (0) +#define amd64_xadd_membase_reg_size(inst,basereg,disp,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(reg),0,(basereg)); x86_xadd_membase_reg((inst),((basereg)&0x7),(disp),((reg)&0x7),(size)); amd64_codegen_post(inst); } while (0) diff --git a/mono/arch/amd64/tramp.c b/mono/arch/amd64/tramp.c index 5a4f9a9ed2b..6dbec93e859 100644 --- a/mono/arch/amd64/tramp.c +++ b/mono/arch/amd64/tramp.c @@ -543,7 +543,7 @@ enum_marshal2: amd64_call_reg (p, AMD64_R11); if (sig->ret->byref || string_ctor || !(retval_implicit || sig->ret->type == MONO_TYPE_VOID)) { - amd64_mov_reg_membase(p, AMD64_RSI, AMD64_RBP, -8, 8); + amd64_mov_reg_membase(p, AMD64_RSI, AMD64_RBP, -8, SIZEOF_VOID_P); } /* * Handle retval. @@ -883,19 +883,19 @@ enum_calc_size: * Initialize MonoInvocation fields, first the ones known now. */ amd64_alu_reg_reg (p, X86_XOR, AMD64_RAX, AMD64_RAX); - amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, ex)), AMD64_RAX, 8); - amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, ex_handler)), AMD64_RAX, 8); - amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, parent)), AMD64_RAX, 8); + amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, ex)), AMD64_RAX, SIZEOF_VOID_P); + amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, ex_handler)), AMD64_RAX, SIZEOF_VOID_P); + amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, parent)), AMD64_RAX, SIZEOF_VOID_P); /* * Set the method pointer. */ - amd64_mov_membase_imm (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, method)), (long)method, 8); + amd64_mov_membase_imm (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, method)), (long)method, SIZEOF_VOID_P); /* * Handle this. */ if (sig->hasthis) - amd64_mov_membase_reg(p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, obj)), this_reg, 8); + amd64_mov_membase_reg(p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, obj)), this_reg, SIZEOF_VOID_P); /* * Handle the arguments. stackval_pos is the offset from RBP of the stackval in the MonoInvocation args array . @@ -903,7 +903,7 @@ enum_calc_size: * We just call stackval_from_data to handle all the (nasty) issues.... 
*/ amd64_lea_membase (p, AMD64_RAX, AMD64_RBP, stackval_pos); - amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, stack_args)), AMD64_RAX, 8); + amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, stack_args)), AMD64_RAX, SIZEOF_VOID_P); for (i = 0; i < sig->param_count; ++i) { /* Need to call stackval_from_data (MonoType *type, stackval *result, char *data, gboolean pinvoke); */ amd64_mov_reg_imm (p, AMD64_R11, stackval_from_data); @@ -926,12 +926,12 @@ enum_calc_size: * Handle the return value storage area. */ amd64_lea_membase (p, AMD64_RAX, AMD64_RBP, stackval_pos); - amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, retval)), AMD64_RAX, 8); + amd64_mov_membase_reg (p, AMD64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, retval)), AMD64_RAX, SIZEOF_VOID_P); if (sig->ret->type == MONO_TYPE_VALUETYPE && !sig->ret->byref) { MonoClass *klass = sig->ret->data.klass; if (!klass->enumtype) { - amd64_mov_reg_membase (p, AMD64_RCX, AMD64_RBP, retval_ptr_rbp_offset, 8); - amd64_mov_membase_reg (p, AMD64_RBP, stackval_pos, AMD64_RCX, 8); + amd64_mov_reg_membase (p, AMD64_RCX, AMD64_RBP, retval_ptr_rbp_offset, SIZEOF_VOID_P); + amd64_mov_membase_reg (p, AMD64_RBP, stackval_pos, AMD64_RCX, SIZEOF_VOID_P); } } @@ -947,7 +947,7 @@ enum_calc_size: */ amd64_lea_membase (p, AMD64_RAX, AMD64_RBP, stackval_pos); if (sig->ret->byref) { - amd64_mov_reg_membase (p, AMD64_RAX, AMD64_RAX, 0, 8); + amd64_mov_reg_membase (p, AMD64_RAX, AMD64_RAX, 0, SIZEOF_VOID_P); } else { int simpletype = sig->ret->type; enum_retvalue: diff --git a/mono/arch/x86/x86-codegen.h b/mono/arch/x86/x86-codegen.h index af3e3c6f558..6ca3695c7e1 100644 --- a/mono/arch/x86/x86-codegen.h +++ b/mono/arch/x86/x86-codegen.h @@ -17,9 +17,7 @@ #include #ifdef __native_client_codegen__ -#define kNaClAlignment 32 -#define kNaClAlignmentMask (kNaClAlignment - 1) -extern guint8 nacl_align_byte; +extern gint8 nacl_align_byte; #endif /* __native_client_codegen__ */ @@ -28,15 +26,10 @@ extern guint8 nacl_align_byte; #define x86_call_sequence_pre(inst) guint8* _code_start = (inst); #define x86_call_sequence_post(inst) \ (mono_nacl_align_call(&_code_start, &(inst)), _code_start); -#define x86_call_sequence_pre_val(inst) guint8* _code_start = (inst); -#define x86_call_sequence_post_val(inst) \ - (mono_nacl_align_call(&_code_start, &(inst)), _code_start); #else #define x86_codegen_pre(inst_ptr_ptr, inst_len) do {} while (0) -#define x86_call_sequence_pre(inst) -#define x86_call_sequence_post(inst) -#define x86_call_sequence_pre_val(inst) guint8* _code_start = (inst); -#define x86_call_sequence_post_val(inst) _code_start +#define x86_call_sequence_pre(inst) guint8* _code_start = (inst); +#define x86_call_sequence_post(inst) _code_start #endif /* __native_client_codegen__ */ @@ -305,7 +298,7 @@ typedef union { #define kMaxMembaseEmitPadding 6 -#define x86_membase_emit(inst,r,basereg,disp) do {\ +#define x86_membase_emit_body(inst,r,basereg,disp) do {\ if ((basereg) == X86_ESP) { \ if ((disp) == 0) { \ x86_address_byte ((inst), 0, (r), X86_ESP); \ @@ -334,6 +327,18 @@ typedef union { } \ } while (0) +#if defined(__native_client_codegen__) && defined(TARGET_AMD64) +#define x86_membase_emit(inst,r,basereg,disp) \ + do { \ + amd64_nacl_membase_handler(&(inst), (basereg), (disp), (r)) ; \ + } while (0) +#else /* __default_codegen__ || 32-bit NaCl codegen */ +#define x86_membase_emit(inst,r,basereg,disp) \ + do { \ + 
x86_membase_emit_body((inst),(r),(basereg),(disp)); \ + } while (0) +#endif + #define kMaxMemindexEmitPadding 6 #define x86_memindex_emit(inst,r,basereg,disp,indexreg,shift) \ @@ -351,7 +356,7 @@ typedef union { x86_imm_emit8 ((inst), (disp)); \ } else { \ x86_address_byte ((inst), 2, (r), 4); \ - x86_address_byte ((inst), (shift), (indexreg), 5); \ + x86_address_byte ((inst), (shift), (indexreg), (basereg)); \ x86_imm_emit32 ((inst), (disp)); \ } \ } while (0) @@ -438,12 +443,23 @@ typedef union { } while ( in_nop ); \ } while (0) +#if defined(__native_client__) #define x86_patch(ins,target) \ do { \ unsigned char* inst = (ins); \ + guint8* new_target = nacl_modify_patch_target((target)); \ x86_skip_nops((inst)); \ - x86_do_patch((inst), (target)); \ + x86_do_patch((inst), new_target); \ } while (0) +#else /* __native_client__ */ +#define x86_patch(ins,target) \ + do { \ + unsigned char* inst = (ins); \ + guint8* new_target = (target); \ + x86_skip_nops((inst)); \ + x86_do_patch((inst), new_target); \ + } while (0) +#endif /* __native_client__ */ #else #define x86_patch(ins,target) do { x86_do_patch((ins), (target)); } while (0) @@ -472,6 +488,13 @@ typedef union { #define x86_movsl(inst) do { *(inst)++ =(unsigned char)0xa5; } while (0) #define x86_movsd(inst) x86_movsl((inst)) +#if defined(__default_codegen__) +#define x86_prefix(inst,p) \ + do { \ + *(inst)++ =(unsigned char) (p); \ + } while (0) +#elif defined(__native_client_codegen__) +#if defined(TARGET_X86) /* kNaClAlignment - 1 is the max value we can pass into x86_codegen_pre. */ /* This keeps us from having to call x86_codegen_pre with specific */ /* knowledge of the size of the instruction that follows it, and */ @@ -481,6 +504,18 @@ typedef union { x86_codegen_pre(&(inst), kNaClAlignment - 1); \ *(inst)++ =(unsigned char) (p); \ } while (0) +#elif defined(TARGET_AMD64) +/* We need to tag any prefixes so we can perform proper membase sandboxing */ +/* See: mini-amd64.c:amd64_nacl_membase_handler for verbose details */ +#define x86_prefix(inst,p) \ + do { \ + amd64_nacl_tag_legacy_prefix((inst)); \ + *(inst)++ =(unsigned char) (p); \ + } while (0) + +#endif /* TARGET_AMD64 */ + +#endif /* __native_client_codegen__ */ #define x86_rdtsc(inst) \ do { \ @@ -1041,7 +1076,7 @@ typedef union { x86_codegen_pre(&(inst), 7); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x88; break; \ - case 2: *(inst)++ = (unsigned char)0x66; /* fall through */ \ + case 2: x86_prefix((inst), X86_OPERAND_PREFIX); /* fall through */ \ case 4: *(inst)++ = (unsigned char)0x89; break; \ default: assert (0); \ } \ @@ -1053,7 +1088,7 @@ typedef union { x86_codegen_pre(&(inst), 3); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x88; break; \ - case 2: *(inst)++ = (unsigned char)0x66; /* fall through */ \ + case 2: x86_prefix((inst), X86_OPERAND_PREFIX); /* fall through */ \ case 4: *(inst)++ = (unsigned char)0x89; break; \ default: assert (0); \ } \ @@ -1065,7 +1100,7 @@ typedef union { x86_codegen_pre(&(inst), 2 + kMaxMembaseEmitPadding); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x88; break; \ - case 2: *(inst)++ = (unsigned char)0x66; /* fall through */ \ + case 2: x86_prefix((inst), X86_OPERAND_PREFIX); /* fall through */ \ case 4: *(inst)++ = (unsigned char)0x89; break; \ default: assert (0); \ } \ @@ -1077,7 +1112,7 @@ typedef union { x86_codegen_pre(&(inst), 2 + kMaxMemindexEmitPadding); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x88; break; \ - case 2: *(inst)++ = (unsigned char)0x66; /* fall through */ 
\ + case 2: x86_prefix((inst), X86_OPERAND_PREFIX); /* fall through */ \ case 4: *(inst)++ = (unsigned char)0x89; break; \ default: assert (0); \ } \ @@ -1089,7 +1124,7 @@ typedef union { x86_codegen_pre(&(inst), 3); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x8a; break; \ - case 2: *(inst)++ = (unsigned char)0x66; /* fall through */ \ + case 2: x86_prefix((inst), X86_OPERAND_PREFIX); /* fall through */ \ case 4: *(inst)++ = (unsigned char)0x8b; break; \ default: assert (0); \ } \ @@ -1101,7 +1136,7 @@ typedef union { x86_codegen_pre(&(inst), 7); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x8a; break; \ - case 2: *(inst)++ = (unsigned char)0x66; /* fall through */ \ + case 2: x86_prefix((inst), X86_OPERAND_PREFIX); /* fall through */ \ case 4: *(inst)++ = (unsigned char)0x8b; break; \ default: assert (0); \ } \ @@ -1115,7 +1150,7 @@ typedef union { x86_codegen_pre(&(inst), kMovRegMembasePadding); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x8a; break; \ - case 2: *(inst)++ = (unsigned char)0x66; /* fall through */ \ + case 2: x86_prefix((inst), X86_OPERAND_PREFIX); /* fall through */ \ case 4: *(inst)++ = (unsigned char)0x8b; break; \ default: assert (0); \ } \ @@ -1127,7 +1162,7 @@ typedef union { x86_codegen_pre(&(inst), 2 + kMaxMemindexEmitPadding); \ switch ((size)) { \ case 1: *(inst)++ = (unsigned char)0x8a; break; \ - case 2: *(inst)++ = (unsigned char)0x66; /* fall through */ \ + case 2: x86_prefix((inst), X86_OPERAND_PREFIX); /* fall through */ \ case 4: *(inst)++ = (unsigned char)0x8b; break; \ default: assert (0); \ } \ @@ -1155,7 +1190,7 @@ typedef union { x86_imm_emit8 ((inst), (imm)); \ } else if ((size) == 2) { \ x86_codegen_pre(&(inst), 9); \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ *(inst)++ = (unsigned char)0xc7; \ x86_mem_emit ((inst), 0, (mem)); \ x86_imm_emit16 ((inst), (imm)); \ @@ -1176,7 +1211,7 @@ typedef union { x86_imm_emit8 ((inst), (imm)); \ } else if ((size) == 2) { \ x86_codegen_pre(&(inst), 4 + kMaxMembaseEmitPadding); \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ *(inst)++ = (unsigned char)0xc7; \ x86_membase_emit ((inst), 0, (basereg), (disp)); \ x86_imm_emit16 ((inst), (imm)); \ @@ -1197,7 +1232,7 @@ typedef union { x86_imm_emit8 ((inst), (imm)); \ } else if ((size) == 2) { \ x86_codegen_pre(&(inst), 4 + kMaxMemindexEmitPadding); \ - *(inst)++ = (unsigned char)0x66; \ + x86_prefix((inst), X86_OPERAND_PREFIX); \ *(inst)++ = (unsigned char)0xc7; \ x86_memindex_emit ((inst), 0, (basereg), (disp), (indexreg), (shift)); \ x86_imm_emit16 ((inst), (imm)); \ @@ -1681,6 +1716,7 @@ typedef union { x86_imm_emit8 ((inst), (imm)); \ } while (0) +#if defined(TARGET_X86) #define x86_jump32(inst,imm) \ do { \ x86_codegen_pre(&(inst), 5); \ @@ -1694,9 +1730,27 @@ typedef union { *(inst)++ = (unsigned char)0xeb; \ x86_imm_emit8 ((inst), (imm)); \ } while (0) +#elif defined(TARGET_AMD64) +/* These macros are used directly from mini-amd64.c and other */ +/* amd64 specific files, so they need to be instrumented directly. 
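+   ("instrumented" = emitted through amd64_codegen_pre/amd64_codegen_post, so
+   that under NaCl codegen the jump bytes cannot straddle a 32-byte bundle
+   boundary; with __default_codegen__ those hooks compile away to nothing.)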
*/ +#define x86_jump32(inst,imm) \ + do { \ + amd64_codegen_pre(inst); \ + *(inst)++ = (unsigned char)0xe9; \ + x86_imm_emit32 ((inst), (imm)); \ + amd64_codegen_post(inst); \ + } while (0) +#define x86_jump8(inst,imm) \ + do { \ + amd64_codegen_pre(inst); \ + *(inst)++ = (unsigned char)0xeb; \ + x86_imm_emit8 ((inst), (imm)); \ + amd64_codegen_post(inst); \ + } while (0) +#endif -#ifdef __native_client_codegen__ +#if defined( __native_client_codegen__ ) && defined( TARGET_X86 ) #define x86_jump_reg(inst,reg) do { \ x86_codegen_pre(&(inst), 5); \ *(inst)++ = (unsigned char)0x83; /* and */ \ @@ -1747,7 +1801,7 @@ typedef union { /* * target is a pointer in our buffer. */ -#define x86_jump_code(inst,target) \ +#define x86_jump_code_body(inst,target) \ do { \ int t; \ x86_codegen_pre(&(inst), 2); \ @@ -1761,6 +1815,31 @@ typedef union { } \ } while (0) +#if defined(__default_codegen__) +#define x86_jump_code(inst,target) \ + do { \ + x86_jump_code_body((inst),(target)); \ + } while (0) +#elif defined(__native_client_codegen__) && defined(TARGET_X86) +#define x86_jump_code(inst,target) \ + do { \ + guint8* jump_start = (inst); \ + x86_jump_code_body((inst),(target)); \ + x86_patch(jump_start, (target)); \ + } while (0) +#elif defined(__native_client_codegen__) && defined(TARGET_AMD64) +#define x86_jump_code(inst,target) \ + do { \ + /* jump_code_body is used twice because there are offsets */ \ + /* calculated based on the IP, which can change after the */ \ + /* call to amd64_codegen_post */ \ + amd64_codegen_pre(inst); \ + x86_jump_code_body((inst),(target)); \ + inst = amd64_codegen_post(inst); \ + x86_jump_code_body((inst),(target)); \ + } while (0) +#endif /* __native_client_codegen__ */ + #define x86_jump_disp(inst,disp) \ do { \ int t = (disp) - 2; \ @@ -1772,6 +1851,7 @@ typedef union { } \ } while (0) +#if defined(TARGET_X86) #define x86_branch8(inst,cond,imm,is_signed) \ do { \ x86_codegen_pre(&(inst), 2); \ @@ -1792,12 +1872,40 @@ typedef union { *(inst)++ = x86_cc_unsigned_map [(cond)] + 0x10; \ x86_imm_emit32 ((inst), (imm)); \ } while (0) +#elif defined(TARGET_AMD64) +/* These macros are used directly from mini-amd64.c and other */ +/* amd64 specific files, so they need to be instrumented directly. */ +#define x86_branch8(inst,cond,imm,is_signed) \ + do { \ + amd64_codegen_pre(inst); \ + if ((is_signed)) \ + *(inst)++ = x86_cc_signed_map [(cond)]; \ + else \ + *(inst)++ = x86_cc_unsigned_map [(cond)]; \ + x86_imm_emit8 ((inst), (imm)); \ + amd64_codegen_post(inst); \ + } while (0) +#define x86_branch32(inst,cond,imm,is_signed) \ + do { \ + amd64_codegen_pre(inst); \ + *(inst)++ = (unsigned char)0x0f; \ + if ((is_signed)) \ + *(inst)++ = x86_cc_signed_map [(cond)] + 0x10; \ + else \ + *(inst)++ = x86_cc_unsigned_map [(cond)] + 0x10; \ + x86_imm_emit32 ((inst), (imm)); \ + amd64_codegen_post(inst); \ + } while (0) +#endif +#if defined(TARGET_X86) #define x86_branch(inst,cond,target,is_signed) \ do { \ int offset; \ + guint8* branch_start; \ x86_codegen_pre(&(inst), 2); \ offset = (target) - (inst) - 2; \ + branch_start = (inst); \ if (x86_is_imm8 ((offset))) \ x86_branch8 ((inst), (cond), offset, (is_signed)); \ else { \ @@ -1805,7 +1913,42 @@ typedef union { offset = (target) - (inst) - 6; \ x86_branch32 ((inst), (cond), offset, (is_signed)); \ } \ + x86_patch(branch_start, (target)); \ } while (0) +#elif defined(TARGET_AMD64) +/* This macro is used directly from mini-amd64.c and other */ +/* amd64 specific files, so it needs to be instrumented directly. 
*/ + +#define x86_branch_body(inst,cond,target,is_signed) \ + do { \ + int offset = (target) - (inst) - 2; \ + if (x86_is_imm8 ((offset))) \ + x86_branch8 ((inst), (cond), offset, (is_signed)); \ + else { \ + offset = (target) - (inst) - 6; \ + x86_branch32 ((inst), (cond), offset, (is_signed)); \ + } \ + } while (0) + +#if defined(__default_codegen__) +#define x86_branch(inst,cond,target,is_signed) \ + do { \ + x86_branch_body((inst),(cond),(target),(is_signed)); \ + } while (0) +#elif defined(__native_client_codegen__) +#define x86_branch(inst,cond,target,is_signed) \ + do { \ + /* branch_body is used twice because there are offsets */ \ + /* calculated based on the IP, which can change after */ \ + /* the call to amd64_codegen_post */ \ + amd64_codegen_pre(inst); \ + x86_branch_body((inst),(cond),(target),(is_signed)); \ + inst = amd64_codegen_post(inst); \ + x86_branch_body((inst),(cond),(target),(is_signed)); \ + } while (0) +#endif /* __native_client_codegen__ */ + +#endif /* TARGET_AMD64 */ #define x86_branch_disp(inst,cond,disp,is_signed) \ do { \ @@ -1865,10 +2008,10 @@ typedef union { x86_call_sequence_post((inst)); \ } while (0) -#ifdef __native_client_codegen__ + +#if defined( __native_client_codegen__ ) && defined( TARGET_X86 ) #define x86_call_reg_internal(inst,reg) \ do { \ - x86_codegen_pre(&(inst), 5); \ *(inst)++ = (unsigned char)0x83; /* and */ \ x86_reg_emit ((inst), 4, (reg)); /* reg */ \ *(inst)++ = (unsigned char)nacl_align_byte; \ @@ -1914,20 +2057,23 @@ typedef union { #endif /* __native_client_codegen__ */ -#ifdef __native_client_codegen__ +#if defined( __native_client_codegen__ ) && defined( TARGET_X86 ) #define x86_call_code(inst,target) \ do { \ int _x86_offset; \ + guint8* call_start; \ guint8* _aligned_start; \ - x86_call_sequence_pre_val ((inst)); \ + x86_call_sequence_pre((inst)); \ _x86_offset = (unsigned char*)(target) - (inst); \ _x86_offset -= 5; \ x86_call_imm_body ((inst), _x86_offset); \ - _aligned_start = x86_call_sequence_post_val ((inst)); \ + _aligned_start = x86_call_sequence_post((inst)); \ + call_start = _aligned_start; \ _x86_offset = (unsigned char*)(target) - (_aligned_start); \ _x86_offset -= 5; \ x86_call_imm_body ((_aligned_start), _x86_offset); \ + x86_patch(call_start, (target)); \ } while (0) #define SIZE_OF_RET 6 @@ -2062,9 +2208,9 @@ typedef union { #ifdef __native_client_codegen__ -#define kNaClLengthOfCallReg 5 -#define kNaClLengthOfCallImm 5 -#define kNaClLengthOfCallMembase (kNaClLengthOfCallReg + 6) +#define kx86NaClLengthOfCallReg 5 +#define kx86NaClLengthOfCallImm 5 +#define kx86NaClLengthOfCallMembase (kx86NaClLengthOfCallReg + 6) #endif /* __native_client_codegen__ */ diff --git a/mono/io-layer/atomic.h b/mono/io-layer/atomic.h index 258aa185129..e45cfcf99b0 100644 --- a/mono/io-layer/atomic.h +++ b/mono/io-layer/atomic.h @@ -92,7 +92,7 @@ static inline gpointer InterlockedCompareExchangePointer(volatile gpointer *dest gpointer old; __asm__ __volatile__ ("lock; " -#ifdef __x86_64__ +#if defined(__x86_64__) && !defined(__native_client__) "cmpxchgq" #else "cmpxchgl" @@ -154,7 +154,7 @@ static inline gpointer InterlockedExchangePointer(volatile gpointer *val, gpointer ret; __asm__ __volatile__ ("1:; lock; " -#ifdef __x86_64__ +#if defined(__x86_64__) && !defined(__native_client__) "cmpxchgq" #else "cmpxchgl" diff --git a/mono/io-layer/posix.c b/mono/io-layer/posix.c index 732529039ce..a7781bdd7bd 100644 --- a/mono/io-layer/posix.c +++ b/mono/io-layer/posix.c @@ -60,7 +60,8 @@ gpointer _wapi_stdhandle_create (int fd, const 
gchar *name) g_message("%s: creating standard handle type %s, fd %d", __func__, name, fd); #endif - + +#if !defined(__native_client__) /* Check if fd is valid */ do { flags=fcntl(fd, F_GETFL); @@ -78,11 +79,18 @@ gpointer _wapi_stdhandle_create (int fd, const gchar *name) SetLastError (_wapi_get_win32_file_error (errno)); return(INVALID_HANDLE_VALUE); } + file_handle.fileaccess=convert_from_flags(flags); +#else + /* + * fcntl will return -1 in nacl, as there is no real file system API. + * Yet, standard streams are available. + */ + file_handle.fileaccess = (fd == STDIN_FILENO) ? GENERIC_READ : GENERIC_WRITE; +#endif file_handle.filename = g_strdup(name); /* some default security attributes might be needed */ file_handle.security_attributes=0; - file_handle.fileaccess=convert_from_flags(flags); /* Apparently input handles can't be written to. (I don't * know if output or error handles can't be read from.) diff --git a/mono/io-layer/sockets.c b/mono/io-layer/sockets.c index e9eaf19f47a..c068f06281d 100644 --- a/mono/io-layer/sockets.c +++ b/mono/io-layer/sockets.c @@ -7,9 +7,10 @@ * (C) 2002 Ximian, Inc. */ +#include + #ifndef DISABLE_SOCKETS -#include #include #include #include diff --git a/mono/metadata/assembly.c b/mono/metadata/assembly.c index d9b25997ba6..482762d0e1a 100644 --- a/mono/metadata/assembly.c +++ b/mono/metadata/assembly.c @@ -197,13 +197,23 @@ mono_public_tokens_are_equal (const unsigned char *pubt1, const unsigned char *p return memcmp (pubt1, pubt2, 16) == 0; } +/* Native Client can't get this info from an environment variable so */ +/* it's passed in to the runtime, or set manually by embedding code. */ +#ifdef __native_client__ +char* nacl_mono_path = NULL; +#endif + static void check_path_env (void) { const char *path; char **splitted, **dest; +#ifdef __native_client__ + path = nacl_mono_path; +#else path = g_getenv ("MONO_PATH"); +#endif if (!path) return; diff --git a/mono/metadata/boehm-gc.c b/mono/metadata/boehm-gc.c index fa49e6a2aa9..050cb328d60 100644 --- a/mono/metadata/boehm-gc.c +++ b/mono/metadata/boehm-gc.c @@ -105,6 +105,8 @@ mono_gc_base_init (void) GC_stackbottom = (char*)ss.ss_sp; } +#elif defined(__native_client__) + /* Do nothing, GC_stackbottom is set correctly in libgc */ #else { int dummy; diff --git a/mono/metadata/domain-internals.h b/mono/metadata/domain-internals.h index bca936805be..815f605c445 100644 --- a/mono/metadata/domain-internals.h +++ b/mono/metadata/domain-internals.h @@ -421,6 +421,12 @@ mono_domain_code_reserve_align (MonoDomain *domain, int size, int alignment) MON void mono_domain_code_commit (MonoDomain *domain, void *data, int size, int newsize) MONO_INTERNAL; +void * +nacl_domain_get_code_dest (MonoDomain *domain, void *data) MONO_INTERNAL; + +void +nacl_domain_code_validate (MonoDomain *domain, guint8 **buf_base, int buf_size, guint8 **code_end) MONO_INTERNAL; + void mono_domain_code_foreach (MonoDomain *domain, MonoCodeManagerFunc func, void *user_data) MONO_INTERNAL; diff --git a/mono/metadata/domain.c b/mono/metadata/domain.c index 07fe67e659e..9c08882435a 100644 --- a/mono/metadata/domain.c +++ b/mono/metadata/domain.c @@ -2196,6 +2196,58 @@ mono_domain_code_commit (MonoDomain *domain, void *data, int size, int newsize) mono_domain_unlock (domain); } +#if defined(__native_client_codegen__) && defined(__native_client__) +/* + * Given the temporary buffer (allocated by mono_domain_code_reserve) into which + * we are generating code, return a pointer to the destination in the dynamic + * code segment into which the code 
will be copied when mono_domain_code_commit + * is called. + * LOCKING: Acquires the domain lock. + */ +void * +nacl_domain_get_code_dest (MonoDomain *domain, void *data) +{ + void *dest; + mono_domain_lock (domain); + dest = nacl_code_manager_get_code_dest (domain->code_mp, data); + mono_domain_unlock (domain); + return dest; +} + +/* + * Convenience function which calls mono_domain_code_commit to validate and copy + * the code. The caller sets *buf_base and *buf_size to the start and size of + * the buffer (allocated by mono_domain_code_reserve), and *code_end to the byte + * after the last instruction byte. On return, *buf_base will point to the start + * of the copied in the code segment, and *code_end will point after the end of + * the copied code. + */ +void +nacl_domain_code_validate (MonoDomain *domain, guint8 **buf_base, int buf_size, guint8 **code_end) +{ + guint8 *tmp = nacl_domain_get_code_dest (domain, *buf_base); + mono_domain_code_commit (domain, *buf_base, buf_size, *code_end - *buf_base); + *code_end = tmp + (*code_end - *buf_base); + *buf_base = tmp; +} + +#else + +/* no-op versions of Native Client functions */ + +void * +nacl_domain_get_code_dest (MonoDomain *domain, void *data) +{ + return data; +} + +void +nacl_domain_code_validate (MonoDomain *domain, guint8 **buf_base, int buf_size, guint8 **code_end) +{ +} + +#endif + /* * mono_domain_code_foreach: * Iterate over the code thunks of the code manager of @domain. diff --git a/mono/metadata/object.c b/mono/metadata/object.c index f65b690c4e7..565285f5b14 100644 --- a/mono/metadata/object.c +++ b/mono/metadata/object.c @@ -1709,8 +1709,12 @@ mono_method_add_generic_virtual_invocation (MonoDomain *domain, MonoVTable *vtab g_ptr_array_free (sorted, TRUE); } +#ifndef __native_client__ + /* We don't re-use any thunks as there is a lot of overhead */ + /* to deleting and re-using code in Native Client. 
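+     (Installed NaCl code is validated and effectively immutable: reclaiming a
+      thunk would mean deleting that code region, which typically requires all
+      threads to reach a safe point, and then validating a replacement, so the
+      old thunk is simply left in place here.)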
*/ if (old_thunk != vtable_trampoline && old_thunk != imt_trampoline) invalidate_generic_virtual_thunk (domain, old_thunk); +#endif } mono_domain_unlock (domain); diff --git a/mono/mini/Makefile.am b/mono/mini/Makefile.am index c44c88397e4..a1ee64f563e 100644 --- a/mono/mini/Makefile.am +++ b/mono/mini/Makefile.am @@ -385,7 +385,6 @@ test_sources = \ basic-simd.cs regtests=basic.exe basic-float.exe basic-long.exe basic-calls.exe objects.exe arrays.exe basic-math.exe exceptions.exe iltests.exe devirtualization.exe generics.exe basic-simd.exe -fsatests=basic.exe basic-float.exe basic-long.exe basic-calls.exe objects.exe arrays.exe basic-math.exe exceptions.exe devirtualization.exe basic-simd.exe if X86 if MONO_DEBUGGER_SUPPORTED @@ -538,6 +537,9 @@ libmonoinclude_HEADERS = jit.h basic-simd.exe: basic-simd.cs $(MCS) -out:$@ $< -r:TestDriver.dll -r:Mono.Simd.dll +nacl.exe: nacl.cs + $(MCS) -out:$@ $< -r:TestDriver.dll -r:Mono.Simd.dll + generics.exe: generics.cs TestDriver.dll generics-variant-types.dll $(MCS) -out:$@ $< -r:TestDriver.dll -r:generics-variant-types.dll @@ -642,20 +644,6 @@ fullaotcheck: mono $(regtests) llvmfullaotcheck: $(MAKE) fullaotcheck LLVM=1 -fsacheck: mono $(fsatests) fsacheck.c generics.exe - rm -rf fsa-tmp - mkdir fsa-tmp - cp $(CLASS)/mscorlib.dll $(CLASS)/System.Core.dll $(CLASS)/System.dll $(CLASS)/Mono.Posix.dll $(CLASS)/System.Configuration.dll $(CLASS)/System.Security.dll $(CLASS)/System.Xml.dll $(CLASS)/Mono.Security.dll $(CLASS)/Mono.Simd.dll \ - $(fsatests) generics-variant-types.dll TestDriver.dll fsa-tmp/ - cp $(fsatests) fsa-tmp/ - MONO_PATH=fsa-tmp $(top_builddir)/runtime/mono-wrapper --aot=full,static fsa-tmp/*.dll || exit 1 - MONO_PATH=fsa-tmp $(top_builddir)/runtime/mono-wrapper --aot=full,static fsa-tmp/*.exe || exit 1 - $(CC) -o $@.out -g -static $(VPATH)/fsacheck.c fsa-tmp/*.o \ - -lmono-2.0 -lpthread -lm -ldl -lrt \ - -DTARGET_X86 -L.libs -I${prefix}/include/mono-2.0 \ - -I${prefix} -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include - for i in $(fsatests); do echo $$i; MONO_PATH=fsa-tmp ./$@.out $$i || exit 1; done - bench: mono test.exe time env $(RUNTIME) --ncompile $(count) --compile Test:$(mtest) test.exe diff --git a/mono/mini/aot-compiler.c b/mono/mini/aot-compiler.c index 35d9c911840..6f13e974c20 100644 --- a/mono/mini/aot-compiler.c +++ b/mono/mini/aot-compiler.c @@ -484,7 +484,7 @@ encode_sleb128 (gint32 value, guint8 *buf, guint8 **endbuf) #else #define AOT_FUNC_ALIGNMENT 16 #endif -#if defined(TARGET_X86) && defined(__native_client_codegen__) +#if (defined(TARGET_X86) || defined(TARGET_AMD64)) && defined(__native_client_codegen__) #undef AOT_FUNC_ALIGNMENT #define AOT_FUNC_ALIGNMENT 32 #endif @@ -698,8 +698,14 @@ arch_emit_plt_entry (MonoAotCompile *acfg, int index) { #if defined(TARGET_X86) guint32 offset = (acfg->plt_got_offset_base + index) * sizeof (gpointer); - -#ifdef __native_client_codegen__ +#if defined(__default_codegen__) + /* jmp *(%ebx) */ + emit_byte (acfg, 0xff); + emit_byte (acfg, 0xa3); + emit_int32 (acfg, offset); + /* Used by mono_aot_get_plt_info_offset */ + emit_int32 (acfg, acfg->plt_got_info_offsets [index]); +#elif defined(__native_client_codegen__) const guint8 kSizeOfNaClJmp = 11; guint8 bytes[kSizeOfNaClJmp]; guint8 *pbytes = &bytes[0]; @@ -711,15 +717,9 @@ arch_emit_plt_entry (MonoAotCompile *acfg, int index) emit_byte (acfg, 0x68); /* hide data in a push */ emit_int32 (acfg, acfg->plt_got_info_offsets [index]); emit_alignment (acfg, AOT_FUNC_ALIGNMENT); -#else - /* jmp *(%ebx) */ - emit_byte (acfg, 0xff); - 
emit_byte (acfg, 0xa3); - emit_int32 (acfg, offset); - /* Used by mono_aot_get_plt_info_offset */ - emit_int32 (acfg, acfg->plt_got_info_offsets [index]); -#endif /* __native_client_codegen__ */ +#endif /*__native_client_codegen__*/ #elif defined(TARGET_AMD64) +#if defined(__default_codegen__) /* * We can't emit jumps because they are 32 bits only so they can't be patched. * So we make indirect calls through GOT entries which are patched by the AOT @@ -731,6 +731,27 @@ arch_emit_plt_entry (MonoAotCompile *acfg, int index) emit_symbol_diff (acfg, acfg->got_symbol, ".", ((acfg->plt_got_offset_base + index) * sizeof (gpointer)) -4); /* Used by mono_aot_get_plt_info_offset */ emit_int32 (acfg, acfg->plt_got_info_offsets [index]); +#elif defined(__native_client_codegen__) + guint8 buf [256]; + guint8 *buf_aligned = ALIGN_TO(buf, kNaClAlignment); + guint8 *code = buf_aligned; + + /* mov (%rip), %r11d */ + emit_byte (acfg, '\x45'); + emit_byte (acfg, '\x8b'); + emit_byte (acfg, '\x1d'); + emit_symbol_diff (acfg, acfg->got_symbol, ".", ((acfg->plt_got_offset_base + index) * sizeof (gpointer)) -4); + + amd64_jump_reg (code, AMD64_R11); + /* This should be constant for the plt patch */ + g_assert ((size_t)(code-buf_aligned) == 10); + emit_bytes (acfg, buf_aligned, code - buf_aligned); + + /* Hide data in a push imm32 so it passes validation */ + emit_byte (acfg, 0x68); /* push */ + emit_int32 (acfg, acfg->plt_got_info_offsets [index]); + emit_alignment (acfg, AOT_FUNC_ALIGNMENT); +#endif /*__native_client_codegen__*/ #elif defined(TARGET_ARM) guint8 buf [256]; guint8 *code; @@ -814,6 +835,7 @@ arch_emit_specific_trampoline (MonoAotCompile *acfg, int offset, int *tramp_size * - all the trampolines should be of the same length. */ #if defined(TARGET_AMD64) +#if defined(__default_codegen__) /* This should be exactly 16 bytes long */ *tramp_size = 16; /* call *(%rip) */ @@ -822,8 +844,61 @@ arch_emit_specific_trampoline (MonoAotCompile *acfg, int offset, int *tramp_size emit_byte (acfg, '\x15'); emit_symbol_diff (acfg, acfg->got_symbol, ".", (offset * sizeof (gpointer)) - 4); /* This should be relative to the start of the trampoline */ - emit_symbol_diff (acfg, acfg->got_symbol, ".", (offset * sizeof (gpointer)) - 4 + 19); + emit_symbol_diff (acfg, acfg->got_symbol, ".", ((offset+1) * sizeof (gpointer)) + 7); emit_zero_bytes (acfg, 5); +#elif defined(__native_client_codegen__) + guint8 buf [256]; + guint8 *buf_aligned = ALIGN_TO(buf, kNaClAlignment); + guint8 *code = buf_aligned; + guint8 *call_start; + size_t call_len; + int got_offset; + + /* Emit this call in 'code' so we can find out how long it is. */ + amd64_call_reg (code, AMD64_R11); + call_start = mono_arch_nacl_skip_nops (buf_aligned); + call_len = code - call_start; + + /* The tramp_size is twice the NaCl alignment because it starts with */ + /* a call which needs to be aligned to the end of the boundary. */ + *tramp_size = kNaClAlignment*2; + { + /* Emit nops to align call site below which is 7 bytes plus */ + /* the length of the call sequence emitted above. */ + /* Note: this requires the specific trampoline starts on a */ + /* kNaclAlignedment aligned address, which it does because */ + /* it's its own function that is aligned. 
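+        A worked example, using a hypothetical call_len of 3 (the real value
+        comes from the amd64_call_reg emission above): with kNaClAlignment of
+        32 the padding is 32 - 7 - 3 = 22 nop bytes, so the 7-byte rip-relative
+        mov plus the call end exactly on the bundle boundary and the return
+        address is bundle-aligned, as the NaCl validator requires for calls.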
*/ + guint8 nop_buf[256]; + guint8 *nopbuf_aligned = ALIGN_TO (nop_buf, kNaClAlignment); + guint8 *nopbuf_end = mono_arch_nacl_pad (nopbuf_aligned, kNaClAlignment - 7 - (call_len)); + emit_bytes (acfg, nopbuf_aligned, nopbuf_end - nopbuf_aligned); + } + /* The trampoline is stored at the offset'th pointer, the -4 is */ + /* present because RIP relative addressing starts at the end of */ + /* the current instruction, while the label "." is relative to */ + /* the beginning of the current asm location, which in this case */ + /* is not the mov instruction, but the offset itself, due to the */ + /* way the bytes and ints are emitted here. */ + got_offset = (offset * sizeof(gpointer)) - 4; + + /* mov (%rip), %r11d */ + emit_byte (acfg, '\x45'); + emit_byte (acfg, '\x8b'); + emit_byte (acfg, '\x1d'); + emit_symbol_diff (acfg, acfg->got_symbol, ".", got_offset); + + /* naclcall %r11 */ + emit_bytes (acfg, call_start, call_len); + + /* The arg is stored at the offset+1 pointer, relative to beginning */ + /* of trampoline: 7 for mov, plus the call length, and 1 for push. */ + got_offset = ((offset + 1) * sizeof(gpointer)) + 7 + call_len + 1; + + /* We can't emit this data directly, hide in a "push imm32" */ + emit_byte (acfg, '\x68'); /* push */ + emit_symbol_diff (acfg, acfg->got_symbol, ".", got_offset); + emit_alignment (acfg, kNaClAlignment); +#endif /*__native_client_codegen__*/ #elif defined(TARGET_ARM) guint8 buf [128]; guint8 *code; @@ -1010,6 +1085,7 @@ static void arch_emit_static_rgctx_trampoline (MonoAotCompile *acfg, int offset, int *tramp_size) { #if defined(TARGET_AMD64) +#if defined(__default_codegen__) /* This should be exactly 13 bytes long */ *tramp_size = 13; @@ -1023,6 +1099,31 @@ arch_emit_static_rgctx_trampoline (MonoAotCompile *acfg, int offset, int *tramp_ emit_byte (acfg, '\xff'); emit_byte (acfg, '\x25'); emit_symbol_diff (acfg, acfg->got_symbol, ".", ((offset + 1) * sizeof (gpointer)) - 4); +#elif defined(__native_client_codegen__) + guint8 buf [128]; + guint8 *buf_aligned = ALIGN_TO(buf, kNaClAlignment); + guint8 *code = buf_aligned; + + /* mov (%rip), %r10d */ + emit_byte (acfg, '\x45'); + emit_byte (acfg, '\x8b'); + emit_byte (acfg, '\x15'); + emit_symbol_diff (acfg, acfg->got_symbol, ".", (offset * sizeof (gpointer)) - 4); + + /* mov (%rip), %r11d */ + emit_byte (acfg, '\x45'); + emit_byte (acfg, '\x8b'); + emit_byte (acfg, '\x1d'); + emit_symbol_diff (acfg, acfg->got_symbol, ".", ((offset + 1) * sizeof (gpointer)) - 4); + + /* nacljmp *%r11 */ + amd64_jump_reg (code, AMD64_R11); + emit_bytes (acfg, buf_aligned, code - buf_aligned); + + emit_alignment (acfg, kNaClAlignment); + *tramp_size = kNaClAlignment; +#endif /*__native_client_codegen__*/ + #elif defined(TARGET_ARM) guint8 buf [128]; guint8 *code; @@ -1132,50 +1233,74 @@ arch_emit_imt_thunk (MonoAotCompile *acfg, int offset, int *tramp_size) { #if defined(TARGET_AMD64) guint8 *buf, *code; +#if defined(__native_client_codegen__) + guint8 *buf_alloc; +#endif guint8 *labels [3]; + guint8 mov_buf[3]; + guint8 *mov_buf_ptr = mov_buf; + const int kSizeOfMove = 7; +#if defined(__default_codegen__) code = buf = g_malloc (256); +#elif defined(__native_client_codegen__) + buf_alloc = g_malloc (256 + kNaClAlignment + kSizeOfMove); + buf = ((guint)buf_alloc + kNaClAlignment) & ~kNaClAlignmentMask; + /* The RIP relative move below is emitted first */ + buf += kSizeOfMove; + code = buf; +#endif /* FIXME: Optimize this, i.e. use binary search etc. 
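   (The thunk below is a linear scan: MONO_ARCH_IMT_SCRATCH_REG walks an array
    of (key, target) pairs terminated by a null key, comparing each key against
    MONO_ARCH_IMT_REG and jumping through the matching target slot.)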
*/ /* Maybe move the body into a separate function (slower, but much smaller) */ - /* R11 is a free register */ + /* MONO_ARCH_IMT_SCRATCH_REG is a free register */ labels [0] = code; - amd64_alu_membase_imm (code, X86_CMP, AMD64_R11, 0, 0); + amd64_alu_membase_imm (code, X86_CMP, MONO_ARCH_IMT_SCRATCH_REG, 0, 0); labels [1] = code; - amd64_branch8 (code, X86_CC_Z, FALSE, 0); + amd64_branch8 (code, X86_CC_Z, 0, FALSE); /* Check key */ - amd64_alu_membase_reg (code, X86_CMP, AMD64_R11, 0, MONO_ARCH_IMT_REG); + amd64_alu_membase_reg_size (code, X86_CMP, MONO_ARCH_IMT_SCRATCH_REG, 0, MONO_ARCH_IMT_REG, sizeof (gpointer)); labels [2] = code; - amd64_branch8 (code, X86_CC_Z, FALSE, 0); + amd64_branch8 (code, X86_CC_Z, 0, FALSE); /* Loop footer */ - amd64_alu_reg_imm (code, X86_ADD, AMD64_R11, 2 * sizeof (gpointer)); + amd64_alu_reg_imm (code, X86_ADD, MONO_ARCH_IMT_SCRATCH_REG, 2 * sizeof (gpointer)); amd64_jump_code (code, labels [0]); /* Match */ mono_amd64_patch (labels [2], code); - amd64_mov_reg_membase (code, AMD64_R11, AMD64_R11, sizeof (gpointer), 8); - amd64_jump_membase (code, AMD64_R11, 0); + amd64_mov_reg_membase (code, MONO_ARCH_IMT_SCRATCH_REG, MONO_ARCH_IMT_SCRATCH_REG, sizeof (gpointer), sizeof (gpointer)); + amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0); /* No match */ /* FIXME: */ mono_amd64_patch (labels [1], code); x86_breakpoint (code); - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 12345678, 8); - - /* mov (%rip), %r11 */ - emit_byte (acfg, '\x4d'); - emit_byte (acfg, '\x8b'); - emit_byte (acfg, '\x1d'); + /* mov (%rip), MONO_ARCH_IMT_SCRATCH_REG */ + amd64_emit_rex (mov_buf_ptr, sizeof(gpointer), MONO_ARCH_IMT_SCRATCH_REG, 0, AMD64_RIP); + *(mov_buf_ptr)++ = (unsigned char)0x8b; /* mov opcode */ + x86_address_byte (mov_buf_ptr, 0, MONO_ARCH_IMT_SCRATCH_REG & 0x7, 5); + emit_bytes (acfg, mov_buf, mov_buf_ptr - mov_buf); emit_symbol_diff (acfg, acfg->got_symbol, ".", (offset * sizeof (gpointer)) - 4); emit_bytes (acfg, buf, code - buf); - *tramp_size = code - buf + 7; + *tramp_size = code - buf + kSizeOfMove; +#if defined(__native_client_codegen__) + /* The tramp will be padded to the next kNaClAlignment bundle. 
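+     (For example, a 45-byte thunk rounds up to 64 bytes when kNaClAlignment is
+      32, since ALIGN_TO (45, 32) == 64.)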
*/ + *tramp_size = ALIGN_TO ((*tramp_size), kNaClAlignment); +#endif + +#if defined(__default_codegen__) + g_free (buf); +#elif defined(__native_client_codegen__) + g_free (buf_alloc); +#endif + #elif defined(TARGET_X86) guint8 *buf, *code; #ifdef __native_client_codegen__ @@ -1183,11 +1308,11 @@ arch_emit_imt_thunk (MonoAotCompile *acfg, int offset, int *tramp_size) #endif guint8 *labels [3]; -#ifdef __native_client_codegen__ +#if defined(__default_codegen__) + code = buf = g_malloc (256); +#elif defined(__native_client_codegen__) buf_alloc = g_malloc (256 + kNaClAlignment); code = buf = ((guint)buf_alloc + kNaClAlignment) & ~kNaClAlignmentMask; -#else - code = buf = g_malloc (256); #endif /* Allocate a temporary stack slot */ @@ -1240,6 +1365,13 @@ arch_emit_imt_thunk (MonoAotCompile *acfg, int offset, int *tramp_size) emit_bytes (acfg, buf, code - buf); *tramp_size = code - buf; + +#if defined(__default_codegen__) + g_free (buf); +#elif defined(__native_client_codegen__) + g_free (buf_alloc); +#endif + #elif defined(TARGET_ARM) guint8 buf [128]; guint8 *code, *code2, *labels [16]; @@ -3916,7 +4048,7 @@ emit_plt (MonoAotCompile *acfg) sprintf (symbol, "plt"); emit_section_change (acfg, ".text", 0); - emit_alignment (acfg, 16); + emit_alignment (acfg, NACL_SIZE(16, kNaClAlignment)); emit_label (acfg, symbol); emit_label (acfg, acfg->plt_symbol); @@ -5038,7 +5170,17 @@ emit_code (MonoAotCompile *acfg) * Emit some padding so the local symbol for the first method doesn't have the * same address as 'methods'. */ +#if defined(__default_codegen__) emit_zero_bytes (acfg, 16); +#elif defined(__native_client_codegen__) + { + const int kPaddingSize = 16; + guint8 pad_buffer[kPaddingSize]; + mono_arch_nacl_pad (pad_buffer, kPaddingSize); + emit_bytes (acfg, pad_buffer, kPaddingSize); + } +#endif + for (l = acfg->method_order; l != NULL; l = l->next) { MonoCompile *cfg; @@ -6262,7 +6404,11 @@ compile_asm (MonoAotCompile *acfg) #endif #ifdef __native_client_codegen__ +#if defined(TARGET_AMD64) +#define AS_NAME "nacl64-as" +#else #define AS_NAME "nacl-as" +#endif #else #define AS_NAME "as" #endif diff --git a/mono/mini/aot-runtime.c b/mono/mini/aot-runtime.c index 9610c7859b2..115384028f0 100644 --- a/mono/mini/aot-runtime.c +++ b/mono/mini/aot-runtime.c @@ -1036,6 +1036,7 @@ load_aot_module (MonoAssembly *assembly, gpointer user_data) MonoAotFileInfo *info = NULL; int i, version; guint8 *blob; + gboolean do_load_image = TRUE; if (mono_compile_aot) return; @@ -1262,8 +1263,20 @@ load_aot_module (MonoAssembly *assembly, gpointer user_data) * non-lazily, since we can't handle out-of-date errors later. * The cached class info also depends on the exact assemblies. */ - for (i = 0; i < amodule->image_table_len; ++i) - load_image (amodule, i, FALSE); +#if defined(__native_client__) + /* TODO: Don't 'load_image' on mscorlib due to a */ + /* recursive loading problem. This should be */ + /* removed if mscorlib is loaded from disk. 
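+       (Note the inverted-looking test below: strncmp returns 0 on a match, so
+        the TRUE branch is taken for every assembly except mscorlib, and only
+        mscorlib itself skips the eager load_image loop.)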
*/ + if (strncmp(assembly->aname.name, "mscorlib", 8)) { + do_load_image = TRUE; + } else { + do_load_image = FALSE; + } +#endif + if (do_load_image) { + for (i = 0; i < amodule->image_table_len; ++i) + load_image (amodule, i, FALSE); + } if (amodule->out_of_date) { mono_trace (G_LOG_LEVEL_INFO, MONO_TRACE_AOT, "AOT Module %s is unusable because a dependency is out-of-date.\n", assembly->image->name); diff --git a/mono/mini/branch-opts.c b/mono/mini/branch-opts.c index 13a399830a5..141b90a2be2 100644 --- a/mono/mini/branch-opts.c +++ b/mono/mini/branch-opts.c @@ -815,6 +815,15 @@ replace_in_block (MonoBasicBlock *bb, MonoBasicBlock *orig, MonoBasicBlock *repl static void replace_out_block_in_code (MonoBasicBlock *bb, MonoBasicBlock *orig, MonoBasicBlock *repl) { MonoInst *ins; + +#if defined(__native_client_codegen__) + /* Need to maintain this flag for the new block because */ + /* we can't jump indirectly to a non-aligned block. */ + if (orig->flags & BB_INDIRECT_JUMP_TARGET) + { + repl->flags |= BB_INDIRECT_JUMP_TARGET; + } +#endif for (ins = bb->code; ins != NULL; ins = ins->next) { switch (ins->opcode) { diff --git a/mono/mini/cpu-amd64.md b/mono/mini/cpu-amd64.md index 31158c42df6..b095aa94280 100644 --- a/mono/mini/cpu-amd64.md +++ b/mono/mini/cpu-amd64.md @@ -53,6 +53,13 @@ # # See the code in mini-x86.c for more details on how the specifiers are used. # +# +# Native Client Note: NaCl call sequences do not really reach > 32 bytes but +# the maximum length can be high, so if we get unlucky and wind up trying to +# emit a call sequence such that we are one or two bytes too long, we need to +# pad out almost an entire 32 bytes. +# + break: len:2 jmp: len:120 tailcall: len:120 clob:c @@ -60,8 +67,8 @@ br: len:6 label: len:0 seq_point: len:25 -long_add: dest:i src1:i src2:i len:3 clob:1 -long_sub: dest:i src1:i src2:i len:3 clob:1 +long_add: dest:i src1:i src2:i len:3 clob:1 nacl:6 +long_sub: dest:i src1:i src2:i len:3 clob:1 nacl:6 long_mul: dest:i src1:i src2:i len:4 clob:1 long_div: dest:a src1:a src2:i len:16 clob:d long_div_un: dest:a src1:a src2:i len:16 clob:d @@ -96,11 +103,11 @@ long_min_un: dest:i src1:i src2:i len:16 clob:1 long_max: dest:i src1:i src2:i len:16 clob:1 long_max_un: dest:i src1:i src2:i len:16 clob:1 -throw: src1:i len:18 -rethrow: src1:i len:18 +throw: src1:i len:18 nacl:50 +rethrow: src1:i len:18 nacl:50 start_handler: len:16 -endfinally: len:9 -endfilter: src1:a len:9 +endfinally: len:9 nacl:22 +endfilter: src1:a len:9 nacl:19 ckfinite: dest:f src1:f len:43 ceq: dest:c len:8 cgt: dest:c len:8 @@ -115,11 +122,11 @@ compare_imm: src1:i len:13 icompare_imm: src1:i len:8 fcompare: src1:f src2:f clob:a len:13 oparglist: src1:b len:11 -checkthis: src1:b len:5 -call: dest:a clob:c len:32 -voidcall: clob:c len:32 -voidcall_reg: src1:i clob:c len:32 -voidcall_membase: src1:b clob:c len:32 +checkthis: src1:b len:5 nacl:8 +call: dest:a clob:c len:32 nacl:64 +voidcall: clob:c len:32 nacl:64 +voidcall_reg: src1:i clob:c len:32 nacl:64 +voidcall_membase: src1:b clob:c len:32 nacl:64 fcall: dest:f len:64 clob:c fcall_reg: dest:f src1:i len:64 clob:c fcall_membase: dest:f src1:b len:64 clob:c @@ -129,39 +136,39 @@ lcall_membase: dest:a src1:b len:64 clob:c vcall: len:64 clob:c vcall_reg: src1:i len:64 clob:c vcall_membase: src1:b len:64 clob:c -call_reg: dest:a src1:i len:32 clob:c -call_membase: dest:a src1:b len:32 clob:c +call_reg: dest:a src1:i len:32 clob:c nacl:64 +call_membase: dest:a src1:b len:32 clob:c nacl:64 iconst: dest:i len:10 i8const: dest:i len:10 r4const: 
dest:f len:14 r8const: dest:f len:9 store_membase_imm: dest:b len:15 -store_membase_reg: dest:b src1:i len:9 -storei8_membase_reg: dest:b src1:i len:9 -storei1_membase_imm: dest:b len:11 -storei1_membase_reg: dest:b src1:c len:9 -storei2_membase_imm: dest:b len:13 -storei2_membase_reg: dest:b src1:i len:9 -storei4_membase_imm: dest:b len:13 -storei4_membase_reg: dest:b src1:i len:9 +store_membase_reg: dest:b src1:i len:9 nacl:11 +storei8_membase_reg: dest:b src1:i len:9 nacl:11 +storei1_membase_imm: dest:b len:11 nacl:15 +storei1_membase_reg: dest:b src1:c len:9 nacl:11 +storei2_membase_imm: dest:b len:13 nacl:15 +storei2_membase_reg: dest:b src1:i len:9 nacl:11 +storei4_membase_imm: dest:b len:13 nacl:15 +storei4_membase_reg: dest:b src1:i len:9 nacl:11 storei8_membase_imm: dest:b len:18 storer4_membase_reg: dest:b src1:f len:15 storer8_membase_reg: dest:b src1:f len:10 -load_membase: dest:i src1:b len:8 -loadi1_membase: dest:c src1:b len:9 -loadu1_membase: dest:c src1:b len:9 -loadi2_membase: dest:i src1:b len:9 -loadu2_membase: dest:i src1:b len:9 -loadi4_membase: dest:i src1:b len:9 -loadu4_membase: dest:i src1:b len:9 -loadi8_membase: dest:i src1:b len:18 +load_membase: dest:i src1:b len:8 nacl:12 +loadi1_membase: dest:c src1:b len:9 nacl:12 +loadu1_membase: dest:c src1:b len:9 nacl:12 +loadi2_membase: dest:i src1:b len:9 nacl:12 +loadu2_membase: dest:i src1:b len:9 nacl:12 +loadi4_membase: dest:i src1:b len:9 nacl:12 +loadu4_membase: dest:i src1:b len:9 nacl:12 +loadi8_membase: dest:i src1:b len:18 nacl:14 loadr4_membase: dest:f src1:b len:16 loadr8_membase: dest:f src1:b len:16 loadu4_mem: dest:i len:10 amd64_loadi8_memindex: dest:i src1:i src2:i len:10 move: dest:i src1:i len:3 -add_imm: dest:i src1:i len:8 clob:1 -sub_imm: dest:i src1:i len:8 clob:1 +add_imm: dest:i src1:i len:8 clob:1 nacl:11 +sub_imm: dest:i src1:i len:8 clob:1 nacl:11 mul_imm: dest:i src1:i len:11 and_imm: dest:i src1:i len:8 clob:1 or_imm: dest:i src1:i len:8 clob:1 @@ -246,8 +253,9 @@ float_clt_membase: dest:i src1:f src2:b len:35 float_clt_un_membase: dest:i src1:f src2:b len:42 float_conv_to_u: dest:i src1:f len:46 fmove: dest:f src1:f len:8 -call_handler: len:14 clob:c +call_handler: len:14 clob:c nacl:52 aot_const: dest:i len:10 +nacl_gc_safe_point: clob:c x86_test_null: src1:i len:5 x86_compare_membase_reg: src1:b src2:i len:9 x86_compare_membase_imm: src1:b len:13 @@ -263,7 +271,7 @@ x86_push_imm: len:6 x86_push_membase: src1:b len:8 x86_push_obj: src1:b len:40 x86_lea: dest:i src1:i src2:i len:8 -x86_lea_membase: dest:i src1:i len:11 +x86_lea_membase: dest:i src1:i len:11 nacl:14 x86_xchg: src1:i src2:i clob:x len:2 x86_fpop: src1:f len:3 x86_seteq_membase: src1:b len:9 @@ -298,7 +306,7 @@ subcc: dest:i src1:i src2:i len:3 clob:1 adc_imm: dest:i src1:i len:8 clob:1 sbb: dest:i src1:i src2:i len:3 clob:1 sbb_imm: dest:i src1:i len:8 clob:1 -br_reg: src1:i len:3 +br_reg: src1:i len:3 nacl:8 sin: dest:f src1:f len:32 cos: dest:f src1:f len:32 abs: dest:f src1:f clob:1 len:32 @@ -310,8 +318,8 @@ sext_i2: dest:i src1:i len:4 sext_i4: dest:i src1:i len:8 # 32 bit opcodes -int_add: dest:i src1:i src2:i clob:1 len:4 -int_sub: dest:i src1:i src2:i clob:1 len:4 +int_add: dest:i src1:i src2:i clob:1 len:4 nacl:7 +int_sub: dest:i src1:i src2:i clob:1 len:4 nacl:7 int_mul: dest:i src1:i src2:i clob:1 len:4 int_mul_ovf: dest:i src1:i src2:i clob:1 len:32 int_mul_ovf_un: dest:i src1:i src2:i clob:1 len:32 @@ -331,8 +339,8 @@ int_sbb: dest:i src1:i src2:i clob:1 len:4 int_sbb_imm: dest:i src1:i clob:1 len:8 int_addcc: 
dest:i src1:i src2:i clob:1 len:16 int_subcc: dest:i src1:i src2:i clob:1 len:16 -int_add_imm: dest:i src1:i clob:1 len:8 -int_sub_imm: dest:i src1:i clob:1 len:8 +int_add_imm: dest:i src1:i clob:1 len:8 nacl:10 +int_sub_imm: dest:i src1:i clob:1 len:8 nacl:10 int_mul_imm: dest:i src1:i clob:1 len:32 int_div_imm: dest:a src1:i clob:d len:32 int_div_un_imm: dest:a src1:i clob:d len:32 @@ -438,8 +446,8 @@ cmov_lgt_un: dest:i src1:i src2:i len:16 clob:1 cmov_lle_un: dest:i src1:i src2:i len:16 clob:1 cmov_llt_un: dest:i src1:i src2:i len:16 clob:1 -long_add_imm: dest:i src1:i clob:1 len:12 -long_sub_imm: dest:i src1:i clob:1 len:12 +long_add_imm: dest:i src1:i clob:1 len:12 nacl:15 +long_sub_imm: dest:i src1:i clob:1 len:12 nacl:15 long_and_imm: dest:i src1:i clob:1 len:12 long_or_imm: dest:i src1:i clob:1 len:12 long_xor_imm: dest:i src1:i clob:1 len:12 @@ -486,7 +494,7 @@ vcall2: len:64 clob:c vcall2_reg: src1:i len:64 clob:c vcall2_membase: src1:b len:64 clob:c -dyn_call: src1:i src2:i len:64 clob:c +dyn_call: src1:i src2:i len:64 clob:c nacl:128 localloc_imm: dest:i len:84 diff --git a/mono/mini/cpu-x86.md b/mono/mini/cpu-x86.md index 7b4f876d45b..41dcbbedfad 100644 --- a/mono/mini/cpu-x86.md +++ b/mono/mini/cpu-x86.md @@ -247,6 +247,7 @@ call_handler: len:11 clob:c aot_const: dest:i len:5 load_gotaddr: dest:i len:64 got_entry: dest:i src1:b len:7 +nacl_gc_safe_point: clob:c x86_test_null: src1:i len:2 x86_compare_membase_reg: src1:b src2:i len:7 x86_compare_membase_imm: src1:b len:11 diff --git a/mono/mini/dominators.c b/mono/mini/dominators.c index 5024e066f19..ad35cdb35aa 100644 --- a/mono/mini/dominators.c +++ b/mono/mini/dominators.c @@ -384,6 +384,7 @@ mono_compute_natural_loops (MonoCompile *cfg) /* The loop body start is the first bblock in the order they will be emitted */ MonoBasicBlock *h = cfg->bblocks [i]; MonoBasicBlock *body_start = h; + MonoInst *inst; GList *l; for (l = h->loop_blocks; l; l = l->next) { @@ -394,6 +395,12 @@ mono_compute_natural_loops (MonoCompile *cfg) } } +#if defined(__native_client_codegen__) + /* Instrument the loop (GC back branch safe point) */ + MONO_INST_NEW (cfg, inst, OP_NACL_GC_SAFE_POINT); + inst->dreg = mono_alloc_dreg (cfg, STACK_I4); + mono_bblock_insert_before_ins (body_start, NULL, inst); +#endif body_start->loop_body_start = 1; } } diff --git a/mono/mini/driver.c b/mono/mini/driver.c index 1de8758c074..e8b6a23b3a6 100644 --- a/mono/mini/driver.c +++ b/mono/mini/driver.c @@ -115,7 +115,10 @@ opt_funcs [sizeof (int) * 8] = { }; #ifdef __native_client_codegen__ -extern guint8 nacl_align_byte; +extern gint8 nacl_align_byte; +#endif +#ifdef __native_client__ +extern char *nacl_mono_path; #endif #define DEFAULT_OPTIMIZATIONS ( \ @@ -1644,7 +1647,11 @@ mono_main (int argc, char* argv[]) mono_use_llvm = FALSE; #ifdef __native_client_codegen__ } else if (strcmp (argv [i], "--nacl-align-mask-off") == 0){ - nacl_align_byte = 0xff; + nacl_align_byte = -1; /* 0xff */ +#endif +#ifdef __native_client__ + } else if (strcmp (argv [i], "--nacl-mono-path") == 0){ + nacl_mono_path = g_strdup(argv[++i]); #endif } else { fprintf (stderr, "Unknown command line option: '%s'\n", argv [i]); @@ -1655,7 +1662,7 @@ mono_main (int argc, char* argv[]) #ifdef __native_client_codegen__ if (getenv ("MONO_NACL_ALIGN_MASK_OFF")) { - nacl_align_byte = 0xff; + nacl_align_byte = -1; /* 0xff */ } #endif diff --git a/mono/mini/exceptions-amd64.c b/mono/mini/exceptions-amd64.c index e683bf2a11f..cc883afbfb6 100644 --- a/mono/mini/exceptions-amd64.c +++ 
b/mono/mini/exceptions-amd64.c @@ -179,7 +179,9 @@ mono_arch_get_restore_context (MonoTrampInfo **info, gboolean aot) amd64_mov_reg_membase (code, AMD64_R12, AMD64_R11, G_STRUCT_OFFSET (MonoContext, r12), 8); amd64_mov_reg_membase (code, AMD64_R13, AMD64_R11, G_STRUCT_OFFSET (MonoContext, r13), 8); amd64_mov_reg_membase (code, AMD64_R14, AMD64_R11, G_STRUCT_OFFSET (MonoContext, r14), 8); +#if !defined(__native_client_codegen__) amd64_mov_reg_membase (code, AMD64_R15, AMD64_R11, G_STRUCT_OFFSET (MonoContext, r15), 8); +#endif if (mono_running_on_valgrind ()) { /* Prevent 'Address 0x... is just below the stack ptr.' errors */ @@ -195,6 +197,8 @@ mono_arch_get_restore_context (MonoTrampInfo **info, gboolean aot) /* jump to the saved IP */ amd64_jump_reg (code, AMD64_R11); + nacl_global_codeman_validate(&start, 256, &code); + mono_arch_flush_icache (start, code - start); if (info) @@ -219,8 +223,9 @@ mono_arch_get_call_filter (MonoTrampInfo **info, gboolean aot) guint32 pos; MonoJumpInfo *ji = NULL; GSList *unwind_ops = NULL; + const guint kMaxCodeSize = NACL_SIZE (128, 256); - start = code = mono_global_codeman_reserve (128); + start = code = mono_global_codeman_reserve (kMaxCodeSize); /* call_filter (MonoContext *ctx, unsigned long eip) */ code = start; @@ -252,7 +257,9 @@ mono_arch_get_call_filter (MonoTrampInfo **info, gboolean aot) amd64_mov_reg_membase (code, AMD64_R12, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoContext, r12), 8); amd64_mov_reg_membase (code, AMD64_R13, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoContext, r13), 8); amd64_mov_reg_membase (code, AMD64_R14, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoContext, r14), 8); +#if !defined(__native_client_codegen__) amd64_mov_reg_membase (code, AMD64_R15, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoContext, r15), 8); +#endif #ifdef TARGET_WIN32 amd64_mov_reg_membase (code, AMD64_RDI, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoContext, rdi), 8); amd64_mov_reg_membase (code, AMD64_RSI, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoContext, rsi), 8); @@ -275,7 +282,9 @@ mono_arch_get_call_filter (MonoTrampInfo **info, gboolean aot) amd64_leave (code); amd64_ret (code); - g_assert ((code - start) < 128); + g_assert ((code - start) < kMaxCodeSize); + + nacl_global_codeman_validate(&start, kMaxCodeSize, &code); mono_arch_flush_icache (start, code - start); @@ -405,10 +414,10 @@ get_throw_trampoline (MonoTrampInfo **info, gboolean rethrow, gboolean corlib, g guint8 *code; MonoJumpInfo *ji = NULL; GSList *unwind_ops = NULL; - int i, buf_size, stack_size, arg_offsets [16], regs_offset; + int i, stack_size, arg_offsets [16], regs_offset; + const guint kMaxCodeSize = NACL_SIZE (256, 512); - buf_size = 256; - start = code = mono_global_codeman_reserve (buf_size); + start = code = mono_global_codeman_reserve (kMaxCodeSize); /* The stack is unaligned on entry */ stack_size = 192 + 8; @@ -429,37 +438,37 @@ get_throw_trampoline (MonoTrampInfo **info, gboolean rethrow, gboolean corlib, g */ arg_offsets [0] = 0; - arg_offsets [1] = sizeof (gpointer); - arg_offsets [2] = sizeof (gpointer) * 2; - arg_offsets [3] = sizeof (gpointer) * 3; - regs_offset = sizeof (gpointer) * 4; + arg_offsets [1] = sizeof(mgreg_t); + arg_offsets [2] = sizeof(mgreg_t) * 2; + arg_offsets [3] = sizeof(mgreg_t) * 3; + regs_offset = sizeof(mgreg_t) * 4; /* Save registers */ for (i = 0; i < AMD64_NREG; ++i) if (i != AMD64_RSP) - amd64_mov_membase_reg (code, AMD64_RSP, regs_offset + (i * sizeof (gpointer)), i, 8); + amd64_mov_membase_reg (code, AMD64_RSP, regs_offset + (i * sizeof(mgreg_t)), i, sizeof(mgreg_t)); /* 
Save RSP */ - amd64_lea_membase (code, AMD64_RAX, AMD64_RSP, stack_size + sizeof (gpointer)); - amd64_mov_membase_reg (code, AMD64_RSP, regs_offset + (AMD64_RSP * sizeof (gpointer)), X86_EAX, 8); + amd64_lea_membase (code, AMD64_RAX, AMD64_RSP, stack_size + sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RSP, regs_offset + (AMD64_RSP * sizeof(mgreg_t)), X86_EAX, sizeof(mgreg_t)); /* Set arg1 == regs */ amd64_lea_membase (code, AMD64_RAX, AMD64_RSP, regs_offset); - amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [0], AMD64_RAX, 8); + amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [0], AMD64_RAX, sizeof(mgreg_t)); /* Set arg2 == eip */ if (llvm_abs) amd64_alu_reg_reg (code, X86_XOR, AMD64_RAX, AMD64_RAX); else - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, stack_size, 8); - amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [1], AMD64_RAX, 8); + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, stack_size, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [1], AMD64_RAX, sizeof(mgreg_t)); /* Set arg3 == exc/ex_token_index */ if (resume_unwind) - amd64_mov_membase_imm (code, AMD64_RSP, arg_offsets [2], 0, 8); + amd64_mov_membase_imm (code, AMD64_RSP, arg_offsets [2], 0, sizeof(mgreg_t)); else - amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [2], AMD64_ARG_REG1, 8); + amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [2], AMD64_ARG_REG1, sizeof(mgreg_t)); /* Set arg4 == rethrow/pc offset */ if (resume_unwind) { - amd64_mov_membase_imm (code, AMD64_RSP, arg_offsets [3], 0, 8); + amd64_mov_membase_imm (code, AMD64_RSP, arg_offsets [3], 0, sizeof(mgreg_t)); } else if (corlib) { - amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [3], AMD64_ARG_REG2, 8); + amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [3], AMD64_ARG_REG2, sizeof(mgreg_t)); if (llvm_abs) /* * The caller is LLVM code which passes the absolute address not a pc offset, @@ -468,7 +477,7 @@ get_throw_trampoline (MonoTrampInfo **info, gboolean rethrow, gboolean corlib, g */ amd64_neg_membase (code, AMD64_RSP, arg_offsets [3]); } else { - amd64_mov_membase_imm (code, AMD64_RSP, arg_offsets [3], rethrow, 8); + amd64_mov_membase_imm (code, AMD64_RSP, arg_offsets [3], rethrow, sizeof(mgreg_t)); } if (aot) { @@ -482,7 +491,9 @@ get_throw_trampoline (MonoTrampInfo **info, gboolean rethrow, gboolean corlib, g mono_arch_flush_icache (start, code - start); - g_assert ((code - start) < buf_size); + g_assert ((code - start) < kMaxCodeSize); + + nacl_global_codeman_validate(&start, kMaxCodeSize, &code); if (info) *info = mono_tramp_info_create (g_strdup (tramp_name), start, code - start, ji, unwind_ops); @@ -550,7 +561,7 @@ mono_arch_find_jit_info (MonoDomain *domain, MonoJitTlsData *jit_tls, *new_ctx = *ctx; if (ji != NULL) { - gssize regs [MONO_MAX_IREGS + 1]; + mgreg_t regs [MONO_MAX_IREGS + 1]; guint8 *cfa; guint32 unwind_info_len; guint8 *unwind_info; @@ -602,7 +613,7 @@ mono_arch_find_jit_info (MonoDomain *domain, MonoJitTlsData *jit_tls, new_ctx->r15 = regs [AMD64_R15]; /* The CFA becomes the new SP value */ - new_ctx->rsp = (gssize)cfa; + new_ctx->rsp = (mgreg_t)cfa; /* Adjust IP */ new_ctx->rip --; @@ -655,7 +666,7 @@ mono_arch_find_jit_info (MonoDomain *domain, MonoJitTlsData *jit_tls, * The rsp field is set just before the call which transitioned to native * code. Obtain the rip from the stack. 
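 * (Since rsp was captured before the call, the call instruction pushed the
 * return address just below it, which is why it is read back from
 * rsp - sizeof(mgreg_t) below.)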
*/ - rip = *(guint64*)((*lmf)->rsp - sizeof (gpointer)); + rip = *(guint64*)((*lmf)->rsp - sizeof(mgreg_t)); } ji = mini_jit_info_table_find (domain, (gpointer)rip, NULL); @@ -776,6 +787,10 @@ mono_arch_handle_exception (void *sigctx, gpointer obj, gboolean test_only) void mono_arch_sigctx_to_monoctx (void *sigctx, MonoContext *mctx) { +#if defined(__native_client_codegen__) || defined(__native_client__) + printf("WARNING: mono_arch_sigctx_to_monoctx() called!\n"); +#endif + #if defined(MONO_ARCH_USE_SIGACTION) ucontext_t *ctx = (ucontext_t*)sigctx; @@ -814,6 +829,10 @@ mono_arch_sigctx_to_monoctx (void *sigctx, MonoContext *mctx) void mono_arch_monoctx_to_sigctx (MonoContext *mctx, void *sigctx) { +#if defined(__native_client__) || defined(__native_client_codegen__) + printf("WARNING: mono_arch_monoctx_to_sigctx() called!\n"); +#endif + #if defined(MONO_ARCH_USE_SIGACTION) ucontext_t *ctx = (ucontext_t*)sigctx; @@ -971,8 +990,9 @@ mono_arch_get_throw_pending_exception (MonoTrampInfo **info, gboolean aot) gpointer throw_trampoline; MonoJumpInfo *ji = NULL; GSList *unwind_ops = NULL; + const guint kMaxCodeSize = NACL_SIZE (128, 256); - start = code = mono_global_codeman_reserve (128); + start = code = mono_global_codeman_reserve (kMaxCodeSize); /* We are in the frame of a managed method after a call */ /* @@ -1065,7 +1085,9 @@ mono_arch_get_throw_pending_exception (MonoTrampInfo **info, gboolean aot) /* Return to original code */ amd64_jump_reg (code, AMD64_R11); - g_assert ((code - start) < 128); + g_assert ((code - start) < kMaxCodeSize); + + nacl_global_codeman_validate(&start, kMaxCodeSize, &code); if (info) *info = mono_tramp_info_create (g_strdup_printf ("throw_pending_exception"), start, code - start, ji, unwind_ops); @@ -1407,10 +1429,12 @@ mono_tasklets_arch_restore (void) static guint8* saved = NULL; guint8 *code, *start; int cont_reg = AMD64_R9; /* register usable on both call conventions */ + const guint kMaxCodeSize = NACL_SIZE (64, 128); + if (saved) return (MonoContinuationRestore)saved; - code = start = mono_global_codeman_reserve (64); + code = start = mono_global_codeman_reserve (kMaxCodeSize); /* the signature is: restore (MonoContinuation *cont, int state, MonoLMF **lmf_addr) */ /* cont is in AMD64_ARG_REG1 ($rcx or $rdi) * state is in AMD64_ARG_REG2 ($rdx or $rsi) @@ -1436,7 +1460,9 @@ mono_tasklets_arch_restore (void) amd64_mov_reg_membase (code, AMD64_R12, AMD64_RCX, G_STRUCT_OFFSET (MonoLMF, r12), 8); amd64_mov_reg_membase (code, AMD64_R13, AMD64_RCX, G_STRUCT_OFFSET (MonoLMF, r13), 8); amd64_mov_reg_membase (code, AMD64_R14, AMD64_RCX, G_STRUCT_OFFSET (MonoLMF, r14), 8); +#if !defined(__native_client_codegen__) amd64_mov_reg_membase (code, AMD64_R15, AMD64_RCX, G_STRUCT_OFFSET (MonoLMF, r15), 8); +#endif #ifdef TARGET_WIN32 amd64_mov_reg_membase (code, AMD64_RDI, AMD64_RCX, G_STRUCT_OFFSET (MonoLMF, rdi), 8); amd64_mov_reg_membase (code, AMD64_RSI, AMD64_RCX, G_STRUCT_OFFSET (MonoLMF, rsi), 8); @@ -1449,7 +1475,10 @@ mono_tasklets_arch_restore (void) /* state is already in rax */ amd64_jump_membase (code, cont_reg, G_STRUCT_OFFSET (MonoContinuation, return_ip)); - g_assert ((code - start) <= 64); + g_assert ((code - start) <= kMaxCodeSize); + + nacl_global_codeman_validate(&start, kMaxCodeSize, &code); + saved = start; return (MonoContinuationRestore)saved; } diff --git a/mono/mini/exceptions-x86.c b/mono/mini/exceptions-x86.c index e6af5ee0824..d3f09813c20 100644 --- a/mono/mini/exceptions-x86.c +++ b/mono/mini/exceptions-x86.c @@ -308,6 +308,8 @@ 
mono_arch_get_restore_context (MonoTrampInfo **info, gboolean aot) /* jump to the saved IP */ x86_ret (code); + nacl_global_codeman_validate(&start, 128, &code); + if (info) *info = mono_tramp_info_create (g_strdup_printf ("restore_context"), start, code - start, ji, unwind_ops); else { @@ -335,11 +337,7 @@ mono_arch_get_call_filter (MonoTrampInfo **info, gboolean aot) guint8 *code; MonoJumpInfo *ji = NULL; GSList *unwind_ops = NULL; -#ifdef __native_client_codegen__ - guint kMaxCodeSize = 128; -#else - guint kMaxCodeSize = 64; -#endif /* __native_client_codegen__ */ + guint kMaxCodeSize = NACL_SIZE (64, 128); /* call_filter (MonoContext *ctx, unsigned long eip) */ start = code = mono_global_codeman_reserve (kMaxCodeSize); @@ -387,6 +385,8 @@ mono_arch_get_call_filter (MonoTrampInfo **info, gboolean aot) x86_leave (code); x86_ret (code); + nacl_global_codeman_validate(&start, kMaxCodeSize, &code); + if (info) *info = mono_tramp_info_create (g_strdup_printf ("call_filter"), start, code - start, ji, unwind_ops); else { @@ -515,11 +515,8 @@ get_throw_trampoline (const char *name, gboolean rethrow, gboolean llvm, gboolea int i, stack_size, stack_offset, arg_offsets [5], regs_offset; MonoJumpInfo *ji = NULL; GSList *unwind_ops = NULL; -#ifdef __native_client_codegen__ - guint kMaxCodeSize = 256; -#else - guint kMaxCodeSize = 128; -#endif + guint kMaxCodeSize = NACL_SIZE (128, 256); + start = code = mono_global_codeman_reserve (kMaxCodeSize); stack_size = 128; @@ -629,6 +626,8 @@ get_throw_trampoline (const char *name, gboolean rethrow, gboolean llvm, gboolea } x86_breakpoint (code); + nacl_global_codeman_validate(&start, kMaxCodeSize, &code); + g_assert ((code - start) < kMaxCodeSize); if (info) diff --git a/mono/mini/fsacheck.c b/mono/mini/fsacheck.c index e1d4160ac27..6ee66bb8c58 100644 --- a/mono/mini/fsacheck.c +++ b/mono/mini/fsacheck.c @@ -1,11 +1,14 @@ #include #include +#include +#include #include #include #include #include #include +#include extern void* mono_aot_module_mscorlib_info; extern void* mono_aot_module_System_Core_info; @@ -32,72 +35,97 @@ extern void* mono_aot_module_devirtualization_info; extern void* mono_aot_module_generics_info; extern void* mono_aot_module_generics_variant_types_info; extern void* mono_aot_module_basic_simd_info; +extern void* mono_aot_module_gc_stress_info; +extern void* mono_aot_module_imt_big_iface_test_info; +extern void* mono_aot_module_make_imt_test_info; /* extern void* mono_aot_module_thread_stress_info; */ - +extern void* mono_aot_module_iltests_info; extern void mono_aot_register_module(void *aot_info); extern void mono_aot_init(void); extern void mono_jit_set_aot_only(mono_bool aot_only); extern MonoDomain * mini_init (const char *filename, const char *runtime_version); - -void try_one(char *mname) { - MonoDomain *domain; - MonoAssembly *ma; - MonoImage *mi; - MonoClass *mc; - MonoMethodDesc *mmd; - MonoMethod *mm; - MonoObject *mo; - MonoArray *arg_array; - void *args [1]; - char *cstr_arg = "20"; - - mono_jit_set_aot_only(1); - domain = mono_jit_init(mname); - printf("mono domain: %p\n", domain); - - ma = mono_domain_assembly_open(domain, mname); - if (0 == ma) { - printf("ERROR: could not open mono assembly\n"); - exit(-1); +int run_all_test_methods(MonoClass *klass) { + void * iter = NULL; + MonoMethod *mm = NULL; + int count = 0; + int passed = 0; + printf("Running test methods without reflection\n"); + while (NULL != (mm = mono_class_get_methods(klass, &iter))) { + long expected_result; + const char *name = mono_method_get_name(mm); + 
char *end = NULL; + if (strncmp(name, "test_", 5)) continue; + printf("=== Test %d, method %s\n", count, mono_method_get_name(mm)); + expected_result = strtol(name + 5, &end, 10); + if (name == end) { + printf(" warning: could not determine expected return value\n"); + expected_result = 0; + } + MonoObject *mo = mono_runtime_invoke(mm, NULL, NULL, NULL); + int *ret = mono_object_unbox(mo); + if (ret && *ret == expected_result) { + printf(" passed!\n"); + passed++; + } else { + printf(" FAILED, expected %d, returned %p, %d\n", expected_result, ret, + ret != NULL ? *ret : 0); + } + count++; } - printf("opened mono assembly: %p\n", ma); - - mi = mono_assembly_get_image(ma); - printf("mono image: %p\n", mi); - - mo = mono_string_new(domain, cstr_arg); - mc = mono_class_from_name(mono_get_corlib(), "System", "String"); - printf("string class: %p\n", mc); - arg_array = mono_array_new(domain, mc, 1); - mono_array_setref(arg_array, 0, mo); - args[0] = arg_array; + if (count > 0) { + printf("============================================\n"); + printf("Final count: %d tests, %d pass, %.2f%%\n", count, passed, + (double)passed / count * 100.0); + } else { + printf("no test methods found.\n"); + } + return count; +} - mmd = mono_method_desc_new("Tests:Main()", 1); - mm = mono_method_desc_search_in_image(mmd, mi); - if (0 == mm) { - mmd = mono_method_desc_new("Tests:Main(string[])", 1); - mm = mono_method_desc_search_in_image(mmd, mi); - if (0 == mm) { - mmd = mono_method_desc_new("SimdTests:Main(string[])", 1); - mm = mono_method_desc_search_in_image(mmd, mi); - if (0 == mm) { - printf("Couldn't find Tests:Main(), Tests:Main(string[]) or SimdTests:Main(string[])\n"); - exit(-1); +#if defined(__native_client__) +extern void* mono_aot_module_nacl_info; +extern char* nacl_mono_path; +char *load_corlib_data() { + FILE *mscorlib; + static char *corlib_data = NULL; + if (corlib_data) return corlib_data; + + mscorlib = fopen("mscorlib.dll", "r"); + if (NULL != mscorlib) { + size_t size; + struct stat st; + if (0 == stat("mscorlib.dll", &st)) { + size = st.st_size; + printf("reading mscorlib.dll, size %ld\n", size); + corlib_data = malloc(size); + if (corlib_data != NULL) { + while (fread(corlib_data, 1, size, mscorlib) != 0) ; + if (!ferror(mscorlib)) { + mono_set_corlib_data(corlib_data, size); + } else { + perror("error reading mscorlib.dll"); + free(corlib_data); + corlib_data = NULL; + } + } else { + perror("Could not allocate memory"); } + } else { + perror("stat error"); } + fclose(mscorlib); } - printf("mono desc method: %p\n", mmd); - printf("mono method: %p\n", mm); - - mo = mono_runtime_invoke(mm, NULL, args, NULL); - printf("mono object: %p\n", mo); - - mono_jit_cleanup(domain); + return corlib_data; } +#endif -int main(int argc, char *argv[]) { +/* Initialize Mono. 
Must run only once per process */ +MonoDomain *init_mono(char *mname) { + MonoDomain *domain = NULL; +#ifdef AOT_VERSION + mono_jit_set_aot_only(1); mono_aot_register_module(mono_aot_module_mscorlib_info); mono_aot_register_module(mono_aot_module_TestDriver_info); mono_aot_register_module(mono_aot_module_System_Core_info); @@ -120,163 +148,122 @@ int main(int argc, char *argv[]) { mono_aot_register_module(mono_aot_module_basic_math_info); mono_aot_register_module(mono_aot_module_exceptions_info); mono_aot_register_module(mono_aot_module_devirtualization_info); - /* mono_aot_register_module(mono_aot_module_generics_info); mono_aot_register_module(mono_aot_module_generics_variant_types_info); - */ - - /* mono_aot_register_module(mono_aot_module_thread_stress_info); */ - if (argc < 2) { - printf("no test specified; running basic.exe\n"); - printf("==========================\n"); - try_one("basic.exe"); - printf("==========================\n"); - } else { - printf("\nProgram %s %s output:\n", argv[0], argv[1]); - printf("==========================\n\n"); - try_one(argv[1]); + mono_aot_register_module(mono_aot_module_gc_stress_info); + mono_aot_register_module(mono_aot_module_imt_big_iface_test_info); + mono_aot_register_module(mono_aot_module_iltests_info); +#endif + /* mono_aot_register_module(mono_aot_module_make_imt_test_info); */ + /* mono_aot_register_module(mono_aot_module_thread_stress_info); */ +#if defined(__native_client__) +#ifdef AOT_VERSION + mono_aot_register_module(mono_aot_module_nacl_info); +#endif + + /* Test file-less shortcut for loading mscorlib metadata */ + load_corlib_data(); + nacl_mono_path = strdup("."); +#endif + /* Uncomment the following if something is going wrong */ + /* mono_trace_set_level_string("info"); */ + domain = mono_jit_init(mname); + if (NULL == domain) { + printf("ERROR: mono_jit_init failure\n"); + exit(-1); } - - return 0; + return domain; } -#include -#include - -#include -#include -#include -#include -#include - -extern void* mono_aot_module_mscorlib_info; -extern void* mono_aot_module_System_Core_info; -extern void* mono_aot_module_System_info; -extern void* mono_aot_module_Mono_Posix_info; -extern void* mono_aot_module_System_Configuration_info; -extern void* mono_aot_module_System_Security_info; -extern void* mono_aot_module_System_Xml_info; -/* extern void* mono_aot_module_System_Threading_info; */ -extern void* mono_aot_module_Mono_Security_info; -extern void* mono_aot_module_Mono_Simd_info; -extern void* mono_aot_module_TestDriver_info; -extern void* mono_aot_module_basic_info; -extern void* mono_aot_module_basic_float_info; -extern void* mono_aot_module_basic_long_info; -extern void* mono_aot_module_basic_calls_info; -extern void* mono_aot_module_basic_simd_info; -extern void* mono_aot_module_objects_info; -extern void* mono_aot_module_arrays_info; -extern void* mono_aot_module_basic_math_info; -extern void* mono_aot_module_exceptions_info; -extern void* mono_aot_module_devirtualization_info; -extern void* mono_aot_module_generics_info; -extern void* mono_aot_module_generics_variant_types_info; -extern void* mono_aot_module_basic_simd_info; -/* extern void* mono_aot_module_thread_stress_info; */ - - -extern void mono_aot_register_module(void *aot_info); -extern void mono_aot_init(void); -extern void mono_jit_set_aot_only(mono_bool aot_only); -extern MonoDomain * mini_init (const char *filename, const char *runtime_version); - - -void try_one(char *mname) { - MonoDomain *domain; +/* Run all tests from one assembly file */ +int try_one(char 
*mname, MonoDomain *domain) { MonoAssembly *ma; MonoImage *mi; MonoClass *mc; MonoMethodDesc *mmd; MonoMethod *mm; MonoObject *mo; + MonoString *monostring_arg; MonoArray *arg_array; + int *failures = NULL; + const int kUseTestDriver = 1; + int test_count = 0; void *args [1]; - char *cstr_arg = "20"; - - mono_jit_set_aot_only(1); - domain = mono_jit_init(mname); - printf("mono domain: %p\n", domain); + char *cstr_arg = "--timing"; ma = mono_domain_assembly_open(domain, mname); - if (0 == ma) { + if (NULL == ma) { printf("ERROR: could not open mono assembly\n"); exit(-1); } - printf("opened mono assembly: %p\n", ma); mi = mono_assembly_get_image(ma); - printf("mono image: %p\n", mi); + if (NULL == mi) { + printf("ERROR: could not get assembly image\n"); + exit(-1); + } - mo = mono_string_new(domain, cstr_arg); + monostring_arg = mono_string_new(domain, cstr_arg); mc = mono_class_from_name(mono_get_corlib(), "System", "String"); - printf("string class: %p\n", mc); - arg_array = mono_array_new(domain, mc, 1); - mono_array_setref(arg_array, 0, mo); + if (0 == mc) { + printf("ERROR: could not find mono string class\n"); + exit(-1); + } + + // to pass a string argument, change the 0 to a 1 and uncomment + // mono_array_setref below + arg_array = mono_array_new(domain, mc, 0); + //mono_array_setref(arg_array, 0, monostring_arg); args[0] = arg_array; - mmd = mono_method_desc_new("Tests:Main()", 1); - mm = mono_method_desc_search_in_image(mmd, mi); - if (0 == mm) { - mmd = mono_method_desc_new("Tests:Main(string[])", 1); + if (!kUseTestDriver) { + mc = mono_class_from_name(mi, "", "Tests"); + if (NULL == mc) { + printf("could not open Tests class\n"); + exit(-1); + } + test_count = run_all_test_methods(mc); + } + /* If run_all_test_methods didn't find any tests, try Main */ + if (kUseTestDriver || test_count == 0) { + mmd = mono_method_desc_new("Tests:Main()", 1); mm = mono_method_desc_search_in_image(mmd, mi); if (0 == mm) { - mmd = mono_method_desc_new("SimdTests:Main(string[])", 1); + mmd = mono_method_desc_new("Tests:Main(string[])", 1); mm = mono_method_desc_search_in_image(mmd, mi); if (0 == mm) { - printf("Couldn't find Tests:Main(), Tests:Main(string[]) or SimdTests:Main(string[])\n"); + printf("Couldn't find Tests:Main() or Tests:Main(string[])\n"); exit(-1); } } - } - printf("mono desc method: %p\n", mmd); - printf("mono method: %p\n", mm); - - mo = mono_runtime_invoke(mm, NULL, args, NULL); - printf("mono object: %p\n", mo); - mono_jit_cleanup(domain); + mo = mono_runtime_invoke(mm, NULL, args, NULL); + failures = mo != NULL ? mono_object_unbox(mo) : NULL; + if (NULL == failures || *failures != 0) { + printf("--------------------> Failed"); + } + } + return failures != NULL ? *
failures : 1; } int main(int argc, char *argv[]) { - mono_aot_register_module(mono_aot_module_mscorlib_info); - mono_aot_register_module(mono_aot_module_TestDriver_info); - mono_aot_register_module(mono_aot_module_System_Core_info); - mono_aot_register_module(mono_aot_module_System_info); - mono_aot_register_module(mono_aot_module_Mono_Posix_info); - mono_aot_register_module(mono_aot_module_System_Configuration_info); - mono_aot_register_module(mono_aot_module_System_Security_info); - mono_aot_register_module(mono_aot_module_System_Xml_info); - mono_aot_register_module(mono_aot_module_Mono_Security_info); - /* mono_aot_register_module(mono_aot_module_System_Threading_info); */ - mono_aot_register_module(mono_aot_module_Mono_Simd_info); + MonoDomain *domain; + int failures = 0; - mono_aot_register_module(mono_aot_module_basic_info); - mono_aot_register_module(mono_aot_module_basic_float_info); - mono_aot_register_module(mono_aot_module_basic_long_info); - mono_aot_register_module(mono_aot_module_basic_calls_info); - mono_aot_register_module(mono_aot_module_basic_simd_info); - mono_aot_register_module(mono_aot_module_objects_info); - mono_aot_register_module(mono_aot_module_arrays_info); - mono_aot_register_module(mono_aot_module_basic_math_info); - mono_aot_register_module(mono_aot_module_exceptions_info); - mono_aot_register_module(mono_aot_module_devirtualization_info); - /* - mono_aot_register_module(mono_aot_module_generics_info); - mono_aot_register_module(mono_aot_module_generics_variant_types_info); - */ - - /* mono_aot_register_module(mono_aot_module_thread_stress_info); */ if (argc < 2) { printf("no test specified; running basic.exe\n"); - printf("==========================\n"); - try_one("basic.exe"); - printf("==========================\n"); + printf("================================\n"); + domain = init_mono("basic.exe"); + try_one("basic.exe", domain); } else { - printf("\nProgram %s %s output:\n", argv[0], argv[1]); - printf("==========================\n\n"); - try_one(argv[1]); + domain = init_mono(argv[1]); + int i; + for (i = 1; i < argc; i++) { + printf("\nRunning tests from %s:\n", argv[i]); + printf("===============================\n\n"); + failures += try_one(argv[i], domain); + } } - - return 0; + mono_jit_cleanup(domain); + return failures; } diff --git a/mono/mini/genmdesc.c b/mono/mini/genmdesc.c index 0c942afabaf..20a333b1670 100644 --- a/mono/mini/genmdesc.c +++ b/mono/mini/genmdesc.c @@ -11,6 +11,8 @@ #include #include +void __nacl_suspend_thread_if_needed() {} + #define MINI_OP(a,b,dest,src1,src2) b, #define MINI_OP3(a,b,dest,src1,src2,src3) b, /* keep in sync with the enum in mini.h */ diff --git a/mono/mini/genmdesc.pl b/mono/mini/genmdesc.pl index 7d66e31d761..8c13a6171dd 100644 --- a/mono/mini/genmdesc.pl +++ b/mono/mini/genmdesc.pl @@ -79,7 +79,7 @@ sub load_opcodes if ($arch =~ "__i386__") { $arch_define = "TARGET_X86"; } - if ($arch =~ " __x86_64__") { + if ($arch =~ "__x86_64__") { $arch_define = "TARGET_AMD64"; } if ($arch =~ "__arm__") { diff --git a/mono/mini/jit-icalls.c b/mono/mini/jit-icalls.c index 1e4cbf540d9..e5f08461daf 100644 --- a/mono/mini/jit-icalls.c +++ b/mono/mini/jit-icalls.c @@ -926,6 +926,16 @@ mono_lconv_to_r8_un (guint64 a) } #endif +#if defined(__native_client_codegen__) || defined(__native_client__) +/* When we cross-compile to Native Client we can't directly embed calls */ +/* to the math library on the host. 
This will use the fmod on the target*/ +double +mono_fmod(double a, double b) +{ + return fmod(a, b); +} +#endif + gpointer mono_helper_compile_generic_method (MonoObject *obj, MonoMethod *method, gpointer *this_arg) { diff --git a/mono/mini/jit-icalls.h b/mono/mini/jit-icalls.h index d0c7214a5ff..16679d36ad0 100644 --- a/mono/mini/jit-icalls.h +++ b/mono/mini/jit-icalls.h @@ -85,6 +85,10 @@ double mono_conv_to_r8_un (guint32 a) MONO_INTERNAL; double mono_lconv_to_r8_un (guint64 a) MONO_INTERNAL; +#if defined(__native_client_codegen__) || defined(__native_client__) +double mono_fmod(double a, double b) MONO_INTERNAL; +#endif + gpointer mono_helper_compile_generic_method (MonoObject *obj, MonoMethod *method, gpointer *this_arg) MONO_INTERNAL; MonoString *mono_helper_ldstr (MonoImage *image, guint32 idx) MONO_INTERNAL; diff --git a/mono/mini/method-to-ir.c b/mono/mini/method-to-ir.c index e26aef5a752..a34d674696d 100644 --- a/mono/mini/method-to-ir.c +++ b/mono/mini/method-to-ir.c @@ -832,7 +832,7 @@ type_from_op (MonoInst *ins, MonoInst *src1, MonoInst *src2) { case OP_LCOMPARE: case OP_ICOMPARE: ins->type = bin_comp_table [src1->type] [src2->type] ? STACK_I4: STACK_INV; - if ((src1->type == STACK_I8) || ((SIZEOF_REGISTER == 8) && ((src1->type == STACK_PTR) || (src1->type == STACK_OBJ) || (src1->type == STACK_MP)))) + if ((src1->type == STACK_I8) || ((SIZEOF_VOID_P == 8) && ((src1->type == STACK_PTR) || (src1->type == STACK_OBJ) || (src1->type == STACK_MP)))) ins->opcode = OP_LCOMPARE; else if (src1->type == STACK_R8) ins->opcode = OP_FCOMPARE; @@ -841,7 +841,7 @@ type_from_op (MonoInst *ins, MonoInst *src1, MonoInst *src2) { break; case OP_ICOMPARE_IMM: ins->type = bin_comp_table [src1->type] [src1->type] ? STACK_I4 : STACK_INV; - if ((src1->type == STACK_I8) || ((SIZEOF_REGISTER == 8) && ((src1->type == STACK_PTR) || (src1->type == STACK_OBJ) || (src1->type == STACK_MP)))) + if ((src1->type == STACK_I8) || ((SIZEOF_VOID_P == 8) && ((src1->type == STACK_PTR) || (src1->type == STACK_OBJ) || (src1->type == STACK_MP)))) ins->opcode = OP_LCOMPARE_IMM; break; case CEE_BEQ: @@ -929,7 +929,7 @@ type_from_op (MonoInst *ins, MonoInst *src1, MonoInst *src2) { break; case STACK_PTR: case STACK_MP: -#if SIZEOF_REGISTER == 8 +#if SIZEOF_VOID_P == 8 ins->opcode = OP_LCONV_TO_U; #else ins->opcode = OP_MOVE; @@ -5730,6 +5730,11 @@ mono_method_to_ir (MonoCompile *cfg, MonoMethod *method, MonoBasicBlock *start_b cfg->bb_entry = start_bblock; start_bblock->cil_code = NULL; start_bblock->cil_length = 0; +#if defined(__native_client_codegen__) + MONO_INST_NEW (cfg, ins, OP_NACL_GC_SAFE_POINT); + ins->dreg = alloc_dreg (cfg, STACK_I4); + MONO_ADD_INS (start_bblock, ins); +#endif /* EXIT BLOCK */ NEW_BBLOCK (cfg, end_bblock); @@ -9902,7 +9907,7 @@ mono_method_to_ir (MonoCompile *cfg, MonoMethod *method, MonoBasicBlock *start_b cmp->sreg2 = sp [1]->dreg; type_from_op (cmp, sp [0], sp [1]); CHECK_TYPE (cmp); - if ((sp [0]->type == STACK_I8) || ((SIZEOF_REGISTER == 8) && ((sp [0]->type == STACK_PTR) || (sp [0]->type == STACK_OBJ) || (sp [0]->type == STACK_MP)))) + if ((sp [0]->type == STACK_I8) || ((SIZEOF_VOID_P == 8) && ((sp [0]->type == STACK_PTR) || (sp [0]->type == STACK_OBJ) || (sp [0]->type == STACK_MP)))) cmp->opcode = OP_LCOMPARE; else if (sp [0]->type == STACK_R8) cmp->opcode = OP_FCOMPARE; @@ -10835,7 +10840,11 @@ op_to_op_src1_membase (int load_opcode, int opcode) switch (opcode) { case OP_X86_PUSH: +#ifdef __mono_ilp32__ + if (load_opcode == OP_LOADI8_MEMBASE) +#else if ((load_opcode == 
OP_LOAD_MEMBASE) || (load_opcode == OP_LOADI8_MEMBASE)) +#endif return OP_X86_PUSH_MEMBASE; break; /* FIXME: This only works for 32 bit immediates @@ -10850,7 +10859,13 @@ op_to_op_src1_membase (int load_opcode, int opcode) break; case OP_COMPARE: case OP_LCOMPARE: +#ifdef __mono_ilp32__ + if (load_opcode == OP_LOAD_MEMBASE) + return OP_AMD64_ICOMPARE_MEMBASE_REG; + if (load_opcode == OP_LOADI8_MEMBASE) +#else if ((load_opcode == OP_LOAD_MEMBASE) || (load_opcode == OP_LOADI8_MEMBASE)) +#endif return OP_AMD64_COMPARE_MEMBASE_REG; break; case OP_ICOMPARE: @@ -10888,7 +10903,11 @@ op_to_op_src2_membase (int load_opcode, int opcode) #endif #ifdef TARGET_AMD64 +#ifdef __mono_ilp32__ + if ((load_opcode == OP_LOADI4_MEMBASE) || (load_opcode == OP_LOADU4_MEMBASE) || (load_opcode == OP_LOAD_MEMBASE) ) { +#else if ((load_opcode == OP_LOADI4_MEMBASE) || (load_opcode == OP_LOADU4_MEMBASE)) { +#endif switch (opcode) { case OP_ICOMPARE: return OP_AMD64_ICOMPARE_REG_MEMBASE; @@ -10903,7 +10922,11 @@ op_to_op_src2_membase (int load_opcode, int opcode) case OP_IXOR: return OP_X86_XOR_REG_MEMBASE; } +#ifdef __mono_ilp32__ + } else if (load_opcode == OP_LOADI8_MEMBASE) { +#else } else if ((load_opcode == OP_LOADI8_MEMBASE) || (load_opcode == OP_LOAD_MEMBASE)) { +#endif switch (opcode) { case OP_COMPARE: case OP_LCOMPARE: diff --git a/mono/mini/mini-amd64.c b/mono/mini/mini-amd64.c index cb1edb8cdc0..49c3a53c715 100644 --- a/mono/mini/mini-amd64.c +++ b/mono/mini/mini-amd64.c @@ -205,11 +205,278 @@ amd64_is_near_call (guint8 *code) return code [0] == 0xe8; } +#ifdef __native_client_codegen__ + +/* Keep track of instruction "depth", that is, the level of sub-instruction */ +/* for any given instruction. For instance, amd64_call_reg resolves to */ +/* amd64_call_reg_internal, which uses amd64_alu_* macros, etc. */ +/* We only want to force bundle alignment for the top level instruction, */ +/* so NaCl pseudo-instructions can be implemented with sub instructions. 
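   In rough outline, a top-level emit looks like this (a sketch of the
   intended pattern, not the literal macro expansion):
     amd64_nacl_instruction_pre ();                  depth 0 -> 1
     ... emit the instruction, possibly through sub-macros
         that also call the pre/post pair ...
     amd64_nacl_instruction_post (&start, &code);    depth back to 0
   Only the outermost post call, the one that returns the depth to zero,
   forces bundle alignment.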
*/ +static guint32 nacl_instruction_depth; + +static guint32 nacl_rex_tag; +static guint32 nacl_legacy_prefix_tag; + +void +amd64_nacl_clear_legacy_prefix_tag () +{ + TlsSetValue (nacl_legacy_prefix_tag, NULL); +} + +void +amd64_nacl_tag_legacy_prefix (guint8* code) +{ + if (TlsGetValue (nacl_legacy_prefix_tag) == NULL) + TlsSetValue (nacl_legacy_prefix_tag, code); +} + +void +amd64_nacl_tag_rex (guint8* code) +{ + TlsSetValue (nacl_rex_tag, code); +} + +guint8* +amd64_nacl_get_legacy_prefix_tag () +{ + return (guint8*)TlsGetValue (nacl_legacy_prefix_tag); +} + +guint8* +amd64_nacl_get_rex_tag () +{ + return (guint8*)TlsGetValue (nacl_rex_tag); +} + +/* Increment the instruction "depth" described above */ +void +amd64_nacl_instruction_pre () +{ + intptr_t depth = (intptr_t) TlsGetValue (nacl_instruction_depth); + depth++; + TlsSetValue (nacl_instruction_depth, (gpointer)depth); +} + +/* amd64_nacl_instruction_post: Decrement instruction "depth", force bundle */ +/* alignment if depth == 0 (top level instruction) */ +/* IN: start, end pointers to instruction beginning and end */ +/* OUT: start, end pointers to beginning and end after possible alignment */ +/* GLOBALS: nacl_instruction_depth defined above */ +void +amd64_nacl_instruction_post (guint8 **start, guint8 **end) +{ + intptr_t depth = (intptr_t) TlsGetValue(nacl_instruction_depth); + depth--; + TlsSetValue (nacl_instruction_depth, (void*)depth); + + g_assert ( depth >= 0 ); + if (depth == 0) { + uintptr_t space_in_block; + uintptr_t instlen; + guint8 *prefix = amd64_nacl_get_legacy_prefix_tag (); + /* if legacy prefix is present, and if it was emitted before */ + /* the start of the instruction sequence, adjust the start */ + if (prefix != NULL && prefix < *start) { + g_assert (*start - prefix <= 3);/* only 3 are allowed */ + *start = prefix; + } + space_in_block = kNaClAlignment - ((uintptr_t)(*start) & kNaClAlignmentMask); + instlen = (uintptr_t)(*end - *start); + /* Only check for instructions which are less than */ + /* kNaClAlignment. The only instructions that should ever */ + /* be that long are call sequences, which are already */ + /* padded out to align the return to the next bundle. */ + if (instlen > space_in_block && instlen < kNaClAlignment) { + const size_t MAX_NACL_INST_LENGTH = kNaClAlignment; + guint8 copy_of_instruction[MAX_NACL_INST_LENGTH]; + const size_t length = (size_t)((*end)-(*start)); + g_assert (length < MAX_NACL_INST_LENGTH); + + memcpy (copy_of_instruction, *start, length); + *start = mono_arch_nacl_pad (*start, space_in_block); + memcpy (*start, copy_of_instruction, length); + *end = *start + length; + } + amd64_nacl_clear_legacy_prefix_tag (); + amd64_nacl_tag_rex (NULL); + } +} + +/* amd64_nacl_membase_handler: ensure all access to memory of the form */ +/* OFFSET(%rXX) is sandboxed. For allowable base registers %rip, %rbp, */ +/* %rsp, and %r15, emit the membase as usual. 
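   (A plain 16(%rbp), for example, is emitted unchanged.)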
For all other registers, */ +/* make sure the upper 32-bits are cleared, and use that register in the */ +/* index field of a new address of this form: OFFSET(%r15,%eXX,1) */ +/* IN: code */ +/* pointer to current instruction stream (in the */ +/* middle of an instruction, after opcode is emitted) */ +/* basereg/offset/dreg */ +/* operands of normal membase address */ +/* OUT: code */ +/* pointer to the end of the membase/memindex emit */ +/* GLOBALS: nacl_rex_tag */ +/* position in instruction stream that rex prefix was emitted */ +/* nacl_legacy_prefix_tag */ +/* (possibly NULL) position in instruction of legacy x86 prefix */ +void +amd64_nacl_membase_handler (guint8** code, gint8 basereg, gint32 offset, gint8 dreg) +{ + gint8 true_basereg = basereg; + + /* Cache these values, they might change */ + /* as new instructions are emitted below. */ + guint8* rex_tag = amd64_nacl_get_rex_tag (); + guint8* legacy_prefix_tag = amd64_nacl_get_legacy_prefix_tag (); + + /* 'basereg' is given masked to 0x7 at this point, so check */ + /* the rex prefix to see if this is an extended register. */ + if ((rex_tag != NULL) && IS_REX(*rex_tag) && (*rex_tag & AMD64_REX_B)) { + true_basereg |= 0x8; + } + +#define X86_LEA_OPCODE (0x8D) + + if (!amd64_is_valid_nacl_base (true_basereg) && (*(*code-1) != X86_LEA_OPCODE)) { + guint8* old_instruction_start; + + /* This will hold the 'mov %eXX, %eXX' that clears the upper */ + /* 32-bits of the old base register (new index register) */ + guint8 buf[32]; + guint8* buf_ptr = buf; + size_t insert_len; + + g_assert (rex_tag != NULL); + + if (IS_REX(*rex_tag)) { + /* The old rex.B should be the new rex.X */ + if (*rex_tag & AMD64_REX_B) { + *rex_tag |= AMD64_REX_X; + } + /* Since our new base is %r15 set rex.B */ + *rex_tag |= AMD64_REX_B; + } else { + /* Shift the instruction by one byte */ + /* so we can insert a rex prefix */ + memmove (rex_tag + 1, rex_tag, (size_t)(*code - rex_tag)); + *code += 1; + /* New rex prefix only needs rex.B for %r15 base */ + *rex_tag = AMD64_REX(AMD64_REX_B); + } + + if (legacy_prefix_tag) { + old_instruction_start = legacy_prefix_tag; + } else { + old_instruction_start = rex_tag; + } + + /* Clears the upper 32-bits of the previous base register */ + amd64_mov_reg_reg_size (buf_ptr, true_basereg, true_basereg, 4); + insert_len = buf_ptr - buf; + + /* Move the old instruction forward to make */ + /* room for 'mov' stored in 'buf_ptr' */ + memmove (old_instruction_start + insert_len, old_instruction_start, (size_t)(*code - old_instruction_start)); + *code += insert_len; + memcpy (old_instruction_start, buf, insert_len); + + /* Sandboxed replacement for the normal membase_emit */ + x86_memindex_emit (*code, dreg, AMD64_R15, offset, basereg, 0); + + } else { + /* Normal default behavior, emit membase memory location */ + x86_membase_emit_body (*code, dreg, basereg, offset); + } +} + + +static inline unsigned char* +amd64_skip_nops (unsigned char* code) +{ + guint8 in_nop; + do { + in_nop = 0; + if ( code[0] == 0x90) { + in_nop = 1; + code += 1; + } + if ( code[0] == 0x66 && code[1] == 0x90) { + in_nop = 1; + code += 2; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x00) { + in_nop = 1; + code += 3; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x40 && code[3] == 0x00) { + in_nop = 1; + code += 4; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x44 && code[3] == 0x00 + && code[4] == 0x00) { + in_nop = 1; + code += 5; + } + if (code[0] == 0x66 && code[1] == 0x0f + && code[2] == 0x1f && code[3] == 
0x44 + && code[4] == 0x00 && code[5] == 0x00) { + in_nop = 1; + code += 6; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x80 && code[3] == 0x00 + && code[4] == 0x00 && code[5] == 0x00 + && code[6] == 0x00) { + in_nop = 1; + code += 7; + } + if (code[0] == 0x0f && code[1] == 0x1f + && code[2] == 0x84 && code[3] == 0x00 + && code[4] == 0x00 && code[5] == 0x00 + && code[6] == 0x00 && code[7] == 0x00) { + in_nop = 1; + code += 8; + } + } while ( in_nop ); + return code; +} + +guint8* +mono_arch_nacl_skip_nops (guint8* code) +{ + return amd64_skip_nops(code); +} + +#endif /*__native_client_codegen__*/ + static inline void amd64_patch (unsigned char* code, gpointer target) { guint8 rex = 0; +#ifdef __native_client_codegen__ + code = amd64_skip_nops (code); +#endif +#if defined(__native_client_codegen__) && defined(__native_client__) + if (nacl_is_code_address (code)) { + /* For tail calls, code is patched after being installed */ + /* but not through the normal "patch callsite" method. */ + unsigned char buf[kNaClAlignment]; + unsigned char *aligned_code = (uintptr_t)code & ~kNaClAlignmentMask; + int ret; + memcpy (buf, aligned_code, kNaClAlignment); + /* Patch a temp buffer of bundle size, */ + /* then install to actual location. */ + amd64_patch (buf + ((uintptr_t)code - (uintptr_t)aligned_code), target); + ret = nacl_dyncode_modify (aligned_code, buf, kNaClAlignment); + g_assert (ret == 0); + return; + } + target = nacl_modify_patch_target (target); +#endif + /* Skip REX */ if ((code [0] >= 0x40) && (code [0] <= 0x4f)) { rex = code [0]; @@ -302,7 +569,9 @@ add_general (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo) if (*gr >= PARAM_REGS) { ainfo->storage = ArgOnStack; - (*stack_size) += sizeof (gpointer); + /* Since the same stack slot size is used for all arg */ + /* types, it needs to be big enough to hold them all */ + (*stack_size) += sizeof(mgreg_t); } else { ainfo->storage = ArgInIReg; @@ -324,7 +593,9 @@ add_float (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo, gboolean is_double) if (*gr >= FLOAT_PARAM_REGS) { ainfo->storage = ArgOnStack; - (*stack_size) += sizeof (gpointer); + /* Since the same stack slot size is used for both float */ + /* types, it needs to be big enough to hold them both */ + (*stack_size) += sizeof(mgreg_t); } else { /* A double register */ @@ -419,6 +690,32 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1) return class1; } +#ifdef __native_client_codegen__ +const guint kNaClAlignment = kNaClAlignmentAMD64; +const guint kNaClAlignmentMask = kNaClAlignmentMaskAMD64; + +/* Default alignment for Native Client is 32-byte. */ +gint8 nacl_align_byte = -32; /* signed version of 0xe0 */ + +/* mono_arch_nacl_pad: Add pad bytes of alignment instructions at code, */ +/* Check that alignment doesn't cross an alignment boundary. 
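   A typical caller computes the pad from the current code position and only
   pads when not already on a bundle boundary; roughly (mirroring the
   basic-block alignment code later in this patch):
     pad = kNaClAlignment - ((uintptr_t)code & kNaClAlignmentMask);
     if (pad != kNaClAlignment)
         code = mono_arch_nacl_pad (code, pad);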
*/ +guint8* +mono_arch_nacl_pad(guint8 *code, int pad) +{ + const int kMaxPadding = 8; /* see amd64-codegen.h:amd64_padding_size() */ + + if (pad == 0) return code; + /* assertion: alignment cannot cross a block boundary */ + g_assert (((uintptr_t)code & (~kNaClAlignmentMask)) == + (((uintptr_t)code + pad - 1) & (~kNaClAlignmentMask))); + while (pad >= kMaxPadding) { + amd64_padding (code, kMaxPadding); + pad -= kMaxPadding; + } + if (pad != 0) amd64_padding (code, pad); + return code; +} +#endif static void add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type, @@ -426,6 +723,9 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn guint32 *gr, guint32 *fr, guint32 *stack_size) { guint32 size, quad, nquads, i; + /* Keep track of the size used in each quad so we can */ + /* use the right size when copying args/return vars. */ + guint32 quadsize [2] = {8, 8}; ArgumentClass args [2]; MonoMarshalType *info = NULL; MonoClass *klass; @@ -454,6 +754,24 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn } #endif + /* If this struct can't be split up naturally into 8-byte */ + /* chunks (registers), pass it on the stack. */ + if (sig->pinvoke && !pass_on_stack) { + info = mono_marshal_load_type_info (klass); + g_assert(info); + guint32 align; + guint32 field_size; + for (i = 0; i < info->num_fields; ++i) { + field_size = mono_marshal_type_size (info->fields [i].field->type, + info->fields [i].mspec, + &align, TRUE, klass->unicode); + if ((info->fields [i].offset < 8) && (info->fields [i].offset + field_size) > 8) { + pass_on_stack = TRUE; + break; + } + } + } + if (pass_on_stack) { /* Allways pass in memory */ ainfo->offset = *stack_size; @@ -553,6 +871,10 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn if ((quad == 1) && (info->fields [i].offset < 8)) continue; + /* How far into this quad this data extends.*/ + /* (8 is size of quad) */ + quadsize [quad] = info->fields [i].offset + size - (quad * 8); + class1 = merge_argument_class_from_type (info->fields [i].field->type, class1); } g_assert (class1 != ARG_CLASS_NO_CLASS); @@ -590,7 +912,9 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn if (*fr >= FLOAT_PARAM_REGS) args [quad] = ARG_CLASS_MEMORY; else { - ainfo->pair_storage [quad] = ArgInDoubleSSEReg; + if (quadsize[quad] <= 4) + ainfo->pair_storage [quad] = ArgInFloatSSEReg; + else ainfo->pair_storage [quad] = ArgInDoubleSSEReg; ainfo->pair_regs [quad] = *fr; (*fr) ++; } @@ -611,7 +935,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn if (sig->pinvoke) *stack_size += ALIGN_TO (info->native_size, 8); else - *stack_size += nquads * sizeof (gpointer); + *stack_size += nquads * sizeof(mgreg_t); ainfo->storage = ArgOnStack; } } @@ -910,6 +1234,9 @@ mono_amd64_tail_call_supported (MonoMethodSignature *caller_sig, MonoMethodSigna static int cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx) { +#if defined(MONO_CROSS_COMPILE) + return 0; +#else #ifndef _MSC_VER __asm__ __volatile__ ("cpuid" : "=a" (*p_eax), "=b" (*p_ebx), "=c" (*p_ecx), "=d" (*p_edx) @@ -923,6 +1250,7 @@ cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx) *p_edx = info[3]; #endif return 1; +#endif } /* @@ -956,6 +1284,12 @@ mono_arch_init (void) int flags; InitializeCriticalSection (&mini_arch_mutex); +#if defined(__native_client_codegen__) + nacl_instruction_depth = TlsAlloc (); + TlsSetValue 
(nacl_instruction_depth, (gpointer)0); + nacl_rex_tag = TlsAlloc (); + nacl_legacy_prefix_tag = TlsAlloc (); +#endif #ifdef MONO_ARCH_NOMAP32BIT flags = MONO_MMAP_READ; @@ -988,6 +1322,11 @@ void mono_arch_cleanup (void) { DeleteCriticalSection (&mini_arch_mutex); +#if defined(__native_client_codegen__) + TlsFree (nacl_instruction_depth); + TlsFree (nacl_rex_tag); + TlsFree (nacl_legacy_prefix_tag); +#endif } /* @@ -1119,6 +1458,13 @@ mono_arch_compute_omit_fp (MonoCompile *cfg) cfg->arch.omit_fp = TRUE; cfg->arch.omit_fp_computed = TRUE; +#ifdef __native_client_codegen__ + /* NaCl modules may not change the value of RBP, so it cannot be */ + /* used as a normal register, but it can be used as a frame pointer*/ + cfg->disable_omit_fp = TRUE; + cfg->arch.omit_fp = FALSE; +#endif + if (cfg->disable_omit_fp) cfg->arch.omit_fp = FALSE; @@ -1175,7 +1521,9 @@ mono_arch_get_global_int_regs (MonoCompile *cfg) regs = g_list_prepend (regs, (gpointer)AMD64_R12); regs = g_list_prepend (regs, (gpointer)AMD64_R13); regs = g_list_prepend (regs, (gpointer)AMD64_R14); +#ifndef __native_client_codegen__ regs = g_list_prepend (regs, (gpointer)AMD64_R15); +#endif regs = g_list_prepend (regs, (gpointer)AMD64_R10); regs = g_list_prepend (regs, (gpointer)AMD64_R9); @@ -1194,7 +1542,9 @@ mono_arch_get_global_int_regs (MonoCompile *cfg) regs = g_list_prepend (regs, (gpointer)AMD64_R12); regs = g_list_prepend (regs, (gpointer)AMD64_R13); regs = g_list_prepend (regs, (gpointer)AMD64_R14); +#ifndef __native_client_codegen__ regs = g_list_prepend (regs, (gpointer)AMD64_R15); +#endif #ifdef HOST_WIN32 regs = g_list_prepend (regs, (gpointer)AMD64_RDI); regs = g_list_prepend (regs, (gpointer)AMD64_RSI); @@ -1230,7 +1580,9 @@ mono_arch_get_iregs_clobbered_by_call (MonoCallInst *call) regs = g_list_prepend (regs, (gpointer)AMD64_R12); regs = g_list_prepend (regs, (gpointer)AMD64_R13); regs = g_list_prepend (regs, (gpointer)AMD64_R14); +#ifndef __native_client_codegen__ regs = g_list_prepend (regs, (gpointer)AMD64_R15); +#endif regs = g_list_prepend (regs, (gpointer)AMD64_R10); regs = g_list_prepend (regs, (gpointer)AMD64_R9); @@ -1431,7 +1783,7 @@ mono_arch_allocate_vars (MonoCompile *cfg) /* Reserve space for caller saved registers */ for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { - offset += sizeof (gpointer); + offset += sizeof(mgreg_t); } } @@ -1560,12 +1912,12 @@ mono_arch_allocate_vars (MonoCompile *cfg) ins->opcode = OP_REGOFFSET; ins->inst_basereg = cfg->frame_reg; /* These arguments are saved to the stack in the prolog */ - offset = ALIGN_TO (offset, sizeof (gpointer)); + offset = ALIGN_TO (offset, sizeof(mgreg_t)); if (cfg->arch.omit_fp) { ins->inst_offset = offset; - offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (gpointer) : sizeof (gpointer); + offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (mgreg_t) : sizeof (mgreg_t); } else { - offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (gpointer) : sizeof (gpointer); + offset += (ainfo->storage == ArgValuetypeInReg) ? 
ainfo->nregs * sizeof (mgreg_t) : sizeof (mgreg_t); ins->inst_offset = - offset; } break; @@ -1637,14 +1989,14 @@ mono_arch_allocate_vars (MonoCompile *cfg) ins->opcode = OP_REGOFFSET; ins->inst_basereg = cfg->frame_reg; /* These arguments are saved to the stack in the prolog */ - offset = ALIGN_TO (offset, sizeof (gpointer)); + offset = ALIGN_TO (offset, sizeof(mgreg_t)); if (cfg->arch.omit_fp) { ins->inst_offset = offset; - offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (gpointer) : sizeof (gpointer); + offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (mgreg_t) : sizeof (mgreg_t); // Arguments are yet supported by the stack map creation code //cfg->locals_max_stack_offset = MAX (cfg->locals_max_stack_offset, offset); } else { - offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (gpointer) : sizeof (gpointer); + offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (mgreg_t) : sizeof (mgreg_t); ins->inst_offset = - offset; //cfg->locals_min_stack_offset = MIN (cfg->locals_min_stack_offset, offset); } @@ -1740,7 +2092,11 @@ arg_storage_to_load_membase (ArgStorage storage) { switch (storage) { case ArgInIReg: +#if defined(__mono_ilp32__) + return OP_LOADI8_MEMBASE; +#else return OP_LOAD_MEMBASE; +#endif case ArgInDoubleSSEReg: return OP_LOADR8_MEMBASE; case ArgInFloatSSEReg: @@ -2149,7 +2505,7 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src) MONO_INST_NEW (cfg, load, arg_storage_to_load_membase (ainfo->pair_storage [part])); load->inst_basereg = src->dreg; - load->inst_offset = part * sizeof (gpointer); + load->inst_offset = part * sizeof(mgreg_t); switch (ainfo->pair_storage [part]) { case ArgInIReg: @@ -2366,6 +2722,15 @@ mono_arch_dyn_call_free (MonoDynCallInfo *info) g_free (ainfo); } +#if !defined(__native_client__) +#define PTR_TO_GREG(ptr) (mgreg_t)(ptr) +#define GREG_TO_PTR(greg) (gpointer)(greg) +#else +/* Correctly handle casts to/from 32-bit pointers without compiler warnings */ +#define PTR_TO_GREG(ptr) (mgreg_t)(uintptr_t)(ptr) +#define GREG_TO_PTR(greg) (gpointer)(guint32)(greg) +#endif + /* * mono_arch_get_start_dyn_call: * @@ -2398,20 +2763,20 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g pindex = 0; if (sig->hasthis || dinfo->cinfo->vret_arg_index == 1) { - p->regs [greg ++] = (mgreg_t)*(args [arg_index ++]); + p->regs [greg ++] = PTR_TO_GREG(*(args [arg_index ++])); if (!sig->hasthis) pindex = 1; } if (dinfo->cinfo->vtype_retaddr) - p->regs [greg ++] = (mgreg_t)ret; + p->regs [greg ++] = PTR_TO_GREG(ret); for (i = pindex; i < sig->param_count; i++) { MonoType *t = mono_type_get_underlying_type (sig->params [i]); gpointer *arg = args [arg_index ++]; if (t->byref) { - p->regs [greg ++] = (mgreg_t)*(arg); + p->regs [greg ++] = PTR_TO_GREG(*(arg)); continue; } @@ -2424,11 +2789,20 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g case MONO_TYPE_PTR: case MONO_TYPE_I: case MONO_TYPE_U: +#if !defined(__mono_ilp32__) case MONO_TYPE_I8: case MONO_TYPE_U8: +#endif g_assert (dinfo->cinfo->args [i + sig->hasthis].reg == param_regs [greg]); - p->regs [greg ++] = (mgreg_t)*(arg); + p->regs [greg ++] = PTR_TO_GREG(*(arg)); break; +#if defined(__mono_ilp32__) + case MONO_TYPE_I8: + case MONO_TYPE_U8: + g_assert (dinfo->cinfo->args [i + sig->hasthis].reg == param_regs [greg]); + p->regs [greg ++] = *(guint64*)(arg); + break; +#endif case MONO_TYPE_BOOLEAN: case MONO_TYPE_U1: p->regs [greg ++] = 
*(guint8*)(arg); @@ -2451,7 +2825,7 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g break; case MONO_TYPE_GENERICINST: if (MONO_TYPE_IS_REFERENCE (t)) { - p->regs [greg ++] = (mgreg_t)*(arg); + p->regs [greg ++] = PTR_TO_GREG(*(arg)); break; } else { /* Fall through */ @@ -2507,7 +2881,7 @@ mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf) case MONO_TYPE_I: case MONO_TYPE_U: case MONO_TYPE_PTR: - *(gpointer*)ret = (gpointer)res; + *(gpointer*)ret = GREG_TO_PTR(res); break; case MONO_TYPE_I1: *(gint8*)ret = res; @@ -2537,7 +2911,7 @@ mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf) break; case MONO_TYPE_GENERICINST: if (MONO_TYPE_IS_REFERENCE (sig->ret)) { - *(gpointer*)ret = (gpointer)res; + *(gpointer*)ret = GREG_TO_PTR(res); break; } else { /* Fall through */ @@ -2690,8 +3064,10 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe * not span cache lines. This is required for code patching to work on SMP * systems. */ - if (!no_patch && ((guint32)(code + 1 - cfg->native_code) % 4) != 0) - amd64_padding (code, 4 - ((guint32)(code + 1 - cfg->native_code) % 4)); + if (!no_patch && ((guint32)(code + 1 - cfg->native_code) % 4) != 0) { + guint32 pad_size = 4 - ((guint32)(code + 1 - cfg->native_code) % 4); + amd64_padding (code, pad_size); + } mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data); amd64_call_code (code, 0); } @@ -2948,8 +3324,13 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb) ins->sreg2 = temp->dreg; } break; +#ifndef __mono_ilp32__ case OP_LOAD_MEMBASE: +#endif case OP_LOADI8_MEMBASE: +#ifndef __native_client_codegen__ + /* Don't generate memindex opcodes (to simplify */ + /* read sandboxing) */ if (!amd64_is_imm32 (ins->inst_offset)) { NEW_INS (cfg, ins, temp, OP_I8CONST); temp->inst_c0 = ins->inst_offset; @@ -2957,8 +3338,11 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb) ins->opcode = OP_AMD64_LOADI8_MEMINDEX; ins->inst_indexreg = temp->dreg; } +#endif break; +#ifndef __mono_ilp32__ case OP_STORE_MEMBASE_IMM: +#endif case OP_STOREI8_MEMBASE_IMM: if (!amd64_is_imm32 (ins->inst_imm)) { NEW_INS (cfg, ins, temp, OP_I8CONST); @@ -3110,8 +3494,20 @@ mono_emit_stack_alloc (MonoCompile *cfg, guchar *code, MonoInst* tree) if (cfg->param_area && cfg->arch.no_pushes) amd64_alu_reg_imm (code, X86_ADD, AMD64_RDI, cfg->param_area); amd64_cld (code); +#if defined(__default_codegen__) + amd64_prefix (code, X86_REP_PREFIX); + amd64_stosl (code); +#elif defined(__native_client_codegen__) + /* NaCl stos pseudo-instruction */ + amd64_codegen_pre(code); + /* First, clear the upper 32 bits of RDI (mov %edi, %edi) */ + amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RDI, 4); + /* Add %r15 to %rdi using lea, condition flags unaffected. 
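   The net effect, assuming %r15 holds the NaCl sandbox base, is roughly
   rdi = r15 + (rdi & 0xffffffff), so the rep stos below can only write
   inside the sandboxed address space.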
*/ + amd64_lea_memindex_size (code, AMD64_RDI, AMD64_R15, 0, AMD64_RDI, 0, 8); amd64_prefix (code, X86_REP_PREFIX); amd64_stosl (code); + amd64_codegen_post(code); +#endif /* __native_client_codegen__ */ if (tree->dreg != AMD64_RDI && sreg != AMD64_RDI) amd64_pop_reg (code, AMD64_RDI); @@ -3163,12 +3559,12 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code) /* Load the destination address */ g_assert (loc->opcode == OP_REGOFFSET); - amd64_mov_reg_membase (code, AMD64_RCX, loc->inst_basereg, loc->inst_offset, 8); + amd64_mov_reg_membase (code, AMD64_RCX, loc->inst_basereg, loc->inst_offset, sizeof(gpointer)); for (quad = 0; quad < 2; quad ++) { switch (cinfo->ret.pair_storage [quad]) { case ArgInIReg: - amd64_mov_membase_reg (code, AMD64_RCX, (quad * 8), cinfo->ret.pair_regs [quad], 8); + amd64_mov_membase_reg (code, AMD64_RCX, (quad * sizeof(mgreg_t)), cinfo->ret.pair_regs [quad], sizeof(mgreg_t)); break; case ArgInFloatSSEReg: amd64_movss_membase_reg (code, AMD64_RCX, (quad * 8), cinfo->ret.pair_regs [quad]); @@ -3244,6 +3640,15 @@ amd64_pop_reg (code, AMD64_RAX); #ifndef DISABLE_JIT +#if defined(__native_client__) || defined(__native_client_codegen__) +void mono_nacl_gc() +{ +#ifdef __native_client_gc__ + __nacl_suspend_thread_if_needed(); +#endif +} +#endif + void mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) { @@ -3277,6 +3682,21 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } } +#if defined(__native_client_codegen__) + /* For Native Client, all indirect call/jump targets must be */ + /* 32-byte aligned. Exception handler blocks are jumped to */ + /* indirectly as well. */ + gboolean bb_needs_alignment = (bb->flags & BB_INDIRECT_JUMP_TARGET) || + (bb->flags & BB_EXCEPTION_HANDLER); + + if ( bb_needs_alignment && ((cfg->code_len & kNaClAlignmentMask) != 0)) { + int pad = kNaClAlignment - (cfg->code_len & kNaClAlignmentMask); + if (pad != kNaClAlignment) code = mono_arch_nacl_pad(code, pad); + cfg->code_len += pad; + bb->native_offset = cfg->code_len; + } +#endif /*__native_client_codegen__*/ + if (cfg->verbose_level > 2) g_print ("Basic block %d starting at offset 0x%x\n", bb->block_num, bb->native_offset); @@ -3302,9 +3722,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN]; - if (G_UNLIKELY (offset > (cfg->code_size - max_len - 16))) { +#define EXTRA_CODE_SPACE (NACL_SIZE (16, 16 + kNaClAlignment)) + + if (G_UNLIKELY (offset > (cfg->code_size - max_len - EXTRA_CODE_SPACE))) { cfg->code_size *= 2; - cfg->native_code = g_realloc (cfg->native_code, cfg->code_size); + cfg->native_code = mono_realloc_native_code(cfg); code = cfg->native_code + offset; mono_jit_stats.code_reallocs++; } @@ -3337,7 +3759,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_STOREI2_MEMBASE_REG: amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 2); break; + /* In AMD64 NaCl, pointers are 4 bytes, */ + /* so STORE_* != STOREI8_*. Likewise below. 
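   Concretely, under __mono_ilp32__ a gpointer is 4 bytes while an i8 store
   is still 8 bytes, so the two opcodes can no longer share one emit path.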
*/ case OP_STORE_MEMBASE_REG: + amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, sizeof(gpointer)); + break; case OP_STOREI8_MEMBASE_REG: amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 8); break; @@ -3345,15 +3771,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 4); break; case OP_STORE_MEMBASE_IMM: +#ifndef __native_client_codegen__ + /* In NaCl, this could be a PCONST type, which could */ + /* mean a pointer type was copied directly into the */ + /* lower 32-bits of inst_imm, so for InvalidPtr==-1 */ + /* the value would be 0x00000000FFFFFFFF which is */ + /* not proper for an imm32 unless you cast it. */ + g_assert (amd64_is_imm32 (ins->inst_imm)); +#endif + amd64_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, (gint32)ins->inst_imm, sizeof(gpointer)); + break; case OP_STOREI8_MEMBASE_IMM: g_assert (amd64_is_imm32 (ins->inst_imm)); amd64_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, ins->inst_imm, 8); break; case OP_LOAD_MEM: +#ifdef __mono_ilp32__ + /* In ILP32, pointers are 4 bytes, so separate these */ + /* cases, use literal 8 below where we really want 8 */ + amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm); + amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, sizeof(gpointer)); + break; +#endif case OP_LOADI8_MEM: // FIXME: Decompose this earlier if (amd64_is_imm32 (ins->inst_imm)) - amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, sizeof (gpointer)); + amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, 8); else { amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm); amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 8); @@ -3377,13 +3820,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_widen_membase (code, ins->dreg, ins->dreg, 0, FALSE, FALSE); break; case OP_LOADU2_MEM: + /* For NaCl, pointers are 4 bytes, so separate these */ + /* cases, use literal 8 below where we really want 8 */ amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm); amd64_widen_membase (code, ins->dreg, ins->dreg, 0, FALSE, TRUE); break; case OP_LOAD_MEMBASE: + g_assert (amd64_is_imm32 (ins->inst_offset)); + amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, sizeof(gpointer)); + break; case OP_LOADI8_MEMBASE: + /* Use literal 8 instead of sizeof pointer or */ + /* register, we really want 8 for this opcode */ g_assert (amd64_is_imm32 (ins->inst_offset)); - amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, sizeof (gpointer)); + amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, 8); break; case OP_LOADI4_MEMBASE: amd64_movsxd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset); @@ -4071,14 +4521,14 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; case OP_AOTCONST: mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0); - amd64_mov_reg_membase (code, ins->dreg, AMD64_RIP, 0, 8); + amd64_mov_reg_membase (code, ins->dreg, AMD64_RIP, 0, sizeof(gpointer)); break; case OP_JUMP_TABLE: mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0); amd64_mov_reg_imm_size (code, ins->dreg, 0, 8); break; case OP_MOVE: - amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof (gpointer)); + amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof(mgreg_t)); break; case OP_AMD64_SET_XMMREG_R4: { 
amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1); @@ -4116,20 +4566,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) else { for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) - pos -= sizeof (gpointer); + pos -= sizeof(mgreg_t); /* Restore callee-saved registers */ for (i = AMD64_NREG - 1; i > 0; --i) { if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { - amd64_mov_reg_membase (code, i, AMD64_RBP, pos, 8); - pos += 8; + amd64_mov_reg_membase (code, i, AMD64_RBP, pos, sizeof(mgreg_t)); + pos += sizeof(mgreg_t); } } /* Copy arguments on the stack to our argument area */ - for (i = 0; i < call->stack_usage; i += 8) { - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, i, 8); - amd64_mov_membase_reg (code, AMD64_RBP, 16 + i, AMD64_RAX, 8); + for (i = 0; i < call->stack_usage; i += sizeof(mgreg_t)) { + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, i, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, 16 + i, AMD64_RAX, sizeof(mgreg_t)); } if (pos) @@ -4155,7 +4605,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; case OP_ARGLIST: { amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, cfg->sig_cookie); - amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, 8); + amd64_mov_membase_reg (code, ins->sreg1, 0, AMD64_R11, sizeof(gpointer)); break; } case OP_CALL: @@ -4278,7 +4728,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) /* Set argument registers */ for (i = 0; i < PARAM_REGS; ++i) - amd64_mov_reg_membase (code, param_regs [i], AMD64_R11, i * sizeof (gpointer), 8); + amd64_mov_reg_membase (code, param_regs [i], AMD64_R11, i * sizeof(mgreg_t), sizeof(mgreg_t)); /* Make the call */ amd64_call_reg (code, AMD64_R10); @@ -4403,8 +4853,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 8); break; case OP_START_HANDLER: { + /* Even though we're saving RSP, use sizeof */ + /* gpointer because spvar is of type IntPtr */ + /* see: mono_create_spvar_for_region */ MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region); - amd64_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, AMD64_RSP, 8); + amd64_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, AMD64_RSP, sizeof(gpointer)); if ((MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FINALLY) || MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FINALLY)) && @@ -4415,13 +4868,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } case OP_ENDFINALLY: { MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region); - amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, 8); + amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, sizeof(gpointer)); amd64_ret (code); break; } case OP_ENDFILTER: { MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region); - amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, 8); + amd64_mov_reg_membase (code, AMD64_RSP, spvar->inst_basereg, spvar->inst_offset, sizeof(gpointer)); /* The local allocator will put the result into RAX */ amd64_ret (code); break; @@ -5677,6 +6130,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) MONO_VARINFO (cfg, ins->inst_c0)->live_range_end = code - cfg->native_code; break; } + case OP_NACL_GC_SAFE_POINT: { +#if defined(__native_client_codegen__) + code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, 
(gpointer)mono_nacl_gc, TRUE); +#endif + break; + } case OP_GC_LIVENESS_DEF: case OP_GC_LIVENESS_USE: case OP_GC_PARAM_SLOT_LIVENESS_DEF: @@ -5692,9 +6151,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } if ((code - cfg->native_code - offset) > max_len) { +#if !defined(__native_client_codegen__) g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %ld)", mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset); g_assert_not_reached (); +#endif } last_ins = ins; @@ -5824,10 +6285,27 @@ mono_arch_emit_prolog (MonoCompile *cfg) gint32 lmf_offset = cfg->arch.lmf_offset; gboolean args_clobbered = FALSE; gboolean trace = FALSE; +#ifdef __native_client_codegen__ + guint alignment_check; +#endif cfg->code_size = MAX (cfg->header->code_size * 4, 10240); +#if defined(__default_codegen__) code = cfg->native_code = g_malloc (cfg->code_size); +#elif defined(__native_client_codegen__) + /* native_code_alloc is not 32-byte aligned, native_code is. */ + cfg->native_code_alloc = g_malloc (cfg->code_size + kNaClAlignment); + + /* Align native_code to next nearest kNaclAlignment byte. */ + cfg->native_code = (uintptr_t)cfg->native_code_alloc + kNaClAlignment; + cfg->native_code = (uintptr_t)cfg->native_code & ~kNaClAlignmentMask; + + code = cfg->native_code; + + alignment_check = (guint)cfg->native_code & kNaClAlignmentMask; + g_assert (alignment_check == 0); +#endif if (mono_jit_trace_calls != NULL && mono_trace_eval (method)) trace = TRUE; @@ -5873,7 +6351,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) /* These are handled automatically by the stack marking code */ mini_gc_set_slot_type_from_cfa (cfg, -cfa_offset, SLOT_NOREF); - amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer)); + amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof(mgreg_t)); mono_emit_unwind_op_def_cfa_reg (cfg, code, AMD64_RBP); async_exc_point (code); #ifdef HOST_WIN32 @@ -5888,7 +6366,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { amd64_push_reg (code, i); - pos += sizeof (gpointer); + pos += 8; /* AMD64 push inst is always 8 bytes, no way to change it */ offset += 8; mono_emit_unwind_op_offset (cfg, code, i, - offset); async_exc_point (code); @@ -5904,7 +6382,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) if (cfg->arch.omit_fp) // FIXME: g_assert_not_reached (); - cfg->stack_offset += ALIGN_TO (cfg->param_area, sizeof (gpointer)); + cfg->stack_offset += ALIGN_TO (cfg->param_area, sizeof(mgreg_t)); } if (cfg->arch.omit_fp) { @@ -5942,7 +6420,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) if (G_UNLIKELY (required_code_size >= (cfg->code_size - offset))) { while (required_code_size >= (cfg->code_size - offset)) cfg->code_size *= 2; - cfg->native_code = g_realloc (cfg->native_code, cfg->code_size); + cfg->native_code = mono_realloc_native_code (cfg); code = cfg->native_code + offset; mono_jit_stats.code_reallocs++; } @@ -6008,8 +6486,20 @@ mono_arch_emit_prolog (MonoCompile *cfg) amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RSP, 8); amd64_cld (code); +#if defined(__default_codegen__) + amd64_prefix (code, X86_REP_PREFIX); + amd64_stosl (code); +#elif defined(__native_client_codegen__) + /* NaCl stos pseudo-instruction */ + amd64_codegen_pre (code); + /* First, clear the upper 32 bits of RDI (mov %edi, %edi) */ + amd64_mov_reg_reg (code, AMD64_RDI, AMD64_RDI, 4); + /* Add %r15 to %rdi using lea, condition flags unaffected. 
*/ + amd64_lea_memindex_size (code, AMD64_RDI, AMD64_R15, 0, AMD64_RDI, 0, 8); amd64_prefix (code, X86_REP_PREFIX); amd64_stosl (code); + amd64_codegen_post (code); +#endif /* __native_client_codegen__ */ amd64_mov_reg_membase (code, AMD64_RDI, AMD64_RSP, -8, 8); amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RSP, -16, 8); @@ -6037,7 +6527,9 @@ mono_arch_emit_prolog (MonoCompile *cfg) case AMD64_R12: offset = G_STRUCT_OFFSET (MonoLMF, r12); break; case AMD64_R13: offset = G_STRUCT_OFFSET (MonoLMF, r13); break; case AMD64_R14: offset = G_STRUCT_OFFSET (MonoLMF, r14); break; +#ifndef __native_client_codegen__ case AMD64_R15: offset = G_STRUCT_OFFSET (MonoLMF, r15); break; +#endif #ifdef HOST_WIN32 case AMD64_RDI: offset = G_STRUCT_OFFSET (MonoLMF, rdi); break; case AMD64_RSI: offset = G_STRUCT_OFFSET (MonoLMF, rsi); break; @@ -6100,7 +6592,7 @@ mono_arch_emit_prolog (MonoCompile *cfg) g_assert (cfg->rgctx_var->opcode == OP_REGOFFSET && (cfg->rgctx_var->inst_basereg == AMD64_RBP || cfg->rgctx_var->inst_basereg == AMD64_RSP)); - amd64_mov_membase_reg (code, cfg->rgctx_var->inst_basereg, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, 8); + amd64_mov_membase_reg (code, cfg->rgctx_var->inst_basereg, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, sizeof(gpointer)); } /* compute max_length in order to use short forward jumps */ @@ -6115,8 +6607,22 @@ mono_arch_emit_prolog (MonoCompile *cfg) /* max alignment for loops */ if ((cfg->opt & MONO_OPT_LOOP) && bb_is_loop_start (bb)) max_length += LOOP_ALIGNMENT; +#ifdef __native_client_codegen__ + /* max alignment for native client */ + max_length += kNaClAlignment; +#endif MONO_BB_FOR_EACH_INS (bb, ins) { +#ifdef __native_client_codegen__ + { + int space_in_block = kNaClAlignment - + ((max_length + cfg->code_len) & kNaClAlignmentMask); + int max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN]; + if (space_in_block < max_len && max_len < kNaClAlignment) { + max_length += space_in_block; + } + } +#endif /*__native_client_codegen__*/ max_length += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN]; } @@ -6168,13 +6674,13 @@ mono_arch_emit_prolog (MonoCompile *cfg) for (quad = 0; quad < 2; quad ++) { switch (ainfo->pair_storage [quad]) { case ArgInIReg: - amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad], sizeof (gpointer)); + amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad], sizeof(mgreg_t)); break; case ArgInFloatSSEReg: - amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]); + amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]); break; case ArgInDoubleSSEReg: - amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]); + amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]); break; case ArgNone: break; @@ -6220,13 +6726,13 @@ mono_arch_emit_prolog (MonoCompile *cfg) for (quad = 0; quad < 2; quad ++) { switch (ainfo->pair_storage [quad]) { case ArgInIReg: - amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad], sizeof (gpointer)); + amd64_mov_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad], sizeof(mgreg_t)); break; 
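/* A minimal standalone sketch (illustrative SKETCH_* names, not part of
 * this patch) of the bundle-alignment arithmetic the NaCl prolog above
 * uses when allocating cfg->native_code: the raw allocation is padded by
 * one 32-byte bundle (kNaClAlignment in this change) and the pointer is
 * then rounded down to a bundle boundary, so the result is always
 * 32-byte aligned and still lies inside the padded allocation. */
#include <assert.h>
#include <stdint.h>

#define SKETCH_NACL_ALIGNMENT      32u
#define SKETCH_NACL_ALIGNMENT_MASK (SKETCH_NACL_ALIGNMENT - 1)

static uint8_t *
sketch_nacl_bundle_align (void *raw_alloc /* from g_malloc (size + alignment) */)
{
	/* Advance by one bundle, then mask off the low bits. */
	uintptr_t p = (uintptr_t) raw_alloc + SKETCH_NACL_ALIGNMENT;

	p &= ~(uintptr_t) SKETCH_NACL_ALIGNMENT_MASK;
	assert ((p & SKETCH_NACL_ALIGNMENT_MASK) == 0);
	return (uint8_t *) p;
}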
case ArgInFloatSSEReg: - amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]); + amd64_movss_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]); break; case ArgInDoubleSSEReg: - amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), ainfo->pair_regs [quad]); + amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset + (quad * sizeof(mgreg_t)), ainfo->pair_regs [quad]); break; case ArgNone: break; @@ -6354,13 +6860,13 @@ mono_arch_emit_prolog (MonoCompile *cfg) } /* Save lmf_addr */ - amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8); + amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, sizeof(gpointer)); /* Save previous_lmf */ - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8); - amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, sizeof(gpointer)); + amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, sizeof(gpointer)); /* Set new lmf */ amd64_lea_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset); - amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8); + amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, sizeof(gpointer)); } } @@ -6471,7 +6977,7 @@ mono_arch_emit_epilog (MonoCompile *cfg) while (cfg->code_len + max_epilog_size > (cfg->code_size - 16)) { cfg->code_size *= 2; - cfg->native_code = g_realloc (cfg->native_code, cfg->code_size); + cfg->native_code = mono_realloc_native_code (cfg); mono_jit_stats.code_reallocs++; } @@ -6507,14 +7013,14 @@ mono_arch_emit_epilog (MonoCompile *cfg) * through the mono_lmf_addr TLS variable. 
*/ /* reg = previous_lmf */ - amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8); + amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), sizeof(gpointer)); x86_prefix (code, X86_FS_PREFIX); amd64_mov_mem_reg (code, lmf_tls_offset, AMD64_R11, 8); } else { /* Restore previous lmf */ - amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8); - amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8); - amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8); + amd64_mov_reg_membase (code, AMD64_RCX, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), sizeof(gpointer)); + amd64_mov_reg_membase (code, AMD64_R11, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), sizeof(gpointer)); + amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, sizeof(gpointer)); } /* Restore caller saved regs */ @@ -6534,7 +7040,11 @@ mono_arch_emit_epilog (MonoCompile *cfg) amd64_mov_reg_membase (code, AMD64_R14, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), 8); } if (cfg->used_int_regs & (1 << AMD64_R15)) { +#if defined(__default_codegen__) amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8); +#elif defined(__native_client_codegen__) + g_assert_not_reached(); +#endif } #ifdef HOST_WIN32 if (cfg->used_int_regs & (1 << AMD64_RDI)) { @@ -6558,10 +7068,10 @@ mono_arch_emit_epilog (MonoCompile *cfg) else { for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) - pos -= sizeof (gpointer); + pos -= sizeof(mgreg_t); if (pos) { - if (pos == - sizeof (gpointer)) { + if (pos == - sizeof(mgreg_t)) { /* Only one register, so avoid lea */ for (i = AMD64_NREG - 1; i > 0; --i) if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) { @@ -6590,13 +7100,13 @@ mono_arch_emit_epilog (MonoCompile *cfg) for (quad = 0; quad < 2; quad ++) { switch (ainfo->pair_storage [quad]) { case ArgInIReg: - amd64_mov_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer)), sizeof (gpointer)); + amd64_mov_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t)), sizeof(mgreg_t)); break; case ArgInFloatSSEReg: - amd64_movss_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer))); + amd64_movss_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t))); break; case ArgInDoubleSSEReg: - amd64_movsd_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof (gpointer))); + amd64_movsd_reg_membase (code, ainfo->pair_regs [quad], inst->inst_basereg, inst->inst_offset + (quad * sizeof(mgreg_t))); break; case ArgNone: break; @@ -6642,9 +7152,16 @@ mono_arch_emit_exceptions (MonoCompile *cfg) code_size += 8 + 7; /*sizeof (void*) + alignment */ } +#ifdef __native_client_codegen__ + /* Give us extra room on Native Client. This could be */ + /* more carefully calculated, but bundle alignment makes */ + /* it much trickier, so *2 like other places is good. 
*/ + code_size *= 2; +#endif + while (cfg->code_len + code_size > (cfg->code_size - 16)) { cfg->code_size *= 2; - cfg->native_code = g_realloc (cfg->native_code, cfg->code_size); + cfg->native_code = mono_realloc_native_code (cfg); mono_jit_stats.code_reallocs++; } @@ -6705,6 +7222,7 @@ mono_arch_emit_exceptions (MonoCompile *cfg) /* do nothing */ break; } + g_assert(code < cfg->native_code + cfg->code_size); } /* Handle relocations with RIP relative addressing */ @@ -6715,27 +7233,69 @@ mono_arch_emit_exceptions (MonoCompile *cfg) switch (patch_info->type) { case MONO_PATCH_INFO_R8: case MONO_PATCH_INFO_R4: { - guint8 *pos; + guint8 *pos, *patch_pos, *target_pos; /* The SSE opcodes require a 16 byte alignment */ +#if defined(__default_codegen__) code = (guint8*)ALIGN_TO (code, 16); - memset (orig_code, 0, code - orig_code); +#elif defined(__native_client_codegen__) + { + /* Pad this out with HLT instructions */ + /* or we can get garbage bytes emitted */ + /* which will fail validation */ + guint8 *aligned_code; + /* extra align to make room for */ + /* mov/push below */ + int extra_align = patch_info->type == MONO_PATCH_INFO_R8 ? 2 : 1; + aligned_code = (guint8*)ALIGN_TO (code + extra_align, 16); + /* The technique of hiding data in an */ + /* instruction has a problem here: we */ + /* need the data aligned to a 16-byte */ + /* boundary but the instruction cannot */ + /* cross the bundle boundary. so only */ + /* odd multiples of 16 can be used */ + if ((intptr_t)aligned_code % kNaClAlignment == 0) { + aligned_code += 16; + } + while (code < aligned_code) { + *(code++) = 0xf4; /* hlt */ + } + } +#endif pos = cfg->native_code + patch_info->ip.i; - - if (IS_REX (pos [1])) - *(guint32*)(pos + 5) = (guint8*)code - pos - 9; - else - *(guint32*)(pos + 4) = (guint8*)code - pos - 8; + if (IS_REX (pos [1])) { + patch_pos = pos + 5; + target_pos = code - pos - 9; + } + else { + patch_pos = pos + 4; + target_pos = code - pos - 8; + } if (patch_info->type == MONO_PATCH_INFO_R8) { +#ifdef __native_client_codegen__ + /* Hide 64-bit data in a */ + /* "mov imm64, r11" instruction. */ + /* write it before the start of */ + /* the data*/ + *(code-2) = 0x49; /* prefix */ + *(code-1) = 0xbb; /* mov X, %r11 */ +#endif *(double*)code = *(double*)patch_info->data.target; code += sizeof (double); } else { +#ifdef __native_client_codegen__ + /* Hide 32-bit data in a */ + /* "push imm32" instruction. */ + *(code-1) = 0x68; /* push */ +#endif *(float*)code = *(float*)patch_info->data.target; code += sizeof (float); } + *(guint32*)(patch_pos) = target_pos; + remove = TRUE; break; } @@ -6778,6 +7338,7 @@ mono_arch_emit_exceptions (MonoCompile *cfg) tmp->next = patch_info->next; } } + g_assert (code < cfg->native_code + cfg->code_size); } cfg->code_len = code - cfg->native_code; @@ -7095,6 +7656,46 @@ mono_breakpoint_clean_code (guint8 *method_start, guint8 *code, int offset, guin return can_write; } +#if defined(__native_client_codegen__) +/* For membase calls, we want the base register. 
for Native Client, */ +/* all indirect calls have the following sequence with the given sizes: */ +/* mov %eXX,%eXX [2-3] */ +/* mov disp(%r15,%rXX,scale),%r11d [4-8] */ +/* and $0xffffffffffffffe0,%r11d [4] */ +/* add %r15,%r11 [3] */ +/* callq *%r11 [3] */ + + +/* Determine if code points to a NaCl call-through-register sequence, */ +/* (i.e., the last 3 instructions listed above) */ +int +is_nacl_call_reg_sequence(guint8* code) +{ + const char *sequence = "\x41\x83\xe3\xe0" /* and */ + "\x4d\x03\xdf" /* add */ + "\x41\xff\xd3"; /* call */ + return memcmp(code, sequence, 10) == 0; +} + +/* Determine if code points to the first opcode of the mov membase component */ +/* of an indirect call sequence (i.e. the first 2 instructions listed above) */ +/* (there could be a REX prefix before the opcode but it is ignored) */ +static int +is_nacl_indirect_call_membase_sequence(guint8* code) +{ + /* Check for mov opcode, reg-reg addressing mode (mod = 3), */ + return code[0] == 0x8b && amd64_modrm_mod(code[1]) == 3 && + /* and that src reg = dest reg */ + amd64_modrm_reg(code[1]) == amd64_modrm_rm(code[1]) && + /* Check that next inst is mov, uses SIB byte (rm = 4), */ + IS_REX(code[2]) && + code[3] == 0x8b && amd64_modrm_rm(code[4]) == 4 && + /* and has dst of r11 and base of r15 */ + (amd64_modrm_reg(code[4]) + amd64_rex_r(code[2])) == AMD64_R11 && + (amd64_sib_base(code[5]) + amd64_rex_b(code[2])) == AMD64_R15; +} +#endif /* __native_client_codegen__ */ + int mono_arch_get_this_arg_reg (guint8 *code) { @@ -7148,6 +7749,8 @@ get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *cod g_assert ((code - start) < 64); } + nacl_global_codeman_validate(&start, 64, &code); + mono_debug_add_delegate_trampoline (start, code - start); if (code_len) @@ -7292,6 +7895,7 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls) #ifdef MONO_ARCH_HAVE_IMT +#if defined(__default_codegen__) #define CMP_SIZE (6 + 1) #define CMP_REG_REG_SIZE (4 + 1) #define BR_SMALL_SIZE 2 @@ -7299,6 +7903,20 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls) #define MOV_REG_IMM_SIZE 10 #define MOV_REG_IMM_32BIT_SIZE 6 #define JUMP_REG_SIZE (2 + 1) +#elif defined(__native_client_codegen__) +/* NaCl N-byte instructions can be padded up to N-1 bytes */ +#define CMP_SIZE ((6 + 1) * 2 - 1) +#define CMP_REG_REG_SIZE ((4 + 1) * 2 - 1) +#define BR_SMALL_SIZE (2 * 2 - 1) +#define BR_LARGE_SIZE (6 * 2 - 1) +#define MOV_REG_IMM_SIZE (10 * 2 - 1) +#define MOV_REG_IMM_32BIT_SIZE (6 * 2 - 1) +/* Jump reg for NaCl adds a mask (+4) and add (+3) */ +#define JUMP_REG_SIZE ((2 + 1 + 4 + 3) * 2 - 1) +/* Jump membase's size is large and unpredictable */ +/* in native client, just pad it out a whole bundle. 
*/ +#define JUMP_MEMBASE_SIZE (kNaClAlignment) +#endif static int imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target) @@ -7338,6 +7956,9 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI item->chunk_size += MOV_REG_IMM_32BIT_SIZE; else item->chunk_size += MOV_REG_IMM_SIZE; +#ifdef __native_client_codegen__ + item->chunk_size += JUMP_MEMBASE_SIZE; +#endif } item->chunk_size += BR_SMALL_SIZE + JUMP_REG_SIZE; } else { @@ -7353,6 +7974,9 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI /* with assert below: * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1; */ +#ifdef __native_client_codegen__ + item->chunk_size += JUMP_MEMBASE_SIZE; +#endif } } } else { @@ -7365,10 +7989,16 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI } size += item->chunk_size; } +#if defined(__native_client__) && defined(__native_client_codegen__) + /* In Native Client, we don't re-use thunks, allocate from the */ + /* normal code manager paths. */ + code = mono_domain_code_reserve (domain, size); +#else if (fail_tramp) code = mono_method_alloc_generic_virtual_thunk (domain, size); else code = mono_domain_code_reserve (domain, size); +#endif start = code; for (i = 0; i < count; ++i) { MonoIMTCheckItem *item = imt_entries [i]; @@ -7381,24 +8011,24 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI if (amd64_is_imm32 (item->key)) amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key); else { - amd64_mov_reg_imm (code, AMD64_R11, item->key); - amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R11); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key); + amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG); } } item->jmp_code = code; amd64_branch8 (code, X86_CC_NE, 0, FALSE); if (item->has_target_code) { - amd64_mov_reg_imm (code, AMD64_R11, item->value.target_code); - amd64_jump_reg (code, AMD64_R11); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->value.target_code); + amd64_jump_reg (code, MONO_ARCH_IMT_SCRATCH_REG); } else { - amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->value.vtable_slot])); - amd64_jump_membase (code, AMD64_R11, 0); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, & (vtable->vtable [item->value.vtable_slot])); + amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0); } if (fail_case) { amd64_patch (item->jmp_code, code); - amd64_mov_reg_imm (code, AMD64_R11, fail_tramp); - amd64_jump_reg (code, AMD64_R11); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, fail_tramp); + amd64_jump_reg (code, MONO_ARCH_IMT_SCRATCH_REG); item->jmp_code = NULL; } } else { @@ -7407,27 +8037,33 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI if (amd64_is_imm32 (item->key)) amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key); else { - amd64_mov_reg_imm (code, AMD64_R11, item->key); - amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R11); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key); + amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG); } item->jmp_code = code; amd64_branch8 (code, X86_CC_NE, 0, FALSE); - amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->value.vtable_slot])); - amd64_jump_membase (code, AMD64_R11, 0); + /* See the comment below about R10 */ + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, & 
(vtable->vtable [item->value.vtable_slot])); + amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0); amd64_patch (item->jmp_code, code); amd64_breakpoint (code); item->jmp_code = NULL; #else - amd64_mov_reg_imm (code, AMD64_R11, & (vtable->vtable [item->value.vtable_slot])); - amd64_jump_membase (code, AMD64_R11, 0); + /* We're using R10 (MONO_ARCH_IMT_SCRATCH_REG) here because R11 (MONO_ARCH_IMT_REG) + needs to be preserved. R10 needs + to be preserved for calls which + require a runtime generic context, + but interface calls don't. */ + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, & (vtable->vtable [item->value.vtable_slot])); + amd64_jump_membase (code, MONO_ARCH_IMT_SCRATCH_REG, 0); #endif } } else { if (amd64_is_imm32 (item->key)) amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key); else { - amd64_mov_reg_imm (code, AMD64_R11, item->key); - amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R11); + amd64_mov_reg_imm (code, MONO_ARCH_IMT_SCRATCH_REG, item->key); + amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, MONO_ARCH_IMT_SCRATCH_REG); } item->jmp_code = code; if (x86_is_imm8 (imt_branch_distance (imt_entries, i, item->check_target_idx))) @@ -7451,6 +8087,8 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI mono_stats.imt_thunks_size += code - start; g_assert (code - start <= size); + nacl_domain_code_validate(domain, &start, size, &code); + return start; } diff --git a/mono/mini/mini-amd64.h b/mono/mini/mini-amd64.h index d71a60a108e..effe00bcbb0 100644 --- a/mono/mini/mini-amd64.h +++ b/mono/mini/mini-amd64.h @@ -5,6 +5,18 @@ #include #include +#ifdef __native_client_codegen__ +#define kNaClAlignmentAMD64 32 +#define kNaClAlignmentMaskAMD64 (kNaClAlignmentAMD64 - 1) + +/* TODO: use kamd64NaClLengthOfCallImm */ +/* temporarily using kNaClAlignmentAMD64 so padding in */ +/* image-writer.c doesn't happen */ +#define kNaClLengthOfCallImm kNaClAlignmentAMD64 + +int is_nacl_call_reg_sequence(guint8* code); +#endif + #ifdef HOST_WIN32 #include /* use SIG* defines if possible */ @@ -146,7 +158,13 @@ struct MonoLMF { gpointer lmf_addr; /* This is only set in trampoline LMF frames */ MonoMethod *method; +#if defined(__default_codegen__) || defined(HOST_WIN32) guint64 rip; +#elif defined(__native_client_codegen__) + /* On 64-bit compilers, default alignment is 8 for this field, */ + /* this allows the structure to match for 32-bit compilers. 
*/ + guint64 rip __attribute__ ((aligned(8))); +#endif guint64 rbx; guint64 rbp; guint64 rsp; @@ -238,7 +256,7 @@ typedef struct { */ #define MONO_ARCH_VARARG_ICALLS 1 -#ifndef HOST_WIN32 +#if !defined( HOST_WIN32 ) && !defined(__native_client__) && !defined(__native_client_codegen__) #define MONO_ARCH_USE_SIGACTION 1 @@ -248,7 +266,7 @@ typedef struct { #endif -#endif /* HOST_WIN32 */ +#endif /* !HOST_WIN32 && !__native_client__ */ #if defined (__APPLE__) @@ -335,6 +353,7 @@ typedef struct { #define MONO_ARCH_HAVE_IMT 1 #define MONO_ARCH_HAVE_TLS_GET 1 #define MONO_ARCH_IMT_REG AMD64_R10 +#define MONO_ARCH_IMT_SCRATCH_REG AMD64_R11 #define MONO_ARCH_VTABLE_REG MONO_AMD64_ARG_REG1 /* * We use r10 for the imt/rgctx register rather than r11 because r11 is @@ -357,7 +376,7 @@ typedef struct { #define MONO_ARCH_HAVE_GET_TRAMPOLINES 1 #define MONO_ARCH_AOT_SUPPORTED 1 -#ifndef HOST_WIN32 +#if !defined( HOST_WIN32 ) && !defined( __native_client__ ) #define MONO_ARCH_SOFT_DEBUG_SUPPORTED 1 #else #define DISABLE_DEBUGGER_AGENT 1 diff --git a/mono/mini/mini-ops.h b/mono/mini/mini-ops.h index b79690642cb..687abadec16 100644 --- a/mono/mini/mini-ops.h +++ b/mono/mini/mini-ops.h @@ -891,6 +891,13 @@ MINI_OP(OP_GC_SPILL_SLOT_LIVENESS_DEF, "gc_spill_slot_liveness_def", NONE, NONE, MINI_OP(OP_GC_PARAM_SLOT_LIVENESS_DEF, "gc_param_slot_liveness_def", NONE, NONE, NONE) /* Arch specific opcodes */ +/* #if defined(__native_client_codegen__) || defined(__native_client__) */ +/* We have to define these in terms of the TARGET defines, not NaCl defines */ +/* because genmdesc.pl doesn't have multiple defines per platform. */ +#if defined(TARGET_AMD64) || defined(TARGET_X86) +MINI_OP(OP_NACL_GC_SAFE_POINT, "nacl_gc_safe_point", IREG, NONE, NONE) +#endif + #if defined(TARGET_X86) || defined(TARGET_AMD64) MINI_OP(OP_X86_TEST_NULL, "x86_test_null", NONE, IREG, NONE) MINI_OP(OP_X86_COMPARE_MEMBASE_REG,"x86_compare_membase_reg", NONE, IREG, IREG) diff --git a/mono/mini/mini-x86.c b/mono/mini/mini-x86.c index 26078a9dc99..66a39e08eb2 100644 --- a/mono/mini/mini-x86.c +++ b/mono/mini/mini-x86.c @@ -68,36 +68,13 @@ static CRITICAL_SECTION mini_arch_mutex; MonoBreakpointInfo mono_breakpoint_info [MONO_BREAKPOINT_ARRAY_SIZE]; -static gpointer -mono_realloc_native_code (MonoCompile *cfg) -{ -#ifdef __native_client_codegen__ - guint old_padding; - gpointer native_code; - guint alignment_check; - - /* Save the old alignment offset so we can re-align after the realloc. */ - old_padding = (guint)(cfg->native_code - cfg->native_code_alloc); - - cfg->native_code_alloc = g_realloc (cfg->native_code_alloc, - cfg->code_size + kNaClAlignment); - - /* Align native_code to next nearest kNaClAlignment byte. */ - native_code = (guint)cfg->native_code_alloc + kNaClAlignment; - native_code = (guint)native_code & ~kNaClAlignmentMask; - - /* Shift the data to be 32-byte aligned again. */ - memmove (native_code, cfg->native_code_alloc + old_padding, cfg->code_size); - - alignment_check = (guint)native_code & kNaClAlignmentMask; - g_assert (alignment_check == 0); - return native_code; -#else - return g_realloc (cfg->native_code, cfg->code_size); -#endif -} #ifdef __native_client_codegen__ +const guint kNaClAlignment = kNaClAlignmentX86; +const guint kNaClAlignmentMask = kNaClAlignmentMaskX86; + +/* Default alignment for Native Client is 32-byte. */ +gint8 nacl_align_byte = -32; /* signed version of 0xe0 */ /* mono_arch_nacl_pad: Add pad bytes of alignment instructions at code, */ /* Check that alignment doesn't cross an alignment boundary. 
*/ @@ -2333,6 +2310,16 @@ x86_pop_reg (code, X86_EAX); #ifndef DISABLE_JIT +#if defined(__native_client__) || defined(__native_client_codegen__) +void +mono_nacl_gc() +{ +#ifdef __native_client_gc__ + __nacl_suspend_thread_if_needed(); +#endif +} +#endif + void mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) { @@ -4694,6 +4681,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) MONO_VARINFO (cfg, ins->inst_c0)->live_range_end = code - cfg->native_code; break; } + case OP_NACL_GC_SAFE_POINT: { +#if defined(__native_client_codegen__) + code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)mono_nacl_gc); +#endif + break; + } case OP_GC_LIVENESS_DEF: case OP_GC_LIVENESS_USE: case OP_GC_PARAM_SLOT_LIVENESS_DEF: @@ -4773,13 +4766,46 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono case MONO_PATCH_INFO_GENERIC_CLASS_INIT: case MONO_PATCH_INFO_MONITOR_ENTER: case MONO_PATCH_INFO_MONITOR_EXIT: +#if defined(__native_client_codegen__) && defined(__native_client__) + if (nacl_is_code_address (code)) { + /* For tail calls, code is patched after being installed */ + /* but not through the normal "patch callsite" method. */ + unsigned char buf[kNaClAlignment]; + unsigned char *aligned_code = (uintptr_t)code & ~kNaClAlignmentMask; + unsigned char *_target = target; + int ret; + /* All patch targets modified in x86_patch */ + /* are IP relative. */ + _target = _target + (uintptr_t)buf - (uintptr_t)aligned_code; + memcpy (buf, aligned_code, kNaClAlignment); + /* Patch a temp buffer of bundle size, */ + /* then install to actual location. */ + x86_patch (buf + ((uintptr_t)code - (uintptr_t)aligned_code), _target); + ret = nacl_dyncode_modify (aligned_code, buf, kNaClAlignment); + g_assert (ret == 0); + } + else { + x86_patch (ip, target); + } +#else x86_patch (ip, target); +#endif break; case MONO_PATCH_INFO_NONE: break; + case MONO_PATCH_INFO_R4: + case MONO_PATCH_INFO_R8: { + guint32 offset = mono_arch_get_patch_offset (ip); + *((gconstpointer *)(ip + offset)) = target; + break; + } default: { guint32 offset = mono_arch_get_patch_offset (ip); +#if !defined(__native_client__) *((gconstpointer *)(ip + offset)) = target; +#else + *((gconstpointer *)(ip + offset)) = nacl_modify_patch_target (target); +#endif break; } } @@ -4805,7 +4831,9 @@ mono_arch_emit_prolog (MonoCompile *cfg) if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE) cfg->code_size += 512; -#ifdef __native_client_codegen__ +#if defined(__default_codegen__) + code = cfg->native_code = g_malloc (cfg->code_size); +#elif defined(__native_client_codegen__) /* native_code_alloc is not 32-byte aligned, native_code is. */ cfg->native_code_alloc = g_malloc (cfg->code_size + kNaClAlignment); @@ -4817,8 +4845,6 @@ mono_arch_emit_prolog (MonoCompile *cfg) alignment_check = (guint)cfg->native_code & kNaClAlignmentMask; g_assert(alignment_check == 0); -#else - code = cfg->native_code = g_malloc (cfg->code_size); #endif /* Offset between RSP and the CFA */ @@ -5339,11 +5365,11 @@ mono_arch_emit_exceptions (MonoCompile *cfg) guint32 size; /* Compute size of code following the push */ -#ifdef __native_client_codegen__ +#if defined(__default_codegen__) + size = 5 + 5; +#elif defined(__native_client_codegen__) code = mono_nacl_align (code); size = kNaClAlignment; -#else - size = 5 + 5; #endif /*This is aligned to 16 bytes by the callee. 
This way we save a few bytes here.*/ @@ -5459,16 +5485,15 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls) //[1 + 5] x86_jump_mem(inst,mem) #define CMP_SIZE 6 -#ifdef __native_client_codegen__ -/* These constants should be coming from cpu-x86.md */ +#if defined(__default_codegen__) +#define BR_SMALL_SIZE 2 +#define BR_LARGE_SIZE 5 +#elif defined(__native_client_codegen__) /* I suspect the size calculation below is actually incorrect. */ -/* TODO: fix the calculation that uses these sizes. */ +/* TODO: fix the calculation that uses these sizes. */ #define BR_SMALL_SIZE 16 #define BR_LARGE_SIZE 12 -#else -#define BR_SMALL_SIZE 2 -#define BR_LARGE_SIZE 5 -#endif /* __native_client_codegen__ */ +#endif /*__native_client_codegen__*/ #define JUMP_IMM_SIZE 6 #define ENABLE_WRONG_METHOD_CHECK 0 #define DEBUG_IMT 0 @@ -5493,9 +5518,6 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI int size = 0; guint8 *code, *start; -#ifdef __native_client_codegen__ - /* g_print("mono_arch_build_imt_thunk needs to be aligned.\n"); */ -#endif for (i = 0; i < count; ++i) { MonoIMTCheckItem *item = imt_entries [i]; if (item->is_equals) { @@ -5519,10 +5541,16 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI } size += item->chunk_size; } +#if defined(__native_client__) && defined(__native_client_codegen__) + /* In Native Client, we don't re-use thunks, allocate from the */ + /* normal code manager paths. */ + code = mono_domain_code_reserve (domain, size); +#else if (fail_tramp) code = mono_method_alloc_generic_virtual_thunk (domain, size); else code = mono_domain_code_reserve (domain, size); +#endif start = code; for (i = 0; i < count; ++i) { MonoIMTCheckItem *item = imt_entries [i]; @@ -5607,6 +5635,8 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI g_free (buff); } + nacl_domain_code_validate (domain, &start, size, &code); + return start; } @@ -5837,6 +5867,7 @@ static gpointer get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *code_len) { guint8 *code, *start; + int code_reserve = 64; /* * The stack contains: @@ -5845,7 +5876,7 @@ get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *cod */ if (has_target) { - start = code = mono_global_codeman_reserve (64); + start = code = mono_global_codeman_reserve (code_reserve); /* Replace the this argument with the target */ x86_mov_reg_membase (code, X86_EAX, X86_ESP, 4, 4); @@ -5853,15 +5884,15 @@ get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *cod x86_mov_membase_reg (code, X86_ESP, 4, X86_ECX, 4); x86_jump_membase (code, X86_EAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr)); - g_assert ((code - start) < 64); + g_assert ((code - start) < code_reserve); } else { int i = 0; /* 8 for mov_reg and jump, plus 8 for each parameter */ #ifdef __native_client_codegen__ /* TODO: calculate this size correctly */ - int code_reserve = 13 + (param_count * 8) + 2 * kNaClAlignment; + code_reserve = 13 + (param_count * 8) + 2 * kNaClAlignment; #else - int code_reserve = 8 + (param_count * 8); + code_reserve = 8 + (param_count * 8); #endif /* __native_client_codegen__ */ /* * The stack contains: @@ -5895,6 +5926,7 @@ get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *cod g_assert ((code - start) < code_reserve); } + nacl_global_codeman_validate(&start, code_reserve, &code); mono_debug_add_delegate_trampoline (start, code - start); if (code_len) diff --git a/mono/mini/mini-x86.h 
b/mono/mini/mini-x86.h index 3063fc2ceca..3cc5b440367 100644 --- a/mono/mini/mini-x86.h +++ b/mono/mini/mini-x86.h @@ -3,6 +3,14 @@ #include #include + +#ifdef __native_client_codegen__ +#define kNaClAlignmentX86 32 +#define kNaClAlignmentMaskX86 (kNaClAlignmentX86 - 1) + +#define kNaClLengthOfCallImm kx86NaClLengthOfCallImm +#endif + #ifdef HOST_WIN32 #include /* use SIG* defines if possible */ @@ -59,12 +67,6 @@ struct sigcontext { #undef MONO_ARCH_USE_SIGACTION #endif -#if defined(__native_client_codegen__) || defined(__native_client__) -#define NACL_SIZE(a, b) (b) -#else -#define NACL_SIZE(a, b) (a) -#endif - #ifndef HOST_WIN32 #ifdef HAVE_WORKING_SIGALTSTACK diff --git a/mono/mini/mini.c b/mono/mini/mini.c index e8cade800a2..69e19cf74d1 100644 --- a/mono/mini/mini.c +++ b/mono/mini/mini.c @@ -89,10 +89,6 @@ static gpointer mono_jit_compile_method_with_opt (MonoMethod *method, guint32 opt, MonoException **ex); -#ifdef __native_client_codegen__ -/* Default alignment for Native Client is 32-byte. */ -guint8 nacl_align_byte = 0xe0; -#endif static guint32 default_opt = 0; static gboolean default_opt_set = FALSE; @@ -164,6 +160,38 @@ gboolean disable_vtypes_in_regs = FALSE; gboolean mono_dont_free_global_codeman; +gpointer +mono_realloc_native_code (MonoCompile *cfg) +{ +#if defined(__default_codegen__) + return g_realloc (cfg->native_code, cfg->code_size); +#elif defined(__native_client_codegen__) + guint old_padding; + gpointer native_code; + guint alignment_check; + + /* Save the old alignment offset so we can re-align after the realloc. */ + old_padding = (guint)(cfg->native_code - cfg->native_code_alloc); + + cfg->native_code_alloc = g_realloc ( cfg->native_code_alloc, + cfg->code_size + kNaClAlignment ); + + /* Align native_code to next nearest kNaClAlignment byte. */ + native_code = (guint)cfg->native_code_alloc + kNaClAlignment; + native_code = (guint)native_code & ~kNaClAlignmentMask; + + /* Shift the data to be 32-byte aligned again. */ + memmove (native_code, cfg->native_code_alloc + old_padding, cfg->code_size); + + alignment_check = (guint)native_code & kNaClAlignmentMask; + g_assert (alignment_check == 0); + return native_code; +#else + g_assert_not_reached (); + return cfg->native_code; +#endif +} + #ifdef __native_client_codegen__ /* Prevent instructions from straddling a 32-byte alignment boundary. */ @@ -430,6 +458,67 @@ void *mono_global_codeman_reserve (int size) } } +#if defined(__native_client_codegen__) && defined(__native_client__) +/* Given the temporary buffer (allocated by mono_global_codeman_reserve) into + * which we are generating code, return a pointer to the destination in the + * dynamic code segment into which the code will be copied when + * mono_global_codeman_commit is called. + * LOCKING: Acquires the jit lock. + */ +void* +nacl_global_codeman_get_dest (void *data) +{ + void *dest; + mono_jit_lock (); + dest = nacl_code_manager_get_code_dest (global_codeman, data); + mono_jit_unlock (); + return dest; +} + +void +mono_global_codeman_commit (void *data, int size, int newsize) +{ + mono_jit_lock (); + mono_code_manager_commit (global_codeman, data, size, newsize); + mono_jit_unlock (); +} + +/* + * Convenience function which calls mono_global_codeman_commit to validate and + * copy the code. The caller sets *buf_base and *buf_size to the start and size + * of the buffer (allocated by mono_global_codeman_reserve), and *code_end to + * the byte after the last instruction byte. 
On return, *buf_base will point to + * the start of the copied in the code segment, and *code_end will point after + * the end of the copied code. + */ +void +nacl_global_codeman_validate (guint8 **buf_base, int buf_size, guint8 **code_end) +{ + guint8 *tmp = nacl_global_codeman_get_dest (*buf_base); + mono_global_codeman_commit (*buf_base, buf_size, *code_end - *buf_base); + *code_end = tmp + (*code_end - *buf_base); + *buf_base = tmp; +} +#else +/* no-op versions of Native Client functions */ +void* +nacl_global_codeman_get_dest (void *data) +{ + return data; +} + +void +mono_global_codeman_commit (void *data, int size, int newsize) +{ +} + +void +nacl_global_codeman_validate (guint8 **buf_base, int buf_size, guint8 **code_end) +{ +} + +#endif /* __native_client__ */ + /** * mono_create_unwind_op: * @@ -1684,7 +1773,7 @@ mono_allocate_stack_slots_full2 (MonoCompile *cfg, gboolean backward, guint32 *s case MONO_TYPE_PTR: case MONO_TYPE_I: case MONO_TYPE_U: -#if SIZEOF_REGISTER == 4 +#if SIZEOF_VOID_P == 4 case MONO_TYPE_I4: #else case MONO_TYPE_I8: @@ -1918,7 +2007,7 @@ mono_allocate_stack_slots_full (MonoCompile *cfg, gboolean backward, guint32 *st vars = mono_varlist_sort (cfg, vars, 0); offset = 0; - *stack_align = sizeof (gpointer); + *stack_align = sizeof(mgreg_t); for (l = vars; l; l = l->next) { vmv = l->data; inst = cfg->varinfo [vmv->idx]; @@ -1973,7 +2062,7 @@ mono_allocate_stack_slots_full (MonoCompile *cfg, gboolean backward, guint32 *st case MONO_TYPE_PTR: case MONO_TYPE_I: case MONO_TYPE_U: -#if SIZEOF_REGISTER == 4 +#if SIZEOF_VOID_P == 4 case MONO_TYPE_I4: #else case MONO_TYPE_I8: @@ -2277,6 +2366,8 @@ mono_bblock_insert_before_ins (MonoBasicBlock *bb, MonoInst *ins, MonoInst *ins_ { if (ins == NULL) { ins = bb->code; + if (ins) + ins->prev = ins_to_insert; bb->code = ins_to_insert; ins_to_insert->next = ins; if (bb->last_ins == NULL) @@ -2859,7 +2950,13 @@ mono_resolve_patch_target (MonoMethod *method, MonoDomain *domain, guint8 *code, target = patch_info->data.inst->inst_c0 + code; break; case MONO_PATCH_INFO_IP: +#if defined(__native_client__) && defined(__native_client_codegen__) + /* Need to transform to the destination address, it's */ + /* emitted as an immediate in the code. */ + target = nacl_inverse_modify_patch_target(ip); +#else target = ip; +#endif break; case MONO_PATCH_INFO_METHOD_REL: target = code + patch_info->data.offset; @@ -2875,6 +2972,13 @@ mono_resolve_patch_target (MonoMethod *method, MonoDomain *domain, guint8 *code, } case MONO_PATCH_INFO_METHOD_JUMP: target = mono_create_jump_trampoline (domain, patch_info->data.method, FALSE); +#if defined(__native_client__) && defined(__native_client_codegen__) +#if defined(TARGET_AMD64) + /* This target is an absolute address, not relative to the */ + /* current code being emitted on AMD64. 
*/ + target = nacl_inverse_modify_patch_target(target); +#endif +#endif break; case MONO_PATCH_INFO_METHOD: if (patch_info->data.method == method) { @@ -2888,6 +2992,11 @@ mono_resolve_patch_target (MonoMethod *method, MonoDomain *domain, guint8 *code, gpointer *jump_table; int i; +#if defined(__native_client__) && defined(__native_client_codegen__) + /* This memory will leak, but we don't care if we're */ + /* not deleting JIT'd methods anyway */ + jump_table = g_malloc0 (sizeof(gpointer) * patch_info->data.table->table_size); +#else if (method && method->dynamic) { jump_table = mono_code_manager_reserve (mono_dynamic_code_hash_lookup (domain, method)->code_mp, sizeof (gpointer) * patch_info->data.table->table_size); } else { @@ -2897,10 +3006,27 @@ mono_resolve_patch_target (MonoMethod *method, MonoDomain *domain, guint8 *code, jump_table = mono_domain_code_reserve (domain, sizeof (gpointer) * patch_info->data.table->table_size); } } +#endif - for (i = 0; i < patch_info->data.table->table_size; i++) + for (i = 0; i < patch_info->data.table->table_size; i++) { +#if defined(__native_client__) && defined(__native_client_codegen__) + /* 'code' is relative to the current code blob, we */ + /* need to do this transform on it to make the */ + /* pointers in this table absolute */ + jump_table [i] = nacl_inverse_modify_patch_target (code) + GPOINTER_TO_INT (patch_info->data.table->table [i]); +#else jump_table [i] = code + GPOINTER_TO_INT (patch_info->data.table->table [i]); +#endif + } + +#if defined(__native_client__) && defined(__native_client_codegen__) + /* jump_table is in the data section, we need to transform */ + /* it here so when it gets modified in amd64_patch it will */ + /* then point back to the absolute data address */ + target = nacl_inverse_modify_patch_target (jump_table); +#else target = jump_table; +#endif break; } case MONO_PATCH_INFO_METHODCONST: @@ -3246,11 +3372,18 @@ mono_postprocess_patches (MonoCompile *cfg) } case MONO_PATCH_INFO_SWITCH: { gpointer *table; +#if defined(__native_client__) && defined(__native_client_codegen__) + /* This memory will leak. */ + /* TODO: can we free this when */ + /* making the final jump table? */ + table = g_malloc0 (sizeof(gpointer) * patch_info->data.table->table_size); +#else if (cfg->method->dynamic) { table = mono_code_manager_reserve (cfg->dynamic_info->code_mp, sizeof (gpointer) * patch_info->data.table->table_size); } else { table = mono_domain_code_reserve (cfg->domain, sizeof (gpointer) * patch_info->data.table->table_size); } +#endif for (i = 0; i < patch_info->data.table->table_size; i++) { /* Might be NULL if the switch is eliminated */ @@ -3268,6 +3401,12 @@ mono_postprocess_patches (MonoCompile *cfg) GSList *list; MonoDomain *domain = cfg->domain; unsigned char *ip = cfg->native_code + patch_info->ip.i; +#if defined(__native_client__) && defined(__native_client_codegen__) + /* When this jump target gets evaluated, the method */ + /* will be installed in the dynamic code section, */ + /* not at the location of cfg->native_code. */ + ip = nacl_inverse_modify_patch_target (cfg->native_code) + patch_info->ip.i; +#endif mono_domain_lock (domain); if (!domain_jit_info (domain)->jump_target_hash) @@ -3407,6 +3546,15 @@ mono_codegen (MonoCompile *cfg) int max_epilog_size; guint8 *code; +#if defined(__native_client_codegen__) && defined(__native_client__) + void *code_dest; + + /* This keeps patch targets from being transformed during + * ordinary method compilation, for local branches and jumps. 
+ */ + nacl_allow_target_modification (FALSE); +#endif + for (bb = cfg->bb_entry; bb; bb = bb->next_bb) { cfg->spill_count = 0; /* we reuse dfn here */ @@ -3459,6 +3607,9 @@ mono_codegen (MonoCompile *cfg) } } +#ifdef __native_client_codegen__ + mono_nacl_fix_patches (cfg->native_code, cfg->patch_info); +#endif mono_arch_emit_exceptions (cfg); max_epilog_size = 0; @@ -3489,9 +3640,14 @@ mono_codegen (MonoCompile *cfg) #endif code = mono_domain_code_reserve (cfg->domain, cfg->code_size + unwindlen); } +#if defined(__native_client_codegen__) && defined(__native_client__) + nacl_allow_target_modification (TRUE); +#endif memcpy (code, cfg->native_code, cfg->code_len); -#ifdef __native_client_codegen__ +#if defined(__default_codegen__) + g_free (cfg->native_code); +#elif defined(__native_client_codegen__) if (cfg->native_code_alloc) { g_free (cfg->native_code_alloc); cfg->native_code_alloc = 0; @@ -3499,9 +3655,7 @@ mono_codegen (MonoCompile *cfg) else if (cfg->native_code) { g_free (cfg->native_code); } -#else - g_free (cfg->native_code); -#endif +#endif /* __native_client_codegen__ */ cfg->native_code = code; code = cfg->native_code + cfg->code_len; @@ -3539,8 +3693,18 @@ if (valgrind_register){ #ifdef MONO_ARCH_HAVE_SAVE_UNWIND_INFO mono_arch_save_unwind_info (cfg); #endif - -#ifdef __native_client_codegen__ + +#if defined(__native_client_codegen__) && defined(__native_client__) + if (!cfg->compile_aot) { + if (cfg->method->dynamic) { + code_dest = nacl_code_manager_get_code_dest(cfg->dynamic_info->code_mp, cfg->native_code); + } else { + code_dest = nacl_domain_get_code_dest(cfg->domain, cfg->native_code); + } + } +#endif + +#if defined(__native_client_codegen__) mono_nacl_fix_patches (cfg->native_code, cfg->patch_info); #endif @@ -3551,6 +3715,9 @@ if (valgrind_register){ } else { mono_domain_code_commit (cfg->domain, cfg->native_code, cfg->code_size, cfg->code_len); } +#if defined(__native_client_codegen__) && defined(__native_client__) + cfg->native_code = code_dest; +#endif mono_profiler_code_buffer_new (cfg->native_code, cfg->code_len, MONO_PROFILER_CODE_BUFFER_METHOD, cfg->method); mono_arch_flush_icache (cfg->native_code, cfg->code_len); @@ -6149,6 +6316,9 @@ mini_init (const char *filename, const char *runtime_version) register_icall (mono_load_remote_field_new, "mono_load_remote_field_new", "object object ptr ptr", FALSE); register_icall (mono_store_remote_field_new, "mono_store_remote_field_new", "void object ptr ptr object", FALSE); +#if defined(__native_client__) || defined(__native_client_codegen__) + register_icall (mono_nacl_gc, "mono_nacl_gc", "void", TRUE); +#endif /* * NOTE, NOTE, NOTE, NOTE: * when adding emulation for some opcodes, remember to also add a dummy @@ -6219,7 +6389,11 @@ mini_init (const char *filename, const char *runtime_version) mono_register_opcode_emulation (OP_LCONV_TO_R_UN, "__emul_lconv_to_r8_un", "double long", mono_lconv_to_r8_un, FALSE); #endif #ifdef MONO_ARCH_EMULATE_FREM +#if defined(__default_codegen__) mono_register_opcode_emulation (OP_FREM, "__emul_frem", "double double double", fmod, FALSE); +#elif defined(__native_client_codegen__) + mono_register_opcode_emulation (OP_FREM, "__emul_frem", "double double double", mono_fmod, FALSE); +#endif #endif #ifdef MONO_ARCH_SOFT_FLOAT diff --git a/mono/mini/mini.h b/mono/mini/mini.h index 5901ab58450..33764f2bf5d 100644 --- a/mono/mini/mini.h +++ b/mono/mini/mini.h @@ -1490,7 +1490,7 @@ enum { #endif /* Opcodes to load/store regsize quantities */ -#ifdef __mono_ilp32__ +#if defined (__mono_ilp32__) 
#define OP_LOADR_MEMBASE OP_LOADI8_MEMBASE #define OP_STORER_MEMBASE_REG OP_STOREI8_MEMBASE_REG #else @@ -1776,6 +1776,7 @@ void mono_linterval_split (MonoCompile *cfg, MonoLiveInterval void mono_liveness_handle_exception_clauses (MonoCompile *cfg) MONO_INTERNAL; /* Native Client functions */ +gpointer mono_realloc_native_code(MonoCompile *cfg); #ifdef __native_client_codegen__ void mono_nacl_align_inst(guint8 **pcode, int instlen); void mono_nacl_align_call(guint8 **start, guint8 **pcode); @@ -1786,6 +1787,18 @@ void mono_nacl_fix_patches(const guint8 *code, MonoJumpInfo *ji); guint8 *mono_arch_nacl_pad(guint8 *code, int pad); guint8 *mono_arch_nacl_skip_nops(guint8 *code); +extern const guint kNaClAlignment; +extern const guint kNaClAlignmentMask; +#endif + +#if defined(__native_client__) || defined(__native_client_codegen__) +void mono_nacl_gc(); +#endif + +#if defined(__native_client_codegen__) || defined(__native_client__) +#define NACL_SIZE(a, b) (b) +#else +#define NACL_SIZE(a, b) (a) #endif /* AOT */ @@ -1894,6 +1907,9 @@ char* mono_get_rgctx_fetch_trampoline_name (int slot) MONO_INTERNAL; gboolean mono_running_on_valgrind (void) MONO_INTERNAL; void* mono_global_codeman_reserve (int size) MONO_INTERNAL; +void* nacl_global_codeman_get_dest(void *data) MONO_INTERNAL; +void mono_global_codeman_commit(void *data, int size, int newsize) MONO_INTERNAL; +void nacl_global_codeman_validate(guint8 **buf_base, int buf_size, guint8 **code_end) MONO_INTERNAL; const char *mono_regname_full (int reg, int bank) MONO_INTERNAL; gint32* mono_allocate_stack_slots_full (MonoCompile *cfg, gboolean backward, guint32 *stack_size, guint32 *stack_align) MONO_INTERNAL; gint32* mono_allocate_stack_slots (MonoCompile *cfg, guint32 *stack_size, guint32 *stack_align) MONO_INTERNAL; diff --git a/mono/mini/nacl.cs b/mono/mini/nacl.cs new file mode 100644 index 00000000000..24cd2c5cc82 --- /dev/null +++ b/mono/mini/nacl.cs @@ -0,0 +1,67 @@ +using System; +using Mono.Simd; + +class Tests { + struct myvt { + public int X; + public int Y; + } + + static int test_0_vector4i_cmp_gt () { + Vector4i a = new Vector4i (10, 5, 12, -1); + Vector4i b = new Vector4i (-1, 5, 10, 10); + + Vector4i c = a.CompareGreaterThan (b); + + if (c.X != -1) + return 1; + if (c.Y != 0) + return 2; + if (c.Z != -1) + return 3; + if (c.W != 0) + return 4; + return 0; + } + + static myvt CompareGT(myvt a, myvt b) { + myvt r; + r.X = a.X > b.X ? -1 : 0; + r.Y = a.Y > b.Y ? 
-1 : 0; + return r; + } + + static int test_0_struct2i_cmp_gt() { + myvt a; + myvt b; + a.X = 10; + a.Y = 5; + b.X = -1; + b.Y = 5; + myvt c = CompareGT(a, b); + if (c.X != -1) + return 1; + if (c.Y != 0) + return 2; + return 0; + } + + static int vararg_sum(params int[] args) { + int sum = 0; + foreach(int arg in args) { + sum += arg; + } + return sum; + } + static int test_21_vararg_test() { + int sum = 0; + sum += vararg_sum(); + sum += vararg_sum(1); + sum += vararg_sum(2, 3); + sum += vararg_sum(4, 5, 6); + return sum; + } + public static int Main(String[] args) { + return TestDriver.RunTests(typeof(Tests)); + } +} diff --git a/mono/mini/tramp-amd64.c b/mono/mini/tramp-amd64.c index cd42591d876..f576af9496c 100644 --- a/mono/mini/tramp-amd64.c +++ b/mono/mini/tramp-amd64.c @@ -25,6 +25,11 @@ #include "mini.h" #include "mini-amd64.h" +#if defined(__native_client_codegen__) && defined(__native_client__) +#include +#include +#endif + #define IS_REX(inst) (((inst) >= 0x40) && ((inst) <= 0x4f)) static guint8* nullified_class_init_trampoline; @@ -56,6 +61,8 @@ mono_arch_get_unbox_trampoline (MonoMethod *m, gpointer addr) amd64_jump_reg (code, AMD64_RAX); g_assert ((code - start) < 20); + nacl_domain_code_validate (domain, &start, 20, &code); + mono_arch_flush_icache (start, code - start); return start; @@ -90,6 +97,7 @@ mono_arch_get_static_rgctx_trampoline (MonoMethod *m, MonoMethodRuntimeGenericCo amd64_jump_code (code, addr); g_assert ((code - start) < buf_len); + nacl_domain_code_validate (domain, &start, buf_len, &code); mono_arch_flush_icache (start, code - start); return start; @@ -117,6 +125,8 @@ mono_arch_get_llvm_imt_trampoline (MonoDomain *domain, MonoMethod *m, int vt_off g_assert ((code - start) < buf_len); + nacl_domain_code_validate (domain, &start, buf_len, &code); + mono_arch_flush_icache (start, code - start); return start; @@ -131,12 +141,14 @@ mono_arch_get_llvm_imt_trampoline (MonoDomain *domain, MonoMethod *m, int vt_off void mono_arch_patch_callsite (guint8 *method_start, guint8 *orig_code, guint8 *addr) { +#if defined(__default_codegen__) guint8 *code; guint8 buf [16]; gboolean can_write = mono_breakpoint_clean_code (method_start, orig_code, 14, buf, sizeof (buf)); code = buf + 14; + /* mov 64-bit imm into r11 (followed by call reg?) or direct call*/ if (((code [-13] == 0x49) && (code [-12] == 0xbb)) || (code [-5] == 0xe8)) { if (code [-5] != 0xe8) { if (can_write) { @@ -184,6 +196,38 @@ mono_arch_patch_callsite (guint8 *method_start, guint8 *orig_code, guint8 *addr) VALGRIND_DISCARD_TRANSLATIONS (orig_code - 5, sizeof (gpointer)); } } +#elif defined(__native_client__) + /* These are essentially the same 2 cases as above, modified for NaCl*/ + + /* Target must be bundle-aligned */ + g_assert (((guint32)addr & kNaClAlignmentMask) == 0); + /* Return target must be bundle-aligned */ + g_assert (((guint32)orig_code & kNaClAlignmentMask) == 0); + + if (orig_code[-5] == 0xe8) { + /* Direct call */ + int ret; + gint32 offset = (gint32)addr - (gint32)orig_code; + guint8 buf[sizeof(gint32)]; + *((gint32*)(buf)) = offset; + ret = nacl_dyncode_modify (orig_code - sizeof(gint32), buf, sizeof(gint32)); + g_assert (ret == 0); + } + + else if (is_nacl_call_reg_sequence (orig_code - 10) && orig_code[-16] == 0x41 && orig_code[-15] == 0xbb) { + int ret; + guint8 buf[sizeof(gint32)]; + *((gint32 *)(buf)) = addr; + /* orig_code[-14] is the start of the immediate. 
*/ + ret = nacl_dyncode_modify (orig_code - 14, buf, sizeof(gint32)); + g_assert (ret == 0); + } + else { + g_assert_not_reached (); + } + + return; +#endif } void @@ -192,6 +236,7 @@ mono_arch_patch_plt_entry (guint8 *code, gpointer *got, mgreg_t *regs, guint8 *a gint32 disp; gpointer *plt_jump_table_entry; +#if defined(__default_codegen__) /* A PLT entry: jmp *(%rip) */ g_assert (code [0] == 0xff); g_assert (code [1] == 0x25); @@ -199,6 +244,24 @@ mono_arch_patch_plt_entry (guint8 *code, gpointer *got, mgreg_t *regs, guint8 *a disp = *(gint32*)(code + 2); plt_jump_table_entry = (gpointer*)(code + 6 + disp); +#elif defined(__native_client_codegen__) + /* A PLT entry: */ + /* mov (%rip), %r11d */ + /* nacljmp *%r11 */ + + /* Verify the 'mov' */ + g_assert (code [0] == 0x45); + g_assert (code [1] == 0x8b); + g_assert (code [2] == 0x1d); + + disp = *(gint32*)(code + 3); + + /* 7 = 3 (mov opcode) + 4 (disp) */ + /* This needs to resolve to the target of the RIP-relative offset */ + plt_jump_table_entry = (gpointer*)(code + 7 + disp); + +#endif /* __native_client_codegen__ */ + InterlockedExchangePointer (plt_jump_table_entry, addr); } @@ -321,22 +384,25 @@ guchar* mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInfo **info, gboolean aot) { guint8 *buf, *code, *tramp, *br [2], *r11_save_code, *after_r11_save_code; - int i, lmf_offset, offset, res_offset, arg_offset, rax_offset, tramp_offset; - int buf_len, saved_regs_offset; + int i, lmf_offset, offset, res_offset, arg_offset, rax_offset, tramp_offset, saved_regs_offset; int saved_fpregs_offset, rbp_offset, framesize, orig_rsp_to_rbp_offset, cfa_offset; gboolean has_caller; GSList *unwind_ops = NULL; MonoJumpInfo *ji = NULL; + const guint kMaxCodeSize = NACL_SIZE (548, 548*2); + +#if defined(__native_client_codegen__) + const guint kNaClTrampOffset = 17; +#endif if (tramp_type == MONO_TRAMPOLINE_JUMP) has_caller = FALSE; else has_caller = TRUE; - buf_len = 548; - code = buf = mono_global_codeman_reserve (buf_len); + code = buf = mono_global_codeman_reserve (kMaxCodeSize); - framesize = 538 + sizeof (MonoLMF); + framesize = kMaxCodeSize + sizeof (MonoLMF); framesize = (framesize + (MONO_ARCH_FRAME_ALIGNMENT - 1)) & ~ (MONO_ARCH_FRAME_ALIGNMENT - 1); orig_rsp_to_rbp_offset = 0; @@ -353,72 +419,76 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf /* Pop the return address off the stack */ amd64_pop_reg (code, AMD64_R11); - orig_rsp_to_rbp_offset += 8; + orig_rsp_to_rbp_offset += sizeof(mgreg_t); - cfa_offset -= 8; + cfa_offset -= sizeof(mgreg_t); mono_add_unwind_op_def_cfa_offset (unwind_ops, code, buf, cfa_offset); /* * Allocate a new stack frame */ amd64_push_reg (code, AMD64_RBP); - cfa_offset += 8; + cfa_offset += sizeof(mgreg_t); mono_add_unwind_op_def_cfa_offset (unwind_ops, code, buf, cfa_offset); mono_add_unwind_op_offset (unwind_ops, code, buf, AMD64_RBP, - cfa_offset); - orig_rsp_to_rbp_offset -= 8; - amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, 8); + orig_rsp_to_rbp_offset -= sizeof(mgreg_t); + amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof(mgreg_t)); mono_add_unwind_op_def_cfa_reg (unwind_ops, code, buf, AMD64_RBP); amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, framesize); offset = 0; rbp_offset = - offset; - offset += 8; + offset += sizeof(mgreg_t); rax_offset = - offset; - offset += 8; + offset += sizeof(mgreg_t); tramp_offset = - offset; - offset += 8; + offset += sizeof(gpointer); arg_offset = - offset; /* Compute the trampoline address from the return address 
*/ if (aot) { +#if defined(__default_codegen__) /* 7 = length of call *(rip) */ amd64_alu_reg_imm (code, X86_SUB, AMD64_R11, 7); +#elif defined(__native_client_codegen__) + amd64_alu_reg_imm (code, X86_SUB, AMD64_R11, kNaClTrampOffset); +#endif } else { /* 5 = length of amd64_call_membase () */ amd64_alu_reg_imm (code, X86_SUB, AMD64_R11, 5); } - amd64_mov_membase_reg (code, AMD64_RBP, tramp_offset, AMD64_R11, 8); + amd64_mov_membase_reg (code, AMD64_RBP, tramp_offset, AMD64_R11, sizeof(gpointer)); - offset += 8; + offset += sizeof(mgreg_t); res_offset = - offset; /* Save all registers */ - offset += AMD64_NREG * 8; + offset += AMD64_NREG * sizeof(mgreg_t); saved_regs_offset = - offset; for (i = 0; i < AMD64_NREG; ++i) { if (i == AMD64_RBP) { /* RAX is already saved */ - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RBP, rbp_offset, 8); - amd64_mov_membase_reg (code, AMD64_RBP, saved_regs_offset + (i * 8), AMD64_RAX, 8); + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RBP, rbp_offset, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, saved_regs_offset + (i * sizeof(mgreg_t)), AMD64_RAX, sizeof(mgreg_t)); } else if (i != AMD64_R11) { - amd64_mov_membase_reg (code, AMD64_RBP, saved_regs_offset + (i * 8), i, 8); + amd64_mov_membase_reg (code, AMD64_RBP, saved_regs_offset + (i * sizeof(mgreg_t)), i, sizeof(mgreg_t)); } else { /* We have to save R11 right at the start of the trampoline code because it's used as a scratch register */ - amd64_mov_membase_reg (r11_save_code, AMD64_RSP, saved_regs_offset + orig_rsp_to_rbp_offset + (i * 8), i, 8); + amd64_mov_membase_reg (r11_save_code, AMD64_RSP, saved_regs_offset + orig_rsp_to_rbp_offset + (i * sizeof(mgreg_t)), i, sizeof(mgreg_t)); g_assert (r11_save_code == after_r11_save_code); } } - offset += 8 * 8; + offset += 8 * sizeof(mgreg_t); saved_fpregs_offset = - offset; for (i = 0; i < 8; ++i) - amd64_movsd_membase_reg (code, AMD64_RBP, saved_fpregs_offset + (i * 8), i); + amd64_movsd_membase_reg (code, AMD64_RBP, saved_fpregs_offset + (i * sizeof(mgreg_t)), i); if (tramp_type != MONO_TRAMPOLINE_GENERIC_CLASS_INIT && tramp_type != MONO_TRAMPOLINE_MONITOR_ENTER && @@ -426,14 +496,21 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf /* Obtain the trampoline argument which is encoded in the instruction stream */ if (aot) { /* Load the GOT offset */ - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, tramp_offset, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, tramp_offset, sizeof(gpointer)); +#if defined(__default_codegen__) amd64_mov_reg_membase (code, AMD64_RAX, AMD64_R11, 7, 4); +#elif defined(__native_client_codegen__) + /* The arg is hidden in a "push imm32" instruction, */ + /* add one to skip the opcode. 
*/ + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_R11, kNaClTrampOffset+1, 4); +#endif /* Compute the address of the GOT slot */ - amd64_alu_reg_reg_size (code, X86_ADD, AMD64_R11, AMD64_RAX, 8); + amd64_alu_reg_reg_size (code, X86_ADD, AMD64_R11, AMD64_RAX, sizeof(gpointer)); /* Load the value */ - amd64_mov_reg_membase (code, AMD64_R11, AMD64_R11, 0, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_R11, 0, sizeof(gpointer)); } else { - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, tramp_offset, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, tramp_offset, sizeof(gpointer)); +#if defined(__default_codegen__) amd64_mov_reg_membase (code, AMD64_RAX, AMD64_R11, 5, 1); amd64_widen_reg (code, AMD64_RAX, AMD64_RAX, TRUE, FALSE); amd64_alu_reg_imm_size (code, X86_CMP, AMD64_RAX, 4, 1); @@ -447,11 +524,15 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf mono_amd64_patch (br [0], code); amd64_mov_reg_membase (code, AMD64_R11, AMD64_R11, 6, 8); mono_amd64_patch (br [1], code); +#elif defined(__native_client_codegen__) + /* All args are 32-bit pointers in NaCl */ + amd64_mov_reg_membase (code, AMD64_R11, AMD64_R11, 6, 4); +#endif } - amd64_mov_membase_reg (code, AMD64_RBP, arg_offset, AMD64_R11, 8); + amd64_mov_membase_reg (code, AMD64_RBP, arg_offset, AMD64_R11, sizeof(gpointer)); } else { - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, saved_regs_offset + (MONO_AMD64_ARG_REG1 * 8), 8); - amd64_mov_membase_reg (code, AMD64_RBP, arg_offset, AMD64_R11, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, saved_regs_offset + (MONO_AMD64_ARG_REG1 * sizeof(mgreg_t)), sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, arg_offset, AMD64_R11, sizeof(gpointer)); } /* Save LMF begin */ @@ -461,34 +542,34 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf /* Save ip */ if (has_caller) - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, 8, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, 8, sizeof(gpointer)); else amd64_mov_reg_imm (code, AMD64_R11, 0); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), AMD64_R11, 8); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rip), AMD64_R11, sizeof(mgreg_t)); /* Save fp */ - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RSP, framesize, 8); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbp), AMD64_R11, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RSP, framesize, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbp), AMD64_R11, sizeof(mgreg_t)); /* Save sp */ - amd64_mov_reg_reg (code, AMD64_R11, AMD64_RSP, 8); + amd64_mov_reg_reg (code, AMD64_R11, AMD64_RSP, sizeof(mgreg_t)); amd64_alu_reg_imm (code, X86_ADD, AMD64_R11, framesize + 16); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_R11, 8); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_R11, sizeof(mgreg_t)); /* Save method */ if (tramp_type == MONO_TRAMPOLINE_JIT || tramp_type == MONO_TRAMPOLINE_JUMP) { - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, arg_offset, 8); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, arg_offset, sizeof(gpointer)); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), AMD64_R11, 
sizeof(gpointer)); } else { - amd64_mov_membase_imm (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), 0, 8); + amd64_mov_membase_imm (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), 0, sizeof(gpointer)); } /* Save callee saved regs */ #ifdef TARGET_WIN32 - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rdi), AMD64_RDI, 8); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsi), AMD64_RSI, 8); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rdi), AMD64_RDI, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsi), AMD64_RSI, sizeof(mgreg_t)); #endif - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, sizeof(mgreg_t)); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, sizeof(mgreg_t)); if (aot) { code = mono_arch_emit_load_aotconst (buf, code, &ji, MONO_PATCH_INFO_JIT_ICALL_ADDR, "mono_get_lmf_addr"); @@ -498,15 +579,15 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf amd64_call_reg (code, AMD64_R11); /* Save lmf_addr */ - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, 8); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), AMD64_RAX, sizeof(gpointer)); /* Save previous_lmf */ /* Set the lowest bit to 1 to signal that this LMF has the ip field set */ - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, 8); - amd64_alu_reg_imm_size (code, X86_ADD, AMD64_R11, 1, 8); - amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, 8); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RAX, 0, sizeof(gpointer)); + amd64_alu_reg_imm_size (code, X86_ADD, AMD64_R11, 1, sizeof(gpointer)); + amd64_mov_membase_reg (code, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), AMD64_R11, sizeof(gpointer)); /* Set new lmf */ amd64_lea_membase (code, AMD64_R11, AMD64_RBP, lmf_offset); - amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, 8); + amd64_mov_membase_reg (code, AMD64_RAX, 0, AMD64_R11, sizeof(gpointer)); /* Save LMF end */ @@ -515,15 +596,15 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf /* Arg2 is the address of the calling code */ if (has_caller) - amd64_mov_reg_membase (code, AMD64_ARG_REG2, AMD64_RBP, 8, 8); + amd64_mov_reg_membase (code, AMD64_ARG_REG2, AMD64_RBP, 8, sizeof(gpointer)); else amd64_mov_reg_imm (code, AMD64_ARG_REG2, 0); /* Arg3 is the method/vtable ptr 
*/ - amd64_mov_reg_membase (code, AMD64_ARG_REG3, AMD64_RBP, arg_offset, 8); + amd64_mov_reg_membase (code, AMD64_ARG_REG3, AMD64_RBP, arg_offset, sizeof(gpointer)); /* Arg4 is the trampoline address */ - amd64_mov_reg_membase (code, AMD64_ARG_REG4, AMD64_RBP, tramp_offset, 8); + amd64_mov_reg_membase (code, AMD64_ARG_REG4, AMD64_RBP, tramp_offset, sizeof(gpointer)); if (aot) { char *icall_name = g_strdup_printf ("trampoline_func_%d", tramp_type); @@ -539,7 +620,7 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf /* * Have to call the _force_ variant, since there could be a protected wrapper on the top of the stack. */ - amd64_mov_membase_reg (code, AMD64_RBP, res_offset, AMD64_RAX, 8); + amd64_mov_membase_reg (code, AMD64_RBP, res_offset, AMD64_RAX, sizeof(mgreg_t)); if (aot) { code = mono_arch_emit_load_aotconst (buf, code, &ji, MONO_PATCH_INFO_JIT_ICALL_ADDR, "mono_thread_force_interruption_checkpoint"); } else { @@ -547,43 +628,45 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf } amd64_call_reg (code, AMD64_R11); - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RBP, res_offset, 8); + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RBP, res_offset, sizeof(mgreg_t)); /* Restore LMF */ - amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 8); - amd64_alu_reg_imm_size (code, X86_SUB, AMD64_RCX, 1, 8); - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 8); - amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, 8); + amd64_mov_reg_membase (code, AMD64_RCX, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), sizeof(gpointer)); + amd64_alu_reg_imm_size (code, X86_SUB, AMD64_RCX, 1, sizeof(gpointer)); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), sizeof(gpointer)); + amd64_mov_membase_reg (code, AMD64_R11, 0, AMD64_RCX, sizeof(gpointer)); /* * Save rax to the stack, after the leave instruction, this will become part of * the red zone. */ - amd64_mov_membase_reg (code, AMD64_RBP, rax_offset, AMD64_RAX, 8); + amd64_mov_membase_reg (code, AMD64_RBP, rax_offset, AMD64_RAX, sizeof(mgreg_t)); /* Restore argument registers, r10 (imt method/rgxtx) and rax (needed for direct calls to C vararg functions). 
*/ for (i = 0; i < AMD64_NREG; ++i) if (AMD64_IS_ARGUMENT_REG (i) || i == AMD64_R10 || i == AMD64_RAX) - amd64_mov_reg_membase (code, i, AMD64_RBP, saved_regs_offset + (i * 8), 8); + amd64_mov_reg_membase (code, i, AMD64_RBP, saved_regs_offset + (i * sizeof(mgreg_t)), sizeof(mgreg_t)); for (i = 0; i < 8; ++i) - amd64_movsd_reg_membase (code, i, AMD64_RBP, saved_fpregs_offset + (i * 8)); + amd64_movsd_reg_membase (code, i, AMD64_RBP, saved_fpregs_offset + (i * sizeof(mgreg_t))); /* Restore stack */ amd64_leave (code); if (MONO_TRAMPOLINE_TYPE_MUST_RETURN (tramp_type)) { /* Load result */ - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, rax_offset - 0x8, 8); + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, rax_offset - sizeof(mgreg_t), sizeof(mgreg_t)); amd64_ret (code); } else { /* call the compiled method using the saved rax */ - amd64_jump_membase (code, AMD64_RSP, rax_offset - 0x8); + amd64_jump_membase (code, AMD64_RSP, rax_offset - sizeof(mgreg_t)); } - g_assert ((code - buf) <= buf_len); + g_assert ((code - buf) <= kMaxCodeSize); + + nacl_global_codeman_validate (&buf, kMaxCodeSize, &code); mono_arch_flush_icache (buf, code - buf); @@ -606,6 +689,8 @@ mono_arch_get_nullified_class_init_trampoline (MonoTrampInfo **info) code = buf = mono_global_codeman_reserve (16); amd64_ret (code); + nacl_global_codeman_validate(&buf, 16, &code); + mono_arch_flush_icache (buf, code - buf); if (info) @@ -625,15 +710,25 @@ mono_arch_create_specific_trampoline (gpointer arg1, MonoTrampolineType tramp_ty tramp = mono_get_trampoline_code (tramp_type); +#if defined(__default_codegen__) if ((((guint64)arg1) >> 32) == 0) size = 5 + 1 + 4; else size = 5 + 1 + 8; code = buf = mono_domain_code_reserve_align (domain, size, 1); +#elif defined(__native_client_codegen__) + size = 5 + 1 + 4; + /* Aligning the call site below could */ + /* add up to kNaClAlignment-1 bytes */ + size += (kNaClAlignment-1); + buf = mono_domain_code_reserve_align (domain, size, kNaClAlignment); + code = buf; +#endif amd64_call_code (code, tramp); /* The trampoline code will obtain the argument from the instruction stream */ +#if defined(__default_codegen__) if ((((guint64)arg1) >> 32) == 0) { *code = 0x4; *(guint32*)(code + 1) = (gint64)arg1; @@ -643,12 +738,20 @@ mono_arch_create_specific_trampoline (gpointer arg1, MonoTrampolineType tramp_ty *(guint64*)(code + 1) = (gint64)arg1; code += 9; } +#elif defined(__native_client_codegen__) + /* For NaCl, all tramp args are 32-bit because they're pointers */ + *code = 0x68; /* push imm32 */ + *(guint32*)(code + 1) = (gint32)arg1; + code += 5; +#endif g_assert ((code - buf) <= size); if (code_len) *code_len = size; + nacl_domain_code_validate(domain, &buf, size, &code); + mono_arch_flush_icache (buf, size); return buf; @@ -679,7 +782,7 @@ mono_arch_create_rgctx_lazy_fetch_trampoline (guint32 slot, MonoTrampInfo **info index -= size - 1; } - tramp_size = 64 + 8 * depth; + tramp_size = NACL_SIZE (64 + 8 * depth, 128 + 8 * depth); code = buf = mono_global_codeman_reserve (tramp_size); @@ -692,7 +795,7 @@ mono_arch_create_rgctx_lazy_fetch_trampoline (guint32 slot, MonoTrampInfo **info amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8); } else { /* load rgctx ptr from vtable */ - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoVTable, runtime_generic_context), 8); + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoVTable, runtime_generic_context), sizeof(gpointer)); /* is the rgctx ptr null? 
*/ amd64_test_reg_reg (code, AMD64_RAX, AMD64_RAX); /* if yes, jump to actual trampoline */ @@ -703,9 +806,9 @@ mono_arch_create_rgctx_lazy_fetch_trampoline (guint32 slot, MonoTrampInfo **info for (i = 0; i < depth; ++i) { /* load ptr to next array */ if (mrgctx && i == 0) - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RAX, MONO_SIZEOF_METHOD_RUNTIME_GENERIC_CONTEXT, 8); + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RAX, MONO_SIZEOF_METHOD_RUNTIME_GENERIC_CONTEXT, sizeof(gpointer)); else - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RAX, 0, 8); + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RAX, 0, sizeof(gpointer)); /* is the ptr null? */ amd64_test_reg_reg (code, AMD64_RAX, AMD64_RAX); /* if yes, jump to actual trampoline */ @@ -714,7 +817,7 @@ mono_arch_create_rgctx_lazy_fetch_trampoline (guint32 slot, MonoTrampInfo **info } /* fetch slot */ - amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RAX, sizeof (gpointer) * (index + 1), 8); + amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RAX, sizeof (gpointer) * (index + 1), sizeof(gpointer)); /* is the slot null? */ amd64_test_reg_reg (code, AMD64_RAX, AMD64_RAX); /* if yes, jump to actual trampoline */ @@ -724,12 +827,12 @@ mono_arch_create_rgctx_lazy_fetch_trampoline (guint32 slot, MonoTrampInfo **info amd64_ret (code); for (i = mrgctx ? 1 : 0; i <= depth + 1; ++i) - x86_patch (rgctx_null_jumps [i], code); + mono_amd64_patch (rgctx_null_jumps [i], code); g_free (rgctx_null_jumps); /* move the rgctx pointer to the VTABLE register */ - amd64_mov_reg_reg (code, MONO_ARCH_VTABLE_REG, AMD64_ARG_REG1, 8); + amd64_mov_reg_reg (code, MONO_ARCH_VTABLE_REG, AMD64_ARG_REG1, sizeof(gpointer)); if (aot) { code = mono_arch_emit_load_aotconst (buf, code, &ji, MONO_PATCH_INFO_JIT_ICALL_ADDR, g_strdup_printf ("specific_trampoline_lazy_fetch_%u", slot)); @@ -741,6 +844,7 @@ mono_arch_create_rgctx_lazy_fetch_trampoline (guint32 slot, MonoTrampInfo **info amd64_jump_code (code, tramp); } + nacl_global_codeman_validate (&buf, tramp_size, &code); mono_arch_flush_icache (buf, code - buf); g_assert (code - buf <= tramp_size); @@ -788,6 +892,8 @@ mono_arch_create_generic_class_init_trampoline (MonoTrampInfo **info, gboolean a amd64_jump_code (code, tramp); } + nacl_global_codeman_validate (&buf, tramp_size, &code); + mono_arch_flush_icache (buf, code - buf); g_assert (code - buf <= tramp_size); @@ -911,6 +1017,8 @@ mono_arch_create_monitor_enter_trampoline (MonoTrampInfo **info, gboolean aot) amd64_jump_code (code, tramp); } + nacl_global_codeman_validate (&buf, tramp_size, &code); + mono_arch_flush_icache (code, code - buf); g_assert (code - buf <= tramp_size); @@ -1027,6 +1135,8 @@ mono_arch_create_monitor_exit_trampoline (MonoTrampInfo **info, gboolean aot) amd64_jump_code (code, tramp); } + nacl_global_codeman_validate (&buf, tramp_size, &code); + mono_arch_flush_icache (code, code - buf); g_assert (code - buf <= tramp_size); @@ -1118,5 +1228,11 @@ mono_arch_get_call_target (guint8 *code) guint32 mono_arch_get_plt_info_offset (guint8 *plt_entry, mgreg_t *regs, guint8 *code) { +#if defined(__native_client__) || defined(__native_client_codegen__) + /* 18 = 3 (mov opcode) + 4 (disp) + 10 (nacljmp) + 1 (push opcode) */ + /* See aot-compiler.c arch_emit_plt_entry for details. 
*/ + return *(guint32*)(plt_entry + 18); +#else return *(guint32*)(plt_entry + 6); +#endif } diff --git a/mono/mini/tramp-x86.c b/mono/mini/tramp-x86.c index 97ec916d6c8..90b76fdbbe9 100644 --- a/mono/mini/tramp-x86.c +++ b/mono/mini/tramp-x86.c @@ -49,6 +49,8 @@ mono_arch_get_unbox_trampoline (MonoMethod *m, gpointer addr) x86_jump_code (code, addr); g_assert ((code - start) < 16); + nacl_domain_code_validate (domain, &start, 16, &code); + return start; } @@ -68,6 +70,7 @@ mono_arch_get_static_rgctx_trampoline (MonoMethod *m, MonoMethodRuntimeGenericCo x86_jump_code (code, addr); g_assert ((code - start) <= buf_len); + nacl_domain_code_validate (domain, &start, buf_len, &code); mono_arch_flush_icache (start, code - start); return start; @@ -96,6 +99,8 @@ mono_arch_get_llvm_imt_trampoline (MonoDomain *domain, MonoMethod *m, int vt_off g_assert ((code - start) < buf_len); + nacl_domain_code_validate (domain, &start, buf_len, &code); + mono_arch_flush_icache (start, code - start); return start; @@ -104,6 +109,7 @@ mono_arch_get_llvm_imt_trampoline (MonoDomain *domain, MonoMethod *m, int vt_off void mono_arch_patch_callsite (guint8 *method_start, guint8 *orig_code, guint8 *addr) { +#if defined(__default_codegen__) guint8 *code; guint8 buf [8]; gboolean can_write = mono_breakpoint_clean_code (method_start, orig_code, 8, buf, sizeof (buf)); @@ -135,6 +141,23 @@ mono_arch_patch_callsite (guint8 *method_start, guint8 *orig_code, guint8 *addr) code [4], code [5], code [6]); g_assert_not_reached (); } +#elif defined(__native_client__) + /* Target must be bundle-aligned */ + g_assert (((guint32)addr & kNaClAlignmentMask) == 0); + + /* 0xe8 = call , 0xe9 = jump */ + if ((orig_code [-5] == 0xe8) || orig_code [-6] == 0xe9) { + int ret; + gint32 offset = (gint32)addr - (gint32)orig_code; + guint8 buf[sizeof(gint32)]; + *((gint32*)(buf)) = offset; + ret = nacl_dyncode_modify (orig_code - sizeof(gint32), buf, sizeof(gint32)); + g_assert (ret == 0); + } else { + printf ("Invalid trampoline sequence %p: %02x %02x %02x %02x %02x\n", orig_code, orig_code [-5], orig_code [-4], orig_code [-3], orig_code [-2], orig_code[-1]); + g_assert_not_reached (); + } +#endif } void @@ -154,7 +177,7 @@ mono_arch_patch_plt_entry (guint8 *code, gpointer *got, mgreg_t *regs, guint8 *a g_assert (code [1] == 0x8b); offset = *(guint32*)(code + 2); -#else +#elif defined(__default_codegen__) /* A PLT entry: jmp *(%ebx) */ g_assert (code [0] == 0xff); g_assert (code [1] == 0xa3); @@ -222,6 +245,7 @@ mono_arch_nullify_class_init_trampoline (guint8 *code, mgreg_t *regs) code -= 5; if (code [0] == 0xe8) { +#if defined(__default_codegen__) if (!mono_running_on_valgrind ()) { guint32 ops; /* @@ -248,6 +272,9 @@ mono_arch_nullify_class_init_trampoline (guint8 *code, mgreg_t *regs) /* Tell valgrind to recompile the patched code */ //VALGRIND_DISCARD_TRANSLATIONS (code, 8); } +#elif defined(__native_client_codegen__) + mono_arch_patch_callsite (code, code + 5, nullified_class_init_trampoline); +#endif } else if (code [0] == 0x90 || code [0] == 0xeb) { /* Already changed by another thread */ ; @@ -499,6 +526,7 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf x86_ret (code); + nacl_global_codeman_validate (&buf, 256, &code); g_assert ((code - buf) <= 256); if (info) @@ -516,10 +544,13 @@ gpointer mono_arch_get_nullified_class_init_trampoline (MonoTrampInfo **info) { guint8 *code, *buf; + int tramp_size = NACL_SIZE (16, kNaClAlignment); - code = buf = mono_global_codeman_reserve (16); + code = buf = 
mono_global_codeman_reserve (tramp_size); x86_ret (code); + nacl_global_codeman_validate (&buf, tramp_size, &code); + mono_arch_flush_icache (buf, code - buf); if (info) @@ -546,6 +577,8 @@ mono_arch_create_specific_trampoline (gpointer arg1, MonoTrampolineType tramp_ty x86_jump_code (buf, tramp); g_assert ((buf - code) <= TRAMPOLINE_SIZE); + nacl_domain_code_validate (domain, &code, kNaClAlignment, &buf); + mono_arch_flush_icache (code, buf - code); if (code_len) @@ -581,13 +614,12 @@ mono_arch_create_rgctx_lazy_fetch_trampoline (guint32 slot, MonoTrampInfo **info index -= size - 1; } -#ifdef __native_client_codegen__ - /* TODO: align for Native Client */ - tramp_size = (aot ? 64 : 36) + 2 * kNaClAlignment + - 6 * (depth + kNaClAlignment); -#else +#if defined(__default_codegen__) tramp_size = (aot ? 64 : 36) + 6 * depth; -#endif /* __native_client_codegen__ */ +#elif defined(__native_client_codegen__) + tramp_size = (aot ? 64 : 36) + 2 * kNaClAlignment + + 6 * (depth + kNaClAlignment); +#endif code = buf = mono_global_codeman_reserve (tramp_size); @@ -645,6 +677,7 @@ mono_arch_create_rgctx_lazy_fetch_trampoline (guint32 slot, MonoTrampInfo **info x86_jump_code (code, tramp); } + nacl_global_codeman_validate (&buf, tramp_size, &code); mono_arch_flush_icache (buf, code - buf); g_assert (code - buf <= tramp_size); @@ -703,6 +736,9 @@ mono_arch_create_generic_class_init_trampoline (MonoTrampInfo **info, gboolean a #ifdef __native_client_codegen__ g_assert (code - buf <= kNaClAlignment); #endif + + nacl_global_codeman_validate (&buf, tramp_size, &code); + if (info) *info = mono_tramp_info_create (g_strdup_printf ("generic_class_init_trampoline"), buf, code - buf, ji, unwind_ops); @@ -850,6 +886,8 @@ mono_arch_create_monitor_enter_trampoline (MonoTrampInfo **info, gboolean aot) mono_arch_flush_icache (buf, code - buf); g_assert (code - buf <= tramp_size); + nacl_global_codeman_validate (&buf, tramp_size, &code); + if (info) *info = mono_tramp_info_create (g_strdup_printf ("monitor_enter_trampoline"), buf, code - buf, ji, unwind_ops); @@ -960,6 +998,8 @@ mono_arch_create_monitor_exit_trampoline (MonoTrampInfo **info, gboolean aot) x86_jump_code (code, tramp); } + nacl_global_codeman_validate (&buf, tramp_size, &code); + mono_arch_flush_icache (buf, code - buf); g_assert (code - buf <= tramp_size); @@ -1032,6 +1072,8 @@ mono_arch_create_handler_block_trampoline (void) x86_jump_code (code, handler_block_trampoline_helper); } + nacl_global_codeman_validate (&buf, tramp_size, &code); + mono_arch_flush_icache (buf, code - buf); g_assert (code - buf <= tramp_size); diff --git a/mono/utils/mono-codeman.c b/mono/utils/mono-codeman.c index 32d7319dc14..af73766a743 100644 --- a/mono/utils/mono-codeman.c +++ b/mono/utils/mono-codeman.c @@ -19,7 +19,12 @@ #ifdef HAVE_VALGRIND_MEMCHECK_H #include #endif - + +#if defined(__native_client_codegen__) && defined(__native_client__) +#include +#include +#endif + /* * AMD64 processors maintain icache coherency only for pages which are * marked executable. Also, windows DEP requires us to obtain executable memory from @@ -82,10 +87,119 @@ struct _MonoCodeManager { int read_only; CodeChunk *current; CodeChunk *full; +#if defined(__native_client_codegen__) && defined(__native_client__) + MonoGHashTable *hash; +#endif }; #define ALIGN_INT(val,alignment) (((val) + (alignment - 1)) & ~(alignment - 1)) +#if defined(__native_client_codegen__) && defined(__native_client__) +/* End of text segment, set by linker. + * Dynamic text starts on the next allocated page. 
+ */ +extern char etext[]; +char *next_dynamic_code_addr = NULL; + +/* + * This routine gets the next available bundle aligned + * pointer in the dynamic code section. It does not check + * for the section end, this error will be caught in the + * service runtime. + */ +void* +allocate_code(intptr_t increment) +{ + char *addr; + if (increment < 0) return NULL; + increment = increment & kNaClBundleMask ? (increment & ~kNaClBundleMask) + kNaClBundleSize : increment; + addr = next_dynamic_code_addr; + next_dynamic_code_addr += increment; + return addr; +} + +int +nacl_is_code_address (void *target) +{ + return (char *)target < next_dynamic_code_addr; +} + +const int kMaxPatchDepth = 32; +__thread unsigned char **patch_source_base = NULL; +__thread unsigned char **patch_dest_base = NULL; +__thread int *patch_alloc_size = NULL; +__thread int patch_current_depth = -1; +__thread int allow_target_modification = 1; + +void +nacl_allow_target_modification (int val) +{ + allow_target_modification = val; +} + +static void +nacl_jit_check_init () +{ + if (patch_source_base == NULL) { + patch_source_base = g_malloc (kMaxPatchDepth * sizeof(unsigned char *)); + patch_dest_base = g_malloc (kMaxPatchDepth * sizeof(unsigned char *)); + patch_alloc_size = g_malloc (kMaxPatchDepth * sizeof(int)); + } +} + + +/* Given a patch target, modify the target such that patching will work when + * the code is copied to the data section. + */ +void* +nacl_modify_patch_target (unsigned char *target) +{ + /* This seems like a bit of an ugly way to do this but the advantage + * is we don't have to worry about all the conditions in + * mono_resolve_patch_target, and it can be used by all the bare uses + * of _patch. + */ + unsigned char *sb; + unsigned char *db; + + if (!allow_target_modification) return target; + + nacl_jit_check_init (); + sb = patch_source_base[patch_current_depth]; + db = patch_dest_base[patch_current_depth]; + + if (target >= sb && (target < sb + patch_alloc_size[patch_current_depth])) { + /* Do nothing. target is in the section being generated. + * no need to modify, the disp will be the same either way. + */ + } else { + int target_offset = target - db; + target = sb + target_offset; + } + return target; +} + +void* +nacl_inverse_modify_patch_target (unsigned char *target) +{ + unsigned char *sb; + unsigned char *db; + int target_offset; + + if (!allow_target_modification) return target; + + nacl_jit_check_init (); + sb = patch_source_base[patch_current_depth]; + db = patch_dest_base[patch_current_depth]; + + target_offset = target - sb; + target = db + target_offset; + return target; +} + + +#endif /* __native_client_codegen && __native_client__ */ + /** * mono_code_manager_new: * @@ -107,6 +221,24 @@ mono_code_manager_new (void) cman->full = NULL; cman->dynamic = 0; cman->read_only = 0; +#if defined(__native_client_codegen__) && defined(__native_client__) + if (next_dynamic_code_addr == NULL) { + const guint kPageMask = 0xFFFF; /* 64K pages */ + next_dynamic_code_addr = (uintptr_t)(etext + kPageMask) & ~kPageMask; + /* Workaround bug in service runtime, unable to allocate */ + /* from the first page in the dynamic code section. 
*/ + /* TODO: remove */ + next_dynamic_code_addr += (uintptr_t)0x10000; + } + cman->hash = mono_g_hash_table_new (NULL, NULL); + /* Keep the hash table from being collected */ + mono_gc_register_root (&cman->hash, sizeof (void*), NULL); + if (patch_source_base == NULL) { + patch_source_base = g_malloc (kMaxPatchDepth * sizeof(unsigned char *)); + patch_dest_base = g_malloc (kMaxPatchDepth * sizeof(unsigned char *)); + patch_alloc_size = g_malloc (kMaxPatchDepth * sizeof(int)); + } +#endif return cman; } @@ -288,7 +420,10 @@ new_codechunk (int dynamic, int size) if (!ptr) return NULL; } else { - ptr = mono_valloc (NULL, chunk_size, MONO_PROT_RWX | ARCH_MAP_FLAGS); + /* Allocate MIN_ALIGN-1 more than we need so we can still */ + /* guarantee MIN_ALIGN alignment for individual allocs */ + /* from mono_code_manager_reserve_align. */ + ptr = mono_valloc (NULL, chunk_size + MIN_ALIGN - 1, MONO_PROT_RWX | ARCH_MAP_FLAGS); if (!ptr) return NULL; } @@ -333,8 +468,10 @@ new_codechunk (int dynamic, int size) void* mono_code_manager_reserve_align (MonoCodeManager *cman, int size, int alignment) { +#if !defined(__native_client__) || !defined(__native_client_codegen__) CodeChunk *chunk, *prev; void *ptr; + guint32 align_mask = alignment - 1; g_assert (!cman->read_only); @@ -357,8 +494,10 @@ mono_code_manager_reserve_align (MonoCodeManager *cman, int size, int alignment) for (chunk = cman->current; chunk; chunk = chunk->next) { if (ALIGN_INT (chunk->pos, alignment) + size <= chunk->size) { chunk->pos = ALIGN_INT (chunk->pos, alignment); - ptr = chunk->data + chunk->pos; - chunk->pos += size; + /* Align the chunk->data we add to chunk->pos */ + /* or we can't guarantee proper alignment */ + ptr = (void*)((((uintptr_t)chunk->data + align_mask) & ~align_mask) + chunk->pos); + chunk->pos = ((char*)ptr - chunk->data) + size; return ptr; } } @@ -385,9 +524,33 @@ mono_code_manager_reserve_align (MonoCodeManager *cman, int size, int alignment) chunk->next = cman->current; cman->current = chunk; chunk->pos = ALIGN_INT (chunk->pos, alignment); - ptr = chunk->data + chunk->pos; - chunk->pos += size; + /* Align the chunk->data we add to chunk->pos */ + /* or we can't guarantee proper alignment */ + ptr = (void*)((((uintptr_t)chunk->data + align_mask) & ~align_mask) + chunk->pos); + chunk->pos = ((char*)ptr - chunk->data) + size; return ptr; +#else + unsigned char *temp_ptr, *code_ptr; + /* Round up size to next bundle */ + alignment = kNaClBundleSize; + size = (size + kNaClBundleSize) & (~kNaClBundleMask); + /* Allocate a temp buffer */ + temp_ptr = memalign (alignment, size); + g_assert (((uintptr_t)temp_ptr & kNaClBundleMask) == 0); + /* Allocate code space from the service runtime */ + code_ptr = allocate_code (size); + /* Insert pointer to code space in hash, keyed by buffer ptr */ + mono_g_hash_table_insert (cman->hash, temp_ptr, code_ptr); + + nacl_jit_check_init (); + + patch_current_depth++; + patch_source_base[patch_current_depth] = temp_ptr; + patch_dest_base[patch_current_depth] = code_ptr; + patch_alloc_size[patch_current_depth] = size; + g_assert (patch_current_depth < kMaxPatchDepth); + return temp_ptr; +#endif } /** @@ -419,13 +582,45 @@ mono_code_manager_reserve (MonoCodeManager *cman, int size) void mono_code_manager_commit (MonoCodeManager *cman, void *data, int size, int newsize) { +#if !defined(__native_client__) || !defined(__native_client_codegen__) g_assert (newsize <= size); if (cman->current && (size != newsize) && (data == cman->current->data + cman->current->pos - size)) { 
cman->current->pos -= size - newsize; } +#else + unsigned char *code; + int status; + g_assert (newsize <= size); + code = mono_g_hash_table_lookup (cman->hash, data); + g_assert (code != NULL); + /* Pad space after code with HLTs */ + /* TODO: this is x86/amd64 specific */ + while (newsize & kNaClBundleMask) { + *((char *)data + newsize) = 0xf4; + newsize++; + } + status = nacl_dyncode_create (code, data, newsize); + if (status != 0) { + g_assert_not_reached (); + } + mono_g_hash_table_remove (cman->hash, data); + g_assert (data == patch_source_base[patch_current_depth]); + g_assert (code == patch_dest_base[patch_current_depth]); + patch_current_depth--; + g_assert (patch_current_depth >= -1); + free (data); +#endif } +#if defined(__native_client_codegen__) && defined(__native_client__) +void * +nacl_code_manager_get_code_dest (MonoCodeManager *cman, void *data) +{ + return mono_g_hash_table_lookup (cman->hash, data); +} +#endif + /** * mono_code_manager_size: * @cman: a code manager diff --git a/mono/utils/mono-codeman.h b/mono/utils/mono-codeman.h index 39cc1e28034..1507348b0b5 100644 --- a/mono/utils/mono-codeman.h +++ b/mono/utils/mono-codeman.h @@ -19,5 +19,21 @@ int mono_code_manager_size (MonoCodeManager *cman, int *used_siz typedef int (*MonoCodeManagerFunc) (void *data, int csize, int size, void *user_data); void mono_code_manager_foreach (MonoCodeManager *cman, MonoCodeManagerFunc func, void *user_data); +#if defined( __native_client_codegen__ ) && defined( __native_client__ ) + +#define kNaClBundleSize 32 +#define kNaClBundleMask (kNaClBundleSize-1) + +extern __thread unsigned char **patch_source_base; +extern __thread unsigned char **patch_dest_base; +extern __thread int patch_current_depth; + +int nacl_is_code_address (void *target); +void* nacl_code_manager_get_code_dest (MonoCodeManager *cman, void *data); +void nacl_allow_target_modification (int val); +void* nacl_modify_patch_target (unsigned char *target); +void* nacl_inverse_modify_patch_target (unsigned char *target); +#endif /* __native_client__ */ + #endif /* __MONO_CODEMAN_H__ */ diff --git a/mono/utils/mono-path.c b/mono/utils/mono-path.c index 32ad8899f52..ca71d98996f 100644 --- a/mono/utils/mono-path.c +++ b/mono/utils/mono-path.c @@ -30,6 +30,9 @@ /* Resolves '..' and '.' references in a path. If the path provided is relative, * it will be relative to the current directory */ + +/* For Native Client, the above is not true. Since there is no getcwd we fill */ +/* in the file being passed in relative to '.' and don't resolve it */ gchar * mono_path_canonicalize (const char *path) { @@ -39,9 +42,14 @@ mono_path_canonicalize (const char *path) if (g_path_is_absolute (path)) { abspath = g_strdup (path); } else { +#ifdef __native_client__ + gchar *tmpdir = "."; + abspath = g_build_filename (tmpdir, path, NULL); +#else gchar *tmpdir = g_get_current_dir (); abspath = g_build_filename (tmpdir, path, NULL); g_free (tmpdir); +#endif } #ifdef HOST_WIN32 diff --git a/nacl/README b/nacl/README new file mode 100644 index 00000000000..191fec693e1 --- /dev/null +++ b/nacl/README @@ -0,0 +1,92 @@ +Quick guide +=========== + +Prerequisites (see end of file for gclient & svn paths) +-------------- +1. Naclports from SVN + - needed for nacl toolchain (nacl-gcc, etc.) + - needed for packages (zlib for nacl, etc.) +2. Native Client repo from SVN + - currently needed for sel_ldr +3. Mono with NaCl support (you have it if you're reading this file) + +4. Directory conventions used in this document
(your directories may differ...) 
+ ~/naclports Naclports repo from SVN + ~/nacl Native Client repo from SVN + ~/mono Mono for NaCl + +5. Setting your environment: + export NACL_SDK_PATH=/home/username/naclports + export NACL_PATH=/home/username/nacl +5a. Make sure you have a dbg sel_ldr available + /home/username/nacl/ + native_client/scons-out/dbg-${OS_SUBDIR}-x86-${BITSIZE}/staging +5b. If it is not available, scons build it (substitute + linux with mac or win as needed) + cd /home/username/nacl/native_client + ./scons MODE=dbg-linux,nacl [platform=x86-64] + +6. Build naclports libraries + cd /home/username/naclports/src/packages + ./nacl-install-all.sh + +7. Build NaCl Mono Runtime ('libmono.a' for NaCl, 5 minutes): + cd /home/username/mono/trunk/nacl + ./nacl-runtime-mono.sh [TARGET_BITSIZE=32/64 for cross-compiling runtime] + +8. (optional for AOT) Build NaCl Mono Compiler: ('nacl[64]-mono' AOT cross compiler for NaCl, 5 minutes): + cd /home/username/mono/trunk/nacl + ./nacl-mono.sh (32-bit cross-compiler) + ./nacl64-mono.sh (64-bit cross-compiler) + +Native Client Mono Install locations: + /home/username/mono/trunk/nacl/runtime + /home/username/mono/trunk/nacl/compiler + +Normal (not-Native-Client) Mono Install location: + /home/username/mono/trunk/nacl/normal-mono + +Simple Test (requires sel_ldr to run) + cd /home/username/mono/trunk/nacl/test + ./nacl [normal,aot,regression] (defaults to nacl,jit,simple test) + + +SVN Repos +========= + +1. Getting Naclports repo + cd ~ + mkdir naclports + cd naclports + gclient config https://naclports.googlecode.com/svn/trunk/src + gclient sync + +2. Getting Native Client repo + + cd ~ + mkdir nacl + cd nacl + vim .gclient +--------add text below------- +solutions = [ + { "name" : "native_client", + "url" : "svn://svn.chromium.org/native_client/trunk/src/native_client", + }, + { "name" : "supplement.DEPS", + "url" : "svn://svn.chromium.org/native_client/trunk/deps/supplement.DEPS", + }, +] +---------end text------------ + gclient sync + gclient runhooks --force + + +3. Getting Mono repo + + cd ~ + mkdir mono + cd mono + // see http://mono-project.com/Compiling_Mono_From_Git + + diff --git a/nacl/common.sh b/nacl/common.sh new file mode 100644 index 00000000000..bbf8b29d1c4 --- /dev/null +++ b/nacl/common.sh @@ -0,0 +1,204 @@ +# Copyright (c) 2009 The Native Client Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that be +# found in the LICENSE file. +# + +set -o nounset +set -o errexit + +# scripts that source this file must be run from within packages tree +readonly SAVE_PWD=$(pwd) + +# Pick platform directory for compiler. 
+readonly OS_NAME=$(uname -s) +if [ $OS_NAME = "Darwin" ]; then + readonly OS_SUBDIR="mac" + readonly OS_SUBDIR_SHORT="mac" +elif [ $OS_NAME = "Linux" ]; then + readonly OS_SUBDIR="linux" + readonly OS_SUBDIR_SHORT="linux" +else + readonly OS_SUBDIR="windows" + readonly OS_SUBDIR_SHORT="win" +fi + +readonly MACHINE=$(uname -m) +if [ $MACHINE = "x86_64" ]; then + readonly TARGET_BITSIZE=${TARGET_BITSIZE:-"64"} + readonly HOST_BITSIZE=${HOST_BITSIZE:-"64"} +else + # uname -m reports i686 on Linux and i386 on Mac + readonly TARGET_BITSIZE=${TARGET_BITSIZE:-"32"} + readonly HOST_BITSIZE=${HOST_BITSIZE:-"32"} +fi + +NACL_SDK_BASE=/usr/local/google/elijahtaylor/nacl_svn/native_client/tools/sdk/nacl-sdk +#NACL_SDK_BASE=/usr/local/google/elijahtaylor/nacl_svn/native_client/toolchain/linux_x86 + +if [ $TARGET_BITSIZE == "64" ]; then + readonly TARGET_BIT_PREFIX="64" +else + readonly TARGET_BIT_PREFIX="" +fi +# we might want to override the detected host platform (e.g. on OSX 10.6) +if [ $HOST_BITSIZE == "64" ]; then + readonly HOST_BIT_PREFIX="64" +else + readonly HOST_BIT_PREFIX="" +fi + +# locate default nacl_sdk toolchain +# TODO: no arm support +readonly NACL_SDK=${NACL_SDK_PATH:-/usr/local/google/elijahtaylor/naclports_svn} +readonly NACL_DEV=${NACL_PATH:-/usr/local/google/elijahtaylor/nacl_svn} +readonly NACL_NATIVE_CLIENT=${NACL_SDK}/src +readonly NACL_SDK_BASE=${NACL_SDK_BASE:-\ +${NACL_NATIVE_CLIENT}/toolchain/${OS_SUBDIR_SHORT}_x86} + +readonly NACL_BIN_PATH=${NACL_SDK_BASE}/bin +readonly NACLCC=${NACL_SDK_BASE}/bin/nacl${TARGET_BIT_PREFIX}-gcc +readonly NACLCXX=${NACL_SDK_BASE}/bin/nacl${TARGET_BIT_PREFIX}-g++ +readonly NACLAR=${NACL_SDK_BASE}/bin/nacl${TARGET_BIT_PREFIX}-ar +readonly NACLRANLIB=${NACL_SDK_BASE}/bin/nacl${TARGET_BIT_PREFIX}-ranlib +readonly NACLLD=${NACL_SDK_BASE}/bin/nacl${TARGET_BIT_PREFIX}-ld +readonly NACLAS=${NACL_SDK_BASE}/bin/nacl${TARGET_BIT_PREFIX}-as + +# NACL_SDK_GCC_SPECS_PATH is where nacl-gcc 'specs' file will be installed +readonly NACL_SDK_GCC_SPECS_PATH=${NACL_SDK_BASE}/lib/gcc/nacl64/4.4.3 + +# NACL_SDK_USR is where the headers, libraries, etc. will be installed +readonly NACL_SDK_USR=${NACL_SDK_BASE}/nacl/usr +readonly NACL_SDK_USR_INCLUDE=${NACL_SDK_USR}/include +readonly NACL_SDK_USR_LIB=${NACL_SDK_USR}/lib + + +###################################################################### +# Helper functions +###################################################################### + +Banner() { + echo "######################################################################" + echo $* + echo "######################################################################" +} + + +VerifyPath() { + # make sure path isn't all slashes (possibly from an unset variable) + local PATH=$1 + local TRIM=${PATH##/} + if [ ${#TRIM} -ne 0 ]; then + return 0 + else + return 1 + fi +} + + +ChangeDir() { + local NAME=$1 + if VerifyPath ${NAME}; then + cd ${NAME} + else + echo "ChangeDir called with bad path." + exit -1 + fi +} + + +Remove() { + local NAME=$1 + if VerifyPath ${NAME}; then + rm -rf ${NAME} + else + echo "Remove called with bad path." + exit -1 + fi +} + + +MakeDir() { + local NAME=$1 + if VerifyPath ${NAME}; then + mkdir -p ${NAME} + else + echo "MakeDir called with bad path." 
+ exit -1 + fi +} + + +PatchSpecFile() { + # fix up spaces so gcc sees entire path + local SED_SAFE_SPACES_USR_INCLUDE=${NACL_SDK_USR_INCLUDE/ /\ /} + local SED_SAFE_SPACES_USR_LIB=${NACL_SDK_USR_LIB/ /\ /} + # have nacl-gcc dump specs file & add include & lib search paths + ${NACL_SDK_BASE}/bin/nacl-gcc -dumpspecs |\ + sed "/*cpp:/{ + N + s|$| -I${SED_SAFE_SPACES_USR_INCLUDE}| + }" |\ + sed "/*link_libgcc:/{ + N + s|$| -L${SED_SAFE_SPACES_USR_LIB}| + }" >${NACL_SDK_GCC_SPECS_PATH}/specs +} + + +DefaultConfigureStep() { + Banner "Configuring ${PACKAGE_NAME}" + # export the nacl tools + export CC=${NACLCC} + export CXX=${NACLCXX} + export AR=${NACLAR} + export RANLIB=${NACLRANLIB} + export PKG_CONFIG_PATH=${NACL_SDK_USR_LIB}/pkgconfig + export PKG_CONFIG_LIBDIR=${NACL_SDK_USR_LIB} + export PATH=${NACL_BIN_PATH}:${PATH}; + ChangeDir ${NACL_PACKAGES_REPOSITORY}/${PACKAGE_NAME} + Remove ${PACKAGE_NAME}-build + MakeDir ${PACKAGE_NAME}-build + cd ${PACKAGE_NAME}-build + ../configure \ + --host=nacl \ + --disable-shared \ + --prefix=${NACL_SDK_USR} \ + --exec-prefix=${NACL_SDK_USR} \ + --libdir=${NACL_SDK_USR_LIB} \ + --oldincludedir=${NACL_SDK_USR_INCLUDE} \ + --with-http=off \ + --with-html=off \ + --with-ftp=off \ + --with-x=no +} + + +DefaultBuildStep() { + # assumes pwd has makefile + make clean + make -j4 +} + + +DefaultInstallStep() { + # assumes pwd has makefile + make install +} + + +DefaultCleanUpStep() { + PatchSpecFile + ChangeDir ${SAVE_PWD} +} + + +DefaultPackageInstall() { + DefaultPreInstallStep + DefaultDownloadStep + DefaultExtractStep + DefaultPatchStep + DefaultConfigureStep + DefaultBuildStep + DefaultInstallStep + DefaultCleanUpStep +} diff --git a/nacl/config-nacl-runtime.cache b/nacl/config-nacl-runtime.cache new file mode 100644 index 00000000000..3772ac81017 --- /dev/null +++ b/nacl/config-nacl-runtime.cache @@ -0,0 +1,18 @@ +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. + +ac_cv_func_mmap=${ac_cv_func_mmap=no} +ac_cv_var_timezone=${ac_cv_var_timezone=yes} +ac_cv_host=${ac_cv_host=i686-pc-nacl} +ac_cv_target=${ac_cv_target=i686-pc-nacl} + diff --git a/nacl/config-nacl-runtime64.cache b/nacl/config-nacl-runtime64.cache new file mode 100644 index 00000000000..ce3bc3590bc --- /dev/null +++ b/nacl/config-nacl-runtime64.cache @@ -0,0 +1,18 @@ +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. 
+ +ac_cv_func_mmap=${ac_cv_func_mmap=no} +ac_cv_var_timezone=${ac_cv_var_timezone=yes} +ac_cv_host=${ac_cv_host=x86_64-pc-nacl} +ac_cv_target=${ac_cv_target=x86_64-pc-nacl} + diff --git a/nacl/nacl-common.sh b/nacl/nacl-common.sh new file mode 100644 index 00000000000..b63b7eb8b71 --- /dev/null +++ b/nacl/nacl-common.sh @@ -0,0 +1,19 @@ + +CopyNormalMonoLibs() { + NORMAL_MSCORLIB_DLL=$MONO_TRUNK_NACL/normal-mono/lib/mono/2.0/mscorlib.dll + if [ ! -f ${NORMAL_MSCORLIB_DLL} ] + then + Banner "Normal mscorlib.dll not found, building normal mono" + cd ${MONO_TRUNK_NACL} + ./normal-mono.sh + fi + if [ ! -f ${NORMAL_MSCORLIB_DLL} ] + then + Banner "Normal mscorlib.dll not found after normal mono build, exiting..." + exit -1 + fi + Banner "Copying normal-mono libs to install dir" + mkdir -p ${INSTALL_PATH}/lib/mono + cp -R ${MONO_TRUNK_NACL}/normal-mono/lib/mono/* ${INSTALL_PATH}/lib/mono/ +} + diff --git a/nacl/nacl-mono-config-cache b/nacl/nacl-mono-config-cache new file mode 100644 index 00000000000..830854bb661 --- /dev/null +++ b/nacl/nacl-mono-config-cache @@ -0,0 +1,16 @@ +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. + +mono_cv_uscore=${mono_cv_uscore=no} +ac_cv_target=${ac_cv_target=i686-pc-nacl} + diff --git a/nacl/nacl-mono.sh b/nacl/nacl-mono.sh new file mode 100755 index 00000000000..a409db67b49 --- /dev/null +++ b/nacl/nacl-mono.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# nacl-mono.sh +# +# usage: nacl-mono.sh +# +# this script builds a compiler for 32-bit NaCl code +# (installed in ./compiler folder) +# + +readonly MONO_TRUNK_NACL=$(pwd) + +readonly PACKAGE_NAME=nacl-mono-build + +readonly INSTALL_PATH=${MONO_TRUNK_NACL}/compiler + +source common.sh +source nacl-common.sh + + +CustomConfigureStep() { + Banner "Configuring ${PACKAGE_NAME}" + set +e + cd ${PACKAGE_NAME} + make distclean + cd ${MONO_TRUNK_NACL} + set -e + Remove ${PACKAGE_NAME} + MakeDir ${PACKAGE_NAME} + cd ${PACKAGE_NAME} + cp ../nacl-mono-config-cache ../nacl-mono-config-cache.temp + if [ $HOST_BITSIZE = "64" ]; then + ../../configure \ + CC='cc -m32' CXX='g++ -m32' \ + --host=i386-pc-linux \ + --build=amd64-pc-linux \ + --target=nacl \ + --prefix=${INSTALL_PATH} \ + --with-tls=pthread \ + --enable-nacl-codegen \ + --disable-mono-debugger \ + --disable-mcs-build \ + --with-sigaltstack=no \ + --cache-file=../nacl-mono-config-cache.temp + else + ../../configure \ + --target=nacl \ + --prefix=${INSTALL_PATH} \ + --with-tls=pthread \ + --enable-nacl-codegen \ + --disable-mono-debugger \ + --disable-mcs-build \ + --with-sigaltstack=no \ + --cache-file=../nacl-mono-config-cache.temp + fi + + + rm ../nacl-mono-config-cache.temp +} + +CustomBuildStep() { + MONO_NACL_ALIGN_MASK_OFF=1 make -j4 +} + +CustomInstallStep() { + MONO_NACL_ALIGN_MASK_OFF=1 make install +} + +CustomPackageInstall() { + CustomConfigureStep + #CustomBuildStep + #CustomInstallStep + DefaultBuildStep + DefaultInstallStep +} + + +CustomPackageInstall +exit 0 diff --git a/nacl/nacl-runtime-mono.sh 
b/nacl/nacl-runtime-mono.sh new file mode 100755 index 00000000000..f00a9328465 --- /dev/null +++ b/nacl/nacl-runtime-mono.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# Copyright (c) 2009 The Native Client Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that be +# found in the LICENSE file. +# + +# nacl-runtime-mono.sh +# +# usage: nacl-runtime-mono.sh +# +# this script builds mono runtime for Native Client +# + +readonly MONO_TRUNK_NACL=$(pwd) + +source common.sh +source nacl-common.sh + +readonly PACKAGE_NAME=runtime${TARGET_BIT_PREFIX}-build +readonly INSTALL_PATH=${MONO_TRUNK_NACL}/runtime${TARGET_BIT_PREFIX} + + +CustomConfigureStep() { + Banner "Configuring ${PACKAGE_NAME}" + # export the nacl tools + set +e + if [ -f ${PACKAGE_NAME}/Makefile ] + then + cd ${PACKAGE_NAME} + fi + make distclean + cd ${MONO_TRUNK_NACL} + set -e + cp config-nacl-runtime${TARGET_BIT_PREFIX}.cache config-nacl-runtime${TARGET_BIT_PREFIX}.cache.temp + Remove ${PACKAGE_NAME} + MakeDir ${PACKAGE_NAME} + cd ${PACKAGE_NAME} + # TODO: remove this once libintl.h becomes available to nacl + CC=${NACLCC} CXX=${NACLCXX} AR=${NACLAR} RANLIB=${NACLRANLIB} PKG_CONFIG_PATH=${NACL_SDK_USR_LIB}/pkgconfig \ + PKG_CONFIG_LIBDIR=${NACL_SDK_USR_LIB} PATH=${NACL_BIN_PATH}:${PATH} LIBS="-lnosys -lg" \ + CFLAGS="-g -D_POSIX_PATH_MAX=256 -DPATH_MAX=256" ../../configure \ + --host=nacl${TARGET_BIT_PREFIX} \ + --exec-prefix=${INSTALL_PATH} \ + --libdir=${INSTALL_PATH}/lib \ + --prefix=${INSTALL_PATH} \ + --oldincludedir=${MONO_TRUNK_NACL}/runtime/include \ + --disable-shared \ + --disable-mcs-build \ + --with-glib=embedded \ + --with-tls=pthread \ + --enable-threads=posix \ + --without-sigaltstack \ + --without-mmap \ + --with-gc=included \ + --enable-nacl-gc \ + --enable-nacl-codegen \ + --cache-file=../config-nacl-runtime${TARGET_BIT_PREFIX}.cache.temp + echo "// --- Native Client runtime below" >> config.h + echo "#define pthread_cleanup_push(x, y)" >> config.h + echo "#define pthread_cleanup_pop(x)" >> config.h + echo "#undef HAVE_EPOLL" >> config.h + echo "#undef HAVE_WORKING_SIGALTSTACK" >> config.h + echo "extern long int timezone;" >> config.h + echo "extern int daylight;" >> config.h + echo "#define sem_trywait(x) sem_wait(x)" >> config.h + echo "#define sem_timedwait(x,y) sem_wait(x)" >> config.h + echo "#define getdtablesize() (32768)" >> config.h + echo "// --- Native Client runtime below" >> eglib/src/eglib-config.h + echo "#undef G_BREAKPOINT" >> eglib/src/eglib-config.h + echo "#define G_BREAKPOINT() G_STMT_START { __asm__ (\"hlt\"); } G_STMT_END" >> eglib/src/eglib-config.h + rm ../config-nacl-runtime${TARGET_BIT_PREFIX}.cache.temp +} + +CustomInstallStep() { + make install + CopyNormalMonoLibs +} + +CustomPackageInstall() { + CustomConfigureStep + DefaultBuildStep + CustomInstallStep +} + + +CustomPackageInstall +exit 0 diff --git a/nacl/nacl64-mono-config-cache b/nacl/nacl64-mono-config-cache new file mode 100644 index 00000000000..6e2d0423801 --- /dev/null +++ b/nacl/nacl64-mono-config-cache @@ -0,0 +1,16 @@ +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. 
+# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. + +mono_cv_uscore=${mono_cv_uscore=no} +ac_cv_target=${ac_cv_target=x86_64-pc-nacl} + diff --git a/nacl/nacl64-mono.sh b/nacl/nacl64-mono.sh new file mode 100755 index 00000000000..964bc43f002 --- /dev/null +++ b/nacl/nacl64-mono.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# nacl64-mono.sh +# +# usage: nacl64-mono.sh +# +# this script builds a compiler for 64-bit NaCl code +# (installed in ./compiler folder) +# + +readonly MONO_TRUNK_NACL=$(pwd) + +readonly PACKAGE_NAME=nacl64-mono-build + +readonly INSTALL_PATH=${MONO_TRUNK_NACL}/compiler + +source common.sh +source nacl-common.sh + + +CustomConfigureStep() { + Banner "Configuring ${PACKAGE_NAME}" + set +e + cd ${PACKAGE_NAME} + make distclean + cd ${MONO_TRUNK_NACL} + set -e + Remove ${PACKAGE_NAME} + MakeDir ${PACKAGE_NAME} + cd ${PACKAGE_NAME} + cp ../nacl64-mono-config-cache ../nacl64-mono-config-cache.temp + if [ $HOST_BITSIZE = "64" ]; then + ../../configure \ + CFLAGS="-O0" CXXFLAGS="-O0" CC='cc -m32' CXX='g++ -m32' \ + --host=i386-pc-linux \ + --build=amd64-pc-linux \ + --target=nacl64 \ + --prefix=${INSTALL_PATH} \ + --with-tls=pthread \ + --enable-nacl-codegen \ + --disable-mono-debugger \ + --disable-mcs-build \ + --with-sigaltstack=no \ + --cache-file=../nacl64-mono-config-cache.temp + else + ../../configure \ + --target=nacl64 \ + --prefix=${INSTALL_PATH} \ + --with-tls=pthread \ + --enable-nacl-codegen \ + --disable-mono-debugger \ + --disable-mcs-build \ + --with-sigaltstack=no \ + --cache-file=../nacl64-mono-config-cache.temp + fi + + + rm ../nacl64-mono-config-cache.temp +} + +CustomBuildStep() { + MONO_NACL_ALIGN_MASK_OFF=1 make -j4 +} + +CustomInstallStep() { + MONO_NACL_ALIGN_MASK_OFF=1 make install +} + +CustomPackageInstall() { + CustomConfigureStep + #CustomBuildStep + #CustomInstallStep + DefaultBuildStep + DefaultInstallStep +} + + +CustomPackageInstall +exit 0 diff --git a/nacl/normal-mono.sh b/nacl/normal-mono.sh new file mode 100755 index 00000000000..88ebe68d613 --- /dev/null +++ b/nacl/normal-mono.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright (c) 2009 The Native Client Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that be +# found in the LICENSE file. 
+# + +# normal-mono.sh +# +# usage: normal-mono.sh +# +# this script builds normal x86 mono +# (installed in ./normal folder) +# + +readonly MONO_TRUNK_NACL=$(pwd) + +readonly PACKAGE_NAME=mono-normal-build + +source common.sh + + +CustomConfigureStep() { + Banner "Configuring ${PACKAGE_NAME}" + set +e + if [ -f ${PACKAGE_NAME}/Makefile ] + then + cd ${PACKAGE_NAME} + make distclean + fi + cd ${MONO_TRUNK_NACL} + set -e + Remove ${PACKAGE_NAME} + MakeDir ${PACKAGE_NAME} + cd ${PACKAGE_NAME} + ../../configure \ + --prefix=${MONO_TRUNK_NACL}/normal-mono \ + --disable-parallel-mark \ + --with-tls=pthread +} + +CustomPackageInstall() { + CustomConfigureStep + DefaultBuildStep + DefaultInstallStep +} + + +CustomPackageInstall +exit 0 diff --git a/nacl/test/hw.cs b/nacl/test/hw.cs new file mode 100644 index 00000000000..f82c33e5470 --- /dev/null +++ b/nacl/test/hw.cs @@ -0,0 +1,60 @@ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Collections; +using System.Text; +using System.Threading; + +namespace Test { + + public class c_code { + + [MethodImplAttribute (MethodImplOptions.InternalCall)] + public extern static void my_c_func(int x, string s, double d); + [MethodImplAttribute (MethodImplOptions.InternalCall)] + public extern static void my_c_pass(int x); + } + + public class HelloWorld + { + static public void Main () + { + } + + static public void Foobar (int x, string s) + { + // first line is a simple test + // 1. call back into c code 2. use mscorlib Math.Sqrt() + c_code.my_c_func(x, s, Math.Sqrt(3.1415 * 3.1415)); + + // second part of this test: + // attempt a try/catch, generate exception w/ throw + try { + c_code.my_c_pass(0); + // attempt an invalid cast + throw new InvalidCastException(); + c_code.my_c_pass(1); + } + catch (InvalidCastException e) { + c_code.my_c_pass(2); + } + c_code.my_c_pass(3); + + // third part of this test: + // attempt an invalid cast again, this time generating + // exception instead of using explicit throw. 
+ try { + c_code.my_c_pass(0); + StringBuilder reference1 = new StringBuilder(); + object reference2 = reference1; + // attempt invalid cast + int reference3 = (int)reference2; + c_code.my_c_pass(4); + } + catch (InvalidCastException e) { + c_code.my_c_pass(5); + } + c_code.my_c_pass(3); + } + } +} diff --git a/nacl/test/my.c b/nacl/test/my.c new file mode 100644 index 00000000000..218cec4ddf4 --- /dev/null +++ b/nacl/test/my.c @@ -0,0 +1,139 @@ +#include +#include +#include + +#include +#include +#include + +extern void* mono_aot_module_mscorlib_info; + +extern void* mono_aot_module_hw_info; + +extern void mono_set_corlib_data(void *data, size_t size); +extern void mono_aot_register_module(void *aot_info); +extern void mono_aot_init(void); +extern void mono_jit_set_aot_only(int aot_only); +extern MonoDomain * mini_init (const char *filename, const char *runtime_version); + +#if !defined(TRUE) +#define TRUE 1 +#endif +#if !defined(FALSE) +#define FALSE 0 +#endif + +void my_c_func(int arg, const char *str, double d) { + /* str from c# is immutable */ + printf("*** my_c_func(%d, '%s', %1.4f) received\n", arg, str, (float)d); +} + + +void my_c_pass(int x) { + char *msg = "undefined"; + switch(x) { + case 0: msg = "about to throw an exception..."; break; + case 1: msg = "thrown invalid cast exception was not caught!"; break; + case 2: msg = "thrown invalid cast exception was safely caught!"; break; + case 3: msg = "...leaving exeception test."; break; + case 4: msg = "generated invalid cast exception was not caught!"; break; + case 5: msg = "generated invalid cast exception was safely caught!"; break; + } + printf("*** my_c_pass(%d): %s\n", x, msg); +} + + + +void try_mono() { + MonoDomain *domain; + MonoAssembly *ma; + MonoImage *mi; + MonoClass *mc; + MonoMethodDesc *mmd; + MonoMethod *mm; + MonoObject *mo; + FILE *mscorlib; + char *corlib_data = NULL; + void *args [2]; + static int x = 123000; + args [0] = &x; + args [1] = "hello world"; + +#if defined(__native_client__) + mscorlib = fopen("mscorlib.dll", "r"); + if (NULL != mscorlib) { + size_t size; + struct stat st; + if (0 == stat("mscorlib.dll", &st)) { + size = st.st_size; + printf("reading mscorlib.dll, size %ld\n", size); + corlib_data = malloc(size); + if (corlib_data != NULL) { + while (fread(corlib_data, 1, size, mscorlib) != 0) ; + if (!ferror(mscorlib)) { + mono_set_corlib_data(corlib_data, size); + } else { + perror("error reading mscorlib.dll"); + free(corlib_data); + corlib_data = NULL; + } + } else { + perror("Could not allocate memory"); + } + } else { + perror("stat error"); + } + fclose(mscorlib); + } +#endif + +#ifdef AOT_VERSION + printf("address of mono_aot_module_mscorlib_info: %p\n", mono_aot_module_mscorlib_info); + printf("address of mono_aot_module_hw_info: %p\n", mono_aot_module_hw_info); + + // mono_jit_set_aot_only(TRUE) should be enabled now. + // if not enabled, I suspect we're still jitting... 
+ mono_jit_set_aot_only(TRUE); + + mono_aot_register_module(mono_aot_module_mscorlib_info); + mono_aot_register_module(mono_aot_module_hw_info); +#endif + + domain = mini_init("hw.exe", "v2.0.50727"); + printf("mono domain: %p\n", domain); + + ma = mono_domain_assembly_open(domain, "hw.exe"); + printf("mono assembly: %p\n", ma); + + mi = mono_assembly_get_image(ma); + printf("mono image: %p\n", mi); + + mc = mono_class_from_name(mi, "Test", "HelloWorld"); + printf("mono class: %p\n", mc); + + mmd = mono_method_desc_new("Test.HelloWorld:Foobar(int,string)", TRUE); + printf("mono desc method: %p\n", mmd); + + mm = mono_method_desc_search_in_image(mmd, mi); + printf("mono method: %p\n", mm); + + // add c functions for mono test code to invoke + mono_add_internal_call("Test.c_code::my_c_func", (void *) my_c_func); + mono_add_internal_call("Test.c_code::my_c_pass", (void *) my_c_pass); + + mo = mono_runtime_invoke(mm, NULL, args, NULL); + printf("mono object: %p\n", mo); + if (NULL != corlib_data) free(corlib_data); +} + + +int main() { + int i; + printf("address of main(): %p\n", main); + printf("address of stack : %p\n", &i); + printf("\nProgram a.out output:\n"); + printf("==========================\n"); + try_mono(); + printf("==========================\n\n"); + return 0; +} diff --git a/nacl/test/nacl b/nacl/test/nacl new file mode 100755 index 00000000000..626f3a24b62 --- /dev/null +++ b/nacl/test/nacl @@ -0,0 +1,152 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -o verbose + +source ../common.sh + +MONO_RUNTIME_BASE=../runtime${TARGET_BIT_PREFIX} +MONO_BASE=../compiler +MONO="${MONO_BASE}/bin/nacl${TARGET_BIT_PREFIX}-mono" +CC=$NACLCC +AS=$NACLAS +MODE=nacl +COMPILE_AOT=0 +AOT_OBJS= +CC_DEFINES= +MONO_SNAPSHOT=mono-normal-build +RELOCATE_RODATA="-Wl,--section-start=.rodata=0x2000000" +export NACL_ALLOW_DYNCODE_REPLACEMENT=1 + +if [ $# -gt 0 ]; then + while [ -n "$*" ] + do + if [ $1 == "normal" ]; then + MONO_RUNTIME_BASE=../normal-mono + MONO_BASE=../normal-mono + MONO=${MONO_BASE}/bin/mono + CC=gcc + AS=as + MODE=normal + elif [ $1 == "aot" ]; then + COMPILE_AOT=1 + CC_DEFINES=-DAOT_VERSION + RELOCATE_RODATA= + elif [ $1 == "regression" ]; then + MODE=regression + else + echo "Unrecognized option '$1'" + exit -1 + fi + shift + done +fi + +readonly MONO_NORMAL_BASE=../normal-mono +readonly NCVAL=ncval + +# add nacl-gcc to path (from NaCl SDK) +export PATH=${NACL_BIN_PATH}:$PATH + +# add sel_ldr to path (from NaCl dev tree) +export PATH=${NACL_DEV}/native_client/scons-out/dbg-${OS_SUBDIR}-x86-${TARGET_BITSIZE}/staging:$PATH + +# add nacl-mono to path +export PATH=../normal-mono/bin:$PATH + +# echo version of nacl-gcc +$CC --version + +echo $PATH +which sel_ldr + +# echo version of gmcs +which gmcs +../normal-mono/bin/gmcs --version + +# echo version of nacl-mono +${MONO} --version + +# add MONO_PATH so mono can crank on local directory +export MONO_PATH=$(pwd) +echo ${MONO_PATH} + +#----- +# enable the appropriate set of AOT options below. + +readonly AOT_OPTIONS=full,static,nodebug,ntrampolines=4096 +#----- + +# make a temp copy of mscorlib.dll in this directory +cp ${MONO_NORMAL_BASE}/lib/mono/2.0/mscorlib.dll . + +# compile c-sharp file with gmcs +MONO_PATH=. ../normal-mono/bin/gmcs -lib:. -warn:0 hw.cs + +# convert .exe to .exe.o assembly files +# convert mscorlib to a .dll.o file +if [ $COMPILE_AOT = "1" ]; then + MONO_PATH=. ${MONO} --aot=${AOT_OPTIONS} mscorlib.dll + MONO_PATH=. 
${MONO} --aot=${AOT_OPTIONS} hw.exe + AOT_OBJS="hw.exe.o mscorlib.dll.o" +fi + +# compile c and assembly into a.out, all statically linked +# different options for normal and nacl-mono +if [ $MODE = "normal" ]; then + $CC -g -static my.c ${CC_DEFINES} ${AOT_OBJS} -o hw-test -lmono-2.0 -lpthread -lm -ldl -lrt -I${MONO_RUNTIME_BASE}/include -I${MONO_RUNTIME_BASE}/include/mono-2.0 -L${MONO_RUNTIME_BASE}/lib +elif [ $MODE = "nacl" ]; then + $CC -static my.c ${CC_DEFINES} ${AOT_OBJS} -o hw-test.nexe -lmono-2.0 -lpthread -lm -lnosys -I${MONO_RUNTIME_BASE}/include -I${MONO_RUNTIME_BASE}/include/mono-2.0 -L${MONO_RUNTIME_BASE}/lib ${RELOCATE_RODATA} +fi + +readonly fsatests="basic.exe basic-float.exe basic-long.exe basic-calls.exe objects.exe arrays.exe basic-math.exe exceptions.exe devirtualization.exe basic-simd.exe gc-stress.exe imt_big_iface_test.exe generics.exe iltests.exe nacl.exe" +if [ $MODE = "regression" ]; then + #rm -rf fsa-tmp + mkdir -p fsa-tmp + DIR=$(pwd) + cd ../${MONO_SNAPSHOT}/mono/mini/ + make $fsatests generics-variant-types.dll TestDriver.dll + cp $fsatests generics-variant-types.dll TestDriver.dll $DIR/fsa-tmp + cd - + + CLASS=${MONO_NORMAL_BASE}/lib/mono/2.0 + cp $CLASS/System.Core.dll $CLASS/System.dll $CLASS/Mono.Posix.dll $CLASS/System.Configuration.dll $CLASS/System.Security.dll $CLASS/System.Xml.dll $CLASS/Mono.Security.dll $CLASS/Mono.Simd.dll fsa-tmp + cp mscorlib.dll fsa-tmp + + AOT_OBJS="" + if [ $COMPILE_AOT = "1" ]; then + for t in $fsatests; do + MONO_PATH=fsa-tmp ${MONO} --aot=${AOT_OPTIONS} fsa-tmp/$t + AOT_OBJS="${AOT_OBJS} fsa-tmp/$t.o" + done + for d in fsa-tmp/*.dll; do + MONO_PATH=fsa-tmp ${MONO} --aot=${AOT_OPTIONS} $d + AOT_OBJS="${AOT_OBJS} $d.o" + done + fi + + $CC -o fsa-tmp/fsacheck.nexe -g -static ../../mono/mini/fsacheck.c ${CC_DEFINES} ${AOT_OBJS} -lmono-2.0 -lpthread -lm -lnosys -L${MONO_RUNTIME_BASE}/lib -I${MONO_RUNTIME_BASE}/include/mono-2.0 -I${MONO_RUNTIME_BASE}/include ${RELOCATE_RODATA} +fi + +if [ $MODE = "regression" ]; then + cd fsa-tmp + ${NCVAL} -readwrite_sfi fsacheck.nexe 2> validator_out || echo "fsacheck.nexe invalid: continuing anyway" + nacl-objdump -d fsacheck.nexe > fsacheck.disasm + for t in $fsatests; do + sel_ldr -a -c fsacheck.nexe $t || true + done +else + export MONO_PATH=$(pwd) + # run generated test(select one or more below) + if [ $MODE = "normal" ]; then + ./hw-test + else + ${NCVAL} -readwrite_sfi hw-test.nexe 2> validator_out || echo "hw-test.nexe invalid: continuing anyway" + nacl-objdump -d hw-test.nexe > hw-test.disasm + sel_ldr -a -c hw-test.nexe + fi +fi + +exit 0 + diff --git a/runtime/mono-wrapper.in b/runtime/mono-wrapper.in index eb6cd895921..3c94c5a8df1 100644 --- a/runtime/mono-wrapper.in +++ b/runtime/mono-wrapper.in @@ -3,5 +3,6 @@ r='@mono_build_root@' MONO_CFG_DIR='@mono_cfg_dir@' PATH="$r/runtime/_tmpinst/bin:$PATH" MONO_SHARED_DIR=$r/runtime +export MONO_NACL_ALIGN_MASK_OFF=@MONO_NACL_ALIGN_MASK_OFF@ export MONO_CFG_DIR MONO_SHARED_DIR PATH exec "$r/libtool" --mode=execute "$r/@mono_runtime@" --config "@mono_cfg_dir@/mono/config" "$@"
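
For reference, the standalone sketch below (not part of the patch) illustrates two pieces of the NaCl code-manager changes above: the bundle rounding that allocate_code() applies with kNaClBundleSize/kNaClBundleMask from mono-codeman.h, and the source-buffer/code-destination base translation behind nacl_modify_patch_target() and nacl_inverse_modify_patch_target(). The buffer addresses and the helper name round_to_bundle are invented for the example; only the arithmetic mirrors the patch.

/* sketch.c - standalone illustration, not part of the mono patch */
#include <stdio.h>
#include <stdint.h>

#define kNaClBundleSize 32
#define kNaClBundleMask (kNaClBundleSize - 1)

/* Round a size up to the next bundle boundary, as allocate_code() does
 * before advancing next_dynamic_code_addr. */
static intptr_t round_to_bundle (intptr_t increment)
{
	return (increment & kNaClBundleMask)
		? (increment & ~(intptr_t)kNaClBundleMask) + kNaClBundleSize
		: increment;
}

int main (void)
{
	/* Made-up addresses: the temp buffer the JIT writes into (source base)
	 * and the dynamic-code region nacl_dyncode_create() copies it to
	 * (dest base), standing in for patch_source_base/patch_dest_base. */
	unsigned char *source_base = (unsigned char *)0x100000;
	unsigned char *dest_base   = (unsigned char *)0x200000;
	unsigned char *target_in_source, *target_in_dest;

	printf ("round_to_bundle(5)  = %ld\n", (long)round_to_bundle (5));  /* 32 */
	printf ("round_to_bundle(32) = %ld\n", (long)round_to_bundle (32)); /* 32 */
	printf ("round_to_bundle(33) = %ld\n", (long)round_to_bundle (33)); /* 64 */

	/* A target emitted into the temp buffer is rebased onto the code
	 * destination by keeping its offset, which is what
	 * nacl_inverse_modify_patch_target() does with the per-thread bases. */
	target_in_source = source_base + 0x40;
	target_in_dest = dest_base + (target_in_source - source_base);
	printf ("rebased target: %p -> %p\n",
		(void *)target_in_source, (void *)target_in_dest);
	return 0;
}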