NaCl ARM work, first pass
author Elijah Taylor <elijahtaylor@google.com>
Thu, 31 Jan 2013 22:00:58 +0000 (14:00 -0800)
committer Zoltan Varga <vargaz@gmail.com>
Sun, 7 Apr 2013 07:47:16 +0000 (09:47 +0200)
configure.in
libgc/include/private/gc_locks.h
libgc/include/private/gcconfig.h
libgc/pthread_stop_world.c
mono/mini/exceptions-arm.c
mono/mini/mini-arm.c
mono/mini/mini-arm.h
mono/mini/mini-ops.h
mono/utils/atomic.h

index 72ad17034e357dd9c256895c0bec3135fa01d4d3..b12e87886cba9c9e2ac5cc7df0c8707023eed1b1 100644 (file)
@@ -2586,6 +2586,15 @@ case "$host" in
                   NESTED_LIBGC_FLAGS="$NESTED_LIBGC_FLAGS -DHAVE_ARMV6"
                fi
                ;;
+# TODO: make proper support for NaCl host.
+#        arm*-*nacl)
+#              TARGET=ARM;
+#              arch_target=arm;
+#              ACCESS_UNALIGNED="no"
+#              JIT_SUPPORTED=yes
+#              sgen_supported=true
+#              AOT_SUPPORTED="no"
+#              ;;
        s390-*-linux*)
                TARGET=S390;
                arch_target=s390;
@@ -2650,6 +2659,28 @@ if test "x$host" != "x$target"; then
                AC_DEFINE(__mono_ilp32__, 1, [64 bit mode with 4 byte longs and pointers])
                sizeof_register=8
                ;;
+# TODO: make proper support for NaCl target.
+#   arm*-*nacl)
+#              TARGET=ARM
+#              arch_target=arm
+#              AC_DEFINE(TARGET_ARM, 1, [...])
+#              ACCESS_UNALIGNED="no"
+#              JIT_SUPPORTED=yes
+#              sizeof_register=4
+#               CPPFLAGS="$CPPFLAGS \
+#                    -DARM_FPU_VFP=1 -D__ARM_EABI__ \
+#                    -D__arm__ \
+#                    -D__portable_native_client__ \
+#                    -DARM_FPU_VFP=1 \
+#                    -Dtimezone=_timezone \
+#                    -DDISABLE_SOCKETS \
+#                    -DDISABLE_ATTACH \
+#                    -DUSE_NEWLIB"
+#              jit_wanted=true
+               # Can't use tls, since it depends on the runtime detection of tls offsets
+               # in mono-compiler.h
+#              with_tls=pthread
+#              ;;
    i686-*-nacl)
                TARGET=X86
                arch_target=x86
index 8705d07a1bbc072365c40381008c9e178821eea3..80712fcdf3f597637aed10d181d2642af72c8917 100644 (file)
 #       define GC_CLEAR_DEFINED
 #    endif /* ALPHA */
 #    ifdef ARM32
+#ifdef __native_client__
+#define NACL_ALIGN() ".align 4\n" /* assembler alignment directive, pasted into the asm templates as a string; NOTE(review): presumably a 16-byte boundary with the ARM assembler - confirm */
+#define MASK_REGISTER(reg) "bic " reg ", " reg ", #0xc0000000\n" /* emits "bic reg, reg, #0xc0000000", i.e. clears the top two bits of reg in place; NOTE(review): call sites that pass an input-only asm operand are modifying that operand - confirm this is intended */
+#else
+#define NACL_ALIGN() /* expands to nothing when __native_client__ is not defined */
+#define MASK_REGISTER(reg) /* expands to nothing when __native_client__ is not defined */
+#endif
         inline static int GC_test_and_set(volatile unsigned int *addr) {
 #if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__)
           int ret, tmp;
           __asm__ __volatile__ (
                                  "1:\n"
+                                 NACL_ALIGN()
+                                 MASK_REGISTER("%3")
                                  "ldrex %0, [%3]\n"
+                                 MASK_REGISTER("%3")
                                  "strex %1, %2, [%3]\n" 
                                  "teq %1, #0\n"
                                  "bne 1b\n"
            * bus because there are no SMP ARM machines.  If/when there are,
            * this code will likely need to be updated. */
           /* See linuxthreads/sysdeps/arm/pt-machine.h in glibc-2.1 */
-          __asm__ __volatile__("swp %0, %1, [%2]"
+          __asm__ __volatile__(MASK_REGISTER("%2")
+                               "swp %0, %1, [%2]"
                             : "=&r"(oldval)
                             : "r"(1), "r"(addr)
                             : "memory");
index 49ffccddf505b1680240cce5069929abae02ebd8..88e783800de9faf427837c7bd93a1b3bd15f99cd 100644 (file)
@@ -76,7 +76,9 @@
 # endif
 # if defined(__arm__) || defined(__thumb__)
 #    define ARM32
-#    if !defined(LINUX) && !defined(NETBSD) && !defined(DARWIN)
+#    if defined(NACL)
+#      define mach_type_known
+#    elif !defined(LINUX) && !defined(NETBSD) && !defined(DARWIN)
 #      define NOSYS
 #      define mach_type_known
 #    endif
 # endif
 
 # ifdef ARM32
-#   define CPP_WORDSZ 32
+# if defined( NACL )
+#   define MACH_TYPE "NACL"
+# else
 #   define MACH_TYPE "ARM32"
+# endif
+#   define CPP_WORDSZ 32
 #   define ALIGNMENT 4
 #   ifdef NETBSD
 #       define OS_TYPE "NETBSD"
index 8b25376b54ab6d691176ba61f0d7345c6b5c1ab6..fe2ba1a381025cf22af5cf92b04d45bf44600ad9 100644 (file)
@@ -537,6 +537,22 @@ static void pthread_stop_world()
        __asm__ __volatile__ ("add $16, %esp");\
     } while (0)
 
+#elif __arm__
+
+#define NACL_STORE_REGS()  \
+    do {                  \
+       __asm__ __volatile__ ("push {r4-r12,lr}");\
+       __asm__ __volatile__ ("mov r0, %0" : : "r" (&nacl_gc_thread_self->stop_info.stack_ptr)); \
+       __asm__ __volatile__ ("bic r0, r0, #0xc0000000");\
+       __asm__ __volatile__ ("str sp, [r0]");\
+       memcpy(nacl_gc_thread_self->stop_info.reg_storage, nacl_gc_thread_self->stop_info.stack_ptr, NACL_GC_REG_STORAGE_SIZE * sizeof(ptr_t));\
+       __asm__ __volatile__ ("add sp, sp, #40");\
+       __asm__ __volatile__ ("bic sp, sp, #0xc0000000");\
+    } while (0)
+#else
+
+#error "Please port NACL_STORE_REGS"
+
 #endif
 
 void nacl_pre_syscall_hook()
index 20d7ad41e4d3e9c54c46f27dbe92059fe0bcb51c..575654b67e3ca63724d4f5f70b7201fd356a9277 100644 (file)
@@ -493,6 +493,7 @@ mono_arch_find_jit_info (MonoDomain *domain, MonoJitTlsData *jit_tls,
        return FALSE;
 }
 
+#if MONO_ARCH_HAVE_SIGCTX_TO_MONOCTX
 void
 mono_arch_sigctx_to_monoctx (void *sigctx, MonoContext *mctx)
 {
@@ -504,6 +505,7 @@ mono_arch_monoctx_to_sigctx (MonoContext *mctx, void *ctx)
 {
        mono_monoctx_to_sigctx (mctx, ctx);
 }
+#endif /* MONO_ARCH_HAVE_SIGCTX_TO_MONOCTX */
 
 /*
  * handle_exception:
@@ -546,7 +548,7 @@ get_handle_signal_exception_addr (void)
 gboolean
 mono_arch_handle_exception (void *ctx, gpointer obj)
 {
-#if defined(MONO_CROSS_COMPILE)
+#if defined(MONO_CROSS_COMPILE) || !defined(MONO_ARCH_HAVE_SIGCTX_TO_MONOCTX)
        g_assert_not_reached ();
 #elif defined(MONO_ARCH_USE_SIGACTION)
        arm_ucontext *sigctx = ctx;
@@ -598,6 +600,8 @@ mono_arch_ip_from_context (void *sigctx)
 {
 #ifdef MONO_CROSS_COMPILE
        g_assert_not_reached ();
+#elif defined(__native_client__)
+       g_assert_not_reached ();
 #else
        arm_ucontext *my_uc = sigctx;
        return (void*) UCONTEXT_REG_PC (my_uc);
index eabff5c1c5c98ef4aa4d5404ed65e2ea73374ee2..99d84ec657d431df2bc3cfff5566109a69b06cd3 100644 (file)
@@ -25,7 +25,7 @@
 #include "mono/arch/arm/arm-fpa-codegen.h"
 #include "mono/arch/arm/arm-vfp-codegen.h"
 
-#if defined(__ARM_EABI__) && defined(__linux__) && !defined(PLATFORM_ANDROID)
+#if defined(__ARM_EABI__) && defined(__linux__) && !defined(PLATFORM_ANDROID) && !defined(__native_client__)
 #define HAVE_AEABI_READ_TP 1
 #endif
 
 #define IS_SOFT_FLOAT 0
 #endif
 
+#ifdef __native_client_codegen__
+const guint kNaClAlignment = kNaClAlignmentARM; /* ARM value behind the architecture-neutral name */
+const guint kNaClAlignmentMask = kNaClAlignmentMaskARM; /* ARM value behind the architecture-neutral name */
+gint8 nacl_align_byte = -1; /* 0xff when viewed as an unsigned byte; NOTE(review): presumably used when aligning generated code - confirm against the users of this variable */
+
+guint8 *
+mono_arch_nacl_pad (guint8 *code, int pad)
+{
+  /* Not yet properly implemented for ARM: the body asserts as soon as it is
+   * entered, so PAD is never consumed and nothing is written at CODE.  The
+   * return that follows hands CODE back unchanged. */
+  g_assert_not_reached ();
+  return code;
+}
+
+guint8 *
+mono_arch_nacl_skip_nops (guint8 *code)
+{
+  /* Not yet properly implemented for ARM: the body asserts as soon as it is
+   * entered and never inspects the instructions at CODE.  The return that
+   * follows hands CODE back unchanged. */
+  g_assert_not_reached ();
+  return code;
+}
+
+#endif /* __native_client_codegen__ */
+
 #define ALIGN_TO(val,align) ((((guint64)val) + ((align) - 1)) & ~((align) - 1))
 
 #if __APPLE__
@@ -198,8 +221,8 @@ mono_arch_fregname (int reg)
        return "unknown";
 }
 
-#ifndef DISABLE_JIT
 
+#ifndef DISABLE_JIT
 static guint8*
 emit_big_add (guint8 *code, int dreg, int sreg, int imm)
 {
@@ -762,11 +785,14 @@ void
 mono_arch_init (void)
 {
        InitializeCriticalSection (&mini_arch_mutex);
-
+#ifdef MONO_ARCH_SOFT_DEBUG_SUPPORTED
        if (mini_get_debug_options ()->soft_breakpoints) {
                single_step_func_wrapper = create_function_wrapper (debugger_agent_single_step_from_context);
                breakpoint_func_wrapper = create_function_wrapper (debugger_agent_breakpoint_from_context);
        } else {
+#else
+       {
+#endif
                ss_trigger_page = mono_valloc (NULL, mono_pagesize (), MONO_MMAP_READ|MONO_MMAP_32BIT);
                bp_trigger_page = mono_valloc (NULL, mono_pagesize (), MONO_MMAP_READ|MONO_MMAP_32BIT);
                mono_mprotect (bp_trigger_page, mono_pagesize (), 0);
@@ -988,6 +1014,11 @@ mono_arch_regalloc_cost (MonoCompile *cfg, MonoMethodVar *vmv)
 void
 mono_arch_flush_icache (guint8 *code, gint size)
 {
+#if defined(__native_client__)
+  // For Native Client we don't have to flush the i-cache here,
+  // as that is done by the dyncode interface.
+#else
+
 #ifdef MONO_CROSS_COMPILE
 #elif __APPLE__
        sys_icache_invalidate (code, size);
@@ -1014,6 +1045,7 @@ mono_arch_flush_icache (guint8 *code, gint size)
                        : "r" (code), "r" (code + size), "r" (0)
                        : "r0", "r1", "r3" );
 #endif
+#endif /* !__native_client__ */
 }
 
 typedef enum {
@@ -6353,6 +6385,8 @@ mono_arch_get_trampolines (gboolean aot)
        return mono_arm_get_exception_trampolines (aot);
 }
 
+
+#ifdef MONO_ARCH_SOFT_DEBUG_SUPPORTED
 /*
  * mono_arch_set_breakpoint:
  *
@@ -6539,6 +6573,8 @@ mono_arch_skip_single_step (MonoContext *ctx)
        MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + 4);
 }
 
+#endif /* MONO_ARCH_SOFT_DEBUG_SUPPORTED */
+
 /*
  * mono_arch_get_seq_point_info:
  *
index fa636f658fa0fd88d1ebb544ec44001b71cca418..f8aae7b9ac229a399b073fec5aaceb378e89828f 100644 (file)
@@ -9,6 +9,12 @@
 #include <mono/utils/mono-context.h>
 #include <glib.h>
 
+#ifdef __native_client_codegen__
+#define kNaClAlignmentARM 16 /* NOTE(review): presumably the code alignment unit in bytes - confirm */
+#define kNaClAlignmentMaskARM (kNaClAlignmentARM - 1) /* 15: the low bits below a multiple of kNaClAlignmentARM */
+#define kNaClLengthOfCallImm 4 /* NOTE(review): presumably the byte length of a call-immediate instruction - confirm */
+#endif
+
 #if defined(ARM_FPU_NONE) || (defined(__ARM_EABI__) && !defined(ARM_FPU_VFP) && !defined(ARM_FPU_VFP_HARD))
 #define MONO_ARCH_SOFT_FLOAT 1
 #endif
@@ -188,6 +194,11 @@ typedef struct MonoCompileArch {
 #define ARM_LAST_ARG_REG 3
 
 #define MONO_ARCH_USE_SIGACTION 1
+
+#if defined(__native_client__)
+#undef MONO_ARCH_USE_SIGACTION
+#endif
+
 #define MONO_ARCH_NEED_DIV_CHECK 1
 
 #define MONO_ARCH_HAVE_CREATE_DELEGATE_TRAMPOLINE
@@ -221,8 +232,14 @@ typedef struct MonoCompileArch {
 #define MONO_ARCH_GSHAREDVT_SUPPORTED 1
 #define MONO_ARCH_HAVE_GENERAL_RGCTX_LAZY_FETCH_TRAMPOLINE 1
 
+#if defined(__native_client__)
+#undef MONO_ARCH_SOFT_DEBUG_SUPPORTED
+#undef MONO_ARCH_HAVE_SIGCTX_TO_MONOCTX
+#undef MONO_ARCH_HAVE_CONTEXT_SET_INT_REG
+#endif
+
 /* Matches the HAVE_AEABI_READ_TP define in mini-arm.c */
-#if defined(__ARM_EABI__) && defined(__linux__) && !defined(TARGET_ANDROID)
+#if defined(__ARM_EABI__) && defined(__linux__) && !defined(TARGET_ANDROID) && !defined(__native_client__)
 #define MONO_ARCH_HAVE_TLS_GET 1
 #endif
 
@@ -274,4 +291,3 @@ mono_arm_load_jumptable_entry (guint8 *code, gpointer *jte, ARMReg reg) MONO_INT
 #endif
 
 #endif /* __MONO_MINI_ARM_H__ */
-
index e446e0fd59e507e1f69e15fa7afb5ae01384510c..394610bbc6facb512f0955cdd7029a008aeaad2d 100644 (file)
@@ -925,7 +925,7 @@ MINI_OP(OP_GC_PARAM_SLOT_LIVENESS_DEF, "gc_param_slot_liveness_def", NONE, NONE,
 /* #if defined(__native_client_codegen__) || defined(__native_client__) */
 /* We have to define these in terms of the TARGET defines, not NaCl defines */
 /* because genmdesc.pl doesn't have multiple defines per platform.          */
-#if defined(TARGET_AMD64) || defined(TARGET_X86)
+#if defined(TARGET_AMD64) || defined(TARGET_X86) || defined(TARGET_ARM)
 MINI_OP(OP_NACL_GC_SAFE_POINT,     "nacl_gc_safe_point", IREG, NONE, NONE)
 #endif
 
index 91571f51b3166f465a5a3545a31ce974a0caed11..171c2a495a67f9fe7260061c385ee1d9c43df105 100644 (file)
@@ -733,6 +733,14 @@ static inline gint32 InterlockedExchangeAdd(volatile gint32 *dest, gint32 add)
 
 #elif defined(__arm__)
 
+#ifdef __native_client__
+#define MASK_REGISTER(reg, cond) "bic" cond " " reg ", " reg ", #0xc0000000\n" /* emits "bic<cond> reg, reg, #0xc0000000": clears the top two bits of reg in place; cond is the condition suffix ("al", "eq" or "ne" at the call sites in this file) */
+#define NACL_ALIGN() ".align 4\n" /* assembler alignment directive, pasted into the asm templates as a string; NOTE(review): presumably a 16-byte boundary with the ARM assembler - confirm */
+#else
+#define MASK_REGISTER(reg, cond) /* expands to nothing when __native_client__ is not defined */
+#define NACL_ALIGN() /* expands to nothing when __native_client__ is not defined */
+#endif
+
 /*
  * Atomic operations on ARM don't contain memory barriers, and the runtime code
  * depends on this, so we add them explicitly.
@@ -743,11 +751,16 @@ static inline gint32 InterlockedCompareExchange(volatile gint32 *dest, gint32 ex
 #if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7S__)
        gint32 ret, tmp;
        __asm__ __volatile__ (  "1:\n"
+                               NACL_ALIGN()
                                "dmb\n"
                                "mov    %0, #0\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%2", "al")
                                "ldrex %1, [%2]\n"
                                "teq    %1, %3\n"
                                "it eq\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%2", "eq")
                                "strexeq %0, %4, [%2]\n"
                                "teq %0, #0\n"
                                "bne 1b\n"
@@ -761,12 +774,18 @@ static inline gint32 InterlockedCompareExchange(volatile gint32 *dest, gint32 ex
        gint32 a, b;
 
        __asm__ __volatile__ (    "0:\n\t"
+                                 NACL_ALIGN()
+                                 MASK_REGISTER("%2", "al")
                                  "ldr %1, [%2]\n\t"
                                  "cmp %1, %4\n\t"
                                  "mov %0, %1\n\t"
                                  "bne 1f\n\t"
+                                 NACL_ALIGN()
+                                 MASK_REGISTER("%2", "al")
                                  "swp %0, %3, [%2]\n\t"
                                  "cmp %0, %1\n\t"
+                                 NACL_ALIGN()
+                                 MASK_REGISTER("%2", "ne")
                                  "swpne %3, %0, [%2]\n\t"
                                  "bne 0b\n\t"
                                  "1:"
@@ -785,10 +804,15 @@ static inline gpointer InterlockedCompareExchangePointer(volatile gpointer *dest
        __asm__ __volatile__ (
                                "dmb\n"
                                "1:\n"
+                               NACL_ALIGN()
                                "mov    %0, #0\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%2", "al")
                                "ldrex %1, [%2]\n"
                                "teq    %1, %3\n"
                                "it eq\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%2", "eq")
                                "strexeq %0, %4, [%2]\n"
                                "teq %0, #0\n"
                                "bne 1b\n"
@@ -802,12 +826,18 @@ static inline gpointer InterlockedCompareExchangePointer(volatile gpointer *dest
        gpointer a, b;
 
        __asm__ __volatile__ (    "0:\n\t"
+                                 NACL_ALIGN()
+                                 MASK_REGISTER("%2", "al")
                                  "ldr %1, [%2]\n\t"
                                  "cmp %1, %4\n\t"
                                  "mov %0, %1\n\t"
                                  "bne 1f\n\t"
+                                 NACL_ALIGN()
+                                 MASK_REGISTER("%2", "eq")
                                  "swpeq %0, %3, [%2]\n\t"
                                  "cmp %0, %1\n\t"
+                                 NACL_ALIGN()
+                                 MASK_REGISTER("%2", "ne")
                                  "swpne %3, %0, [%2]\n\t"
                                  "bne 0b\n\t"
                                  "1:"
@@ -826,8 +856,12 @@ static inline gint32 InterlockedIncrement(volatile gint32 *dest)
        __asm__ __volatile__ (
                                "dmb\n"
                                "1:\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%2", "al")
                                "ldrex %0, [%2]\n"
                                "add %0, %0, %3\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%2", "al")
                                "strex %1, %0, [%2]\n"
                                "teq %1, #0\n"
                                "bne 1b\n"
@@ -841,10 +875,16 @@ static inline gint32 InterlockedIncrement(volatile gint32 *dest)
        gint32 a, b, c;
 
        __asm__ __volatile__ (  "0:\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "al")
                                "ldr %0, [%3]\n\t"
                                "add %1, %0, %4\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "al")
                                "swp %2, %1, [%3]\n\t"
                                "cmp %0, %2\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "ne")
                                "swpne %1, %2, [%3]\n\t"
                                "bne 0b"
                                : "=&r" (a), "=&r" (b), "=&r" (c)
@@ -862,8 +902,12 @@ static inline gint32 InterlockedDecrement(volatile gint32 *dest)
        __asm__ __volatile__ (
                                "dmb\n"
                                "1:\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%2", "al")
                                "ldrex %0, [%2]\n"
                                "sub %0, %0, %3\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%2", "al")
                                "strex %1, %0, [%2]\n"
                                "teq %1, #0\n"
                                "bne 1b\n"
@@ -877,10 +921,16 @@ static inline gint32 InterlockedDecrement(volatile gint32 *dest)
        gint32 a, b, c;
 
        __asm__ __volatile__ (  "0:\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "al")
                                "ldr %0, [%3]\n\t"
                                "add %1, %0, %4\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "al")
                                "swp %2, %1, [%3]\n\t"
                                "cmp %0, %2\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "ne")
                                "swpne %1, %2, [%3]\n\t"
                                "bne 0b"
                                : "=&r" (a), "=&r" (b), "=&r" (c)
@@ -898,7 +948,11 @@ static inline gint32 InterlockedExchange(volatile gint32 *dest, gint32 exch)
        __asm__ __volatile__ (
                                  "dmb\n"
                              "1:\n"
+                             NACL_ALIGN()
+                             MASK_REGISTER("%3", "al")
                              "ldrex %0, [%3]\n"
+                             NACL_ALIGN()
+                             MASK_REGISTER("%3", "al")
                              "strex %1, %2, [%3]\n"
                              "teq %1, #0\n"
                              "bne 1b\n"
@@ -910,7 +964,9 @@ static inline gint32 InterlockedExchange(volatile gint32 *dest, gint32 exch)
 #else
        gint32 a;
 
-       __asm__ __volatile__ (  "swp %0, %2, [%1]"
+       __asm__ __volatile__ (  NACL_ALIGN()
+                               MASK_REGISTER("%1", "al")
+                                "swp %0, %2, [%1]"
                                : "=&r" (a)
                                : "r" (dest), "r" (exch));
 
@@ -925,7 +981,11 @@ static inline gpointer InterlockedExchangePointer(volatile gpointer *dest, gpoin
        __asm__ __volatile__ (
                                  "dmb\n"
                              "1:\n"
+                             NACL_ALIGN()
+                             MASK_REGISTER("%3", "al")
                              "ldrex %0, [%3]\n"
+                             NACL_ALIGN()
+                             MASK_REGISTER("%3", "al")
                              "strex %1, %2, [%3]\n"
                              "teq %1, #0\n"
                              "bne 1b\n"
@@ -937,7 +997,9 @@ static inline gpointer InterlockedExchangePointer(volatile gpointer *dest, gpoin
 #else
        gpointer a;
 
-       __asm__ __volatile__ (  "swp %0, %2, [%1]"
+       __asm__ __volatile__ (  NACL_ALIGN()
+                               MASK_REGISTER("%1", "al")
+                                "swp %0, %2, [%1]"
                                : "=&r" (a)
                                : "r" (dest), "r" (exch));
 
@@ -952,8 +1014,12 @@ static inline gint32 InterlockedExchangeAdd(volatile gint32 *dest, gint32 add)
        __asm__ __volatile__ (
                                "dmb\n"
                                "1:\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "al")
                                "ldrex %0, [%3]\n"
                                "add %1, %0, %4\n"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "al")
                                "strex %2, %1, [%3]\n"
                                "teq %2, #0\n"
                                "bne 1b\n"
@@ -967,10 +1033,16 @@ static inline gint32 InterlockedExchangeAdd(volatile gint32 *dest, gint32 add)
        int a, b, c;
 
        __asm__ __volatile__ (  "0:\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "al")
                                "ldr %0, [%3]\n\t"
                                "add %1, %0, %4\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "al")
                                "swp %2, %1, [%3]\n\t"
                                "cmp %0, %2\n\t"
+                               NACL_ALIGN()
+                               MASK_REGISTER("%3", "ne")
                                "swpne %1, %2, [%3]\n\t"
                                "bne 0b"
                                : "=&r" (a), "=&r" (b), "=&r" (c)