Merge pull request #1659 from alexanderkyte/stringbuilder-referencesource
diff --git a/mono/mini/mini-amd64.c b/mono/mini/mini-amd64.c
index 3a9b8c245da5861e1fd4f2983c129d82e9e6be28..1a31eab4d025032dbed313949f1e334befc62c40 100755
@@ -594,8 +594,6 @@ merge_argument_class_from_type (MonoGenericSharingContext *gsctx, MonoType *type
 
        ptype = mini_type_get_underlying_type (gsctx, type);
        switch (ptype->type) {
-       case MONO_TYPE_BOOLEAN:
-       case MONO_TYPE_CHAR:
        case MONO_TYPE_I1:
        case MONO_TYPE_U1:
        case MONO_TYPE_I2:
@@ -751,7 +749,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn
        klass = mono_class_from_mono_type (type);
        size = mini_type_stack_size_full (gsctx, &klass->byval_arg, NULL, sig->pinvoke);
 #ifndef HOST_WIN32
-       if (!sig->pinvoke && !disable_vtypes_in_regs && ((is_return && (size == 8)) || (!is_return && (size <= 16)))) {
+       if (!sig->pinvoke && ((is_return && (size == 8)) || (!is_return && (size <= 16)))) {
                /* We pass and return vtypes of size 8 in a register */
        } else if (!sig->pinvoke || (size == 0) || (size > 16)) {
                pass_on_stack = TRUE;
@@ -1011,73 +1009,69 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign
 #endif
 
        /* return value */
-       {
-               ret_type = mini_type_get_underlying_type (gsctx, sig->ret);
-               switch (ret_type->type) {
-               case MONO_TYPE_BOOLEAN:
-               case MONO_TYPE_I1:
-               case MONO_TYPE_U1:
-               case MONO_TYPE_I2:
-               case MONO_TYPE_U2:
-               case MONO_TYPE_CHAR:
-               case MONO_TYPE_I4:
-               case MONO_TYPE_U4:
-               case MONO_TYPE_I:
-               case MONO_TYPE_U:
-               case MONO_TYPE_PTR:
-               case MONO_TYPE_FNPTR:
-               case MONO_TYPE_CLASS:
-               case MONO_TYPE_OBJECT:
-               case MONO_TYPE_SZARRAY:
-               case MONO_TYPE_ARRAY:
-               case MONO_TYPE_STRING:
-                       cinfo->ret.storage = ArgInIReg;
-                       cinfo->ret.reg = AMD64_RAX;
-                       break;
-               case MONO_TYPE_U8:
-               case MONO_TYPE_I8:
+       ret_type = mini_type_get_underlying_type (gsctx, sig->ret);
+       switch (ret_type->type) {
+       case MONO_TYPE_I1:
+       case MONO_TYPE_U1:
+       case MONO_TYPE_I2:
+       case MONO_TYPE_U2:
+       case MONO_TYPE_I4:
+       case MONO_TYPE_U4:
+       case MONO_TYPE_I:
+       case MONO_TYPE_U:
+       case MONO_TYPE_PTR:
+       case MONO_TYPE_FNPTR:
+       case MONO_TYPE_CLASS:
+       case MONO_TYPE_OBJECT:
+       case MONO_TYPE_SZARRAY:
+       case MONO_TYPE_ARRAY:
+       case MONO_TYPE_STRING:
+               cinfo->ret.storage = ArgInIReg;
+               cinfo->ret.reg = AMD64_RAX;
+               break;
+       case MONO_TYPE_U8:
+       case MONO_TYPE_I8:
+               cinfo->ret.storage = ArgInIReg;
+               cinfo->ret.reg = AMD64_RAX;
+               break;
+       case MONO_TYPE_R4:
+               cinfo->ret.storage = ArgInFloatSSEReg;
+               cinfo->ret.reg = AMD64_XMM0;
+               break;
+       case MONO_TYPE_R8:
+               cinfo->ret.storage = ArgInDoubleSSEReg;
+               cinfo->ret.reg = AMD64_XMM0;
+               break;
+       case MONO_TYPE_GENERICINST:
+               if (!mono_type_generic_inst_is_valuetype (ret_type)) {
                        cinfo->ret.storage = ArgInIReg;
                        cinfo->ret.reg = AMD64_RAX;
                        break;
-               case MONO_TYPE_R4:
-                       cinfo->ret.storage = ArgInFloatSSEReg;
-                       cinfo->ret.reg = AMD64_XMM0;
-                       break;
-               case MONO_TYPE_R8:
-                       cinfo->ret.storage = ArgInDoubleSSEReg;
-                       cinfo->ret.reg = AMD64_XMM0;
-                       break;
-               case MONO_TYPE_GENERICINST:
-                       if (!mono_type_generic_inst_is_valuetype (ret_type)) {
-                               cinfo->ret.storage = ArgInIReg;
-                               cinfo->ret.reg = AMD64_RAX;
-                               break;
-                       }
-                       /* fall through */
+               }
+               /* fall through */
 #if defined( __native_client_codegen__ )
-               case MONO_TYPE_TYPEDBYREF:
+       case MONO_TYPE_TYPEDBYREF:
 #endif
-               case MONO_TYPE_VALUETYPE: {
-                       guint32 tmp_gr = 0, tmp_fr = 0, tmp_stacksize = 0;
+       case MONO_TYPE_VALUETYPE: {
+               guint32 tmp_gr = 0, tmp_fr = 0, tmp_stacksize = 0;
 
-                       add_valuetype (gsctx, sig, &cinfo->ret, ret_type, TRUE, &tmp_gr, &tmp_fr, &tmp_stacksize);
-                       if (cinfo->ret.storage == ArgOnStack) {
-                               cinfo->vtype_retaddr = TRUE;
-                               /* The caller passes the address where the value is stored */
-                       }
-                       break;
+               add_valuetype (gsctx, sig, &cinfo->ret, ret_type, TRUE, &tmp_gr, &tmp_fr, &tmp_stacksize);
+               if (cinfo->ret.storage == ArgOnStack) {
+                       cinfo->vtype_retaddr = TRUE;
+                       /* The caller passes the address where the value is stored */
                }
+               break;
+       }
 #if !defined( __native_client_codegen__ )
-               case MONO_TYPE_TYPEDBYREF:
-                       /* Same as a valuetype with size 24 */
-                       cinfo->vtype_retaddr = TRUE;
-                       break;
+       case MONO_TYPE_TYPEDBYREF:
+               /* Same as a valuetype with size 24 */
+               cinfo->vtype_retaddr = TRUE;
+               break;
 #endif
-               case MONO_TYPE_VOID:
-                       break;
-               default:
-                       g_error ("Can't handle as return value 0x%x", ret_type->type);
-               }
+       case MONO_TYPE_VOID:
+               break;
+       default:
+               g_error ("Can't handle as return value 0x%x", ret_type->type);
        }
 
        pstart = 0;
@@ -1141,14 +1135,12 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign
 
                ptype = mini_type_get_underlying_type (gsctx, sig->params [i]);
                switch (ptype->type) {
-               case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
                case MONO_TYPE_U1:
                        add_general (&gr, &stack_size, ainfo);
                        break;
                case MONO_TYPE_I2:
                case MONO_TYPE_U2:
-               case MONO_TYPE_CHAR:
                        add_general (&gr, &stack_size, ainfo);
                        break;
                case MONO_TYPE_I4:
@@ -1250,7 +1242,7 @@ mono_arch_tail_call_supported (MonoCompile *cfg, MonoMethodSignature *caller_sig
        c1 = get_call_info (NULL, NULL, caller_sig);
        c2 = get_call_info (NULL, NULL, callee_sig);
        res = c1->stack_usage >= c2->stack_usage;
-       callee_ret = mini_replace_type (callee_sig->ret);
+       callee_ret = mini_get_underlying_type (cfg, callee_sig->ret);
        if (callee_ret && MONO_TYPE_ISSTRUCT (callee_ret) && c2->ret.storage != ArgValuetypeInReg)
                /* An address on the callee's stack is passed as the first argument */
                res = FALSE;
@@ -1649,17 +1641,14 @@ mono_arch_fill_argument_info (MonoCompile *cfg)
 {
        MonoType *sig_ret;
        MonoMethodSignature *sig;
-       MonoMethodHeader *header;
        MonoInst *ins;
        int i;
        CallInfo *cinfo;
 
-       header = cfg->header;
-
        sig = mono_method_signature (cfg->method);
 
        cinfo = cfg->arch.cinfo;
-       sig_ret = mini_replace_type (sig->ret);
+       sig_ret = mini_get_underlying_type (cfg, sig->ret);
 
        /*
         * Contrary to mono_arch_allocate_vars (), the information should describe
@@ -1693,15 +1682,9 @@ mono_arch_fill_argument_info (MonoCompile *cfg)
 
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = &cinfo->args [i];
-               MonoType *arg_type;
 
                ins = cfg->args [i];
 
-               if (sig->hasthis && (i == 0))
-                       arg_type = &mono_defaults.object_class->byval_arg;
-               else
-                       arg_type = sig->params [i - sig->hasthis];
-
                switch (ainfo->storage) {
                case ArgInIReg:
                case ArgInFloatSSEReg:
@@ -1729,19 +1712,16 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 {
        MonoType *sig_ret;
        MonoMethodSignature *sig;
-       MonoMethodHeader *header;
        MonoInst *ins;
        int i, offset;
        guint32 locals_stack_size, locals_stack_align;
        gint32 *offsets;
        CallInfo *cinfo;
 
-       header = cfg->header;
-
        sig = mono_method_signature (cfg->method);
 
        cinfo = cfg->arch.cinfo;
-       sig_ret = mini_replace_type (sig->ret);
+       sig_ret = mini_get_underlying_type (cfg, sig->ret);
 
        mono_arch_compute_omit_fp (cfg);
 
@@ -1885,12 +1865,6 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                if (ins->opcode != OP_REGVAR) {
                        ArgInfo *ainfo = &cinfo->args [i];
                        gboolean inreg = TRUE;
-                       MonoType *arg_type;
-
-                       if (sig->hasthis && (i == 0))
-                               arg_type = &mono_defaults.object_class->byval_arg;
-                       else
-                               arg_type = sig->params [i - sig->hasthis];
 
                        if (cfg->globalra) {
                                /* The new allocator needs info about the original locations of the arguments */
@@ -2022,7 +1996,7 @@ mono_arch_create_vars (MonoCompile *cfg)
        if (cinfo->ret.storage == ArgValuetypeInReg)
                cfg->ret_var_is_local = TRUE;
 
-       sig_ret = mini_replace_type (sig->ret);
+       sig_ret = mini_get_underlying_type (cfg, sig->ret);
        if ((cinfo->ret.storage != ArgValuetypeInReg) && MONO_TYPE_ISSTRUCT (sig_ret)) {
                cfg->vret_addr = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_ARG);
                if (G_UNLIKELY (cfg->verbose_level > 1)) {
@@ -2141,6 +2115,7 @@ emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
        MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, cinfo->sig_cookie.offset, sig_reg);
 }
 
+#ifdef ENABLE_LLVM
 static inline LLVMArgStorage
 arg_storage_to_llvm_arg_storage (MonoCompile *cfg, ArgStorage storage)
 {
@@ -2155,7 +2130,6 @@ arg_storage_to_llvm_arg_storage (MonoCompile *cfg, ArgStorage storage)
        }
 }
 
-#ifdef ENABLE_LLVM
 LLVMCallInfo*
 mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig)
 {
@@ -2167,7 +2141,7 @@ mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig)
        MonoType *t, *sig_ret;
 
        n = sig->param_count + sig->hasthis;
-       sig_ret = mini_replace_type (sig->ret);
+       sig_ret = mini_get_underlying_type (cfg, sig->ret);
 
        cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig);
 
@@ -2256,12 +2230,10 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
        MonoInst *arg, *in;
        MonoMethodSignature *sig;
        MonoType *sig_ret;
-       int i, n, stack_size;
+       int i, n;
        CallInfo *cinfo;
        ArgInfo *ainfo;
 
-       stack_size = 0;
-
        sig = call->signature;
        n = sig->param_count + sig->hasthis;
 
@@ -2290,6 +2262,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
                else
                        t = sig->params [i - sig->hasthis];
 
+               t = mini_get_underlying_type (cfg, t);
                if (ainfo->storage == ArgOnStack && !MONO_TYPE_ISSTRUCT (t) && !call->tail_call) {
                        if (!t->byref) {
                                if (t->type == MONO_TYPE_R4)
@@ -2398,7 +2371,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos))
                emit_sig_cookie (cfg, call, cinfo);
 
-       sig_ret = mini_replace_type (sig->ret);
+       sig_ret = mini_get_underlying_type (cfg, sig->ret);
        if (sig_ret && MONO_TYPE_ISSTRUCT (sig_ret)) {
                MonoInst *vtarg;
 
@@ -2533,7 +2506,7 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
 void
 mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val)
 {
-       MonoType *ret = mini_replace_type (mono_method_signature (method)->ret);
+       MonoType *ret = mini_get_underlying_type (cfg, mono_method_signature (method)->ret);
 
        if (ret->type == MONO_TYPE_R4) {
                if (COMPILE_LLVM (cfg))
@@ -2706,7 +2679,7 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g
                p->regs [greg ++] = PTR_TO_GREG(ret);
 
        for (i = pindex; i < sig->param_count; i++) {
-               MonoType *t = mono_type_get_underlying_type (sig->params [i]);
+               MonoType *t = mini_type_get_underlying_type (NULL, sig->params [i]);
                gpointer *arg = args [arg_index ++];
 
                if (t->byref) {
@@ -2737,7 +2710,6 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g
                        p->regs [greg ++] = *(guint64*)(arg);
                        break;
 #endif
-               case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_U1:
                        p->regs [greg ++] = *(guint8*)(arg);
                        break;
@@ -2748,7 +2720,6 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g
                        p->regs [greg ++] = *(gint16*)(arg);
                        break;
                case MONO_TYPE_U2:
-               case MONO_TYPE_CHAR:
                        p->regs [greg ++] = *(guint16*)(arg);
                        break;
                case MONO_TYPE_I4:
@@ -2802,7 +2773,7 @@ mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf)
        MonoMethodSignature *sig = dinfo->sig;
        guint8 *ret = ((DynCallArgs*)buf)->ret;
        mgreg_t res = ((DynCallArgs*)buf)->res;
-       MonoType *sig_ret = mono_type_get_underlying_type (sig->ret);
+       MonoType *sig_ret = mini_type_get_underlying_type (NULL, sig->ret);
 
        switch (sig_ret->type) {
        case MONO_TYPE_VOID:
@@ -2822,14 +2793,12 @@ mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf)
                *(gint8*)ret = res;
                break;
        case MONO_TYPE_U1:
-       case MONO_TYPE_BOOLEAN:
                *(guint8*)ret = res;
                break;
        case MONO_TYPE_I2:
                *(gint16*)ret = res;
                break;
        case MONO_TYPE_U2:
-       case MONO_TYPE_CHAR:
                *(guint16*)ret = res;
                break;
        case MONO_TYPE_I4:
@@ -3353,7 +3322,10 @@ cc_signed_table [] = {
 static unsigned char*
 emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int sreg, int size, gboolean is_signed)
 {
-       amd64_sse_cvttsd2si_reg_reg (code, dreg, sreg);
+       if (size == 8)
+               amd64_sse_cvttsd2si_reg_reg (code, dreg, sreg);
+       else
+               amd64_sse_cvttsd2si_reg_reg_size (code, dreg, sreg, 4);
 
        if (size == 1)
                amd64_widen_reg (code, dreg, dreg, is_signed, FALSE);
@@ -3484,8 +3456,9 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
                break;
        case OP_FCALL:
        case OP_FCALL_REG:
-       case OP_FCALL_MEMBASE:
-               if (((MonoCallInst*)ins)->signature->ret->type == MONO_TYPE_R4) {
+       case OP_FCALL_MEMBASE: {
+               MonoType *rtype = mini_get_underlying_type (cfg, ((MonoCallInst*)ins)->signature->ret);
+               if (rtype->type == MONO_TYPE_R4) {
                        amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, AMD64_XMM0);
                }
                else {
@@ -3493,6 +3466,13 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
                                amd64_sse_movsd_reg_reg (code, ins->dreg, AMD64_XMM0);
                }
                break;
+       }
+       case OP_RCALL:
+       case OP_RCALL_REG:
+       case OP_RCALL_MEMBASE:
+               if (ins->dreg != AMD64_XMM0)
+                       amd64_sse_movss_reg_reg (code, ins->dreg, AMD64_XMM0);
+               break;
        case OP_VCALL:
        case OP_VCALL_REG:
        case OP_VCALL_MEMBASE:
@@ -3540,7 +3520,7 @@ static int tls_gs_offset;
 gboolean
 mono_amd64_have_tls_get (void)
 {
-#ifdef __APPLE__
+#ifdef TARGET_MACH
        static gboolean have_tls_get = FALSE;
        static gboolean inited = FALSE;
        guint8 *ins;
@@ -3571,6 +3551,8 @@ mono_amd64_have_tls_get (void)
        tls_gs_offset = ins[5];
 
        return have_tls_get;
+#elif defined(TARGET_ANDROID)
+       return FALSE;
 #else
        return TRUE;
 #endif
@@ -3770,8 +3752,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        MonoCallInst *call;
        guint offset;
        guint8 *code = cfg->native_code + cfg->code_len;
-       MonoInst *last_ins = NULL;
-       guint last_offset = 0;
        int max_len;
 
        /* Fix max_offset estimate for each successor bb */
@@ -3814,7 +3794,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        if (cfg->verbose_level > 2)
                g_print ("Basic block %d starting at offset 0x%x\n", bb->block_num, bb->native_offset);
 
-       if (cfg->prof_options & MONO_PROFILE_COVERAGE) {
+       if ((cfg->prof_options & MONO_PROFILE_COVERAGE) && cfg->coverage_info) {
                MonoProfileCoverageInfo *cov = cfg->coverage_info;
                g_assert (!cfg->compile_aot);
 
@@ -4431,9 +4411,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_shift_reg (code, X86_SAR, ins->dreg);
                        break;
                case OP_SHR_IMM:
-                       g_assert (amd64_is_imm32 (ins->inst_imm));
-                       amd64_shift_reg_imm_size (code, X86_SAR, ins->dreg, ins->inst_imm, 4);
-                       break;
                case OP_LSHR_IMM:
                        g_assert (amd64_is_imm32 (ins->inst_imm));
                        amd64_shift_reg_imm (code, X86_SAR, ins->dreg, ins->inst_imm);
@@ -4451,9 +4428,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_shift_reg (code, X86_SHR, ins->dreg);
                        break;
                case OP_SHL_IMM:
-                       g_assert (amd64_is_imm32 (ins->inst_imm));
-                       amd64_shift_reg_imm_size (code, X86_SHL, ins->dreg, ins->inst_imm, 4);
-                       break;
                case OP_LSHL_IMM:
                        g_assert (amd64_is_imm32 (ins->inst_imm));
                        amd64_shift_reg_imm (code, X86_SHL, ins->dreg, ins->inst_imm);
@@ -4654,10 +4628,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_mov_reg_imm_size (code, ins->dreg, 0, 8);
                        break;
                case OP_MOVE:
-                       amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof(mgreg_t));
+                       if (ins->dreg != ins->sreg1)
+                               amd64_mov_reg_reg (code, ins->dreg, ins->sreg1, sizeof(mgreg_t));
                        break;
                case OP_AMD64_SET_XMMREG_R4: {
-                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1);
+                       if (cfg->r4fp) {
+                               if (ins->dreg != ins->sreg1)
+                                       amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg1);
+                       } else {
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1);
+                       }
                        break;
                }
                case OP_AMD64_SET_XMMREG_R8: {
@@ -4717,6 +4697,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                }
                case OP_CALL:
                case OP_FCALL:
+               case OP_RCALL:
                case OP_LCALL:
                case OP_VCALL:
                case OP_VCALL2:
@@ -4753,6 +4734,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        code = emit_move_return_value (cfg, ins, code);
                        break;
                case OP_FCALL_REG:
+               case OP_RCALL_REG:
                case OP_LCALL_REG:
                case OP_VCALL_REG:
                case OP_VCALL2_REG:
@@ -4801,6 +4783,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        code = emit_move_return_value (cfg, ins, code);
                        break;
                case OP_FCALL_MEMBASE:
+               case OP_RCALL_MEMBASE:
                case OP_LCALL_MEMBASE:
                case OP_VCALL_MEMBASE:
                case OP_VCALL2_MEMBASE:
@@ -5082,12 +5065,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        float f = *(float *)ins->inst_p0;
 
                        if ((f == 0.0) && (mono_signbit (f) == 0)) {
-                               amd64_sse_xorpd_reg_reg (code, ins->dreg, ins->dreg);
+                               if (cfg->r4fp)
+                                       amd64_sse_xorps_reg_reg (code, ins->dreg, ins->dreg);
+                               else
+                                       amd64_sse_xorpd_reg_reg (code, ins->dreg, ins->dreg);
                        }
                        else {
                                mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4, ins->inst_p0);
                                amd64_sse_movss_reg_membase (code, ins->dreg, AMD64_RIP, 0);
-                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                               if (!cfg->r4fp)
+                                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
                        }
                        break;
                }
@@ -5098,31 +5085,51 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_sse_movsd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
                        break;
                case OP_STORER4_MEMBASE_REG:
-                       /* This requires a double->single conversion */
-                       amd64_sse_cvtsd2ss_reg_reg (code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg1);
-                       amd64_sse_movss_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, MONO_ARCH_FP_SCRATCH_REG);
+                       if (cfg->r4fp) {
+                               amd64_sse_movss_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1);
+                       } else {
+                               /* This requires a double->single conversion */
+                               amd64_sse_cvtsd2ss_reg_reg (code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg1);
+                               amd64_sse_movss_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, MONO_ARCH_FP_SCRATCH_REG);
+                       }
                        break;
                case OP_LOADR4_MEMBASE:
-                       amd64_sse_movss_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
-                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       if (cfg->r4fp) {
+                               amd64_sse_movss_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       } else {
+                               amd64_sse_movss_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       }
                        break;
                case OP_ICONV_TO_R4:
-                       amd64_sse_cvtsi2ss_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
-                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       if (cfg->r4fp) {
+                               amd64_sse_cvtsi2ss_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       } else {
+                               amd64_sse_cvtsi2ss_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       }
                        break;
                case OP_ICONV_TO_R8:
                        amd64_sse_cvtsi2sd_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
                        break;
                case OP_LCONV_TO_R4:
-                       amd64_sse_cvtsi2ss_reg_reg (code, ins->dreg, ins->sreg1);
-                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       if (cfg->r4fp) {
+                               amd64_sse_cvtsi2ss_reg_reg (code, ins->dreg, ins->sreg1);
+                       } else {
+                               amd64_sse_cvtsi2ss_reg_reg (code, ins->dreg, ins->sreg1);
+                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       }
                        break;
                case OP_LCONV_TO_R8:
                        amd64_sse_cvtsi2sd_reg_reg (code, ins->dreg, ins->sreg1);
                        break;
                case OP_FCONV_TO_R4:
-                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1);
-                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       if (cfg->r4fp) {
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1);
+                       } else {
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1);
+                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       }
                        break;
                case OP_FCONV_TO_I1:
                        code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 1, TRUE);
@@ -5146,6 +5153,40 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_FCONV_TO_I8:
                        code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 8, TRUE);
                        break;
+
+               case OP_RCONV_TO_I1:
+                       amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE);
+                       break;
+               case OP_RCONV_TO_U1:
+                       amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
+                       break;
+               case OP_RCONV_TO_I2:
+                       amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE);
+                       break;
+               case OP_RCONV_TO_U2:
+                       amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE);
+                       break;
+               case OP_RCONV_TO_I4:
+                       amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       break;
+               case OP_RCONV_TO_U4:
+                       amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       break;
+               case OP_RCONV_TO_I8:
+                       amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 8);
+                       break;
+               case OP_RCONV_TO_R8:
+                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_RCONV_TO_R4:
+                       if (ins->dreg != ins->sreg1)
+                               amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+
                case OP_LCONV_TO_R_UN: { 
                        guint8 *br [2];
 
@@ -5189,6 +5230,29 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (ins->dreg != ins->sreg1)
                                amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
                        break;
+               case OP_RMOVE:
+                       if (ins->dreg != ins->sreg1)
+                               amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_MOVE_F_TO_I4:
+                       if (cfg->r4fp) {
+                               amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 8);
+                       } else {
+                               amd64_sse_cvtsd2ss_reg_reg (code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg1);
+                               amd64_movd_reg_xreg_size (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG, 8);
+                       }
+                       break;
+               case OP_MOVE_I4_TO_F:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 8);
+                       if (!cfg->r4fp)
+                               amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       break;
+               case OP_MOVE_F_TO_I8:
+                       amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 8);
+                       break;
+               case OP_MOVE_I8_TO_F:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 8);
+                       break;
                case OP_FADD:
                        amd64_sse_addsd_reg_reg (code, ins->dreg, ins->sreg2);
                        break;
@@ -5228,6 +5292,30 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_SQRT:
                        EMIT_SSE2_FPFUNC (code, fsqrt, ins->dreg, ins->sreg1);
                        break;
+
+               case OP_RADD:
+                       amd64_sse_addss_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               case OP_RSUB:
+                       amd64_sse_subss_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               case OP_RMUL:
+                       amd64_sse_mulss_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               case OP_RDIV:
+                       amd64_sse_divss_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               case OP_RNEG: {
+                       static float r4_0 = -0.0;
+
+                       g_assert (ins->sreg1 == ins->dreg);
+
+                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4, &r4_0);
+                       amd64_sse_movss_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_RIP, 0);
+                       amd64_sse_xorps_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG);
+                       break;
+               }
+
                case OP_IMIN:
                        g_assert (cfg->opt & MONO_OPT_CMOV);
                        g_assert (ins->dreg == ins->sreg1);
@@ -5285,12 +5373,21 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                         */
                        amd64_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1);
                        break;
+               case OP_RCOMPARE:
+                       /*
+                        * FIXME: Get rid of this.
+                        * The two arguments are swapped because the fbranch instructions
+                        * depend on this for the non-sse case to work.
+                        */
+                       amd64_sse_comiss_reg_reg (code, ins->sreg2, ins->sreg1);
+                       break;
                case OP_FCNEQ:
                case OP_FCEQ: {
                        /* zeroing the register at the start results in 
                         * shorter and faster code (we can also remove the widening op)
                         */
                        guchar *unordered_check;
+
                        amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
                        amd64_sse_comisd_reg_reg (code, ins->sreg1, ins->sreg2);
                        unordered_check = code;
@@ -5311,7 +5408,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                }
                case OP_FCLT:
-               case OP_FCLT_UN:
+               case OP_FCLT_UN: {
                        /* zeroing the register at the start results in 
                         * shorter and faster code (we can also remove the widening op)
                         */
@@ -5331,6 +5428,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
                        }
                        break;
+               }
                case OP_FCLE: {
                        guchar *unordered_check;
                        amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
@@ -5347,6 +5445,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                         * shorter and faster code (we can also remove the widening op)
                         */
                        guchar *unordered_check;
+
                        amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
                        amd64_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1);
                        if (ins->opcode == OP_FCGT) {
@@ -5369,7 +5468,58 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_patch (unordered_check, code);
                        break;
                }
-               
+
+               case OP_RCEQ:
+               case OP_RCGT:
+               case OP_RCLT:
+               case OP_RCLT_UN:
+               case OP_RCGT_UN: {
+                       int x86_cond;
+                       gboolean unordered = FALSE;
+
+                       amd64_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
+                       amd64_sse_comiss_reg_reg (code, ins->sreg2, ins->sreg1);
+
+                       switch (ins->opcode) {
+                       case OP_RCEQ:
+                               x86_cond = X86_CC_EQ;
+                               break;
+                       case OP_RCGT:
+                               x86_cond = X86_CC_LT;
+                               break;
+                       case OP_RCLT:
+                               x86_cond = X86_CC_GT;
+                               break;
+                       case OP_RCLT_UN:
+                               x86_cond = X86_CC_GT;
+                               unordered = TRUE;
+                               break;
+                       case OP_RCGT_UN:
+                               x86_cond = X86_CC_LT;
+                               unordered = TRUE;
+                               break;
+                       default:
+                               g_assert_not_reached ();
+                               break;
+                       }
+
+                       if (unordered) {
+                               guchar *unordered_check;
+                               guchar *jump_to_end;
+
+                               unordered_check = code;
+                               x86_branch8 (code, X86_CC_P, 0, FALSE);
+                               amd64_set_reg (code, x86_cond, ins->dreg, FALSE);
+                               jump_to_end = code;
+                               x86_jump8 (code, 0);
+                               amd64_patch (unordered_check, code);
+                               amd64_inc_reg (code, ins->dreg);
+                               amd64_patch (jump_to_end, code);
+                       } else {
+                               amd64_set_reg (code, x86_cond, ins->dreg, FALSE);
+                       }
+                       break;
+               }
                case OP_FCLT_MEMBASE:
                case OP_FCGT_MEMBASE:
                case OP_FCLT_UN_MEMBASE:
@@ -5522,14 +5672,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                }
                case OP_MEMORY_BARRIER: {
-                       switch (ins->backend.memory_barrier_kind) {
-                       case StoreLoadBarrier:
-                       case FullBarrier:
-                               /* http://blogs.sun.com/dave/resource/NHM-Pipeline-Blog-V2.txt */
-                               x86_prefix (code, X86_LOCK_PREFIX);
-                               amd64_alu_membase_imm (code, X86_ADD, AMD64_RSP, 0, 0);
-                               break;
-                       }
+                       if (ins->backend.memory_barrier_kind == MONO_MEMORY_BARRIER_SEQ)
+                               x86_mfence (code);
                        break;
                }
                case OP_ATOMIC_ADD_I4:
@@ -5553,62 +5697,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                }
                case OP_ATOMIC_EXCHANGE_I4:
                case OP_ATOMIC_EXCHANGE_I8: {
-                       guchar *br[2];
-                       int sreg2 = ins->sreg2;
-                       int breg = ins->inst_basereg;
-                       guint32 size;
-                       gboolean need_push = FALSE, rdx_pushed = FALSE;
-
-                       if (ins->opcode == OP_ATOMIC_EXCHANGE_I8)
-                               size = 8;
-                       else
-                               size = 4;
-
-                       /* 
-                        * See http://msdn.microsoft.com/en-us/magazine/cc302329.aspx for
-                        * an explanation of how this works.
-                        */
-
-                       /* cmpxchg uses eax as comperand, need to make sure we can use it
-                        * hack to overcome limits in x86 reg allocator 
-                        * (req: dreg == eax and sreg2 != eax and breg != eax) 
-                        */
-                       g_assert (ins->dreg == AMD64_RAX);
-
-                       if (breg == AMD64_RAX && ins->sreg2 == AMD64_RAX)
-                               /* Highly unlikely, but possible */
-                               need_push = TRUE;
-
-                       /* The pushes invalidate rsp */
-                       if ((breg == AMD64_RAX) || need_push) {
-                               amd64_mov_reg_reg (code, AMD64_R11, breg, 8);
-                               breg = AMD64_R11;
-                       }
-
-                       /* We need the EAX reg for the comparand */
-                       if (ins->sreg2 == AMD64_RAX) {
-                               if (breg != AMD64_R11) {
-                                       amd64_mov_reg_reg (code, AMD64_R11, AMD64_RAX, 8);
-                                       sreg2 = AMD64_R11;
-                               } else {
-                                       g_assert (need_push);
-                                       amd64_push_reg (code, AMD64_RDX);
-                                       amd64_mov_reg_reg (code, AMD64_RDX, AMD64_RAX, size);
-                                       sreg2 = AMD64_RDX;
-                                       rdx_pushed = TRUE;
-                               }
-                       }
-
-                       amd64_mov_reg_membase (code, AMD64_RAX, breg, ins->inst_offset, size);
-
-                       br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
-                       amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
-                       br [1] = code; amd64_branch8 (code, X86_CC_NE, -1, FALSE);
-                       amd64_patch (br [1], br [0]);
-
-                       if (rdx_pushed)
-                               amd64_pop_reg (code, AMD64_RDX);
+                       guint32 size = ins->opcode == OP_ATOMIC_EXCHANGE_I4 ? 4 : 8;
 
+                       /* LOCK prefix is implied. */
+                       amd64_mov_reg_reg (code, GP_SCRATCH_REG, ins->sreg2, size);
+                       amd64_xchg_membase_reg_size (code, ins->sreg1, ins->inst_offset, GP_SCRATCH_REG, size);
+                       amd64_mov_reg_reg (code, ins->dreg, GP_SCRATCH_REG, size);
                        break;
                }
                case OP_ATOMIC_CAS_I4:
@@ -5635,6 +5729,95 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_mov_reg_reg (code, ins->dreg, AMD64_RAX, size);
                        break;
                }
+               case OP_ATOMIC_LOAD_I1: {
+                       amd64_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, TRUE, FALSE);
+                       break;
+               }
+               case OP_ATOMIC_LOAD_U1: {
+                       amd64_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, FALSE, FALSE);
+                       break;
+               }
+               case OP_ATOMIC_LOAD_I2: {
+                       amd64_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, TRUE, TRUE);
+                       break;
+               }
+               case OP_ATOMIC_LOAD_U2: {
+                       amd64_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, FALSE, TRUE);
+                       break;
+               }
+               case OP_ATOMIC_LOAD_I4: {
+                       amd64_movsxd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       break;
+               }
+               case OP_ATOMIC_LOAD_U4:
+               case OP_ATOMIC_LOAD_I8:
+               case OP_ATOMIC_LOAD_U8: {
+                       amd64_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, ins->opcode == OP_ATOMIC_LOAD_U4 ? 4 : 8);
+                       break;
+               }
+               case OP_ATOMIC_LOAD_R4: {
+                       amd64_sse_movss_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       break;
+               }
+               case OP_ATOMIC_LOAD_R8: {
+                       amd64_sse_movsd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       break;
+               }
+               case OP_ATOMIC_STORE_I1:
+               case OP_ATOMIC_STORE_U1:
+               case OP_ATOMIC_STORE_I2:
+               case OP_ATOMIC_STORE_U2:
+               case OP_ATOMIC_STORE_I4:
+               case OP_ATOMIC_STORE_U4:
+               case OP_ATOMIC_STORE_I8:
+               case OP_ATOMIC_STORE_U8: {
+                       int size;
+
+                       switch (ins->opcode) {
+                       case OP_ATOMIC_STORE_I1:
+                       case OP_ATOMIC_STORE_U1:
+                               size = 1;
+                               break;
+                       case OP_ATOMIC_STORE_I2:
+                       case OP_ATOMIC_STORE_U2:
+                               size = 2;
+                               break;
+                       case OP_ATOMIC_STORE_I4:
+                       case OP_ATOMIC_STORE_U4:
+                               size = 4;
+                               break;
+                       case OP_ATOMIC_STORE_I8:
+                       case OP_ATOMIC_STORE_U8:
+                               size = 8;
+                               break;
+                       }
+
+                       amd64_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, size);
+
+                       if (ins->backend.memory_barrier_kind == MONO_MEMORY_BARRIER_SEQ)
+                               x86_mfence (code);
+                       break;
+               }
+               case OP_ATOMIC_STORE_R4: {
+                       amd64_sse_cvtsd2ss_reg_reg (code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg1);
+                       amd64_sse_movss_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, MONO_ARCH_FP_SCRATCH_REG);
+
+                       if (ins->backend.memory_barrier_kind == MONO_MEMORY_BARRIER_SEQ)
+                               x86_mfence (code);
+                       break;
+               }
+               case OP_ATOMIC_STORE_R8: {
+                       x86_nop (code);
+                       x86_nop (code);
+                       amd64_sse_movsd_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1);
+                       x86_nop (code);
+                       x86_nop (code);
+
+                       if (ins->backend.memory_barrier_kind == MONO_MEMORY_BARRIER_SEQ)
+                               x86_mfence (code);
+                       break;
+               }
                case OP_CARD_TABLE_WBARRIER: {
                        int ptr = ins->sreg1;
                        int value = ins->sreg2;
@@ -6210,21 +6393,33 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_INSERTX_R4_SLOW:
                        switch (ins->inst_c0) {
                        case 0:
-                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               if (cfg->r4fp)
+                                       amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg2);
+                               else
+                                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
                                break;
                        case 1:
                                amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
-                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               if (cfg->r4fp)
+                                       amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg2);
+                               else
+                                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
                                amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
                                break;
                        case 2:
                                amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
-                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               if (cfg->r4fp)
+                                       amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg2);
+                               else
+                                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
                                amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
                                break;
                        case 3:
                                amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
-                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               if (cfg->r4fp)
+                                       amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg2);
+                               else
+                                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
                                amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
                                break;
                        }
@@ -6263,9 +6458,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_XZERO:
                        amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg);
                        break;
-               case OP_ICONV_TO_R8_RAW:
+               case OP_ICONV_TO_R4_RAW:
                        amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
-                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
                        break;
 
                case OP_FCONV_TO_R8_X:
@@ -6304,8 +6498,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
                        break;
                case OP_EXPAND_R4:
-                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
-                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->dreg);
+                       if (cfg->r4fp) {
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       } else {
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->dreg);
+                       }
                        amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
                        break;
                case OP_EXPAND_R8:
@@ -6362,9 +6560,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        g_assert_not_reached ();
 #endif
                }
-              
-               last_ins = ins;
-               last_offset = offset;
        }
 
        cfg->code_len = code - cfg->native_code;
@@ -6789,8 +6984,6 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        /* Keep this in sync with emit_load_volatile_arguments */
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = cinfo->args + i;
-               gint32 stack_offset;
-               MonoType *arg_type;
 
                ins = cfg->args [i];
 
@@ -6798,13 +6991,6 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        /* Unused arguments */
                        continue;
 
-               if (sig->hasthis && (i == 0))
-                       arg_type = &mono_defaults.object_class->byval_arg;
-               else
-                       arg_type = sig->params [i - sig->hasthis];
-
-               stack_offset = ainfo->offset + ARGS_OFFSET;
-
                if (cfg->globalra) {
                        /* All the other moves are done by the register allocator */
                        switch (ainfo->storage) {
@@ -7036,7 +7222,7 @@ void
 mono_arch_emit_epilog (MonoCompile *cfg)
 {
        MonoMethod *method = cfg->method;
-       int quad, pos, i;
+       int quad, i;
        guint8 *code;
        int max_epilog_size;
        CallInfo *cinfo;
@@ -7064,7 +7250,6 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                code = mono_arch_instrument_epilog (cfg, mono_trace_leave_method, code, TRUE);
 
        /* the code restoring the registers must be kept in sync with OP_TAILCALL */
-       pos = 0;
        
        if (method->save_lmf) {
                /* check if we need to restore protection of the stack after a stack overflow */
@@ -7371,7 +7556,6 @@ void*
 mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments)
 {
        guchar *code = p;
-       CallInfo *cinfo = NULL;
        MonoMethodSignature *sig;
        MonoInst *inst;
        int i, n, stack_area = 0;
@@ -7382,8 +7566,6 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
                /* Allocate a new area on the stack and save arguments there */
                sig = mono_method_signature (cfg->method);
 
-               cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig);
-
                n = sig->param_count + sig->hasthis;
 
                stack_area = ALIGN_TO (n * 8, 16);
@@ -7427,7 +7609,7 @@ mono_arch_instrument_epilog_full (MonoCompile *cfg, void *func, void *p, gboolea
        guchar *code = p;
        int save_mode = SAVE_NONE;
        MonoMethod *method = cfg->method;
-       MonoType *ret_type = mini_replace_type (mono_method_signature (method)->ret);
+       MonoType *ret_type = mini_get_underlying_type (cfg, mono_method_signature (method)->ret);
        int i;
        
        switch (ret_type->type) {
@@ -7643,8 +7825,6 @@ mono_arch_get_patch_offset (guint8 *code)
 gboolean
 mono_breakpoint_clean_code (guint8 *method_start, guint8 *code, int offset, guint8 *buf, int size)
 {
-       int i;
-       gboolean can_write = TRUE;
        /*
         * If method_start is non-NULL we need to perform bound checks, since we access memory
         * at code - offset we could go before the start of the method and end up in a different
@@ -7658,21 +7838,7 @@ mono_breakpoint_clean_code (guint8 *method_start, guint8 *code, int offset, guin
                memset (buf, 0, size);
                memcpy (buf + offset - diff, method_start, diff + size - offset);
        }
-       code -= offset;
-       for (i = 0; i < MONO_BREAKPOINT_ARRAY_SIZE; ++i) {
-               int idx = mono_breakpoint_info_index [i];
-               guint8 *ptr;
-               if (idx < 1)
-                       continue;
-               ptr = mono_breakpoint_info [idx].address;
-               if (ptr >= code && ptr < code + size) {
-                       guint8 saved_byte = mono_breakpoint_info [idx].saved_byte;
-                       can_write = FALSE;
-                       /*g_print ("patching %p with 0x%02x (was: 0x%02x)\n", ptr, saved_byte, buf [ptr - code]);*/
-                       buf [ptr - code] = saved_byte;
-               }
-       }
-       return can_write;
+       return TRUE;
 }
 
 #if defined(__native_client_codegen__)
@@ -8149,7 +8315,7 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho
                        opcode = OP_ABS;
                }
                
-               if (opcode) {
+               if (opcode && fsig->param_count == 1) {
                        MONO_INST_NEW (cfg, ins, opcode);
                        ins->type = STACK_R8;
                        ins->dreg = mono_alloc_freg (cfg);
@@ -8180,7 +8346,7 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho
                        }
                }
                
-               if (opcode) {
+               if (opcode && fsig->param_count == 2) {
                        MONO_INST_NEW (cfg, ins, opcode);
                        ins->type = fsig->params [0]->type == MONO_TYPE_I4 ? STACK_I4 : STACK_I8;
                        ins->dreg = mono_alloc_ireg (cfg);
@@ -8191,7 +8357,7 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho
 
 #if 0
                /* OP_FREM is not IEEE compatible */
-               else if (strcmp (cmethod->name, "IEEERemainder") == 0) {
+               else if (strcmp (cmethod->name, "IEEERemainder") == 0 && fsig->param_count == 2) {
                        MONO_INST_NEW (cfg, ins, OP_FREM);
                        ins->inst_i0 = args [0];
                        ins->inst_i1 = args [1];
@@ -8199,11 +8365,6 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho
 #endif
        }
 
-       /* 
-        * Can't implement CompareExchange methods this way since they have
-        * three arguments.
-        */
-
        return ins;
 }
 #endif
@@ -8521,6 +8682,26 @@ mono_arch_opcode_supported (int opcode)
        case OP_ATOMIC_EXCHANGE_I8:
        case OP_ATOMIC_CAS_I4:
        case OP_ATOMIC_CAS_I8:
+       case OP_ATOMIC_LOAD_I1:
+       case OP_ATOMIC_LOAD_I2:
+       case OP_ATOMIC_LOAD_I4:
+       case OP_ATOMIC_LOAD_I8:
+       case OP_ATOMIC_LOAD_U1:
+       case OP_ATOMIC_LOAD_U2:
+       case OP_ATOMIC_LOAD_U4:
+       case OP_ATOMIC_LOAD_U8:
+       case OP_ATOMIC_LOAD_R4:
+       case OP_ATOMIC_LOAD_R8:
+       case OP_ATOMIC_STORE_I1:
+       case OP_ATOMIC_STORE_I2:
+       case OP_ATOMIC_STORE_I4:
+       case OP_ATOMIC_STORE_I8:
+       case OP_ATOMIC_STORE_U1:
+       case OP_ATOMIC_STORE_U2:
+       case OP_ATOMIC_STORE_U4:
+       case OP_ATOMIC_STORE_U8:
+       case OP_ATOMIC_STORE_R4:
+       case OP_ATOMIC_STORE_R8:
                return TRUE;
        default:
                return FALSE;