2007-05-27 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-arm.c
index 26f2f97a70e630dd24224e2c4c56f479f1e944e6..3290b509b61124e1e0acdc316edb63b95fb8fa0f 100644 (file)
 #include "inssel.h"
 #include "cpu-arm.h"
 #include "trace.h"
+#ifdef ARM_FPU_FPA
 #include "mono/arch/arm/arm-fpa-codegen.h"
+#elif defined(ARM_FPU_VFP)
+#include "mono/arch/arm/arm-vfp-codegen.h"
+#endif
+
+static int v5_supported = 0;
+static int thumb_supported = 0;
+
+static int mono_arm_is_rotated_imm8 (guint32 val, gint *rot_amount);
 
 /*
  * TODO:
@@ -86,7 +95,7 @@ static guint8*
 emit_memcpy (guint8 *code, int size, int dreg, int doffset, int sreg, int soffset)
 {
        /* we can use r0-r3, since this is called only for incoming args on the stack */
-       if (0 && size > sizeof (gpointer) * 5) {
+       if (size > sizeof (gpointer) * 4) {
                guint8 *start_loop;
                code = emit_big_add (code, ARMREG_R0, sreg, soffset);
                code = emit_big_add (code, ARMREG_R1, dreg, doffset);
@@ -96,25 +105,50 @@ emit_memcpy (guint8 *code, int size, int dreg, int doffset, int sreg, int soffse
                ARM_ADD_REG_IMM8 (code, ARMREG_R0, ARMREG_R0, 4);
                ARM_ADD_REG_IMM8 (code, ARMREG_R1, ARMREG_R1, 4);
                ARM_SUBS_REG_IMM8 (code, ARMREG_R2, ARMREG_R2, 4);
-               ARM_B_COND (code, ARMCOND_LT, 0);
+               ARM_B_COND (code, ARMCOND_NE, 0);
                arm_patch (code - 4, start_loop);
                return code;
        }
-       g_assert (arm_is_imm12 (doffset));
-       g_assert (arm_is_imm12 (doffset + size));
-       g_assert (arm_is_imm12 (soffset));
-       g_assert (arm_is_imm12 (soffset + size));
-       while (size >= 4) {
-               ARM_LDR_IMM (code, ARMREG_LR, sreg, soffset);
-               ARM_STR_IMM (code, ARMREG_LR, dreg, doffset);
-               doffset += 4;
-               soffset += 4;
-               size -= 4;
+       if (arm_is_imm12 (doffset) && arm_is_imm12 (doffset + size) &&
+                       arm_is_imm12 (soffset) && arm_is_imm12 (soffset + size)) {
+               while (size >= 4) {
+                       ARM_LDR_IMM (code, ARMREG_LR, sreg, soffset);
+                       ARM_STR_IMM (code, ARMREG_LR, dreg, doffset);
+                       doffset += 4;
+                       soffset += 4;
+                       size -= 4;
+               }
+       } else if (size) {
+               code = emit_big_add (code, ARMREG_R0, sreg, soffset);
+               code = emit_big_add (code, ARMREG_R1, dreg, doffset);
+               doffset = soffset = 0;
+               while (size >= 4) {
+                       ARM_LDR_IMM (code, ARMREG_LR, ARMREG_R0, soffset);
+                       ARM_STR_IMM (code, ARMREG_LR, ARMREG_R1, doffset);
+                       doffset += 4;
+                       soffset += 4;
+                       size -= 4;
+               }
        }
        g_assert (size == 0);
        return code;
 }
 
+static guint8*
+emit_call_reg (guint8 *code, int reg)
+{
+       if (v5_supported) {
+               ARM_BLX_REG (code, reg);
+       } else {
+               ARM_MOV_REG_REG (code, ARMREG_LR, ARMREG_PC);
+               if (thumb_supported)
+                       ARM_BX (code, reg);
+               else
+                       ARM_MOV_REG_REG (code, ARMREG_PC, reg);
+       }
+       return code;
+}
+
 /*
  * mono_arch_get_argument_info:
  * @csig:  a method signature
@@ -189,6 +223,31 @@ guint32
 mono_arch_cpu_optimizazions (guint32 *exclude_mask)
 {
        guint32 opts = 0;
+       char buf [512];
+       char *line;
+       FILE *file = fopen ("/proc/cpuinfo", "r");
+       if (file) {
+               while ((line = fgets (buf, 512, file))) {
+                       if (strncmp (line, "Processor", 9) == 0) {
+                               char *ver = strstr (line, "(v");
+                               if (ver && (ver [2] == '5' || ver [2] == '6' || ver [2] == '7')) {
+                                       v5_supported = TRUE;
+                               }
+                               continue;
+                       }
+                       if (strncmp (line, "Features", 8) == 0) {
+                               char *th = strstr (line, "thumb");
+                               if (th) {
+                                       thumb_supported = TRUE;
+                                       if (v5_supported)
+                                               break;
+                               }
+                               continue;
+                       }
+               }
+               fclose (file);
+               /*printf ("features: v5: %d, thumb: %d\n", v5_supported, thumb_supported);*/
+       }
 
        /* no arm-specific optimizations yet */
        *exclude_mask = 0;
@@ -214,6 +273,10 @@ is_regsize_var (MonoType *t) {
        case MONO_TYPE_SZARRAY:
        case MONO_TYPE_ARRAY:
                return TRUE;
+       case MONO_TYPE_GENERICINST:
+               if (!mono_type_generic_inst_is_valuetype (t))
+                       return TRUE;
+               return FALSE;
        case MONO_TYPE_VALUETYPE:
                return FALSE;
        }
@@ -298,6 +361,7 @@ mono_arch_flush_icache (guint8 *code, gint size)
 enum {
        RegTypeGeneral,
        RegTypeBase,
+       RegTypeBaseGen,
        RegTypeFP,
        RegTypeStructByVal,
        RegTypeStructByAddr
@@ -335,16 +399,22 @@ add_general (guint *gr, guint *stack_size, ArgInfo *ainfo, gboolean simple)
                        ainfo->reg = *gr;
                }
        } else {
-               if (*gr > ARMREG_R2) {
-                       *stack_size += 7;
-                       *stack_size &= ~7;
+               if (*gr == ARMREG_R3) {
+                       /* first word in r3 and the second on the stack */
+                       ainfo->offset = *stack_size;
+                       ainfo->reg = ARMREG_SP; /* in the caller */
+                       ainfo->regtype = RegTypeBaseGen;
+                       *stack_size += 4;
+               } else if (*gr > ARMREG_R3) {
+                       /**stack_size += 7;
+                       *stack_size &= ~7;*/
                        ainfo->offset = *stack_size;
                        ainfo->reg = ARMREG_SP; /* in the caller */
                        ainfo->regtype = RegTypeBase;
                        *stack_size += 8;
                } else {
-                       if ((*gr) & 1)
-                               (*gr) ++;
+                       /*if ((*gr) & 1)
+                               (*gr) ++;*/
                        ainfo->reg = *gr;
                }
                (*gr) ++;
@@ -426,6 +496,14 @@ calculate_sizes (MonoMethodSignature *sig, gboolean is_pinvoke)
                        add_general (&gr, &stack_size, cinfo->args + n, TRUE);
                        n++;
                        break;
+               case MONO_TYPE_GENERICINST:
+                       if (!mono_type_generic_inst_is_valuetype (sig->params [i])) {
+                               cinfo->args [n].size = sizeof (gpointer);
+                               add_general (&gr, &stack_size, cinfo->args + n, TRUE);
+                               n++;
+                               break;
+                       }
+                       /* Fall through */
                case MONO_TYPE_TYPEDBYREF:
                case MONO_TYPE_VALUETYPE: {
                        gint size;
@@ -511,6 +589,12 @@ calculate_sizes (MonoMethodSignature *sig, gboolean is_pinvoke)
                        /* FIXME: cinfo->ret.reg = ???;
                        cinfo->ret.regtype = RegTypeFP;*/
                        break;
+               case MONO_TYPE_GENERICINST:
+                       if (!mono_type_generic_inst_is_valuetype (sig->ret)) {
+                               cinfo->ret.reg = ARMREG_R0;
+                               break;
+                       }
+                       break;
                case MONO_TYPE_VALUETYPE:
                        break;
                case MONO_TYPE_TYPEDBYREF:
@@ -549,17 +633,6 @@ mono_arch_allocate_vars (MonoCompile *m)
        /* allow room for the vararg method args: void* and long/double */
        if (mono_jit_trace_calls != NULL && mono_trace_eval (m->method))
                m->param_area = MAX (m->param_area, sizeof (gpointer)*8);
-       /* this is bug #60332: remove when #59509 is fixed, so no weird vararg 
-        * call convs needs to be handled this way.
-        */
-       if (m->flags & MONO_CFG_HAS_VARARGS)
-               m->param_area = MAX (m->param_area, sizeof (gpointer)*8);
-       /* gtk-sharp and other broken code will dllimport vararg functions even with
-        * non-varargs signatures. Since there is little hope people will get this right
-        * we assume they won't.
-        */
-       if (m->method->wrapper_type == MONO_WRAPPER_MANAGED_TO_NATIVE)
-               m->param_area = MAX (m->param_area, sizeof (gpointer)*8);
 
        header = mono_method_get_header (m->method);
 
@@ -642,13 +715,18 @@ mono_arch_allocate_vars (MonoCompile *m)
                if ((inst->flags & MONO_INST_IS_DEAD) || inst->opcode == OP_REGVAR)
                        continue;
 
-               /* inst->unused indicates native sized value types, this is used by the
+               /* inst->backend.is_pinvoke indicates native sized value types, this is used by the
                * pinvoke wrappers when they call functions returning structure */
-               if (inst->unused && MONO_TYPE_ISSTRUCT (inst->inst_vtype) && inst->inst_vtype->type != MONO_TYPE_TYPEDBYREF)
+               if (inst->backend.is_pinvoke && MONO_TYPE_ISSTRUCT (inst->inst_vtype) && inst->inst_vtype->type != MONO_TYPE_TYPEDBYREF)
                        size = mono_class_native_size (mono_class_from_mono_type (inst->inst_vtype), &align);
                else
                        size = mono_type_size (inst->inst_vtype, &align);
 
+               /* FIXME: if a structure is misaligned, our memcpy doesn't work,
+                * since it loads/stores misaligned words, which don't do the right thing.
+                */
+               if (align < 4 && size >= 4)
+                       align = 4;
                offset += align - 1;
                offset &= ~(align - 1);
                inst->inst_offset = offset;
@@ -660,7 +738,7 @@ mono_arch_allocate_vars (MonoCompile *m)
 
        curinst = 0;
        if (sig->hasthis) {
-               inst = m->varinfo [curinst];
+               inst = m->args [curinst];
                if (inst->opcode != OP_REGVAR) {
                        inst->opcode = OP_REGOFFSET;
                        inst->inst_basereg = frame_reg;
@@ -675,11 +753,16 @@ mono_arch_allocate_vars (MonoCompile *m)
        }
 
        for (i = 0; i < sig->param_count; ++i) {
-               inst = m->varinfo [curinst];
+               inst = m->args [curinst];
                if (inst->opcode != OP_REGVAR) {
                        inst->opcode = OP_REGOFFSET;
                        inst->inst_basereg = frame_reg;
                        size = mono_type_size (sig->params [i], &align);
+                       /* FIXME: if a structure is misaligned, our memcpy doesn't work,
+                        * since it loads/stores misaligned words, which don't do the right thing.
+                        */
+                       if (align < 4 && size >= 4)
+                               align = 4;
                        offset += align - 1;
                        offset &= ~(align - 1);
                        inst->inst_offset = offset;
@@ -757,13 +840,15 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                        arg->next = call->out_args;
                        call->out_args = arg;
                        if (ainfo->regtype == RegTypeGeneral) {
-                               arg->unused = ainfo->reg;
+                               arg->backend.reg3 = ainfo->reg;
                                call->used_iregs |= 1 << ainfo->reg;
                                if (arg->type == STACK_I8)
                                        call->used_iregs |= 1 << (ainfo->reg + 1);
                                if (arg->type == STACK_R8) {
                                        if (ainfo->size == 4) {
+#ifndef MONO_ARCH_SOFT_FLOAT
                                                arg->opcode = OP_OUTARG_R4;
+#endif
                                        } else {
                                                call->used_iregs |= 1 << (ainfo->reg + 1);
                                        }
@@ -771,7 +856,7 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                }
                        } else if (ainfo->regtype == RegTypeStructByAddr) {
                                /* FIXME: where si the data allocated? */
-                               arg->unused = ainfo->reg;
+                               arg->backend.reg3 = ainfo->reg;
                                call->used_iregs |= 1 << ainfo->reg;
                                g_assert_not_reached ();
                        } else if (ainfo->regtype == RegTypeStructByVal) {
@@ -783,13 +868,19 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                arg->opcode = OP_OUTARG_VT;
                                /* vtsize and offset have just 12 bits of encoding in number of words */
                                g_assert (((ainfo->vtsize | (ainfo->offset / 4)) & 0xfffff000) == 0);
-                               arg->unused = ainfo->reg | (ainfo->size << 4) | (ainfo->vtsize << 8) | ((ainfo->offset / 4) << 20);
+                               arg->backend.arg_info = ainfo->reg | (ainfo->size << 4) | (ainfo->vtsize << 8) | ((ainfo->offset / 4) << 20);
                        } else if (ainfo->regtype == RegTypeBase) {
                                arg->opcode = OP_OUTARG_MEMBASE;
-                               arg->unused = (ainfo->offset << 8) | ainfo->size;
+                               arg->backend.arg_info = (ainfo->offset << 8) | ainfo->size;
+                       } else if (ainfo->regtype == RegTypeBaseGen) {
+                               call->used_iregs |= 1 << ARMREG_R3;
+                               arg->opcode = OP_OUTARG_MEMBASE;
+                               arg->backend.arg_info = (ainfo->offset << 8) | 0xff;
+                               if (arg->type == STACK_R8)
+                                       cfg->flags |= MONO_CFG_HAS_FPOUT;
                        } else if (ainfo->regtype == RegTypeFP) {
-                               arg->unused = ainfo->reg;
-                               /* FPA args are passed in int regs */
+                               arg->backend.reg3 = ainfo->reg;
+                               /* FP args are passed in int regs */
                                call->used_iregs |= 1 << ainfo->reg;
                                if (ainfo->size == 8) {
                                        arg->opcode = OP_OUTARG_R8;
@@ -840,8 +931,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
        code = mono_arm_emit_load_imm (code, ARMREG_R0, (guint32)cfg->method);
        ARM_MOV_REG_IMM8 (code, ARMREG_R1, 0); /* NULL ebp for now */
        code = mono_arm_emit_load_imm (code, ARMREG_R2, (guint32)func);
-       ARM_MOV_REG_REG (code, ARMREG_LR, ARMREG_PC);
-       ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_R2);
+       code = emit_call_reg (code, ARMREG_R2);
        return code;
 }
 
@@ -872,7 +962,6 @@ mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean ena
                cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
                code = cfg->native_code + offset;
        }
-handle_enum:
        switch (rtype) {
        case MONO_TYPE_VOID:
                /* special case string .ctor icall */
@@ -931,8 +1020,7 @@ handle_enum:
 
        code = mono_arm_emit_load_imm (code, ARMREG_R0, (guint32)cfg->method);
        code = mono_arm_emit_load_imm (code, ARMREG_IP, (guint32)func);
-       ARM_MOV_REG_REG (code, ARMREG_LR, ARMREG_PC);
-       ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
+       code = emit_call_reg (code, ARMREG_IP);
 
        switch (save_mode) {
        case SAVE_TWO:
@@ -1085,15 +1173,8 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (last_ins && (last_ins->opcode == OP_STOREI1_MEMBASE_REG) &&
                                        ins->inst_basereg == last_ins->inst_destbasereg &&
                                        ins->inst_offset == last_ins->inst_offset) {
-                               if (ins->dreg == last_ins->sreg1) {
-                                       last_ins->next = ins->next;                             
-                                       ins = ins->next;                                
-                                       continue;
-                               } else {
-                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
-                                       ins->opcode = OP_MOVE;
-                                       ins->sreg1 = last_ins->sreg1;
-                               }
+                               ins->opcode = (ins->opcode == OP_LOADI1_MEMBASE) ? CEE_CONV_I1 : CEE_CONV_U1;
+                               ins->sreg1 = last_ins->sreg1;                           
                        }
                        break;
                case OP_LOADU2_MEMBASE:
@@ -1101,15 +1182,8 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (last_ins && (last_ins->opcode == OP_STOREI2_MEMBASE_REG) &&
                                        ins->inst_basereg == last_ins->inst_destbasereg &&
                                        ins->inst_offset == last_ins->inst_offset) {
-                               if (ins->dreg == last_ins->sreg1) {
-                                       last_ins->next = ins->next;                             
-                                       ins = ins->next;                                
-                                       continue;
-                               } else {
-                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
-                                       ins->opcode = OP_MOVE;
-                                       ins->sreg1 = last_ins->sreg1;
-                               }
+                               ins->opcode = (ins->opcode == OP_LOADI2_MEMBASE) ? CEE_CONV_I2 : CEE_CONV_U2;
+                               ins->sreg1 = last_ins->sreg1;                           
                        }
                        break;
                case CEE_CONV_I4:
@@ -1244,6 +1318,14 @@ map_to_reg_reg_op (int op)
                return OP_STORER4_MEMINDEX;
        case OP_STORER8_MEMBASE_REG:
                return OP_STORER8_MEMINDEX;
+       case OP_STORE_MEMBASE_IMM:
+               return OP_STORE_MEMBASE_REG;
+       case OP_STOREI1_MEMBASE_IMM:
+               return OP_STOREI1_MEMBASE_REG;
+       case OP_STOREI2_MEMBASE_IMM:
+               return OP_STOREI2_MEMBASE_REG;
+       case OP_STOREI4_MEMBASE_IMM:
+               return OP_STOREI4_MEMBASE_REG;
        }
        g_assert_not_reached ();
 }
@@ -1256,15 +1338,16 @@ map_to_reg_reg_op (int op)
 static void
 mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 {
-       MonoInst *ins, *next, *temp, *last_ins = NULL;
-       int rot_amount, imm8;
+       MonoInst *ins, *temp, *last_ins = NULL;
+       int rot_amount, imm8, low_imm;
 
        /* setup the virtual reg allocator */
-       if (bb->max_ireg > cfg->rs->next_vireg)
-               cfg->rs->next_vireg = bb->max_ireg;
+       if (bb->max_vreg > cfg->rs->next_vreg)
+               cfg->rs->next_vreg = bb->max_vreg;
 
        ins = bb->code;
        while (ins) {
+loop_start:
                switch (ins->opcode) {
                case OP_ADD_IMM:
                case OP_SUB_IMM:
@@ -1338,20 +1421,74 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_LOADR8_MEMBASE:
                        if (arm_is_fpimm8 (ins->inst_offset))
                                break;
+                       low_imm = ins->inst_offset & 0x1ff;
+                       if ((imm8 = mono_arm_is_rotated_imm8 (ins->inst_offset & ~0x1ff, &rot_amount)) >= 0) {
+                               NEW_INS (cfg, temp, OP_ADD_IMM);
+                               temp->inst_imm = ins->inst_offset & ~0x1ff;
+                               temp->sreg1 = ins->inst_basereg;
+                               temp->dreg = mono_regstate_next_int (cfg->rs);
+                               ins->inst_basereg = temp->dreg;
+                               ins->inst_offset = low_imm;
+                               break;
+                       }
+                       /* VFP/FPA doesn't have indexed load instructions */
                        g_assert_not_reached ();
-                       /* FPA doesn't have indexed load instructions */
+                       break;
+               case OP_STORE_MEMBASE_REG:
+               case OP_STOREI4_MEMBASE_REG:
+               case OP_STOREI1_MEMBASE_REG:
+                       if (arm_is_imm12 (ins->inst_offset))
+                               break;
                        NEW_INS (cfg, temp, OP_ICONST);
                        temp->inst_c0 = ins->inst_offset;
                        temp->dreg = mono_regstate_next_int (cfg->rs);
                        ins->sreg2 = temp->dreg;
                        ins->opcode = map_to_reg_reg_op (ins->opcode);
                        break;
+               case OP_STOREI2_MEMBASE_REG:
+                       if (arm_is_imm8 (ins->inst_offset))
+                               break;
+                       NEW_INS (cfg, temp, OP_ICONST);
+                       temp->inst_c0 = ins->inst_offset;
+                       temp->dreg = mono_regstate_next_int (cfg->rs);
+                       ins->sreg2 = temp->dreg;
+                       ins->opcode = map_to_reg_reg_op (ins->opcode);
+                       break;
+               case OP_STORER4_MEMBASE_REG:
+               case OP_STORER8_MEMBASE_REG:
+                       if (arm_is_fpimm8 (ins->inst_offset))
+                               break;
+                       low_imm = ins->inst_offset & 0x1ff;
+                       if ((imm8 = mono_arm_is_rotated_imm8 (ins->inst_offset & ~ 0x1ff, &rot_amount)) >= 0 && arm_is_fpimm8 (low_imm)) {
+                               NEW_INS (cfg, temp, OP_ADD_IMM);
+                               temp->inst_imm = ins->inst_offset & ~0x1ff;
+                               temp->sreg1 = ins->inst_destbasereg;
+                               temp->dreg = mono_regstate_next_int (cfg->rs);
+                               ins->inst_destbasereg = temp->dreg;
+                               ins->inst_offset = low_imm;
+                               break;
+                       }
+                       /*g_print ("fail with: %d (%d, %d)\n", ins->inst_offset, ins->inst_offset & ~0x1ff, low_imm);*/
+                       /* VFP/FPA doesn't have indexed store instructions */
+                       g_assert_not_reached ();
+                       break;
+               case OP_STORE_MEMBASE_IMM:
+               case OP_STOREI1_MEMBASE_IMM:
+               case OP_STOREI2_MEMBASE_IMM:
+               case OP_STOREI4_MEMBASE_IMM:
+                       NEW_INS (cfg, temp, OP_ICONST);
+                       temp->inst_c0 = ins->inst_imm;
+                       temp->dreg = mono_regstate_next_int (cfg->rs);
+                       ins->sreg1 = temp->dreg;
+                       ins->opcode = map_to_reg_reg_op (ins->opcode);
+                       last_ins = temp;
+                       goto loop_start; /* make it handle the possibly big ins->inst_offset */
                }
                last_ins = ins;
                ins = ins->next;
        }
        bb->last_ins = last_ins;
-       bb->max_ireg = cfg->rs->next_vireg;
+       bb->max_vreg = cfg->rs->next_vreg;
 
 }
 
@@ -1368,7 +1505,15 @@ static guchar*
 emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int sreg, int size, gboolean is_signed)
 {
        /* sreg is a float, dreg is an integer reg  */
+#ifdef ARM_FPU_FPA
        ARM_FIXZ (code, dreg, sreg);
+#elif defined(ARM_FPU_VFP)
+       if (is_signed)
+               ARM_TOSIZD (code, ARM_VFP_F0, sreg);
+       else
+               ARM_TOUIZD (code, ARM_VFP_F0, sreg);
+       ARM_FMRS (code, dreg, ARM_VFP_F0);
+#endif
        if (!is_signed) {
                if (size == 1)
                        ARM_AND_REG_IMM8 (code, dreg, dreg, 0xff);
@@ -1403,7 +1548,7 @@ search_thunk_slot (void *data, int csize, int bsize, void *user_data) {
        guchar *code = data;
        guint32 *thunks = data;
        guint32 *endthunks = (guint32*)(code + bsize);
-       int i, count = 0;
+       int count = 0;
        int difflow, diffhigh;
 
        /* always ensure a call from pdata->code can reach to the thunks without further thunks */
@@ -1432,7 +1577,10 @@ search_thunk_slot (void *data, int csize, int bsize, void *user_data) {
                                /* found a free slot instead: emit thunk */
                                code = (guchar*)thunks;
                                ARM_LDR_IMM (code, ARMREG_IP, ARMREG_PC, 0);
-                               ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
+                               if (thumb_supported)
+                                       ARM_BX (code, ARMREG_IP);
+                               else
+                                       ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
                                thunks [2] = (guint32)pdata->target;
                                mono_arch_flush_icache ((guchar*)thunks, 12);
 
@@ -1478,19 +1626,37 @@ handle_thunk (int absolute, guchar *code, const guchar *target) {
 void
 arm_patch (guchar *code, const guchar *target)
 {
-       guint32 ins = *(guint32*)code;
+       guint32 *code32 = (void*)code;
+       guint32 ins = *code32;
        guint32 prim = (ins >> 25) & 7;
-       guint32 ovf;
+       guint32 tval = GPOINTER_TO_UINT (target);
 
        //g_print ("patching 0x%08x (0x%08x) to point to 0x%08x\n", code, ins, target);
        if (prim == 5) { /* 101b */
                /* the diff starts 8 bytes from the branch opcode */
                gint diff = target - code - 8;
+               gint tbits;
+               gint tmask = 0xffffffff;
+               if (tval & 1) { /* entering thumb mode */
+                       diff = target - 1 - code - 8;
+                       g_assert (thumb_supported);
+                       tbits = 0xf << 28; /* bl->blx bit pattern */
+                       g_assert ((ins & (1 << 24))); /* it must be a bl, not b instruction */
+                       /* bit 1 of the displacement is moved to bit 24 in the instruction encoding */
+                       if (diff & 2) {
+                               tbits |= 1 << 24;
+                       }
+                       tmask = ~(1 << 24); /* clear the link bit */
+                       /*g_print ("blx to thumb: target: %p, code: %p, diff: %d, mask: %x\n", target, code, diff, tmask);*/
+               } else {
+                       tbits = 0;
+               }
                if (diff >= 0) {
                        if (diff <= 33554431) {
                                diff >>= 2;
                                ins = (ins & 0xff000000) | diff;
-                               *(guint32*)code = ins;
+                               ins &= tmask;
+                               *code32 = ins | tbits;
                                return;
                        }
                } else {
@@ -1498,7 +1664,8 @@ arm_patch (guchar *code, const guchar *target)
                        if (diff >= -33554432) {
                                diff >>= 2;
                                ins = (ins & 0xff000000) | (diff & ~0xff000000);
-                               *(guint32*)code = ins;
+                               ins &= tmask;
+                               *code32 = ins | tbits;
                                return;
                        }
                }
@@ -1507,11 +1674,49 @@ arm_patch (guchar *code, const guchar *target)
                return;
        }
 
-
+       /*
+        * The alternative call sequence looks like this:
+        *
+        *      ldr ip, [pc] // loads the address constant
+        *      b 1f         // jumps around the constant
+        *      address constant embedded in the code
+        *   1f:
+        *      mov lr, pc
+        *      mov pc, ip
+        *
+        * There are two cases for patching:
+        * a) at the end of method emission: in this case code points to the start
+        *    of the call sequence
+        * b) during runtime patching of the call site: in this case code points
+        *    to the mov pc, ip instruction
+        *
+        * We also have to handle the thunk jump code sequence:
+        *
+        *      ldr ip, [pc]
+        *      mov pc, ip
+        *      address constant // execution never reaches here
+        */
        if ((ins & 0x0ffffff0) == 0x12fff10) {
                /* branch and exchange: the address is constructed in a reg */
                g_assert_not_reached ();
        } else {
+               guint32 ccode [4];
+               guint32 *tmp = ccode;
+               guint8 *emit = (guint8*)tmp;
+               ARM_LDR_IMM (emit, ARMREG_IP, ARMREG_PC, 0);
+               ARM_MOV_REG_REG (emit, ARMREG_LR, ARMREG_PC);
+               ARM_MOV_REG_REG (emit, ARMREG_PC, ARMREG_IP);
+               ARM_BX (emit, ARMREG_IP);
+               if (ins == ccode [2]) {
+                       g_assert_not_reached (); // should be -2 ...
+                       code32 [-1] = (guint32)target;
+                       return;
+               }
+               if (ins == ccode [0]) {
+                       /* handles both thunk jump code and the far call sequence */
+                       code32 [2] = (guint32)target;
+                       return;
+               }
                g_assert_not_reached ();
        }
 //     g_print ("patched with 0x%08x\n", ins);
@@ -1523,7 +1728,7 @@ arm_patch (guchar *code, const guchar *target)
  * to be used with the emit macros.
  * Return -1 otherwise.
  */
-int
+static int
 mono_arm_is_rotated_imm8 (guint32 val, gint *rot_amount)
 {
        guint32 res, i;
@@ -1545,6 +1750,14 @@ guint8*
 mono_arm_emit_load_imm (guint8 *code, int dreg, guint32 val)
 {
        int imm8, rot_amount;
+#if 0
+       ARM_LDR_IMM (code, dreg, ARMREG_PC, 0);
+       /* skip the constant pool */
+       ARM_B (code, 0);
+       *(int*)code = val;
+       code += 4;
+       return code;
+#endif
        if ((imm8 = mono_arm_is_rotated_imm8 (val, &rot_amount)) >= 0) {
                ARM_MOV_REG_IMM (code, dreg, imm8, rot_amount);
        } else if ((imm8 = mono_arm_is_rotated_imm8 (~val, &rot_amount)) >= 0) {
@@ -1617,7 +1830,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        while (ins) {
                offset = code - cfg->native_code;
 
-               max_len = ((guint8 *)arm_cpu_desc [ins->opcode])[MONO_INST_LEN];
+               max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
 
                if (offset > (cfg->code_size - max_len - 16)) {
                        cfg->code_size *= 2;
@@ -1629,6 +1842,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                mono_debug_record_line_number (cfg, ins, offset);
 
                switch (ins->opcode) {
+               case OP_MEMORY_BARRIER:
+                       break;
                case OP_TLS_GET:
                        g_assert_not_reached ();
                        break;
@@ -1666,8 +1881,24 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_STORE_MEMBASE_REG:
                case OP_STOREI4_MEMBASE_REG:
-                       g_assert (arm_is_imm12 (ins->inst_offset));
-                       ARM_STR_IMM (code, ins->sreg1, ins->inst_destbasereg, ins->inst_offset);
+                       /* this case is special, since it happens for spill code after lowering has been called */
+                       if (arm_is_imm12 (ins->inst_offset)) {
+                               ARM_STR_IMM (code, ins->sreg1, ins->inst_destbasereg, ins->inst_offset);
+                       } else {
+                               code = mono_arm_emit_load_imm (code, ARMREG_LR, ins->inst_offset);
+                               ARM_STR_REG_REG (code, ins->sreg1, ins->inst_destbasereg, ARMREG_LR);
+                       }
+                       break;
+               case OP_STOREI1_MEMINDEX:
+                       ARM_STRB_REG_REG (code, ins->sreg1, ins->inst_destbasereg, ins->sreg2);
+                       break;
+               case OP_STOREI2_MEMINDEX:
+                       /* note: the args are reversed in the macro */
+                       ARM_STRH_REG_REG (code, ins->inst_destbasereg, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_STORE_MEMINDEX:
+               case OP_STOREI4_MEMINDEX:
+                       ARM_STR_REG_REG (code, ins->sreg1, ins->inst_destbasereg, ins->sreg2);
                        break;
                case CEE_LDIND_I:
                case CEE_LDIND_I4:
@@ -1683,22 +1914,30 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        ARM_LDR_REG_REG (code, ins->dreg, ins->inst_basereg, ins->sreg2);
                        break;
                case OP_LOADI1_MEMINDEX:
-                       ARM_LDRSB_REG_REG (code, ins->dreg, ins->inst_basereg, ins->sreg2);
+                       /* note: the args are reversed in the macro */
+                       ARM_LDRSB_REG_REG (code, ins->inst_basereg, ins->dreg, ins->sreg2);
                        break;
                case OP_LOADU1_MEMINDEX:
                        ARM_LDRB_REG_REG (code, ins->dreg, ins->inst_basereg, ins->sreg2);
                        break;
                case OP_LOADI2_MEMINDEX:
-                       ARM_LDRSH_REG_REG (code, ins->dreg, ins->inst_basereg, ins->sreg2);
+                       /* note: the args are reversed in the macro */
+                       ARM_LDRSH_REG_REG (code, ins->inst_basereg, ins->dreg, ins->sreg2);
                        break;
                case OP_LOADU2_MEMINDEX:
-                       ARM_LDRH_REG_REG (code, ins->dreg, ins->inst_basereg, ins->sreg2);
+                       /* note: the args are reversed in the macro */
+                       ARM_LDRH_REG_REG (code, ins->inst_basereg, ins->dreg, ins->sreg2);
                        break;
                case OP_LOAD_MEMBASE:
                case OP_LOADI4_MEMBASE:
                case OP_LOADU4_MEMBASE:
-                       g_assert (arm_is_imm12 (ins->inst_offset));
-                       ARM_LDR_IMM (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       /* this case is special, since it happens for spill code after lowering has been called */
+                       if (arm_is_imm12 (ins->inst_offset)) {
+                               ARM_LDR_IMM (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       } else {
+                               code = mono_arm_emit_load_imm (code, ARMREG_LR, ins->inst_offset);
+                               ARM_LDR_REG_REG (code, ins->dreg, ins->inst_basereg, ARMREG_LR);
+                       }
                        break;
                case OP_LOADI1_MEMBASE:
                        g_assert (arm_is_imm8 (ins->inst_offset));
@@ -1739,10 +1978,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        g_assert (imm8 >= 0);
                        ARM_CMP_REG_IMM (code, ins->sreg1, imm8, rot_amount);
                        break;
-               case OP_X86_TEST_NULL:
-                       g_assert_not_reached ();
-                       break;
-               case CEE_BREAK:
+               case OP_BREAK:
                        *(int*)code = 0xe7f001f0;
                        *(int*)code = 0xef9f0001;
                        code += 4;
@@ -1874,16 +2110,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        ARM_SHL_REG (code, ins->dreg, ins->sreg1, ins->sreg2);
                        break;
                case OP_SHL_IMM:
-                       ARM_SHL_IMM (code, ins->dreg, ins->sreg1, (ins->inst_imm & 0x1f));
+                       if (ins->inst_imm)
+                               ARM_SHL_IMM (code, ins->dreg, ins->sreg1, (ins->inst_imm & 0x1f));
                        break;
                case CEE_SHR:
                        ARM_SAR_REG (code, ins->dreg, ins->sreg1, ins->sreg2);
                        break;
                case OP_SHR_IMM:
-                       ARM_SAR_IMM (code, ins->dreg, ins->sreg1, (ins->inst_imm & 0x1f));
+                       if (ins->inst_imm)
+                               ARM_SAR_IMM (code, ins->dreg, ins->sreg1, (ins->inst_imm & 0x1f));
                        break;
                case OP_SHR_UN_IMM:
-                       ARM_SHR_IMM (code, ins->dreg, ins->sreg1, (ins->inst_imm & 0x1f));
+                       if (ins->inst_imm)
+                               ARM_SHR_IMM (code, ins->dreg, ins->sreg1, (ins->inst_imm & 0x1f));
                        break;
                case CEE_SHR_UN:
                        ARM_SHR_REG (code, ins->dreg, ins->sreg1, ins->sreg2);
@@ -1928,30 +2167,38 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_SETLRET: {
                        int saved = ins->sreg2;
-                       if (ins->sreg2 == ARMREG_R0) {
+                       if (ins->sreg2 == ARM_LSW_REG) {
                                ARM_MOV_REG_REG (code, ARMREG_LR, ins->sreg2);
                                saved = ARMREG_LR;
                        }
-                       if (ins->sreg1 != ARMREG_R0)
-                               ARM_MOV_REG_REG (code, ARMREG_R0, ins->sreg1);
-                       if (saved != ARMREG_R1)
-                               ARM_MOV_REG_REG (code, ARMREG_R1, saved);
+                       if (ins->sreg1 != ARM_LSW_REG)
+                               ARM_MOV_REG_REG (code, ARM_LSW_REG, ins->sreg1);
+                       if (saved != ARM_MSW_REG)
+                               ARM_MOV_REG_REG (code, ARM_MSW_REG, saved);
                        break;
                }
                case OP_SETFREG:
                case OP_FMOVE:
+#ifdef ARM_FPU_FPA
                        ARM_MVFD (code, ins->dreg, ins->sreg1);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CPYD (code, ins->dreg, ins->sreg1);
+#endif
                        break;
                case OP_FCONV_TO_R4:
+#ifdef ARM_FPU_FPA
                        ARM_MVFS (code, ins->dreg, ins->sreg1);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CVTD (code, ins->dreg, ins->sreg1);
+                       ARM_CVTS (code, ins->dreg, ins->dreg);
+#endif
                        break;
-               case CEE_JMP:
+               case OP_JMP:
                        /*
                         * Keep in sync with mono_arch_emit_epilog
                         */
                        g_assert (!cfg->method->save_lmf);
                        code = emit_big_add (code, ARMREG_SP, cfg->frame_reg, cfg->stack_usage);
-                       ARM_ADD_REG_IMM8 (code, ARMREG_SP, cfg->frame_reg, cfg->stack_usage);
                        ARM_POP_NWB (code, cfg->used_int_regs | ((1 << ARMREG_SP)) | ((1 << ARMREG_LR)));
                        mono_add_patch_info (cfg, (guint8*) code - cfg->native_code, MONO_PATCH_INFO_METHOD_JUMP, ins->inst_p0);
                        ARM_B (code, 0);
@@ -1987,8 +2234,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                ARM_B (code, 0);
                                *(gpointer*)code = NULL;
                                code += 4;
-                               ARM_MOV_REG_REG (code, ARMREG_LR, ARMREG_PC);
-                               ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
+                               code = emit_call_reg (code, ARMREG_IP);
                        } else {
                                ARM_BL (code, 0);
                        }
@@ -1998,8 +2244,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_VCALL_REG:
                case OP_VOIDCALL_REG:
                case OP_CALL_REG:
-                       ARM_MOV_REG_REG (code, ARMREG_LR, ARMREG_PC);
-                       ARM_MOV_REG_REG (code, ARMREG_PC, ins->sreg1);
+                       code = emit_call_reg (code, ins->sreg1);
                        break;
                case OP_FCALL_MEMBASE:
                case OP_LCALL_MEMBASE:
@@ -2045,7 +2290,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        g_assert_not_reached ();
                        ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_LR);
                        break;
-               case CEE_THROW: {
+               case OP_THROW: {
                        if (ins->sreg1 != ARMREG_R0)
                                ARM_MOV_REG_REG (code, ARMREG_R0, ins->sreg1);
                        mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_INTERNAL_METHOD, 
@@ -2055,8 +2300,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                ARM_B (code, 0);
                                *(gpointer*)code = NULL;
                                code += 4;
-                               ARM_MOV_REG_REG (code, ARMREG_LR, ARMREG_PC);
-                               ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
+                               code = emit_call_reg (code, ARMREG_IP);
                        } else {
                                ARM_BL (code, 0);
                        }
@@ -2072,27 +2316,40 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                ARM_B (code, 0);
                                *(gpointer*)code = NULL;
                                code += 4;
-                               ARM_MOV_REG_REG (code, ARMREG_LR, ARMREG_PC);
-                               ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
+                               code = emit_call_reg (code, ARMREG_IP);
                        } else {
                                ARM_BL (code, 0);
                        }
                        break;
                }
                case OP_START_HANDLER:
-                       g_assert (arm_is_imm12 (ins->inst_left->inst_offset));
-                       ARM_STR_IMM (code, ARMREG_LR, ins->inst_left->inst_basereg, ins->inst_left->inst_offset);
+                       if (arm_is_imm12 (ins->inst_left->inst_offset)) {
+                               ARM_STR_IMM (code, ARMREG_LR, ins->inst_left->inst_basereg, ins->inst_left->inst_offset);
+                       } else {
+                               code = mono_arm_emit_load_imm (code, ARMREG_IP, ins->inst_left->inst_offset);
+                               ARM_STR_REG_REG (code, ARMREG_LR, ins->inst_left->inst_basereg, ARMREG_IP);
+                       }
                        break;
                case OP_ENDFILTER:
                        if (ins->sreg1 != ARMREG_R0)
                                ARM_MOV_REG_REG (code, ARMREG_R0, ins->sreg1);
-                       g_assert (arm_is_imm12 (ins->inst_left->inst_offset));
-                       ARM_LDR_IMM (code, ARMREG_IP, ins->inst_left->inst_basereg, ins->inst_left->inst_offset);
+                       if (arm_is_imm12 (ins->inst_left->inst_offset)) {
+                               ARM_LDR_IMM (code, ARMREG_IP, ins->inst_left->inst_basereg, ins->inst_left->inst_offset);
+                       } else {
+                               g_assert (ARMREG_IP != ins->inst_left->inst_basereg);
+                               code = mono_arm_emit_load_imm (code, ARMREG_IP, ins->inst_left->inst_offset);
+                               ARM_LDR_REG_REG (code, ARMREG_IP, ins->inst_left->inst_basereg, ARMREG_IP);
+                       }
                        ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
                        break;
-               case CEE_ENDFINALLY:
-                       g_assert (arm_is_imm12 (ins->inst_left->inst_offset));
-                       ARM_LDR_IMM (code, ARMREG_IP, ins->inst_left->inst_basereg, ins->inst_left->inst_offset);
+               case OP_ENDFINALLY:
+                       if (arm_is_imm12 (ins->inst_left->inst_offset)) {
+                               ARM_LDR_IMM (code, ARMREG_IP, ins->inst_left->inst_basereg, ins->inst_left->inst_offset);
+                       } else {
+                               g_assert (ARMREG_IP != ins->inst_left->inst_basereg);
+                               code = mono_arm_emit_load_imm (code, ARMREG_IP, ins->inst_left->inst_offset);
+                               ARM_LDR_REG_REG (code, ARMREG_IP, ins->inst_left->inst_basereg, ARMREG_IP);
+                       }
                        ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
                        break;
                case OP_CALL_HANDLER: 
@@ -2102,7 +2359,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_LABEL:
                        ins->inst_c0 = code - cfg->native_code;
                        break;
-               case CEE_BR:
+               case OP_BR:
                        if (ins->flags & MONO_INST_BRLABEL) {
                                /*if (ins->inst_i0->inst_c0) {
                                        ARM_B (code, 0);
@@ -2147,7 +2404,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        code += 4 * GPOINTER_TO_INT (ins->klass);
                        break;
                case OP_CEQ:
-                       ARM_MOV_REG_IMM8 (code, ins->dreg, 0);
+                       ARM_MOV_REG_IMM8_COND (code, ins->dreg, 0, ARMCOND_NE);
                        ARM_MOV_REG_IMM8_COND (code, ins->dreg, 1, ARMCOND_EQ);
                        break;
                case OP_CLT:
@@ -2198,6 +2455,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
 
                /* floating point opcodes */
+#ifdef ARM_FPU_FPA
                case OP_R8CONST:
                        /* FIXME: we can optimize the imm load by dealing with part of 
                         * the displacement in LDFD (aligning to 512).
@@ -2226,15 +2484,29 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        ARM_LDFS (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
                        break;
                case CEE_CONV_R_UN: {
-                       /*static const guint64 adjust_val = 0x4330000000000000ULL;
-                       ppc_addis (code, ppc_r0, ppc_r0, 0x4330);
-                       ppc_stw (code, ppc_r0, -8, ppc_sp);
-                       ppc_stw (code, ins->sreg1, -4, ppc_sp);
-                       ppc_load (code, ppc_r11, &adjust_val);
-                       ppc_lfd (code, ins->dreg, -8, ppc_sp);
-                       ppc_lfd (code, ppc_f0, 0, ppc_r11);
-                       ppc_fsub (code, ins->dreg, ins->dreg, ppc_f0);*/
-                       g_assert_not_reached ();
+                       int tmpreg;
+                       tmpreg = ins->dreg == 0? 1: 0;
+                       ARM_CMP_REG_IMM8 (code, ins->sreg1, 0);
+                       ARM_FLTD (code, ins->dreg, ins->sreg1);
+                       ARM_B_COND (code, ARMCOND_GE, 8);
+                       /* save the temp register */
+                       ARM_SUB_REG_IMM8 (code, ARMREG_SP, ARMREG_SP, 8);
+                       ARM_STFD (code, tmpreg, ARMREG_SP, 0);
+                       ARM_LDFD (code, tmpreg, ARMREG_PC, 12);
+                       ARM_FPA_ADFD (code, ins->dreg, ins->dreg, tmpreg);
+                       ARM_LDFD (code, tmpreg, ARMREG_SP, 0);
+                       ARM_ADD_REG_IMM8 (code, ARMREG_SP, ARMREG_SP, 8);
+                       /* skip the constant pool */
+                       ARM_B (code, 8);
+                       code += 4;
+                       *(int*)code = 0x41f00000;
+                       code += 4;
+                       *(int*)code = 0;
+                       code += 4;
+                       /* FIXME: adjust:
+                        * ldfltd  ftemp, [pc, #8] 0x41f00000 0x00000000
+                        * adfltd  fdest, fdest, ftemp
+                        */
                        break;
                }
                case CEE_CONV_R4:
@@ -2243,14 +2515,48 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CONV_R8:
                        ARM_FLTD (code, ins->dreg, ins->sreg1);
                        break;
-               case OP_X86_FP_LOAD_I8:
+#elif defined(ARM_FPU_VFP)
+               case OP_R8CONST:
+                       /* FIXME: we can optimize the imm load by dealing with part of 
+                        * the displacement in LDFD (aligning to 512).
+                        */
+                       code = mono_arm_emit_load_imm (code, ARMREG_LR, (guint32)ins->inst_p0);
+                       ARM_FLDD (code, ins->dreg, ARMREG_LR, 0);
+                       break;
+               case OP_R4CONST:
+                       code = mono_arm_emit_load_imm (code, ARMREG_LR, (guint32)ins->inst_p0);
+                       ARM_FLDS (code, ins->dreg, ARMREG_LR, 0);
+                       ARM_CVTS (code, ins->dreg, ins->dreg);
+                       break;
+               case OP_STORER8_MEMBASE_REG:
+                       g_assert (arm_is_fpimm8 (ins->inst_offset));
+                       ARM_FSTD (code, ins->sreg1, ins->inst_destbasereg, ins->inst_offset);
+                       break;
+               case OP_LOADR8_MEMBASE:
+                       g_assert (arm_is_fpimm8 (ins->inst_offset));
+                       ARM_FLDD (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       break;
+               case OP_STORER4_MEMBASE_REG:
+                       g_assert (arm_is_fpimm8 (ins->inst_offset));
+                       ARM_FSTS (code, ins->sreg1, ins->inst_destbasereg, ins->inst_offset);
+                       break;
+               case OP_LOADR4_MEMBASE:
+                       g_assert (arm_is_fpimm8 (ins->inst_offset));
+                       ARM_FLDS (code, ins->dreg, ins->inst_basereg, ins->inst_offset);
+                       break;
+               case CEE_CONV_R_UN: {
                        g_assert_not_reached ();
-                       /*x86_fild_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);*/
                        break;
-               case OP_X86_FP_LOAD_I4:
+               }
+               case CEE_CONV_R4:
+                       g_assert_not_reached ();
+                       //ARM_FLTS (code, ins->dreg, ins->sreg1);
+                       break;
+               case CEE_CONV_R8:
                        g_assert_not_reached ();
-                       /*x86_fild_membase (code, ins->inst_basereg, ins->inst_offset, FALSE);*/
+                       //ARM_FLTD (code, ins->dreg, ins->sreg1);
                        break;
+#endif
                case OP_FCONV_TO_I1:
                        code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 1, TRUE);
                        break;
@@ -2309,6 +2615,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                ARM_MOV_REG_REG (code, ins->dreg, ins->sreg1);
                        break;
                }
+#ifdef ARM_FPU_FPA
                case OP_FADD:
                        ARM_FPA_ADFD (code, ins->dreg, ins->sreg1, ins->sreg2);
                        break;
@@ -2323,7 +2630,24 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;          
                case OP_FNEG:
                        ARM_MNFD (code, ins->dreg, ins->sreg1);
+                       break;
+#elif defined(ARM_FPU_VFP)
+               case OP_FADD:
+                       ARM_VFP_ADDD (code, ins->dreg, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_FSUB:
+                       ARM_VFP_SUBD (code, ins->dreg, ins->sreg1, ins->sreg2);
+                       break;          
+               case OP_FMUL:
+                       ARM_VFP_MULD (code, ins->dreg, ins->sreg1, ins->sreg2);
                        break;          
+               case OP_FDIV:
+                       ARM_VFP_DIVD (code, ins->dreg, ins->sreg1, ins->sreg2);
+                       break;          
+               case OP_FNEG:
+                       ARM_NEGD (code, ins->dreg, ins->sreg1);
+                       break;
+#endif
                case OP_FREM:
                        /* emulated */
                        g_assert_not_reached ();
@@ -2331,33 +2655,53 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_FCOMPARE:
                        /* each fp compare op needs to do its own */
                        g_assert_not_reached ();
-                       ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+                       //ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
                        break;
                case OP_FCEQ:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
-                       ARM_MOV_REG_IMM8 (code, ins->dreg, 0);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
+                       ARM_MOV_REG_IMM8_COND (code, ins->dreg, 0, ARMCOND_NE);
                        ARM_MOV_REG_IMM8_COND (code, ins->dreg, 1, ARMCOND_EQ);
                        break;
                case OP_FCLT:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
                        ARM_MOV_REG_IMM8 (code, ins->dreg, 0);
                        ARM_MOV_REG_IMM8_COND (code, ins->dreg, 1, ARMCOND_MI);
                        break;
                case OP_FCLT_UN:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
                        ARM_MOV_REG_IMM8 (code, ins->dreg, 0);
                        ARM_MOV_REG_IMM8_COND (code, ins->dreg, 1, ARMCOND_MI);
                        ARM_MOV_REG_IMM8_COND (code, ins->dreg, 1, ARMCOND_VS);
                        break;
                case OP_FCGT:
                        /* swapped */
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg2, ins->sreg1);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg2, ins->sreg1);
+#endif
                        ARM_MOV_REG_IMM8 (code, ins->dreg, 0);
                        ARM_MOV_REG_IMM8_COND (code, ins->dreg, 1, ARMCOND_MI);
                        break;
                case OP_FCGT_UN:
                        /* swapped */
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg2, ins->sreg1);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg2, ins->sreg1);
+#endif
                        ARM_MOV_REG_IMM8 (code, ins->dreg, 0);
                        ARM_MOV_REG_IMM8_COND (code, ins->dreg, 1, ARMCOND_MI);
                        ARM_MOV_REG_IMM8_COND (code, ins->dreg, 1, ARMCOND_VS);
@@ -2369,50 +2713,90 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                 * V        Unordered               ARMCOND_VS
                 */
                case OP_FBEQ:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
                        EMIT_COND_BRANCH (ins, CEE_BEQ - CEE_BEQ);
                        break;
                case OP_FBNE_UN:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
                        EMIT_COND_BRANCH (ins, CEE_BNE_UN - CEE_BEQ);
                        break;
                case OP_FBLT:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_MI); /* N set */
                        break;
                case OP_FBLT_UN:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_VS); /* V set */
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_MI); /* N set */
                        break;
                case OP_FBGT:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg2, ins->sreg1);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg2, ins->sreg1);
+#endif
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_MI); /* N set, swapped args */
                        break;
                case OP_FBGT_UN:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg2, ins->sreg1);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg2, ins->sreg1);
+#endif
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_VS); /* V set */
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_MI); /* N set, swapped args */
                        break;
                case OP_FBGE:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_CS);
                        break;
                case OP_FBGE_UN:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg1, ins->sreg2);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg1, ins->sreg2);
+#endif
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_VS); /* V set */
-                       EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_CS);
+                       EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_GE);
                        break;
                case OP_FBLE:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg2, ins->sreg1);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg2, ins->sreg1);
+#endif
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_CS); /* swapped */
                        break;
                case OP_FBLE_UN:
+#ifdef ARM_FPU_FPA
                        ARM_FCMP (code, ARM_FPA_CMF, ins->sreg2, ins->sreg1);
+#elif defined(ARM_FPU_VFP)
+                       ARM_CMPD (code, ins->sreg2, ins->sreg1);
+#endif
                        EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_VS); /* V set */
-                       EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_CS); /* swapped */
+                       EMIT_COND_BRANCH_FLAGS (ins, ARMCOND_GE); /* swapped */
                        break;
-               case CEE_CKFINITE: {
+               case OP_CKFINITE: {
                        /*ppc_stfd (code, ins->sreg1, -8, ppc_sp);
                        ppc_lwz (code, ppc_r11, -8, ppc_sp);
                        ppc_rlwinm (code, ppc_r11, ppc_r11, 0, 1, 31);
@@ -2465,7 +2849,6 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono
                const unsigned char *target;
 
                if (patch_info->type == MONO_PATCH_INFO_SWITCH) {
-                       gpointer *table = (gpointer *)patch_info->data.table->table;
                        gpointer *jt = (gpointer*)(ip + 8);
                        int i;
                        /* jt is the inlined jump table, 2 instructions after ip
@@ -2564,7 +2947,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        alloc_size = cfg->stack_offset;
        pos = 0;
 
-       if (1 || !method->save_lmf) {
+       if (!method->save_lmf) {
                ARM_PUSH (code, (cfg->used_int_regs | (1 << ARMREG_IP) | (1 << ARMREG_LR)));
                prev_sp_offset = 8; /* ip and lr */
                for (i = 0; i < 16; ++i) {
@@ -2584,8 +2967,10 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                alloc_size &= ~(MONO_ARCH_FRAME_ALIGNMENT - 1);
        }
 
+       /* the stack used in the pushed regs */
+       if (prev_sp_offset & 4)
+               alloc_size += 4;
        cfg->stack_usage = alloc_size;
-       g_assert ((alloc_size & (MONO_ARCH_FRAME_ALIGNMENT-1)) == 0);
        if (alloc_size) {
                if ((i = mono_arm_is_rotated_imm8 (alloc_size, &rot_amount)) >= 0) {
                        ARM_SUB_REG_IMM (code, ARMREG_SP, ARMREG_SP, i, rot_amount);
@@ -2600,8 +2985,8 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        prev_sp_offset += alloc_size;
 
         /* compute max_offset in order to use short forward jumps
-        * we always do it on ppc because the immediate displacement
-        * for jumps is too small 
+        * we could skip doing this on arm because the immediate displacement
+        * for jumps is large enough, but it may be useful later for constant pools
         */
        max_offset = 0;
        for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
@@ -2612,7 +2997,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        max_offset += 6; 
 
                while (ins) {
-                       max_offset += ((guint8 *)arm_cpu_desc [ins->opcode])[MONO_INST_LEN];
+                       max_offset += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
                        ins = ins->next;
                }
        }
@@ -2630,7 +3015,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        }
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                ArgInfo *ainfo = cinfo->args + i;
-               inst = cfg->varinfo [pos];
+               inst = cfg->args [pos];
                
                if (cfg->verbose_level > 2)
                        g_print ("Saving argument %d (type: %d)\n", i, ainfo->regtype);
@@ -2652,12 +3037,21 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        if (ainfo->regtype == RegTypeGeneral) {
                                switch (ainfo->size) {
                                case 1:
-                                       g_assert (arm_is_imm12 (inst->inst_offset));
-                                       ARM_STRB_IMM (code, ainfo->reg, inst->inst_basereg, inst->inst_offset);
+                                       if (arm_is_imm12 (inst->inst_offset))
+                                               ARM_STRB_IMM (code, ainfo->reg, inst->inst_basereg, inst->inst_offset);
+                                       else {
+                                               code = mono_arm_emit_load_imm (code, ARMREG_IP, inst->inst_offset);
+                                               ARM_STRB_REG_REG (code, ainfo->reg, inst->inst_basereg, ARMREG_IP);
+                                       }
                                        break;
                                case 2:
-                                       g_assert (arm_is_imm8 (inst->inst_offset));
-                                       ARM_STRH_IMM (code, ainfo->reg, inst->inst_basereg, inst->inst_offset);
+                                       if (arm_is_imm8 (inst->inst_offset)) {
+                                               ARM_STRH_IMM (code, ainfo->reg, inst->inst_basereg, inst->inst_offset);
+                                       } else {
+                                               code = mono_arm_emit_load_imm (code, ARMREG_IP, inst->inst_offset);
+                                               ARM_ADD_REG_REG (code, ARMREG_IP, ARMREG_IP, inst->inst_basereg);
+                                               ARM_STRH_IMM (code, ainfo->reg, ARMREG_IP, 0);
+                                       }
                                        break;
                                case 8:
                                        g_assert (arm_is_imm12 (inst->inst_offset));
@@ -2666,10 +3060,20 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                        ARM_STR_IMM (code, ainfo->reg + 1, inst->inst_basereg, inst->inst_offset + 4);
                                        break;
                                default:
-                                       g_assert (arm_is_imm12 (inst->inst_offset));
-                                       ARM_STR_IMM (code, ainfo->reg, inst->inst_basereg, inst->inst_offset);
+                                       if (arm_is_imm12 (inst->inst_offset)) {
+                                               ARM_STR_IMM (code, ainfo->reg, inst->inst_basereg, inst->inst_offset);
+                                       } else {
+                                               code = mono_arm_emit_load_imm (code, ARMREG_IP, inst->inst_offset);
+                                               ARM_STR_REG_REG (code, ainfo->reg, inst->inst_basereg, ARMREG_IP);
+                                       }
                                        break;
                                }
+                       } else if (ainfo->regtype == RegTypeBaseGen) {
+                               g_assert (arm_is_imm12 (prev_sp_offset + ainfo->offset));
+                               g_assert (arm_is_imm12 (inst->inst_offset));
+                               ARM_LDR_IMM (code, ARMREG_LR, ARMREG_SP, (prev_sp_offset + ainfo->offset));
+                               ARM_STR_IMM (code, ARMREG_LR, inst->inst_basereg, inst->inst_offset + 4);
+                               ARM_STR_IMM (code, ARMREG_R3, inst->inst_basereg, inst->inst_offset);
                        } else if (ainfo->regtype == RegTypeBase) {
                                g_assert (arm_is_imm12 (prev_sp_offset + ainfo->offset));
                                switch (ainfo->size) {
@@ -2680,8 +3084,13 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                                        break;
                                case 2:
                                        ARM_LDR_IMM (code, ARMREG_LR, ARMREG_SP, (prev_sp_offset + ainfo->offset));
-                                       g_assert (arm_is_imm8 (inst->inst_offset));
-                                       ARM_STRH_IMM (code, ARMREG_LR, inst->inst_basereg, inst->inst_offset);
+                                       if (arm_is_imm8 (inst->inst_offset)) {
+                                               ARM_STRH_IMM (code, ARMREG_LR, inst->inst_basereg, inst->inst_offset);
+                                       } else {
+                                               code = mono_arm_emit_load_imm (code, ARMREG_IP, inst->inst_offset);
+                                               ARM_ADD_REG_REG (code, ARMREG_IP, ARMREG_IP, inst->inst_basereg);
+                                               ARM_STRH_IMM (code, ARMREG_LR, ARMREG_IP, 0);
+                                       }
                                        break;
                                case 8:
                                        g_assert (arm_is_imm12 (inst->inst_offset));
@@ -2737,12 +3146,10 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        ARM_B (code, 0);
                        *(gpointer*)code = NULL;
                        code += 4;
-                       ARM_MOV_REG_REG (code, ARMREG_LR, ARMREG_PC);
-                       ARM_MOV_REG_REG (code, ARMREG_PC, ARMREG_IP);
+                       code = emit_call_reg (code, ARMREG_IP);
                } else {
                        ARM_BL (code, 0);
                }
-#if ARM_PORT
                /* we build the MonoLMF structure on the stack - see mini-arm.h */
                /* lmf_offset is the offset from the previous stack pointer,
                 * alloc_size is the total stack space allocated, so the offset
@@ -2760,13 +3167,12 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                /* *(lmf_addr) = r1 */
                ARM_STR_IMM (code, ARMREG_R1, ARMREG_R0, G_STRUCT_OFFSET (MonoLMF, previous_lmf));
                /* save method info */
-               code = mono_arm_emit_load_imm (code, ARMREG_R2, method);
+               code = mono_arm_emit_load_imm (code, ARMREG_R2, GPOINTER_TO_INT (method));
                ARM_STR_IMM (code, ARMREG_R2, ARMREG_R1, G_STRUCT_OFFSET (MonoLMF, method));
                ARM_STR_IMM (code, ARMREG_SP, ARMREG_R1, G_STRUCT_OFFSET (MonoLMF, ebp));
                /* save the current IP */
                ARM_MOV_REG_REG (code, ARMREG_R2, ARMREG_PC);
                ARM_STR_IMM (code, ARMREG_R2, ARMREG_R1, G_STRUCT_OFFSET (MonoLMF, eip));
-#endif
        }
 
        if (tracing)
@@ -2782,7 +3188,6 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 void
 mono_arch_emit_epilog (MonoCompile *cfg)
 {
-       MonoJumpInfo *patch_info;
        MonoMethod *method = cfg->method;
        int pos, i, rot_amount;
        int max_epilog_size = 16 + 20*4;
@@ -2804,7 +3209,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        }
 
        /*
-        * Keep in sync with CEE_JMP
+        * Keep in sync with OP_JMP
         */
        code = cfg->native_code + cfg->code_len;
 
@@ -2814,9 +3219,9 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        pos = 0;
 
        if (method->save_lmf) {
-#if ARM_PORT
                int lmf_offset;
-               pos +=  sizeof (MonoLMF);
+               /* all but r0-r3, sp and pc */
+               pos += sizeof (MonoLMF) - (4 * 10);
                lmf_offset = pos;
                /* r2 contains the pointer to the current LMF */
                code = emit_big_add (code, ARMREG_R2, cfg->frame_reg, cfg->stack_usage - lmf_offset);
@@ -2825,26 +3230,22 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                /* lr = lmf_addr */
                ARM_LDR_IMM (code, ARMREG_LR, ARMREG_R2, G_STRUCT_OFFSET (MonoLMF, lmf_addr));
                /* *(lmf_addr) = previous_lmf */
-               ARM_STR_IMM (code, ARMREG_LR, ARMREG_IP, G_STRUCT_OFFSET (MonoLMF, previous_lmf));
+               ARM_STR_IMM (code, ARMREG_IP, ARMREG_LR, G_STRUCT_OFFSET (MonoLMF, previous_lmf));
                /* FIXME: speedup: there is no actual need to restore the registers if
                 * we didn't actually change them (idea from Zoltan).
                 */
                /* restore iregs */
-#endif
-               if ((i = mono_arm_is_rotated_imm8 (cfg->stack_usage, &rot_amount)) >= 0) {
-                       ARM_ADD_REG_IMM (code, ARMREG_SP, cfg->frame_reg, i, rot_amount);
-               } else {
-                       code = mono_arm_emit_load_imm (code, ARMREG_IP, cfg->stack_usage);
-                       ARM_ADD_REG_REG (code, ARMREG_SP, cfg->frame_reg, ARMREG_IP);
-               }
-               ARM_POP_NWB (code, cfg->used_int_regs | ((1 << ARMREG_SP) | (1 << ARMREG_PC)));
+               /* point sp at the registers to restore: 10 is 14 -4, because we skip r0-r3 */
+               ARM_ADD_REG_IMM8 (code, ARMREG_SP, ARMREG_R2, (sizeof (MonoLMF) - 10 * sizeof (gulong)));
+               ARM_POP_NWB (code, 0xaff0); /* restore ip to sp and lr to pc */
        } else {
                if ((i = mono_arm_is_rotated_imm8 (cfg->stack_usage, &rot_amount)) >= 0) {
                        ARM_ADD_REG_IMM (code, ARMREG_SP, cfg->frame_reg, i, rot_amount);
                } else {
-                       code = mono_arm_emit_load_imm (code, cfg->frame_reg, cfg->stack_usage);
+                       code = mono_arm_emit_load_imm (code, ARMREG_IP, cfg->stack_usage);
                        ARM_ADD_REG_REG (code, ARMREG_SP, ARMREG_SP, ARMREG_IP);
                }
+               /* FIXME: add v4 thumb interworking support */
                ARM_POP_NWB (code, cfg->used_int_regs | ((1 << ARMREG_SP) | (1 << ARMREG_PC)));
        }
 
@@ -2873,18 +3274,17 @@ exception_id_by_name (const char *name)
        if (strcmp (name, "ArrayTypeMismatchException") == 0)
                return MONO_EXC_ARRAY_TYPE_MISMATCH;
        g_error ("Unknown intrinsic exception %s\n", name);
+       return -1;
 }
 
 void
 mono_arch_emit_exceptions (MonoCompile *cfg)
 {
        MonoJumpInfo *patch_info;
-       int nthrows, i;
+       int i;
        guint8 *code;
        const guint8* exc_throw_pos [MONO_EXC_INTRINS_NUM] = {NULL};
        guint8 exc_throw_found [MONO_EXC_INTRINS_NUM] = {0};
-       guint32 code_size;
-       int exc_count = 0;
        int max_epilog_size = 50;
 
        /* count the number of exception infos */
@@ -2916,6 +3316,7 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                switch (patch_info->type) {
                case MONO_PATCH_INFO_EXC: {
                        unsigned char *ip = patch_info->ip.i + cfg->native_code;
+                       const char *ex_name = patch_info->data.target;
                        i = exception_id_by_name (patch_info->data.target);
                        if (exc_throw_pos [i]) {
                                arm_patch (ip, exc_throw_pos [i]);
@@ -2925,6 +3326,8 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                                exc_throw_pos [i] = code;
                        }
                        arm_patch (ip, code);
+                       //*(int*)code = 0xef9f0001;
+                       code += 4;
                        /*mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_EXC_NAME, patch_info->data.target);*/
                        ARM_LDR_IMM (code, ARMREG_R0, ARMREG_PC, 0);
                        /* we got here from a conditional call, so the calling ip is set in lr already */
@@ -2932,7 +3335,7 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                        patch_info->data.name = "mono_arch_throw_exception_by_name";
                        patch_info->ip.i = code - cfg->native_code;
                        ARM_B (code, 0);
-                       *(gpointer*)code = patch_info->data.target;
+                       *(gconstpointer*)code = ex_name;
                        code += 4;
                        break;
                }
@@ -2975,7 +3378,7 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
                this->sreg1 = this_reg;
                this->dreg = mono_regstate_next_int (cfg->rs);
                mono_bblock_add_inst (cfg->cbb, this);
-               mono_call_inst_add_outarg_reg (inst, this->dreg, this_dreg, FALSE);
+               mono_call_inst_add_outarg_reg (cfg, inst, this->dreg, this_dreg, FALSE);
        }
 
        if (vt_reg != -1) {
@@ -2985,14 +3388,19 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
                vtarg->sreg1 = vt_reg;
                vtarg->dreg = mono_regstate_next_int (cfg->rs);
                mono_bblock_add_inst (cfg->cbb, vtarg);
-               mono_call_inst_add_outarg_reg (inst, vtarg->dreg, ARMREG_R0, FALSE);
+               mono_call_inst_add_outarg_reg (cfg, inst, vtarg->dreg, ARMREG_R0, FALSE);
        }
 }
 
 MonoInst*
 mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
 {
-       return NULL;
+       MonoInst *ins = NULL;
+       if (cmethod->klass == mono_defaults.thread_class &&
+                       strcmp (cmethod->name, "MemoryBarrier") == 0) {
+               MONO_INST_NEW (cfg, ins, OP_MEMORY_BARRIER);
+       }
+       return ins;
 }
 
 gboolean
@@ -3017,3 +3425,11 @@ mono_arch_flush_register_windows (void)
 {
 }
 
+void
+mono_arch_fixup_jinfo (MonoCompile *cfg)
+{
+       /* stack_usage / 4 is packed into the upper 16 bits of used_regs, so the max encoded stack usage is 64KB * 4 */
+       g_assert ((cfg->stack_usage & ~(0xffff << 2)) == 0);
+       cfg->jit_info->used_regs |= cfg->stack_usage << 14;
+}
+