New test.
[mono.git] / mono / mini / mini-ia64.c
index 59c805edc7320b478947c90ef2b95966ed3bfeee..4d67083dc11b09f640149cec68072701143b0716 100644 (file)
 #include <unistd.h>
 #include <sys/mman.h>
 
+#ifdef __INTEL_COMPILER
+#include <ia64intrin.h>
+#endif
+
 #include <mono/metadata/appdomain.h>
 #include <mono/metadata/debug-helpers.h>
 #include <mono/metadata/threads.h>
@@ -23,7 +27,6 @@
 #include "inssel.h"
 #include "cpu-ia64.h"
 
-static gint lmf_tls_offset = -1;
 static gint appdomain_tls_offset = -1;
 static gint thread_tls_offset = -1;
 
@@ -44,13 +47,19 @@ static const char*const * ins_spec = ia64_desc;
  * - compare instructions allways set p6 and p7
  */
 
-#define SIGNAL_STACK_SIZE (64 * 1024)
+/*
+ * There are a lot of places where generated code is disassembled/patched.
+ * The automatic bundling of instructions done by the code generation macros
+ * could complicate things, so it is best to call 
+ * ia64_codegen_set_one_ins_per_bundle () at those places.
+ */
 
 #define ARGS_OFFSET 16
 
 #define GP_SCRATCH_REG 31
 #define GP_SCRATCH_REG2 30
 #define FP_SCRATCH_REG 32
+#define FP_SCRATCH_REG2 33
 
 #define LOOP_ALIGNMENT 8
 #define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
@@ -107,6 +116,51 @@ mono_arch_fregname (int reg)
                return "unknown";
 }
 
+G_GNUC_UNUSED static void
+break_count (void)
+{ /*
+   * Deliberately empty. debug_count () calls this when its invocation
+   * counter equals the value of the COUNT environment variable; presumably
+   * it exists as a convenient spot for a debugger breakpoint -- TODO confirm.
+   */
+}
+
+/*
+ * debug_count:
+ *
+ *   Debugging helper which counts its own invocations and compares the
+ * running count with the integer stored in the COUNT environment variable.
+ * break_count () is called when the count equals COUNT.
+ *
+ * Returns: FALSE once the count has exceeded COUNT, TRUE otherwise. When
+ * COUNT is not set there is no limit and TRUE is always returned.
+ */
+G_GNUC_UNUSED static gboolean
+debug_count (void)
+{
+       static int count = 0;
+       const char *limit_str;
+       int limit;
+
+       count ++;
+
+       /* getenv () returns NULL for an unset variable, and atoi (NULL) crashes */
+       limit_str = getenv ("COUNT");
+       if (!limit_str)
+               return TRUE;
+
+       /* Parse once instead of re-reading the environment for each comparison */
+       limit = atoi (limit_str);
+
+       if (count == limit) {
+               break_count ();
+       }
+
+       if (count > limit) {
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
+/*
+ * debug_ins_sched:
+ *
+ *   Debugging switch, judging by its name for instruction scheduling (the
+ * callers are not visible in this hunk). As compiled it always returns TRUE;
+ * change the condition below to 0 to let debug_count () decide instead.
+ */
+static gboolean
+debug_ins_sched (void)
+{
+#if 1
+       return TRUE;
+#else
+       return debug_count ();
+#endif
+}
+
+/*
+ * debug_omit_fp:
+ *
+ *   Debugging switch consulted when deciding whether the frame pointer may
+ * be omitted: a FALSE result forces cfg->arch.omit_fp to FALSE. As compiled
+ * it always returns TRUE; change the condition below to 0 to let
+ * debug_count () decide instead.
+ */
+static gboolean
+debug_omit_fp (void)
+{
+#if 1
+       return TRUE;
+#else
+       return debug_count ();
+#endif
+}
+
+static void 
+ia64_patch (unsigned char* code, gpointer target);
+
 typedef enum {
        ArgInIReg,
        ArgInFloatReg,
@@ -345,6 +399,13 @@ get_call_info (MonoMethodSignature *sig, gboolean is_pinvoke)
                        cinfo->ret.storage = ArgInFloatReg;
                        cinfo->ret.reg = 8;
                        break;
+               case MONO_TYPE_GENERICINST:
+                       if (!mono_type_generic_inst_is_valuetype (sig->ret)) {
+                               cinfo->ret.storage = ArgInIReg;
+                               cinfo->ret.reg = IA64_R8;
+                               break;
+                       }
+                       /* Fall through */
                case MONO_TYPE_VALUETYPE:
                case MONO_TYPE_TYPEDBYREF: {
                        guint32 tmp_gr = 0, tmp_fr = 0, tmp_stacksize = 0;
@@ -425,6 +486,12 @@ get_call_info (MonoMethodSignature *sig, gboolean is_pinvoke)
                case MONO_TYPE_ARRAY:
                        add_general (&gr, &stack_size, ainfo);
                        break;
+               case MONO_TYPE_GENERICINST:
+                       if (!mono_type_generic_inst_is_valuetype (sig->params [i])) {
+                               add_general (&gr, &stack_size, ainfo);
+                               break;
+                       }
+                       /* Fall through */
                case MONO_TYPE_VALUETYPE:
                case MONO_TYPE_TYPEDBYREF:
                        /* FIXME: */
@@ -534,6 +601,7 @@ is_regsize_var (MonoType *t) {
        case MONO_TYPE_U:
        case MONO_TYPE_PTR:
        case MONO_TYPE_FNPTR:
+       case MONO_TYPE_BOOLEAN:
                return TRUE;
        case MONO_TYPE_OBJECT:
        case MONO_TYPE_STRING:
@@ -541,6 +609,10 @@ is_regsize_var (MonoType *t) {
        case MONO_TYPE_SZARRAY:
        case MONO_TYPE_ARRAY:
                return TRUE;
+       case MONO_TYPE_GENERICINST:
+               if (!mono_type_generic_inst_is_valuetype (t))
+                       return TRUE;
+               return FALSE;
        case MONO_TYPE_VALUETYPE:
                return FALSE;
        }
@@ -552,6 +624,30 @@ mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
 {
        GList *vars = NULL;
        int i;
+       MonoMethodSignature *sig;
+       MonoMethodHeader *header;
+       CallInfo *cinfo;
+
+       header = mono_method_get_header (cfg->method);
+
+       sig = mono_method_signature (cfg->method);
+
+       cinfo = get_call_info (sig, FALSE);
+
+       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
+               MonoInst *ins = cfg->varinfo [i];
+
+               ArgInfo *ainfo = &cinfo->args [i];
+
+               if (ins->flags & (MONO_INST_IS_DEAD|MONO_INST_VOLATILE|MONO_INST_INDIRECT))
+                       continue;
+
+               if (ainfo->storage == ArgInIReg) {
+                       /* The input registers are non-volatile */
+                       ins->opcode = OP_REGVAR;
+                       ins->dreg = 32 + ainfo->reg;
+               }
+       }
 
        for (i = 0; i < cfg->num_varinfo; i++) {
                MonoInst *ins = cfg->varinfo [i];
@@ -581,7 +677,8 @@ static void
 mono_ia64_alloc_stacked_registers (MonoCompile *cfg)
 {
        CallInfo *cinfo;
-       guint32 reserved_regs = 3;
+       guint32 reserved_regs;
+       MonoMethodHeader *header;
 
        if (cfg->arch.reg_local0 > 0)
                /* Already done */
@@ -589,8 +686,10 @@ mono_ia64_alloc_stacked_registers (MonoCompile *cfg)
 
        cinfo = get_call_info (mono_method_signature (cfg->method), FALSE);
 
-       /* Three registers are reserved for use by the prolog/epilog */
-       reserved_regs = 3;
+       header = mono_method_get_header (cfg->method);
+       
+       /* Some registers are reserved for use by the prolog/epilog */
+       reserved_regs = header->num_clauses ? 4 : 3;
 
        if ((mono_jit_trace_calls != NULL && mono_trace_eval (cfg->method)) ||
                (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)) {
@@ -603,15 +702,24 @@ mono_ia64_alloc_stacked_registers (MonoCompile *cfg)
 
        cfg->arch.reg_in0 = 32;
        cfg->arch.reg_local0 = cfg->arch.reg_in0 + cinfo->reg_usage + reserved_regs;
-       cfg->arch.reg_out0 = cfg->arch.reg_local0 + 8;
+       cfg->arch.reg_out0 = cfg->arch.reg_local0 + 16;
 
        cfg->arch.reg_saved_ar_pfs = cfg->arch.reg_local0 - 1;
        cfg->arch.reg_saved_b0 = cfg->arch.reg_local0 - 2;
-       cfg->arch.reg_saved_sp = cfg->arch.reg_local0 - 3;
+       cfg->arch.reg_fp = cfg->arch.reg_local0 - 3;
+
+       /* 
+        * Frames without handlers save sp to fp, frames with handlers save it into
+        * a dedicated register.
+        */
+       if (header->num_clauses)
+               cfg->arch.reg_saved_sp = cfg->arch.reg_local0 - 4;
+       else
+               cfg->arch.reg_saved_sp = cfg->arch.reg_fp;
 
        if ((mono_jit_trace_calls != NULL && mono_trace_eval (cfg->method)) ||
                (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)) {
-               cfg->arch.reg_saved_return_val = cfg->arch.reg_local0 - 4;
+               cfg->arch.reg_saved_return_val = cfg->arch.reg_local0 - reserved_regs;
        }
 
        /* 
@@ -666,30 +774,58 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        gint32 *offsets;
        CallInfo *cinfo;
 
-       mono_ia64_alloc_stacked_registers (cfg);
-
        header = mono_method_get_header (cfg->method);
 
        sig = mono_method_signature (cfg->method);
 
        cinfo = get_call_info (sig, FALSE);
 
+       /*
+        * Determine whether the frame pointer can be eliminated.
+        * FIXME: Remove some of the restrictions.
+        */
+       cfg->arch.omit_fp = TRUE;
+
+       if (!debug_omit_fp ())
+               cfg->arch.omit_fp = FALSE;
+
+       if (cfg->flags & MONO_CFG_HAS_ALLOCA)
+               cfg->arch.omit_fp = FALSE;
+       if (header->num_clauses)
+               cfg->arch.omit_fp = FALSE;
+       if (cfg->param_area)
+               cfg->arch.omit_fp = FALSE;
+       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
+               ArgInfo *ainfo = &cinfo->args [i];
+
+               if (ainfo->storage == ArgOnStack) {
+                       /* 
+                        * The stack offset can only be determined when the frame
+                        * size is known.
+                        */
+                       cfg->arch.omit_fp = FALSE;
+               }
+       }
+
+       mono_ia64_alloc_stacked_registers (cfg);
+
        /*
         * We use the ABI calling conventions for managed code as well.
         * Exception: valuetypes are never passed or returned in registers.
         */
 
-       /* Locals are allocated backwards from %fp */
-       cfg->frame_reg = cfg->arch.reg_saved_sp;
-       offset = 0;
+       if (cfg->arch.omit_fp) {
+               cfg->frame_reg = IA64_SP;
+               offset = ARGS_OFFSET;
+       }
+       else {
+               /* Locals are allocated backwards from %fp */
+               cfg->frame_reg = cfg->arch.reg_fp;
+               offset = 0;
+       }
 
        if (cfg->method->save_lmf) {
-               /* FIXME: */
-#if 0
-               /* Reserve stack space for saving LMF + argument regs */
-               offset += sizeof (MonoLMF);
-               cfg->arch.lmf_offset = offset;
-#endif
+               /* No LMF on IA64 */
        }
 
        if (sig->ret->type != MONO_TYPE_VOID) {
@@ -708,6 +844,8 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                        break;
                case ArgAggregate:
                        /* Allocate a local to hold the result, the epilog will copy it to the correct place */
+                       if (cfg->arch.omit_fp)
+                               g_assert_not_reached ();
                        offset = ALIGN_TO (offset, 8);
                        offset += cinfo->ret.nslots * 8;
                        cfg->ret->opcode = OP_REGOFFSET;
@@ -721,7 +859,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        }
 
        /* Allocate locals */
-       offsets = mono_allocate_stack_slots (cfg, &locals_stack_size, &locals_stack_align);
+       offsets = mono_allocate_stack_slots_full (cfg, cfg->arch.omit_fp ? FALSE : TRUE, &locals_stack_size, &locals_stack_align);
        if (locals_stack_align) {
                offset = ALIGN_TO (offset, locals_stack_align);
        }
@@ -730,7 +868,10 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                        MonoInst *inst = cfg->varinfo [i];
                        inst->opcode = OP_REGOFFSET;
                        inst->inst_basereg = cfg->frame_reg;
-                       inst->inst_offset = - (offset + offsets [i]);
+                       if (cfg->arch.omit_fp)
+                               inst->inst_offset = (offset + offsets [i]);
+                       else
+                               inst->inst_offset = - (offset + offsets [i]);
                        // printf ("allocated local %d to ", i); mono_print_tree_nl (inst);
                }
        }
@@ -738,6 +879,8 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        offset += locals_stack_size;
 
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG)) {
+               if (cfg->arch.omit_fp)
+                       g_assert_not_reached ();
                g_assert (cinfo->sig_cookie.storage == ArgOnStack);
                cfg->sig_cookie = cinfo->sig_cookie.offset + ARGS_OFFSET;
        }
@@ -773,6 +916,8 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                                inreg = FALSE;
                                break;
                        case ArgOnStack:
+                               if (cfg->arch.omit_fp)
+                                       g_assert_not_reached ();
                                inst->opcode = OP_REGOFFSET;
                                inst->inst_basereg = cfg->frame_reg;
                                inst->inst_offset = ARGS_OFFSET + ainfo->offset;
@@ -800,11 +945,17 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                                        break;
                                }
                                offset = ALIGN_TO (offset, sizeof (gpointer));
-                               inst->inst_offset = - offset;
+                               if (cfg->arch.omit_fp)
+                                       inst->inst_offset = offset;
+                               else
+                                       inst->inst_offset = - offset;
                        }
                }
        }
 
+       if (cfg->arch.omit_fp && offset == 16)
+               offset = 0;
+
        cfg->stack_offset = offset;
 
        g_free (cinfo);
@@ -834,14 +985,14 @@ add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, MonoInst *arg, ArgStorage
                arg->opcode = OP_OUTARG_REG;
                arg->inst_left = tree;
                arg->inst_right = (MonoInst*)call;
-               arg->unused = reg;
+               arg->backend.reg3 = reg;
                call->used_iregs |= 1 << reg;
                break;
        case ArgInFloatReg:
                arg->opcode = OP_OUTARG_FREG;
                arg->inst_left = tree;
                arg->inst_right = (MonoInst*)call;
-               arg->unused = reg;
+               arg->backend.reg3 = reg;
                call->used_fregs |= 1 << reg;
                break;
        default:
@@ -849,12 +1000,47 @@ add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, MonoInst *arg, ArgStorage
        }
 }
 
+/*
+ * emit_sig_cookie:
+ *
+ *   Prepend to CALL's out_args list an OP_OUTARG instruction carrying the
+ * signature cookie of a vararg call. The cookie is an OP_ICONST holding a
+ * trimmed copy of the call signature, and the OP_OUTARG's inst_imm is set to
+ * 16 + cinfo->sig_cookie.offset. CINFO must place the cookie on the stack
+ * (asserted). As a side effect, AOT compilation is disabled for CFG.
+ */
+static void
+emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
+{
+       MonoMethodSignature *cookie_sig;
+       MonoInst *cookie_const, *cookie_outarg;
+
+       /* FIXME: Add support for signature tokens to AOT */
+       cfg->disable_aot = TRUE;
+
+       g_assert (cinfo->sig_cookie.storage == ArgOnStack);
+
+       /*
+        * mono_ArgIterator_Setup assumes that the signature cookie comes first
+        * and that all the arguments which preceded it follow it on the stack.
+        * Compensate by passing a copy of the signature which only contains the
+        * parameters from the sentinel position onwards.
+        */
+       cookie_sig = mono_metadata_signature_dup (call->signature);
+       cookie_sig->param_count -= call->signature->sentinelpos;
+       cookie_sig->sentinelpos = 0;
+       memcpy (cookie_sig->params,
+                       &call->signature->params [call->signature->sentinelpos],
+                       cookie_sig->param_count * sizeof (MonoType*));
+
+       /* The constant holds the address of the trimmed signature */
+       MONO_INST_NEW (cfg, cookie_const, OP_ICONST);
+       cookie_const->inst_p0 = cookie_sig;
+
+       MONO_INST_NEW (cfg, cookie_outarg, OP_OUTARG);
+       cookie_outarg->inst_left = cookie_const;
+       cookie_outarg->inst_imm = 16 + cinfo->sig_cookie.offset;
+       cookie_outarg->type = STACK_PTR;
+
+       /* prepend, so they get reversed */
+       cookie_outarg->next = call->out_args;
+       call->out_args = cookie_outarg;
+}
+
 /* 
  * take the arguments and generate the arch-specific
  * instructions to properly call the function in call.
  * This includes pushing, moving arguments to the right register
  * etc.
- * Issue: who does the spilling if needed, and when?
  */
 MonoCallInst*
 mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call, int is_virtual)
@@ -884,37 +1070,8 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                ainfo = cinfo->args + i;
 
                if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos)) {
-                       MonoMethodSignature *tmp_sig;
-
                        /* Emit the signature cookie just before the implicit arguments */
-                       MonoInst *sig_arg;
-                       /* FIXME: Add support for signature tokens to AOT */
-                       cfg->disable_aot = TRUE;
-
-                       g_assert (cinfo->sig_cookie.storage == ArgOnStack);
-
-                       /*
-                        * mono_ArgIterator_Setup assumes the signature cookie is 
-                        * passed first and all the arguments which were before it are
-                        * passed on the stack after the signature. So compensate by 
-                        * passing a different signature.
-                        */
-                       tmp_sig = mono_metadata_signature_dup (call->signature);
-                       tmp_sig->param_count -= call->signature->sentinelpos;
-                       tmp_sig->sentinelpos = 0;
-                       memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));
-
-                       MONO_INST_NEW (cfg, sig_arg, OP_ICONST);
-                       sig_arg->inst_p0 = tmp_sig;
-
-                       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                       arg->inst_left = sig_arg;
-                       arg->inst_imm = 16 + cinfo->sig_cookie.offset;
-                       arg->type = STACK_PTR;
-
-                       /* prepend, so they get reversed */
-                       arg->next = call->out_args;
-                       call->out_args = arg;
+                       emit_sig_cookie (cfg, call, cinfo);
                }
 
                if (is_virtual && i == 0) {
@@ -939,7 +1096,7 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
 
                        if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(arg_type))) {
                                MonoInst *stack_addr;
-                               gint align;
+                               guint32 align;
                                guint32 size;
 
                                if (arg_type->type == MONO_TYPE_TYPEDBYREF) {
@@ -949,41 +1106,44 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                else
                                if (sig->pinvoke)
                                        size = mono_type_native_stack_size (&in->klass->byval_arg, &align);
-                               else
-                                       size = mono_type_stack_size (&in->klass->byval_arg, &align);
+                               else {
+                                       /* 
+                                        * Other backends use mono_type_stack_size (), but that
+                                        * aligns the size to 8, which is larger than the size of
+                                        * the source, leading to reads of invalid memory if the
+                                        * source is at the end of address space.
+                                        */
+                                       size = mono_class_value_size (in->klass, &align);
+                               }
 
-                               /* 
-                                * FIXME: The destination is 'size' long, but the source might
-                                * be smaller.
-                                */
                                if (ainfo->storage == ArgAggregate) {
                                        MonoInst *vtaddr, *load, *load2, *offset_ins, *set_reg;
-                                       int slot;
+                                       int slot, j;
 
                                        vtaddr = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
 
                                        /* 
                                         * Part of the structure is passed in registers.
                                         */
-                                       for (i = 0; i < ainfo->nregs; ++i) {
+                                       for (j = 0; j < ainfo->nregs; ++j) {
                                                int offset, load_op, dest_reg, arg_storage;
 
-                                               slot = ainfo->reg + i;
+                                               slot = ainfo->reg + j;
                                                
                                                if (ainfo->atype == AggregateSingleHFA) {
                                                        load_op = CEE_LDIND_R4;
-                                                       offset = i * 4;
-                                                       dest_reg = ainfo->reg + i;
+                                                       offset = j * 4;
+                                                       dest_reg = ainfo->reg + j;
                                                        arg_storage = ArgInFloatReg;
                                                } else if (ainfo->atype == AggregateDoubleHFA) {
                                                        load_op = CEE_LDIND_R8;
-                                                       offset = i * 8;
-                                                       dest_reg = ainfo->reg + i;
+                                                       offset = j * 8;
+                                                       dest_reg = ainfo->reg + j;
                                                        arg_storage = ArgInFloatReg;
                                                } else {
                                                        load_op = CEE_LDIND_I;
-                                                       offset = i * 8;
-                                                       dest_reg = cfg->arch.reg_out0 + ainfo->reg + i;
+                                                       offset = j * 8;
+                                                       dest_reg = cfg->arch.reg_out0 + ainfo->reg + j;
                                                        arg_storage = ArgInIReg;
                                                }
 
@@ -999,7 +1159,7 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                                MONO_INST_NEW (cfg, load, load_op);
                                                load->inst_left = load2;
 
-                                               if (i == 0)
+                                               if (j == 0)
                                                        set_reg = arg;
                                                else
                                                        MONO_INST_NEW (cfg, set_reg, OP_OUTARG_REG);
@@ -1013,16 +1173,16 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                        /* 
                                         * Part of the structure is passed on the stack.
                                         */
-                                       for (i = ainfo->nregs; i < ainfo->nslots; ++i) {
+                                       for (j = ainfo->nregs; j < ainfo->nslots; ++j) {
                                                MonoInst *outarg;
 
-                                               slot = ainfo->reg + i;
+                                               slot = ainfo->reg + j;
 
                                                MONO_INST_NEW (cfg, load, CEE_LDIND_I);
                                                load->ssa_op = MONO_SSA_LOAD;
                                                load->inst_i0 = (cfg)->varinfo [vtaddr->inst_c0];
 
-                                               NEW_ICONST (cfg, offset_ins, (i * sizeof (gpointer)));
+                                               NEW_ICONST (cfg, offset_ins, (j * sizeof (gpointer)));
                                                MONO_INST_NEW (cfg, load2, CEE_ADD);
                                                load2->inst_left = load;
                                                load2->inst_right = offset_ins;
@@ -1030,7 +1190,7 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                                                MONO_INST_NEW (cfg, load, CEE_LDIND_I);
                                                load->inst_left = load2;
 
-                                               if (i == 0)
+                                               if (j == 0)
                                                        outarg = arg;
                                                else
                                                        MONO_INST_NEW (cfg, outarg, OP_OUTARG);
@@ -1088,6 +1248,11 @@ mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call,
                }
        }
 
+       /* Handle the case where there are no implicit arguments */
+       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos)) {
+               emit_sig_cookie (cfg, call, cinfo);
+       }
+
        call->stack_usage = cinfo->stack_usage;
        cfg->param_area = MAX (cfg->param_area, call->stack_usage);
        cfg->arch.n_out_regs = MAX (cfg->arch.n_out_regs, cinfo->reg_usage);
@@ -1135,6 +1300,7 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                case OP_MUL_IMM: 
+               case OP_IMUL_IMM: 
                        /* remove unnecessary multiplication with 1 */
                        if (ins->inst_imm == 1) {
                                if (ins->dreg != ins->sreg1) {
@@ -1360,13 +1526,23 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_STOREI8_MEMBASE_IMM:
                case OP_STORE_MEMBASE_IMM:
                        /* There are no store_membase instructions on ia64 */
-                       NEW_INS (cfg, temp, OP_I8CONST);
-                       temp->inst_c0 = ins->inst_offset;
-                       temp->dreg = mono_regstate_next_int (cfg->rs);
-                       NEW_INS (cfg, temp2, CEE_ADD);
-                       temp2->sreg1 = ins->inst_destbasereg;
-                       temp2->sreg2 = temp->dreg;
-                       temp2->dreg = mono_regstate_next_int (cfg->rs);
+                       if (ins->inst_offset == 0) {
+                               temp2 = NULL;
+                       } else if (ia64_is_imm14 (ins->inst_offset)) {
+                               NEW_INS (cfg, temp2, OP_ADD_IMM);
+                               temp2->sreg1 = ins->inst_destbasereg;
+                               temp2->inst_imm = ins->inst_offset;
+                               temp2->dreg = mono_regstate_next_int (cfg->rs);
+                       }
+                       else {
+                               NEW_INS (cfg, temp, OP_I8CONST);
+                               temp->inst_c0 = ins->inst_offset;
+                               temp->dreg = mono_regstate_next_int (cfg->rs);
+                               NEW_INS (cfg, temp2, CEE_ADD);
+                               temp2->sreg1 = ins->inst_destbasereg;
+                               temp2->sreg2 = temp->dreg;
+                               temp2->dreg = mono_regstate_next_int (cfg->rs);
+                       }
 
                        switch (ins->opcode) {
                        case OP_STOREI1_MEMBASE_IMM:
@@ -1396,7 +1572,8 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
 
                        ins->inst_offset = 0;
-                       ins->inst_destbasereg = temp2->dreg;
+                       if (temp2)
+                               ins->inst_destbasereg = temp2->dreg;
                        break;
                case OP_STOREI1_MEMBASE_REG:
                case OP_STOREI2_MEMBASE_REG:
@@ -1438,7 +1615,13 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_LOAD_MEMBASE:
                case OP_LOADR4_MEMBASE:
                case OP_LOADR8_MEMBASE:
-                       /* There are no load_membase instructions on ia64 */
+               case OP_ATOMIC_EXCHANGE_I4:
+               case OP_ATOMIC_EXCHANGE_I8:
+               case OP_ATOMIC_ADD_NEW_I4:
+               case OP_ATOMIC_ADD_NEW_I8:
+               case OP_ATOMIC_ADD_IMM_NEW_I4:
+               case OP_ATOMIC_ADD_IMM_NEW_I8:
+                       /* There are no membase instructions on ia64 */
                        if (ins->inst_offset == 0) {
                                break;
                        }
@@ -1461,54 +1644,6 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        ins->inst_offset = 0;
                        ins->inst_basereg = temp2->dreg;
                        break;
-               case OP_FCALL_MEMBASE:
-               case OP_LCALL_MEMBASE:
-               case OP_VCALL_MEMBASE:
-               case OP_VOIDCALL_MEMBASE:
-               case OP_CALL_MEMBASE:
-                       /* There are no membase instructions on ia64 */
-                       if (ia64_is_imm14 (ins->inst_offset)) {
-                               NEW_INS (cfg, temp2, OP_ADD_IMM);
-                               temp2->sreg1 = ins->sreg1;
-                               temp2->inst_imm = ins->inst_offset;
-                               temp2->dreg = mono_regstate_next_int (cfg->rs);
-                       }
-                       else {
-                               NEW_INS (cfg, temp, OP_I8CONST);
-                               temp->inst_c0 = ins->inst_offset;
-                               temp->dreg = mono_regstate_next_int (cfg->rs);
-                               NEW_INS (cfg, temp2, CEE_ADD);
-                               temp2->sreg1 = ins->sreg1;
-                               temp2->sreg2 = temp->dreg;
-                               temp2->dreg = mono_regstate_next_int (cfg->rs);
-                       }
-
-                       NEW_INS (cfg, temp, OP_LOADI8_MEMBASE);
-                       temp->sreg1 = temp2->dreg;
-                       temp->dreg = mono_regstate_next_int (cfg->rs);
-
-                       ins->sreg1 = temp->dreg;
-
-                       switch (ins->opcode) {
-                       case OP_FCALL_MEMBASE:
-                               ins->opcode = OP_FCALL_REG;
-                               break;
-                       case OP_LCALL_MEMBASE:
-                               ins->opcode = OP_LCALL_REG;
-                               break;
-                       case OP_VCALL_MEMBASE:
-                               ins->opcode = OP_VCALL_REG;
-                               break;
-                       case OP_VOIDCALL_MEMBASE:
-                               ins->opcode = OP_VOIDCALL_REG;
-                               break;
-                       case OP_CALL_MEMBASE:
-                               ins->opcode = OP_CALL_REG;
-                               break;
-                       default:
-                               g_assert_not_reached ();
-                       }
-                       break;
                case OP_ADD_IMM:
                case OP_IADD_IMM:
                case OP_ISUB_IMM:
@@ -1526,6 +1661,11 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        gboolean is_imm = FALSE;
                        gboolean switched = FALSE;
 
+                       if (ins->opcode == OP_AND_IMM && ins->inst_imm == 255) {
+                               ins->opcode = OP_ZEXT_I1;
+                               break;
+                       }
+
                        switch (ins->opcode) {
                        case OP_ADD_IMM:
                        case OP_IADD_IMM:
@@ -1796,9 +1936,12 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                }
-               case OP_MUL_IMM: {
-                       /* This should be emulated, but rules in inssel.brg generate it */
+               case OP_MUL_IMM:
+               case OP_LMUL_IMM:
+               case OP_IMUL_IMM: {
                        int i, sum_reg;
+                       gboolean found = FALSE;
+                       int shl_op = ins->opcode == OP_IMUL_IMM ? OP_ISHL_IMM : OP_SHL_IMM;
 
                        /* First the easy cases */
                        if (ins->inst_imm == 1) {
@@ -1807,17 +1950,18 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        for (i = 1; i < 64; ++i)
                                if (ins->inst_imm == (((gint64)1) << i)) {
-                                       ins->opcode = OP_SHL_IMM;
+                                       ins->opcode = shl_op;
                                        ins->inst_imm = i;
+                                       found = TRUE;
                                        break;
                                }
 
                        /* This could be optimized */
-                       if (ins->opcode == OP_MUL_IMM) {
+                       if (!found) {
                                sum_reg = 0;
                                for (i = 0; i < 64; ++i) {
                                        if (ins->inst_imm & (((gint64)1) << i)) {
-                                               NEW_INS (cfg, temp, OP_SHL_IMM);
+                                               NEW_INS (cfg, temp, shl_op);
                                                temp->dreg = mono_regstate_next_int (cfg->rs);
                                                temp->sreg1 = ins->sreg1;
                                                temp->inst_imm = i;
@@ -2032,9 +2176,11 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, Ia64CodegenState code)
                break;
        case OP_VCALL:
        case OP_VCALL_REG:
-       case OP_VCALL_MEMBASE:
+       case OP_VCALL_MEMBASE: {
+               ArgStorage storage;
+
                cinfo = get_call_info (((MonoCallInst*)ins)->signature, FALSE);
-               ArgStorage storage = cinfo->ret.storage;
+               storage = cinfo->ret.storage;
 
                if (storage == ArgAggregate) {
                        MonoInst *local = (MonoInst*)cfg->arch.ret_var_addr_local;
@@ -2062,6 +2208,7 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, Ia64CodegenState code)
                }
                g_free (cinfo);
                break;
+       }
        default:
                g_assert_not_reached ();
        }
@@ -2069,13 +2216,28 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, Ia64CodegenState code)
        return code;
 }
 
+/*
+ * add_patch_info:
+ *
+ *   Record a patch of type PATCH_TYPE with target DATA at the current code
+ * position, by calling mono_add_patch_info (). The position passed on is
+ * (CODE.buf + CODE.nins) relative to CFG->native_code, i.e. the buffer offset
+ * plus the value of CODE.nins (presumably the number of instructions queued
+ * in CODE but not yet emitted -- TODO confirm).
+ * CFG and CODE are each expanded twice, so they must be free of side effects.
+ */
+#define add_patch_info(cfg,code,patch_type,data) do { \
+       mono_add_patch_info ((cfg), (code).buf + (code).nins - (cfg)->native_code, (patch_type), (data)); \
+} while (0)
+
+/*
+ * emit_cond_system_exception:
+ *
+ *   Emit a branch predicated on PREDICATE for the exception named EXC_NAME.
+ * mono_branch_optimize_exception_target () is asked for a target first: if it
+ * returns an instruction, a MONO_PATCH_INFO_BB patch to that instruction's
+ * inst_true_bb is recorded, otherwise a MONO_PATCH_INFO_EXC patch for
+ * EXC_NAME is recorded. The branch itself is emitted with displacement 0.
+ * NOTE: this macro uses the variable 'bb' from the enclosing scope; it is not
+ * a macro parameter. EXC_NAME may be expanded twice.
+ */
+#define emit_cond_system_exception(cfg,code,exc_name,predicate) do { \
+       MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
+       if (tins != NULL) { \
+               add_patch_info (cfg, code, MONO_PATCH_INFO_BB, tins->inst_true_bb); \
+       } else { \
+               add_patch_info (cfg, code, MONO_PATCH_INFO_EXC, exc_name); \
+       } \
+       ia64_br_cond_pred (code, (predicate), 0); \
+} while (0)
+
 static Ia64CodegenState
 emit_call (MonoCompile *cfg, Ia64CodegenState code, guint32 patch_type, gconstpointer data)
 {
-       mono_add_patch_info (cfg, code.buf - cfg->native_code, patch_type, data);
+       add_patch_info (cfg, code, patch_type, data);
 
        if ((patch_type == MONO_PATCH_INFO_ABS) || (patch_type == MONO_PATCH_INFO_INTERNAL_METHOD)) {
                /* Indirect call */
+               /* mono_arch_patch_callsite will patch this */
+               /* mono_arch_nullify_class_init_trampoline will patch this */
                ia64_movl (code, GP_SCRATCH_REG, 0);
                ia64_ld8_inc_imm (code, GP_SCRATCH_REG2, GP_SCRATCH_REG, 8);
                ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG2);
@@ -2084,6 +2246,7 @@ emit_call (MonoCompile *cfg, Ia64CodegenState code, guint32 patch_type, gconstpo
        }
        else {
                /* Can't use a direct call since the displacement might be too small */
+               /* mono_arch_patch_callsite will patch this */
                ia64_movl (code, GP_SCRATCH_REG, 0);
                ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG);
                ia64_br_call_reg (code, IA64_B0, IA64_B6);
@@ -2126,6 +2289,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
        ia64_codegen_init (code, code_start);
 
+#if 0
+       if (strstr (cfg->method->name, "conv_ovf_i1") && (bb->block_num == 2))
+               break_count ();
+#endif
+
        ins = bb->code;
        while (ins) {
                offset = code.buf - cfg->native_code;
@@ -2167,17 +2335,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                if (ins->inst_i0->inst_c0) {
                                        NOT_IMPLEMENTED;
                                } else {
-                                       ia64_begin_bundle (code);
-                                       mono_add_patch_info (cfg, code.buf - cfg->native_code, MONO_PATCH_INFO_LABEL, ins->inst_i0);
+                                       add_patch_info (cfg, code, MONO_PATCH_INFO_LABEL, ins->inst_i0);
                                        ia64_br_cond_pred (code, pred, 0);
                                }
                        } else {
                                if (ins->inst_target_bb->native_offset) {
-                                       gint64 disp = ((gint64)ins->inst_target_bb->native_offset - offset) >> 4;
-                                       ia64_br_cond_pred (code, pred, disp);
-                               } else {
+                                       guint8 *pos = code.buf + code.nins;
+
+                                       ia64_br_cond_pred (code, pred, 0);
                                        ia64_begin_bundle (code);
-                                       mono_add_patch_info (cfg, code.buf - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_target_bb);
+                                       ia64_patch (pos, cfg->native_code + ins->inst_target_bb->native_offset);
+                               } else {
+                                       add_patch_info (cfg, code, MONO_PATCH_INFO_BB, ins->inst_target_bb);
                                        ia64_br_cond_pred (code, pred, 0);
                                } 
                        }
@@ -2336,6 +2505,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_LSHR_UN_IMM:
                        ia64_shr_u_imm (code, ins->dreg, ins->sreg1, ins->inst_imm);
                        break;
+               case CEE_MUL:
+                       /* Based on gcc code */
+                       ia64_setf_sig (code, FP_SCRATCH_REG, ins->sreg1);
+                       ia64_setf_sig (code, FP_SCRATCH_REG2, ins->sreg2);
+                       ia64_xmpy_l (code, FP_SCRATCH_REG, FP_SCRATCH_REG, FP_SCRATCH_REG2);
+                       ia64_getf_sig (code, ins->dreg, FP_SCRATCH_REG);
+                       break;
 
                case OP_STOREI1_MEMBASE_REG:
                        ia64_st1_hint (code, ins->inst_destbasereg, ins->sreg1, 0);
@@ -2350,6 +2526,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_STORE_MEMBASE_REG:
                        ia64_st8_hint (code, ins->inst_destbasereg, ins->sreg1, 0);
                        break;
+
+               case OP_IA64_STOREI1_MEMBASE_INC_REG:
+                       ia64_st1_inc_imm_hint (code, ins->inst_destbasereg, ins->sreg1, 1, 0);
+                       break;
+               case OP_IA64_STOREI2_MEMBASE_INC_REG:
+                       ia64_st2_inc_imm_hint (code, ins->inst_destbasereg, ins->sreg1, 2, 0);
+                       break;
+               case OP_IA64_STOREI4_MEMBASE_INC_REG:
+                       ia64_st4_inc_imm_hint (code, ins->inst_destbasereg, ins->sreg1, 4, 0);
+                       break;
+               case OP_IA64_STOREI8_MEMBASE_INC_REG:
+                       ia64_st8_inc_imm_hint (code, ins->inst_destbasereg, ins->sreg1, 8, 0);
+                       break;
+
                case OP_LOADU1_MEMBASE:
                        ia64_ld1 (code, ins->dreg, ins->inst_basereg);
                        break;
@@ -2375,6 +2565,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_LOADI8_MEMBASE:
                        ia64_ld8 (code, ins->dreg, ins->inst_basereg);
                        break;
+
+               case OP_IA64_LOADU1_MEMBASE_INC:
+                       ia64_ld1_inc_imm_hint (code, ins->dreg, ins->inst_basereg, 1, 0);
+                       break;
+               case OP_IA64_LOADU2_MEMBASE_INC:
+                       ia64_ld2_inc_imm_hint (code, ins->dreg, ins->inst_basereg, 2, 0);
+                       break;
+               case OP_IA64_LOADU4_MEMBASE_INC:
+                       ia64_ld4_inc_imm_hint (code, ins->dreg, ins->inst_basereg, 4, 0);
+                       break;
+               case OP_IA64_LOADI8_MEMBASE_INC:
+                       ia64_ld8_inc_imm_hint (code, ins->dreg, ins->inst_basereg, 8, 0);
+                       break;
+
                case OP_SEXT_I1:
                        ia64_sxt1 (code, ins->dreg, ins->sreg1);
                        break;
@@ -2552,24 +2756,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                case OP_COND_EXC_IOV:
                case OP_COND_EXC_OV:
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code,
-                                                                MONO_PATCH_INFO_EXC, "OverflowException");
-                       ia64_br_cond_pred (code, 6, 0);
+                       emit_cond_system_exception (cfg, code, "OverflowException", 6);
                        break;
                case OP_COND_EXC_IC:
                case OP_COND_EXC_C:
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code,
-                                                                MONO_PATCH_INFO_EXC, "OverflowException");
-                       ia64_br_cond_pred (code, 7, 0);
+                       emit_cond_system_exception (cfg, code, "OverflowException", 7);
                        break;
                case OP_IA64_COND_EXC:
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code,
-                                                                MONO_PATCH_INFO_EXC, ins->inst_p1);
-                       ia64_br_cond_pred (code, 6, 0);
+                       emit_cond_system_exception (cfg, code, ins->inst_p1, 6);
                        break;
                case OP_IA64_CSET:
-                       /* FIXME: Do this with one instruction ? */
-                       ia64_mov (code, ins->dreg, IA64_R0);
+                       ia64_mov_pred (code, 7, ins->dreg, IA64_R0);
+                       ia64_no_stop (code);
                        ia64_add1_pred (code, 6, ins->dreg, IA64_R0, IA64_R0);
                        break;
                case CEE_CONV_I1:
@@ -2617,7 +2815,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        else if (d == 1.0)
                                ia64_fmov (code, ins->dreg, 1);
                        else {
-                               mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, ins->inst_p0);
+                               add_patch_info (cfg, code, MONO_PATCH_INFO_R8, ins->inst_p0);
                                ia64_movl (code, GP_SCRATCH_REG, 0);
                                ia64_ldfd (code, ins->dreg, GP_SCRATCH_REG);
                        }
@@ -2631,7 +2829,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        else if (f == 1.0)
                                ia64_fmov (code, ins->dreg, 1);
                        else {
-                               mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4, ins->inst_p0);
+                               add_patch_info (cfg, code, MONO_PATCH_INFO_R4, ins->inst_p0);
                                ia64_movl (code, GP_SCRATCH_REG, 0);
                                ia64_ldfs (code, ins->dreg, GP_SCRATCH_REG);
                        }
@@ -2644,13 +2842,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        ia64_stfd_hint (code, ins->inst_destbasereg, ins->sreg1, 0);
                        break;
                case OP_STORER4_MEMBASE_REG:
-                       ia64_stfs_hint (code, ins->inst_destbasereg, ins->sreg1, 0);
+                       ia64_fnorm_s_sf (code, FP_SCRATCH_REG, ins->sreg1, 0);
+                       ia64_stfs_hint (code, ins->inst_destbasereg, FP_SCRATCH_REG, 0);
                        break;
                case OP_LOADR8_MEMBASE:
                        ia64_ldfd (code, ins->dreg, ins->inst_basereg);
                        break;
                case OP_LOADR4_MEMBASE:
                        ia64_ldfs (code, ins->dreg, ins->inst_basereg);
+                       ia64_fnorm_d_sf (code, ins->dreg, ins->dreg, 0);
                        break;
                case CEE_CONV_R4:
                        ia64_setf_sig (code, ins->dreg, ins->sreg1);
@@ -2696,24 +2896,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case CEE_CKFINITE:
                        /* Quiet NaN */
                        ia64_fclass_m (code, 6, 7, ins->sreg1, 0x080);
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code,
-                                                                MONO_PATCH_INFO_EXC, "ArithmeticException");
-                       ia64_br_cond_pred (code, 6, 0);
+                       emit_cond_system_exception (cfg, code, "ArithmeticException", 6);
                        /* Signaling NaN */
                        ia64_fclass_m (code, 6, 7, ins->sreg1, 0x040);
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code,
-                                                                MONO_PATCH_INFO_EXC, "ArithmeticException");
-                       ia64_br_cond_pred (code, 6, 0);
+                       emit_cond_system_exception (cfg, code, "ArithmeticException", 6);
                        /* Positive infinity */
                        ia64_fclass_m (code, 6, 7, ins->sreg1, 0x021);
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code,
-                                                                MONO_PATCH_INFO_EXC, "ArithmeticException");
-                       ia64_br_cond_pred (code, 6, 0);
+                       emit_cond_system_exception (cfg, code, "ArithmeticException", 6);
                        /* Negative infinity */
                        ia64_fclass_m (code, 6, 7, ins->sreg1, 0x022);
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code,
-                                                                MONO_PATCH_INFO_EXC, "ArithmeticException");
-                       ia64_br_cond_pred (code, 6, 0);
+                       emit_cond_system_exception (cfg, code, "ArithmeticException", 6);
                        break;
 
                /* Calls */
@@ -2747,27 +2939,55 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_VOIDCALL_REG:
                        call = (MonoCallInst*)ins;
 
-                       if (call->virtual) {
-                               /* Keep this in synch with get_vcall_slot_addr */
+                       /* Indirect call */
+                       /*
+                        * mono_arch_patch_delegate_trampoline will patch this, which is
+                        * why R8 is used.
+                        */
+                       ia64_mov (code, IA64_R8, ins->sreg1);
+                       ia64_ld8_inc_imm (code, GP_SCRATCH_REG2, IA64_R8, 8);
+                       ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG2);
+                       ia64_ld8 (code, IA64_GP, IA64_R8);
+                       ia64_br_call_reg (code, IA64_B0, IA64_B6);
 
-                               /* This is a virtual call */
-                               ia64_mov_to_br (code, IA64_B6, ins->sreg1);
+                       code = emit_move_return_value (cfg, ins, code);
+                       break;
 
-                               /*
-                                * This nop will tell get_vcall_slot_addr that this is a virtual 
-                                * call.
-                                */
-                               ia64_nop_i (code, 0x12345);
-                       }
+               case OP_FCALL_MEMBASE:
+               case OP_LCALL_MEMBASE:
+               case OP_VCALL_MEMBASE:
+               case OP_VOIDCALL_MEMBASE:
+               case OP_CALL_MEMBASE:
+                       /* 
+                        * There are no membase instructions on ia64, but we can't 
+                        * lower this since get_vcall_slot_addr () needs to decode it.
+                        */
+
+                       /* Keep this in synch with get_vcall_slot_addr */
+                       if (ia64_is_imm14 (ins->inst_offset))
+                               ia64_adds_imm (code, IA64_R8, ins->inst_offset, ins->sreg1);
                        else {
-                               /* Indirect call */
-                               ia64_mov (code, GP_SCRATCH_REG, ins->sreg1);
-                               ia64_ld8_inc_imm (code, GP_SCRATCH_REG2, GP_SCRATCH_REG, 8);
-                               ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG2);
-                               ia64_ld8 (code, IA64_GP, GP_SCRATCH_REG);
+                               ia64_movl (code, GP_SCRATCH_REG, ins->inst_offset);
+                               ia64_add (code, IA64_R8, GP_SCRATCH_REG, ins->sreg1);
                        }
+
+                       ia64_begin_bundle (code);
+                       ia64_codegen_set_one_ins_per_bundle (code, TRUE);
+
+                       ia64_ld8 (code, GP_SCRATCH_REG, IA64_R8);
+
+                       ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG);
+
+                       /*
+                        * This nop will tell get_vcall_slot_addr that this is a virtual 
+                        * call.
+                        */
+                       ia64_nop_i (code, 0x12345);
+
                        ia64_br_call_reg (code, IA64_B0, IA64_B6);
 
+                       ia64_codegen_set_one_ins_per_bundle (code, FALSE);
+
                        code = emit_move_return_value (cfg, ins, code);
                        break;
                case CEE_JMP: {
@@ -2783,22 +3003,37 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        /* Load arguments into their original registers */
                        code = emit_load_volatile_arguments (cfg, code);
 
-                       if (cfg->arch.stack_alloc_size)
-                               ia64_mov (code, IA64_SP, cfg->arch.reg_saved_sp);
+                       if (cfg->arch.stack_alloc_size) {
+                               if (cfg->arch.omit_fp) {
+                                       if (ia64_is_imm14 (cfg->arch.stack_alloc_size))
+                                               ia64_adds_imm (code, IA64_SP, (cfg->arch.stack_alloc_size), IA64_SP);
+                                       else {
+                                               ia64_movl (code, GP_SCRATCH_REG, cfg->arch.stack_alloc_size);
+                                               ia64_add (code, IA64_SP, GP_SCRATCH_REG, IA64_SP);
+                                       }
+                               }
+                               else
+                                       ia64_mov (code, IA64_SP, cfg->arch.reg_saved_sp);
+                       }
                        ia64_mov_to_ar_i (code, IA64_PFS, cfg->arch.reg_saved_ar_pfs);
                        ia64_mov_ret_to_br (code, IA64_B0, cfg->arch.reg_saved_b0);
 
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code, MONO_PATCH_INFO_METHOD_JUMP, ins->inst_p0);
+                       add_patch_info (cfg, code, MONO_PATCH_INFO_METHOD_JUMP, ins->inst_p0);
                        ia64_movl (code, GP_SCRATCH_REG, 0);
                        ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG);
                        ia64_br_cond_reg (code, IA64_B6);
 
                        break;
                }
+               case CEE_BREAK:
+                       code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, mono_arch_break);
+                       break;
 
                case OP_LOCALLOC: {
                        gint32 abi_offset;
 
+                       /* FIXME: Sigaltstack support */
+
                        /* keep alignment */
                        ia64_adds_imm (code, GP_SCRATCH_REG, MONO_ARCH_FRAME_ALIGNMENT - 1, ins->sreg1);
                        ia64_movl (code, GP_SCRATCH_REG2, ~(MONO_ARCH_FRAME_ALIGNMENT - 1));
@@ -2821,11 +3056,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                /* Upper limit */
                                ia64_add (code, GP_SCRATCH_REG2, ins->dreg, GP_SCRATCH_REG);
 
+                               ia64_codegen_set_one_ins_per_bundle (code, TRUE);
+
                                /* Init loop */
                                ia64_st8_inc_imm_hint (code, ins->dreg, IA64_R0, 8, 0);
                                ia64_cmp_lt (code, 8, 9, ins->dreg, GP_SCRATCH_REG2);
                                ia64_br_cond_pred (code, 8, -2);
 
+                               ia64_codegen_set_one_ins_per_bundle (code, FALSE);
+
                                ia64_sub (code, ins->dreg, GP_SCRATCH_REG2, GP_SCRATCH_REG);
                        }
 
@@ -2836,21 +3075,85 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        ia64_ld8 (code, ins->dreg, ins->dreg);
                        break;
 
-                       /* Exception handling */
-               case OP_CALL_HANDLER:
-                       /*
-                        * Using a call instruction would mess up the register stack, so
-                        * save the return address to a register and use a
-                        * branch.
-                        */
+                       /* Synchronization */
+               case OP_MEMORY_BARRIER:
+                       ia64_mf (code);
+                       break;
+               case OP_ATOMIC_ADD_IMM_NEW_I4:
+                       g_assert (ins->inst_offset == 0);
+                       ia64_fetchadd4_acq_hint (code, ins->dreg, ins->inst_basereg, ins->inst_imm, 0);
+                       ia64_adds_imm (code, ins->dreg, ins->inst_imm, ins->dreg);
+                       break;
+               case OP_ATOMIC_ADD_IMM_NEW_I8:
+                       g_assert (ins->inst_offset == 0);
+                       ia64_fetchadd8_acq_hint (code, ins->dreg, ins->inst_basereg, ins->inst_imm, 0);
+                       ia64_adds_imm (code, ins->dreg, ins->inst_imm, ins->dreg);
+                       break;
+               case OP_ATOMIC_EXCHANGE_I4:
+                       ia64_xchg4_hint (code, ins->dreg, ins->inst_basereg, ins->sreg2, 0);
+                       ia64_sxt4 (code, ins->dreg, ins->dreg);
+                       break;
+               case OP_ATOMIC_EXCHANGE_I8:
+                       ia64_xchg8_hint (code, ins->dreg, ins->inst_basereg, ins->sreg2, 0);
+                       break;
+               case OP_ATOMIC_ADD_NEW_I4: {
+                       guint8 *label, *buf;
+
+                       /* From libatomic_ops */
+                       ia64_mf (code);
+
+                       ia64_begin_bundle (code);
+                       label = code.buf + code.nins;
+                       ia64_ld4_acq (code, GP_SCRATCH_REG, ins->sreg1);
+                       ia64_add (code, GP_SCRATCH_REG2, GP_SCRATCH_REG, ins->sreg2);
+                       ia64_mov_to_ar_m (code, IA64_CCV, GP_SCRATCH_REG);
+                       ia64_cmpxchg4_acq_hint (code, GP_SCRATCH_REG2, ins->sreg1, GP_SCRATCH_REG2, 0);
+                       ia64_cmp4_eq (code, 6, 7, GP_SCRATCH_REG, GP_SCRATCH_REG2);
+                       buf = code.buf + code.nins;
+                       ia64_br_cond_pred (code, 7, 0);
+                       ia64_begin_bundle (code);
+                       ia64_patch (buf, label);
+                       ia64_add (code, ins->dreg, GP_SCRATCH_REG, ins->sreg2);
+                       break;
+               }
+               case OP_ATOMIC_ADD_NEW_I8: {
+                       guint8 *label, *buf;
+
+                       /* From libatomic_ops */
+                       ia64_mf (code);
+
+                       ia64_begin_bundle (code);
+                       label = code.buf + code.nins;
+                       ia64_ld8_acq (code, GP_SCRATCH_REG, ins->sreg1);
+                       ia64_add (code, GP_SCRATCH_REG2, GP_SCRATCH_REG, ins->sreg2);
+                       ia64_mov_to_ar_m (code, IA64_CCV, GP_SCRATCH_REG);
+                       ia64_cmpxchg8_acq_hint (code, GP_SCRATCH_REG2, ins->sreg1, GP_SCRATCH_REG2, 0);
+                       ia64_cmp_eq (code, 6, 7, GP_SCRATCH_REG, GP_SCRATCH_REG2);
+                       buf = code.buf + code.nins;
+                       ia64_br_cond_pred (code, 7, 0);
+                       ia64_begin_bundle (code);
+                       ia64_patch (buf, label);
+                       ia64_add (code, ins->dreg, GP_SCRATCH_REG, ins->sreg2);
+                       break;
+               }
+
+                       /* Exception handling */
+               case OP_CALL_HANDLER:
+                       /*
+                        * Using a call instruction would mess up the register stack, so
+                        * save the return address to a register and use a
+                        * branch.
+                        */
+                       ia64_codegen_set_one_ins_per_bundle (code, TRUE);
                        ia64_mov (code, IA64_R15, IA64_R0);
                        ia64_mov_from_ip (code, GP_SCRATCH_REG);
                        /* Add the length of OP_CALL_HANDLER */
                        ia64_adds_imm (code, GP_SCRATCH_REG, 5 * 16, GP_SCRATCH_REG);
-                       mono_add_patch_info (cfg, code.buf - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_target_bb);
+                       add_patch_info (cfg, code, MONO_PATCH_INFO_BB, ins->inst_target_bb);
                        ia64_movl (code, GP_SCRATCH_REG2, 0);
                        ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG2);
                        ia64_br_cond_reg (code, IA64_B6);
+                       ia64_codegen_set_one_ins_per_bundle (code, FALSE);
                        break;
                case OP_START_HANDLER: {
                        /*
@@ -2859,35 +3162,59 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
 
                        /* 
-                        * We might be called by the exception handling code, in which case the
-                        * the register stack is not set up correctly. So do it now.
+                        * R15 determines our caller. It is used since it is writable using
+                        * libunwind.
+                        * R15 == 0 means we are called by OP_CALL_HANDLER or via resume_context ()
+                        * R15 != 0 means we are called by call_filter ().
                         */
-                       ia64_alloc (code, GP_SCRATCH_REG2, cfg->arch.reg_local0 - cfg->arch.reg_in0, cfg->arch.reg_out0 - cfg->arch.reg_local0, cfg->arch.n_out_regs, 0);
-
-                       /* Set the fp register from the value passed in by the caller */
-                       /* R15 is used since it is writable using libunwind */
-                       /* R15 == 0 means we are called by OP_CALL_HANDLER or via resume_context () */
+                       ia64_codegen_set_one_ins_per_bundle (code, TRUE);
                        ia64_cmp_eq (code, 6, 7, IA64_R15, IA64_R0);
-                       ia64_add_pred (code, 7, cfg->frame_reg, IA64_R0, IA64_R15);
 
+                       ia64_br_cond_pred (code, 6, 6);
+
+                       /*
+                        * Called by call_filter:
+                        * Allocate a new stack frame, and set the fp register from the 
+                        * value passed in by the caller.
+                        * We allocate a frame similar to the one set up by the prolog,
+                        * so that if an exception is thrown while executing the filter,
+                        * the unwinder can unwind through the filter frame using the
+                        * unwind info for the prolog.
+                        */
+                       ia64_alloc (code, cfg->arch.reg_saved_ar_pfs, cfg->arch.reg_local0 - cfg->arch.reg_in0, cfg->arch.reg_out0 - cfg->arch.reg_local0, cfg->arch.n_out_regs, 0);
+                       ia64_mov_from_br (code, cfg->arch.reg_saved_b0, IA64_B0);
+                       ia64_mov (code, cfg->arch.reg_saved_sp, IA64_SP);
+                       ia64_mov (code, cfg->frame_reg, IA64_R15);
+                       /* Signal to endfilter that we are called by call_filter */
+                       ia64_mov (code, GP_SCRATCH_REG, IA64_R0);
+
+                       /* Save the return address */
                        ia64_adds_imm (code, GP_SCRATCH_REG2, spvar->inst_offset, cfg->frame_reg);
                        ia64_st8_hint (code, GP_SCRATCH_REG2, GP_SCRATCH_REG, 0);
+                       ia64_codegen_set_one_ins_per_bundle (code, FALSE);
 
                        break;
                }
-               case CEE_ENDFINALLY: {
-                       MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
-                       ia64_adds_imm (code, GP_SCRATCH_REG, spvar->inst_offset, cfg->frame_reg);
-                       ia64_ld8_hint (code, GP_SCRATCH_REG, GP_SCRATCH_REG, 0);
-                       ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG);
-                       ia64_br_cond_reg (code, IA64_B6);
-                       break;
-               }
+               case CEE_ENDFINALLY:
                case OP_ENDFILTER: {
-                       /* FIXME: Return the value */
+                       /* FIXME: Return the value in ENDFILTER */
                        MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
+
+                       /* Load the return address */
                        ia64_adds_imm (code, GP_SCRATCH_REG, spvar->inst_offset, cfg->frame_reg);
                        ia64_ld8_hint (code, GP_SCRATCH_REG, GP_SCRATCH_REG, 0);
+
+                       /* Test caller */
+                       ia64_cmp_eq (code, 6, 7, GP_SCRATCH_REG, IA64_R0);
+                       ia64_br_cond_pred (code, 7, 4);
+
+                       /* Called by call_filter */
+                       /* Pop frame */
+                       ia64_mov_to_ar_i (code, IA64_PFS, cfg->arch.reg_saved_ar_pfs);
+                       ia64_mov_to_br (code, IA64_B0, cfg->arch.reg_saved_b0);
+                       ia64_br_ret_reg (code, IA64_B0);                        
+
+                       /* Called by CALL_HANDLER */
                        ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG);
                        ia64_br_cond_reg (code, IA64_B6);
                        break;
@@ -3014,71 +3341,463 @@ static gboolean stops_in_template [32][3] = {
        { FALSE, FALSE, FALSE }
 };
 
+/*
+ * Indexed by bundle template number (0-31). ia64_real_emit_bundle () adds one
+ * to the entry to obtain the number of slots located before the last stop of
+ * the template, so an entry is the 0-based index of the last slot that is
+ * followed by a stop, or -1 when the template has no stop at all.
+ * NOTE(review): presumably derived from stops_in_template -- keep the two
+ * tables in sync.
+ */
+static int last_stop_in_template [32] = {
+       -1, 2, 1, 2, -1, 2, -1, -1,
+       -1, 2, 0, 2, -1, 2, -1, 2,
+       -1, 2, -1, 2, -1, -1, -1, 2,
+       -1, 2, -1, -1, -1, 2, -1, -1
+};
+
+/*
+ * Nop encoding used to fill a slot of a given type. Indexed with the slot
+ * type values stored in ins_types_in_template.
+ * NOTE(review): the entry order presumably follows the IA64_INS_TYPE_*
+ * enumeration (A, I, M, F, B, LX) -- confirm against its declaration.
+ */
+static guint64 nops_for_ins_types [6] = {
+       IA64_NOP_I,
+       IA64_NOP_I,
+       IA64_NOP_M,
+       IA64_NOP_F,
+       IA64_NOP_B,
+       IA64_NOP_X
+};
+
+/*
+ * ITYPE_MATCH (itype1, itype2) is true when the two types are equal, or when
+ * itype2 is IA64_INS_TYPE_A and itype1 is IA64_INS_TYPE_I or IA64_INS_TYPE_M.
+ * The callers pass the slot type of a template as itype1 and the type of a
+ * buffered instruction as itype2. Both arguments are evaluated more than once.
+ */
+#define ITYPE_MATCH(itype1, itype2) \
+       (((itype1) == (itype2)) || \
+        (((itype2) == IA64_INS_TYPE_A) && \
+         (((itype1) == IA64_INS_TYPE_I) || ((itype1) == IA64_INS_TYPE_M))))
+
+/*
+ * Debugging support.
+ *
+ * DEBUG_INS_SCHED (a) executes the statement 'a' when tracing of the
+ * instruction scheduler is compiled in and expands to nothing otherwise.
+ * Change the '#if 0' below to '#if 1' to enable the trace output.
+ */
+
+#if 0
+#define DEBUG_INS_SCHED(a) do { a; } while (0)
+#else
+#define DEBUG_INS_SCHED(a)
+#endif
+
+/*
+ * ia64_analyze_deps:
+ *
+ *   Walk the dependency records of the buffered instructions and decide after
+ * which of them a stop has to be emitted.
+ *
+ * code->dep_info is a sequence of (kind, register) byte pairs. An
+ * IA64_END_OF_INS pair terminates the records of one instruction. Each record
+ * of an instruction is compared with the records of the earlier instructions
+ * which are not yet separated from it by a stop; a conflicting write to the
+ * same register requests a stop after the previous instruction, unless the
+ * instruction carries an IA64_NO_STOP record.
+ *
+ * On return:
+ *  - stops [i] (0 <= i < code->nins) is non-zero if a stop must follow
+ *    instruction i. The last buffered instruction always gets one.
+ *  - deps_start [i] is the offset inside code->dep_info of the first record
+ *    of instruction i; deps_start [code->nins] is set to code->dep_info_pos.
+ *    The array therefore needs at least code->nins + 1 entries.
+ */
+static void
+ia64_analyze_deps (Ia64CodegenState *code, int *deps_start, int *stops)
+{
+       int i, pos, ins_index, current_deps_start, current_ins_start, reg;
+       guint8 *deps = code->dep_info;
+       gboolean need_stop, no_stop;
+
+       for (i = 0; i < code->nins; ++i)
+               stops [i] = FALSE;
+
+       ins_index = 0;
+       /* First record not yet separated from the current instruction by a stop */
+       current_deps_start = 0;
+       /* First record of the instruction being examined */
+       current_ins_start = 0;
+       deps_start [ins_index] = current_ins_start;
+       pos = 0;
+       no_stop = FALSE;
+       DEBUG_INS_SCHED (printf ("BEGIN.\n"));
+       while (pos < code->dep_info_pos) {
+               need_stop = FALSE;
+               switch (deps [pos]) {
+               case IA64_END_OF_INS:
+                       ins_index ++;
+                       current_ins_start = pos + 2;
+                       deps_start [ins_index] = current_ins_start;
+                       no_stop = FALSE;
+                       DEBUG_INS_SCHED (printf ("(%d) END INS.\n", ins_index - 1));
+                       break;
+               case IA64_NONE:
+                       break;
+               case IA64_READ_GR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("READ GR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_GR && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_WRITE_GR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("WRITE GR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_GR && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_READ_PR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("READ PR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (((deps [i] == IA64_WRITE_PR) || (deps [i] == IA64_WRITE_PR_FLOAT)) && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_READ_PR_BRANCH:
+                       reg = deps [pos + 1];
+
+                       /* Writes to prs by non-float instructions are visible to branches */
+                       DEBUG_INS_SCHED (printf ("READ PR BRANCH: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_PR_FLOAT && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_WRITE_PR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("WRITE PR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (((deps [i] == IA64_WRITE_PR) || (deps [i] == IA64_WRITE_PR_FLOAT)) && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_WRITE_PR_FLOAT:
+                       reg = deps [pos + 1];
+
+                       /*
+                        * The conflicting records are earlier writes to the same
+                        * predicate register, exactly as for IA64_WRITE_PR. A GR
+                        * write with the same register number is unrelated.
+                        */
+                       DEBUG_INS_SCHED (printf ("WRITE PR FP: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (((deps [i] == IA64_WRITE_PR) || (deps [i] == IA64_WRITE_PR_FLOAT)) && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_READ_BR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("READ BR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_BR && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_WRITE_BR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("WRITE BR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_BR && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_READ_BR_BRANCH:
+                       reg = deps [pos + 1];
+
+                       /* Writes to brs are visible to branches */
+                       DEBUG_INS_SCHED (printf ("READ BR BRANCH: %d\n", reg));
+                       break;
+               case IA64_READ_FR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("READ FR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_FR && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_WRITE_FR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("WRITE FR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_FR && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_READ_AR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("READ AR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_AR && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_WRITE_AR:
+                       reg = deps [pos + 1];
+
+                       DEBUG_INS_SCHED (printf ("WRITE AR: %d\n", reg));
+                       for (i = current_deps_start; i < current_ins_start; i += 2)
+                               if (deps [i] == IA64_WRITE_AR && deps [i + 1] == reg)
+                                       need_stop = TRUE;
+                       break;
+               case IA64_NO_STOP:
+                       /*
+                        * Explicitly indicate that a stop is not required. Useful for
+                        * example when two predicated instructions with negated predicates
+                        * write the same registers.
+                        */
+                       no_stop = TRUE;
+                       break;
+               default:
+                       g_assert_not_reached ();
+               }
+               pos += 2;
+
+               if (need_stop && !no_stop) {
+                       /* The stop goes after the previous instruction */
+                       g_assert (ins_index > 0);
+                       stops [ins_index - 1] = 1;
+
+                       DEBUG_INS_SCHED (printf ("STOP\n"));
+                       current_deps_start = current_ins_start;
+
+                       /*
+                        * Skip remaining deps for this instruction. The bound keeps
+                        * the scan inside the recorded data if the terminating
+                        * marker is missing.
+                        */
+                       while (pos < code->dep_info_pos && deps [pos] != IA64_END_OF_INS)
+                               pos += 2;
+               }
+       }
+
+       if (code->nins > 0) {
+               /* No dependency info for the last instruction */
+               stops [code->nins - 1] = 1;
+       }
+
+       deps_start [code->nins] = code->dep_info_pos;
+}
+
+/*
+ * ia64_real_emit_bundle:
+ *
+ *   Emit one bundle with the given TEMPLATE and slot contents INS1..INS3, then
+ * remove the first N buffered instructions from CODE.
+ *
+ * DEPS_START and STOPS are the arrays filled in by ia64_analyze_deps (); they
+ * are shifted down together with code->instructions and code->itypes so that
+ * index 0 again describes the first instruction left in the buffer.
+ * Dependency records located after the last stop of the emitted bundle are
+ * kept in code->dep_info, with their IA64_END_OF_INS markers turned into
+ * IA64_NONE.
+ *
+ * NOPS is a bit mask; bit i set means slot i holds a nop which does not
+ * correspond to a buffered instruction.
+ * NOTE(review): the callers which emit a single instruction padded with nops
+ * pass 0 here -- confirm this is intended before relying on the mask.
+ */
+static void
+ia64_real_emit_bundle (Ia64CodegenState *code, int *deps_start, int *stops, int n, guint64 template, guint64 ins1, guint64 ins2, guint64 ins3, guint8 nops)
+{
+       int stop_pos, i, deps_to_shift, dep_shift;
+
+       g_assert (n <= code->nins);
+
+       ia64_emit_bundle_template (code, template, ins1, ins2, ins3);
+
+       stop_pos = last_stop_in_template [template] + 1;
+       if (stop_pos > n)
+               stop_pos = n;
+
+       /* Compute the number of 'real' instructions before the stop */
+       deps_to_shift = stop_pos;
+       if (stop_pos >= 3 && (nops & (1 << 2)))
+               deps_to_shift --;
+       if (stop_pos >= 2 && (nops & (1 << 1)))
+               deps_to_shift --;
+       if (stop_pos >= 1 && (nops & (1 << 0)))
+               deps_to_shift --;
+
+       /*
+        * We have to keep some dependencies whose instructions have been shifted
+        * out of the buffer. So nullify the end_of_ins markers in the dependency
+        * array.
+        */
+       for (i = deps_start [deps_to_shift]; i < deps_start [n]; i += 2)
+               if (code->dep_info [i] == IA64_END_OF_INS)
+                       code->dep_info [i] = IA64_NONE;
+
+       dep_shift = deps_start [deps_to_shift];
+       g_assert (dep_shift <= code->dep_info_pos);
+       /* Source and destination lie in the same array and may overlap */
+       memmove (code->dep_info, &code->dep_info [dep_shift], code->dep_info_pos - dep_shift);
+       code->dep_info_pos = code->dep_info_pos - dep_shift;
+
+       for (i = 0; i < code->nins + 1 - n; ++i)
+               deps_start [i] = deps_start [n + i] - dep_shift;
+
+       /* Determine the exact positions of instructions with unwind ops */
+       if (code->unw_op_count) {
+               int ins_pos [16];
+               int curr_ins, curr_ins_pos;
+
+               /* Slot number of the first slot of the bundle which was just emitted */
+               curr_ins = 0;
+               curr_ins_pos = ((code->buf - code->region_start - 16) / 16) * 3;
+               for (i = 0; i < 3; ++i) {
+                       if (! (nops & (1 << i))) {
+                               ins_pos [curr_ins] = curr_ins_pos + i;
+                               curr_ins ++;
+                       }
+               }
+
+               for (i = code->unw_op_pos; i < code->unw_op_count; ++i) {
+                       if (code->unw_ops_pos [i] < n)
+                               code->unw_ops [i].when = ins_pos [code->unw_ops_pos [i]];
+               }
+               if (code->unw_op_pos < code->unw_op_count)
+                       code->unw_op_pos += n;
+       }
+
+       if (n == code->nins) {
+               code->template = 0;
+               code->nins = 0;
+       }
+       else {
+               /*
+                * Move the remaining entries to the front. The ranges overlap
+                * whenever more than N entries remain, so memmove is required.
+                */
+               memmove (&code->instructions [0], &code->instructions [n], (code->nins - n) * sizeof (guint64));
+               memmove (&code->itypes [0], &code->itypes [n], (code->nins - n) * sizeof (int));
+               memmove (&stops [0], &stops [n], (code->nins - n) * sizeof (int));
+               code->nins -= n;
+       }
+}
+
+/*
+ * ia64_emit_bundle:
+ *
+ *   Pack the instructions buffered in CODE into bundles and emit them through
+ * ia64_real_emit_bundle ().
+ *
+ * ia64_analyze_deps () is called first to find out where stops are needed.
+ * Unless code->one_ins_per_bundle is set, the function then looks for a
+ * template which fits the first three buffered instructions, and after that
+ * for one which fits the first two instructions plus a nop (nop in the last
+ * slot, then in the middle slot, then -- only when FLUSH is set -- in the
+ * first slot). When FLUSH is not set and fewer than IA64_INS_BUFFER_SIZE
+ * instructions are buffered, the function returns between these steps to wait
+ * for more instructions. Finally the instructions are emitted one per bundle
+ * (an LX instruction occupies two buffer entries): all remaining ones when
+ * FLUSH is set, a single one otherwise.
+ */
 void
 ia64_emit_bundle (Ia64CodegenState *code, gboolean flush)
 {
-       int i, j, ins_type, template;
+       int i, ins_type, template, nins_to_emit;
+       int deps_start [16];
+       int stops [16];
+       gboolean found;
 
-       if (!code->automatic) {
-               if (code->nins == 0)
-                       return;
+       /*
+        * We implement a simple scheduler which tries to put three instructions 
+        * per bundle, then two, then one.
+        */
+       ia64_analyze_deps (code, deps_start, stops);
+
+       if ((code->nins >= 3) && !code->one_ins_per_bundle) {
+               /* Find a suitable template */
+               for (template = 0; template < 32; ++template) {
+                       if (stops_in_template [template][0] != stops [0] ||
+                               stops_in_template [template][1] != stops [1] ||
+                               stops_in_template [template][2] != stops [2])
+                               continue;
 
-               g_assert (code->nins == 3);
+                       found = TRUE;
+                       for (i = 0; i < 3; ++i) {
+                               ins_type = ins_types_in_template [template][i];
+                               switch (code->itypes [i]) {
+                               case IA64_INS_TYPE_A:
+                                       found &= (ins_type == IA64_INS_TYPE_I) || (ins_type == IA64_INS_TYPE_M);
+                                       break;
+                               default:
+                                       found &= (ins_type == code->itypes [i]);
+                                       break;
+                               }
+                       }
 
-               /* Verify template is correct */
-               template = code->template;
-               for (j = 0; j < 3; ++j) {
-                       if (code->stops [j])
-                               g_assert (stops_in_template [template]);
+                       if (found)
+                               found = debug_ins_sched ();
 
-                       ins_type = ins_types_in_template [template][j];
-                       switch (code->itypes [j]) {
-                       case IA64_INS_TYPE_A:
-                               g_assert ((ins_type == IA64_INS_TYPE_I) || (ins_type == IA64_INS_TYPE_M));
-                               break;
-                       case IA64_INS_TYPE_LX:
-                               g_assert (j == 1);
-                               g_assert (ins_type == IA64_INS_TYPE_LX);
-                               j ++;
+                       if (found) {
+                               ia64_real_emit_bundle (code, deps_start, stops, 3, template, code->instructions [0], code->instructions [1], code->instructions [2], 0);
                                break;
-                       default:
-                               g_assert (ins_type == code->itypes [j]);
                        }
                }
+       }
 
-               ia64_emit_bundle_template (code, template, code->instructions [0], code->instructions [1], code->instructions [2]);
-               code->template = 0;
-               code->nins = 0;
+       if (code->nins < IA64_INS_BUFFER_SIZE && !flush)
+               /* Wait for more instructions */
                return;
+
+       /* If it didn't work out, try putting two instructions into one bundle */
+       if ((code->nins >= 2) && !code->one_ins_per_bundle) {
+               /* Try a nop at the end */
+               for (template = 0; template < 32; ++template) {
+                       if (stops_in_template [template][0] != stops [0] ||
+                               ((stops_in_template [template][1] != stops [1]) &&
+                                (stops_in_template [template][2] != stops [1])))
+                                
+                               continue;
+
+                       if (!ITYPE_MATCH (ins_types_in_template [template][0], code->itypes [0]) ||
+                               !ITYPE_MATCH (ins_types_in_template [template][1], code->itypes [1]))
+                               continue;
+
+                       if (!debug_ins_sched ())
+                               continue;
+
+                       ia64_real_emit_bundle (code, deps_start, stops, 2, template, code->instructions [0], code->instructions [1], nops_for_ins_types [ins_types_in_template [template][2]], 1 << 2);
+                       break;
+               }
        }
 
-       for (i = 0; i < code->nins; ++i) {
-               switch (code->itypes [i]) {
+       if (code->nins < IA64_INS_BUFFER_SIZE && !flush)
+               /* Wait for more instructions */
+               return;
+
+       if ((code->nins >= 2) && !code->one_ins_per_bundle) {
+               /* Try a nop in the middle */
+               for (template = 0; template < 32; ++template) {
+                       if (((stops_in_template [template][0] != stops [0]) &&
+                                (stops_in_template [template][1] != stops [0])) ||
+                               stops_in_template [template][2] != stops [1])
+                               continue;
+
+                       if (!ITYPE_MATCH (ins_types_in_template [template][0], code->itypes [0]) ||
+                               !ITYPE_MATCH (ins_types_in_template [template][2], code->itypes [1]))
+                               continue;
+
+                       if (!debug_ins_sched ())
+                               continue;
+
+                       ia64_real_emit_bundle (code, deps_start, stops, 2, template, code->instructions [0], nops_for_ins_types [ins_types_in_template [template][1]], code->instructions [1], 1 << 1);
+                       break;
+               }
+       }
+
+       if ((code->nins >= 2) && flush && !code->one_ins_per_bundle) {
+               /* Try a nop at the beginning */
+               for (template = 0; template < 32; ++template) {
+                       if ((stops_in_template [template][1] != stops [0]) ||
+                               (stops_in_template [template][2] != stops [1]))
+                               continue;
+
+                       if (!ITYPE_MATCH (ins_types_in_template [template][1], code->itypes [0]) ||
+                               !ITYPE_MATCH (ins_types_in_template [template][2], code->itypes [1]))
+                               continue;
+
+                       if (!debug_ins_sched ())
+                               continue;
+
+                       ia64_real_emit_bundle (code, deps_start, stops, 2, template, nops_for_ins_types [ins_types_in_template [template][0]], code->instructions [0], code->instructions [1], 1 << 0);
+                       break;
+               }
+       }
+
+       if (code->nins < IA64_INS_BUFFER_SIZE && !flush)
+               /* Wait for more instructions */
+               return;
+
+       /*
+        * Fall back to one instruction per bundle. Each call to
+        * ia64_real_emit_bundle () shifts the buffer, so the next instruction
+        * is always found at index 0.
+        */
+       if (flush)
+               nins_to_emit = code->nins;
+       else
+               nins_to_emit = 1;
+
+       while (nins_to_emit > 0) {
+               if (!debug_ins_sched ())
+                       stops [0] = 1;
+               switch (code->itypes [0]) {
                case IA64_INS_TYPE_A:
-                       ia64_emit_bundle_template (code, IA64_TEMPLATE_MIIS, code->instructions [i], IA64_NOP_I, IA64_NOP_I);
+                       if (stops [0])
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MIIS, code->instructions [0], IA64_NOP_I, IA64_NOP_I, 0);
+                       else
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MII, code->instructions [0], IA64_NOP_I, IA64_NOP_I, 0);
                        break;
                case IA64_INS_TYPE_I:
-                       ia64_emit_bundle_template (code, IA64_TEMPLATE_MIIS, IA64_NOP_M, code->instructions [i], IA64_NOP_I);
+                       if (stops [0])
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MIIS, IA64_NOP_M, code->instructions [0], IA64_NOP_I, 0);
+                       else
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MII, IA64_NOP_M, code->instructions [0], IA64_NOP_I, 0);
                        break;
                case IA64_INS_TYPE_M:
-                       ia64_emit_bundle_template (code, IA64_TEMPLATE_MIIS, code->instructions [i], IA64_NOP_I, IA64_NOP_I);
+                       if (stops [0])
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MIIS, code->instructions [0], IA64_NOP_I, IA64_NOP_I, 0);
+                       else
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MII, code->instructions [0], IA64_NOP_I, IA64_NOP_I, 0);
                        break;
                case IA64_INS_TYPE_B:
-                       ia64_emit_bundle_template (code, IA64_TEMPLATE_MIBS, IA64_NOP_M, IA64_NOP_I, code->instructions [i]);
+                       if (stops [0])
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MIBS, IA64_NOP_M, IA64_NOP_I, code->instructions [0], 0);
+                       else
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MIB, IA64_NOP_M, IA64_NOP_I, code->instructions [0], 0);
                        break;
                case IA64_INS_TYPE_F:
-                       ia64_emit_bundle_template (code, IA64_TEMPLATE_MFIS, IA64_NOP_M, code->instructions [i], IA64_NOP_I);
+                       if (stops [0])
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MFIS, IA64_NOP_M, code->instructions [0], IA64_NOP_I, 0);
+                       else
+                               ia64_real_emit_bundle (code, deps_start, stops, 1, IA64_TEMPLATE_MFI, IA64_NOP_M, code->instructions [0], IA64_NOP_I, 0);
                        break;
                case IA64_INS_TYPE_LX:
-                       ia64_emit_bundle_template (code, IA64_TEMPLATE_MLXS, IA64_NOP_M, code->instructions [i], code->instructions [i + 1]);
-                       i ++;
+                       if (stops [0] || stops [1])
+                               ia64_real_emit_bundle (code, deps_start, stops, 2, IA64_TEMPLATE_MLXS, IA64_NOP_M, code->instructions [0], code->instructions [1], 0);
+                       else
+                               ia64_real_emit_bundle (code, deps_start, stops, 2, IA64_TEMPLATE_MLX, IA64_NOP_M, code->instructions [0], code->instructions [1], 0);
+                       nins_to_emit --;
                        break;
                default:
                        g_assert_not_reached ();
                }
+               nins_to_emit --;
        }
+}
 
-       code->nins = 0;
+/*
+ * mono_ia64_create_unwind_region:
+ *
+ *   Build an unwind region descriptor covering the code emitted since
+ * code->region_start and holding a copy of the unwind ops collected in CODE.
+ * The unwind op bookkeeping in CODE is reset afterwards and the next region
+ * starts at the current buffer position. No instruction may be left in the
+ * bundling buffer when this is called.
+ *
+ * Returns: a descriptor allocated with g_malloc0 ().
+ */
+unw_dyn_region_info_t*
+mono_ia64_create_unwind_region (Ia64CodegenState *code)
+{
+       unw_dyn_region_info_t *region;
+
+       g_assert (code->nins == 0);
+
+       region = g_malloc0 (_U_dyn_region_info_size (code->unw_op_count));
+       region->op_count = code->unw_op_count;
+       /* Bundles are 16 bytes long and hold three instruction slots each */
+       region->insn_count = ((code->buf - code->region_start) >> 4) * 3;
+       memcpy (&region->op, &code->unw_ops, code->unw_op_count * sizeof (unw_dyn_op_t));
+
+       /* Start a fresh region at the current position */
+       code->region_start = code->buf;
+       code->unw_op_pos = 0;
+       code->unw_op_count = 0;
+
+       return region;
 }
 
 static void 
@@ -3088,6 +3807,22 @@ ia64_patch (unsigned char* code, gpointer target)
        guint64 instructions [3];
        guint8 gen_buf [16];
        Ia64CodegenState gen;
+       int ins_to_skip;
+       gboolean found;
+
+       /* 
+        * code encodes both the position inside the buffer and code.nins when
+        * the instruction was emitted.
+        */
+       ins_to_skip = (guint64)code % 16;
+       code = (unsigned char*)((guint64)code & ~15);
+
+       /*
+        * Search for the first instruction which is 'patchable', skipping
+        * ins_to_skip instructions.
+        */
+
+       while (TRUE) {
 
        template = ia64_bundle_template (code);
        instructions [0] = ia64_bundle_ins1 (code);
@@ -3096,27 +3831,18 @@ ia64_patch (unsigned char* code, gpointer target)
 
        ia64_codegen_init (gen, gen_buf);
 
+       found = FALSE;
        for (i = 0; i < 3; ++i) {
                guint64 ins = instructions [i];
                int opcode = ia64_ins_opcode (ins);
 
-               /* Skip nops */
-               gboolean nop = FALSE;
-               switch (ins_types_in_template [template][i]) {
-               case IA64_INS_TYPE_I:
-                       nop = (ins == IA64_NOP_I);
-                       break;
-               case IA64_INS_TYPE_M:
-                       nop = (ins == IA64_NOP_M);
-                       break;
-               case IA64_INS_TYPE_LX:
-                       break;
-               default:
-                       break;
-               }
+               if (ins == nops_for_ins_types [ins_types_in_template [template][i]])
+                       continue;
 
-               if (nop)
+               if (ins_to_skip) {
+                       ins_to_skip --;
                        continue;
+               }
 
                switch (ins_types_in_template [template][i]) {
                case IA64_INS_TYPE_A:
@@ -3125,6 +3851,7 @@ ia64_patch (unsigned char* code, gpointer target)
                                /* adds */
                                ia64_adds_imm_pred (gen, ia64_ins_qp (ins), ia64_ins_r1 (ins), (guint64)target, ia64_ins_r3 (ins));
                                instructions [i] = gen.instructions [0];
+                               found = TRUE;
                        }
                        else
                                NOT_IMPLEMENTED;
@@ -3138,6 +3865,7 @@ ia64_patch (unsigned char* code, gpointer target)
                                ia64_br_cond_hint_pred (gen, ia64_ins_qp (ins), disp, 0, 0, 0);
                                
                                instructions [i] = gen.instructions [0];
+                               found = TRUE;
                        }
                        else if (opcode == 5) {
                                /* br.call */
@@ -3146,6 +3874,7 @@ ia64_patch (unsigned char* code, gpointer target)
                                /* FIXME: hints */
                                ia64_br_call_hint_pred (gen, ia64_ins_qp (ins), ia64_ins_b1 (ins), disp, 0, 0, 0);
                                instructions [i] = gen.instructions [0];
+                               found = TRUE;
                        }
                        else
                                NOT_IMPLEMENTED;
@@ -3159,6 +3888,7 @@ ia64_patch (unsigned char* code, gpointer target)
                                ia64_movl_pred (gen, ia64_ins_qp (ins), ia64_ins_r1 (ins), target);
                                instructions [1] = gen.instructions [0];
                                instructions [2] = gen.instructions [1];
+                               found = TRUE;
                        }
                        else
                                NOT_IMPLEMENTED;
@@ -3167,11 +3897,17 @@ ia64_patch (unsigned char* code, gpointer target)
                default:
                        NOT_IMPLEMENTED;
                }
+
+               if (found) {
+                       /* Rewrite code */
+                       ia64_codegen_init (gen, code);
+                       ia64_emit_bundle_template (&gen, template, instructions [0], instructions [1], instructions [2]);
+                       return;
+               }
        }
 
-       /* Rewrite code */
-       ia64_codegen_init (gen, code);
-       ia64_emit_bundle_template (&gen, template, instructions [0], instructions [1], instructions [2]);
+       code += 16;
+       }
 }
 
 void
@@ -3185,6 +3921,8 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono
 
                target = mono_resolve_patch_target (method, domain, code, patch_info, run_cctors);
 
+               if (patch_info->type == MONO_PATCH_INFO_NONE)
+                       continue;
                if (mono_compile_aot) {
                        NOT_IMPLEMENTED;
                }
@@ -3202,8 +3940,6 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        int alloc_size, pos, i;
        Ia64CodegenState code;
        CallInfo *cinfo;
-       unw_dyn_region_info_t *r_pro;
-       int unw_op_count;
        
        sig = mono_method_signature (method);
        pos = 0;
@@ -3220,7 +3956,6 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        cfg->native_code = g_malloc (cfg->code_size);
 
        ia64_codegen_init (code, cfg->native_code);
-       ia64_codegen_set_automatic (code, FALSE);
 
        alloc_size = ALIGN_TO (cfg->stack_offset, MONO_ARCH_FRAME_ALIGNMENT);
        if (cfg->param_area)
@@ -3239,66 +3974,57 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        pos = 0;
 
        if (method->save_lmf) {
-               /* FIXME: */
+               /* No LMF on IA64 */
        }
 
        alloc_size -= pos;
 
-       /* Initialize unwind info */
-       r_pro = g_malloc0 (_U_dyn_region_info_size (3));
-       unw_op_count = 0;
-
-       ia64_begin_bundle_template (code, IA64_TEMPLATE_MIIS);
+       ia64_unw_save_reg (code, UNW_IA64_AR_PFS, UNW_IA64_GR + cfg->arch.reg_saved_ar_pfs);
        ia64_alloc (code, cfg->arch.reg_saved_ar_pfs, cfg->arch.reg_local0 - cfg->arch.reg_in0, cfg->arch.reg_out0 - cfg->arch.reg_local0, cfg->arch.n_out_regs, 0);
+       ia64_unw_save_reg (code, UNW_IA64_RP, UNW_IA64_GR + cfg->arch.reg_saved_b0);
        ia64_mov_from_br (code, cfg->arch.reg_saved_b0, IA64_B0);
 
-       _U_dyn_op_save_reg (&r_pro->op[unw_op_count++], _U_QP_TRUE, /* when=*/ 0,
-                                               /* reg=*/ UNW_IA64_AR_PFS, /* dst=*/ UNW_IA64_GR + cfg->arch.reg_saved_ar_pfs);
-       _U_dyn_op_save_reg (&r_pro->op[unw_op_count++], _U_QP_TRUE, /* when=*/ 1,
-                                               /* reg=*/ UNW_IA64_RP, /* dst=*/ UNW_IA64_GR + cfg->arch.reg_saved_b0);
-
-       if (alloc_size || cinfo->stack_usage) {
-               ia64_mov (code, cfg->frame_reg, IA64_SP);
-               _U_dyn_op_save_reg (&r_pro->op[unw_op_count++], _U_QP_TRUE, /* when=*/ 2,
-                                                       /* reg=*/ UNW_IA64_SP, /* dst=*/ UNW_IA64_GR + cfg->frame_reg);
+       if ((alloc_size || cinfo->stack_usage) && !cfg->arch.omit_fp) {
+               ia64_unw_save_reg (code, UNW_IA64_SP, UNW_IA64_GR + cfg->arch.reg_saved_sp);
+               ia64_mov (code, cfg->arch.reg_saved_sp, IA64_SP);
+               if (cfg->frame_reg != cfg->arch.reg_saved_sp)
+                       ia64_mov (code, cfg->frame_reg, IA64_SP);
        }
-       else
-               ia64_nop_i (code, 0);
-       ia64_stop (code);
-       ia64_end_bundle (code);
-
-       /* Finish unwind info */
-       r_pro->op_count = unw_op_count;
-       r_pro->insn_count = (code.buf - cfg->native_code) >> 4;
-
-       cfg->arch.r_pro = r_pro;
 
        if (alloc_size) {
-               /* See mono_emit_stack_alloc */
-#if defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
-               NOT_IMPLEMENTED;
-#else
+               int pagesize = getpagesize ();
 
+#if defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
+               if (alloc_size >= pagesize) {
+                       gint32 remaining_size = alloc_size;
+
+                       /* Generate stack touching code */
+                       ia64_mov (code, GP_SCRATCH_REG, IA64_SP);                       
+                       while (remaining_size >= pagesize) {
+                               ia64_movl (code, GP_SCRATCH_REG2, pagesize);
+                               ia64_sub (code, GP_SCRATCH_REG, GP_SCRATCH_REG, GP_SCRATCH_REG2);
+                               ia64_ld8 (code, GP_SCRATCH_REG2, GP_SCRATCH_REG);
+                               remaining_size -= pagesize;
+                       }
+               }
+#endif
                if (ia64_is_imm14 (-alloc_size)) {
-                       ia64_begin_bundle_template (code, IA64_TEMPLATE_MISI);
-                       ia64_nop_m (code, 0);
-                       ia64_mov (code, cfg->arch.reg_saved_sp, IA64_SP); ia64_stop (code);
+                       if (cfg->arch.omit_fp)
+                               ia64_unw_add (code, UNW_IA64_SP, (-alloc_size));
                        ia64_adds_imm (code, IA64_SP, (-alloc_size), IA64_SP);
-                       ia64_end_bundle (code);
                }
                else {
-                       ia64_begin_bundle_template (code, IA64_TEMPLATE_MLXS);
-                       ia64_mov (code, cfg->arch.reg_saved_sp, IA64_SP);
-                       ia64_movl (code, GP_SCRATCH_REG, -alloc_size); ia64_stop (code);
-                       ia64_begin_bundle_template (code, IA64_TEMPLATE_MIIS);
+                       ia64_movl (code, GP_SCRATCH_REG, -alloc_size);
+                       if (cfg->arch.omit_fp)
+                               ia64_unw_add (code, UNW_IA64_SP, (-alloc_size));
                        ia64_add (code, IA64_SP, GP_SCRATCH_REG, IA64_SP);
-                       ia64_nop_i (code, 0);
-                       ia64_nop_i (code, 0); ia64_stop (code);
-                       ia64_end_bundle (code);
                }
-#endif
        }
-       ia64_codegen_set_automatic (code, TRUE);
+
+       ia64_begin_bundle (code);
+
+       /* Initialize unwind info */
+       cfg->arch.r_pro = mono_ia64_create_unwind_region (&code);
 
        if (sig->ret->type != MONO_TYPE_VOID) {
                if ((cinfo->ret.storage == ArgInIReg) && (cfg->ret->opcode != OP_REGVAR)) {
@@ -3394,7 +4120,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        }
 
        if (method->save_lmf) {
-               /* FIXME: */
+               /* No LMF on IA64 */
        }
 
        ia64_codegen_close (code);
@@ -3448,7 +4174,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        pos = 0;
        
        if (method->save_lmf) {
-               /* FIXME: */
+               /* No LMF on IA64 */
        }
 
        /* Load returned vtypes into registers if needed */
@@ -3482,27 +4208,38 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        }
        g_free (cinfo);
 
-       ia64_end_bundle (code);
-       ia64_codegen_set_automatic (code, FALSE);
+       ia64_begin_bundle (code);
 
-       ia64_begin_bundle_template (code, IA64_TEMPLATE_MIIS);
-       if (cfg->arch.stack_alloc_size)
-               ia64_mov (code, IA64_SP, cfg->arch.reg_saved_sp);
-       else
-               ia64_nop_m (code, 0);
-       ia64_mov_to_ar_i (code, IA64_PFS, cfg->arch.reg_saved_ar_pfs);
-       ia64_mov_ret_to_br (code, IA64_B0, cfg->arch.reg_saved_b0); ia64_stop (code);
-       ia64_end_bundle (code);
+       code.region_start = cfg->native_code;
+
+       /* Label the unwind state at the start of the exception throwing region */
+       //ia64_unw_label_state (code, 1234);
 
-       ia64_begin_bundle_template (code, IA64_TEMPLATE_BBBS);
+       if (cfg->arch.stack_alloc_size) {
+               if (cfg->arch.omit_fp) {
+                       if (ia64_is_imm14 (cfg->arch.stack_alloc_size)) {
+                               ia64_unw_pop_frames (code, 1);
+                               ia64_adds_imm (code, IA64_SP, (cfg->arch.stack_alloc_size), IA64_SP);
+                       } else {
+                               ia64_movl (code, GP_SCRATCH_REG, cfg->arch.stack_alloc_size);
+                               ia64_unw_pop_frames (code, 1);
+                               ia64_add (code, IA64_SP, GP_SCRATCH_REG, IA64_SP);
+                       }
+               }
+               else {
+                       ia64_unw_pop_frames (code, 1);
+                       ia64_mov (code, IA64_SP, cfg->arch.reg_saved_sp);
+               }
+       }
+       ia64_mov_to_ar_i (code, IA64_PFS, cfg->arch.reg_saved_ar_pfs);
+       ia64_mov_ret_to_br (code, IA64_B0, cfg->arch.reg_saved_b0);
        ia64_br_ret_reg (code, IA64_B0);
-       ia64_nop_b (code, 0);
-       ia64_nop_b (code, 0); ia64_stop (code);
-       ia64_end_bundle (code);
 
-       ia64_codegen_set_automatic (code, TRUE);
        ia64_codegen_close (code);
 
+       cfg->arch.r_epilog = mono_ia64_create_unwind_region (&code);
+       cfg->arch.r_pro->next = cfg->arch.r_epilog;
+
        cfg->code_len = code.buf - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
@@ -3512,13 +4249,12 @@ void
 mono_arch_emit_exceptions (MonoCompile *cfg)
 {
        MonoJumpInfo *patch_info;
-       int nthrows;
+       int i, nthrows;
        Ia64CodegenState code;
        gboolean empty = TRUE;
-       /*
+       //unw_dyn_region_info_t *r_exceptions;
        MonoClass *exc_classes [16];
        guint8 *exc_throw_start [16], *exc_throw_end [16];
-       */
        guint32 code_size = 0;
 
        /* Compute needed space */
@@ -3531,6 +4267,9 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                        code_size += 4 + 7; /* sizeof (float) + alignment */
        }
 
+       if (code_size == 0)
+               return;
+
        while (cfg->code_len + code_size > (cfg->code_size - 16)) {
                cfg->code_size *= 2;
                cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
@@ -3539,6 +4278,9 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
 
        ia64_codegen_init (code, cfg->native_code + cfg->code_len);
 
+       /* The unwind state here is the same as before the epilog */
+       //ia64_unw_copy_state (code, 1234);
+
        /* add code to raise exceptions */
        /* FIXME: Optimize this */
        nthrows = 0;
@@ -3548,32 +4290,77 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                        MonoClass *exc_class;
                        guint8* throw_ip;
                        guint8* buf;
+                       guint64 exc_token_index;
 
                        exc_class = mono_class_from_name (mono_defaults.corlib, "System", patch_info->data.name);
                        g_assert (exc_class);
+                       exc_token_index = mono_metadata_token_index (exc_class->type_token);
                        throw_ip = cfg->native_code + patch_info->ip.i;
 
+                       ia64_begin_bundle (code);
+
                        ia64_patch (cfg->native_code + patch_info->ip.i, code.buf);
 
-                       ia64_movl (code, cfg->arch.reg_out0 + 0, exc_class->type_token);
+                       /* Find a throw sequence for the same exception class */
+                       for (i = 0; i < nthrows; ++i)
+                               if (exc_classes [i] == exc_class)
+                                       break;
+
+                       if (i < nthrows) {
+                               gint64 offset = exc_throw_end [i] - 16 - throw_ip;
 
-                       ia64_begin_bundle (code);
+                               if (ia64_is_adds_imm (offset))
+                                       ia64_adds_imm (code, cfg->arch.reg_out0 + 1, offset, IA64_R0);
+                               else
+                                       ia64_movl (code, cfg->arch.reg_out0 + 1, offset);
 
-                       patch_info->data.name = "mono_arch_throw_corlib_exception";
-                       patch_info->type = MONO_PATCH_INFO_INTERNAL_METHOD;
-                       patch_info->ip.i = code.buf - cfg->native_code;
+                               buf = code.buf + code.nins;
+                               ia64_br_cond_pred (code, 0, 0);
+                               ia64_begin_bundle (code);
+                               ia64_patch (buf, exc_throw_start [i]);
 
-                       /* Indirect call */
-                       ia64_movl (code, GP_SCRATCH_REG, 0);
-                       ia64_ld8_inc_imm (code, GP_SCRATCH_REG2, GP_SCRATCH_REG, 8);
-                       ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG2);
-                       ia64_ld8 (code, IA64_GP, GP_SCRATCH_REG);
+                               patch_info->type = MONO_PATCH_INFO_NONE;
+                       }
+                       else {
+                               /* Arg1 (reg_out0 + 1): throw offset; placeholder 0 here, patched below */
+                               buf = code.buf;
+                               ia64_movl (code, cfg->arch.reg_out0 + 1, 0);
 
-                       /* Compute the offset */
-                       buf = code.buf + 32;
-                       ia64_movl (code, cfg->arch.reg_out0 + 1, buf - throw_ip);
+                               ia64_begin_bundle (code);
 
-                       ia64_br_call_reg (code, IA64_B0, IA64_B6);
+                               if (nthrows < 16) {
+                                       exc_classes [nthrows] = exc_class;
+                                       exc_throw_start [nthrows] = code.buf;
+                               }
+
+                               /* Arg2 (reg_out0 + 0): token index of the exception class */
+                               if (ia64_is_adds_imm (exc_token_index))
+                                       ia64_adds_imm (code, cfg->arch.reg_out0 + 0, exc_token_index, IA64_R0);
+                               else
+                                       ia64_movl (code, cfg->arch.reg_out0 + 0, exc_token_index);
+
+                               patch_info->data.name = "mono_arch_throw_corlib_exception";
+                               patch_info->type = MONO_PATCH_INFO_INTERNAL_METHOD;
+                               patch_info->ip.i = code.buf + code.nins - cfg->native_code;
+
+                               /* Indirect call */
+                               ia64_movl (code, GP_SCRATCH_REG, 0);
+                               ia64_ld8_inc_imm (code, GP_SCRATCH_REG2, GP_SCRATCH_REG, 8);
+                               ia64_mov_to_br (code, IA64_B6, GP_SCRATCH_REG2);
+                               ia64_ld8 (code, IA64_GP, GP_SCRATCH_REG);
+
+                               ia64_br_call_reg (code, IA64_B0, IA64_B6);
+
+                               /* Patch up the throw offset */
+                               ia64_begin_bundle (code);
+
+                               ia64_patch (buf, (gpointer)(code.buf - 16 - throw_ip));
+
+                               if (nthrows < 16) {
+                                       exc_throw_end [nthrows] = code.buf;
+                                       nthrows ++;
+                               }
+                       }
 
                        empty = FALSE;
                        break;
@@ -3589,6 +4376,10 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
 
        ia64_codegen_close (code);
 
+       /* FIXME: */
+       //r_exceptions = mono_ia64_create_unwind_region (&code);
+       //cfg->arch.r_epilog = r_exceptions;
+
        cfg->code_len = code.buf - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
@@ -3617,48 +4408,52 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
 
                stack_area = ALIGN_TO (n * 8, 16);
 
-               ia64_movl (code, GP_SCRATCH_REG, stack_area);
+               if (n) {
+                       ia64_movl (code, GP_SCRATCH_REG, stack_area);
 
-               ia64_sub (code, IA64_SP, IA64_SP, GP_SCRATCH_REG);
+                       ia64_sub (code, IA64_SP, IA64_SP, GP_SCRATCH_REG);
 
-               /* FIXME: Allocate out registers */
+                       /* FIXME: Allocate out registers */
 
-               ia64_mov (code, cfg->arch.reg_out0 + 1, IA64_SP);
+                       ia64_mov (code, cfg->arch.reg_out0 + 1, IA64_SP);
 
-               /* Required by the ABI */
-               ia64_adds_imm (code, IA64_SP, -16, IA64_SP);
+                       /* Required by the ABI */
+                       ia64_adds_imm (code, IA64_SP, -16, IA64_SP);
 
-               mono_add_patch_info (cfg, code.buf - cfg->native_code, MONO_PATCH_INFO_METHODCONST, cfg->method);
-               ia64_movl (code, cfg->arch.reg_out0 + 0, 0);
+                       add_patch_info (cfg, code, MONO_PATCH_INFO_METHODCONST, cfg->method);
+                       ia64_movl (code, cfg->arch.reg_out0 + 0, 0);
 
-               /* Save arguments to the stack */
-               for (i = 0; i < n; ++i) {
-                       ins = cfg->varinfo [i];
+                       /* Save arguments to the stack */
+                       for (i = 0; i < n; ++i) {
+                               ins = cfg->varinfo [i];
 
-                       if (ins->opcode == OP_REGVAR) {
-                               ia64_movl (code, GP_SCRATCH_REG, (i * 8));
-                               ia64_add (code, GP_SCRATCH_REG, cfg->arch.reg_out0 + 1, GP_SCRATCH_REG);
-                               ia64_st8 (code, GP_SCRATCH_REG, ins->dreg);
-                       }
-                       else {
-                               ia64_movl (code, GP_SCRATCH_REG, ins->inst_offset);
-                               ia64_add (code, GP_SCRATCH_REG, ins->inst_basereg, GP_SCRATCH_REG);
-                               ia64_ld8 (code, GP_SCRATCH_REG2, GP_SCRATCH_REG);
-                               ia64_movl (code, GP_SCRATCH_REG, (i * 8));                              
-                               ia64_add (code, GP_SCRATCH_REG, cfg->arch.reg_out0 + 1, GP_SCRATCH_REG);
-                               ia64_st8 (code, GP_SCRATCH_REG, GP_SCRATCH_REG2);
+                               if (ins->opcode == OP_REGVAR) {
+                                       ia64_movl (code, GP_SCRATCH_REG, (i * 8));
+                                       ia64_add (code, GP_SCRATCH_REG, cfg->arch.reg_out0 + 1, GP_SCRATCH_REG);
+                                       ia64_st8 (code, GP_SCRATCH_REG, ins->dreg);
+                               }
+                               else {
+                                       ia64_movl (code, GP_SCRATCH_REG, ins->inst_offset);
+                                       ia64_add (code, GP_SCRATCH_REG, ins->inst_basereg, GP_SCRATCH_REG);
+                                       ia64_ld8 (code, GP_SCRATCH_REG2, GP_SCRATCH_REG);
+                                       ia64_movl (code, GP_SCRATCH_REG, (i * 8));                              
+                                       ia64_add (code, GP_SCRATCH_REG, cfg->arch.reg_out0 + 1, GP_SCRATCH_REG);
+                                       ia64_st8 (code, GP_SCRATCH_REG, GP_SCRATCH_REG2);
+                               }
                        }
                }
+               else
+                       ia64_mov (code, cfg->arch.reg_out0 + 1, IA64_R0);
        }
        else
                ia64_mov (code, cfg->arch.reg_out0 + 1, IA64_R0);
 
-       mono_add_patch_info (cfg, code.buf - cfg->native_code, MONO_PATCH_INFO_METHODCONST, cfg->method);
+       add_patch_info (cfg, code, MONO_PATCH_INFO_METHODCONST, cfg->method);
        ia64_movl (code, cfg->arch.reg_out0 + 0, 0);
 
        code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func);
 
-       if (enable_arguments) {
+       if (enable_arguments && stack_area) {
                ia64_movl (code, GP_SCRATCH_REG, stack_area);
 
                ia64_add (code, IA64_SP, IA64_SP, GP_SCRATCH_REG);
@@ -3711,7 +4506,7 @@ mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean ena
 
        g_free (cinfo);
 
-       mono_add_patch_info (cfg, code.buf - cfg->native_code, MONO_PATCH_INFO_METHODCONST, method);
+       add_patch_info (cfg, code, MONO_PATCH_INFO_METHODCONST, method);
        ia64_movl (code, cfg->arch.reg_out0 + 0, 0);
        code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func);
 
@@ -3755,6 +4550,18 @@ mono_arch_save_unwind_info (MonoCompile *cfg)
        di->u.pi.regions = cfg->arch.r_pro;
 
        _U_dyn_register (di);
+
+       /*
+       {
+               unw_dyn_region_info_t *region = di->u.pi.regions;
+
+               printf ("Unwind info for method %s:\n", mono_method_full_name (cfg->method, TRUE));
+               while (region) {
+                       printf ("    [Region: %d]\n", region->insn_count);
+                       region = region->next;
+               }
+       }
+       */
 }
 
 void
@@ -3763,10 +4570,19 @@ mono_arch_flush_icache (guint8 *code, gint size)
        guint8* p = (guint8*)((guint64)code & ~(0x3f));
        guint8* end = (guint8*)((guint64)code + size);
 
+#ifdef __INTEL_COMPILER
+       /* icc doesn't define an fc.i intrinsic, but fc==fc.i on itanium 2 */
+       while (p < end) {
+               __fc ((guint64)p);
+               p += 32;
+       }
+#else
        while (p < end) {
                __asm__ __volatile__ ("fc.i %0"::"r"(p));
+               /* FIXME: This could be increased to 128 on some cpus */
                p += 32;
        }
+#endif
 }
 
 void
@@ -3805,26 +4621,30 @@ mono_arch_get_patch_offset (guint8 *code)
 gpointer*
 mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
 {
-       guint8 *bundle1 = code - 48;
-       guint8 *bundle2 = code - 32;
-       guint8 *bundle3 = code - 16;
-       guint64 ins11 = ia64_bundle_ins1 (bundle1);
-       guint64 ins12 = ia64_bundle_ins2 (bundle1);
-       guint64 ins13 = ia64_bundle_ins3 (bundle1);
+       guint8 *bundle2 = code - 48;
+       guint8 *bundle3 = code - 32;
+       guint8 *bundle4 = code - 16;
        guint64 ins21 = ia64_bundle_ins1 (bundle2);
        guint64 ins22 = ia64_bundle_ins2 (bundle2);
        guint64 ins23 = ia64_bundle_ins3 (bundle2);
        guint64 ins31 = ia64_bundle_ins1 (bundle3);
        guint64 ins32 = ia64_bundle_ins2 (bundle3);
        guint64 ins33 = ia64_bundle_ins3 (bundle3);
+       guint64 ins41 = ia64_bundle_ins1 (bundle4);
+       guint64 ins42 = ia64_bundle_ins2 (bundle4);
+       guint64 ins43 = ia64_bundle_ins3 (bundle4);
        int reg;
 
        /* 
         * Virtual calls are made with:
-        * [MII]       nop.m 0x0
-        *             mov.sptk b6=r8,0x2000000000f32a80
+        *
+        * [MII]       ld8 r31=[r8]
+        *             nop.i 0x0
         *             nop.i 0x0;;
         * [MII]       nop.m 0x0
+        *             mov.sptk b6=r31,0x2000000000f32a80
+        *             nop.i 0x0
+        * [MII]       nop.m 0x0
         *             nop.i 0x123456
         *             nop.i 0x0
         * [MIB]       nop.m 0x0
@@ -3832,23 +4652,24 @@ mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
         *             br.call.sptk.few b0=b6;;
         */
 
-       if ((ia64_bundle_template (bundle2) == IA64_TEMPLATE_MIIS) &&
-               (ia64_bundle_template (bundle3) == IA64_TEMPLATE_MIBS) &&
-               (ins21 == IA64_NOP_M) && 
-               (ia64_ins_opcode (ins22) == 0) && (ia64_ins_x3 (ins22) == 0) && (ia64_ins_x6 (ins22) == 0x1) && (ia64_ins_y (ins22) == 0) &&
-               (ins23 == IA64_NOP_I) &&
-               (ins31 == IA64_NOP_M) &&
-               (ins32 == IA64_NOP_I) &&
-               (ia64_ins_opcode (ins33) == 1) && (ia64_ins_b1 (ins33) == 0) && (ia64_ins_b2 (ins33) == 6) &&
-               ((ins22 >> 6) & 0xfffff) == 0x12345) {
-               g_assert (ins11 == IA64_NOP_M);
-               g_assert (ins13 == IA64_NOP_I);
-               g_assert (ia64_ins_opcode (ins12) == 0);
-               g_assert (ia64_ins_x3 (ins12) == 7);
-               g_assert (ia64_ins_x (ins12) == 0);
-               g_assert (ia64_ins_b1 (ins12) == IA64_B6);
-
-               reg = ia64_ins_r2 (ins12);
+       if (((ia64_bundle_template (bundle3) == IA64_TEMPLATE_MII) ||
+                (ia64_bundle_template (bundle3) == IA64_TEMPLATE_MIIS)) &&
+               (ia64_bundle_template (bundle4) == IA64_TEMPLATE_MIBS) &&
+               (ins31 == IA64_NOP_M) && 
+               (ia64_ins_opcode (ins32) == 0) && (ia64_ins_x3 (ins32) == 0) && (ia64_ins_x6 (ins32) == 0x1) && (ia64_ins_y (ins32) == 0) &&
+               (ins33 == IA64_NOP_I) &&
+               (ins41 == IA64_NOP_M) &&
+               (ins42 == IA64_NOP_I) &&
+               (ia64_ins_opcode (ins43) == 1) && (ia64_ins_b1 (ins43) == 0) && (ia64_ins_b2 (ins43) == 6) &&
+               ((ins32 >> 6) & 0xfffff) == 0x12345) {
+               g_assert (ins21 == IA64_NOP_M);
+               g_assert (ins23 == IA64_NOP_I);
+               g_assert (ia64_ins_opcode (ins22) == 0);
+               g_assert (ia64_ins_x3 (ins22) == 7);
+               g_assert (ia64_ins_x (ins22) == 0);
+               g_assert (ia64_ins_b1 (ins22) == IA64_B6);
+
+               reg = IA64_R8;
 
                /* 
                 * Must be a scratch register, since only those are saved by the trampoline
@@ -3873,16 +4694,6 @@ mono_arch_get_delegate_method_ptr_addr (guint8* code, gpointer *regs)
 
 static gboolean tls_offset_inited = FALSE;
 
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-
-static void
-setup_stack (MonoJitTlsData *tls)
-{
-       NOT_IMPLEMENTED;
-}
-
-#endif
-
 void
 mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
 {
@@ -3890,29 +4701,13 @@ mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
                tls_offset_inited = TRUE;
 
                appdomain_tls_offset = mono_domain_get_tls_offset ();
-               lmf_tls_offset = mono_get_lmf_tls_offset ();
                thread_tls_offset = mono_thread_get_tls_offset ();
        }               
-
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-       setup_stack (tls);
-#endif
 }
 
 void
 mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
 {
-#ifdef MONO_ARCH_SIGSEGV_ON_ALTSTACK
-       struct sigaltstack sa;
-
-       sa.ss_sp = tls->signal_stack;
-       sa.ss_size = SIGNAL_STACK_SIZE;
-       sa.ss_flags = SS_DISABLE;
-       sigaltstack  (&sa, NULL);
-
-       if (tls->signal_stack)
-               munmap (tls->signal_stack, SIGNAL_STACK_SIZE);
-#endif
 }
 
 void
@@ -3942,7 +4737,7 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
                        vtarg->dreg = mono_regstate_next_int (cfg->rs);
                        mono_bblock_add_inst (cfg->cbb, vtarg);
 
-                       mono_call_inst_add_outarg_reg (call, vtarg->dreg, out_reg, FALSE);
+                       mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, out_reg, FALSE);
 
                        out_reg ++;
                }
@@ -3959,7 +4754,7 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
                this->dreg = mono_regstate_next_int (cfg->rs);
                mono_bblock_add_inst (cfg->cbb, this);
 
-               mono_call_inst_add_outarg_reg (call, this->dreg, out_reg, FALSE);
+               mono_call_inst_add_outarg_reg (cfg, call, this->dreg, out_reg, FALSE);
        }
 }
 
@@ -3968,89 +4763,37 @@ mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethod
 {
        MonoInst *ins = NULL;
 
-       /* FIXME: */
-       return NULL;
-
-       if (cmethod->klass == mono_defaults.math_class) {
-               if (strcmp (cmethod->name, "Sin") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SIN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Cos") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_COS);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Tan") == 0) {
-                               return ins;
-                       MONO_INST_NEW (cfg, ins, OP_TAN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Atan") == 0) {
-                               return ins;
-                       MONO_INST_NEW (cfg, ins, OP_ATAN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Sqrt") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SQRT);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
-                       MONO_INST_NEW (cfg, ins, OP_ABS);
-                       ins->inst_i0 = args [0];
-               }
-#if 0
-               /* OP_FREM is not IEEE compatible */
-               else if (strcmp (cmethod->name, "IEEERemainder") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_FREM);
-                       ins->inst_i0 = args [0];
-                       ins->inst_i1 = args [1];
-               }
-#endif
+       if (cmethod->klass == mono_defaults.thread_class &&
+               strcmp (cmethod->name, "MemoryBarrier") == 0) {
+               MONO_INST_NEW (cfg, ins, OP_MEMORY_BARRIER);
        } else if(cmethod->klass->image == mono_defaults.corlib &&
                           (strcmp (cmethod->klass->name_space, "System.Threading") == 0) &&
                           (strcmp (cmethod->klass->name, "Interlocked") == 0)) {
 
                if (strcmp (cmethod->name, "Increment") == 0) {
-                       MonoInst *ins_iconst;
                        guint32 opcode;
 
                        if (fsig->params [0]->type == MONO_TYPE_I4)
-                               opcode = OP_ATOMIC_ADD_NEW_I4;
+                               opcode = OP_ATOMIC_ADD_IMM_NEW_I4;
                        else if (fsig->params [0]->type == MONO_TYPE_I8)
-                               opcode = OP_ATOMIC_ADD_NEW_I8;
+                               opcode = OP_ATOMIC_ADD_IMM_NEW_I8;
                        else
                                g_assert_not_reached ();
                        MONO_INST_NEW (cfg, ins, opcode);
-                       MONO_INST_NEW (cfg, ins_iconst, OP_ICONST);
-                       ins_iconst->inst_c0 = 1;
-
+                       ins->inst_imm = 1;
                        ins->inst_i0 = args [0];
-                       ins->inst_i1 = ins_iconst;
                } else if (strcmp (cmethod->name, "Decrement") == 0) {
-                       MonoInst *ins_iconst;
                        guint32 opcode;
 
                        if (fsig->params [0]->type == MONO_TYPE_I4)
-                               opcode = OP_ATOMIC_ADD_NEW_I4;
+                               opcode = OP_ATOMIC_ADD_IMM_NEW_I4;
                        else if (fsig->params [0]->type == MONO_TYPE_I8)
-                               opcode = OP_ATOMIC_ADD_NEW_I8;
+                               opcode = OP_ATOMIC_ADD_IMM_NEW_I8;
                        else
                                g_assert_not_reached ();
                        MONO_INST_NEW (cfg, ins, opcode);
-                       MONO_INST_NEW (cfg, ins_iconst, OP_ICONST);
-                       ins_iconst->inst_c0 = -1;
-
+                       ins->inst_imm = -1;
                        ins->inst_i0 = args [0];
-                       ins->inst_i1 = ins_iconst;
-               } else if (strcmp (cmethod->name, "Add") == 0) {
-                       guint32 opcode;
-
-                       if (fsig->params [0]->type == MONO_TYPE_I4)
-                               opcode = OP_ATOMIC_ADD_I4;
-                       else if (fsig->params [0]->type == MONO_TYPE_I8)
-                               opcode = OP_ATOMIC_ADD_I8;
-                       else
-                               g_assert_not_reached ();
-                       
-                       MONO_INST_NEW (cfg, ins, opcode);
-
-                       ins->inst_i0 = args [0];
-                       ins->inst_i1 = args [1];
                } else if (strcmp (cmethod->name, "Exchange") == 0) {
                        guint32 opcode;
 
@@ -4065,6 +4808,20 @@ mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethod
 
                        MONO_INST_NEW (cfg, ins, opcode);
 
+                       ins->inst_i0 = args [0];
+                       ins->inst_i1 = args [1];
+               } else if (strcmp (cmethod->name, "Add") == 0) {
+                       guint32 opcode;
+
+                       if (fsig->params [0]->type == MONO_TYPE_I4)
+                               opcode = OP_ATOMIC_ADD_NEW_I4;
+                       else if (fsig->params [0]->type == MONO_TYPE_I8)
+                               opcode = OP_ATOMIC_ADD_NEW_I8;
+                       else
+                               g_assert_not_reached ();
+                       
+                       MONO_INST_NEW (cfg, ins, opcode);
+
                        ins->inst_i0 = args [0];
                        ins->inst_i1 = args [1];
                } else if (strcmp (cmethod->name, "Read") == 0 && (fsig->params [0]->type == MONO_TYPE_I8)) {
@@ -4072,11 +4829,6 @@ mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethod
                        MONO_INST_NEW (cfg, ins, CEE_LDIND_I8);
                        ins->inst_i0 = args [0];
                }
-
-               /* 
-                * Can't implement CompareExchange methods this way since they have
-                * three arguments.
-                */
        }
 
        return ins;