+# Indirect load through a variable that lives in a register (OP_REGVAR):
+# no memory access is needed, the value is already in the variable's
+# register, so the rule costs "0" and emits no instruction.
+reg: CEE_LDIND_REF (OP_REGVAR),
+reg: CEE_LDIND_I (OP_REGVAR),
+reg: CEE_LDIND_I8 (OP_REGVAR),
+reg: CEE_LDIND_I4 (OP_REGVAR),
+reg: CEE_LDIND_U4 (OP_REGVAR) "0" {
+ /* Forward the register variable's dreg as this node's result. */
+ state->reg1 = state->left->tree->dreg;
+ tree->dreg = state->reg1;
+}
+
+# Atomic add with a constant addend: lower to the IA64 fetchadd
+# instruction's immediate form. The cost clause below restricts the
+# match to the increments fetchadd can encode.
+reg: OP_ATOMIC_ADD_NEW_I4 (base, OP_ICONST),
+reg: OP_ATOMIC_ADD_NEW_I8 (base, OP_ICONST) {
+ /* Rewrite the node in place to the 4- or 8-byte immediate fetchadd. */
+ tree->opcode = tree->opcode == OP_ATOMIC_ADD_NEW_I4 ? OP_IA64_FETCHADD4_IMM : OP_IA64_FETCHADD8_IMM;
+ tree->dreg = state->reg1;
+ tree->inst_imm = state->right->tree->inst_imm;
+ tree->inst_basereg = state->left->tree->inst_basereg;
+ tree->inst_offset = state->left->tree->inst_offset;
+
+ mono_bblock_add_inst (s->cbb, tree);
+} cost {
+ int imm = state->right->tree->inst_imm;
+
+ /* fetchadd only accepts these immediates; reject anything else so the
+  * generic (base, reg) rule below handles it instead. */
+ MBCOND (imm == 1 || imm == 4 || imm == 8 || imm == 16 || imm == -1 || imm == -4 || imm == -8 || imm == -16);
+ return 1;
+}
+
+reg: OP_ATOMIC_EXCHANGE_I4 (base, reg),
+reg: OP_ATOMIC_EXCHANGE_I8 (base, reg),
+reg: OP_ATOMIC_ADD_NEW_I4 (base, reg),
+reg: OP_ATOMIC_ADD_NEW_I8 (base, reg) {
+ tree->opcode = tree->opcode;
+ tree->dreg = state->reg1;
+ tree->sreg2 = state->right->reg1;
+ tree->inst_basereg = state->left->tree->inst_basereg;
+ tree->inst_offset = state->left->tree->inst_offset;
+
+ mono_bblock_add_inst (s->cbb, tree);
+}
+
+# Optimized memset implementation: expands OP_MEMSET inline as a series
+# of stores, using the widest aligned store size first and IA64
+# post-increment addressing to advance the destination pointer.
+stmt: OP_MEMSET (base) "0" {
+ int dest_reg, dest_reg2, val_reg, unit, align;
+ int size = tree->unused;	/* byte count to clear, stashed in 'unused' */
+
+ dest_reg = mono_regstate_next_int (s->rs);
+
+ if (state->left->tree->inst_basereg == s->frame_reg)
+ /* Aligned by mono_allocate_stack_slots */
+ align = 8;
+ else
+ align = 4;
+
+ if (tree->inst_imm == 0)
+ /* NOTE(review): presumably IA64_R0 is the hard-wired zero register,
+  * saving the constant load — confirm against the backend's register
+  * definitions. */
+ val_reg = IA64_R0;
+ else {
+ val_reg = mono_regstate_next_int (s->rs);
+
+ MONO_EMIT_NEW_ICONST (s, val_reg, tree->inst_imm);
+ }
+
+ /* Materialize the effective destination address base + offset. */
+ MONO_EMIT_NEW_BIALU_IMM (s, OP_ADD_IMM, dest_reg, state->left->tree->inst_basereg, state->left->tree->inst_offset);
+
+ /* Emit stores from the widest unit (align) down to single bytes. */
+ for (unit = align; unit >= 1; unit = unit >> 1) {
+ /* NOTE(review): dest_reg2 is allocated on every iteration even when
+  * the size >= 2*unit branch is not taken — wasteful but harmless. */
+ dest_reg2 = mono_regstate_next_int (s->rs);
+
+ /* Use two destination regs to increase paralellism */
+ if (size >= 2 * unit) {
+ /* Second pointer starts halfway (rounded to unit) into the run so the
+  * two store streams are independent. */
+ int diff = (size / (2 * unit)) * unit;
+ MONO_EMIT_NEW_BIALU_IMM (s, OP_ADD_IMM, dest_reg2, state->left->tree->inst_basereg, state->left->tree->inst_offset + diff);
+
+ while (size >= (2 * unit)) {
+ MONO_EMIT_NEW_STORE_MEMBASE (s, size_to_ia64_store_membase_inc_reg (unit), dest_reg, 0, val_reg);
+ MONO_EMIT_NEW_STORE_MEMBASE (s, size_to_ia64_store_membase_inc_reg (unit), dest_reg2, 0, val_reg);
+ size -= 2 * unit;
+ }
+
+ if (size > 0)
+ /* Skip dest_reg past the region dest_reg2 already filled. */
+ MONO_EMIT_NEW_BIALU_IMM (s, OP_ADD_IMM, dest_reg, dest_reg, diff);
+ }
+
+ while (size >= unit) {
+ if (size == unit)
+ /* NOTE(review): the final store uses the non-incrementing
+  * size_to_store_membase_reg variant — presumably because no later
+  * store needs the advanced pointer; confirm the asymmetry with
+  * size_to_ia64_store_membase_inc_reg is intentional. */
+ MONO_EMIT_NEW_STORE_MEMBASE (s, size_to_store_membase_reg (unit), dest_reg, 0, val_reg);
+ else
+ MONO_EMIT_NEW_STORE_MEMBASE (s, size_to_ia64_store_membase_inc_reg (unit), dest_reg, 0, val_reg);
+ size -= unit;
+ }
+ }
+
+}
+
+# Optimized memcpy implementation: expands OP_MEMCPY inline as paired
+# load/store sequences, using the widest aligned transfer size first and
+# IA64 post-increment addressing to advance both pointers.
+stmt: OP_MEMCPY (base, base) "0" {
+ int cur_reg, src_reg, dest_reg, unit;
+ int size = tree->unused;	/* byte count to copy, stashed in 'unused' */
+ int align;
+
+ src_reg = mono_regstate_next_int (s->rs);
+ dest_reg = mono_regstate_next_int (s->rs);
+
+ /* Only assume 8-byte alignment when BOTH operands are frame slots. */
+ if ((state->left->tree->inst_basereg == s->frame_reg) &&
+ (state->right->tree->inst_basereg == s->frame_reg))
+ /* Aligned by mono_allocate_stack_slots */
+ align = 8;
+ else
+ align = 4;
+
+ /* Materialize destination (left) and source (right) addresses. */
+ MONO_EMIT_NEW_BIALU_IMM (s, OP_ADD_IMM, dest_reg, state->left->tree->inst_basereg, state->left->tree->inst_offset);
+ MONO_EMIT_NEW_BIALU_IMM (s, OP_ADD_IMM, src_reg, state->right->tree->inst_basereg, state->right->tree->inst_offset);
+
+ /* Copy from the widest unit (align) down to single bytes; each
+  * load/store pair post-increments its pointer by 'unit'. */
+ for (unit = align; unit >= 1; unit = unit >> 1) {
+ while (size >= unit) {
+ /* Fresh temp per transfer lets the scheduler overlap the pairs. */
+ cur_reg = mono_regstate_next_int (s->rs);
+ MONO_EMIT_NEW_LOAD_MEMBASE_OP (s, size_to_ia64_load_u_membase_inc (unit), cur_reg, src_reg, 0);
+ MONO_EMIT_NEW_STORE_MEMBASE (s, size_to_ia64_store_membase_inc_reg (unit), dest_reg, 0, cur_reg);
+ size -= unit;
+ }
+ }
+}
+