+2008-10-09 Rodrigo Kumpera <rkumpera@novell.com>
+
+ * mini-ops.h: Added ops for packed and saturated math, shifts
+ and packing/unpacking.
+
+ * cpu-x86.md: Added descriptors for the above ops.
+
+	* mini-x86.c: Added code to emit the above ops.
+
+ * simd-intrinsics.c: Added support for Vector16u and Vector8u.
+
2008-10-08 Zoltan Varga <vargaz@gmail.com>
* aot-compiler.c (compile_method): Enable AOT for generic code.
rsqrtps: dest:x src1:x len:4
shuffleps: dest:x src1:x len:5
+paddb: dest:x src1:x src2:x len:4 clob:1
+paddw: dest:x src1:x src2:x len:4 clob:1
+paddd: dest:x src1:x src2:x len:4 clob:1
+
+psubb: dest:x src1:x src2:x len:4 clob:1
+psubw: dest:x src1:x src2:x len:4 clob:1
+psubd: dest:x src1:x src2:x len:4 clob:1
+
+unpack_lowb: dest:x src1:x src2:x len:4 clob:1
+unpack_loww: dest:x src1:x src2:x len:4 clob:1
+unpack_lowd: dest:x src1:x src2:x len:4 clob:1
+unpack_lowq: dest:x src1:x src2:x len:4 clob:1
+
+unpack_highb: dest:x src1:x src2:x len:4 clob:1
+unpack_highw: dest:x src1:x src2:x len:4 clob:1
+unpack_highd: dest:x src1:x src2:x len:4 clob:1
+unpack_highq: dest:x src1:x src2:x len:4 clob:1
+
+paddb_sat: dest:x src1:x src2:x len:4 clob:1
+paddb_sat_un: dest:x src1:x src2:x len:4 clob:1
+
+paddw_sat: dest:x src1:x src2:x len:4 clob:1
+paddw_sat_un: dest:x src1:x src2:x len:4 clob:1
+
+psubb_sat: dest:x src1:x src2:x len:4 clob:1
+psubb_sat_un: dest:x src1:x src2:x len:4 clob:1
+
+psubw_sat: dest:x src1:x src2:x len:4 clob:1
+psubw_sat_un: dest:x src1:x src2:x len:4 clob:1
+
+pmulw: dest:x src1:x src2:x len:4 clob:1
+pmuld: dest:x src1:x src2:x len:4 clob:1
+
+pshrw: dest:x src1:x len:8 clob:1
+pshrw_reg: dest:x src1:x src2:x len:8 clob:1
+
+psarw: dest:x src1:x len:8 clob:1
+psarw_reg: dest:x src1:x src2:x len:8 clob:1
+
+pshlw: dest:x src1:x len:8 clob:1
+pshlw_reg: dest:x src1:x src2:x len:8 clob:1
+
xmove: dest:x src1:x len:4
xzero: dest:x len:4
+iconv_to_x: dest:x src1:i len:4
extract_i4: dest:i src1:x len:4
iconv_to_r8_raw: dest:f src1:i len:17
MINI_OP(OP_POR, "por", XREG, XREG, XREG)
MINI_OP(OP_PXOR, "pxor", XREG, XREG, XREG)
+MINI_OP(OP_PADDB, "paddb", XREG, XREG, XREG)
+MINI_OP(OP_PADDW, "paddw", XREG, XREG, XREG)
+MINI_OP(OP_PADDD, "paddd", XREG, XREG, XREG)
+
+MINI_OP(OP_PSUBB, "psubb", XREG, XREG, XREG)
+MINI_OP(OP_PSUBW, "psubw", XREG, XREG, XREG)
+MINI_OP(OP_PSUBD, "psubd", XREG, XREG, XREG)
+
+MINI_OP(OP_UNPACK_LOWB, "unpack_lowb", XREG, XREG, XREG)
+MINI_OP(OP_UNPACK_LOWW, "unpack_loww", XREG, XREG, XREG)
+MINI_OP(OP_UNPACK_LOWD, "unpack_lowd", XREG, XREG, XREG)
+MINI_OP(OP_UNPACK_LOWQ, "unpack_lowq", XREG, XREG, XREG)
+
+MINI_OP(OP_UNPACK_HIGHB, "unpack_highb", XREG, XREG, XREG)
+MINI_OP(OP_UNPACK_HIGHW, "unpack_highw", XREG, XREG, XREG)
+MINI_OP(OP_UNPACK_HIGHD, "unpack_highd", XREG, XREG, XREG)
+MINI_OP(OP_UNPACK_HIGHQ, "unpack_highq", XREG, XREG, XREG)
+
+MINI_OP(OP_PADDB_SAT, "paddb_sat", XREG, XREG, XREG)
+MINI_OP(OP_PADDB_SAT_UN, "paddb_sat_un", XREG, XREG, XREG)
+
+MINI_OP(OP_PADDW_SAT, "paddw_sat", XREG, XREG, XREG)
+MINI_OP(OP_PADDW_SAT_UN, "paddw_sat_un", XREG, XREG, XREG)
+
+MINI_OP(OP_PSUBB_SAT, "psubb_sat", XREG, XREG, XREG)
+MINI_OP(OP_PSUBB_SAT_UN, "psubb_sat_un", XREG, XREG, XREG)
+
+MINI_OP(OP_PSUBW_SAT, "psubw_sat", XREG, XREG, XREG)
+MINI_OP(OP_PSUBW_SAT_UN, "psubw_sat_un", XREG, XREG, XREG)
+
+MINI_OP(OP_PMULW, "pmulw", XREG, XREG, XREG)
+MINI_OP(OP_PMULD, "pmuld", XREG, XREG, XREG)
+
+/*SSE2 Shift ops must have the _reg version right after as code depends on this ordering.*/
+MINI_OP(OP_PSHRW, "pshrw", XREG, XREG, NONE)
+MINI_OP(OP_PSHRW_REG, "pshrw_reg", XREG, XREG, XREG)
+
+MINI_OP(OP_PSARW, "psarw", XREG, XREG, NONE)
+MINI_OP(OP_PSARW_REG, "psarw_reg", XREG, XREG, XREG)
+
+MINI_OP(OP_PSHLW, "pshlw", XREG, XREG, NONE)
+MINI_OP(OP_PSHLW_REG, "pshlw_reg", XREG, XREG, XREG)
+
MINI_OP(OP_EXTRACT_I4, "extract_i4", IREG, XREG, NONE)
MINI_OP(OP_ICONV_TO_R8_RAW, "iconv_to_r8_raw", FREG, IREG, NONE)
MINI_OP(OP_LOADX_R4, "loadx_r4", FREG, IREG, NONE)
MINI_OP(OP_FCONV_TO_R8_X, "fconv_to_r8_x", XREG, FREG, NONE)
MINI_OP(OP_XCONV_R8_TO_I4, "xconv_r8_to_i4", IREG, XREG, NONE)
+MINI_OP(OP_ICONV_TO_X, "iconv_to_x", XREG, IREG, NONE)
#endif
case OP_PXOR:
x86_sse_alu_pd_reg_reg (code, X86_SSE_PXOR, ins->sreg1, ins->sreg2);
break;
+
+ case OP_PADDB:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDB, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PADDW:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDW, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PADDD:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDD, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PSUBB:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBB, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBW:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBW, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBD:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBD, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_UNPACK_LOWB:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLBW, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_LOWW:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLWD, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_LOWD:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLDQ, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_UNPACK_HIGHB:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHBW, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_HIGHW:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHWD, ins->sreg1, ins->sreg2);
+ break;
+ case OP_UNPACK_HIGHD:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHDQ, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PADDB_SAT_UN:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDUSB, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBB_SAT_UN:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBUSB, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PADDW_SAT_UN:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDUSW, ins->sreg1, ins->sreg2);
+ break;
+ case OP_PSUBW_SAT_UN:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBUSW, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PMULW:
+ x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULLW, ins->sreg1, ins->sreg2);
+ break;
+
+ case OP_PSHRW:
+ x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SHR, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSHRW_REG:
+ x86_sse_shift_reg_reg (code, X86_SSE_PSRLW_REG, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_PSARW:
+ x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SAR, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSARW_REG:
+ x86_sse_shift_reg_reg (code, X86_SSE_PSRAW_REG, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_PSHLW:
+ x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SHL, ins->dreg, ins->inst_imm);
+ break;
+ case OP_PSHLW_REG:
+ x86_sse_shift_reg_reg (code, X86_SSE_PSLLW_REG, ins->dreg, ins->sreg2);
+ break;
+
+ case OP_ICONV_TO_X:
+ x86_movd_xreg_reg (code, ins->dreg, ins->sreg1);
+ break;
case OP_EXTRACT_I4:
x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
break;
TODO add stuff to man pages
TODO document this under /docs
TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
-TODO revant the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw.
+TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw.
TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
+TODO passing simd args byval to a non-intrinsic method causes some useless local var loads/stores to happen.
General notes for SIMD intrinsics.
SIMD_EMIT_CTOR,
SIMD_EMIT_CAST,
SIMD_EMIT_SHUFFLE,
+ SIMD_EMIT_SHIFT,
SIMD_EMIT_LOAD_ALIGNED,
SIMD_EMIT_STORE_ALIGNED
};
/*This is the size of the largest method name + 1 (to fit the ending \0). Align to 4 as well.*/
-#define SIMD_INTRINSIC_NAME_MAX 16
+#define SIMD_INTRINSIC_NAME_MAX 22
typedef struct {
const char name[SIMD_INTRINSIC_NAME_MAX];
guint8 flags;
} SimdIntrinsc;
+/*
+Missing:
+setters
+ */
static const SimdIntrinsc vector4f_intrinsics[] = {
{ ".ctor", 0, SIMD_EMIT_CTOR },
{ "AddSub", OP_ADDSUBPS, SIMD_EMIT_BINARY_SSE3 },
{ "op_Subtraction", OP_SUBPS, SIMD_EMIT_BINARY },
};
+/*
+Missing:
+A lot, revisit Vector4u.
+ */
static const SimdIntrinsc vector4u_intrinsics[] = {
{ "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
{ "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
{ "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
};
+/*
+Missing:
+.ctor
+getters
+setters
+ */
+/* Intrinsic table for Vector8u (8 x 16-bit unsigned): maps managed method/operator
+   names to JIT opcodes and an emit strategy (SIMD_EMIT_*).
+   NOTE(review): entries appear to be kept sorted by name — presumably required by
+   the name-based lookup (see simd_intrinsic_compare_by_name); confirm before
+   inserting new entries out of order. */
+static const SimdIntrinsc vector8u_intrinsics[] = {
+	{ "AddWithSaturation", OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
+	{ "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
+	{ "ShiftRightArithmethic", OP_PSARW, SIMD_EMIT_SHIFT },
+	{ "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
+	{ "SubWithSaturation", OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
+	{ "UnpackHigh", OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
+	{ "UnpackLow", OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
+	{ "op_Addition", OP_PADDW, SIMD_EMIT_BINARY },
+	{ "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
+	{ "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
+	{ "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
+	{ "op_Explicit", 0, SIMD_EMIT_CAST },
+	{ "op_LeftShift", OP_PSHLW, SIMD_EMIT_SHIFT },
+	{ "op_Multiply", OP_PMULW, SIMD_EMIT_BINARY },
+	{ "op_RightShift", OP_PSHRW, SIMD_EMIT_SHIFT },
+	{ "op_Subtraction", OP_PSUBW, SIMD_EMIT_BINARY },
+};
+
+/*
+Missing:
+.ctor
+getters
+setters
+ */
+/* Intrinsic table for Vector16u (16 x 8-bit unsigned): maps managed method/operator
+   names to JIT opcodes and an emit strategy (SIMD_EMIT_*). No shift entries:
+   SSE2 has no packed byte shifts.
+   NOTE(review): entries appear to be kept sorted by name — presumably required by
+   the name-based lookup (see simd_intrinsic_compare_by_name); confirm before
+   inserting new entries out of order. */
+static const SimdIntrinsc vector16u_intrinsics[] = {
+	{ "AddWithSaturation", OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
+	{ "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
+	{ "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
+	{ "SubWithSaturation", OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
+	{ "UnpackHigh", OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
+	{ "UnpackLow", OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
+	{ "op_Addition", OP_PADDB, SIMD_EMIT_BINARY },
+	{ "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
+	{ "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
+	{ "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
+	{ "op_Explicit", 0, SIMD_EMIT_CAST },
+	{ "op_Subtraction", OP_PSUBB, SIMD_EMIT_BINARY },
+};
+
+
/*TODO match using number of parameters as well*/
static int
simd_intrinsic_compare_by_name (const void *key, const void *value)
}
}
- /*TODO stop here is no var is xzero only*/
+ /*TODO stop here if no var is xzero only*/
/*
Scan all other bb and check if it has only one other use
return ins;
}
+/*
+ * Emit IR for a SIMD shift intrinsic (op_LeftShift, op_RightShift, shift-right-arithmetic).
+ *
+ * args [0] is the vector operand, args [1] the shift amount.
+ * If the shift amount is a compile-time constant (OP_ICONST), the immediate form of
+ * the opcode is emitted and the now-unneeded constant instruction is nullified.
+ * Otherwise the amount is first moved into an xmm register with OP_ICONV_TO_X and
+ * the _reg variant of the opcode is used; that variant is obtained with opcode + 1,
+ * which relies on the _reg op being declared immediately after the immediate one
+ * (see the ordering note next to OP_PSHRW in mini-ops.h).
+ *
+ * Returns the newly added instruction producing the shifted vector.
+ */
+static MonoInst*
+simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
+{
+	MonoInst *ins;
+	int vreg, vreg2 = -1, opcode = intrinsic->opcode;
+
+	vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
+
+	if (args [1]->opcode != OP_ICONST) {
+		/* Variable amount: move it into an xmm reg and switch to the _reg opcode. */
+		MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
+		ins->klass = mono_defaults.int32_class;
+		ins->sreg1 = args [1]->dreg;
+		ins->type = STACK_I4;
+		ins->dreg = vreg2 = alloc_ireg (cfg);
+		MONO_ADD_INS (cfg->cbb, ins);
+
+		++opcode; /*The shift_reg version op is always +1 from the regular one.*/
+	}
+
+	MONO_INST_NEW (cfg, ins, opcode);
+	ins->klass = cmethod->klass;
+	ins->sreg1 = vreg;
+	ins->sreg2 = vreg2;
+
+	if (args [1]->opcode == OP_ICONST) {
+		/* Constant amount: encode it as an immediate and drop the ICONST. */
+		ins->inst_imm = args [1]->inst_c0;
+		NULLIFY_INS (args [1]);
+	}
+
+	ins->type = STACK_VTYPE;
+	ins->dreg = alloc_ireg (cfg);
+	MONO_ADD_INS (cfg->cbb, ins);
+	return ins;
+}
+
+
static MonoInst*
simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
case SIMD_EMIT_SHUFFLE:
return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
+ case SIMD_EMIT_SHIFT:
+ return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
case SIMD_EMIT_LOAD_ALIGNED:
return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
case SIMD_EMIT_STORE_ALIGNED:
return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
if (!strcmp ("Vector4u", cmethod->klass->name))
return emit_intrinsics (cfg, cmethod, fsig, args, vector4u_intrinsics, sizeof (vector4u_intrinsics) / sizeof (SimdIntrinsc));
+ if (!strcmp ("Vector8u", cmethod->klass->name))
+ return emit_intrinsics (cfg, cmethod, fsig, args, vector8u_intrinsics, sizeof (vector8u_intrinsics) / sizeof (SimdIntrinsc));
+ if (!strcmp ("Vector16u", cmethod->klass->name))
+ return emit_intrinsics (cfg, cmethod, fsig, args, vector16u_intrinsics, sizeof (vector16u_intrinsics) / sizeof (SimdIntrinsc));
return NULL;
}