2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
18 General notes on SIMD intrinsics
20 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
21 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
22 TODO extend op_to_op_dest_membase to handle simd ops
23 TODO add support for indexed versions of simd ops
24 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
25 TODO make sure locals, arguments and spills are properly aligned.
26 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
27 TODO add stuff to man pages
28 TODO document this under /docs
29 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
30 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw.
31 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
32 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
33 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
35 General notes for SIMD intrinsics.
37 -Bad extractor and constructor performance
38 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
39 It will be loaded in the FP stack just to be pushed on the call stack.
41 A similar thing happens with the Vector4f constructor, which requires float vars to be
43 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
44 trip to the FP stack is desirable.
46 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
50 -Promote OP_EXTRACT_I4 to a STORE op
51 The advantage of this change is that it could have a _membase version and promote further optimizations.
53 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
57 #ifdef MONO_ARCH_SIMD_INTRINSICS
/* Tracing helpers for this file: IS_DEBUG_ON gates on the compile's verbose
 * level (the commented-out variant disables tracing entirely); DEBUG wraps a
 * statement so it only executes when tracing is on. Both expect a `cfg` in
 * scope at the use site. */
59 //#define IS_DEBUG_ON(cfg) (0)
61 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
62 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
/* Dispatch modes stored in SimdIntrinsc.simd_emit_mode and switched on in
 * emit_intrinsics () below. NOTE(review): only a fragment of this enum is
 * visible in this chunk; the other SIMD_EMIT_* enumerators (CTOR, BINARY,
 * UNARY, GETTER, CAST, SHUFFLE, ...) are declared on lines not shown here. */
65 SIMD_EMIT_BINARY_SSE3,
71 SIMD_EMIT_LOAD_ALIGNED,
72 SIMD_EMIT_STORE_ALIGNED
75 /*This is the size of the largest method name + 1 (to fit the ending \0). Align to 4 as well.*/
76 #define SIMD_INTRINSIC_NAME_MAX 16
/* Managed method name this table entry matches; compared with a bounded
 * strncmp in simd_intrinsic_compare_by_name (). */
79 const char name[SIMD_INTRINSIC_NAME_MAX];
/* One of the SIMD_EMIT_* modes above; selects the emit helper in
 * emit_intrinsics (). NOTE(review): the `opcode` field read as
 * intrinsic->opcode by the emitters is declared on a line not visible here. */
81 guint8 simd_emit_mode;
/* Intrinsic table for Mono.Simd Vector4f. Entries MUST stay sorted by name:
 * emit_intrinsics () looks them up with bsearch (). The second field is
 * overloaded per mode: a mono opcode for unary/binary ops, a lane index
 * (0..3) for the get_X/get_Y/get_Z/get_W getters, and unused (0) otherwise.
 * NOTE(review): the closing `};` of this array is not visible in this chunk. */
85 static const SimdIntrinsc vector4f_intrinsics[] = {
86 { ".ctor", 0, SIMD_EMIT_CTOR },
87 { "AddSub", OP_ADDSUBPS, SIMD_EMIT_BINARY_SSE3 },
88 { "HorizontalAdd", OP_HADDPS, SIMD_EMIT_BINARY_SSE3 },
89 { "HorizontalSub", OP_HSUBPS, SIMD_EMIT_BINARY_SSE3 },
90 { "InvSqrt", OP_RSQRTPS, SIMD_EMIT_UNARY },
91 { "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
92 { "Max", OP_MAXPS, SIMD_EMIT_BINARY },
93 { "Min", OP_MINPS, SIMD_EMIT_BINARY },
94 { "Shuffle", 0, SIMD_EMIT_SHUFFLE },
95 { "Sqrt", OP_SQRTPS, SIMD_EMIT_UNARY },
96 { "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
97 { "get_W", 3, SIMD_EMIT_GETTER },
98 { "get_X", 0, SIMD_EMIT_GETTER },
99 { "get_Y", 1, SIMD_EMIT_GETTER },
100 { "get_Z", 2, SIMD_EMIT_GETTER },
101 { "op_Addition", OP_ADDPS, SIMD_EMIT_BINARY },
102 { "op_Division", OP_DIVPS, SIMD_EMIT_BINARY },
103 { "op_Explicit", 0, SIMD_EMIT_CAST },
104 { "op_Multiply", OP_MULPS, SIMD_EMIT_BINARY },
105 { "op_Subtraction", OP_SUBPS, SIMD_EMIT_BINARY },
/* Intrinsic table for Mono.Simd Vector4u — bitwise operators mapped onto the
 * packed integer logical opcodes. Must stay sorted by name for bsearch ().
 * NOTE(review): the closing `};` of this array is not visible in this chunk. */
108 static const SimdIntrinsc vector4u_intrinsics[] = {
109 { "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
110 { "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
111 { "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
114 /*TODO match using number of parameters as well*/
/* bsearch () comparator: `key` is the managed method name (a C string) and
 * `value` points at a SimdIntrinsc table entry. The compare is bounded by
 * SIMD_INTRINSIC_NAME_MAX, the declared size of the name field. */
116 simd_intrinsic_compare_by_name (const void *key, const void *value)
118 return strncmp(key, ((SimdIntrinsc *)value)->name, SIMD_INTRINSIC_NAME_MAX);
/* Per-vreg flags for mono_simd_simplify_indirection () below.
 * VREG_HAS_XZERO_BB0:   the vreg is defined by an OP_XZERO in the first bb.
 * VREG_HAS_OTHER_OP_BB0: some other instruction touches it in the first bb.
 * VREG_SINGLE_BB_USE / VREG_MANY_BB_USE: whether all later uses fall in a
 * single basic block. Presumably VREG_USED = 0x01 is declared on a line not
 * visible in this chunk (it is assigned in the pass below) — TODO confirm. */
123 VREG_HAS_XZERO_BB0 = 0x02,
124 VREG_HAS_OTHER_OP_BB0 = 0x04,
125 VREG_SINGLE_BB_USE = 0x08,
126 VREG_MANY_BB_USE = 0x10,
/* Returns dreg / sreg1 / sreg2 of `ins` for idx 0 / 1 / 2 respectively.
 * Callers below compare the result against -1, so the default case — not
 * visible in this chunk — presumably returns -1 for out-of-range idx. */
130 get_ins_reg_by_idx (MonoInst *ins, int idx)
133 case 0: return ins->dreg;
134 case 1: return ins->sreg1;
135 case 2: return ins->sreg2;
140 This pass recalculates which vars need MONO_INST_INDIRECT.
142 We cannot do this for non SIMD vars since code like mono_get_vtable_var
143 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
146 mono_simd_simplify_indirection (MonoCompile *cfg)
149 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Step 1: clear MONO_INST_INDIRECT on every SIMD-typed var and record the
 * highest vreg number so the flag arrays below can be sized. */
153 for (i = 0; i < cfg->num_varinfo; i++) {
154 MonoInst *var = cfg->varinfo [i];
155 if (var->klass->simd_type) {
156 // printf ("cleaning indirect flag for %d\n", var->dreg);
157 var->flags &= ~MONO_INST_INDIRECT;
158 max_vreg = MAX (var->dreg, max_vreg);
/* Step 2: re-add MONO_INST_INDIRECT to any SIMD var whose address is taken
 * (OP_LDADDR); also latch the first bb that actually has code. */
162 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
163 if (!first_bb && bb->code)
165 for (ins = bb->code; ins; ins = ins->next) {
166 if (ins->opcode == OP_LDADDR) {
167 MonoInst *var = (MonoInst*)ins->inst_p0;
168 if (var->klass->simd_type) {
169 var->flags |= MONO_INST_INDIRECT;
175 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
/* One flag byte and one target-bb slot per vreg, indexed by vreg number. */
176 vreg_flags = g_malloc0 (max_vreg + 1);
177 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Only non-indirect, non-volatile SIMD vars are candidates for the
 * XZERO-sinking transformation below. */
179 for (i = 0; i < cfg->num_varinfo; i++) {
180 MonoInst *var = cfg->varinfo [i];
181 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
182 vreg_flags [var->dreg] = VREG_USED;
183 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
187 /*Scan the first basic block looking for xzeros that are not otherwise used in it*/
188 for (ins = first_bb->code; ins; ins = ins->next) {
189 if (ins->opcode == OP_XZERO) {
190 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
191 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
192 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* Any other use of the vreg in bb0 disqualifies it from being "xzero only". */
196 for (i = 0; i < 3; ++i) {
197 int reg = get_ins_reg_by_idx (ins, i);
198 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
199 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
200 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
201 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
206 if (IS_DEBUG_ON (cfg)) {
207 for (i = 0; i < cfg->num_varinfo; i++) {
208 MonoInst *var = cfg->varinfo [i];
209 if (var->klass->simd_type) {
210 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
211 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
212 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
213 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
218 /*TODO stop here if no var is xzero only*/
221 Scan all other bb and check if it has only one other use
222 Ideally this would be done after an extended bb formation pass
224 FIXME This pass could use dominator information to properly
225 place the XZERO on the bb that dominates all uses of the var,
226 but this will have zero effect with the current local reg alloc
228 TODO simplify the use of flags.
/* Step 3: classify each xzero-only vreg as used in exactly one later bb
 * (VREG_SINGLE_BB_USE, with target_bb recording which) or in many
 * (VREG_MANY_BB_USE, which disables the transformation). */
231 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
232 for (ins = bb->code; ins; ins = ins->next) {
233 for (i = 0; i < 3; ++i) {
234 int reg = get_ins_reg_by_idx (ins, i);
235 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
238 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
239 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
240 vreg_flags [reg] |= VREG_MANY_BB_USE;
241 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
243 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
244 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
245 target_bb [reg] = bb;
246 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
/* Step 4: for each single-bb-use var, re-materialize the XZERO right before
 * its first read in the target bb (unless the bb writes the var first). */
253 for (i = 0; i < cfg->num_varinfo; i++) {
254 MonoInst *var = cfg->varinfo [i];
255 if (!var->klass->simd_type)
257 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
258 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
259 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
260 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
262 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
264 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
265 /*We can, pretty much kill it.*/
266 if (ins->dreg == var->dreg) {
268 } else if (ins->sreg1 == var->dreg || ins->sreg2 == var->dreg) {
270 MONO_INST_NEW (cfg, tmp, OP_XZERO);
271 tmp->dreg = var->dreg;
272 tmp->type = STACK_VTYPE;
273 tmp->klass = var->klass;
274 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Step 5: the original XZERO in bb0 is now redundant for single-bb-use vars;
 * remove it (removal statement not visible in this chunk). */
280 for (ins = first_bb->code; ins; ins = ins->next) {
281 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
/* Infers the xmm vreg holding the SIMD value produced by `src`.
 * Handled cases: OP_XMOVE, OP_LDADDR of a SIMD local (only accepted when it
 * is the `this` pointer of the intrinsic call), OP_LOADX_MEMBASE, and any
 * instruction whose klass is SIMD-typed. Anything else is a hard failure:
 * the instruction is dumped and g_assert_not_reached () fires. */
290 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean is_this_ptr)
292 if (src->opcode == OP_XMOVE) {
293 /*FIXME returning src->sreg1 breaks during regalloc */
295 } else if (src->opcode == OP_LDADDR && is_this_ptr) {
296 int res = ((MonoInst*)src->inst_p0)->dreg;
299 } else if (src->opcode == OP_LOADX_MEMBASE) {
301 } else if (src->klass && src->klass->simd_type) {
304 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
305 mono_print_ins (src);
306 g_assert_not_reached ();
/* Lazily creates (and caches on cfg) the int32 scratch local used by
 * OP_ICONV_TO_R8_RAW to move a raw bit pattern between the integer and FP
 * banks through memory. Marked VOLATILE so it stays stack allocated. */
310 get_int_to_float_spill_area (MonoCompile *cfg)
312 if (!cfg->iconv_raw_var) {
313 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
314 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
316 return cfg->iconv_raw_var;
/* Emits a two-operand SIMD instruction (intrinsic->opcode) whose sources are
 * the vregs inferred from args [0] and args [1], producing a freshly
 * allocated destination vreg of vtype stack type.
 * NOTE(review): ins->klass is assigned twice below — the second store is
 * redundant. */
320 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
323 int left_vreg, right_vreg;
325 left_vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
326 right_vreg = get_simd_vreg (cfg, cmethod, args [1], FALSE);
329 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
330 ins->klass = cmethod->klass;
331 ins->sreg1 = left_vreg;
332 ins->sreg2 = right_vreg;
333 ins->type = STACK_VTYPE;
334 ins->klass = cmethod->klass;
335 ins->dreg = alloc_ireg (cfg);
336 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a one-operand SIMD instruction (intrinsic->opcode, e.g. OP_SQRTPS)
 * over the vreg inferred from args [0], into a fresh destination vreg.
 * NOTE(review): the `ins->sreg1 = vreg;` assignment is on a line not visible
 * in this chunk. */
341 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
346 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
348 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
349 ins->klass = cmethod->klass;
351 ins->type = STACK_VTYPE;
352 ins->dreg = alloc_ireg (cfg);
353 MONO_ADD_INS (cfg->cbb, ins);
/* Emits the get_X/get_Y/get_Z/get_W getters. intrinsic->opcode doubles as
 * the lane index here: for non-zero lanes the wanted element is first moved
 * into the low slot with OP_SHUFLEPS; the low element is then pulled out as
 * an I4 (OP_EXTRACT_I4) and reinterpreted as a float via OP_ICONV_TO_R8_RAW,
 * spilling through the shared int->float scratch local (see the header
 * notes on why this round-trip is slow). */
358 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
363 vreg = get_simd_vreg (cfg, cmethod, args [0], TRUE);
365 if (intrinsic->opcode) {
366 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
367 ins->klass = cmethod->klass;
/* inst_c0 carries the shuffle control — here the lane index to rotate in. */
369 ins->inst_c0 = intrinsic->opcode;
370 ins->type = STACK_VTYPE;
371 ins->dreg = vreg = alloc_ireg (cfg);
372 MONO_ADD_INS (cfg->cbb, ins);
375 MONO_INST_NEW (cfg, tmp, OP_EXTRACT_I4);
376 tmp->klass = cmethod->klass;
378 tmp->type = STACK_I4;
379 tmp->dreg = alloc_ireg (cfg);
380 MONO_ADD_INS (cfg->cbb, tmp);
382 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
383 ins->klass = mono_defaults.single_class;
384 ins->sreg1 = tmp->dreg;
385 ins->type = STACK_R8;
386 ins->dreg = alloc_freg (cfg);
/* The raw int->float conversion bounces through this stack slot. */
387 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
388 MONO_ADD_INS (cfg->cbb, ins);
/* Emits the Vector4f .ctor: pushes the four float arguments in reverse
 * (args [4] down to args [1], so they land in x,y,z,w memory order) and then
 * loads the packed 16 bytes off the stack into the destination vreg with
 * OP_LOADX_STACK. args [0] is the `this` pointer. See the header notes: this
 * sequence is known to be fragile and slow (TODO: revamp). */
398 for (i = 1; i < 5; ++i) {
399 MONO_INST_NEW (cfg, ins, OP_PUSH_R4);
400 ins->sreg1 = args [5 - i]->dreg;
401 ins->klass = args [5 - i]->klass;
402 MONO_ADD_INS (cfg->cbb, ins);
405 /*TODO replace with proper LOAD macro */
406 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
407 ins->klass = cmethod->klass;
408 ins->type = STACK_VTYPE;
409 ins->dreg = get_simd_vreg (cfg, cmethod, args [0], TRUE);
410 MONO_ADD_INS (cfg->cbb, ins);
/* Emits op_Explicit: a bit-preserving reinterpret between SIMD vector types,
 * implemented as a plain register move (OP_XMOVE) retyped to the target
 * class. NOTE(review): the `ins->sreg1 = vreg;` assignment is on a line not
 * visible in this chunk. */
416 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
421 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
424 MONO_INST_NEW (cfg, ins, OP_XMOVE);
425 ins->klass = cmethod->klass;
426 ins->type = STACK_VTYPE;
428 ins->dreg = alloc_ireg (cfg);
429 MONO_ADD_INS (cfg->cbb, ins);
/* Emits Vector4f.Shuffle as OP_SHUFLEPS. The shuffle mask (args [1]) must
 * be a compile-time constant (OP_ICONST) because SHUFPS encodes it as an
 * immediate; non-literal masks abort compilation. The consumed constant is
 * killed with NULLIFY_INS since its value is folded into inst_c0. */
434 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
439 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
441 if (args [1]->opcode != OP_ICONST) {
442 g_warning ("Vector4f:Shuffle with non literals is not yet supported");
443 g_assert_not_reached ();
445 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
446 NULLIFY_INS (args [1]);
448 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
449 ins->klass = cmethod->klass;
/* The literal mask becomes the instruction's immediate operand. */
451 ins->inst_c0 = args [1]->inst_c0;
452 ins->type = STACK_VTYPE;
453 ins->dreg = alloc_ireg (cfg);
454 MONO_ADD_INS (cfg->cbb, ins);
/* Emits LoadAligned: an aligned 16-byte load (OP_LOADX_ALIGNED_MEMBASE) from
 * the pointer in args [0] into a fresh vtype vreg. */
459 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
463 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
464 ins->klass = cmethod->klass;
465 ins->sreg1 = args [0]->dreg;
466 /*FIXME, shouldn't it use ->inst_offset?*/
467 ins->type = STACK_VTYPE;
468 ins->dreg = alloc_ireg (cfg);
469 MONO_ADD_INS (cfg->cbb, ins);
/* Emits StoreAligned: an aligned 16-byte store (OP_STOREX_ALIGNED_MEMBASE_REG)
 * of the vector vreg through the destination pointer. For membase stores the
 * dreg carries the base pointer register, hence `ins->dreg = args [0]->dreg`.
 * NOTE(review): the source-register assignment (presumably `ins->sreg1 = vreg;`)
 * is on a line not visible in this chunk — the vreg computed below is
 * otherwise unused here. */
474 simd_intrinsic_emit_store_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
479 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
481 MONO_INST_NEW (cfg, ins, OP_STOREX_ALIGNED_MEMBASE_REG);
482 ins->klass = cmethod->klass;
483 ins->dreg = args [0]->dreg;
484 ins->inst_offset = args [0]->inst_offset;
486 ins->type = STACK_VTYPE;
487 MONO_ADD_INS (cfg->cbb, ins);
/* Looks up cmethod->name in the given intrinsic table (which must be sorted
 * by name — the lookup is a bsearch with simd_intrinsic_compare_by_name) and
 * dispatches to the matching simd_intrinsic_emit_* helper. SSE3-only ops are
 * emitted only when MONO_OPT_SSE3 is enabled; the non-SSE3 fallback path is
 * on lines not visible in this chunk. Presumably returns NULL when the
 * method is not an intrinsic (the return after the DEBUG trace is not
 * visible here) — TODO confirm. */
493 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
495 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
497 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
500 if (IS_DEBUG_ON (cfg)) {
502 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, result->name);
503 max = fsig->param_count + fsig->hasthis;
504 for (i = 0; i < max; ++i) {
505 printf ("param %d: ", i);
506 mono_print_ins (args [i]);
510 switch (result->simd_emit_mode) {
511 case SIMD_EMIT_BINARY_SSE3:
512 if (cfg->opt & MONO_OPT_SSE3)
513 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
515 case SIMD_EMIT_BINARY:
516 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
517 case SIMD_EMIT_UNARY:
518 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
519 case SIMD_EMIT_GETTER:
520 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
522 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
524 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
525 case SIMD_EMIT_SHUFFLE:
526 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
527 case SIMD_EMIT_LOAD_ALIGNED:
528 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
529 case SIMD_EMIT_STORE_ALIGNED:
530 return simd_intrinsic_emit_store_aligned (result, cfg, cmethod, args);
/* An unknown simd_emit_mode is a table/enum mismatch — hard failure. */
532 g_assert_not_reached ();
/* Public entry point called by the JIT's call-emission path. Bails out (the
 * early-return body is not visible in this chunk) unless cmethod belongs to
 * a SIMD-typed class, marks the compile as using SIMD intrinsics, and
 * dispatches to the per-class intrinsic table by class name. The final
 * fallback return for unrecognized classes lies past the end of this chunk. */
536 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
538 if (!cmethod->klass->simd_type)
540 cfg->uses_simd_intrinsics = 1;
541 if (!strcmp ("Vector4f", cmethod->klass->name))
542 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
543 if (!strcmp ("Vector4u", cmethod->klass->name))
544 return emit_intrinsics (cfg, cmethod, fsig, args, vector4u_intrinsics, sizeof (vector4u_intrinsics) / sizeof (SimdIntrinsc));