2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
18 General notes on SIMD intrinsics
20 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
21 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
22 TODO extend op_to_op_dest_membase to handle simd ops
23 TODO add support for indexed versions of simd ops
24 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
25 TODO make sure locals, arguments and spills are properly aligned.
26 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
27 TODO add stuff to man pages
28 TODO document this under /docs
29 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
30 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw.
31 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
32 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
33 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
34 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
36 General notes for SIMD intrinsics.
38 -Bad extractor and constructor performance
39 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
40 It will be loaded in the FP stack just to be pushed on the call stack.
42 A similar thing happens with Vector4f constructor that require float vars to be
44 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
45 trip to the FP stack is desirable.
47 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
51 -Promote OP_EXTRACT_I4 to a STORE op
52 The advantage of this change is that it could have a _membase version and promote further optimizations.
54 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
58 #ifdef MONO_ARCH_SIMD_INTRINSICS
/* Debug tracing for the simd passes; enabled when the JIT verbose level is >= 3.
 * Swap in the commented-out definition below to silence it unconditionally. */
60 //#define IS_DEBUG_ON(cfg) (0)
62 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
/* Execute `a' only when simd debugging is on; do/while(0) makes it statement-safe. */
63 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
/* Emit modes: each value selects which simd_intrinsic_emit_* helper handles a
 * table entry (dispatched on in emit_intrinsics). SSE3 binary ops fall back to
 * no intrinsic when MONO_OPT_SSE3 is not enabled. */
66 SIMD_EMIT_BINARY_SSE3,
73 SIMD_EMIT_LOAD_ALIGNED,
74 SIMD_EMIT_STORE_ALIGNED
77 /*This is the size of the largest method name + 1 (to fit the ending \0). Align to 4 as well.*/
78 #define SIMD_INTRINSIC_NAME_MAX 22
/* One intrinsic-table entry: managed method name, opcode (or mode-specific
 * payload such as a lane index for getters), and the SIMD_EMIT_* mode. */
81 const char name[SIMD_INTRINSIC_NAME_MAX];
/* One of the SIMD_EMIT_* values above. */
83 guint8 simd_emit_mode;
/*
 * Intrinsics for Mono.Simd Vector4f. Entries MUST stay sorted by name: the
 * table is searched with bsearch via simd_intrinsic_compare_by_name.
 * For SIMD_EMIT_GETTER entries the middle field is the lane index (X=0..W=3),
 * not an opcode; for SIMD_EMIT_CTOR/CAST/SHUFFLE/LOAD/STORE it is unused.
 */
91 static const SimdIntrinsc vector4f_intrinsics[] = {
92 { ".ctor", 0, SIMD_EMIT_CTOR },
93 { "AddSub", OP_ADDSUBPS, SIMD_EMIT_BINARY_SSE3 },
94 { "HorizontalAdd", OP_HADDPS, SIMD_EMIT_BINARY_SSE3 },
95 { "HorizontalSub", OP_HSUBPS, SIMD_EMIT_BINARY_SSE3 },
96 { "InvSqrt", OP_RSQRTPS, SIMD_EMIT_UNARY },
97 { "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
98 { "Max", OP_MAXPS, SIMD_EMIT_BINARY },
99 { "Min", OP_MINPS, SIMD_EMIT_BINARY },
100 { "Shuffle", 0, SIMD_EMIT_SHUFFLE },
101 { "Sqrt", OP_SQRTPS, SIMD_EMIT_UNARY },
102 { "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
103 { "get_W", 3, SIMD_EMIT_GETTER },
104 { "get_X", 0, SIMD_EMIT_GETTER },
105 { "get_Y", 1, SIMD_EMIT_GETTER },
106 { "get_Z", 2, SIMD_EMIT_GETTER },
107 { "op_Addition", OP_ADDPS, SIMD_EMIT_BINARY },
108 { "op_Division", OP_DIVPS, SIMD_EMIT_BINARY },
109 { "op_Explicit", 0, SIMD_EMIT_CAST },
110 { "op_Multiply", OP_MULPS, SIMD_EMIT_BINARY },
111 { "op_Subtraction", OP_SUBPS, SIMD_EMIT_BINARY },
116 A lot, revisit Vector4u.
/* Intrinsics for Vector4u (packed 32-bit uints). Must stay sorted by name
 * for bsearch. Only the bitwise operators are mapped so far. */
118 static const SimdIntrinsc vector4u_intrinsics[] = {
119 { "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
120 { "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
121 { "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
/* Intrinsics for Vector8u (packed 16-bit uints). Must stay sorted by name
 * for bsearch.
 * NOTE(review): "ShiftRightArithmethic" looks misspelled, but it must match
 * the managed method name byte-for-byte -- do not rename it here without
 * also renaming the Mono.Simd method it binds to. */
130 static const SimdIntrinsc vector8u_intrinsics[] = {
131 { "AddWithSaturation", OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
132 { "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
133 { "ShiftRightArithmethic", OP_PSARW, SIMD_EMIT_SHIFT },
134 { "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
135 { "SubWithSaturation", OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
136 { "UnpackHigh", OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
137 { "UnpackLow", OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
138 { "op_Addition", OP_PADDW, SIMD_EMIT_BINARY },
139 { "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
140 { "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
141 { "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
142 { "op_Explicit", 0, SIMD_EMIT_CAST },
143 { "op_LeftShift", OP_PSHLW, SIMD_EMIT_SHIFT },
144 { "op_Multiply", OP_PMULW, SIMD_EMIT_BINARY },
145 { "op_RightShift", OP_PSHRW, SIMD_EMIT_SHIFT },
146 { "op_Subtraction", OP_PSUBW, SIMD_EMIT_BINARY },
/* Intrinsics for Vector16u (packed 8-bit uints). Must stay sorted by name
 * for bsearch. No shifts: SSE has no packed byte shift instructions. */
155 static const SimdIntrinsc vector16u_intrinsics[] = {
156 { "AddWithSaturation", OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
157 { "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
158 { "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
159 { "SubWithSaturation", OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
160 { "UnpackHigh", OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
161 { "UnpackLow", OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
162 { "op_Addition", OP_PADDB, SIMD_EMIT_BINARY },
163 { "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
164 { "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
165 { "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
166 { "op_Explicit", 0, SIMD_EMIT_CAST },
167 { "op_Subtraction", OP_PSUBB, SIMD_EMIT_BINARY },
171 /*TODO match using number of parameters as well*/
/* bsearch comparator: `key' is the managed method name (a C string), `value'
 * an intrinsic-table entry. strncmp is bounded by the fixed-size name field,
 * so an over-long key cannot read past the entry. */
173 simd_intrinsic_compare_by_name (const void *key, const void *value)
175 return strncmp(key, ((SimdIntrinsc *)value)->name, SIMD_INTRINSIC_NAME_MAX);
/* Per-vreg flag bits used by mono_simd_simplify_indirection to track, for each
 * simd variable, whether bb0 only zeroes it (XZERO) and how many later basic
 * blocks use it. */
180 VREG_HAS_XZERO_BB0 = 0x02,
181 VREG_HAS_OTHER_OP_BB0 = 0x04,
182 VREG_SINGLE_BB_USE = 0x08,
183 VREG_MANY_BB_USE = 0x10,
/* Returns the idx-th register of `ins': 0 = dreg, 1 = sreg1, 2 = sreg2.
 * Lets callers walk every register an instruction touches with a small loop. */
187 get_ins_reg_by_idx (MonoInst *ins, int idx)
190 case 0: return ins->dreg;
191 case 1: return ins->sreg1;
192 case 2: return ins->sreg2;
197 This pass recalculates which vars need MONO_INST_INDIRECT.
199 We cannot do this for non SIMD vars since code like mono_get_vtable_var
200 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
203 mono_simd_simplify_indirection (MonoCompile *cfg)
206 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Step 1: optimistically clear MONO_INST_INDIRECT on every simd var and
 * remember the largest vreg so the flag/target arrays can be sized. */
210 for (i = 0; i < cfg->num_varinfo; i++) {
211 MonoInst *var = cfg->varinfo [i];
212 if (var->klass->simd_type) {
213 // printf ("cleaning indirect flag for %d\n", var->dreg);
214 var->flags &= ~MONO_INST_INDIRECT;
215 max_vreg = MAX (var->dreg, max_vreg);
/* Step 2: re-add MONO_INST_INDIRECT only where an OP_LDADDR actually takes
 * the address of a simd var; also latch the first non-empty bb. */
219 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
220 if (!first_bb && bb->code)
222 for (ins = bb->code; ins; ins = ins->next) {
223 if (ins->opcode == OP_LDADDR) {
224 MonoInst *var = (MonoInst*)ins->inst_p0;
225 if (var->klass->simd_type) {
226 var->flags |= MONO_INST_INDIRECT;
232 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
/* vreg_flags is indexed by vreg; target_bb remembers the single bb a vreg is
 * used in (valid only while VREG_SINGLE_BB_USE is set). */
233 vreg_flags = g_malloc0 (max_vreg + 1);
234 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Only non-indirect, non-volatile simd vars are candidates for the xzero sink. */
236 for (i = 0; i < cfg->num_varinfo; i++) {
237 MonoInst *var = cfg->varinfo [i];
238 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
239 vreg_flags [var->dreg] = VREG_USED;
240 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
244 /*Scan the first basic block looking for xzeros whose dreg is not otherwise used in bb0*/
245 for (ins = first_bb->code; ins; ins = ins->next) {
246 if (ins->opcode == OP_XZERO) {
247 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
248 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
249 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* Any other instruction touching the vreg in bb0 disqualifies it. */
253 for (i = 0; i < 3; ++i) {
254 int reg = get_ins_reg_by_idx (ins, i);
255 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
256 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
257 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
258 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
263 if (IS_DEBUG_ON (cfg)) {
264 for (i = 0; i < cfg->num_varinfo; i++) {
265 MonoInst *var = cfg->varinfo [i];
266 if (var->klass->simd_type) {
267 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
268 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
269 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
270 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
275 /*TODO stop here if no var is xzero only*/
278 Scan all other bb and check if it has only one other use
279 Ideally this would be done after an extended bb formation pass
281 FIXME This pass could use dominator information to properly
282 place the XZERO on the bb that dominates all uses of the var,
283 but this will have zero effect with the current local reg alloc
285 TODO simplify the use of flags.
/* Classify each candidate vreg as used in exactly one later bb (sinkable) or
 * in many (not sinkable). target_bb holds the single use site. */
288 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
289 for (ins = bb->code; ins; ins = ins->next) {
290 for (i = 0; i < 3; ++i) {
291 int reg = get_ins_reg_by_idx (ins, i);
292 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
295 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
296 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
297 vreg_flags [reg] |= VREG_MANY_BB_USE;
298 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
300 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
301 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
302 target_bb [reg] = bb;
303 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
/* For single-bb vregs, re-emit the XZERO right before the first use in that
 * bb, so the bb0 copy becomes dead and can be removed below. */
310 for (i = 0; i < cfg->num_varinfo; i++) {
311 MonoInst *var = cfg->varinfo [i];
312 if (!var->klass->simd_type)
314 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
315 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
316 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
317 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
319 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
321 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
322 /*We can, pretty much kill it.*/
323 if (ins->dreg == var->dreg) {
325 } else if (ins->sreg1 == var->dreg || ins->sreg2 == var->dreg) {
327 MONO_INST_NEW (cfg, tmp, OP_XZERO);
328 tmp->dreg = var->dreg;
329 tmp->type = STACK_VTYPE;
330 tmp->klass = var->klass;
331 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Finally delete the now-redundant XZEROs from bb0. */
337 for (ins = first_bb->code; ins; ins = ins->next) {
338 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
/* Resolve the xmm vreg holding the simd value produced by `src'.
 * Handles: an XMOVE copy, an LDADDR of `this' (only when is_this_ptr, i.e. the
 * instance argument of a .ctor/getter), a LOADX_MEMBASE, or a plain simd-typed
 * value. Anything else is a bug: warn, dump the instruction and assert. */
347 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean is_this_ptr)
349 if (src->opcode == OP_XMOVE) {
350 /*FIXME returning src->sreg1 breaks during regalloc */
352 } else if (src->opcode == OP_LDADDR && is_this_ptr) {
353 int res = ((MonoInst*)src->inst_p0)->dreg;
356 } else if (src->opcode == OP_LOADX_MEMBASE) {
358 } else if (src->klass && src->klass->simd_type) {
361 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
362 mono_print_ins (src);
363 g_assert_not_reached ();
/* Lazily create (once per method) the int32 local used as a memory bounce
 * buffer by OP_ICONV_TO_R8_RAW: the int is spilled there and reloaded into the
 * FP bank. Marked volatile so the regalloc keeps it in memory. */
367 get_int_to_float_spill_area (MonoCompile *cfg)
369 if (!cfg->iconv_raw_var) {
370 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
371 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
373 return cfg->iconv_raw_var;
377 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
380 int left_vreg, right_vreg;
382 left_vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
383 right_vreg = get_simd_vreg (cfg, cmethod, args [1], FALSE);
386 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
387 ins->klass = cmethod->klass;
388 ins->sreg1 = left_vreg;
389 ins->sreg2 = right_vreg;
390 ins->type = STACK_VTYPE;
391 ins->klass = cmethod->klass;
392 ins->dreg = alloc_ireg (cfg);
393 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a one-operand packed SIMD instruction (SQRTPS, RSQRTPS): the single
 * input is resolved to an xmm vreg, the result goes into a fresh vreg. */
398 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
403 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
405 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
406 ins->klass = cmethod->klass;
408 ins->type = STACK_VTYPE;
409 ins->dreg = alloc_ireg (cfg);
410 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a Vector4f lane getter (get_X/get_Y/get_Z/get_W).
 * For lanes other than 0 (intrinsic->opcode is the lane index here, not a real
 * opcode) the lane is first rotated into position with SHUFLEPS; the low
 * element is then moved to an int reg with EXTRACT_I4 and reinterpreted as a
 * float via ICONV_TO_R8_RAW through the int->float spill slot.
 * See the file header: this int->FP round trip is known to be slow. */
415 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
420 vreg = get_simd_vreg (cfg, cmethod, args [0], TRUE);
422 if (intrinsic->opcode) {
423 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
424 ins->klass = cmethod->klass;
426 ins->inst_c0 = intrinsic->opcode;
427 ins->type = STACK_VTYPE;
428 ins->dreg = vreg = alloc_ireg (cfg);
429 MONO_ADD_INS (cfg->cbb, ins);
432 MONO_INST_NEW (cfg, tmp, OP_EXTRACT_I4);
433 tmp->klass = cmethod->klass;
435 tmp->type = STACK_I4;
436 tmp->dreg = alloc_ireg (cfg);
437 MONO_ADD_INS (cfg->cbb, tmp);
/* Bit-reinterpret the extracted int as a float (no numeric conversion);
 * needs a memory spill slot to cross from the int bank to the FP bank. */
439 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
440 ins->klass = mono_defaults.single_class;
441 ins->sreg1 = tmp->dreg;
442 ins->type = STACK_R8;
443 ins->dreg = alloc_freg (cfg);
444 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
445 MONO_ADD_INS (cfg->cbb, ins);
/* Emit the Vector4f .ctor(float, float, float, float): push the four float
 * arguments on the stack in reverse order (args [4] down to args [1];
 * args [0] is `this') and load them back as one 16-byte xmm value.
 * See the file-header notes: this sequence is fragile and slow (FP stack trip). */
450 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
455 for (i = 1; i < 5; ++i) {
456 MONO_INST_NEW (cfg, ins, OP_PUSH_R4);
457 ins->sreg1 = args [5 - i]->dreg;
458 ins->klass = args [5 - i]->klass;
459 MONO_ADD_INS (cfg->cbb, ins);
462 /*TODO replace with proper LOAD macro */
463 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
464 ins->klass = cmethod->klass;
465 ins->type = STACK_VTYPE;
/* The destination is `this' (args [0]); TRUE lets get_simd_vreg accept the
 * LDADDR form of the instance pointer. */
466 ins->dreg = get_simd_vreg (cfg, cmethod, args [0], TRUE);
467 MONO_ADD_INS (cfg->cbb, ins);
/* Emit op_Explicit between simd vector types: a bit-level reinterpretation,
 * implemented as a plain XMOVE into a fresh vreg retyped to the target class. */
473 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
478 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
481 MONO_INST_NEW (cfg, ins, OP_XMOVE);
482 ins->klass = cmethod->klass;
483 ins->type = STACK_VTYPE;
485 ins->dreg = alloc_ireg (cfg);
486 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a packed shift (PSHLW/PSHRW/PSARW). A constant shift amount is folded
 * into the immediate form; a variable amount is first broadcast into an xmm
 * reg with ICONV_TO_X and the register form of the opcode is used. */
492 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
495 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
497 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
499 if (args [1]->opcode != OP_ICONST) {
500 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
501 ins->klass = mono_defaults.int32_class;
502 ins->sreg1 = args [1]->dreg;
503 ins->type = STACK_I4;
504 ins->dreg = vreg2 = alloc_ireg (cfg);
505 MONO_ADD_INS (cfg->cbb, ins);
507 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
510 MONO_INST_NEW (cfg, ins, opcode);
511 ins->klass = cmethod->klass;
515 if (args [1]->opcode == OP_ICONST) {
516 ins->inst_imm = args [1]->inst_c0;
/* The constant is consumed here; kill the now-dead ICONST. */
517 NULLIFY_INS (args [1]);
520 ins->type = STACK_VTYPE;
521 ins->dreg = alloc_ireg (cfg);
522 MONO_ADD_INS (cfg->cbb, ins);
/* Emit Vector4f.Shuffle: only a literal (OP_ICONST) shuffle mask is supported,
 * since SHUFPS encodes the mask as an immediate. Non-literal masks assert. */
528 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
533 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
535 if (args [1]->opcode != OP_ICONST) {
536 g_warning ("Vector4f:Shuffle with non literals is not yet supported");
537 g_assert_not_reached ();
539 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
/* NOTE(review): args [1] is nullified here but its inst_c0 is still read
 * below -- this relies on NULLIFY_INS not clearing inst_c0; confirm. */
540 NULLIFY_INS (args [1]);
542 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
543 ins->klass = cmethod->klass;
545 ins->inst_c0 = args [1]->inst_c0;
546 ins->type = STACK_VTYPE;
547 ins->dreg = alloc_ireg (cfg);
548 MONO_ADD_INS (cfg->cbb, ins);
/* Emit LoadAligned(addr): a 16-byte aligned load (MOVAPS-style) from the
 * address in args [0] into a fresh xmm vreg. */
553 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
557 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
558 ins->klass = cmethod->klass;
559 ins->sreg1 = args [0]->dreg;
560 /*FIXME, shouldn't it use ->inst_offset like the store path does?*/
561 ins->type = STACK_VTYPE;
562 ins->dreg = alloc_ireg (cfg);
563 MONO_ADD_INS (cfg->cbb, ins);
568 simd_intrinsic_emit_store_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
573 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
575 MONO_INST_NEW (cfg, ins, OP_STOREX_ALIGNED_MEMBASE_REG);
576 ins->klass = cmethod->klass;
577 ins->dreg = args [0]->dreg;
578 ins->inst_offset = args [0]->inst_offset;
580 ins->type = STACK_VTYPE;
581 MONO_ADD_INS (cfg->cbb, ins);
/* Look up cmethod->name in the (sorted) intrinsic table with bsearch and
 * dispatch to the matching simd_intrinsic_emit_* helper. SSE3 binary ops are
 * only emitted when MONO_OPT_SSE3 is enabled; otherwise no intrinsic is used
 * and the call is compiled normally. */
587 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
589 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
591 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
594 if (IS_DEBUG_ON (cfg)) {
596 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, result->name);
597 max = fsig->param_count + fsig->hasthis;
598 for (i = 0; i < max; ++i) {
599 printf ("param %d: ", i);
600 mono_print_ins (args [i]);
604 switch (result->simd_emit_mode) {
605 case SIMD_EMIT_BINARY_SSE3:
606 if (cfg->opt & MONO_OPT_SSE3)
607 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
609 case SIMD_EMIT_BINARY:
610 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
611 case SIMD_EMIT_UNARY:
612 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
613 case SIMD_EMIT_GETTER:
614 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
616 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
618 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
619 case SIMD_EMIT_SHUFFLE:
620 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
621 case SIMD_EMIT_SHIFT:
622 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
623 case SIMD_EMIT_LOAD_ALIGNED:
624 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
625 case SIMD_EMIT_STORE_ALIGNED:
626 return simd_intrinsic_emit_store_aligned (result, cfg, cmethod, args);
628 g_assert_not_reached ();
/* Public entry point from the method-to-ir pass: if cmethod belongs to one of
 * the known Mono.Simd vector types, try to replace the call with an intrinsic
 * instruction sequence via the type's table. Also flags the method as using
 * simd intrinsics so later passes (e.g. simplify-indirection) run. */
632 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
634 if (!cmethod->klass->simd_type)
636 cfg->uses_simd_intrinsics = 1;
637 if (!strcmp ("Vector4f", cmethod->klass->name))
638 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
639 if (!strcmp ("Vector4u", cmethod->klass->name))
640 return emit_intrinsics (cfg, cmethod, fsig, args, vector4u_intrinsics, sizeof (vector4u_intrinsics) / sizeof (SimdIntrinsc));
641 if (!strcmp ("Vector8u", cmethod->klass->name))
642 return emit_intrinsics (cfg, cmethod, fsig, args, vector8u_intrinsics, sizeof (vector8u_intrinsics) / sizeof (SimdIntrinsc));
643 if (!strcmp ("Vector16u", cmethod->klass->name))
644 return emit_intrinsics (cfg, cmethod, fsig, args, vector16u_intrinsics, sizeof (vector16u_intrinsics) / sizeof (SimdIntrinsc));