 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
18 General notes on SIMD intrinsics
20 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
21 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
22 TODO extend op_to_op_dest_membase to handle simd ops
23 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
25 TODO make sure locals, arguments and spills are properly aligned.
26 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
27 TODO add stuff to man pages
28 TODO document this under /docs
29 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
30 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw.
31 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
32 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
33 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
36 General notes for SIMD intrinsics.
38 -Bad extractor and constructor performance
39 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
40 It will be loaded in the FP stack just to be pushed on the call stack.
42 A similar thing happens with Vector4f constructor that require float vars to be
44 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
45 trip to the FP stack is desirable.
47 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
51 -Promote OP_EXTRACT_I4 to a STORE op
52 The advantage of this change is that it could have a _membase version and promote further optimizations.
54 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
#ifdef MONO_ARCH_SIMD_INTRINSICS

/* Uncomment to silence all intrinsics debugging output. */
//#define IS_DEBUG_ON(cfg) (0)

/* Debug spew is enabled when the JIT verbosity level is 3 or higher. */
#define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
/* Execute `a' only when debugging is on; expects a `cfg' variable in scope. */
#define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
/*
 * How a given intrinsic is lowered to IR. Every value here must be handled
 * by the switch in emit_intrinsics ().
 */
enum {
	SIMD_EMIT_BINARY,
	SIMD_EMIT_BINARY_SSE3,
	SIMD_EMIT_UNARY,
	SIMD_EMIT_GETTER,
	SIMD_EMIT_CTOR,
	SIMD_EMIT_CAST,
	SIMD_EMIT_SHUFFLE,
	SIMD_EMIT_SHIFT,
	SIMD_EMIT_LOAD_ALIGNED,
	SIMD_EMIT_STORE_ALIGNED
};
77 /*This is the size of the largest method name + 1 (to fit the ending \0). Align to 4 as well.*/
78 #define SIMD_INTRINSIC_NAME_MAX 22
81 const char name[SIMD_INTRINSIC_NAME_MAX];
83 guint8 simd_emit_mode;
91 static const SimdIntrinsc vector4f_intrinsics[] = {
92 { ".ctor", 0, SIMD_EMIT_CTOR },
93 { "AddSub", OP_ADDSUBPS, SIMD_EMIT_BINARY_SSE3 },
94 { "HorizontalAdd", OP_HADDPS, SIMD_EMIT_BINARY_SSE3 },
95 { "HorizontalSub", OP_HSUBPS, SIMD_EMIT_BINARY_SSE3 },
96 { "InvSqrt", OP_RSQRTPS, SIMD_EMIT_UNARY },
97 { "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
98 { "Max", OP_MAXPS, SIMD_EMIT_BINARY },
99 { "Min", OP_MINPS, SIMD_EMIT_BINARY },
100 { "Shuffle", 0, SIMD_EMIT_SHUFFLE },
101 { "Sqrt", OP_SQRTPS, SIMD_EMIT_UNARY },
102 { "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
103 { "get_W", 3, SIMD_EMIT_GETTER },
104 { "get_X", 0, SIMD_EMIT_GETTER },
105 { "get_Y", 1, SIMD_EMIT_GETTER },
106 { "get_Z", 2, SIMD_EMIT_GETTER },
107 { "op_Addition", OP_ADDPS, SIMD_EMIT_BINARY },
108 { "op_Division", OP_DIVPS, SIMD_EMIT_BINARY },
109 { "op_Explicit", 0, SIMD_EMIT_CAST },
110 { "op_Multiply", OP_MULPS, SIMD_EMIT_BINARY },
111 { "op_Subtraction", OP_SUBPS, SIMD_EMIT_BINARY },
116 A lot, revisit Vector4u.
118 static const SimdIntrinsc vector4u_intrinsics[] = {
119 { "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
120 { "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
121 { "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
130 static const SimdIntrinsc vector8us_intrinsics[] = {
131 { "AddWithSaturation", OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
132 { "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
133 { "ShiftRightArithmethic", OP_PSARW, SIMD_EMIT_SHIFT },
134 { "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
135 { "SubWithSaturation", OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
136 { "UnpackHigh", OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
137 { "UnpackLow", OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
138 { "op_Addition", OP_PADDW, SIMD_EMIT_BINARY },
139 { "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
140 { "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
141 { "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
142 { "op_Explicit", 0, SIMD_EMIT_CAST },
143 { "op_LeftShift", OP_PSHLW, SIMD_EMIT_SHIFT },
144 { "op_Multiply", OP_PMULW, SIMD_EMIT_BINARY },
145 { "op_RightShift", OP_PSHRW, SIMD_EMIT_SHIFT },
146 { "op_Subtraction", OP_PSUBW, SIMD_EMIT_BINARY },
155 static const SimdIntrinsc vector16b_intrinsics[] = {
156 { "AddWithSaturation", OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
157 { "LoadAligned", 0, SIMD_EMIT_LOAD_ALIGNED },
158 { "StoreAligned", 0, SIMD_EMIT_STORE_ALIGNED },
159 { "SubWithSaturation", OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
160 { "UnpackHigh", OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
161 { "UnpackLow", OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
162 { "op_Addition", OP_PADDB, SIMD_EMIT_BINARY },
163 { "op_BitwiseAnd", OP_PAND, SIMD_EMIT_BINARY },
164 { "op_BitwiseOr", OP_POR, SIMD_EMIT_BINARY },
165 { "op_BitwiseXor", OP_PXOR, SIMD_EMIT_BINARY },
166 { "op_Explicit", 0, SIMD_EMIT_CAST },
167 { "op_Subtraction", OP_PSUBB, SIMD_EMIT_BINARY },
/* Bitmask of SIMD instruction-set versions supported by this CPU;
 * filled in once by mono_simd_intrinsics_init (). */
static guint32 simd_supported_versions;
172 /*TODO match using number of parameters as well*/
174 simd_intrinsic_compare_by_name (const void *key, const void *value)
176 return strncmp(key, ((SimdIntrinsc *)value)->name, SIMD_INTRINSIC_NAME_MAX);
/* Per-vreg flags used by mono_simd_simplify_indirection () below. */
enum {
	VREG_USED = 0x01,              /* var is a candidate (simd, not indirect/volatile) */
	VREG_HAS_XZERO_BB0 = 0x02,     /* first bb zeroes it and nothing else touches it there */
	VREG_HAS_OTHER_OP_BB0 = 0x04,  /* first bb has a non-XZERO op on it */
	VREG_SINGLE_BB_USE = 0x08,     /* used in exactly one bb besides the first */
	VREG_MANY_BB_USE = 0x10        /* used in more than one bb besides the first */
};
188 get_ins_reg_by_idx (MonoInst *ins, int idx)
191 case 0: return ins->dreg;
192 case 1: return ins->sreg1;
193 case 2: return ins->sreg2;
199 mono_simd_intrinsics_init ()
201 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
204 This pass recalculate which vars need MONO_INST_INDIRECT.
206 We cannot do this for non SIMD vars since code like mono_get_vtable_var
207 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
210 mono_simd_simplify_indirection (MonoCompile *cfg)
213 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
217 for (i = 0; i < cfg->num_varinfo; i++) {
218 MonoInst *var = cfg->varinfo [i];
219 if (var->klass->simd_type) {
220 // printf ("cleaning indirect flag for %d\n", var->dreg);
221 var->flags &= ~MONO_INST_INDIRECT;
222 max_vreg = MAX (var->dreg, max_vreg);
226 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
227 if (!first_bb && bb->code)
229 for (ins = bb->code; ins; ins = ins->next) {
230 if (ins->opcode == OP_LDADDR) {
231 MonoInst *var = (MonoInst*)ins->inst_p0;
232 if (var->klass->simd_type) {
233 var->flags |= MONO_INST_INDIRECT;
239 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
240 vreg_flags = g_malloc0 (max_vreg + 1);
241 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
243 for (i = 0; i < cfg->num_varinfo; i++) {
244 MonoInst *var = cfg->varinfo [i];
245 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
246 vreg_flags [var->dreg] = VREG_USED;
247 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
251 /*Scan the first basic block looking xzeros not used*/
252 for (ins = first_bb->code; ins; ins = ins->next) {
253 if (ins->opcode == OP_XZERO) {
254 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
255 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
256 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
260 for (i = 0; i < 3; ++i) {
261 int reg = get_ins_reg_by_idx (ins, i);
262 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
263 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
264 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
265 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
270 if (IS_DEBUG_ON (cfg)) {
271 for (i = 0; i < cfg->num_varinfo; i++) {
272 MonoInst *var = cfg->varinfo [i];
273 if (var->klass->simd_type) {
274 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
275 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
276 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
277 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
282 /*TODO stop here if no var is xzero only*/
285 Scan all other bb and check if it has only one other use
286 Ideally this would be done after an extended bb formation pass
288 FIXME This pass could use dominator information to properly
289 place the XZERO on the bb that dominates all uses of the var,
290 but this will have zero effect with the current local reg alloc
292 TODO simply the use of flags.
295 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
296 for (ins = bb->code; ins; ins = ins->next) {
297 for (i = 0; i < 3; ++i) {
298 int reg = get_ins_reg_by_idx (ins, i);
299 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
302 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
303 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
304 vreg_flags [reg] |= VREG_MANY_BB_USE;
305 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
307 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
308 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
309 target_bb [reg] = bb;
310 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
317 for (i = 0; i < cfg->num_varinfo; i++) {
318 MonoInst *var = cfg->varinfo [i];
319 if (!var->klass->simd_type)
321 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
322 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
323 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
324 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
326 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
328 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
329 /*We can, pretty much kill it.*/
330 if (ins->dreg == var->dreg) {
332 } else if (ins->sreg1 == var->dreg || ins->sreg2 == var->dreg) {
334 MONO_INST_NEW (cfg, tmp, OP_XZERO);
335 tmp->dreg = var->dreg;
336 tmp->type = STACK_VTYPE;
337 tmp->klass = var->klass;
338 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
344 for (ins = first_bb->code; ins; ins = ins->next) {
345 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
354 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean is_this_ptr)
356 if (src->opcode == OP_XMOVE) {
357 /*FIXME returning src->sreg1 breaks during regalloc */
359 } else if (src->opcode == OP_LDADDR && is_this_ptr) {
360 int res = ((MonoInst*)src->inst_p0)->dreg;
363 } else if (src->opcode == OP_LOADX_MEMBASE) {
365 } else if (src->klass && src->klass->simd_type) {
368 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
369 mono_print_ins (src);
370 g_assert_not_reached ();
374 get_int_to_float_spill_area (MonoCompile *cfg)
376 if (!cfg->iconv_raw_var) {
377 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
378 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
380 return cfg->iconv_raw_var;
384 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
387 int left_vreg, right_vreg;
389 left_vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
390 right_vreg = get_simd_vreg (cfg, cmethod, args [1], FALSE);
393 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
394 ins->klass = cmethod->klass;
395 ins->sreg1 = left_vreg;
396 ins->sreg2 = right_vreg;
397 ins->type = STACK_VTYPE;
398 ins->klass = cmethod->klass;
399 ins->dreg = alloc_ireg (cfg);
400 MONO_ADD_INS (cfg->cbb, ins);
405 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
410 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
412 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
413 ins->klass = cmethod->klass;
415 ins->type = STACK_VTYPE;
416 ins->dreg = alloc_ireg (cfg);
417 MONO_ADD_INS (cfg->cbb, ins);
422 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
427 vreg = get_simd_vreg (cfg, cmethod, args [0], TRUE);
429 if (intrinsic->opcode) {
430 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
431 ins->klass = cmethod->klass;
433 ins->inst_c0 = intrinsic->opcode;
434 ins->type = STACK_VTYPE;
435 ins->dreg = vreg = alloc_ireg (cfg);
436 MONO_ADD_INS (cfg->cbb, ins);
439 MONO_INST_NEW (cfg, tmp, OP_EXTRACT_I4);
440 tmp->klass = cmethod->klass;
442 tmp->type = STACK_I4;
443 tmp->dreg = alloc_ireg (cfg);
444 MONO_ADD_INS (cfg->cbb, tmp);
446 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
447 ins->klass = mono_defaults.single_class;
448 ins->sreg1 = tmp->dreg;
449 ins->type = STACK_R8;
450 ins->dreg = alloc_freg (cfg);
451 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
452 MONO_ADD_INS (cfg->cbb, ins);
457 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
462 for (i = 1; i < 5; ++i) {
463 MONO_INST_NEW (cfg, ins, OP_PUSH_R4);
464 ins->sreg1 = args [5 - i]->dreg;
465 ins->klass = args [5 - i]->klass;
466 MONO_ADD_INS (cfg->cbb, ins);
469 /*TODO replace with proper LOAD macro */
470 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
471 ins->klass = cmethod->klass;
472 ins->type = STACK_VTYPE;
473 ins->dreg = get_simd_vreg (cfg, cmethod, args [0], TRUE);
474 MONO_ADD_INS (cfg->cbb, ins);
480 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
485 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
488 MONO_INST_NEW (cfg, ins, OP_XMOVE);
489 ins->klass = cmethod->klass;
490 ins->type = STACK_VTYPE;
492 ins->dreg = alloc_ireg (cfg);
493 MONO_ADD_INS (cfg->cbb, ins);
499 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
502 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
504 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
506 if (args [1]->opcode != OP_ICONST) {
507 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
508 ins->klass = mono_defaults.int32_class;
509 ins->sreg1 = args [1]->dreg;
510 ins->type = STACK_I4;
511 ins->dreg = vreg2 = alloc_ireg (cfg);
512 MONO_ADD_INS (cfg->cbb, ins);
514 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
517 MONO_INST_NEW (cfg, ins, opcode);
518 ins->klass = cmethod->klass;
522 if (args [1]->opcode == OP_ICONST) {
523 ins->inst_imm = args [1]->inst_c0;
524 NULLIFY_INS (args [1]);
527 ins->type = STACK_VTYPE;
528 ins->dreg = alloc_ireg (cfg);
529 MONO_ADD_INS (cfg->cbb, ins);
535 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
540 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
542 if (args [1]->opcode != OP_ICONST) {
543 g_warning ("Vector4f:Shuffle with non literals is not yet supported");
544 g_assert_not_reached ();
546 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
547 NULLIFY_INS (args [1]);
549 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
550 ins->klass = cmethod->klass;
552 ins->inst_c0 = args [1]->inst_c0;
553 ins->type = STACK_VTYPE;
554 ins->dreg = alloc_ireg (cfg);
555 MONO_ADD_INS (cfg->cbb, ins);
560 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
564 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
565 ins->klass = cmethod->klass;
566 ins->sreg1 = args [0]->dreg;
567 /*FIXME, shouldn't use use ->inst_offset?*/
568 ins->type = STACK_VTYPE;
569 ins->dreg = alloc_ireg (cfg);
570 MONO_ADD_INS (cfg->cbb, ins);
575 simd_intrinsic_emit_store_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
580 vreg = get_simd_vreg (cfg, cmethod, args [0], FALSE);
582 MONO_INST_NEW (cfg, ins, OP_STOREX_ALIGNED_MEMBASE_REG);
583 ins->klass = cmethod->klass;
584 ins->dreg = args [0]->dreg;
585 ins->inst_offset = args [0]->inst_offset;
587 ins->type = STACK_VTYPE;
588 MONO_ADD_INS (cfg->cbb, ins);
594 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
596 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
598 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
601 if (IS_DEBUG_ON (cfg)) {
603 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, result->name);
604 max = fsig->param_count + fsig->hasthis;
605 for (i = 0; i < max; ++i) {
606 printf ("param %d: ", i);
607 mono_print_ins (args [i]);
611 switch (result->simd_emit_mode) {
612 case SIMD_EMIT_BINARY_SSE3:
613 if (simd_supported_versions & SIMD_VERSION_SSE3)
614 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
616 case SIMD_EMIT_BINARY:
617 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
618 case SIMD_EMIT_UNARY:
619 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
620 case SIMD_EMIT_GETTER:
621 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
623 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
625 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
626 case SIMD_EMIT_SHUFFLE:
627 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
628 case SIMD_EMIT_SHIFT:
629 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
630 case SIMD_EMIT_LOAD_ALIGNED:
631 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
632 case SIMD_EMIT_STORE_ALIGNED:
633 return simd_intrinsic_emit_store_aligned (result, cfg, cmethod, args);
635 g_assert_not_reached ();
639 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
641 if (!cmethod->klass->simd_type)
643 cfg->uses_simd_intrinsics = 1;
644 if (!strcmp ("Vector4f", cmethod->klass->name))
645 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
646 if (!strcmp ("Vector4u", cmethod->klass->name))
647 return emit_intrinsics (cfg, cmethod, fsig, args, vector4u_intrinsics, sizeof (vector4u_intrinsics) / sizeof (SimdIntrinsc));
648 if (!strcmp ("Vector8us", cmethod->klass->name))
649 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
650 if (!strcmp ("Vector16b", cmethod->klass->name))
651 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));