2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
18 General notes on SIMD intrinsics
20 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
21 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
22 TODO extend op_to_op_dest_membase to handle simd ops
23 TODO add support for indexed versions of simd ops
24 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
25 TODO make sure locals, arguments and spills are properly aligned.
26 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
27 TODO add stuff to man pages
28 TODO document this under /docs
29 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
30 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
31 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
32 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
33 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
34 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
35 TODO check if we need to init the SSE control word with better precision.
36 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
38 General notes for SIMD intrinsics.
40 -Bad extractor and constructor performance
41 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
42 It will be loaded in the FP stack just to be pushed on the call stack.
44 A similar thing happens with Vector4f constructor that require float vars to be
46 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
47 trip to the FP stack is desirable.
49 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
53 -Promote OP_EXTRACT_I4 to a STORE op
54 The advantage of this change is that it could have a _membase version and promote further optimizations.
56 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
60 #ifdef MONO_ARCH_SIMD_INTRINSICS
62 //#define IS_DEBUG_ON(cfg) (0)
64 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
65 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
74 SIMD_EMIT_LOAD_ALIGNED,
75 SIMD_EMIT_STORE_ALIGNED,
76 SIMD_EMIT_EXTRACT_MASK
/*
 * Method-name interning machinery.
 *
 * simd-methods.h is included several times with different definitions of
 * SIMD_METHOD(str,name) (X-macro pattern) to build, from a single list:
 *   - one big struct of char arrays holding every method-name string, and
 *   - an enum of SN_* constants used as keys in the intrinsic tables below.
 *
 * With HAVE_ARRAY_ELEM_INIT the SN_* constants are byte offsets into the
 * struct (method_name() adds the offset to its base address); without it
 * they are plain indices into a table of string pointers.
 */
79 #ifdef HAVE_ARRAY_ELEM_INIT
80 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
81 #define MSGSTRFIELD1(line) str##line
/* First pass: declare one fixed-size char field per method name, so every
 * string is stored inline in a single static blob. */
82 static const struct msgstr_t {
83 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
84 #include "simd-methods.h"
/* Second pass: emit the string literals as the struct initializer. */
87 #define SIMD_METHOD(str,name) str,
88 #include "simd-methods.h"
/* Third pass: define each SN_* constant as the offset of its field. */
93 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
94 #include "simd-methods.h"
/* Offset-based lookup: base address of the blob plus the SN_* offset. */
96 #define method_name(idx) ((const char*)&method_names + (idx))
/* Fallback (no array-element init): a plain pointer table indexed by SN_*. */
99 #define SIMD_METHOD(str,name) str,
100 static const char * const method_names [] = {
101 #include "simd-methods.h"
105 #define SIMD_METHOD(str,name) name,
107 #include "simd-methods.h"
111 #define method_name(idx) (method_names [(idx)])
118 guint8 simd_emit_mode : 4;
119 guint8 simd_version : 4;
/*
 * Intrinsic table for Mono.Simd.Vector4f.
 * NOTE: entries must stay sorted by method-name string — the table is
 * searched with bsearch()+strcmp (see simd_intrinsic_compare_by_name).
 * Row format: { name, opcode or immediate, emit mode[, required SIMD version[, comparison flags]] }.
 */
127 static const SimdIntrinsc vector4f_intrinsics[] = {
128 { SN_ctor, 0, SIMD_EMIT_CTOR },
129 { SN_AddSub, OP_ADDSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
130 { SN_AndNot, OP_ANDNPS, SIMD_EMIT_BINARY },
131 { SN_CompareEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
132 { SN_CompareLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
133 { SN_CompareLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
134 { SN_CompareNotEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
135 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
136 { SN_CompareNotLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
137 { SN_CompareOrdered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
138 { SN_CompareUnordered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
139 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
140 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
141 { SN_HorizontalAdd, OP_HADDPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
142 { SN_HorizontalSub, OP_HSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
143 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_EMIT_BINARY },
144 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_EMIT_BINARY },
145 { SN_InvSqrt, OP_RSQRTPS, SIMD_EMIT_UNARY },
146 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
147 { SN_Max, OP_MAXPS, SIMD_EMIT_BINARY },
148 { SN_Min, OP_MINPS, SIMD_EMIT_BINARY },
149 { SN_Reciprocal, OP_RCPPS, SIMD_EMIT_UNARY },
150 { SN_Shuffle, OP_SHUFLEPS, SIMD_EMIT_SHUFFLE },
151 { SN_Sqrt, OP_SQRTPS, SIMD_EMIT_UNARY },
152 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
/* For getters the "opcode" slot holds the element index (0..3);
 * index 0 skips the shuffle in simd_intrinsic_emit_getter. */
153 { SN_get_W, 3, SIMD_EMIT_GETTER },
154 { SN_get_X, 0, SIMD_EMIT_GETTER },
155 { SN_get_Y, 1, SIMD_EMIT_GETTER },
156 { SN_get_Z, 2, SIMD_EMIT_GETTER },
157 { SN_op_Addition, OP_ADDPS, SIMD_EMIT_BINARY },
158 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_EMIT_BINARY },
159 { SN_op_BitwiseOr, OP_ORPS, SIMD_EMIT_BINARY },
160 { SN_op_Division, OP_DIVPS, SIMD_EMIT_BINARY },
161 { SN_op_ExclusiveOr, OP_XORPS, SIMD_EMIT_BINARY },
162 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
163 { SN_op_Multiply, OP_MULPS, SIMD_EMIT_BINARY },
164 { SN_op_Subtraction, OP_SUBPS, SIMD_EMIT_BINARY },
/*
 * Intrinsic table for Mono.Simd.Vector2d (two doubles).
 * NOTE: entries must stay sorted by method-name string (bsearch lookup).
 */
173 static const SimdIntrinsc vector2d_intrinsics[] = {
174 { SN_AddSub, OP_ADDSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
175 { SN_AndNot, OP_ANDNPD, SIMD_EMIT_BINARY },
176 { SN_CompareEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
177 { SN_CompareLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
178 { SN_CompareLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
179 { SN_CompareNotEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
180 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
181 { SN_CompareNotLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
182 { SN_CompareOrdered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
183 { SN_CompareUnordered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
184 { SN_Duplicate, OP_DUPPD, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
185 { SN_HorizontalAdd, OP_HADDPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
186 { SN_HorizontalSub, OP_HSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
187 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_EMIT_BINARY },
188 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_EMIT_BINARY },
189 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
190 { SN_Max, OP_MAXPD, SIMD_EMIT_BINARY },
191 { SN_Min, OP_MINPD, SIMD_EMIT_BINARY },
192 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
193 { SN_op_Addition, OP_ADDPD, SIMD_EMIT_BINARY },
194 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_EMIT_BINARY },
195 { SN_op_BitwiseOr, OP_ORPD, SIMD_EMIT_BINARY },
196 { SN_op_Division, OP_DIVPD, SIMD_EMIT_BINARY },
197 { SN_op_ExclusiveOr, OP_XORPD, SIMD_EMIT_BINARY },
198 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
199 { SN_op_Multiply, OP_MULPD, SIMD_EMIT_BINARY },
200 { SN_op_Subtraction, OP_SUBPD, SIMD_EMIT_BINARY },
/*
 * Intrinsic table for Mono.Simd.Vector2l (two signed 64-bit ints).
 * NOTE: entries must stay sorted by method-name string (bsearch lookup).
 */
209 static const SimdIntrinsc vector2l_intrinsics[] = {
210 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
211 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
212 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
213 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
214 { SN_ShiftRightLogic, OP_PSHRQ, SIMD_EMIT_SHIFT },
215 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
216 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
217 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
218 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
219 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
220 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
221 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
222 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
223 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
224 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
225 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
/*
 * Intrinsic table for Mono.Simd.Vector4ui (four unsigned 32-bit ints).
 * NOTE: entries must stay sorted by method-name string (bsearch lookup).
 */
234 static const SimdIntrinsc vector4ui_intrinsics[] = {
235 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
236 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
237 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
238 { SN_Max, OP_PMAXD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
239 { SN_Min, OP_PMIND_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
/* NOTE(review): arithmetic right shift (PSRAD) exposed on an unsigned
 * element type; presumably intentional API surface — confirm against the
 * managed Vector4ui definition. */
240 { SN_ShiftRightArithmetic, OP_PSARD, SIMD_EMIT_SHIFT },
241 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
242 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
243 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
244 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
245 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
246 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
247 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
248 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
249 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
250 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
251 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
252 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
253 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
254 { SN_op_RightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
255 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
/*
 * Intrinsic table for Mono.Simd.Vector4i (four signed 32-bit ints).
 * NOTE: entries must stay sorted by method-name string (bsearch lookup).
 */
264 static const SimdIntrinsc vector4i_intrinsics[] = {
265 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
266 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_EMIT_BINARY },
267 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
268 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
269 { SN_Max, OP_PMAXD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
270 { SN_Min, OP_PMIND, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
271 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
272 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
273 { SN_ShiftRightLogic, OP_PSHRD, SIMD_EMIT_SHIFT },
274 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
275 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
276 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
277 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
278 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
279 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
280 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
281 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
282 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
283 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
284 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
285 { SN_op_RightShift, OP_PSARD, SIMD_EMIT_SHIFT },
286 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
/*
 * Intrinsic table for Mono.Simd.Vector8us (eight unsigned 16-bit ints).
 * NOTE: entries must stay sorted by method-name string (bsearch lookup).
 */
295 static const SimdIntrinsc vector8us_intrinsics[] = {
296 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
297 { SN_Average, OP_PAVGW_UN, SIMD_EMIT_BINARY },
298 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
299 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
300 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
301 { SN_Max, OP_PMAXW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
302 { SN_Min, OP_PMINW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
303 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_EMIT_BINARY },
304 { SN_ShiftRightArithmetic, OP_PSARW, SIMD_EMIT_SHIFT },
305 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
306 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
307 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
308 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
309 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
310 { SN_SubWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
311 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
312 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
313 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
314 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
315 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
316 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
317 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
318 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
319 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
320 { SN_op_RightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
321 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
330 static const SimdIntrinsc vector8s_intrinsics[] = {
331 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_EMIT_BINARY },
332 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
333 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_EMIT_BINARY },
334 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
335 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
336 { SN_Max, OP_PMAXW, SIMD_EMIT_BINARY },
337 { SN_Min, OP_PMINW, SIMD_EMIT_BINARY },
338 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_EMIT_BINARY },
339 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
340 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
341 { SN_ShiftRightLogic, OP_PSHRW, SIMD_EMIT_SHIFT },
342 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
343 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
344 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
345 { SN_SubWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
346 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
347 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
348 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
349 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
350 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
351 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
352 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
353 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
354 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
355 { SN_op_RightShift, OP_PSARW, SIMD_EMIT_SHIFT },
356 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
/*
 * Intrinsic table for Mono.Simd.Vector16b (sixteen unsigned bytes).
 * NOTE: entries must stay sorted by method-name string (bsearch lookup).
 */
365 static const SimdIntrinsc vector16b_intrinsics[] = {
366 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
367 { SN_Average, OP_PAVGB_UN, SIMD_EMIT_BINARY },
368 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
369 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
370 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
371 { SN_Max, OP_PMAXB_UN, SIMD_EMIT_BINARY },
372 { SN_Min, OP_PMINB_UN, SIMD_EMIT_BINARY },
373 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
374 { SN_SubWithSaturation, OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
375 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_EMIT_BINARY },
376 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
377 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
378 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
379 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
380 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
381 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
382 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
383 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
/*
 * Intrinsic table for Mono.Simd.Vector16sb (sixteen signed bytes).
 * NOTE: entries must stay sorted by method-name string (bsearch lookup).
 */
392 static const SimdIntrinsc vector16sb_intrinsics[] = {
393 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_EMIT_BINARY },
394 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
395 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_EMIT_BINARY },
396 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
397 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
398 { SN_Max, OP_PMAXB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
399 { SN_Min, OP_PMINB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
400 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
401 { SN_SubWithSaturation, OP_PSUBB_SAT, SIMD_EMIT_BINARY },
402 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
403 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
404 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
405 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
406 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
407 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
408 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
409 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
412 static guint32 simd_supported_versions;
414 /*TODO match using number of parameters as well*/
416 simd_intrinsic_compare_by_name (const void *key, const void *value)
418 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
423 VREG_HAS_XZERO_BB0 = 0x02,
424 VREG_HAS_OTHER_OP_BB0 = 0x04,
425 VREG_SINGLE_BB_USE = 0x08,
426 VREG_MANY_BB_USE = 0x10,
430 mono_simd_intrinsics_init (void)
432 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
433 /*TODO log the supported flags*/
436 static inline gboolean
437 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
439 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
440 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
441 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
442 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
448 static inline gboolean
449 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
451 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
454 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
455 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
456 vreg_flags [reg] |= VREG_MANY_BB_USE;
457 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
459 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
460 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
461 target_bb [reg] = bb;
462 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
468 This pass recalculates which vars need MONO_INST_INDIRECT.
470 We cannot do this for non SIMD vars since code like mono_get_vtable_var
471 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
474 mono_simd_simplify_indirection (MonoCompile *cfg)
477 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Stage 1: optimistically clear MONO_INST_INDIRECT on every simd-typed
 * local and record the highest simd vreg number seen. */
481 for (i = 0; i < cfg->num_varinfo; i++) {
482 MonoInst *var = cfg->varinfo [i];
483 if (var->klass->simd_type) {
484 var->flags &= ~MONO_INST_INDIRECT;
485 max_vreg = MAX (var->dreg, max_vreg);
/* Stage 2: walk every instruction; any simd var whose address is taken
 * (OP_LDADDR) genuinely needs MONO_INST_INDIRECT back. Also find the
 * first non-empty basic block for the XZERO scan below. */
489 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
490 if (!first_bb && bb->code)
492 for (ins = bb->code; ins; ins = ins->next) {
493 if (ins->opcode == OP_LDADDR) {
494 MonoInst *var = (MonoInst*)ins->inst_p0;
495 if (var->klass->simd_type) {
496 var->flags |= MONO_INST_INDIRECT;
/* Per-vreg flag byte and per-vreg "single block of use" table,
 * both indexed by dreg (0..max_vreg). */
502 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
503 vreg_flags = g_malloc0 (max_vreg + 1);
504 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Only non-indirect, non-volatile simd vars participate in the
 * redundant-XZERO elimination. */
506 for (i = 0; i < cfg->num_varinfo; i++) {
507 MonoInst *var = cfg->varinfo [i];
508 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
509 vreg_flags [var->dreg] = VREG_USED;
510 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
/*Scan the first basic block looking for xzeros that are not otherwise used there*/
515 for (ins = first_bb->code; ins; ins = ins->next) {
516 if (ins->opcode == OP_XZERO) {
517 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
518 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
519 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* Any other touch of the vreg in bb0 (address, dest or either source)
 * disqualifies it; the helper returns TRUE once flags are updated. */
523 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
526 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
528 if (apply_vreg_first_block_interference (cfg, ins, ins->sreg1, max_vreg, vreg_flags))
530 if (apply_vreg_first_block_interference (cfg, ins, ins->sreg2, max_vreg, vreg_flags))
/* Debug dump of the bb0 classification. */
534 if (IS_DEBUG_ON (cfg)) {
535 for (i = 0; i < cfg->num_varinfo; i++) {
536 MonoInst *var = cfg->varinfo [i];
537 if (var->klass->simd_type) {
538 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
539 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
540 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
541 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
/*TODO stop here if no var is xzero only*/
/*
Scan all other bb and check if it has only one other use
Ideally this would be done after an extended bb formation pass

FIXME This pass could use dominator information to properly
place the XZERO on the bb that dominates all uses of the var,
but this will have zero effect with the current local reg alloc

TODO simplify the use of flags.
*/
559 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
560 for (ins = bb->code; ins; ins = ins->next) {
562 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
564 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
566 if (apply_vreg_following_block_interference (cfg, ins, ins->sreg1, bb, max_vreg, vreg_flags, target_bb))
568 if (apply_vreg_following_block_interference (cfg, ins, ins->sreg2, bb, max_vreg, vreg_flags, target_bb))
/* Stage 3: for each var whose only non-bb0 use sits in a single block,
 * sink the XZERO next to that first use (and later delete the bb0 one). */
573 for (i = 0; i < cfg->num_varinfo; i++) {
574 MonoInst *var = cfg->varinfo [i];
575 if (!var->klass->simd_type)
577 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
578 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
579 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
580 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
582 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
584 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
/*We can, pretty much kill it.*/
586 if (ins->dreg == var->dreg) {
/* First use reads the var: re-materialize the zero right before it. */
588 } else if (ins->sreg1 == var->dreg || ins->sreg2 == var->dreg) {
590 MONO_INST_NEW (cfg, tmp, OP_XZERO);
591 tmp->dreg = var->dreg;
592 tmp->type = STACK_VTYPE;
593 tmp->klass = var->klass;
594 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Finally drop the now-redundant XZERO from the first basic block. */
600 for (ins = first_bb->code; ins; ins = ins->next) {
601 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
610 * This function expect that src be a value.
613 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
615 if (src->opcode == OP_XMOVE) {
617 } else if (src->type == STACK_VTYPE) {
620 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
621 mono_print_ins (src);
622 g_assert_not_reached ();
626 * This function will load the value if needed.
629 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
631 if (src->opcode == OP_XMOVE) {
633 } else if (src->opcode == OP_LDADDR) {
634 int res = ((MonoInst*)src->inst_p0)->dreg;
637 } else if (src->type == STACK_VTYPE) {
639 } else if (src->type == STACK_PTR) {
642 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
643 ins->klass = cmethod->klass;
644 ins->sreg1 = src->dreg;
645 ins->type = STACK_VTYPE;
646 ins->dreg = alloc_ireg (cfg);
647 MONO_ADD_INS (cfg->cbb, ins);
650 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
651 mono_print_ins (src);
652 g_assert_not_reached ();
656 get_int_to_float_spill_area (MonoCompile *cfg)
658 if (!cfg->iconv_raw_var) {
659 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
660 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
662 return cfg->iconv_raw_var;
666 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
669 int left_vreg, right_vreg;
671 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
672 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
675 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
676 ins->klass = cmethod->klass;
677 ins->sreg1 = left_vreg;
678 ins->sreg2 = right_vreg;
679 ins->type = STACK_VTYPE;
680 ins->klass = cmethod->klass;
681 ins->dreg = alloc_ireg (cfg);
682 ins->inst_c0 = intrinsic->flags;
683 MONO_ADD_INS (cfg->cbb, ins);
688 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
693 vreg = get_simd_vreg (cfg, cmethod, args [0]);
695 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
696 ins->klass = cmethod->klass;
698 ins->type = STACK_VTYPE;
699 ins->dreg = alloc_ireg (cfg);
700 MONO_ADD_INS (cfg->cbb, ins);
705 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
710 vreg = load_simd_vreg (cfg, cmethod, args [0]);
712 if (intrinsic->opcode) {
713 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
714 ins->klass = cmethod->klass;
716 ins->inst_c0 = intrinsic->opcode;
717 ins->type = STACK_VTYPE;
718 ins->dreg = vreg = alloc_ireg (cfg);
719 MONO_ADD_INS (cfg->cbb, ins);
722 MONO_INST_NEW (cfg, tmp, OP_EXTRACT_I4);
723 tmp->klass = cmethod->klass;
725 tmp->type = STACK_I4;
726 tmp->dreg = alloc_ireg (cfg);
727 MONO_ADD_INS (cfg->cbb, tmp);
729 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
730 ins->klass = mono_defaults.single_class;
731 ins->sreg1 = tmp->dreg;
732 ins->type = STACK_R8;
733 ins->dreg = alloc_freg (cfg);
734 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
735 MONO_ADD_INS (cfg->cbb, ins);
740 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
745 for (i = 1; i < 5; ++i) {
746 MONO_INST_NEW (cfg, ins, OP_PUSH_R4);
747 ins->sreg1 = args [5 - i]->dreg;
748 ins->klass = args [5 - i]->klass;
749 MONO_ADD_INS (cfg->cbb, ins);
752 if (args [0]->opcode == OP_LDADDR) { /*Eliminate LDADDR if it's initing a local var*/
753 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
754 NULLIFY_INS (args [0]);
756 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
757 ins->klass = cmethod->klass;
758 ins->type = STACK_VTYPE;
760 MONO_ADD_INS (cfg->cbb, ins);
762 int vreg = alloc_ireg (cfg);
764 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
765 ins->klass = cmethod->klass;
766 ins->type = STACK_VTYPE;
768 MONO_ADD_INS (cfg->cbb, ins);
770 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE_REG);
771 ins->klass = cmethod->klass;
772 ins->dreg = args [0]->dreg;
774 MONO_ADD_INS (cfg->cbb, ins);
780 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
785 vreg = get_simd_vreg (cfg, cmethod, args [0]);
788 MONO_INST_NEW (cfg, ins, OP_XMOVE);
789 ins->klass = cmethod->klass;
790 ins->type = STACK_VTYPE;
792 ins->dreg = alloc_ireg (cfg);
793 MONO_ADD_INS (cfg->cbb, ins);
799 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
802 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
804 vreg = get_simd_vreg (cfg, cmethod, args [0]);
806 if (args [1]->opcode != OP_ICONST) {
807 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
808 ins->klass = mono_defaults.int32_class;
809 ins->sreg1 = args [1]->dreg;
810 ins->type = STACK_I4;
811 ins->dreg = vreg2 = alloc_ireg (cfg);
812 MONO_ADD_INS (cfg->cbb, ins);
814 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
817 MONO_INST_NEW (cfg, ins, opcode);
818 ins->klass = cmethod->klass;
822 if (args [1]->opcode == OP_ICONST) {
823 ins->inst_imm = args [1]->inst_c0;
824 NULLIFY_INS (args [1]);
827 ins->type = STACK_VTYPE;
828 ins->dreg = alloc_ireg (cfg);
829 MONO_ADD_INS (cfg->cbb, ins);
835 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
840 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
842 if (args [1]->opcode != OP_ICONST) {
843 g_warning ("Shuffle with non literals is not yet supported");
844 g_assert_not_reached ();
846 vreg = get_simd_vreg (cfg, cmethod, args [0]);
847 NULLIFY_INS (args [1]);
849 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
850 ins->klass = cmethod->klass;
852 ins->inst_c0 = args [1]->inst_c0;
853 ins->type = STACK_VTYPE;
854 ins->dreg = alloc_ireg (cfg);
855 MONO_ADD_INS (cfg->cbb, ins);
860 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
864 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
865 ins->klass = cmethod->klass;
866 ins->sreg1 = args [0]->dreg;
867 ins->type = STACK_VTYPE;
868 ins->dreg = alloc_ireg (cfg);
869 MONO_ADD_INS (cfg->cbb, ins);
874 simd_intrinsic_emit_store_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
879 vreg = get_simd_vreg (cfg, cmethod, args [1]);
881 MONO_INST_NEW (cfg, ins, OP_STOREX_ALIGNED_MEMBASE_REG);
882 ins->klass = cmethod->klass;
883 ins->dreg = args [0]->dreg;
885 ins->type = STACK_VTYPE;
886 MONO_ADD_INS (cfg->cbb, ins);
891 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
896 vreg = get_simd_vreg (cfg, cmethod, args [0]);
898 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
899 ins->klass = cmethod->klass;
901 ins->type = STACK_I4;
902 ins->dreg = alloc_ireg (cfg);
903 MONO_ADD_INS (cfg->cbb, ins);
909 simd_version_name (guint32 version)
912 case SIMD_VERSION_SSE1:
914 case SIMD_VERSION_SSE2:
916 case SIMD_VERSION_SSE3:
918 case SIMD_VERSION_SSSE3:
920 case SIMD_VERSION_SSE41:
922 case SIMD_VERSION_SSE42:
924 case SIMD_VERSION_SSE4a:
931 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
933 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
935 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
938 if (IS_DEBUG_ON (cfg)) {
940 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
941 max = fsig->param_count + fsig->hasthis;
942 for (i = 0; i < max; ++i) {
943 printf ("param %d: ", i);
944 mono_print_ins (args [i]);
947 if (result->simd_version && !((1 << result->simd_version) & simd_supported_versions)) {
948 if (IS_DEBUG_ON (cfg))
949 printf ("function %s::%s/%d requires unsuported SIMD instruction set %s \n", cmethod->klass->name, cmethod->name, fsig->param_count, simd_version_name (result->simd_version));
953 switch (result->simd_emit_mode) {
954 case SIMD_EMIT_BINARY:
955 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
956 case SIMD_EMIT_UNARY:
957 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
958 case SIMD_EMIT_GETTER:
959 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
961 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
963 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
964 case SIMD_EMIT_SHUFFLE:
965 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
966 case SIMD_EMIT_SHIFT:
967 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
968 case SIMD_EMIT_LOAD_ALIGNED:
969 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
970 case SIMD_EMIT_STORE_ALIGNED:
971 return simd_intrinsic_emit_store_aligned (result, cfg, cmethod, args);
972 case SIMD_EMIT_EXTRACT_MASK:
973 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
975 g_assert_not_reached ();
/*
 * Public entry point from the JIT: if @cmethod belongs to a Mono.Simd
 * vector type, try to replace the call with inline simd instructions.
 * Dispatches on the declaring type's name to the matching intrinsic table.
 * (The tail of this function is outside the visible chunk.)
 */
979 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
981 if (!cmethod->klass->simd_type)
/* Remember for later passes (e.g. mono_simd_simplify_indirection) that
 * this method touches simd code. */
983 cfg->uses_simd_intrinsics = 1;
984 if (!strcmp ("Vector2d", cmethod->klass->name))
985 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
986 if (!strcmp ("Vector4f", cmethod->klass->name))
987 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
988 if (!strcmp ("Vector2l", cmethod->klass->name))
989 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
990 if (!strcmp ("Vector4ui", cmethod->klass->name))
991 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
992 if (!strcmp ("Vector4i", cmethod->klass->name))
993 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
994 if (!strcmp ("Vector8us", cmethod->klass->name))
995 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
996 if (!strcmp ("Vector8s", cmethod->klass->name))
997 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
998 if (!strcmp ("Vector16b", cmethod->klass->name))
999 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1000 if (!strcmp ("Vector16sb", cmethod->klass->name))
1001 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));