2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
18 General notes on SIMD intrinsics
20 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
21 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
22 TODO extend op_to_op_dest_membase to handle simd ops
23 TODO add support for indexed versions of simd ops
24 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
25 TODO make sure locals, arguments and spills are properly aligned.
26 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
27 TODO add stuff to man pages
28 TODO document this under /docs
29 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
30 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
31 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
32 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
33 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
34 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
35 TODO check if we need to init the SSE control word with better precision.
36 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
37 TODO make SimdRuntime.get_AccelMode work under AOT
39 General notes for SIMD intrinsics.
41 -Bad extractor and constructor performance
42 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
43 It will be loaded in the FP stack just to be pushed on the call stack.
45 A similar thing happens with the Vector4f constructor that requires float vars to be
47 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
48 trip to the FP stack is desirable.
50 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
54 -Promote OP_EXTRACT_I4 to a STORE op
55 The advantage of this change is that it could have a _membase version and promote further optimizations.
57 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
61 #ifdef MONO_ARCH_SIMD_INTRINSICS
/* Pass tracing helpers: IS_DEBUG_ON gates on cfg->verbose_level >= 3 and
 * DEBUG(a) runs `a` only when tracing is on (a `cfg` must be in scope at
 * the use site).  Swap in the commented-out definition to force it off. */
63 //#define IS_DEBUG_ON(cfg) (0)
65 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
66 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
75 SIMD_EMIT_LOAD_ALIGNED,
76 SIMD_EMIT_STORE_ALIGNED,
77 SIMD_EMIT_EXTRACT_MASK,
81 #ifdef HAVE_ARRAY_ELEM_INIT
/* Build one packed string blob for all SIMD method names.  simd-methods.h
 * is included repeatedly with different SIMD_METHOD expansions: first to
 * lay out a char array per name inside msgstr_t, then to emit the string
 * data itself, then to define each SN_* constant as the offsetof its
 * string within msgstr_t.  method_name(idx) maps an offset back to the
 * const char*.  This avoids one pointer per name in the data segment. */
82 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
83 #define MSGSTRFIELD1(line) str##line
84 static const struct msgstr_t {
85 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
86 #include "simd-methods.h"
89 #define SIMD_METHOD(str,name) str,
90 #include "simd-methods.h"
95 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
96 #include "simd-methods.h"
98 #define method_name(idx) ((const char*)&method_names + (idx))
/* Fallback path (presumably the !HAVE_ARRAY_ELEM_INIT branch — the #else
 * falls in lines elided from this excerpt): a plain array of string
 * pointers, with SN_* names defined as indices into it. */
101 #define SIMD_METHOD(str,name) str,
102 static const char * const method_names [] = {
103 #include "simd-methods.h"
107 #define SIMD_METHOD(str,name) name,
109 #include "simd-methods.h"
113 #define method_name(idx) (method_names [(idx)])
120 guint8 simd_emit_mode : 4;
121 guint8 simd_version : 4;
/* Intrinsics for Vector4f (4 x float), kept sorted by method name.
 * Entry fields: { name id, opcode (or element index for getters, since
 * the emitter reuses this slot as the shuffle immediate), emit mode,
 * [min SIMD version], [extra flags such as SIMD_COMP_* / prefetch mode] }. */
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, 0, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
132 { SN_AndNot, OP_ANDNPS, SIMD_EMIT_BINARY },
133 { SN_CompareEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
141 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
142 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
143 { SN_HorizontalAdd, OP_HADDPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
144 { SN_HorizontalSub, OP_HSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
145 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_EMIT_BINARY },
146 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_EMIT_BINARY },
147 { SN_InvSqrt, OP_RSQRTPS, SIMD_EMIT_UNARY },
148 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
149 { SN_Max, OP_MAXPS, SIMD_EMIT_BINARY },
150 { SN_Min, OP_MINPS, SIMD_EMIT_BINARY },
151 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
152 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
153 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
154 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
155 { SN_Reciprocal, OP_RCPPS, SIMD_EMIT_UNARY },
156 { SN_Shuffle, OP_SHUFLEPS, SIMD_EMIT_SHUFFLE },
157 { SN_Sqrt, OP_SQRTPS, SIMD_EMIT_UNARY },
158 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
/* For getters the second field is the element index (W=3, X=0, Y=1, Z=2),
 * used as a shuffle immediate by simd_intrinsic_emit_getter. */
159 { SN_get_W, 3, SIMD_EMIT_GETTER },
160 { SN_get_X, 0, SIMD_EMIT_GETTER },
161 { SN_get_Y, 1, SIMD_EMIT_GETTER },
162 { SN_get_Z, 2, SIMD_EMIT_GETTER },
163 { SN_op_Addition, OP_ADDPS, SIMD_EMIT_BINARY },
164 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_EMIT_BINARY },
165 { SN_op_BitwiseOr, OP_ORPS, SIMD_EMIT_BINARY },
166 { SN_op_Division, OP_DIVPS, SIMD_EMIT_BINARY },
167 { SN_op_ExclusiveOr, OP_XORPS, SIMD_EMIT_BINARY },
168 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
169 { SN_op_Multiply, OP_MULPS, SIMD_EMIT_BINARY },
170 { SN_op_Subtraction, OP_SUBPS, SIMD_EMIT_BINARY },
/* Intrinsics for Vector2d (2 x double), kept sorted by method name. */
179 static const SimdIntrinsc vector2d_intrinsics[] = {
180 { SN_AddSub, OP_ADDSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
181 { SN_AndNot, OP_ANDNPD, SIMD_EMIT_BINARY },
182 { SN_CompareEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
183 { SN_CompareLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
184 { SN_CompareLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
185 { SN_CompareNotEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
186 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
187 { SN_CompareNotLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
188 { SN_CompareOrdered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
189 { SN_CompareUnordered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
190 { SN_Duplicate, OP_DUPPD, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
191 { SN_HorizontalAdd, OP_HADDPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
192 { SN_HorizontalSub, OP_HSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
193 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_EMIT_BINARY },
194 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_EMIT_BINARY },
195 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
196 { SN_Max, OP_MAXPD, SIMD_EMIT_BINARY },
197 { SN_Min, OP_MINPD, SIMD_EMIT_BINARY },
198 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
199 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
200 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
201 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
202 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
203 { SN_op_Addition, OP_ADDPD, SIMD_EMIT_BINARY },
204 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_EMIT_BINARY },
205 { SN_op_BitwiseOr, OP_ORPD, SIMD_EMIT_BINARY },
206 { SN_op_Division, OP_DIVPD, SIMD_EMIT_BINARY },
207 { SN_op_ExclusiveOr, OP_XORPD, SIMD_EMIT_BINARY },
208 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
209 { SN_op_Multiply, OP_MULPD, SIMD_EMIT_BINARY },
210 { SN_op_Subtraction, OP_SUBPD, SIMD_EMIT_BINARY },
/* Intrinsics for Vector2ul (2 x ulong), kept sorted by method name. */
219 static const SimdIntrinsc vector2ul_intrinsics[] = {
220 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
221 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
222 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
223 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
224 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
225 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
226 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
227 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
228 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
229 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
230 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
231 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
232 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
233 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
234 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
235 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
236 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
237 { SN_op_RightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
238 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
/* Intrinsics for Vector2l (2 x long), kept sorted by method name.
 * Note: no arithmetic right shift entry — SSE has no PSRAQ, so only the
 * logical variant (ShiftRightLogic / op_RightShift as OP_PSHRQ) exists. */
247 static const SimdIntrinsc vector2l_intrinsics[] = {
248 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
249 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
250 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
251 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
252 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
253 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
254 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
255 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
256 { SN_ShiftRightLogic, OP_PSHRQ, SIMD_EMIT_SHIFT },
257 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
258 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
259 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
260 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
261 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
262 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
263 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
264 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
265 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
266 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
267 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
/* Intrinsics for Vector4ui (4 x uint), kept sorted by method name. */
276 static const SimdIntrinsc vector4ui_intrinsics[] = {
277 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
278 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
279 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
280 { SN_Max, OP_PMAXD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
281 { SN_Min, OP_PMIND_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
282 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
283 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
284 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
285 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
286 { SN_ShiftRightArithmetic, OP_PSARD, SIMD_EMIT_SHIFT },
287 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
288 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
289 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
290 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
291 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
292 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
293 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
294 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
295 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
296 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
297 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
298 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
299 { SN_op_RightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
300 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
/* Intrinsics for Vector4i (4 x int), kept sorted by method name.
 * op_RightShift is arithmetic (OP_PSARD) for this signed type; the logical
 * variant is exposed separately as ShiftRightLogic. */
310 static const SimdIntrinsc vector4i_intrinsics[] = {
311 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
312 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_EMIT_BINARY },
313 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
314 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
315 { SN_Max, OP_PMAXD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
316 { SN_Min, OP_PMIND, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
317 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
318 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
319 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
320 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
321 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
322 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
323 { SN_ShiftRightLogic, OP_PSHRD, SIMD_EMIT_SHIFT },
324 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
325 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
326 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
327 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
328 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
329 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
330 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
331 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
332 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
333 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
334 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
335 { SN_op_RightShift, OP_PSARD, SIMD_EMIT_SHIFT },
336 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
/* Intrinsics for Vector8us (8 x ushort), kept sorted by method name.
 * All saturating add/sub use the _UN (unsigned) opcode variants here. */
345 static const SimdIntrinsc vector8us_intrinsics[] = {
346 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
347 { SN_Average, OP_PAVGW_UN, SIMD_EMIT_BINARY },
348 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
349 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
350 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
351 { SN_Max, OP_PMAXW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
352 { SN_Min, OP_PMINW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
353 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_EMIT_BINARY },
354 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
355 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
356 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
357 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
358 { SN_ShiftRightArithmetic, OP_PSARW, SIMD_EMIT_SHIFT },
359 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
360 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
361 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
362 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
363 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
364 { SN_SubWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
365 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
366 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
367 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
368 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
369 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
370 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
371 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
372 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
373 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
374 { SN_op_RightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
375 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
/* Intrinsics for Vector8s (8 x signed short), kept sorted by method name. */
384 static const SimdIntrinsc vector8s_intrinsics[] = {
385 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_EMIT_BINARY },
386 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
387 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_EMIT_BINARY },
388 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
389 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
390 { SN_Max, OP_PMAXW, SIMD_EMIT_BINARY },
391 { SN_Min, OP_PMINW, SIMD_EMIT_BINARY },
392 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_EMIT_BINARY },
393 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
394 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
395 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
396 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
397 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
398 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
399 { SN_ShiftRightLogic, OP_PSHRW, SIMD_EMIT_SHIFT },
400 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
401 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
402 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
/* FIX: this signed-short type must use the signed saturating subtract
 * (PSUBSW -> OP_PSUBW_SAT), matching OP_PADDW_SAT above and the signed
 * byte table's OP_PSUBB_SAT.  The previous OP_PSUBW_SAT_UN (PSUBUSW) is
 * the unsigned variant and belongs only in vector8us_intrinsics. */
403 { SN_SubWithSaturation, OP_PSUBW_SAT, SIMD_EMIT_BINARY },
404 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
405 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
406 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
407 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
408 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
409 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
410 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
411 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
412 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
413 { SN_op_RightShift, OP_PSARW, SIMD_EMIT_SHIFT },
414 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
/* Intrinsics for Vector16b (16 x byte), kept sorted by method name. */
423 static const SimdIntrinsc vector16b_intrinsics[] = {
424 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
425 { SN_Average, OP_PAVGB_UN, SIMD_EMIT_BINARY },
426 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
427 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
428 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
429 { SN_Max, OP_PMAXB_UN, SIMD_EMIT_BINARY },
430 { SN_Min, OP_PMINB_UN, SIMD_EMIT_BINARY },
431 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
432 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
433 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
434 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
435 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
436 { SN_SubWithSaturation, OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
437 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_EMIT_BINARY },
438 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
439 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
440 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
441 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
442 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
443 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
444 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
445 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
/* Intrinsics for Vector16sb (16 x sbyte), kept sorted by method name;
 * saturating add/sub use the signed opcode variants. */
454 static const SimdIntrinsc vector16sb_intrinsics[] = {
455 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_EMIT_BINARY },
456 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
457 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_EMIT_BINARY },
458 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
459 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
460 { SN_Max, OP_PMAXB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
461 { SN_Min, OP_PMINB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
462 { SN_Prefetch0, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
463 { SN_Prefetch1, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
464 { SN_Prefetch2, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
465 { SN_PrefetchNTA, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
466 { SN_StoreAligned, 0, SIMD_EMIT_STORE_ALIGNED },
467 { SN_SubWithSaturation, OP_PSUBB_SAT, SIMD_EMIT_BINARY },
468 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
469 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
470 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
471 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
472 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
473 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
474 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
475 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
/* Bitmask of SIMD versions the CPU supports; filled in by
 * mono_simd_intrinsics_init via mono_arch_cpu_enumerate_simd_versions. */
478 static guint32 simd_supported_versions;
480 /*TODO match using number of parameters as well*/
/* bsearch-style comparator: key is a method-name string, value is a
 * SimdIntrinsc entry whose interned name is resolved via method_name. */
482 simd_intrinsic_compare_by_name (const void *key, const void *value)
484 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
489 VREG_HAS_XZERO_BB0 = 0x02,
490 VREG_HAS_OTHER_OP_BB0 = 0x04,
491 VREG_SINGLE_BB_USE = 0x08,
492 VREG_MANY_BB_USE = 0x10,
/* One-time initialization: ask the arch backend which SIMD instruction-set
 * versions the current CPU provides and cache the bitmask. */
496 mono_simd_intrinsics_init (void)
498 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
499 /*TODO log the supported flags*/
502 static inline gboolean
/* If `reg` is a tracked SIMD vreg, record that something other than an
 * XZERO touches it in the first basic block: clear VREG_HAS_XZERO_BB0 and
 * set VREG_HAS_OTHER_OP_BB0.  (The return statements fall in lines elided
 * from this excerpt — presumably TRUE when flags changed; confirm against
 * the full source.) */
503 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
505 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
506 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
507 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
508 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
514 static inline gboolean
/* Track in which basic blocks (after bb0) a vreg that had an XZERO in bb0
 * is used: first sighting marks VREG_SINGLE_BB_USE and remembers the bb in
 * target_bb; a sighting in a different bb upgrades it to VREG_MANY_BB_USE.
 * Vregs not tracked (no XZERO in bb0, out of range, or already seen in
 * this bb) are rejected by the guard below. */
515 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
517 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
520 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
521 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
522 vreg_flags [reg] |= VREG_MANY_BB_USE;
523 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
525 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
526 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
527 target_bb [reg] = bb;
528 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
534 This pass recalculates which vars need MONO_INST_INDIRECT.
536 We cannot do this for non-SIMD vars since code like mono_get_vtable_var
537 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
540 mono_simd_simplify_indirection (MonoCompile *cfg)
543 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Phase 1: clear MONO_INST_INDIRECT on every SIMD var and find the
 * highest SIMD vreg so the flag arrays below can be sized. */
547 for (i = 0; i < cfg->num_varinfo; i++) {
548 MonoInst *var = cfg->varinfo [i];
549 if (var->klass->simd_type) {
550 var->flags &= ~MONO_INST_INDIRECT;
551 max_vreg = MAX (var->dreg, max_vreg);
/* Phase 2: re-add MONO_INST_INDIRECT only where an OP_LDADDR actually
 * takes the address of a SIMD var; also remember the first non-empty bb. */
555 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
556 if (!first_bb && bb->code)
558 for (ins = bb->code; ins; ins = ins->next) {
559 if (ins->opcode == OP_LDADDR) {
560 MonoInst *var = (MonoInst*)ins->inst_p0;
561 if (var->klass->simd_type) {
562 var->flags |= MONO_INST_INDIRECT;
568 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
/* Per-vreg flag byte and the single bb (if any) where the vreg is used. */
569 vreg_flags = g_malloc0 (max_vreg + 1);
570 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
572 for (i = 0; i < cfg->num_varinfo; i++) {
573 MonoInst *var = cfg->varinfo [i];
574 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
575 vreg_flags [var->dreg] = VREG_USED;
576 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
580 /*Scan the first basic block looking for xzeros that are not otherwise used*/
581 for (ins = first_bb->code; ins; ins = ins->next) {
582 if (ins->opcode == OP_XZERO) {
583 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
584 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
585 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* Any other def/use in bb0 disqualifies the vreg from the optimization. */
589 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
592 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
594 if (apply_vreg_first_block_interference (cfg, ins, ins->sreg1, max_vreg, vreg_flags))
596 if (apply_vreg_first_block_interference (cfg, ins, ins->sreg2, max_vreg, vreg_flags))
600 if (IS_DEBUG_ON (cfg)) {
601 for (i = 0; i < cfg->num_varinfo; i++) {
602 MonoInst *var = cfg->varinfo [i];
603 if (var->klass->simd_type) {
604 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
605 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
606 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
607 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
612 /*TODO stop here if no var is xzero only*/
615 Scan all other bb and check if it has only one other use
616 Ideally this would be done after an extended bb formation pass
618 FIXME This pass could use dominator information to properly
619 place the XZERO on the bb that dominates all uses of the var,
620 but this will have zero effect with the current local reg alloc
622 TODO simplify the use of flags.
625 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
626 for (ins = bb->code; ins; ins = ins->next) {
628 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
630 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
632 if (apply_vreg_following_block_interference (cfg, ins, ins->sreg1, bb, max_vreg, vreg_flags, target_bb))
634 if (apply_vreg_following_block_interference (cfg, ins, ins->sreg2, bb, max_vreg, vreg_flags, target_bb))
/* Phase 3: for each vreg used in exactly one later bb, sink a fresh
 * OP_XZERO to that bb right before its first use there. */
639 for (i = 0; i < cfg->num_varinfo; i++) {
640 MonoInst *var = cfg->varinfo [i];
641 if (!var->klass->simd_type)
643 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
644 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
645 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
646 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
648 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
650 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
651 /*First op is a def of the var: the bb0 XZERO is dead, we can pretty much kill it.*/
652 if (ins->dreg == var->dreg) {
654 } else if (ins->sreg1 == var->dreg || ins->sreg2 == var->dreg) {
656 MONO_INST_NEW (cfg, tmp, OP_XZERO);
657 tmp->dreg = var->dreg;
658 tmp->type = STACK_VTYPE;
659 tmp->klass = var->klass;
660 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Phase 4: remove the now-redundant XZEROs from the first bb. */
666 for (ins = first_bb->code; ins; ins = ins->next) {
667 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
676 * This function expects src to be a value (not an address); it returns the
 * SIMD vreg holding it (return statements fall in elided lines).  Aborts
 * with a diagnostic if the source kind cannot be handled.
679 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
681 if (src->opcode == OP_XMOVE) {
683 } else if (src->type == STACK_VTYPE) {
686 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
687 mono_print_ins (src);
688 g_assert_not_reached ();
692 * This function will load the value if needed: unlike get_simd_vreg it also
 * accepts an address (OP_LDADDR resolves to the variable's vreg; a STACK_PTR
 * source is dereferenced with an OP_LOADX_MEMBASE).  Aborts with a
 * diagnostic for unhandled source kinds.
695 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
697 if (src->opcode == OP_XMOVE) {
699 } else if (src->opcode == OP_LDADDR) {
700 int res = ((MonoInst*)src->inst_p0)->dreg;
703 } else if (src->type == STACK_VTYPE) {
705 } else if (src->type == STACK_PTR) {
/* Pointer source: emit an explicit SIMD load from [src->dreg]. */
708 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
709 ins->klass = cmethod->klass;
710 ins->sreg1 = src->dreg;
711 ins->type = STACK_VTYPE;
712 ins->dreg = alloc_ireg (cfg);
713 MONO_ADD_INS (cfg->cbb, ins);
716 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
717 mono_print_ins (src);
718 g_assert_not_reached ();
/* Lazily create (and cache on the cfg) a volatile int32 local used as the
 * memory spill slot for OP_ICONV_TO_R8_RAW (int bits -> FP register). */
722 get_int_to_float_spill_area (MonoCompile *cfg)
724 if (!cfg->iconv_raw_var) {
725 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
726 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
728 return cfg->iconv_raw_var;
/* Emit a two-operand SIMD instruction (intrinsic->opcode) with both inputs
 * taken as SIMD vregs; intrinsic->flags is forwarded in inst_c0 (used e.g.
 * for the SIMD_COMP_* immediate of OP_COMPPS/OP_COMPPD). */
732 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
735 int left_vreg, right_vreg;
737 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
738 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
741 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
742 ins->klass = cmethod->klass;
743 ins->sreg1 = left_vreg;
744 ins->sreg2 = right_vreg;
745 ins->type = STACK_VTYPE;
/* NOTE(review): klass was already assigned above (line 742); this second
 * assignment is redundant. */
746 ins->klass = cmethod->klass;
747 ins->dreg = alloc_ireg (cfg);
748 ins->inst_c0 = intrinsic->flags;
749 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a one-operand SIMD instruction (intrinsic->opcode) on the arg's
 * SIMD vreg, producing a new vreg of vtype stack type. */
754 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
759 vreg = get_simd_vreg (cfg, cmethod, args [0]);
761 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
762 ins->klass = cmethod->klass;
764 ins->type = STACK_VTYPE;
765 ins->dreg = alloc_ireg (cfg);
766 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a Vector4f element getter (get_X/get_Y/get_Z/get_W).  For a nonzero
 * element index (stored in intrinsic->opcode) the element is first rotated
 * into lane 0 via OP_SHUFLEPS; the low 32 bits are then moved to an int reg
 * with OP_EXTRACT_I4 and finally reinterpreted as a float through
 * OP_ICONV_TO_R8_RAW, bouncing through the int->float spill slot. */
771 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
776 vreg = load_simd_vreg (cfg, cmethod, args [0]);
778 if (intrinsic->opcode) {
779 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
780 ins->klass = cmethod->klass;
/* The element index doubles as the shuffle immediate. */
782 ins->inst_c0 = intrinsic->opcode;
783 ins->type = STACK_VTYPE;
784 ins->dreg = vreg = alloc_ireg (cfg);
785 MONO_ADD_INS (cfg->cbb, ins);
788 MONO_INST_NEW (cfg, tmp, OP_EXTRACT_I4);
789 tmp->klass = cmethod->klass;
791 tmp->type = STACK_I4;
792 tmp->dreg = alloc_ireg (cfg);
793 MONO_ADD_INS (cfg->cbb, tmp);
795 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
796 ins->klass = mono_defaults.single_class;
797 ins->sreg1 = tmp->dreg;
798 ins->type = STACK_R8;
799 ins->dreg = alloc_freg (cfg);
800 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
801 MONO_ADD_INS (cfg->cbb, ins);
/* Emit the Vector4f .ctor: push the four float args in reverse order
 * (args[4]..args[1]) so they lie in memory in vector order, then load them
 * as one XMM value with OP_LOADX_STACK.  If `this` is an OP_LDADDR of a
 * local, the address computation is nullified and the result lands
 * directly in the local's vreg; otherwise the value is stored back through
 * the `this` pointer with OP_STOREX_MEMBASE_REG. */
806 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
811 for (i = 1; i < 5; ++i) {
812 MONO_INST_NEW (cfg, ins, OP_PUSH_R4);
813 ins->sreg1 = args [5 - i]->dreg;
814 ins->klass = args [5 - i]->klass;
815 MONO_ADD_INS (cfg->cbb, ins);
818 if (args [0]->opcode == OP_LDADDR) { /*Eliminate LDADDR if it's initing a local var*/
819 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
820 NULLIFY_INS (args [0]);
822 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
823 ins->klass = cmethod->klass;
824 ins->type = STACK_VTYPE;
826 MONO_ADD_INS (cfg->cbb, ins);
828 int vreg = alloc_ireg (cfg);
830 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
831 ins->klass = cmethod->klass;
832 ins->type = STACK_VTYPE;
834 MONO_ADD_INS (cfg->cbb, ins);
836 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE_REG);
837 ins->klass = cmethod->klass;
838 ins->dreg = args [0]->dreg;
840 MONO_ADD_INS (cfg->cbb, ins);
/* Emit op_Explicit: a bitwise reinterpretation between SIMD vector types,
 * implemented as a plain register copy (OP_XMOVE) with the target klass. */
846 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
851 vreg = get_simd_vreg (cfg, cmethod, args [0]);
854 MONO_INST_NEW (cfg, ins, OP_XMOVE);
855 ins->klass = cmethod->klass;
856 ins->type = STACK_VTYPE;
858 ins->dreg = alloc_ireg (cfg);
859 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a packed shift.  A constant count (OP_ICONST) goes into inst_imm and
 * the const arg is nullified; a variable count is first broadcast into an
 * XMM reg via OP_ICONV_TO_X and the opcode is bumped to its register-shift
 * twin (the +1 layout invariant noted below). */
865 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
868 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
870 vreg = get_simd_vreg (cfg, cmethod, args [0]);
872 if (args [1]->opcode != OP_ICONST) {
873 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
874 ins->klass = mono_defaults.int32_class;
875 ins->sreg1 = args [1]->dreg;
876 ins->type = STACK_I4;
877 ins->dreg = vreg2 = alloc_ireg (cfg);
878 MONO_ADD_INS (cfg->cbb, ins);
880 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
883 MONO_INST_NEW (cfg, ins, opcode);
884 ins->klass = cmethod->klass;
888 if (args [1]->opcode == OP_ICONST) {
889 ins->inst_imm = args [1]->inst_c0;
890 NULLIFY_INS (args [1]);
893 ins->type = STACK_VTYPE;
894 ins->dreg = alloc_ireg (cfg);
895 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a shuffle op; the shuffle mask must be a compile-time constant
 * (OP_ICONST) and becomes the instruction's inst_c0 immediate.  The const
 * instruction itself is nullified since its value is consumed here. */
901 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
906 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
908 if (args [1]->opcode != OP_ICONST) {
909 g_warning ("Shuffle with non literals is not yet supported");
910 g_assert_not_reached ();
912 vreg = get_simd_vreg (cfg, cmethod, args [0]);
/* NOTE(review): NULLIFY_INS runs before inst_c0 is read below — this
 * relies on NULLIFY_INS leaving inst_c0 intact; verify if it changes. */
913 NULLIFY_INS (args [1]);
915 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
916 ins->klass = cmethod->klass;
918 ins->inst_c0 = args [1]->inst_c0;
919 ins->type = STACK_VTYPE;
920 ins->dreg = alloc_ireg (cfg);
921 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emit an aligned 16-byte vector load (OP_LOADX_ALIGNED_MEMBASE) from the
 * address held in args [0] into a fresh vreg typed as cmethod->klass.
 * NOTE(review): alignment of the source address is the caller's contract —
 * a misaligned address faults at runtime on x86; not validated here.
 */
926 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
930 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
931 ins->klass = cmethod->klass;
932 ins->sreg1 = args [0]->dreg; /* base address */
933 ins->type = STACK_VTYPE;
934 ins->dreg = alloc_ireg (cfg);
935 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emit an aligned 16-byte vector store: the vector in args [1] is written
 * through the address held in args [0] (membase form, so dreg is the base
 * address register, not a result).
 */
940 simd_intrinsic_emit_store_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
945 vreg = get_simd_vreg (cfg, cmethod, args [1]); /* value to store */
947 MONO_INST_NEW (cfg, ins, OP_STOREX_ALIGNED_MEMBASE_REG);
948 ins->klass = cmethod->klass;
949 ins->dreg = args [0]->dreg; /* destination base address */
951 ins->type = STACK_VTYPE;
952 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emit OP_EXTRACT_MASK: collapses the sign bits of the vector in args [0]
 * into a scalar int (STACK_I4) result.
 */
957 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
962 vreg = get_simd_vreg (cfg, cmethod, args [0]);
964 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
965 ins->klass = cmethod->klass;
967 ins->type = STACK_I4; /* result is a plain integer bitmask */
968 ins->dreg = alloc_ireg (cfg);
969 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emit a memory prefetch hint for the address in args [0]. The prefetch
 * locality level is taken from the intrinsic table's flags field and
 * forwarded through backend.arg_info for the arch backend to encode.
 */
975 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
979 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
980 ins->klass = cmethod->klass;
981 ins->sreg1 = args [0]->dreg; /* address to prefetch */
982 ins->backend.arg_info = intrinsic->flags; /* locality hint from the intrinsic table */
983 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Map a SIMD_VERSION_* enum value to its human readable name, used only in
 * diagnostic output (see the "requires unsupported SIMD instruction set"
 * message in emit_intrinsics).
 */
988 simd_version_name (guint32 version)
991 case SIMD_VERSION_SSE1:
993 case SIMD_VERSION_SSE2:
995 case SIMD_VERSION_SSE3:
997 case SIMD_VERSION_SSSE3:
999 case SIMD_VERSION_SSE41:
1001 case SIMD_VERSION_SSE42:
1003 case SIMD_VERSION_SSE4a:
/*
 * Look up cmethod in a per-type intrinsic table (sorted by name, hence the
 * bsearch) and dispatch to the matching emitter. Returns the emitted
 * instruction, or falls through (NULL) when the method is not an intrinsic
 * or needs a SIMD instruction set the current CPU does not report support
 * for (checked against the simd_supported_versions bitmask).
 */
1010 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1012 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1014 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1017 if (IS_DEBUG_ON (cfg)) {
1019 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1020 max = fsig->param_count + fsig->hasthis;
1021 for (i = 0; i < max; ++i) {
1022 printf ("param %d: ", i);
1023 mono_print_ins (args [i]);
/* Reject intrinsics whose required SIMD version isn't in the CPU's supported set. */
1026 if (result->simd_version && !((1 << result->simd_version) & simd_supported_versions)) {
1027 if (IS_DEBUG_ON (cfg))
1028 printf ("function %s::%s/%d requires unsupported SIMD instruction set %s \n", cmethod->klass->name, cmethod->name, fsig->param_count, simd_version_name (result->simd_version));
1032 switch (result->simd_emit_mode) {
1033 case SIMD_EMIT_BINARY:
1034 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1035 case SIMD_EMIT_UNARY:
1036 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1037 case SIMD_EMIT_GETTER:
1038 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1039 case SIMD_EMIT_CTOR:
1040 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1041 case SIMD_EMIT_CAST:
1042 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1043 case SIMD_EMIT_SHUFFLE:
1044 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1045 case SIMD_EMIT_SHIFT:
1046 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1047 case SIMD_EMIT_LOAD_ALIGNED:
1048 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1049 case SIMD_EMIT_STORE_ALIGNED:
1050 return simd_intrinsic_emit_store_aligned (result, cfg, cmethod, args);
1051 case SIMD_EMIT_EXTRACT_MASK:
1052 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1053 case SIMD_EMIT_PREFETCH:
1054 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1056 g_assert_not_reached ();
/*
 * Intrinsics on the Mono.Simd.SimdRuntime class itself. get_AccelMode is
 * replaced by an ICONST holding the simd_supported_versions bitmask computed
 * at JIT startup (see the file-header TODO: this inlining is not AOT safe,
 * since the mask is baked in for the compiling machine's CPU).
 */
1060 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1062 if (!strcmp ("get_AccelMode", cmethod->name)) {
1064 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
/*
 * Entry point called from the method-to-IR pass: decide whether a call to
 * cmethod can be replaced by SIMD instructions. SimdRuntime calls are
 * handled specially; otherwise only classes flagged simd_type are
 * considered, and dispatch is by class name to the per-type intrinsic
 * table. Marks the cfg as using SIMD so later passes keep vregs aligned.
 */
1071 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1073 if (!strcmp ("Mono.Simd", cmethod->klass->name_space) && !strcmp ("SimdRuntime", cmethod->klass->name))
1074 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1075 if (!cmethod->klass->simd_type)
1077 cfg->uses_simd_intrinsics = 1; /* set before emission: later passes need it even on partial success */
1078 if (!strcmp ("Vector2d", cmethod->klass->name))
1079 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1080 if (!strcmp ("Vector4f", cmethod->klass->name))
1081 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1082 if (!strcmp ("Vector2ul", cmethod->klass->name))
1083 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1084 if (!strcmp ("Vector2l", cmethod->klass->name))
1085 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1086 if (!strcmp ("Vector4ui", cmethod->klass->name))
1087 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1088 if (!strcmp ("Vector4i", cmethod->klass->name))
1089 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1090 if (!strcmp ("Vector8us", cmethod->klass->name))
1091 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1092 if (!strcmp ("Vector8s", cmethod->klass->name))
1093 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1094 if (!strcmp ("Vector16b", cmethod->klass->name))
1095 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1096 if (!strcmp ("Vector16sb", cmethod->klass->name))
1097 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));