2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
15 #include "mono/utils/bsearch.h"
16 #include <mono/metadata/abi-details.h>
17 #include <mono/metadata/reflection-internals.h>
20 General notes on SIMD intrinsics
22 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
23 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
24 TODO extend op_to_op_dest_membase to handle simd ops
25 TODO add support for indexed versions of simd ops
26 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
27 TODO make sure locals, arguments and spills are properly aligned.
28 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
29 TODO add stuff to man pages
30 TODO document this under /docs
31 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
32 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
33 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
34 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
35 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
36 TODO passing simd args byval to a non-intrinsic method causes some useless local var loads/stores to happen.
37 TODO check if we need to init the SSE control word with better precision.
38 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
39 TODO make SimdRuntime.get_AccelMode work under AOT
40 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
41 TODO extend bounds checking code to support range checking.
43 General notes for SIMD intrinsics.
45 -Bad extractor and constructor performance
46 Extracting a float from an XMM register is a complete disaster if you are passing it as an argument.
47 It will be loaded in the FP stack just to be pushed on the call stack.
49 A similar thing happens with the Vector4f constructor, which requires float vars to be
51 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
52 trip to the FP stack is desirable.
54 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
58 -Promote OP_EXTRACT_I4 to a STORE op
59 The advantage of this change is that it could have a _membase version and promote further optimizations.
61 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
65 #if defined (MONO_ARCH_SIMD_INTRINSICS)
67 #if defined (DISABLE_JIT)
70 mono_simd_intrinsics_init (void)
/* Debug tracing helpers. The commented-out variant below would disable tracing unconditionally. */
76 //#define IS_DEBUG_ON(cfg) (0)
/* True when the verbose_level of the given cfg is at least 3. */
78 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
/* Runs statement `a` only when IS_DEBUG_ON (cfg) holds; a variable named `cfg` must be in scope at the use site. */
79 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
85 SIMD_EMIT_GETTER_QWORD,
91 SIMD_EMIT_LOAD_ALIGNED,
93 SIMD_EMIT_EXTRACT_MASK,
/*
 * Method-name table. simd-methods.h is included several times, each time with
 * a different definition of SIMD_METHOD(str,name), to generate the pieces below.
 * With HAVE_ARRAY_ELEM_INIT the strings are packed into one struct and each
 * name is defined as the byte offset of its string inside that struct.
 */
97 #ifdef HAVE_ARRAY_ELEM_INIT
98 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
/* Pastes the line number onto "str" to form a unique member name. */
99 #define MSGSTRFIELD1(line) str##line
100 static const struct msgstr_t {
/* One char array member per method, sized to hold the string including its terminator. */
101 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
102 #include "simd-methods.h"
/* Expands every entry to its string literal. */
105 #define SIMD_METHOD(str,name) str,
106 #include "simd-methods.h"
/* Defines every name as the offset of its string member within struct msgstr_t. */
111 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
112 #include "simd-methods.h"
/* Turns a name (a byte offset) into a pointer to its string inside method_names. */
114 #define method_name(idx) ((const char*)&method_names + (idx))
/* NOTE(review): the definitions below look like the fallback used when HAVE_ARRAY_ELEM_INIT is not set - confirm the conditional structure. */
117 #define SIMD_METHOD(str,name) str,
/* Plain array of string pointers, one per SIMD_METHOD entry. */
118 static const char * const method_names [] = {
119 #include "simd-methods.h"
/* Expands every entry to its bare name (presumably inside an enum - confirm). */
123 #define SIMD_METHOD(str,name) name,
125 #include "simd-methods.h"
/* In this variant a name is an index into method_names. */
129 #define method_name(idx) (method_names [(idx)])
136 guint8 simd_version_flags;
137 guint8 simd_emit_mode : 4;
/*
 * Intrinsic table for the 4 x float vector type (presumably Vector4f - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_COMP_* or SIMD_PREFETCH_MODE_*).
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
141 static const SimdIntrinsic vector4f_intrinsics[] = {
142 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
143 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
144 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
145 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
146 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
147 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
148 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
149 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
150 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
151 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
152 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
153 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
154 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
155 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
156 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
157 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
158 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
159 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
160 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
161 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
162 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
163 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
164 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
165 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
166 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
167 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
168 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
169 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
170 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
171 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
172 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
173 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
174 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
175 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
176 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
177 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
178 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
179 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
180 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
181 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
182 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
183 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
184 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
185 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
186 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
187 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
188 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
189 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
190 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
191 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
192 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
/*
 * Intrinsic table for the 2 x double vector type (presumably Vector2d - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_COMP_* or SIMD_PREFETCH_MODE_*).
 * Element getters use SIMD_EMIT_GETTER_QWORD here rather than SIMD_EMIT_GETTER.
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
195 static const SimdIntrinsic vector2d_intrinsics[] = {
196 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
197 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
198 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
199 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
200 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
201 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
202 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
203 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
204 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
205 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
206 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
207 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
208 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
209 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
210 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
211 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
212 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
213 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
214 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
215 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
216 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
217 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
219 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
220 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
221 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
222 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
223 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
224 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
225 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
226 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
227 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
228 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
229 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
230 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
231 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
232 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
233 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
234 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
235 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
236 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the 2 x unsigned 64-bit vector type (presumably Vector2ul - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_PREFETCH_MODE_* for the prefetch rows).
 * The SIMD_VERSION_* value must come before the SIMD_EMIT_* mode in every row;
 * the initializers are positional, so swapping them silently stores each
 * constant in the other's field.
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
239 static const SimdIntrinsic vector2ul_intrinsics[] = {
240 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
241 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
242 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
243 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
244 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
245 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
246 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
247 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
248 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
249 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
250 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
251 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
252 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
253 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
254 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
255 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
256 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
257 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
258 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
259 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
260 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
261 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
262 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
263 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the 2 x signed 64-bit vector type (presumably Vector2l - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_PREFETCH_MODE_* for the prefetch rows).
 * This table lists LogicalRightShift but no op_RightShift row.
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
266 static const SimdIntrinsic vector2l_intrinsics[] = {
267 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
268 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
269 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
270 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
271 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
272 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
273 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
274 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
275 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
276 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
277 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
278 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
279 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
280 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
281 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
282 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
283 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
284 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
285 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
286 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
287 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
288 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
289 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
290 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
291 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the 4 x unsigned 32-bit vector type (presumably Vector4ui - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_COMP_* or SIMD_PREFETCH_MODE_*).
 * Here op_RightShift maps to OP_PSHRD and ArithmeticRightShift to OP_PSARD.
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
294 static const SimdIntrinsic vector4ui_intrinsics[] = {
295 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
296 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
297 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
298 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
299 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
300 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
301 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
302 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
303 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
304 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
305 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
306 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
307 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
308 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
309 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
310 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
311 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
312 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
313 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
314 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
315 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
316 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
317 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
318 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
319 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
320 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
321 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
322 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
323 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
324 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
325 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
326 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
327 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
328 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
329 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the 4 x signed 32-bit vector type (presumably Vector4i - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_COMP_* or SIMD_PREFETCH_MODE_*).
 * Here op_RightShift maps to OP_PSARD and LogicalRightShift to OP_PSHRD.
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
332 static const SimdIntrinsic vector4i_intrinsics[] = {
333 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
334 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
335 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
336 { SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
337 { SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
338 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
339 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
340 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
341 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
342 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
343 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
344 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
345 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
346 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
347 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
348 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
349 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
350 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
351 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
352 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
353 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
354 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
355 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
356 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
357 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
358 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
359 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
360 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
361 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
362 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
363 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
364 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
365 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
366 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
367 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
368 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
369 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
370 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the 8 x unsigned 16-bit vector type (presumably Vector8us - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_COMP_* or SIMD_PREFETCH_MODE_*).
 * The optional fifth value is not a version slot: plain integer compares such
 * as CompareEqual leave it out, like the other integer tables do.
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
373 static const SimdIntrinsic vector8us_intrinsics[] = {
374 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
375 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
376 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
377 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
378 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
379 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
380 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
381 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
382 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
383 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
384 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
385 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
386 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
387 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
388 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
389 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
390 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
391 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
392 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
393 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
395 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
396 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
397 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
398 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
399 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
400 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
401 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
402 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
403 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
404 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
405 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
406 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
407 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
408 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
409 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
410 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
411 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
412 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
413 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
414 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
415 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
416 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
417 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
418 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
419 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
420 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
421 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the 8 x signed 16-bit vector type (presumably Vector8s - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_COMP_* or SIMD_PREFETCH_MODE_*).
 * Saturating add and subtract both use the signed opcodes (no _UN suffix);
 * the _UN variants belong to the unsigned 16-bit table.
 * NOTE(review): OP_PSUBW_SAT is assumed to be declared next to OP_PADDW_SAT
 * in the opcode list - confirm.
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
424 static const SimdIntrinsic vector8s_intrinsics[] = {
425 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
426 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
427 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
428 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
429 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
430 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
431 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
432 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
436 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
437 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
438 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
439 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
440 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
441 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
442 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
443 { SN_SubtractWithSaturation, OP_PSUBW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
444 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
446 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
447 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
448 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
449 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
450 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
451 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
452 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
453 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
454 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
455 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
456 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
457 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
458 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
459 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
460 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
461 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
462 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
463 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
464 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
465 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
466 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
467 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
468 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
469 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
470 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
471 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
472 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the 16 x unsigned 8-bit vector type (presumably Vector16b - confirm).
 * Row layout: method-name id (SN_*), opcode (for getters/setters this looks
 * like the element index), SIMD_VERSION_* value, SIMD_EMIT_* mode, and an
 * optional extra value (SIMD_COMP_* or SIMD_PREFETCH_MODE_*).
 * The V10..V15 accessors are listed between V1 and V2, i.e. the accessor rows
 * are not in numeric order.
 * NOTE(review): rows appear to follow the SN_* declaration order and are
 * presumably searched by name id - confirm before reordering or inserting.
 */
475 static const SimdIntrinsic vector16b_intrinsics[] = {
476 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
477 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
478 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
479 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
480 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
481 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
482 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
483 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
484 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
485 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
486 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
487 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
488 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
489 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
490 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
491 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
492 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
493 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
494 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
495 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
496 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
497 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
498 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
499 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
500 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
501 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
502 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
503 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
504 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
505 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
506 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
507 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
508 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
509 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
510 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
511 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
512 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
513 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
514 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
515 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
516 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
517 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
518 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
519 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
520 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
521 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
522 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
523 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
524 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
525 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
526 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
527 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
528 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
529 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
530 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
531 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
532 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the 16 x signed byte vector type (type name inferred from the array name).
 * Row layout, as consumed by emit_intrinsics () and the simd_intrinsic_emit_* helpers:
 *   { method name id, opcode (element index for get_Vn/set_Vn rows), SIMD version flags, emit mode, optional flags }
 * emit_intrinsics () looks rows up with mono_binary_search () and a strcmp based comparator.
 * NOTE(review): the four SN_Prefetch* rows do not look strcmp-ordered by identifier
 * (PrefetchNonTemporal is listed last) - confirm against the strings produced by method_name ().
 */
539 static const SimdIntrinsic vector16sb_intrinsics[] = {
540 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
541 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
542 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
543 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
544 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
545 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
546 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
547 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
548 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
549 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
550 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
551 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
552 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
553 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
554 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
555 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
556 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
557 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
560 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
561 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
562 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
563 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
564 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
565 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
566 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
567 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
568 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
569 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
570 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
571 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
572 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
573 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
574 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
575 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
576 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
577 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
578 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
579 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
580 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
581 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
584 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
585 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
586 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
587 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
588 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
589 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
590 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
591 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
592 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
593 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
594 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
595 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/* Bitmask of SIMD_VERSION_* flags; assigned in mono_simd_intrinsics_init () and tested against each intrinsic's version flags in emit_intrinsics (). */
598 static guint32 simd_supported_versions;
/* Forward declarations of two file-local emitters that take the same (cfg, cmethod, fsig, args) arguments. */
600 static MonoInst* emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
601 static MonoInst* emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
603 /*TODO match using number of parameters as well*/
/*
 * Comparator passed to mono_binary_search () by emit_intrinsics ().
 * @key is the method name string being searched for, @value points to a SimdIntrinsic table entry.
 * Returns the strcmp () ordering of @key against the entry's method_name ().
 */
605 simd_intrinsic_compare_by_name (const void *key, const void *value)
607 return strcmp (key, method_name (((SimdIntrinsic *)value)->name));
/* Set when an OP_XZERO of the vreg is seen in the first bb before any other use there; cleared again if a later first-bb instruction touches the vreg. */
612 VREG_HAS_XZERO_BB0 = 0x02,
/* Set when some other first-bb instruction references the vreg (see apply_vreg_first_block_interference). */
613 VREG_HAS_OTHER_OP_BB0 = 0x04,
/* Set when exactly one later bb has been seen referencing the vreg; that bb is remembered in target_bb []. */
614 VREG_SINGLE_BB_USE = 0x08,
/* Replaces VREG_SINGLE_BB_USE once a second, different bb references the vreg. */
615 VREG_MANY_BB_USE = 0x10,
/*
 * mono_simd_intrinsics_init:
 *
 *   Caches the result of mono_arch_cpu_enumerate_simd_versions () in simd_supported_versions,
 * which emit_intrinsics () later checks before emitting an intrinsic.
 */
619 mono_simd_intrinsics_init (void)
621 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
622 /*TODO log the supported flags*/
/*
 * Records that @ins, an instruction of the first basic block, references @reg.
 * Only registers that are in range (not -1, <= @max_vreg) and already have some flag set in @vreg_flags are
 * affected: VREG_HAS_XZERO_BB0 is dropped and VREG_HAS_OTHER_OP_BB0 is set.
 * Callers test the gboolean result; presumably it reports whether the flags were updated - TODO confirm.
 */
625 static inline gboolean
626 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
628 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
629 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
630 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
631 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
/*
 * Records that @ins, an instruction of @bb (a block after the first one), references @reg.
 * Registers that are out of range, lack VREG_HAS_XZERO_BB0, or were already attributed to @bb are filtered out
 * by the first test. Otherwise the first block seen sets VREG_SINGLE_BB_USE and is stored in @target_bb [reg];
 * a second, different block turns VREG_SINGLE_BB_USE into VREG_MANY_BB_USE.
 */
637 static inline gboolean
638 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
640 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
/* Second distinct bb using the vreg: demote from single to many. */
643 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
644 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
645 vreg_flags [reg] |= VREG_MANY_BB_USE;
646 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
/* First bb (after bb0) using the vreg: remember it as the candidate target. */
648 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
649 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
650 target_bb [reg] = bb;
651 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
658 This pass recalculates which vars need MONO_INST_INDIRECT.
660 We cannot do this for non SIMD vars since code like mono_get_vtable_var
661 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
/*
 * Steps visible below:
 *  1. clear MONO_INST_INDIRECT on every SIMD var, then set it again for vars whose address is taken by an OP_LDADDR;
 *  2. flag SIMD vars whose only first-bb instruction is an OP_XZERO;
 *  3. find, for each of those, whether exactly one later bb uses it;
 *  4. for single-bb vars, insert an OP_XZERO in that bb (unless the first reference is a pure def) and
 *     handle the original OP_XZERO in the first bb.
 */
664 mono_simd_simplify_indirection (MonoCompile *cfg)
667 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Step 1a: drop the INDIRECT flag from SIMD vars and track the largest vreg they use. */
671 for (i = 0; i < cfg->num_varinfo; i++) {
672 MonoInst *var = cfg->varinfo [i];
673 if (var->klass->simd_type) {
674 var->flags &= ~MONO_INST_INDIRECT;
675 max_vreg = MAX (var->dreg, max_vreg);
/* Step 1b: walk all code; SIMD vars that are the target of an OP_LDADDR get INDIRECT back. */
679 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
680 if (!first_bb && bb->code)
682 for (ins = bb->code; ins; ins = ins->next) {
683 if (ins->opcode == OP_LDADDR) {
684 MonoInst *var = (MonoInst*)ins->inst_p0;
685 if (var->klass->simd_type) {
686 var->flags |= MONO_INST_INDIRECT;
692 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
/* Per-vreg state, indexed by vreg number in [0, max_vreg]. */
693 vreg_flags = (char *)g_malloc0 (max_vreg + 1);
694 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Only SIMD vars that are neither INDIRECT nor VOLATILE take part in the analysis. */
696 for (i = 0; i < cfg->num_varinfo; i++) {
697 MonoInst *var = cfg->varinfo [i];
698 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
699 vreg_flags [var->dreg] = VREG_USED;
700 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
704 /*Scan the first basic block looking for xzeros that are not otherwise used there*/
705 for (ins = first_bb->code; ins; ins = ins->next) {
707 int sregs [MONO_MAX_SRC_REGS];
/* NOTE(review): ins->dreg is not range-checked against max_vreg here, unlike in the apply_vreg_* helpers - confirm an OP_XZERO in this bb can only target a SIMD var vreg. */
709 if (ins->opcode == OP_XZERO) {
710 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
711 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
712 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* Any other reference (address taken, dest, or source) counts as interference in bb0. */
716 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
718 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
720 num_sregs = mono_inst_get_src_registers (ins, sregs);
721 for (i = 0; i < num_sregs; ++i) {
722 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
/* Debug-only dump of the first-bb classification. */
727 if (IS_DEBUG_ON (cfg)) {
728 for (i = 0; i < cfg->num_varinfo; i++) {
729 MonoInst *var = cfg->varinfo [i];
730 if (var->klass->simd_type) {
731 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
732 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
733 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
734 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
739 /*TODO stop here if no var is xzero only*/
742 Scan all other bb and check if it has only one other use
743 Ideally this would be done after an extended bb formation pass
745 FIXME This pass could use dominator information to properly
746 place the XZERO on the bb that dominates all uses of the var,
747 but this will have zero effect with the current local reg alloc
749 TODO simply the use of flags.
752 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
753 for (ins = bb->code; ins; ins = ins->next) {
755 int sregs [MONO_MAX_SRC_REGS];
757 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
759 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
761 num_sregs = mono_inst_get_src_registers (ins, sregs);
762 for (i = 0; i < num_sregs; ++i) {
763 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
764 max_vreg, vreg_flags, target_bb))
/* For every SIMD var used by a single later bb, place an OP_XZERO in that bb. */
770 for (i = 0; i < cfg->num_varinfo; i++) {
771 MonoInst *var = cfg->varinfo [i];
772 if (!var->klass->simd_type)
774 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
775 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
776 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
777 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
779 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
/* Walk the target bb looking for the first instruction that reads or writes the var. */
781 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
783 int sregs [MONO_MAX_SRC_REGS];
784 gboolean found = FALSE;
786 num_sregs = mono_inst_get_src_registers (ins, sregs);
787 for (j = 0; j < num_sregs; ++j) {
788 if (sregs [j] == var->dreg)
791 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
792 if (ins->dreg == var->dreg && !found) {
793 DEBUG (printf ("[simd-simplify] INGORING R%d on BB %d because first op is a def", i, target_bb [var->dreg]->block_num););
796 DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i, target_bb [var->dreg]->block_num); );
/* Build the replacement OP_XZERO and insert it right before the first use. */
798 MONO_INST_NEW (cfg, tmp, OP_XZERO);
799 tmp->dreg = var->dreg;
800 tmp->type = STACK_VTYPE;
801 tmp->klass = var->klass;
802 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Finally revisit the first bb: OP_XZEROs of single-bb vregs are the ones reported as nullified (same unchecked ins->dreg index as above). */
808 for (ins = first_bb->code; ins; ins = ins->next) {
809 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE)) {
810 DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins->dreg); mono_print_ins(ins));
820 * This function expects src to be a value.
/*
 * Resolves the SIMD vreg for @src based on its opcode. Three shapes are recognized:
 * OP_XMOVE, any opcode whose destination register spec is 'x', and OP_VCALL.
 * Anything else is treated as a fatal error (warning, instruction dump, assert).
 * @cmethod is not referenced in the visible code.
 */
823 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
825 const char *spec = INS_INFO (src->opcode);
827 if (src->opcode == OP_XMOVE) {
829 } else if (spec [MONO_INST_DEST] == 'x') {
831 } else if (src->opcode == OP_VCALL) {
/* Unknown producer: there is no way to recover, abort the compile. */
835 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
836 mono_print_ins (src);
837 g_assert_not_reached ();
841 * This function will load the value if needed.
/*
 * Like get_simd_vreg (), but also accepts an address: an OP_LDADDR is mapped to the vreg of the
 * variable whose address was taken, and a STACK_PTR/STACK_MP value is dereferenced with a newly
 * emitted OP_LOADX_MEMBASE. Unsupported inputs are a fatal error.
 * @indirect is an out flag; presumably it tells the caller the value came from memory - TODO confirm.
 */
844 load_simd_vreg_class (MonoCompile *cfg, MonoClass *klass, MonoInst *src, gboolean *indirect)
846 const char *spec = INS_INFO (src->opcode);
850 if (src->opcode == OP_XMOVE) {
852 } else if (src->opcode == OP_LDADDR) {
/* Address of a local: use the local's own vreg instead of going through memory. */
853 int res = ((MonoInst*)src->inst_p0)->dreg;
856 } else if (spec [MONO_INST_DEST] == 'x') {
858 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* Pointer/managed pointer: load the 16 byte value it points to. */
863 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
865 ins->sreg1 = src->dreg;
866 ins->type = STACK_VTYPE;
867 ins->dreg = alloc_ireg (cfg);
868 MONO_ADD_INS (cfg->cbb, ins);
871 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
872 mono_print_ins (src);
873 g_assert_not_reached ();
/* Convenience wrapper: load_simd_vreg_class () using the declaring class of @cmethod. */
877 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
879 return load_simd_vreg_class (cfg, cmethod->klass, src, indirect);
882 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily creates cfg->fconv_to_r8_x_var as a volatile local of type double and returns it. */
884 get_double_spill_area (MonoCompile *cfg)
886 if (!cfg->fconv_to_r8_x_var) {
887 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
888 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
890 return cfg->fconv_to_r8_x_var;
/*
 * Lazily creates cfg->simd_ctor_var as a volatile local of type @avector_klass and returns it.
 * The var is created once per cfg, so later calls return the same var regardless of @avector_klass.
 */
893 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
895 if (!cfg->simd_ctor_var) {
896 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
897 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
899 return cfg->simd_ctor_var;
/* Maps an element type to an expand opcode (callers compare the result with OP_EXPAND_R4/OP_EXPAND_R8); asserts on types it does not handle. */
903 mono_type_to_expand_op (MonoType *type)
905 switch (type->type) {
923 g_assert_not_reached ();
928 type_to_comp_op (MonoType *t)
948 g_assert_not_reached ();
954 type_to_gt_op (MonoType *t)
971 type_to_padd_op (MonoType *t)
997 type_to_psub_op (MonoType *t)
1023 type_to_pmul_op (MonoType *t)
1037 /* PMULQ multiplies two 32 bit numbers into a 64 bit one */
1048 type_to_pdiv_op (MonoType *t)
1062 type_to_pxor_op (MonoType *t)
1065 * These opcodes have the same semantics, but using the
1066 * correctly typed version is better for performance.
1079 type_to_pand_op (MonoType *t)
1092 type_to_por_op (MonoType *t)
1105 type_to_pmin_op (MonoType *t)
1130 type_to_pmax_op (MonoType *t)
/*
 * Produces a SIMD vreg for an operand of type @param_type.
 * SIMD typed operands go through get_simd_vreg (); scalar operands are broadcast with the
 * expand opcode matching their type, into a newly allocated vreg.
 */
1155 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoClass *klass, MonoType *param_type, MonoInst *src)
1160 if (mono_class_from_mono_type (param_type)->simd_type)
1161 return get_simd_vreg (cfg, NULL, src);
/* Scalar operand: emit an OP_EXPAND_* of it. */
1163 expand_op = mono_type_to_expand_op (param_type);
1164 MONO_INST_NEW (cfg, ins, expand_op);
1166 ins->sreg1 = src->dreg;
1167 ins->type = STACK_VTYPE;
1168 ins->dreg = alloc_ireg (cfg);
1169 MONO_ADD_INS (cfg->cbb, ins);
/* The floating point expands are given a spill variable. */
1171 if (expand_op == OP_EXPAND_R4)
1172 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1173 else if (expand_op == OP_EXPAND_R8)
1174 ins->backend.spill_var = get_double_spill_area (cfg);
1180 * simd_intrinsic_emit_binary_op:
1182 * Emit a binary SIMD opcode.
1183 * @LHS/@RHS are the two arguments, they can be either a SIMD type or a scalar one. Scalar arguments are
1184 * expanded to the SIMD type.
1187 simd_intrinsic_emit_binary_op (MonoCompile *cfg, int opcode, int flags, MonoClass *klass, MonoType *lhs_type, MonoType *rhs_type, MonoInst *lhs, MonoInst *rhs)
1190 int left_vreg, right_vreg;
/* Either operand may be a scalar; it is expanded to a full vector first. */
1192 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, lhs_type, lhs);
1193 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, rhs_type, rhs);
/* Emit @opcode on the two vregs; @flags travels in inst_c0. */
1195 MONO_INST_NEW (cfg, ins, opcode);
1197 ins->sreg1 = left_vreg;
1198 ins->sreg2 = right_vreg;
1199 ins->type = STACK_VTYPE;
1200 ins->dreg = alloc_ireg (cfg);
1201 ins->inst_c0 = flags;
1202 MONO_ADD_INS (cfg->cbb, ins);
/* SIMD_EMIT_BINARY handler: requires a two parameter method and forwards to simd_intrinsic_emit_binary_op () with the table entry's opcode and flags. */
1207 simd_intrinsic_emit_binary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1209 MonoMethodSignature *sig = mono_method_signature (cmethod);
1211 g_assert (sig->param_count == 2);
1213 return simd_intrinsic_emit_binary_op (cfg, intrinsic->opcode, intrinsic->flags, cmethod->klass, sig->params [0], sig->params [1], args [0], args [1]);
/* SIMD_EMIT_UNARY handler: emits the table entry's opcode for the SIMD value in args [0], producing a new vector vreg. */
1217 simd_intrinsic_emit_unary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1222 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1224 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1225 ins->klass = cmethod->klass;
1227 ins->type = STACK_VTYPE;
1228 ins->dreg = alloc_ireg (cfg);
1229 MONO_ADD_INS (cfg->cbb, ins);
/* Maps an element type to one of the OP_EXTRACT_{I1,U1,I2,U2,I4} opcodes; asserts on any other type. */
1234 mono_type_to_extract_op (MonoType *type)
1236 switch (type->type) {
1238 return OP_EXTRACT_I1;
1240 return OP_EXTRACT_U1;
1242 return OP_EXTRACT_I2;
1244 return OP_EXTRACT_U2;
1248 return OP_EXTRACT_I4;
1250 g_assert_not_reached ();
1254 /*Returns the amount to shift the element index to get the dword it belongs to*/
/* Used by simd_intrinsic_emit_getter_op () both as a shift (index >> bits) and to build the in-dword mask ((1 << bits) - 1). Asserts on unsupported types. */
1256 mono_type_elements_shift_bits (MonoType *type)
1258 switch (type->type) {
1270 g_assert_not_reached ();
/* Maps an element type to an OP_INSERT_* opcode; used by the LLVM path of simd_intrinsic_emit_setter (). Asserts on unsupported types. */
1274 static G_GNUC_UNUSED int
1275 mono_type_to_insert_op (MonoType *type)
1277 switch (type->type) {
1280 return OP_INSERT_I1;
1283 return OP_INSERT_I2;
1286 return OP_INSERT_I4;
1289 return OP_INSERT_I8;
1291 return OP_INSERT_R4;
1293 return OP_INSERT_R8;
1295 g_assert_not_reached ();
/*
 * Maps an element type to the insert opcode used by the non-LLVM path of simd_intrinsic_emit_setter ().
 * All results are OP_INSERTX_*_SLOW opcodes except one case that returns plain OP_INSERT_I2.
 * Asserts on unsupported types.
 */
1300 mono_type_to_slow_insert_op (MonoType *type)
1302 switch (type->type) {
1305 return OP_INSERTX_U1_SLOW;
1308 return OP_INSERT_I2;
1311 return OP_INSERTX_I4_SLOW;
1314 return OP_INSERTX_I8_SLOW;
1316 return OP_INSERTX_R4_SLOW;
1318 return OP_INSERTX_R8_SLOW;
1320 g_assert_not_reached ();
/*
 * SIMD_EMIT_SETTER handler: writes the scalar in args [1] into one element of the vector referenced by args [0].
 * For setter rows intrinsic->opcode holds the element index. Three code shapes are emitted:
 * a single OP_INSERT_* under LLVM, a single slow insert for 2/4/8 byte elements, and an
 * extract-word + insert-byte pair for the remaining element size.
 */
1325 simd_intrinsic_emit_setter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1328 MonoMethodSignature *sig = mono_method_signature (cmethod);
/* Element size drives the choice of the insert sequence. */
1333 size = mono_type_size (sig->params [0], &align);
1335 if (COMPILE_LLVM (cfg)) {
1336 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1337 ins->klass = cmethod->klass;
1338 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1339 ins->sreg2 = args [1]->dreg;
1340 ins->inst_c0 = intrinsic->opcode;
1341 MONO_ADD_INS (cfg->cbb, ins);
1342 } else if (size == 2 || size == 4 || size == 8) {
1343 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1344 ins->klass = cmethod->klass;
1345 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1346 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1347 ins->sreg2 = args [1]->dreg;
1348 ins->inst_c0 = intrinsic->opcode;
1349 if (sig->params [0]->type == MONO_TYPE_R4)
1350 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1351 else if (sig->params [0]->type == MONO_TYPE_R8)
1352 ins->backend.spill_var = get_double_spill_area (cfg);
1353 MONO_ADD_INS (cfg->cbb, ins);
/* Remaining size: first extract the 16 bit word holding the element (word index = element index / 2)... */
1357 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1358 ins->klass = cmethod->klass;
1359 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1360 ins->type = STACK_I4;
1361 ins->dreg = vreg = alloc_ireg (cfg);
1362 ins->inst_c0 = intrinsic->opcode / 2;
1363 MONO_ADD_INS (cfg->cbb, ins);
/* ...then emit the byte insert, which receives the full element index. */
1365 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1366 ins->klass = cmethod->klass;
1368 ins->sreg2 = args [1]->dreg;
1370 ins->inst_c0 = intrinsic->opcode;
1371 MONO_ADD_INS (cfg->cbb, ins);
/* Store back through the address in args [0]; presumably only when load_simd_vreg () reported an indirect operand - TODO confirm. */
1375 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1376 ins->klass = cmethod->klass;
1377 ins->dreg = args [0]->dreg;
1379 MONO_ADD_INS (cfg->cbb, ins);
1385 * simd_intrinsic_emit_getter_op:
1387 * Emit IR for loading an element of a SIMD value.
1389 * @klass is the simd type, @type is the element type.
1392 simd_intrinsic_emit_getter_op (MonoCompile *cfg, int index, MonoClass *klass, MonoType *type, MonoInst *arg)
1395 int vreg, shift_bits;
1397 vreg = load_simd_vreg_class (cfg, klass, arg, NULL);
/* 64 bit elements are read with a dedicated extract opcode that takes the element index directly. */
1399 if (type->type == MONO_TYPE_I8 || type->type == MONO_TYPE_U8 || type->type == MONO_TYPE_R8) {
1401 gboolean is_r8 = type->type == MONO_TYPE_R8;
1403 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1406 ins->inst_c0 = index;
1408 ins->type = STACK_R8;
1409 ins->dreg = alloc_freg (cfg);
1410 ins->backend.spill_var = get_double_spill_area (cfg);
1412 ins->type = STACK_I8;
1413 ins->dreg = alloc_lreg (cfg);
1415 MONO_ADD_INS (cfg->cbb, ins);
/* Smaller elements: index >> shift_bits is the dword holding the element. */
1419 shift_bits = mono_type_elements_shift_bits (type);
/* Outside LLVM, a non-zero dword index is handled by an OP_PSHUFLED emitted before the extract. */
1421 if ((index >> shift_bits) && !cfg->compile_llvm) {
1422 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1425 ins->inst_c0 = index >> shift_bits;
1426 ins->type = STACK_VTYPE;
1427 ins->dreg = vreg = alloc_ireg (cfg);
1428 MONO_ADD_INS (cfg->cbb, ins);
/* Extract the element: LLVM gets the full index, otherwise only the position inside the dword. */
1431 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (type));
1434 ins->type = STACK_I4;
1435 ins->dreg = vreg = alloc_ireg (cfg);
1436 if (cfg->compile_llvm)
1437 ins->inst_c0 = index;
1439 ins->inst_c0 = index & ((1 << shift_bits) - 1);
1440 MONO_ADD_INS (cfg->cbb, ins);
/* Floats are extracted as raw 32 bit integers and then moved/converted into a float register. */
1442 if (type->type == MONO_TYPE_R4) {
1443 MONO_INST_NEW (cfg, ins, cfg->r4fp ? OP_ICONV_TO_R4_RAW : OP_MOVE_I4_TO_F);
1444 ins->klass = mono_defaults.single_class;
1446 ins->type = cfg->r4_stack_type;
1447 ins->dreg = alloc_freg (cfg);
1448 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1449 MONO_ADD_INS (cfg->cbb, ins);
/* SIMD_EMIT_GETTER handler: forwards to simd_intrinsic_emit_getter_op () with the element index stored in intrinsic->opcode and the method's return type as element type. */
1455 simd_intrinsic_emit_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1457 MonoMethodSignature *sig = mono_method_signature (cmethod);
1459 return simd_intrinsic_emit_getter_op (cfg, intrinsic->opcode, cmethod->klass, sig->ret, args [0]);
/*
 * SIMD_EMIT_GETTER_QWORD handler: reads a 64 bit element (index in intrinsic->opcode) from the vector in args [0].
 * Double results use OP_EXTRACT_R8 with a float register and the double spill area; everything else
 * uses OP_EXTRACT_I8 with a long register.
 */
1463 simd_intrinsic_emit_long_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1467 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1469 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1471 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1472 ins->klass = cmethod->klass;
1474 ins->inst_c0 = intrinsic->opcode;
1476 ins->type = STACK_R8;
1477 ins->dreg = alloc_freg (cfg);
1478 ins->backend.spill_var = get_double_spill_area (cfg);
1480 ins->type = STACK_I8;
1481 ins->dreg = alloc_lreg (cfg);
1483 MONO_ADD_INS (cfg->cbb, ins);
/*
 * SIMD_EMIT_CTOR handler. args [0] is the `this' address (possibly an OP_LDADDR of a local),
 * args [1..] are the constructor arguments.
 *  - One parameter: the scalar is broadcast with an expand opcode.
 *  - Several parameters: each argument is stored to memory at i * arg_size, missing trailing
 *    elements are zero filled, and for an OP_LDADDR target the result is loaded back into the local's vreg.
 */
1489 simd_intrinsic_emit_ctor (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1491 MonoInst *ins = NULL;
1493 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1494 MonoMethodSignature *sig = mono_method_signature (cmethod);
1495 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1496 int arg_size = mono_type_size (sig->params [0], &i);
/* Single argument: expand the scalar over the whole vector. */
1499 if (sig->param_count == 1) {
/* Destination is either the vreg of the local behind the (then removed) LDADDR, or a new vreg when args [0] is a plain pointer. */
1503 dreg = args [0]->inst_i0->dreg;
1504 NULLIFY_INS (args [0]);
1506 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1507 dreg = alloc_ireg (cfg);
/* The opcode comes either from the table entry or from the parameter type. */
1511 opcode = intrinsic->opcode;
1513 opcode = mono_type_to_expand_op (sig->params [0]);
1514 MONO_INST_NEW (cfg, ins, opcode);
1515 ins->klass = cmethod->klass;
1516 ins->sreg1 = args [1]->dreg;
1517 ins->type = STACK_VTYPE;
1520 MONO_ADD_INS (cfg->cbb, ins);
1521 if (sig->params [0]->type == MONO_TYPE_R4)
1522 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1523 else if (sig->params [0]->type == MONO_TYPE_R8)
1524 ins->backend.spill_var = get_double_spill_area (cfg);
/* Store through the pointer in args [0]; presumably only in the non-LDADDR case - TODO confirm. */
1527 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1528 ins->dreg = args [0]->dreg;
1530 MONO_ADD_INS (cfg->cbb, ins);
/* Several arguments: pick the memory the elements are assembled in - the shared ctor spill var or the caller supplied address. */
1536 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1537 MONO_ADD_INS (cfg->cbb, ins);
1538 addr_reg = ins->dreg;
1540 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1541 addr_reg = args [0]->dreg;
/* Store argument i (args [i + 1]) at offset i * arg_size, last one first. */
1544 for (i = sig->param_count - 1; i >= 0; --i) {
1545 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1548 if (sig->param_count * arg_size < 16) {
1549 /* If there are not enough arguments, fill the rest with 0s */
1550 for (i = sig->param_count; i < 16 / arg_size; ++i) {
1553 MONO_EMIT_NEW_STORE_MEMBASE_IMM (cfg, OP_STOREI4_MEMBASE_IMM, addr_reg, i * arg_size, 0);
1556 g_assert_not_reached ();
1562 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1563 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1564 NULLIFY_INS (args [0]);
/* Load the assembled 16 bytes from the spill area. */
1566 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1567 ins->klass = cmethod->klass;
1568 ins->sreg1 = addr_reg;
1569 ins->type = STACK_VTYPE;
1571 MONO_ADD_INS (cfg->cbb, ins);
/*
 * SIMD_EMIT_CAST handler: a cast between vector types is emitted as an OP_XMOVE into a new vreg.
 * The class is taken from the method's return type when the method is inflated, otherwise from the method's own class.
 */
1577 simd_intrinsic_emit_cast (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1583 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1585 if (cmethod->is_inflated)
1587 klass = mono_class_from_mono_type (mono_method_signature (cmethod)->ret);
1589 klass = cmethod->klass;
1591 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1593 ins->type = STACK_VTYPE;
1595 ins->dreg = alloc_ireg (cfg);
1596 MONO_ADD_INS (cfg->cbb, ins);
/*
 * SIMD_EMIT_SHIFT handler: shifts the vector in args [0] by the amount in args [1].
 * A constant amount is encoded as an immediate; any other amount is first converted with
 * OP_ICONV_TO_X and the register variant of the opcode (table opcode + 1) is used.
 */
1601 simd_intrinsic_emit_shift (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1604 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1606 vreg = get_simd_vreg (cfg, cmethod, args [0]);
/* Variable shift amount: convert it with OP_ICONV_TO_X into vreg2. */
1608 if (args [1]->opcode != OP_ICONST) {
1609 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1610 ins->klass = mono_defaults.int32_class;
1611 ins->sreg1 = args [1]->dreg;
1612 ins->type = STACK_I4;
1613 ins->dreg = vreg2 = alloc_ireg (cfg);
1614 MONO_ADD_INS (cfg->cbb, ins);
1616 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1619 MONO_INST_NEW (cfg, ins, opcode);
1620 ins->klass = cmethod->klass;
/* Constant shift amount: fold it into the instruction and drop the OP_ICONST. */
1624 if (args [1]->opcode == OP_ICONST) {
1625 ins->inst_imm = args [1]->inst_c0;
1626 NULLIFY_INS (args [1]);
1629 ins->type = STACK_VTYPE;
1630 ins->dreg = alloc_ireg (cfg);
1631 MONO_ADD_INS (cfg->cbb, ins);
/* TRUE when @op lies in the opcode range OP_PCMPEQB..OP_PCMPEQQ (inclusive); relies on those opcodes being numbered contiguously. */
1635 static inline gboolean
1636 mono_op_is_packed_compare (int op)
1638 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/*
 * Emits a whole-vector (in)equality test producing an integer result:
 *  1. element-wise compare with @opcode (@flags in inst_c0),
 *  2. OP_EXTRACT_MASK to turn the compare result into an integer bit mask,
 *  3. compare of that mask against 0xFFFF or 0 followed by a set-on-condition opcode.
 */
1642 simd_intrinsic_emit_equality_op (MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args, int opcode, int flags)
1645 int left_vreg, right_vreg, tmp_vreg;
1647 left_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1648 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
/* NOTE(review): ins->klass is assigned twice below; the second assignment is redundant. */
1650 MONO_INST_NEW (cfg, ins, opcode);
1651 ins->klass = cmethod->klass;
1652 ins->sreg1 = left_vreg;
1653 ins->sreg2 = right_vreg;
1654 ins->type = STACK_VTYPE;
1655 ins->klass = cmethod->klass;
1656 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1657 ins->inst_c0 = flags;
1658 MONO_ADD_INS (cfg->cbb, ins);
1660 /*FIXME the next ops are SSE specific*/
1661 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1662 ins->klass = cmethod->klass;
1663 ins->sreg1 = tmp_vreg;
1664 ins->type = STACK_I4;
1665 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1666 MONO_ADD_INS (cfg->cbb, ins);
1668 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
/* Packed integer compares and every EQ test compare the mask with 0xFFFF: OP_CEQ for EQ, OP_CLT_UN for NEQ. */
1669 if (mono_op_is_packed_compare (opcode) || flags == SIMD_COMP_EQ) {
1670 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1671 NEW_UNALU (cfg, ins, flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
/* Remaining case (non-packed-compare opcode with a flag other than SIMD_COMP_EQ): compare the mask with 0 and use OP_CGT_UN. */
1673 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1674 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1676 MONO_ADD_INS (cfg->cbb, ins);
/* SIMD_EMIT_EQUALITY handler: forwards to simd_intrinsic_emit_equality_op () with the table entry's opcode and flags. */
1681 simd_intrinsic_emit_equality (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1683 return simd_intrinsic_emit_equality_op (cfg, cmethod, args, intrinsic->opcode, intrinsic->flags);
/*
 * SIMD_EMIT_SHUFFLE handler. The last argument is the shuffle selector and has to be an OP_ICONST;
 * it is removed from the instruction stream and encoded in inst_c0.
 * With three parameters a second source vector (args [1]) is used and OP_PSHUFLED is replaced by OP_SHUFPS.
 */
1687 simd_intrinsic_emit_shuffle (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1690 int vreg, vreg2 = -1;
1691 int param_count = mono_method_signature (cmethod)->param_count;
1693 if (args [param_count - 1]->opcode != OP_ICONST) {
1694 /*TODO Shuffle with non literals is not yet supported */
1698 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1699 if (param_count == 3)
1700 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
/* The selector's inst_c0 is still read below, after the OP_ICONST has been nullified. */
1702 NULLIFY_INS (args [param_count - 1]);
1705 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1706 ins->klass = cmethod->klass;
1709 ins->inst_c0 = args [param_count - 1]->inst_c0;
1710 ins->type = STACK_VTYPE;
1711 ins->dreg = alloc_ireg (cfg);
1712 MONO_ADD_INS (cfg->cbb, ins);
1714 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1715 ins->opcode = OP_SHUFPS;
/* SIMD_EMIT_LOAD_ALIGNED handler: emits OP_LOADX_ALIGNED_MEMBASE from the address in args [0] into a new vector vreg. */
1720 simd_intrinsic_emit_load_aligned (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1724 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1725 ins->klass = cmethod->klass;
1726 ins->sreg1 = args [0]->dreg;
1727 ins->type = STACK_VTYPE;
1728 ins->dreg = alloc_ireg (cfg);
1729 MONO_ADD_INS (cfg->cbb, ins);
/* SIMD_EMIT_STORE handler: emits the table entry's store opcode for the vector in args [1]; the base address (args [0]) goes in dreg. */
1734 simd_intrinsic_emit_store (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1739 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1741 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1742 ins->klass = cmethod->klass;
1743 ins->dreg = args [0]->dreg;
1745 ins->type = STACK_VTYPE;
1746 MONO_ADD_INS (cfg->cbb, ins);
/* SIMD_EMIT_EXTRACT_MASK handler: emits OP_EXTRACT_MASK on the vector in args [0], yielding a STACK_I4 result in a new vreg. */
1751 simd_intrinsic_emit_extract_mask (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1756 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1758 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1759 ins->klass = cmethod->klass;
1761 ins->type = STACK_I4;
1762 ins->dreg = alloc_ireg (cfg);
1763 MONO_ADD_INS (cfg->cbb, ins);
/* SIMD_EMIT_PREFETCH handler: emits OP_PREFETCH_MEMBASE on the address in args [0]; the prefetch mode (table flags) is passed in backend.arg_info. */
1769 simd_intrinsic_emit_prefetch (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1773 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1774 ins->klass = cmethod->klass;
1775 ins->sreg1 = args [0]->dreg;
1776 ins->backend.arg_info = intrinsic->flags;
1777 MONO_ADD_INS (cfg->cbb, ins);
/* Emits the table entry's opcode with no source operands into a new vector register (alloc_xreg); @args is not used in the visible code. */
1782 simd_intrinsic_emit_const (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1786 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1787 ins->klass = cmethod->klass;
1788 ins->type = STACK_VTYPE;
1789 ins->dreg = alloc_xreg (cfg);
1790 MONO_ADD_INS (cfg->cbb, ins);
/* Maps a single SIMD_VERSION_* flag to a printable string; emit_intrinsics () prints the result with "%s" in its debug output. */
1795 simd_version_name (guint32 version)
1798 case SIMD_VERSION_SSE1:
1800 case SIMD_VERSION_SSE2:
1802 case SIMD_VERSION_SSE3:
1804 case SIMD_VERSION_SSSE3:
1806 case SIMD_VERSION_SSE41:
1808 case SIMD_VERSION_SSE42:
1810 case SIMD_VERSION_SSE4a:
/*
 * Looks @cmethod up by name in the table @intrinsics (@size entries) and, when found and supported,
 * dispatches to the simd_intrinsic_emit_* handler selected by the entry's emit mode.
 * The lookup is a binary search using strcmp, so the table has to be ordered accordingly.
 */
1817 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsic *intrinsics, guint32 size)
1819 const SimdIntrinsic *result = (const SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
1821 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
/* Debug-only trace of the match and of every argument (including `this' when present). */
1824 if (IS_DEBUG_ON (cfg)) {
1826 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1827 max = fsig->param_count + fsig->hasthis;
1828 for (i = 0; i < max; ++i) {
1829 printf ("param %d: ", i);
1830 mono_print_ins (args [i]);
/* An entry with version flags needs at least one of them to be present in simd_supported_versions. */
1833 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1834 if (IS_DEBUG_ON (cfg)) {
1836 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1837 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1838 if (result->simd_version_flags & (1 << x))
1839 printf ("%s ", simd_version_name (1 << x));
/* Dispatch on the emit mode; an unknown mode is a bug in the table. */
1846 switch (result->simd_emit_mode) {
1847 case SIMD_EMIT_BINARY:
1848 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1849 case SIMD_EMIT_UNARY:
1850 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1851 case SIMD_EMIT_SETTER:
1852 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1853 case SIMD_EMIT_GETTER:
1854 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1855 case SIMD_EMIT_GETTER_QWORD:
1856 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1857 case SIMD_EMIT_CTOR:
1858 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1859 case SIMD_EMIT_CAST:
1860 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1861 case SIMD_EMIT_SHUFFLE:
1862 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1863 case SIMD_EMIT_SHIFT:
1864 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1865 case SIMD_EMIT_EQUALITY:
1866 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1867 case SIMD_EMIT_LOAD_ALIGNED:
1868 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1869 case SIMD_EMIT_STORE:
1870 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1871 case SIMD_EMIT_EXTRACT_MASK:
1872 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1873 case SIMD_EMIT_PREFETCH:
1874 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1876 g_assert_not_reached ();
/*
 * Emits IR computing the address of element @index of the array @arr, for a 16 byte vector access:
 *   address = arr + index * element_size + offsetof (MonoArray, vector)
 * The bounds checks cover both @index and the last element touched by the access
 * (index + 16 / element_size - 1); presumably they are only emitted when @check_bounds is set - TODO confirm.
 */
1880 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1884 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1886 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1887 mult_reg = alloc_preg (cfg);
1888 array_reg = arr->dreg;
1889 index_reg = index->dreg;
1891 #if SIZEOF_VOID_P == 8
1892 /* The array reg is 64 bits but the index reg is only 32 */
1893 index2_reg = alloc_preg (cfg);
1894 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1896 index2_reg = index_reg;
1898 index3_reg = alloc_preg (cfg);
/* Check the first and the last element covered by the 16 byte access. */
1901 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1902 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1903 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1906 add_reg = alloc_preg (cfg);
/* Scale the index, add the array base, then skip the MonoArray header. */
1908 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1909 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1910 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
1911 ins->type = STACK_PTR;
1912 MONO_ADD_INS (cfg->cbb, ins);
/*
 * emit_array_extension_intrinsics:
 *
 *   Emit intrinsics for the array accessor methods matched by name below:
 * GetVector/GetVectorAligned (2 params), SetVector/SetVectorAligned (3 params)
 * and IsAligned (2 params). In every case args [0] is the array and its type is
 * taken from fsig->params [0].
 */
1918 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
/* GetVector*: bounds checked element address, then an aligned or unaligned vector load. */
1920 if ((!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) && fsig->param_count == 2) {
1922 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1924 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1925 load->klass = cmethod->klass;
1927 load->type = STACK_VTYPE;
/* NOTE(review): the vector result register comes from alloc_ireg here, while other SIMD results in this file use alloc_xreg -- confirm this is intended. */
1928 load->dreg = alloc_ireg (cfg);
1929 MONO_ADD_INS (cfg->cbb, load);
/* SetVector*: args [1] is the vector value, args [2] the index. */
1933 if ((!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) && fsig->param_count == 3) {
1935 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1936 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
/* NOTE(review): the aligned store uses the _MEMBASE_REG opcode but the unaligned one uses OP_STOREX_MEMBASE; the TODO list at the top of the file flags this pair as unresolved -- confirm. */
1938 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1939 store->klass = cmethod->klass;
1941 store->sreg1 = vreg;
1942 MONO_ADD_INS (cfg->cbb, store);
/* IsAligned: element address without bounds checks, then ((addr & 15) == 0). */
1946 if (!strcmp ("IsAligned", cmethod->name) && fsig->param_count == 2) {
1948 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1950 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1951 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1952 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1953 MONO_ADD_INS (cfg->cbb, ins);
/*
 * emit_simd_runtime_intrinsics:
 *
 *   Replace a parameterless get_AccelMode call with an integer constant holding the
 * current value of simd_supported_versions, so the value is fixed when the method
 * is compiled.
 */
1961 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1963 if (!strcmp ("get_AccelMode", cmethod->name) && fsig->param_count == 0) {
1965 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
/* Return non-zero when the name of ASSEMBLY is exactly "System.Numerics". */
1972 is_sys_numerics_assembly (MonoAssembly *assembly)
1974 return !strcmp ("System.Numerics", assembly->aname.name);
/* Return non-zero when the name of ASSEMBLY is exactly "System.Numerics.Vectors". */
1978 is_sys_numerics_vectors_assembly (MonoAssembly *assembly)
1980 return !strcmp ("System.Numerics.Vectors", assembly->aname.name);
/*
 * mono_emit_simd_intrinsics:
 *
 *   Entry point which routes a call to CMETHOD to the matching intrinsic emitter.
 * Methods from the System.Numerics and System.Numerics.Vectors assemblies go to
 * their own emitters; everything else is matched against the Mono.Simd assembly
 * and namespace and then dispatched on the class name.
 */
1984 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1986 const char *class_name;
1988 if (is_sys_numerics_assembly (cmethod->klass->image->assembly))
1989 return emit_sys_numerics_intrinsics (cfg, cmethod, fsig, args);
1991 if (is_sys_numerics_vectors_assembly (cmethod->klass->image->assembly))
1992 return emit_sys_numerics_vectors_intrinsics (cfg, cmethod, fsig, args);
/* strcmp is non-zero on mismatch: this tests for anything outside Mono.Simd. NOTE(review): presumably such methods are rejected here -- confirm. */
1994 if (strcmp ("Mono.Simd", cmethod->klass->image->assembly->aname.name) ||
1995 strcmp ("Mono.Simd", cmethod->klass->name_space))
1998 class_name = cmethod->klass->name;
1999 if (!strcmp ("SimdRuntime", class_name))
2000 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
2002 if (!strcmp ("ArrayExtensions", class_name))
2003 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
/* For VectorOperations the table is chosen by the class of the first parameter instead of the declaring class. */
2005 if (!strcmp ("VectorOperations", class_name)) {
2006 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
2008 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
2009 } else if (!cmethod->klass->simd_type)
/* Record on the compile unit that SIMD intrinsics are in use, then pick the per-type table by class name. */
2012 cfg->uses_simd_intrinsics = 1;
2013 if (!strcmp ("Vector2d", class_name))
2014 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsic));
2015 if (!strcmp ("Vector4f", class_name))
2016 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsic));
2017 if (!strcmp ("Vector2ul", class_name))
2018 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsic));
2019 if (!strcmp ("Vector2l", class_name))
2020 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsic));
2021 if (!strcmp ("Vector4ui", class_name))
2022 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsic));
2023 if (!strcmp ("Vector4i", class_name))
2024 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsic));
2025 if (!strcmp ("Vector8us", class_name))
2026 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsic));
2027 if (!strcmp ("Vector8s", class_name))
2028 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsic));
2029 if (!strcmp ("Vector16b", class_name))
2030 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsic));
2031 if (!strcmp ("Vector16sb", class_name))
2032 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsic));
/*
 * assert_handled:
 *
 *   Diagnostic helper used when METHOD was not turned into an intrinsic. It only
 * does work when cfg->verbose_level > 1: it fetches the custom attributes of
 * METHOD, scans them for an attribute class named JitIntrinsicAttribute and can
 * print a "SIMD intrinsic unhandled" line. The hard assert is commented out.
 * NOTE(review): presumably the message is printed only when the attribute was
 * found -- confirm.
 */
2038 assert_handled (MonoCompile *cfg, MonoMethod *method)
2040 MonoCustomAttrInfo *cattr;
2043 if (cfg->verbose_level > 1) {
2044 cattr = mono_custom_attrs_from_method_checked (method, &error);
2047 gboolean has_attr = FALSE;
2048 for (int i = 0; i < cattr->num_attrs; ++i)
2049 if (cattr->attrs [i].ctor && (!strcmp (cattr->attrs [i].ctor->klass->name, "JitIntrinsicAttribute")))
2052 printf ("SIMD intrinsic unhandled: %s\n", mono_method_get_name_full (method, TRUE, TRUE, MONO_TYPE_NAME_FORMAT_IL));
2054 //g_assert_not_reached ();
2056 mono_custom_attrs_free (cattr);
2061 // The entries should be ordered by name
2062 // System.Numerics.Vector2/Vector3/Vector4
/*
 * This table is looked up with mono_binary_search () using
 * simd_intrinsic_compare_by_name, which is why the entries must stay sorted.
 * NOTE(review): the initializers appear to be { name, opcode, required SIMD
 * version, emit mode, flags } -- confirm against the SimdIntrinsic definition.
 */
2063 static const SimdIntrinsic vector2_intrinsics[] = {
2064 { SN_ctor, OP_EXPAND_R4 },
2066 { SN_Dot, OP_DPPS },
2067 { SN_Equals, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
2068 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2069 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2070 { SN_SquareRoot, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
2071 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2072 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2073 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2074 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/*
 * emit_vector_intrinsics:
 *
 *   Emit intrinsics for methods of the System.Numerics Vector2/Vector3/Vector4
 * classes. CMETHOD is looked up by name in vector2_intrinsics; each case then
 * checks the shape of FSIG before emitting the corresponding SIMD IR.
 * NOTE(review): the signature checks compare MonoType pointers with ==, whereas
 * emit_vector_t_intrinsics uses mono_metadata_type_equal () -- confirm pointer
 * identity is sufficient here.
 */
2078 emit_vector_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2080 const SimdIntrinsic *intrins;
2081 MonoMethodSignature *sig = mono_method_signature (cmethod);
2082 MonoType *type = &cmethod->klass->byval_arg;
2085 * Vector2/3/4 are handled the same way, since the underlying SIMD type is the same (4 * r4).
2087 intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector2_intrinsics, sizeof (vector2_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2089 assert_handled (cfg, cmethod);
2093 if (cfg->verbose_level > 1) {
2094 char *name = mono_method_full_name (cmethod, TRUE);
2095 printf (" SIMD intrinsic %s\n", name);
2099 switch (intrins->name) {
/* Constructor: every parameter is tested for being an R4 before the ctor sequence is emitted. */
2101 gboolean match = TRUE;
2102 for (int i = 0; i < fsig->param_count; ++i)
2103 if (fsig->params [i]->type != MONO_TYPE_R4)
2107 return simd_intrinsic_emit_ctor (intrins, cfg, cmethod, args);
/* Equality: one parameter of the vector type, boolean result. */
2110 if (!(fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && fsig->params [0] == type))
2112 return simd_intrinsic_emit_equality (intrins, cfg, cmethod, args);
/* Unary operation: vector in, vector out. */
2114 if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
2116 return simd_intrinsic_emit_unary (intrins, cfg, cmethod, args);
/* Two vectors in, R4 out: emit the binary op and read the scalar from element 0. */
2118 if (!(fsig->param_count == 2 && fsig->ret->type == MONO_TYPE_R4 && fsig->params [0] == type && fsig->params [1] == type))
2120 if (COMPILE_LLVM (cfg)) {
2123 ins = simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
2124 /* The end result is in the lowest element */
2125 return simd_intrinsic_emit_getter_op (cfg, 0, cmethod->klass, mono_method_signature (cmethod)->ret, ins);
2129 // abs(x) = max(x, sub(0,x))
2133 if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
/* Materialize a zero vector, compute 0 - x with OP_SUBPS, then take OP_MAXPS of x and that. */
2136 MONO_INST_NEW (cfg, zero, OP_XZERO);
2137 zero->dreg = alloc_xreg (cfg);
2138 zero->klass = cmethod->klass;
2139 MONO_ADD_INS (cfg->cbb, zero);
2141 sub = simd_intrinsic_emit_binary_op (cfg, OP_SUBPS, 0, cmethod->klass, sig->params [0], sig->params [0], zero, args [0]);
2142 return simd_intrinsic_emit_binary_op (cfg, OP_MAXPS, 0, cmethod->klass, sig->params [0], sig->params [0], args [0], sub);
/* Arithmetic operators: each operand may be either the vector type or an R4. */
2146 case SN_op_Addition:
2147 case SN_op_Division:
2148 case SN_op_Multiply:
2149 case SN_op_Subtraction:
2150 if (!(fsig->param_count == 2 && fsig->ret == type && (fsig->params [0] == type || fsig->params [0]->type == MONO_TYPE_R4) && (fsig->params [1] == type || fsig->params [1]->type == MONO_TYPE_R4)))
2152 return simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
2157 assert_handled (cfg, cmethod);
2159 if (cfg->verbose_level > 1) {
2160 char *name = mono_method_full_name (cmethod, TRUE);
2161 printf (" SIMD method %s not handled.\n", name);
2167 /* These should be ordered by name */
/*
 * Name table for the generic vector class, searched by emit_vector_t_intrinsics ()
 * with mono_binary_search (). Entries that list only a name leave the remaining
 * fields zero initialized; the emitter dispatches on the name for those.
 */
2168 static const SimdIntrinsic vector_t_intrinsics[] = {
2174 { SN_GreaterThanOrEqual },
2176 { SN_LessThanOrEqual },
2179 { SN_get_AllOnes, OP_XONES },
2182 { SN_get_Zero, OP_XZERO },
2184 { SN_op_BitwiseAnd },
2185 { SN_op_BitwiseOr },
2187 { SN_op_ExclusiveOr },
2190 { SN_op_Subtraction }
/*
 * emit_vector_t_intrinsics:
 *
 *   Emit intrinsics for methods of the generic vector class. CMETHOD is looked up
 * by name in vector_t_intrinsics. The element type ETYPE is the first generic
 * argument of the class and the concrete SIMD opcodes are selected from it through
 * the type_to_*_op () helpers.
 */
2194 emit_vector_t_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2196 const SimdIntrinsic *intrins;
2197 MonoType *type, *etype;
2199 int size, len, index;
2201 intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector_t_intrinsics, sizeof (vector_t_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2203 assert_handled (cfg, cmethod);
2207 type = &cmethod->klass->byval_arg;
2208 etype = mono_class_get_context (cmethod->klass)->class_inst->type_argv [0];
2209 size = mono_class_value_size (mono_class_from_mono_type (etype), NULL);
/* Only primitive element types are considered. */
2213 if (!MONO_TYPE_IS_PRIMITIVE (etype))
2216 if (cfg->verbose_level > 1) {
2217 char *name = mono_method_full_name (cmethod, TRUE);
2218 printf (" SIMD intrinsic %s\n", name);
2222 switch (intrins->name) {
/* Parameterless I4 accessor: the element count LEN is emitted as a constant. NOTE(review): presumably LEN is 16 / size -- confirm. */
2224 if (!(fsig->param_count == 0 && fsig->ret->type == MONO_TYPE_I4))
2226 EMIT_NEW_ICONST (cfg, ins, len);
2228 case SN_get_AllOnes:
2230 if (!(fsig->param_count == 0 && mono_metadata_type_equal (fsig->ret, type)))
2232 return simd_intrinsic_emit_const (intrins, cfg, cmethod, args);
/* Element read: args [0] is the vector, args [1] must be an OP_ICONST index tested against [0, LEN). */
2234 g_assert (fsig->param_count == 1);
2235 if (args [1]->opcode != OP_ICONST)
2237 index = args [1]->inst_c0;
2238 if (index < 0 || index >= len)
2240 return simd_intrinsic_emit_getter_op (cfg, index, cmethod->klass, etype, args [0]);
/* Constructor taking a single element value. */
2242 if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
2243 return simd_intrinsic_emit_ctor (NULL, cfg, cmethod, args);
/* Constructor taking an array, with an optional start index (args [2]); args [0] is the address of the destination. */
2244 if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) {
2245 MonoInst *array_ins = args [1];
2246 MonoInst *index_ins;
2247 MonoInst *ldelema_ins;
2251 if (args [0]->opcode != OP_LDADDR)
2254 /* .ctor (T[]) or .ctor (T[], index) */
2256 if (fsig->param_count == 2) {
2257 index_ins = args [2];
2259 EMIT_NEW_ICONST (cfg, index_ins, 0);
2262 /* Emit index check for the end (index + len - 1 < array length) */
2263 end_index_reg = alloc_ireg (cfg);
2264 EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
2265 MONO_EMIT_BOUNDS_CHECK (cfg, array_ins->dreg, MonoArray, max_length, end_index_reg);
2267 /* Load the array slice into the simd reg */
2268 ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2269 g_assert (args [0]->opcode == OP_LDADDR);
/* The load targets the dreg of the variable whose address args [0] takes. */
2270 var = args [0]->inst_p0;
2271 EMIT_NEW_LOAD_MEMBASE (cfg, ins, OP_LOADX_MEMBASE, var->dreg, ldelema_ins->dreg, 0);
2272 ins->klass = cmethod->klass;
2276 case SN_op_Explicit:
2277 return simd_intrinsic_emit_cast (intrins, cfg, cmethod, args);
/* One parameter with a boolean result gives a whole vector equality test; two parameters with a vector result give an element-wise compare. */
2279 if (fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && mono_metadata_type_equal (fsig->params [0], type))
2280 return simd_intrinsic_emit_equality_op (cfg, cmethod, args, type_to_comp_op (etype), SIMD_COMP_EQ);
2281 if (fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type))
2282 return simd_intrinsic_emit_binary_op (cfg, type_to_comp_op (etype), 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
/*
 * Ordered comparisons are all built from the equality op and the greater-than op:
 * swapping the operands of greater-than gives less-than, and the *OrEqual forms
 * OR the equality mask with the strict comparison mask.
 */
2285 case SN_GreaterThan:
2286 case SN_GreaterThanOrEqual:
2288 case SN_LessThanOrEqual: {
2289 MonoInst *cmp1, *cmp2;
2292 switch (etype->type) {
2302 eq_op = type_to_comp_op (etype);
2303 gt_op = type_to_gt_op (etype);
2305 switch (intrins->name) {
2306 case SN_GreaterThan:
2307 return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2309 return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2310 case SN_LessThanOrEqual:
2311 cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2312 cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2313 return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2314 case SN_GreaterThanOrEqual:
2315 cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2316 cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2317 return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2319 g_assert_not_reached ();
/* For the element types listed in this switch the result is a plain OP_XMOVE copy of args [0] into a fresh xreg. NOTE(review): presumably an operation that is the identity on unsigned elements -- confirm. */
2325 switch (etype->type) {
2329 case MONO_TYPE_U8: {
2333 MONO_INST_NEW (cfg, ins, OP_XMOVE);
2334 ins->klass = cmethod->klass;
2335 ins->type = STACK_VTYPE;
2336 ins->sreg1 = args [0]->dreg;
2337 ins->dreg = alloc_xreg (cfg);
2338 MONO_ADD_INS (cfg->cbb, ins);
/* Element-wise binary operations: both parameters and the return value must have the same type; the opcode is picked from ETYPE. */
2345 case SN_op_Addition:
2346 case SN_op_Subtraction:
2347 case SN_op_Multiply:
2348 case SN_op_Division:
2349 case SN_op_ExclusiveOr:
2350 case SN_op_BitwiseAnd:
2351 case SN_op_BitwiseOr:
2354 if (!(fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, fsig->params [0]) && mono_metadata_type_equal (fsig->params [0], fsig->params [1])))
2357 switch (intrins->name) {
2358 case SN_op_Addition:
2359 op = type_to_padd_op (etype);
2361 case SN_op_Subtraction:
2362 op = type_to_psub_op (etype);
2364 case SN_op_Multiply:
2365 op = type_to_pmul_op (etype);
2367 case SN_op_Division:
2368 op = type_to_pdiv_op (etype);
2370 case SN_op_ExclusiveOr:
2371 op = type_to_pxor_op (etype);
2373 case SN_op_BitwiseAnd:
2374 op = type_to_pand_op (etype);
2376 case SN_op_BitwiseOr:
2377 op = type_to_por_op (etype);
2380 op = type_to_pmin_op (etype);
2383 op = type_to_pmax_op (etype);
2386 g_assert_not_reached ();
2389 return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
/* Copy of the vector into an array slice: args [0] is the address of the vector, args [1] the array, args [2] the start index. */
2393 MonoInst *array_ins = args [1];
2394 MonoInst *index_ins = args [2];
2395 MonoInst *ldelema_ins;
2399 if (args [0]->opcode != OP_LDADDR)
2402 /* Emit index check for the end (index + len - 1 < array length) */
2403 end_index_reg = alloc_ireg (cfg);
2404 EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
/* Unlike the constructor path this check is open coded: the array length is loaded as an I4 and an ArgumentException is raised when length <= end index (unsigned compare). */
2406 int length_reg = alloc_ireg (cfg);
2407 MONO_EMIT_NEW_LOAD_MEMBASE_OP_FAULT (cfg, OP_LOADI4_MEMBASE, length_reg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length));
2408 MONO_EMIT_NEW_BIALU (cfg, OP_COMPARE, -1, length_reg, end_index_reg);
2409 MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, "ArgumentException");
2411 /* Store the simd reg into the array slice */
2412 ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2413 g_assert (args [0]->opcode == OP_LDADDR);
2414 var = args [0]->inst_p0;
2415 EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, ldelema_ins->dreg, 0, var->dreg);
2416 ins->klass = cmethod->klass;
2424 assert_handled (cfg, cmethod);
2426 if (cfg->verbose_level > 1) {
2427 char *name = mono_method_full_name (cmethod, TRUE);
2428 printf (" SIMD method %s not handled.\n", name);
2436 * emit_sys_numerics_intrinsics:
2438 * Emit intrinsics for the System.Numerics assembly.
/*
 * Dispatch on the class name of CMETHOD: Vector2/Vector3/Vector4 go to
 * emit_vector_intrinsics (), the generic Vector`1 goes to
 * emit_vector_t_intrinsics (), and System.Numerics.Vector.get_IsHardwareAccelerated
 * is replaced by an I4 constant which is 1 when simd_supported_versions is non-zero
 * and 0 otherwise.
 */
2441 emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2443 const char *nspace = cmethod->klass->name_space;
2444 const char *class_name = cmethod->klass->name;
2446 if (!strcmp ("Vector2", class_name) || !strcmp ("Vector4", class_name) || !strcmp ("Vector3", class_name))
2447 return emit_vector_intrinsics (cfg, cmethod, fsig, args);
2449 if (!strcmp ("Vector`1", class_name))
2450 return emit_vector_t_intrinsics (cfg, cmethod, fsig, args);
2452 if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
2453 if (!strcmp (cmethod->name, "get_IsHardwareAccelerated")) {
2456 if (simd_supported_versions)
2457 EMIT_NEW_ICONST (cfg, ins, 1);
2459 EMIT_NEW_ICONST (cfg, ins, 0);
2460 ins->type = STACK_I4;
/*
 * emit_sys_numerics_vectors_intrinsics:
 *
 *   Emit intrinsics for the System.Numerics.Vectors assembly. Only the generic
 * Vector`1 class is matched here; it is forwarded to emit_vector_t_intrinsics ().
 */
2469 emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2471 const char *class_name = cmethod->klass->name;
2473 if (!strcmp (class_name, "Vector`1"))
2474 return emit_vector_t_intrinsics (cfg, cmethod, fsig, args);
/*
 * mono_emit_simd_field_load:
 *
 *   Turn a load of FIELD through ADDR into a SIMD element getter when FIELD
 * belongs to Vector2/Vector3/Vector4 in the System.Numerics assembly and is named
 * X, Y, Z or W. The field name selects the element index passed to
 * simd_intrinsic_emit_getter_op ().
 * NOTE(review): presumably X/Y/Z/W map to elements 0..3 and other fields are left
 * to the normal field load path -- confirm.
 */
2479 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2481 if (is_sys_numerics_assembly (field->parent->image->assembly)) {
2484 if (!strcmp (field->parent->name, "Vector2") ||
2485 !strcmp (field->parent->name, "Vector3") ||
2486 !strcmp (field->parent->name, "Vector4")) {
2487 if (!strcmp (field->name, "X"))
2489 else if (!strcmp (field->name, "Y"))
2491 else if (!strcmp (field->name, "Z"))
2493 else if (!strcmp (field->name, "W"))
2498 if (cfg->verbose_level > 1)
2499 printf (" SIMD intrinsic field access: %s\n", field->name);
2501 return simd_intrinsic_emit_getter_op (cfg, index, field->parent, mono_field_get_type (field), addr);
2507 #endif /* DISABLE_JIT */
/* NOTE(review): second definition with the same signature; it appears to be the fallback used when MONO_ARCH_SIMD_INTRINSICS is not available -- confirm. */
2512 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2517 #endif /* MONO_ARCH_SIMD_INTRINSICS */