3 * simd support for intrinsics
6 * Rodrigo Kumpera (rkumpera@novell.com)
8 * (C) 2008 Novell, Inc.
16 #include "mono/utils/bsearch.h"
17 #include <mono/metadata/abi-details.h>
18 #include <mono/metadata/reflection-internals.h>
21 General notes on SIMD intrinsics
23 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
24 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
25 TODO extend op_to_op_dest_membase to handle simd ops
26 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
28 TODO make sure locals, arguments and spills are properly aligned.
29 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
30 TODO add stuff to man pages
31 TODO document this under /docs
32 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
33 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
34 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
35 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
36 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
TODO passing simd args byval to a non-intrinsic method causes some useless local var loads/stores to happen.
38 TODO check if we need to init the SSE control word with better precision.
39 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
40 TODO make SimdRuntime.get_AccelMode work under AOT
TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
TODO extend the bounds checking code to support range checking.
44 General notes for SIMD intrinsics.
46 -Bad extractor and constructor performance
47 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
48 It will be loaded in the FP stack just to be pushed on the call stack.
A similar thing happens with the Vector4f constructor that requires float vars to be
52 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
53 trip to the FP stack is desirable.
55 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
59 -Promote OP_EXTRACT_I4 to a STORE op
60 The advantage of this change is that it could have a _membase version and promote further optimizations.
62 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
66 #if defined (MONO_ARCH_SIMD_INTRINSICS)
68 #if defined (DISABLE_JIT)
71 mono_simd_intrinsics_init (void)
77 //#define IS_DEBUG_ON(cfg) (0)
79 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
80 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
86 SIMD_EMIT_GETTER_QWORD,
92 SIMD_EMIT_LOAD_ALIGNED,
94 SIMD_EMIT_EXTRACT_MASK,
98 #ifdef HAVE_ARRAY_ELEM_INIT
99 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
100 #define MSGSTRFIELD1(line) str##line
101 static const struct msgstr_t {
102 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
103 #include "simd-methods.h"
106 #define SIMD_METHOD(str,name) str,
107 #include "simd-methods.h"
112 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
113 #include "simd-methods.h"
115 #define method_name(idx) ((const char*)&method_names + (idx))
118 #define SIMD_METHOD(str,name) str,
119 static const char * const method_names [] = {
120 #include "simd-methods.h"
124 #define SIMD_METHOD(str,name) name,
126 #include "simd-methods.h"
130 #define method_name(idx) (method_names [(idx)])
137 guint8 simd_version_flags;
138 guint8 simd_emit_mode : 4;
142 static const SimdIntrinsic vector4f_intrinsics[] = {
143 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
144 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
145 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
146 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
147 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
148 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
149 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
150 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
151 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
152 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
153 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
154 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
155 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
156 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
157 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
158 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
159 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
160 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
161 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
162 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
163 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
164 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
165 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
166 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
167 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
168 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
169 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
170 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
171 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
172 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
173 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
174 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
175 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
176 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
177 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
178 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
179 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
180 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
181 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
182 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
183 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
184 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
185 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
186 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
187 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
188 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
189 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
190 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
191 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
192 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
193 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
196 static const SimdIntrinsic vector2d_intrinsics[] = {
197 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
198 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
199 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
200 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
201 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
202 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
203 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
204 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
205 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
206 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
207 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
208 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
209 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
210 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
211 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
212 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
213 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
214 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
215 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
217 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
219 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
220 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
221 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
222 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
223 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
224 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
225 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
226 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
227 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
228 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
229 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
230 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
231 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
232 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
233 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
234 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
235 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
236 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
237 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
240 static const SimdIntrinsic vector2ul_intrinsics[] = {
241 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
242 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
243 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
244 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
245 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
246 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
247 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
248 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
249 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
250 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
251 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
252 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
253 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
254 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
255 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
256 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
257 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
258 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
259 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
260 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
261 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
262 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
263 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
264 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector2l (2 x long, signed).
 * Row layout: { method name id, opcode (or element index / prefetch mode), minimum SIMD version, emit strategy[, flags] }.
 * Keep entries sorted by method name — presumably looked up via bsearch (TODO confirm).
 */
static const SimdIntrinsic vector2l_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector4ui (4 x uint).
 * Row layout: { method name id, opcode (or element index / prefetch mode), minimum SIMD version, emit strategy[, comparison/flags] }.
 * Keep entries sorted by method name — presumably looked up via bsearch (TODO confirm).
 */
static const SimdIntrinsic vector4ui_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	/* _UN variants are the unsigned min/max instructions (SSE4.1) */
	{ SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector4i (4 x int, signed).
 * Row layout: { method name id, opcode (or element index / prefetch mode), minimum SIMD version, emit strategy[, comparison/flags] }.
 * Keep entries sorted by method name — presumably looked up via bsearch (TODO confirm).
 */
static const SimdIntrinsic vector4i_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	/* signed type: operator >> is arithmetic (PSARD); LogicalRightShift above is PSHRD */
	{ SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
374 static const SimdIntrinsic vector8us_intrinsics[] = {
375 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
376 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
377 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
378 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
379 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
380 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
381 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
382 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
383 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
384 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
385 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
386 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
387 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
388 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
389 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
390 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
391 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
392 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
393 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
395 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
396 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
397 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
398 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
399 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
400 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
401 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
402 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
403 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
404 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
405 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
406 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
407 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
408 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
409 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
410 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
411 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
412 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
413 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
414 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
415 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
416 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
417 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
418 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
419 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
420 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
421 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
422 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
425 static const SimdIntrinsic vector8s_intrinsics[] = {
426 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
427 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
428 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
429 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
430 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
431 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
432 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
436 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
437 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
438 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
439 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
440 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
441 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
442 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
443 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
444 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
446 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
447 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
448 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
449 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
450 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
451 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
452 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
453 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
454 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
455 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
456 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
457 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
458 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
459 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
460 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
461 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
462 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
463 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
464 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
465 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
466 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
467 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
468 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
469 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
470 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
471 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
472 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
473 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector16b (16 x byte, unsigned).
 * Row layout: { method name id, opcode (or element index / prefetch mode), minimum SIMD version, emit strategy[, comparison/flags] }.
 * Keep entries sorted by method name — presumably looked up via bsearch (TODO
 * confirm); note get_V10..get_V15 sort before get_V2 (string order, not numeric).
 */
static const SimdIntrinsic vector16b_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/* Intrinsics table for the Mono.Simd 16 x signed-byte vector type.
 * NOTE: entries must remain sorted by method name (".ctor" sorts first):
 * emit_intrinsics () looks methods up with mono_binary_search () using
 * simd_intrinsic_compare_by_name (). The second field is either a mini
 * opcode, a lane index (getters/setters) or unused, depending on the
 * SIMD_EMIT_* mode in the fourth field. */
540 static const SimdIntrinsic vector16sb_intrinsics[] = {
541 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
542 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
543 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
544 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
545 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
546 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
547 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
548 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
549 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
550 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
551 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
552 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
553 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
554 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
555 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
556 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
557 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
560 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
561 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
562 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
563 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
564 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
565 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
566 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
567 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
568 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
569 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
570 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
571 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
572 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
573 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
574 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
575 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
576 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
577 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
578 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
579 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
580 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
581 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
584 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
585 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
586 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
587 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
588 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
589 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
590 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
591 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
592 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
593 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
594 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
595 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
596 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
599 static guint32 simd_supported_versions;
601 static MonoInst* emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
602 static MonoInst* emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
604 /*TODO match using number of parameters as well*/
/* bsearch () comparator: plain method-name key vs. a SimdIntrinsic entry. */
606 simd_intrinsic_compare_by_name (const void *key, const void *value)
608 return strcmp (key, method_name (((SimdIntrinsic *)value)->name));
613 VREG_HAS_XZERO_BB0 = 0x02,
614 VREG_HAS_OTHER_OP_BB0 = 0x04,
615 VREG_SINGLE_BB_USE = 0x08,
616 VREG_MANY_BB_USE = 0x10,
/* One-time init: cache which SIMD instruction sets this CPU supports. */
620 mono_simd_intrinsics_init (void)
622 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
623 /*TODO log the supported flags*/
626 static inline gboolean
/* First-bb scan helper: any use of @reg other than its initial XZERO
 * downgrades it from "starts as xzero" (VREG_HAS_XZERO_BB0) to "has other
 * ops in bb0". The return statements fall outside this extracted view --
 * presumably TRUE when the flags were updated; verify against full source. */
627 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
629 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
630 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
631 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
632 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
638 static inline gboolean
/* Later-bb scan helper: for vregs that start as XZERO in bb0, track whether
 * they are used in exactly one following basic block (VREG_SINGLE_BB_USE,
 * with the bb recorded in @target_bb) or in several (VREG_MANY_BB_USE). */
639 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
641 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
644 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
/* second distinct bb seen: promote to the many-bb state */
645 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
646 vreg_flags [reg] |= VREG_MANY_BB_USE;
647 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
649 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
650 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
651 target_bb [reg] = bb;
652 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
659 This pass recalculates which vars need MONO_INST_INDIRECT.
661 We cannot do this for non SIMD vars since code like mono_get_vtable_var
662 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
665 mono_simd_simplify_indirection (MonoCompile *cfg)
668 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Pass 1: clear MONO_INST_INDIRECT on every SIMD var and find the max vreg. */
672 for (i = 0; i < cfg->num_varinfo; i++) {
673 MonoInst *var = cfg->varinfo [i];
674 if (var->klass->simd_type) {
675 var->flags &= ~MONO_INST_INDIRECT;
676 max_vreg = MAX (var->dreg, max_vreg);
/* Pass 2: re-add MONO_INST_INDIRECT for SIMD vars whose address is taken. */
680 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
681 if (!first_bb && bb->code)
683 for (ins = bb->code; ins; ins = ins->next) {
684 if (ins->opcode == OP_LDADDR) {
685 MonoInst *var = (MonoInst*)ins->inst_p0;
686 if (var->klass->simd_type) {
687 var->flags |= MONO_INST_INDIRECT;
693 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
694 vreg_flags = (char *)g_malloc0 (max_vreg + 1);
695 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Only non-indirect, non-volatile SIMD vars are candidates. */
697 for (i = 0; i < cfg->num_varinfo; i++) {
698 MonoInst *var = cfg->varinfo [i];
699 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
700 vreg_flags [var->dreg] = VREG_USED;
701 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
705 /*Scan the first basic block looking for xzeros that are not otherwise used*/
706 for (ins = first_bb->code; ins; ins = ins->next) {
708 int sregs [MONO_MAX_SRC_REGS];
710 if (ins->opcode == OP_XZERO) {
711 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
712 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
713 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
717 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
719 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
721 num_sregs = mono_inst_get_src_registers (ins, sregs);
722 for (i = 0; i < num_sregs; ++i) {
723 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
728 if (IS_DEBUG_ON (cfg)) {
729 for (i = 0; i < cfg->num_varinfo; i++) {
730 MonoInst *var = cfg->varinfo [i];
731 if (var->klass->simd_type) {
732 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
733 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
734 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
735 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
740 /*TODO stop here if no var is xzero only*/
743 Scan all other bb and check if it has only one other use
744 Ideally this would be done after an extended bb formation pass
746 FIXME This pass could use dominator information to properly
747 place the XZERO on the bb that dominates all uses of the var,
748 but this will have zero effect with the current local reg alloc
750 TODO simplify the use of flags.
753 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
754 for (ins = bb->code; ins; ins = ins->next) {
756 int sregs [MONO_MAX_SRC_REGS];
758 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
760 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
762 num_sregs = mono_inst_get_src_registers (ins, sregs);
763 for (i = 0; i < num_sregs; ++i) {
764 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
765 max_vreg, vreg_flags, target_bb))
/* For vars only used in a single later bb, materialize the XZERO right
 * before their first use there (unless that first use is a pure def). */
771 for (i = 0; i < cfg->num_varinfo; i++) {
772 MonoInst *var = cfg->varinfo [i];
773 if (!var->klass->simd_type)
775 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
776 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
777 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
778 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
780 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
782 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
784 int sregs [MONO_MAX_SRC_REGS];
785 gboolean found = FALSE;
787 num_sregs = mono_inst_get_src_registers (ins, sregs);
788 for (j = 0; j < num_sregs; ++j) {
789 if (sregs [j] == var->dreg)
792 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
793 if (ins->dreg == var->dreg && !found) {
794 DEBUG (printf ("[simd-simplify] INGORING R%d on BB %d because first op is a def", i, target_bb [var->dreg]->block_num););
797 DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i, target_bb [var->dreg]->block_num); );
799 MONO_INST_NEW (cfg, tmp, OP_XZERO);
800 tmp->dreg = var->dreg;
801 tmp->type = STACK_VTYPE;
802 tmp->klass = var->klass;
803 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Finally drop the now-redundant XZEROs from the first bb. */
809 for (ins = first_bb->code; ins; ins = ins->next) {
810 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE)) {
811 DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins->dreg); mono_print_ins(ins));
821 * This function expects that src is a value.
824 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
826 const char *spec = INS_INFO (src->opcode);
828 if (src->opcode == OP_XMOVE) {
830 } else if (spec [MONO_INST_DEST] == 'x') {
832 } else if (src->opcode == OP_VCALL) {
/* No usable simd source: dump the offending ins and abort -- JIT bug. */
836 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
837 mono_print_ins (src);
838 g_assert_not_reached ();
842 * This function will load the value if needed.
845 load_simd_vreg_class (MonoCompile *cfg, MonoClass *klass, MonoInst *src, gboolean *indirect)
847 const char *spec = INS_INFO (src->opcode);
851 if (src->opcode == OP_XMOVE) {
853 } else if (src->opcode == OP_LDADDR) {
854 int res = ((MonoInst*)src->inst_p0)->dreg;
857 } else if (spec [MONO_INST_DEST] == 'x') {
859 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* src is an address: emit an explicit vector load from it. */
864 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
866 ins->sreg1 = src->dreg;
867 ins->type = STACK_VTYPE;
868 ins->dreg = alloc_ireg (cfg);
869 MONO_ADD_INS (cfg->cbb, ins);
/* Anything else cannot be resolved to a simd vreg -- JIT bug. */
872 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
873 mono_print_ins (src);
874 g_assert_not_reached ();
/* Convenience wrapper: load a simd vreg using the method's own class. */
878 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
880 return load_simd_vreg_class (cfg, cmethod->klass, src, indirect);
883 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily create the double-sized spill slot used by r8 extract/insert ops. */
885 get_double_spill_area (MonoCompile *cfg)
887 if (!cfg->fconv_to_r8_x_var) {
888 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
889 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
891 return cfg->fconv_to_r8_x_var;
/* Lazily create the vector-sized spill slot used by the .ctor sequence. */
894 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
896 if (!cfg->simd_ctor_var) {
897 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
898 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
900 return cfg->simd_ctor_var;
/* The type_to_* helpers below map an element MonoType to the matching
 * packed SIMD opcode (expand, compare, add, sub, mul, div, xor, and, or,
 * min, max). Unsupported element types hit g_assert_not_reached ().
 * NOTE: most case labels fall outside this extracted view. */
904 mono_type_to_expand_op (MonoType *type)
906 switch (type->type) {
924 g_assert_not_reached ();
929 type_to_comp_op (MonoType *t)
949 g_assert_not_reached ();
955 type_to_gt_op (MonoType *t)
972 type_to_padd_op (MonoType *t)
998 type_to_psub_op (MonoType *t)
1024 type_to_pmul_op (MonoType *t)
1038 /* PMULQ multiplies two 32 bit numbers into a 64 bit one */
1049 type_to_pdiv_op (MonoType *t)
1063 type_to_pxor_op (MonoType *t)
1066 * These opcodes have the same semantics, but using the
1067 * correctly typed version is better for performance.
1080 type_to_pand_op (MonoType *t)
1093 type_to_por_op (MonoType *t)
1106 type_to_pmin_op (MonoType *t)
1131 type_to_pmax_op (MonoType *t)
/* Return a simd vreg for @src: either directly (when @param_type is itself
 * a simd type) or by broadcasting the scalar to all lanes via OP_EXPAND_*. */
1156 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoClass *klass, MonoType *param_type, MonoInst *src)
1161 if (mono_class_from_mono_type (param_type)->simd_type)
1162 return get_simd_vreg (cfg, NULL, src);
1164 expand_op = mono_type_to_expand_op (param_type);
1165 MONO_INST_NEW (cfg, ins, expand_op);
1167 ins->sreg1 = src->dreg;
1168 ins->type = STACK_VTYPE;
1169 ins->dreg = alloc_ireg (cfg);
1170 MONO_ADD_INS (cfg->cbb, ins);
/* float/double expands need a memory spill slot */
1172 if (expand_op == OP_EXPAND_R4)
1173 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1174 else if (expand_op == OP_EXPAND_R8)
1175 ins->backend.spill_var = get_double_spill_area (cfg);
1181 * simd_intrinsic_emit_binary_op:
1183 * Emit a binary SIMD opcode.
1184 * @LHS/@RHS are the two arguments, they can be either a SIMD type or a scalar one. Scalar arguments are
1185 * expanded to the SIMD type.
1188 simd_intrinsic_emit_binary_op (MonoCompile *cfg, int opcode, int flags, MonoClass *klass, MonoType *lhs_type, MonoType *rhs_type, MonoInst *lhs, MonoInst *rhs)
1191 int left_vreg, right_vreg;
1193 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, lhs_type, lhs);
1194 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, rhs_type, rhs);
1196 MONO_INST_NEW (cfg, ins, opcode);
1198 ins->sreg1 = left_vreg;
1199 ins->sreg2 = right_vreg;
1200 ins->type = STACK_VTYPE;
1201 ins->dreg = alloc_ireg (cfg);
/* auxiliary flags ride along in inst_c0 (e.g. SIMD_COMP_*) */
1202 ins->inst_c0 = flags;
1203 MONO_ADD_INS (cfg->cbb, ins);
/* Table-driven entry point for two-operand intrinsics. */
1208 simd_intrinsic_emit_binary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1210 MonoMethodSignature *sig = mono_method_signature (cmethod);
1212 g_assert (sig->param_count == 2);
1214 return simd_intrinsic_emit_binary_op (cfg, intrinsic->opcode, intrinsic->flags, cmethod->klass, sig->params [0], sig->params [1], args [0], args [1]);
/* Emit a one-operand SIMD op; the opcode comes from the intrinsic table. */
1218 simd_intrinsic_emit_unary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1223 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1225 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1226 ins->klass = cmethod->klass;
1228 ins->type = STACK_VTYPE;
1229 ins->dreg = alloc_ireg (cfg);
1230 MONO_ADD_INS (cfg->cbb, ins);
/* Map an element type to the OP_EXTRACT_* opcode used to read a lane
 * (case labels fall outside this extracted view). */
1235 mono_type_to_extract_op (MonoType *type)
1237 switch (type->type) {
1239 return OP_EXTRACT_I1;
1241 return OP_EXTRACT_U1;
1243 return OP_EXTRACT_I2;
1245 return OP_EXTRACT_U2;
1249 return OP_EXTRACT_I4;
1251 g_assert_not_reached ();
1255 /*Returns the amount to shift the element index to get the dword it belongs to*/
1257 mono_type_elements_shift_bits (MonoType *type)
1259 switch (type->type) {
1271 g_assert_not_reached ();
1275 static G_GNUC_UNUSED int
/* Lane-insert opcode for the LLVM path (direct insert, see emit_setter). */
1276 mono_type_to_insert_op (MonoType *type)
1278 switch (type->type) {
1281 return OP_INSERT_I1;
1284 return OP_INSERT_I2;
1287 return OP_INSERT_I4;
1290 return OP_INSERT_I8;
1292 return OP_INSERT_R4;
1294 return OP_INSERT_R8;
1296 g_assert_not_reached ();
/* Lane-insert opcode for the non-LLVM path; most sizes use the *_SLOW
 * variants, while 16 bit inserts can use the plain OP_INSERT_I2. */
1301 mono_type_to_slow_insert_op (MonoType *type)
1303 switch (type->type) {
1306 return OP_INSERTX_U1_SLOW;
1309 return OP_INSERT_I2;
1312 return OP_INSERTX_I4_SLOW;
1315 return OP_INSERTX_I8_SLOW;
1317 return OP_INSERTX_R4_SLOW;
1319 return OP_INSERTX_R8_SLOW;
1321 g_assert_not_reached ();
/* Emit a set_VN element setter. Three strategies:
 *  - LLVM: a direct lane insert (mono_type_to_insert_op);
 *  - 2/4/8 byte elements: the *_SLOW insert opcodes;
 *  - otherwise (byte-sized elements): extract the containing 16 bit word,
 *    patch the byte and insert the word back (EXTRACTX_U2 + INSERTX_U1_SLOW).
 * The trailing OP_STOREX_MEMBASE writes the value back through the source
 * address -- its guard condition is not visible in this extract; presumably
 * only taken when load_simd_vreg reported an indirect source. */
1326 simd_intrinsic_emit_setter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1329 MonoMethodSignature *sig = mono_method_signature (cmethod);
1334 size = mono_type_size (sig->params [0], &align);
1336 if (COMPILE_LLVM (cfg)) {
1337 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1338 ins->klass = cmethod->klass;
1339 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1340 ins->sreg2 = args [1]->dreg;
1341 ins->inst_c0 = intrinsic->opcode;
1342 MONO_ADD_INS (cfg->cbb, ins);
1343 } else if (size == 2 || size == 4 || size == 8) {
1344 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1345 ins->klass = cmethod->klass;
1346 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1347 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1348 ins->sreg2 = args [1]->dreg;
1349 ins->inst_c0 = intrinsic->opcode;
1350 if (sig->params [0]->type == MONO_TYPE_R4)
1351 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1352 else if (sig->params [0]->type == MONO_TYPE_R8)
1353 ins->backend.spill_var = get_double_spill_area (cfg);
1354 MONO_ADD_INS (cfg->cbb, ins);
/* byte path: pull out the 16 bit word that contains the target byte */
1358 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1359 ins->klass = cmethod->klass;
1360 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1361 ins->type = STACK_I4;
1362 ins->dreg = vreg = alloc_ireg (cfg);
1363 ins->inst_c0 = intrinsic->opcode / 2;
1364 MONO_ADD_INS (cfg->cbb, ins);
1366 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1367 ins->klass = cmethod->klass;
1369 ins->sreg2 = args [1]->dreg;
1371 ins->inst_c0 = intrinsic->opcode;
1372 MONO_ADD_INS (cfg->cbb, ins);
/* write the updated vector back through the original address */
1376 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1377 ins->klass = cmethod->klass;
1378 ins->dreg = args [0]->dreg;
1380 MONO_ADD_INS (cfg->cbb, ins);
1386 * simd_intrinsic_emit_getter_op:
1388 * Emit IR for loading an element of a SIMD value.
1390 * @klass is the simd type, @type is the element type.
1393 simd_intrinsic_emit_getter_op (MonoCompile *cfg, int index, MonoClass *klass, MonoType *type, MonoInst *arg)
1396 int vreg, shift_bits;
1398 vreg = load_simd_vreg_class (cfg, klass, arg, NULL);
/* 64 bit elements: one EXTRACT_R8/I8; doubles need a memory spill slot. */
1400 if (type->type == MONO_TYPE_I8 || type->type == MONO_TYPE_U8 || type->type == MONO_TYPE_R8) {
1402 gboolean is_r8 = type->type == MONO_TYPE_R8;
1404 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1407 ins->inst_c0 = index;
1409 ins->type = STACK_R8;
1410 ins->dreg = alloc_freg (cfg);
1411 ins->backend.spill_var = get_double_spill_area (cfg);
1413 ins->type = STACK_I8;
1414 ins->dreg = alloc_lreg (cfg);
1416 MONO_ADD_INS (cfg->cbb, ins);
/* Narrow elements on the non-LLVM path: shuffle the dword that holds the
 * element into lane 0 first, then extract within that dword. */
1420 shift_bits = mono_type_elements_shift_bits (type);
1422 if ((index >> shift_bits) && !cfg->compile_llvm) {
1423 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1426 ins->inst_c0 = index >> shift_bits;
1427 ins->type = STACK_VTYPE;
1428 ins->dreg = vreg = alloc_ireg (cfg);
1429 MONO_ADD_INS (cfg->cbb, ins);
1432 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (type));
1435 ins->type = STACK_I4;
1436 ins->dreg = vreg = alloc_ireg (cfg);
1437 if (cfg->compile_llvm)
1438 ins->inst_c0 = index;
1440 ins->inst_c0 = index & ((1 << shift_bits) - 1);
1441 MONO_ADD_INS (cfg->cbb, ins);
/* R4 results come back in an int reg; move the bits to an FP reg. */
1443 if (type->type == MONO_TYPE_R4) {
1444 MONO_INST_NEW (cfg, ins, cfg->r4fp ? OP_ICONV_TO_R4_RAW : OP_MOVE_I4_TO_F);
1445 ins->klass = mono_defaults.single_class;
1447 ins->type = cfg->r4_stack_type;
1448 ins->dreg = alloc_freg (cfg);
1449 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1450 MONO_ADD_INS (cfg->cbb, ins);
/* get_VN getter: for getters, intrinsic->opcode holds the lane index. */
1456 simd_intrinsic_emit_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1458 MonoMethodSignature *sig = mono_method_signature (cmethod);
1460 return simd_intrinsic_emit_getter_op (cfg, intrinsic->opcode, cmethod->klass, sig->ret, args [0]);
/* Getter for 64 bit elements (long/ulong/double): intrinsic->opcode is the
 * lane index, stored in inst_c0; doubles need the shared r8 spill slot. */
1464 simd_intrinsic_emit_long_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1468 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1470 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1472 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1473 ins->klass = cmethod->klass;
1475 ins->inst_c0 = intrinsic->opcode;
1477 ins->type = STACK_R8;
1478 ins->dreg = alloc_freg (cfg);
1479 ins->backend.spill_var = get_double_spill_area (cfg);
1481 ins->type = STACK_I8;
1482 ins->dreg = alloc_lreg (cfg);
1484 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a vector .ctor.
 * One argument: broadcast the scalar across all lanes (table opcode or
 * mono_type_to_expand_op). Several arguments: store each argument to
 * memory -- either directly into the target object or into a spill slot
 * when the target was a local taken with LDADDR -- zero-fill any missing
 * tail, then load the vector back with OP_LOADX_MEMBASE when needed.
 * The g_assert_not_reached () below guards an argument layout that cannot
 * fill the 16 byte vector (its condition is outside this extracted view). */
1492 simd_intrinsic_emit_ctor (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1494 MonoInst *ins = NULL;
1494 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1495 MonoMethodSignature *sig = mono_method_signature (cmethod);
1496 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1497 int arg_size = mono_type_size (sig->params [0], &i);
/* single-argument ctor: expand the scalar */
1500 if (sig->param_count == 1) {
1504 dreg = args [0]->inst_i0->dreg;
1505 NULLIFY_INS (args [0]);
1507 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1508 dreg = alloc_ireg (cfg);
1512 opcode = intrinsic->opcode;
1514 opcode = mono_type_to_expand_op (sig->params [0]);
1515 MONO_INST_NEW (cfg, ins, opcode);
1516 ins->klass = cmethod->klass;
1517 ins->sreg1 = args [1]->dreg;
1518 ins->type = STACK_VTYPE;
1521 MONO_ADD_INS (cfg->cbb, ins);
1522 if (sig->params [0]->type == MONO_TYPE_R4)
1523 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1524 else if (sig->params [0]->type == MONO_TYPE_R8)
1525 ins->backend.spill_var = get_double_spill_area (cfg);
/* indirect target: write the expanded value back through the address */
1528 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1529 ins->dreg = args [0]->dreg;
1531 MONO_ADD_INS (cfg->cbb, ins);
/* multi-argument ctor: build the value in memory */
1537 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1538 MONO_ADD_INS (cfg->cbb, ins);
1539 addr_reg = ins->dreg;
1541 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1542 addr_reg = args [0]->dreg;
/* store the ctor arguments, last to first */
1545 for (i = sig->param_count - 1; i >= 0; --i) {
1546 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1549 if (sig->param_count * arg_size < 16) {
1550 /* If there are not enough arguments, fill the rest with 0s */
1551 for (i = sig->param_count; i < 16 / arg_size; ++i) {
1554 MONO_EMIT_NEW_STORE_MEMBASE_IMM (cfg, OP_STOREI4_MEMBASE_IMM, addr_reg, i * arg_size, 0);
1557 g_assert_not_reached ();
1563 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1564 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1565 NULLIFY_INS (args [0]);
1567 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1568 ins->klass = cmethod->klass;
1569 ins->sreg1 = addr_reg;
1570 ins->type = STACK_VTYPE;
1572 MONO_ADD_INS (cfg->cbb, ins);
/* op_Explicit between vector types: a plain register-to-register XMOVE,
 * retyped to the target class (the inflated method's return type). */
1578 simd_intrinsic_emit_cast (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1584 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1586 if (cmethod->is_inflated)
1588 klass = mono_class_from_mono_type (mono_method_signature (cmethod)->ret);
1590 klass = cmethod->klass;
1592 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1594 ins->type = STACK_VTYPE;
1596 ins->dreg = alloc_ireg (cfg);
1597 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a packed shift. Constant shift counts use the immediate form of the
 * opcode; variable counts are first moved into an xmm register with
 * OP_ICONV_TO_X and the opcode is bumped to its shift-by-register variant. */
1602 simd_intrinsic_emit_shift (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1605 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1607 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1609 if (args [1]->opcode != OP_ICONST) {
1610 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1611 ins->klass = mono_defaults.int32_class;
1612 ins->sreg1 = args [1]->dreg;
1613 ins->type = STACK_I4;
1614 ins->dreg = vreg2 = alloc_ireg (cfg);
1615 MONO_ADD_INS (cfg->cbb, ins);
1617 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1620 MONO_INST_NEW (cfg, ins, opcode);
1621 ins->klass = cmethod->klass;
/* constant count: fold it into inst_imm and kill the now-dead ICONST */
1625 if (args [1]->opcode == OP_ICONST) {
1626 ins->inst_imm = args [1]->inst_c0;
1627 NULLIFY_INS (args [1]);
1630 ins->type = STACK_VTYPE;
1631 ins->dreg = alloc_ireg (cfg);
1632 MONO_ADD_INS (cfg->cbb, ins);
1636 static inline gboolean
/* TRUE for the packed integer compare opcodes. Relies on the OP_PCMPEQB..
 * OP_PCMPEQQ opcodes being declared contiguously in the opcode enum. */
1637 mono_op_is_packed_compare (int op)
1639 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/* Emit op_Equality / op_Inequality: packed compare the two vectors, collect
 * the per-byte result bits with OP_EXTRACT_MASK, then test the 16 bit mask:
 * 0xFFFF means every lane compared equal. FP compares need the inverted
 * OR-style test (see the comment below). @flags is SIMD_COMP_EQ/NEQ. */
1643 simd_intrinsic_emit_equality_op (MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args, int opcode, int flags)
1646 int left_vreg, right_vreg, tmp_vreg;
1648 left_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1649 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1651 MONO_INST_NEW (cfg, ins, opcode);
1652 ins->klass = cmethod->klass;
1653 ins->sreg1 = left_vreg;
1654 ins->sreg2 = right_vreg;
1655 ins->type = STACK_VTYPE;
1656 ins->klass = cmethod->klass;
1657 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1658 ins->inst_c0 = flags;
1659 MONO_ADD_INS (cfg->cbb, ins);
1661 /*FIXME the next ops are SSE specific*/
1662 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1663 ins->klass = cmethod->klass;
1664 ins->sreg1 = tmp_vreg;
1665 ins->type = STACK_I4;
1666 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1667 MONO_ADD_INS (cfg->cbb, ins);
1669 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1670 if (mono_op_is_packed_compare (opcode) || flags == SIMD_COMP_EQ) {
1671 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1672 NEW_UNALU (cfg, ins, flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1674 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1675 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1677 MONO_ADD_INS (cfg->cbb, ins);
/* Table-driven wrapper around simd_intrinsic_emit_equality_op. */
1682 simd_intrinsic_emit_equality (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1684 return simd_intrinsic_emit_equality_op (cfg, cmethod, args, intrinsic->opcode, intrinsic->flags);
/* Emit a shuffle. The shuffle-control argument (the last one) must be a
 * compile-time constant (OP_ICONST); non-literal shuffles are rejected.
 * The two-vector form of PSHUFLED is lowered to SHUFPS. */
1688 simd_intrinsic_emit_shuffle (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1691 int vreg, vreg2 = -1;
1692 int param_count = mono_method_signature (cmethod)->param_count;
1694 if (args [param_count - 1]->opcode != OP_ICONST) {
1695 /*TODO Shuffle with non literals is not yet supported */
1699 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1700 if (param_count == 3)
1701 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1703 NULLIFY_INS (args [param_count - 1]);
1706 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1707 ins->klass = cmethod->klass;
1710 ins->inst_c0 = args [param_count - 1]->inst_c0;
1711 ins->type = STACK_VTYPE;
1712 ins->dreg = alloc_ireg (cfg);
1713 MONO_ADD_INS (cfg->cbb, ins);
1715 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1716 ins->opcode = OP_SHUFPS;
/* Emit an aligned 16 byte vector load from the address in args [0]. */
1721 simd_intrinsic_emit_load_aligned (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1725 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1726 ins->klass = cmethod->klass;
1727 ins->sreg1 = args [0]->dreg;
1728 ins->type = STACK_VTYPE;
1729 ins->dreg = alloc_ireg (cfg);
1730 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a vector store of args [1] to the address in args [0]; the table
 * opcode picks aligned vs. unaligned (e.g. OP_STOREX_ALIGNED_MEMBASE_REG).
 * For membase stores, dreg carries the base address register. */
1735 simd_intrinsic_emit_store (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1740 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1742 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1743 ins->klass = cmethod->klass;
1744 ins->dreg = args [0]->dreg;
1746 ins->type = STACK_VTYPE;
1747 MONO_ADD_INS (cfg->cbb, ins);
/* Collect the per-lane mask bits (OP_EXTRACT_MASK) into an int register. */
1752 simd_intrinsic_emit_extract_mask (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1757 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1759 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1760 ins->klass = cmethod->klass;
1762 ins->type = STACK_I4;
1763 ins->dreg = alloc_ireg (cfg);
1764 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a prefetch hint for the address in args [0]; the temporal-locality
 * mode (SIMD_PREFETCH_MODE_*) travels in backend.arg_info. */
1770 simd_intrinsic_emit_prefetch (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1774 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1775 ins->klass = cmethod->klass;
1776 ins->sreg1 = args [0]->dreg;
1777 ins->backend.arg_info = intrinsic->flags;
1778 MONO_ADD_INS (cfg->cbb, ins);
/* Emit the table opcode as a nullary constant producer into a fresh xreg. */
1783 simd_intrinsic_emit_const (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1787 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1788 ins->klass = cmethod->klass;
1789 ins->type = STACK_VTYPE;
1790 ins->dreg = alloc_xreg (cfg);
1791 MONO_ADD_INS (cfg->cbb, ins);
/* Debug helper: human-readable name for a SIMD_VERSION_* flag (the string
 * returns fall outside this extracted view). */
1796 simd_version_name (guint32 version)
1799 case SIMD_VERSION_SSE1:
1801 case SIMD_VERSION_SSE2:
1803 case SIMD_VERSION_SSE3:
1805 case SIMD_VERSION_SSSE3:
1807 case SIMD_VERSION_SSE41:
1809 case SIMD_VERSION_SSE42:
1811 case SIMD_VERSION_SSE4a:
/* Common dispatcher: binary-search the (name-sorted) @intrinsics table for
 * cmethod, verify that one of the required SIMD instruction sets is
 * supported by the CPU, then dispatch on the entry's simd_emit_mode. */
1818 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsic *intrinsics, guint32 size)
1820 const SimdIntrinsic *result = (const SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
1822 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1825 if (IS_DEBUG_ON (cfg)) {
1827 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1828 max = fsig->param_count + fsig->hasthis;
1829 for (i = 0; i < max; ++i) {
1830 printf ("param %d: ", i);
1831 mono_print_ins (args [i]);
/* Bail out when none of the acceptable instruction sets is available. */
1834 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1835 if (IS_DEBUG_ON (cfg)) {
1837 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1838 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1839 if (result->simd_version_flags & (1 << x))
1840 printf ("%s ", simd_version_name (1 << x));
1847 switch (result->simd_emit_mode) {
1848 case SIMD_EMIT_BINARY:
1849 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1850 case SIMD_EMIT_UNARY:
1851 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1852 case SIMD_EMIT_SETTER:
1853 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1854 case SIMD_EMIT_GETTER:
1855 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1856 case SIMD_EMIT_GETTER_QWORD:
1857 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1858 case SIMD_EMIT_CTOR:
1859 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1860 case SIMD_EMIT_CAST:
1861 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1862 case SIMD_EMIT_SHUFFLE:
1863 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1864 case SIMD_EMIT_SHIFT:
1865 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1866 case SIMD_EMIT_EQUALITY:
1867 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1868 case SIMD_EMIT_LOAD_ALIGNED:
1869 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1870 case SIMD_EMIT_STORE:
1871 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1872 case SIMD_EMIT_EXTRACT_MASK:
1873 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1874 case SIMD_EMIT_PREFETCH:
1875 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1877 g_assert_not_reached ();
/* Compute the address of arr [index] for a vector load/store. When
 * @check_bounds is set, bounds-check both the first element and the last
 * element touched: a vector access spans 16 bytes, hence the extra check
 * against index + 16/size - 1. */
1881 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1885 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1887 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1888 mult_reg = alloc_preg (cfg);
1889 array_reg = arr->dreg;
1890 index_reg = index->dreg;
1892 #if SIZEOF_VOID_P == 8
1893 /* The array reg is 64 bits but the index reg is only 32 */
1894 index2_reg = alloc_preg (cfg);
1895 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1897 index2_reg = index_reg;
1899 index3_reg = alloc_preg (cfg);
1902 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1903 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1904 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
/* addr = array + index * size + offsetof (MonoArray, vector) */
1907 add_reg = alloc_preg (cfg);
1909 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1910 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1911 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
1912 ins->type = STACK_PTR;
1913 MONO_ADD_INS (cfg->cbb, ins);
/*
 * emit_array_extension_intrinsics:
 *
 *   Emit IR for the Mono.Simd ArrayExtensions methods: GetVector[Aligned],
 * SetVector[Aligned] and IsAligned.  Returns the emitted instruction, or falls
 * through (elided here) when the method is not recognized.
 */
1919 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
/* GetVector (T[], int) / GetVectorAligned (T[], int): bounds-checked 16-byte load */
1921 if ((!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) && fsig->param_count == 2) {
1923 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
/* the Aligned variant uses the aligned-load opcode, which on SSE faults on misaligned addresses */
1925 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1926 load->klass = cmethod->klass;
1928 load->type = STACK_VTYPE;
1929 load->dreg = alloc_ireg (cfg);
1930 MONO_ADD_INS (cfg->cbb, load);
/* SetVector (T[], int, V) / SetVectorAligned: bounds-checked 16-byte store */
1934 if ((!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) && fsig->param_count == 3) {
1936 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
/* note: the value is args [1], the index is args [2] */
1937 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1939 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1940 store->klass = cmethod->klass;
1942 store->sreg1 = vreg;
1943 MONO_ADD_INS (cfg->cbb, store);
/* IsAligned (T[], int): true iff the element address is 16-byte aligned */
1947 if (!strcmp ("IsAligned", cmethod->name) && fsig->param_count == 2) {
/* no bounds check needed: we only inspect the address, never dereference it */
1949 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
/* aligned <=> (addr & 15) == 0 */
1951 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1952 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1953 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1954 MONO_ADD_INS (cfg->cbb, ins);
/*
 * emit_simd_runtime_intrinsics:
 *
 *   Handle Mono.Simd.SimdRuntime methods.  get_AccelMode is folded into the
 * compile-time constant bitmask of SIMD instruction-set versions detected on
 * this CPU (see the AOT TODO in the file header: this constant bakes in the
 * JIT machine's capabilities).
 */
1962 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1964 if (!strcmp ("get_AccelMode", cmethod->name) && fsig->param_count == 0) {
1966 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
/* Return TRUE iff ASSEMBLY is the System.Numerics assembly (by simple name). */
1973 is_sys_numerics_assembly (MonoAssembly *assembly)
1975 return !strcmp ("System.Numerics", assembly->aname.name);
/* Return TRUE iff ASSEMBLY is the System.Numerics.Vectors assembly (by simple name). */
1979 is_sys_numerics_vectors_assembly (MonoAssembly *assembly)
1981 return !strcmp ("System.Numerics.Vectors", assembly->aname.name);
/*
 * mono_emit_simd_intrinsics:
 *
 *   Top-level SIMD intrinsic dispatcher called from the method-to-IR pass.
 * Routes CMETHOD to the handler for its owning assembly/class:
 * System.Numerics and System.Numerics.Vectors get their own emitters;
 * everything else must live in the Mono.Simd namespace of the Mono.Simd
 * assembly, and is dispatched on the vector class name to the matching
 * per-type intrinsic table.  Returns the emitted instruction or NULL
 * (the NULL returns are elided in this excerpt).
 */
1985 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1987 const char *class_name;
1989 if (is_sys_numerics_assembly (cmethod->klass->image->assembly))
1990 return emit_sys_numerics_intrinsics (cfg, cmethod, fsig, args);
1992 if (is_sys_numerics_vectors_assembly (cmethod->klass->image->assembly))
1993 return emit_sys_numerics_vectors_intrinsics (cfg, cmethod, fsig, args);
/* bail out (elided) unless both the assembly and the namespace are Mono.Simd */
1995 if (strcmp ("Mono.Simd", cmethod->klass->image->assembly->aname.name) ||
1996 strcmp ("Mono.Simd", cmethod->klass->name_space))
1999 class_name = cmethod->klass->name;
2000 if (!strcmp ("SimdRuntime", class_name))
2001 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
2003 if (!strcmp ("ArrayExtensions", class_name))
2004 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
/* VectorOperations holds static helpers; dispatch on the vector type of the first argument */
2006 if (!strcmp ("VectorOperations", class_name)) {
2007 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
2009 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
2010 } else if (!cmethod->klass->simd_type)
/* record that SIMD code was generated so later passes handle xregs/alignment */
2013 cfg->uses_simd_intrinsics = 1;
2014 if (!strcmp ("Vector2d", class_name))
2015 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsic));
2016 if (!strcmp ("Vector4f", class_name))
2017 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsic));
2018 if (!strcmp ("Vector2ul", class_name))
2019 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsic));
2020 if (!strcmp ("Vector2l", class_name))
2021 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsic));
2022 if (!strcmp ("Vector4ui", class_name))
2023 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsic));
2024 if (!strcmp ("Vector4i", class_name))
2025 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsic));
2026 if (!strcmp ("Vector8us", class_name))
2027 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsic));
2028 if (!strcmp ("Vector8s", class_name))
2029 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsic));
2030 if (!strcmp ("Vector16b", class_name))
2031 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsic));
2032 if (!strcmp ("Vector16sb", class_name))
2033 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsic));
/*
 * assert_handled:
 *
 *   Debugging aid for methods that reach the SIMD pass without being handled.
 * At verbose level > 1, if METHOD carries the JitIntrinsicAttribute custom
 * attribute, print its full name so unimplemented intrinsics are noticed.
 * The hard assert is intentionally disabled (see line 2055).
 */
2039 assert_handled (MonoCompile *cfg, MonoMethod *method)
2041 MonoCustomAttrInfo *cattr;
2044 if (cfg->verbose_level > 1) {
2045 cattr = mono_custom_attrs_from_method_checked (method, &error);
2048 gboolean has_attr = FALSE;
/* scan the method's custom attributes for JitIntrinsicAttribute */
2049 for (int i = 0; i < cattr->num_attrs; ++i)
2050 if (cattr->attrs [i].ctor && (!strcmp (cattr->attrs [i].ctor->klass->name, "JitIntrinsicAttribute")))
2053 printf ("SIMD intrinsic unhandled: %s\n", mono_method_get_name_full (method, TRUE, TRUE, MONO_TYPE_NAME_FORMAT_IL));
2055 //g_assert_not_reached ();
2057 mono_custom_attrs_free (cattr);
2062 // The entries should be ordered by name
2063 // System.Numerics.Vector2/Vector3/Vector4
// Shared intrinsic table: all three types lower to the same 4 x r4 SIMD type,
// so one table serves Vector2, Vector3 and Vector4 (see emit_vector_intrinsics).
// Ordering matters: the table is consumed via mono_binary_search by name.
2064 static const SimdIntrinsic vector2_intrinsics[] = {
2065 { SN_ctor, OP_EXPAND_R4 },
2067 { SN_Dot, OP_DPPS },
2068 { SN_Equals, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
2069 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2070 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2071 { SN_SquareRoot, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
2072 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2073 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2074 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
2075 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/*
 * emit_vector_intrinsics:
 *
 *   Emit IR for System.Numerics Vector2/Vector3/Vector4 methods.  Looks the
 * method up by name in vector2_intrinsics, validates the signature for each
 * supported operation, and emits the corresponding SSE opcode.  Returns the
 * emitted instruction or NULL for unhandled methods (several early-out
 * returns are elided in this excerpt).
 */
2079 emit_vector_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2081 const SimdIntrinsic *intrins;
2082 MonoMethodSignature *sig = mono_method_signature (cmethod);
2083 MonoType *type = &cmethod->klass->byval_arg;
2086 * Vector2/3/4 are handled the same way, since the underlying SIMD type is the same (4 * r4).
/* binary search by method name; the table must stay name-sorted */
2088 intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector2_intrinsics, sizeof (vector2_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2090 assert_handled (cfg, cmethod);
2094 if (cfg->verbose_level > 1) {
2095 char *name = mono_method_full_name (cmethod, TRUE);
2096 printf (" SIMD intrinsic %s\n", name);
2100 switch (intrins->name) {
/* .ctor: only handled when every parameter is r4 */
2102 gboolean match = TRUE;
2103 for (int i = 0; i < fsig->param_count; ++i)
2104 if (fsig->params [i]->type != MONO_TYPE_R4)
2108 return simd_intrinsic_emit_ctor (intrins, cfg, cmethod, args);
/* Equals (V) -> bool */
2111 if (!(fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && fsig->params [0] == type))
2113 return simd_intrinsic_emit_equality (intrins, cfg, cmethod, args);
/* unary op: V -> V (e.g. SquareRoot) */
2115 if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
2117 return simd_intrinsic_emit_unary (intrins, cfg, cmethod, args);
/* Dot (V, V) -> r4; the non-LLVM path is elided in this excerpt */
2119 if (!(fsig->param_count == 2 && fsig->ret->type == MONO_TYPE_R4 && fsig->params [0] == type && fsig->params [1] == type))
2121 if (COMPILE_LLVM (cfg)) {
2124 ins = simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
2125 /* The end result is in the lowest element */
2126 return simd_intrinsic_emit_getter_op (cfg, 0, cmethod->klass, mono_method_signature (cmethod)->ret, ins);
2130 // abs(x) = max(x, sub(0,x))
2134 if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
/* materialize a zero vector for the 0 - x term */
2137 MONO_INST_NEW (cfg, zero, OP_XZERO);
2138 zero->dreg = alloc_xreg (cfg);
2139 zero->klass = cmethod->klass;
2140 MONO_ADD_INS (cfg->cbb, zero);
2142 sub = simd_intrinsic_emit_binary_op (cfg, OP_SUBPS, 0, cmethod->klass, sig->params [0], sig->params [0], zero, args [0]);
2143 return simd_intrinsic_emit_binary_op (cfg, OP_MAXPS, 0, cmethod->klass, sig->params [0], sig->params [0], args [0], sub);
2147 case SN_op_Addition:
2148 case SN_op_Division:
2149 case SN_op_Multiply:
2150 case SN_op_Subtraction:
/* either operand may also be a scalar r4 (Vector op float overloads) */
2151 if (!(fsig->param_count == 2 && fsig->ret == type && (fsig->params [0] == type || fsig->params [0]->type == MONO_TYPE_R4) && (fsig->params [1] == type || fsig->params [1]->type == MONO_TYPE_R4)))
2153 return simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
/* fall-through diagnostics for recognized-but-unhandled signatures */
2158 assert_handled (cfg, cmethod);
2160 if (cfg->verbose_level > 1) {
2161 char *name = mono_method_full_name (cmethod, TRUE);
2162 printf (" SIMD method %s not handled.\n", name);
/*
 * emit_vector_is_hardware_accelerated_intrinsic:
 *
 *   Fold Vector.get_IsHardwareAccelerated into a compile-time boolean
 * constant: 1 when any SIMD instruction-set version was detected, else 0.
 */
2169 emit_vector_is_hardware_accelerated_intrinsic (MonoCompile *cfg)
2173 if (simd_supported_versions)
2174 EMIT_NEW_ICONST (cfg, ins, 1);
2176 EMIT_NEW_ICONST (cfg, ins, 0);
2177 ins->type = STACK_I4;
2181 /* These should be ordered by name */
// Intrinsic table for the generic System.Numerics.Vector<T>; consumed via
// mono_binary_search in emit_vector_t_intrinsics, so ordering by name is
// required.  Entries without an opcode are dispatched on etype at emit time.
2182 static const SimdIntrinsic vector_t_intrinsics[] = {
2188 { SN_GreaterThanOrEqual },
2190 { SN_LessThanOrEqual },
2193 { SN_get_AllOnes, OP_XONES },
2196 { SN_get_Zero, OP_XZERO },
2198 { SN_op_BitwiseAnd },
2199 { SN_op_BitwiseOr },
2201 { SN_op_ExclusiveOr },
2204 { SN_op_Subtraction }
/*
 * emit_vector_t_intrinsics:
 *
 *   Emit IR for the generic System.Numerics.Vector<T> methods.  The element
 * type T (etype) is read from the class's generic context, and most
 * operations are dispatched through type_to_*_op helpers that pick the SSE
 * opcode matching the element type.  Only primitive element types are
 * supported.  Returns the emitted instruction, or NULL for unhandled cases
 * (several early-out returns are elided in this excerpt).
 */
2208 emit_vector_t_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2210 const SimdIntrinsic *intrins;
2211 MonoType *type, *etype;
2213 int size, len, index;
2215 intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector_t_intrinsics, sizeof (vector_t_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2217 assert_handled (cfg, cmethod);
/* T is the first (only) type argument of Vector<T> */
2221 type = &cmethod->klass->byval_arg;
2222 etype = mono_class_get_context (cmethod->klass)->class_inst->type_argv [0];
2223 size = mono_class_value_size (mono_class_from_mono_type (etype), NULL);
/* only primitive element types map onto SSE lanes */
2227 if (!MONO_TYPE_IS_PRIMITIVE (etype))
2230 if (cfg->verbose_level > 1) {
2231 char *name = mono_method_full_name (cmethod, TRUE);
2232 printf (" SIMD intrinsic %s\n", name);
2236 switch (intrins->name) {
/* get_Count: fold the lane count to a constant */
2238 if (!(fsig->param_count == 0 && fsig->ret->type == MONO_TYPE_I4))
2240 EMIT_NEW_ICONST (cfg, ins, len);
2242 case SN_get_AllOnes:
2244 if (!(fsig->param_count == 0 && mono_metadata_type_equal (fsig->ret, type)))
2246 return simd_intrinsic_emit_const (intrins, cfg, cmethod, args);
/* get_Item: only constant in-range indices are intrinsified */
2248 g_assert (fsig->param_count == 1);
2249 if (args [1]->opcode != OP_ICONST)
2251 index = args [1]->inst_c0;
2252 if (index < 0 || index >= len)
2254 return simd_intrinsic_emit_getter_op (cfg, index, cmethod->klass, etype, args [0]);
/* .ctor (T): broadcast a scalar to all lanes */
2256 if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
2257 return simd_intrinsic_emit_ctor (NULL, cfg, cmethod, args);
2258 if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) {
2259 MonoInst *array_ins = args [1];
2260 MonoInst *index_ins;
2261 MonoInst *ldelema_ins;
/* the destination vector must live in a local we can address */
2265 if (args [0]->opcode != OP_LDADDR)
2268 /* .ctor (T[]) or .ctor (T[], index) */
2270 if (fsig->param_count == 2) {
2271 index_ins = args [2];
2273 EMIT_NEW_ICONST (cfg, index_ins, 0);
2276 /* Emit index check for the end (index + len - 1 < array length) */
2277 end_index_reg = alloc_ireg (cfg);
2278 EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
2279 MONO_EMIT_BOUNDS_CHECK (cfg, array_ins->dreg, MonoArray, max_length, end_index_reg);
2281 /* Load the array slice into the simd reg */
2282 ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2283 g_assert (args [0]->opcode == OP_LDADDR);
2284 var = args [0]->inst_p0;
2285 EMIT_NEW_LOAD_MEMBASE (cfg, ins, OP_LOADX_MEMBASE, var->dreg, ldelema_ins->dreg, 0);
2286 ins->klass = cmethod->klass;
2290 case SN_op_Explicit:
/* reinterpret cast between Vector<T> instantiations */
2291 return simd_intrinsic_emit_cast (intrins, cfg, cmethod, args);
2293 if (fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && mono_metadata_type_equal (fsig->params [0], type))
2294 return simd_intrinsic_emit_equality_op (cfg, cmethod, args, type_to_comp_op (etype), SIMD_COMP_EQ);
2295 if (fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type))
2296 return simd_intrinsic_emit_binary_op (cfg, type_to_comp_op (etype), 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2299 case SN_GreaterThan:
2300 case SN_GreaterThanOrEqual:
2302 case SN_LessThanOrEqual: {
2303 MonoInst *cmp1, *cmp2;
/* element-type filter (cases elided in this excerpt) */
2306 switch (etype->type) {
2316 eq_op = type_to_comp_op (etype);
2317 gt_op = type_to_gt_op (etype);
2319 switch (intrins->name) {
2320 case SN_GreaterThan:
2321 return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
/* LessThan (a, b) == GreaterThan (b, a): swap the operands */
2323 return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2324 case SN_LessThanOrEqual:
/* a <= b  ==  (b == a) | (b > a); SSE lacks direct integer <=, so compose it */
2325 cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2326 cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2327 return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2328 case SN_GreaterThanOrEqual:
/* a >= b  ==  (a == b) | (a > b) */
2329 cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2330 cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2331 return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2333 g_assert_not_reached ();
/* element-type dispatch (surrounding cases elided in this excerpt) */
2339 switch (etype->type) {
2343 case MONO_TYPE_U8: {
/* NOTE(review): the operation being emitted here is elided; the visible
 * tail just copies the xreg result (lines 2344-2346 are missing). */
2347 MONO_INST_NEW (cfg, ins, OP_XMOVE);
2348 ins->klass = cmethod->klass;
2349 ins->type = STACK_VTYPE;
2350 ins->sreg1 = args [0]->dreg;
2351 ins->dreg = alloc_xreg (cfg);
2352 MONO_ADD_INS (cfg->cbb, ins);
2359 case SN_op_Addition:
2360 case SN_op_Subtraction:
2361 case SN_op_Multiply:
2362 case SN_op_Division:
2363 case SN_op_ExclusiveOr:
2364 case SN_op_BitwiseAnd:
2365 case SN_op_BitwiseOr:
/* all element-wise binary ops: V op V -> V, picked per element type below */
2368 if (!(fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, fsig->params [0]) && mono_metadata_type_equal (fsig->params [0], fsig->params [1])))
2371 switch (intrins->name) {
2372 case SN_op_Addition:
2373 op = type_to_padd_op (etype);
2375 case SN_op_Subtraction:
2376 op = type_to_psub_op (etype);
2378 case SN_op_Multiply:
2379 op = type_to_pmul_op (etype);
2381 case SN_op_Division:
2382 op = type_to_pdiv_op (etype);
2384 case SN_op_ExclusiveOr:
2385 op = type_to_pxor_op (etype);
2387 case SN_op_BitwiseAnd:
2388 op = type_to_pand_op (etype);
2390 case SN_op_BitwiseOr:
2391 op = type_to_por_op (etype);
2394 op = type_to_pmin_op (etype);
2397 op = type_to_pmax_op (etype);
2400 g_assert_not_reached ();
2403 return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
/* CopyTo (T[], int): store the vector into an array slice */
2407 MonoInst *array_ins = args [1];
2408 MonoInst *index_ins = args [2];
2409 MonoInst *ldelema_ins;
/* the source vector must live in a local we can address */
2413 if (args [0]->opcode != OP_LDADDR)
2416 /* Emit index check for the end (index + len - 1 < array length) */
2417 end_index_reg = alloc_ireg (cfg);
2418 EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
/* manual length check throwing ArgumentException (not IndexOutOfRange),
 * matching the managed CopyTo contract */
2420 int length_reg = alloc_ireg (cfg);
2421 MONO_EMIT_NEW_LOAD_MEMBASE_OP_FAULT (cfg, OP_LOADI4_MEMBASE, length_reg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length));
2422 MONO_EMIT_NEW_BIALU (cfg, OP_COMPARE, -1, length_reg, end_index_reg);
2423 MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, "ArgumentException");
2425 /* Load the simd reg into the array slice */
2426 ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2427 g_assert (args [0]->opcode == OP_LDADDR);
2428 var = args [0]->inst_p0;
2429 EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, ldelema_ins->dreg, 0, var->dreg);
2430 ins->klass = cmethod->klass;
/* fall-through diagnostics for recognized-but-unhandled signatures */
2438 assert_handled (cfg, cmethod);
2440 if (cfg->verbose_level > 1) {
2441 char *name = mono_method_full_name (cmethod, TRUE);
2442 printf (" SIMD method %s not handled.\n", name);
2450 * emit_sys_numerics_intrinsics:
2452 * Emit intrinsics for the System.Numerics assembly.
/* Dispatch: Vector2/3/4 go to the shared 4 x r4 emitter; Vector's
 * get_IsHardwareAccelerated folds to a constant.  Everything else is
 * left to the normal (non-intrinsic) path. */
2455 emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2457 const char *nspace = cmethod->klass->name_space;
2458 const char *class_name = cmethod->klass->name;
2460 if (!strcmp ("Vector2", class_name) || !strcmp ("Vector4", class_name) || !strcmp ("Vector3", class_name))
2461 return emit_vector_intrinsics (cfg, cmethod, fsig, args);
2463 if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
2464 if (!strcmp (cmethod->name, "get_IsHardwareAccelerated"))
2465 return emit_vector_is_hardware_accelerated_intrinsic (cfg);
/*
 * emit_sys_numerics_vectors_intrinsics:
 *
 *   Emit intrinsics for the System.Numerics.Vectors assembly: the generic
 * Vector`1 type goes to emit_vector_t_intrinsics; the static Vector class's
 * get_IsHardwareAccelerated folds to a constant.
 */
2472 emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2474 const char *nspace = cmethod->klass->name_space;
2475 const char *class_name = cmethod->klass->name;
/* "Vector`1" is the metadata name of the generic Vector<T> */
2477 if (!strcmp (class_name, "Vector`1"))
2478 return emit_vector_t_intrinsics (cfg, cmethod, fsig, args);
2480 if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
2481 if (!strcmp (cmethod->name, "get_IsHardwareAccelerated"))
2482 return emit_vector_is_hardware_accelerated_intrinsic (cfg);
/*
 * mono_emit_simd_field_load:
 *
 *   Intrinsify loads of the X/Y/Z/W fields of System.Numerics
 * Vector2/Vector3/Vector4: instead of a memory load through ADDR, extract
 * the corresponding lane (index 0-3) directly from the SIMD register.
 * Returns the emitted instruction, or (elided here) falls through for
 * non-matching fields.
 */
2489 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2491 if (is_sys_numerics_assembly (field->parent->image->assembly)) {
2494 if (!strcmp (field->parent->name, "Vector2") ||
2495 !strcmp (field->parent->name, "Vector3") ||
2496 !strcmp (field->parent->name, "Vector4")) {
/* map field name to lane index; the index assignments (0..3) are elided in this excerpt */
2497 if (!strcmp (field->name, "X"))
2499 else if (!strcmp (field->name, "Y"))
2501 else if (!strcmp (field->name, "Z"))
2503 else if (!strcmp (field->name, "W"))
2508 if (cfg->verbose_level > 1)
2509 printf (" SIMD intrinsic field access: %s\n", field->name);
2511 return simd_intrinsic_emit_getter_op (cfg, index, field->parent, mono_field_get_type (field), addr);
2517 #endif /* DISABLE_JIT */
2522 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2527 #endif /* MONO_ARCH_SIMD_INTRINSICS */