2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
15 #include "mono/utils/bsearch.h"
16 #include <mono/metadata/abi-details.h>
19 General notes on SIMD intrinsics
21 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
22 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
23 TODO extend op_to_op_dest_membase to handle simd ops
24 TODO add support for indexed versions of simd ops
25 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
26 TODO make sure locals, arguments and spills are properly aligned.
27 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
28 TODO add stuff to man pages
29 TODO document this under /docs
30 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
31 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
32 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
33 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
34 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
35 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
36 TODO check if we need to init the SSE control word with better precision.
37 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
38 TODO make SimdRuntime.get_AccelMode work under AOT
39 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
40 TODO extend bounds checking code to support range checking.
42 General notes for SIMD intrinsics.
44 -Bad extractor and constructor performance
45 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
46 It will be loaded in the FP stack just to be pushed on the call stack.
48 A similar thing happens with Vector4f constructor that require float vars to be
50 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
51 trip to the FP stack is desirable.
53 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
57 -Promote OP_EXTRACT_I4 to a STORE op
58 The advantage of this change is that it could have a _membase version and promote further optimizations.
60 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
64 #if defined (MONO_ARCH_SIMD_INTRINSICS)
66 #if defined (DISABLE_JIT)
69 mono_simd_intrinsics_init (void)
75 //#define IS_DEBUG_ON(cfg) (0)
77 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3) /* True when the given cfg's verbose_level is 3 or higher. */
78 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0) /* Runs statement `a` only when IS_DEBUG_ON holds; relies on a variable named cfg being in scope at the expansion site. */
84 SIMD_EMIT_GETTER_QWORD,
90 SIMD_EMIT_LOAD_ALIGNED,
92 SIMD_EMIT_EXTRACT_MASK,
96 #ifdef HAVE_ARRAY_ELEM_INIT
97 #define MSGSTRFIELD(line) MSGSTRFIELD1(line) /* Extra expansion level so that an argument such as __LINE__ is macro-expanded before it is pasted. */
98 #define MSGSTRFIELD1(line) str##line /* Forms an identifier by pasting the argument onto the prefix "str". */
99 static const struct msgstr_t {
100 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
101 #include "simd-methods.h"
104 #define SIMD_METHOD(str,name) str,
105 #include "simd-methods.h"
110 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
111 #include "simd-methods.h"
113 #define method_name(idx) ((const char*)&method_names + (idx))
116 #define SIMD_METHOD(str,name) str,
117 static const char * const method_names [] = {
118 #include "simd-methods.h"
122 #define SIMD_METHOD(str,name) name,
124 #include "simd-methods.h"
128 #define method_name(idx) (method_names [(idx)])
135 guint8 simd_version_flags;
136 guint8 simd_emit_mode : 4;
140 static const SimdIntrinsc vector4f_intrinsics[] = { /* Intrinsic descriptors for the vector4f type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_COMP_* comparison kind or SIMD_PREFETCH_MODE_*). NOTE(review): keep the existing entry order; a sorted lookup is presumably applied (mono/utils/bsearch.h is included) - confirm before reordering. */
141 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
142 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
143 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
144 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
145 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
146 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
147 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
148 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
149 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
150 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
151 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
152 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
153 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
154 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
155 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
156 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
157 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
158 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
159 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
160 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
161 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
162 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
163 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
164 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
165 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
166 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
167 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
168 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
169 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
170 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
171 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
172 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
173 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
174 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
175 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
176 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
177 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
178 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
179 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
180 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
181 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
182 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
183 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
184 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
185 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
186 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
187 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
188 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
189 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
190 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
191 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
194 static const SimdIntrinsc vector2d_intrinsics[] = { /* Intrinsic descriptors for the vector2d type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER_QWORD/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_COMP_* or SIMD_PREFETCH_MODE_*). NOTE(review): keep the existing entry order; a sorted lookup is presumably applied - confirm before reordering. */
195 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
196 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
197 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
198 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
199 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
200 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
201 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
202 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
203 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
204 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
205 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
206 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
207 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
208 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
209 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
210 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
211 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
212 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
213 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
214 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
215 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
217 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
218 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
219 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
220 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
221 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
222 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
223 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
224 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
225 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
226 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
227 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
228 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
229 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
230 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
231 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
232 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
233 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
234 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
235 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
238 static const SimdIntrinsc vector2ul_intrinsics[] = { /* Intrinsic descriptors for the vector2ul type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER_QWORD/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_PREFETCH_MODE_*). The version requirement must come before the emit mode in every entry. NOTE(review): keep the existing entry order; a sorted lookup is presumably applied - confirm before reordering. */
239 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
240 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
241 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
242 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
243 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
244 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
245 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
246 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
247 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
248 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
249 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
250 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
251 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
252 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
253 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
254 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
255 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
256 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
257 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
258 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
259 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
260 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
261 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
262 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
265 static const SimdIntrinsc vector2l_intrinsics[] = { /* Intrinsic descriptors for the vector2l type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER_QWORD/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_PREFETCH_MODE_*). NOTE(review): keep the existing entry order; a sorted lookup is presumably applied - confirm before reordering. */
266 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
267 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
268 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
269 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
270 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
271 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
272 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
273 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
274 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
275 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
276 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
277 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
278 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
279 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
280 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
281 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
282 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
283 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
284 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
285 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
286 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
287 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
288 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
289 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
290 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
293 static const SimdIntrinsc vector4ui_intrinsics[] = { /* Intrinsic descriptors for the vector4ui type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_COMP_* or SIMD_PREFETCH_MODE_*). NOTE(review): keep the existing entry order; a sorted lookup is presumably applied - confirm before reordering. */
294 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
295 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
296 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
297 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
298 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
299 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
300 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
301 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
302 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
303 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
304 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
305 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
306 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
307 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
308 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
309 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
310 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
311 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
312 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
313 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
314 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
315 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
316 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
317 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
318 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
319 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
320 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
321 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
322 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
323 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
324 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
325 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
326 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
327 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
328 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
331 static const SimdIntrinsc vector4i_intrinsics[] = { /* Intrinsic descriptors for the vector4i type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_COMP_* or SIMD_PREFETCH_MODE_*). NOTE(review): keep the existing entry order; a sorted lookup is presumably applied - confirm before reordering. */
332 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
333 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
334 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
335 { SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
336 { SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
337 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
338 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
339 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
340 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
341 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
342 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
343 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
344 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
345 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
346 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
347 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
348 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
349 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
350 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
351 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
352 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
353 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
354 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
355 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
356 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
357 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
358 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
359 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
360 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
361 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
362 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
363 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
364 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
365 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
366 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
367 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
368 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
369 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
372 static const SimdIntrinsc vector8us_intrinsics[] = { /* Intrinsic descriptors for the vector8us type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_COMP_* or SIMD_PREFETCH_MODE_*). The trailing flag slot is not a version field, so SIMD_VERSION_* constants do not belong there. NOTE(review): keep the existing entry order; a sorted lookup is presumably applied - confirm before reordering. */
373 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
374 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
375 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
376 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
377 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
378 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
379 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
380 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
381 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
382 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
383 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
384 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
385 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
386 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
387 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
388 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
389 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
390 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
391 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
392 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
393 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
395 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
396 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
397 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
398 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
399 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
400 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
401 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
402 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
403 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
404 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
405 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
406 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
407 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
408 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
409 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
410 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
411 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
412 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
413 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
414 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
415 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
416 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
417 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
418 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
419 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
420 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
423 static const SimdIntrinsc vector8s_intrinsics[] = { /* Intrinsic descriptors for the vector8s type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_COMP_* or SIMD_PREFETCH_MODE_*). The saturating add and subtract entries use the opcodes without the _UN suffix, unlike the vector8us table. NOTE(review): keep the existing entry order; a sorted lookup is presumably applied - confirm before reordering. */
424 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
425 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
426 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
427 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
428 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
429 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
430 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
431 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
432 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
436 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
437 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
438 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
439 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
440 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
441 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
442 { SN_SubtractWithSaturation, OP_PSUBW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
443 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
444 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
446 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
447 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
448 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
449 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
450 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
451 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
452 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
453 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
454 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
455 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
456 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
457 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
458 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
459 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
460 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
461 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
462 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
463 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
464 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
465 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
466 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
467 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
468 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
469 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
470 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
471 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
474 static const SimdIntrinsc vector16b_intrinsics[] = { /* Intrinsic descriptors for the vector16b type. Entry layout as used below: method-name id (SN_*), opcode (a plain element index for GETTER/SETTER entries, 0 where no opcode is given), SIMD_VERSION_* requirement, SIMD_EMIT_* mode, and an optional trailing flag (SIMD_COMP_* or SIMD_PREFETCH_MODE_*). The V10..V15 accessors are listed between V1 and V2, i.e. the names are not in numeric order. NOTE(review): keep the existing entry order; a sorted lookup is presumably applied - confirm before reordering. */
475 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
476 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
477 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
478 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
479 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
480 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
481 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
482 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
483 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
484 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
485 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
486 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
487 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
488 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
489 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
490 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
491 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
492 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
493 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
494 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
495 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
496 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
497 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
498 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
499 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
500 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
501 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
502 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
503 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
504 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
505 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
506 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
507 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
508 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
509 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
510 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
511 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
512 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
513 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
514 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
515 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
516 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
517 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
518 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
519 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
520 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
521 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
522 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
523 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
524 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
525 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
526 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
527 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
528 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
529 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
530 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
531 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for the Vector16sb type (selected by class name in
 * mono_emit_simd_intrinsics). Judging from the rows and from how
 * emit_intrinsics and the emitters read the entries, each row appears to hold:
 * method name id, opcode (for getters/setters this slot carries the element
 * index instead), required SIMD version flag, emit mode and an optional
 * flags value (comparison kind or prefetch mode).
 * Entries are located with mono_binary_search keyed on the method name, so
 * the rows presumably have to stay sorted by name - TODO confirm against the
 * definition of the SN_* ids.
 */
538 static const SimdIntrinsc vector16sb_intrinsics[] = {
539 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
540 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
541 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
542 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
543 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
544 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
/* Max and Min are the only entries in this table gated on SIMD_VERSION_SSE41. */
545 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
546 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
/* Prefetch entries carry the prefetch mode in the trailing flags slot. */
547 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
548 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
549 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
550 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
551 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
552 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
553 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
554 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* Element getters: the second slot is the element index (0..15), not an opcode. */
555 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
556 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
557 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
560 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
561 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
562 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
563 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
564 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
565 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
566 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
567 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
568 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
569 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
570 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
/* Operators. Equality and inequality share OP_PCMPEQB and differ only in the flags slot. */
571 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
572 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
573 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
574 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
575 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
576 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
577 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
578 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* Element setters: the second slot is again the element index (0..15). */
579 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
580 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
581 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
584 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
585 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
586 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
587 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
588 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
589 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
590 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
591 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
592 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
593 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
594 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Bitmask of SIMD versions available on the current CPU. Filled in by
 * mono_simd_intrinsics_init and consulted by emit_intrinsics and
 * emit_simd_runtime_intrinsics.
 */
597 static guint32 simd_supported_versions;
/*
 * Comparator handed to mono_binary_search by emit_intrinsics.
 * key is the method name being searched for; value points at a SimdIntrinsc
 * table entry whose name id is turned into a string with method_name ().
 * The ordering is plain strcmp ordering.
 */
599 /*TODO match using number of parameters as well*/
601 simd_intrinsic_compare_by_name (const void *key, const void *value)
603 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
/*
 * Per-vreg state bits kept in the vreg_flags array of
 * mono_simd_simplify_indirection. A VREG_USED bit is also referenced by that
 * pass; its definition is not visible in this excerpt.
 */
/* An OP_XZERO of the vreg was seen in the first bb before any other use there. */
608 VREG_HAS_XZERO_BB0 = 0x02,
/* Some other instruction of the first bb touches the vreg. */
609 VREG_HAS_OTHER_OP_BB0 = 0x04,
/* So far the vreg is used by exactly one basic block after the first one. */
610 VREG_SINGLE_BB_USE = 0x08,
/* The vreg is used by more than one basic block after the first one. */
611 VREG_MANY_BB_USE = 0x10,
/*
 * One-time setup: caches the SIMD versions reported by the architecture
 * backend in simd_supported_versions.
 */
615 mono_simd_intrinsics_init (void)
617 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
618 /*TODO log the supported flags*/
/*
 * Helper for the first-basic-block scan of mono_simd_simplify_indirection.
 * When reg is a tracked vreg (valid index, within max_vreg, non-zero flags)
 * it records that an instruction other than its XZERO touched it in the first
 * bb: VREG_HAS_XZERO_BB0 is cleared and VREG_HAS_OTHER_OP_BB0 is set.
 * ins is only used for the debug trace.
 * NOTE(review): the return statements are not visible in this excerpt;
 * callers test the gboolean result to decide whether to keep examining ins.
 */
621 static inline gboolean
622 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
624 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
625 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
626 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
627 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
/*
 * Helper for the scan of the basic blocks that follow the first one.
 * Only vregs that still have VREG_HAS_XZERO_BB0 set are of interest; reg is
 * ignored when it is -1, above max_vreg, not flagged, or already attributed
 * to bb through target_bb.
 * The first bb that uses the vreg is stored in target_bb [reg] and the vreg
 * gets VREG_SINGLE_BB_USE; a later use from a different bb replaces that flag
 * with VREG_MANY_BB_USE.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
633 static inline gboolean
634 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
636 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
/* Second distinct bb using the vreg: demote from single-bb to many-bb use. */
639 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
640 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
641 vreg_flags [reg] |= VREG_MANY_BB_USE;
642 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
/* First bb seen using the vreg: remember it as the candidate XZERO location. */
644 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
645 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
646 target_bb [reg] = bb;
647 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
654 This pass recalculates which vars need MONO_INST_INDIRECT.
656 We cannot do this for non-SIMD vars since code like mono_get_vtable_var
657 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
/*
 * Recomputes MONO_INST_INDIRECT for SIMD variables and relocates the XZERO
 * initialisation of SIMD vregs that are only used by a single later basic
 * block.
 *
 * Phases, in order:
 *  1. clear MONO_INST_INDIRECT on every SIMD var and find the largest vreg;
 *  2. set it again on SIMD vars that are still the target of an OP_LDADDR,
 *     remembering the first bb that has code;
 *  3. mark the SIMD vars that are neither INDIRECT nor VOLATILE as tracked;
 *  4. scan the first bb for vregs whose only op there is an OP_XZERO;
 *  5. scan the remaining bbs to learn whether such a vreg is used by one bb
 *     or by many;
 *  6. for single-bb vregs, add an XZERO in the using bb (unless the first
 *     instruction touching the vreg is a pure definition) and nullify the
 *     XZERO in the first bb.
 *
 * NOTE(review): several short lines (continue/break/free statements, braces)
 * are not visible in this excerpt; in particular no release of vreg_flags and
 * target_bb is visible - confirm they are freed before returning.
 */
660 mono_simd_simplify_indirection (MonoCompile *cfg)
663 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Phase 1: drop INDIRECT from all SIMD vars and compute the array bound. */
667 for (i = 0; i < cfg->num_varinfo; i++) {
668 MonoInst *var = cfg->varinfo [i];
669 if (var->klass->simd_type) {
670 var->flags &= ~MONO_INST_INDIRECT;
671 max_vreg = MAX (var->dreg, max_vreg);
/* Phase 2: any remaining OP_LDADDR on a SIMD var makes it INDIRECT again. */
675 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
676 if (!first_bb && bb->code)
678 for (ins = bb->code; ins; ins = ins->next) {
679 if (ins->opcode == OP_LDADDR) {
680 MonoInst *var = (MonoInst*)ins->inst_p0;
681 if (var->klass->simd_type) {
682 var->flags |= MONO_INST_INDIRECT;
688 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
/* Both arrays are zero-initialised and indexed by vreg, valid range 0..max_vreg. */
689 vreg_flags = (char *)g_malloc0 (max_vreg + 1);
690 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Phase 3: only SIMD vars that are neither INDIRECT nor VOLATILE are tracked. */
692 for (i = 0; i < cfg->num_varinfo; i++) {
693 MonoInst *var = cfg->varinfo [i];
694 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
695 vreg_flags [var->dreg] = VREG_USED;
696 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
/*
 * Phase 4.
 * NOTE(review): first_bb is dereferenced without a NULL check; it stays NULL
 * if no bb has code - confirm that cannot happen for callers of this pass.
 */
700 /*Scan the first basic block looking xzeros not used*/
701 for (ins = first_bb->code; ins; ins = ins->next) {
703 int sregs [MONO_MAX_SRC_REGS];
/*
 * NOTE(review): ins->dreg is used as an index here without the
 * "<= max_vreg" check that apply_vreg_first_block_interference performs -
 * confirm OP_XZERO can only target a SIMD var vreg at this point.
 */
705 if (ins->opcode == OP_XZERO) {
706 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
707 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
708 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* Any other instruction interferes through the LDADDR target, its dreg or its sregs. */
712 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
714 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
716 num_sregs = mono_inst_get_src_registers (ins, sregs);
717 for (i = 0; i < num_sregs; ++i) {
718 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
/* Debug-only dump of the classification produced by phase 4. */
723 if (IS_DEBUG_ON (cfg)) {
724 for (i = 0; i < cfg->num_varinfo; i++) {
725 MonoInst *var = cfg->varinfo [i];
726 if (var->klass->simd_type) {
727 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
728 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
729 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
730 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
735 /*TODO stop here if no var is xzero only*/
738 Scan all other bb and check if it has only one other use
739 Ideally this would be done after an extended bb formation pass
741 FIXME This pass could use dominator information to properly
742 place the XZERO on the bb that dominates all uses of the var,
743 but this will have zero effect with the current local reg alloc
745 TODO simply the use of flags.
/* Phase 5: classify each XZERO-only vreg as used by one or by many later bbs. */
748 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
749 for (ins = bb->code; ins; ins = ins->next) {
751 int sregs [MONO_MAX_SRC_REGS];
753 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
755 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
757 num_sregs = mono_inst_get_src_registers (ins, sregs);
758 for (i = 0; i < num_sregs; ++i) {
759 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
760 max_vreg, vreg_flags, target_bb))
/* Phase 6a: for every single-bb vreg, find its first use inside the target bb. */
766 for (i = 0; i < cfg->num_varinfo; i++) {
767 MonoInst *var = cfg->varinfo [i];
768 if (!var->klass->simd_type)
770 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
771 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
772 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
773 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
775 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
777 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
779 int sregs [MONO_MAX_SRC_REGS];
780 gboolean found = FALSE;
/* found presumably records whether ins reads the vreg - the assignment is not visible here. */
782 num_sregs = mono_inst_get_src_registers (ins, sregs);
783 for (j = 0; j < num_sregs; ++j) {
784 if (sregs [j] == var->dreg)
787 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
788 if (ins->dreg == var->dreg && !found) {
789 DEBUG (printf ("[simd-simplify] INGORING R%d on BB %d because first op is a def", i, target_bb [var->dreg]->block_num););
792 DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i, target_bb [var->dreg]->block_num); );
/* Materialise the zeroing right before the instruction that needs it. */
794 MONO_INST_NEW (cfg, tmp, OP_XZERO);
795 tmp->dreg = var->dreg;
796 tmp->type = STACK_VTYPE;
797 tmp->klass = var->klass;
798 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/*
 * Phase 6b: the XZERO left in the first bb is now redundant for single-bb
 * vregs. The statement that nullifies it is presumably on the following,
 * not shown, line.
 * NOTE(review): as in phase 4, ins->dreg is not range-checked before
 * indexing vreg_flags.
 */
804 for (ins = first_bb->code; ins; ins = ins->next) {
805 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE)) {
806 DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins->dreg); mono_print_ins(ins));
816 * This function expects src to be a value.
/*
 * Resolves the vreg that holds the SIMD value produced by src.
 * Two source shapes are accepted: an OP_XMOVE instruction and any instruction
 * typed STACK_VTYPE. Anything else is treated as a JIT bug: a warning is
 * logged, the instruction is printed and g_assert_not_reached () fires.
 * NOTE(review): the return statements of both accepted branches are not
 * visible in this excerpt.
 */
819 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
821 if (src->opcode == OP_XMOVE) {
823 } else if (src->type == STACK_VTYPE) {
826 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
827 mono_print_ins (src);
828 g_assert_not_reached ();
832 * This function will load the value if needed.
/*
 * Like get_simd_vreg, but also accepts addresses of SIMD values.
 *  - OP_XMOVE and STACK_VTYPE sources are handled as values;
 *  - OP_LDADDR sources resolve to the dreg of the variable whose address was
 *    taken;
 *  - STACK_PTR / STACK_MP sources get an OP_LOADX_MEMBASE emitted into the
 *    current bb, loading into a freshly allocated vreg.
 * indirect is an optional out parameter (callers such as
 * simd_intrinsic_emit_getter pass NULL); presumably it reports whether the
 * value had to be loaded through a pointer - the assignments are not visible
 * in this excerpt, TODO confirm.
 * Unsupported sources log a warning, print src and assert.
 */
835 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
839 if (src->opcode == OP_XMOVE) {
841 } else if (src->opcode == OP_LDADDR) {
842 int res = ((MonoInst*)src->inst_p0)->dreg;
845 } else if (src->type == STACK_VTYPE) {
847 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* Pointer source: load the vector from memory; klass is the declaring SIMD class of cmethod. */
852 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
853 ins->klass = cmethod->klass;
854 ins->sreg1 = src->dreg;
855 ins->type = STACK_VTYPE;
856 ins->dreg = alloc_ireg (cfg);
857 MONO_ADD_INS (cfg->cbb, ins);
860 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
861 mono_print_ins (src);
862 g_assert_not_reached ();
/*
 * Returns the per-method local used as scratch memory by ops that need to
 * move a double between register files. The variable is created lazily as an
 * OP_LOCAL of type double, marked MONO_INST_VOLATILE, and cached in
 * cfg->fconv_to_r8_x_var.
 */
865 /*We share the var with fconv_to_r8_x to save some stack space.*/
867 get_double_spill_area (MonoCompile *cfg)
869 if (!cfg->fconv_to_r8_x_var) {
870 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
871 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
873 return cfg->fconv_to_r8_x_var;
/*
 * Returns the per-method local that multi-argument SIMD constructors use as
 * staging memory (see simd_intrinsic_emit_ctor). It is created lazily with
 * the type of avector_klass, marked MONO_INST_VOLATILE and cached in
 * cfg->simd_ctor_var. Since the variable is cached, the klass passed on later
 * calls is ignored.
 */
876 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
878 if (!cfg->simd_ctor_var) {
879 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
880 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
882 return cfg->simd_ctor_var;
/*
 * Maps a scalar MonoType to the opcode used to expand it into a vector
 * (callers compare the result against OP_EXPAND_R4 / OP_EXPAND_R8).
 * The individual case arms are not visible in this excerpt; the visible
 * g_assert_not_reached () is presumably the default arm for unsupported
 * types.
 */
886 mono_type_to_expand_op (MonoType *type)
888 switch (type->type) {
906 g_assert_not_reached ();
/*
 * Operand resolver for binary intrinsics that may mix a vector with a scalar.
 * position selects which of the two parameters of cmethod src corresponds to;
 * the signature must have exactly two parameters.
 * If that parameter is a SIMD type its vreg is returned via get_simd_vreg.
 * Otherwise an OP_EXPAND_* instruction matching the scalar type is emitted
 * into the current bb with src as input; R4 and R8 expands additionally get a
 * spill variable attached.
 * NOTE(review): the final return is not visible in this excerpt; presumably
 * it yields the dreg of the expand instruction.
 */
911 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, int position)
914 MonoMethodSignature *sig = mono_method_signature (cmethod);
917 g_assert (sig->param_count == 2);
918 g_assert (position == 0 || position == 1);
920 if (mono_class_from_mono_type (sig->params [position])->simd_type)
921 return get_simd_vreg (cfg, cmethod, src);
923 expand_op = mono_type_to_expand_op (sig->params [position]);
924 MONO_INST_NEW (cfg, ins, expand_op);
925 ins->klass = cmethod->klass;
926 ins->sreg1 = src->dreg;
927 ins->type = STACK_VTYPE;
928 ins->dreg = alloc_ireg (cfg);
929 MONO_ADD_INS (cfg->cbb, ins);
/* Floating point expands need scratch memory; the spill var is attached after the ins was added. */
931 if (expand_op == OP_EXPAND_R4)
932 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
933 else if (expand_op == OP_EXPAND_R8)
934 ins->backend.spill_var = get_double_spill_area (cfg);
/*
 * Emits a two-operand SIMD instruction (SIMD_EMIT_BINARY).
 * Both arguments are resolved with get_simd_vreg_or_expanded_scalar, so
 * either side may be a scalar that gets expanded first. The opcode comes
 * from the table entry, the entry's flags are passed through inst_c0, and
 * the result goes to a newly allocated vreg typed STACK_VTYPE.
 */
940 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
943 int left_vreg, right_vreg;
945 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [0], 0);
946 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [1], 1);
949 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
950 ins->klass = cmethod->klass;
951 ins->sreg1 = left_vreg;
952 ins->sreg2 = right_vreg;
953 ins->type = STACK_VTYPE;
954 ins->dreg = alloc_ireg (cfg);
955 ins->inst_c0 = intrinsic->flags;
956 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits a one-operand SIMD instruction (SIMD_EMIT_UNARY) using the opcode of
 * the table entry. The operand vreg is taken from args [0], which must be a
 * SIMD value; the result is a new vreg typed STACK_VTYPE.
 * NOTE(review): the line wiring vreg into the instruction is not visible in
 * this excerpt.
 */
961 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
966 vreg = get_simd_vreg (cfg, cmethod, args [0]);
968 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
969 ins->klass = cmethod->klass;
971 ins->type = STACK_VTYPE;
972 ins->dreg = alloc_ireg (cfg);
973 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Maps the element type of a getter to the OP_EXTRACT_* opcode that reads one
 * element out of a vector. The case labels are not visible in this excerpt;
 * judging by the opcode names, signed and unsigned 8 and 16 bit elements have
 * dedicated opcodes and the remaining supported types share OP_EXTRACT_I4.
 * Unsupported types hit g_assert_not_reached ().
 */
978 mono_type_to_extract_op (MonoType *type)
980 switch (type->type) {
982 return OP_EXTRACT_I1;
984 return OP_EXTRACT_U1;
986 return OP_EXTRACT_I2;
988 return OP_EXTRACT_U2;
992 return OP_EXTRACT_I4;
994 g_assert_not_reached ();
/*
 * Used by simd_intrinsic_emit_getter to split an element index into a dword
 * index (index >> shift) and a position inside that dword
 * (index & ((1 << shift) - 1)).
 * The case arms with the actual shift amounts are not visible in this
 * excerpt; unsupported types hit g_assert_not_reached ().
 */
998 /*Returns the amount to shift the element index to get the dword it belongs to*/
1000 mono_type_elements_shift_bits (MonoType *type)
1002 switch (type->type) {
1014 g_assert_not_reached ();
/*
 * Maps an element type to the OP_INSERT_* opcode used by the LLVM path of
 * simd_intrinsic_emit_setter. Marked G_GNUC_UNUSED, presumably because that
 * path can be compiled out - TODO confirm.
 * Case labels are not visible in this excerpt; the opcode names indicate one
 * opcode per element width (1, 2, 4, 8 bytes) plus R4 and R8.
 * Unsupported types hit g_assert_not_reached ().
 */
1018 static G_GNUC_UNUSED int
1019 mono_type_to_insert_op (MonoType *type)
1021 switch (type->type) {
1024 return OP_INSERT_I1;
1027 return OP_INSERT_I2;
1030 return OP_INSERT_I4;
1033 return OP_INSERT_I8;
1035 return OP_INSERT_R4;
1037 return OP_INSERT_R8;
1039 g_assert_not_reached ();
/*
 * Maps an element type to the insert opcode used by the non-LLVM setter path.
 * Every width uses an OP_INSERTX_*_SLOW opcode except the one that returns
 * plain OP_INSERT_I2 (presumably the 16 bit element types, whose case labels
 * are not visible in this excerpt).
 * Unsupported types hit g_assert_not_reached ().
 */
1044 mono_type_to_slow_insert_op (MonoType *type)
1046 switch (type->type) {
1049 return OP_INSERTX_U1_SLOW;
1052 return OP_INSERT_I2;
1055 return OP_INSERTX_I4_SLOW;
1058 return OP_INSERTX_I8_SLOW;
1060 return OP_INSERTX_R4_SLOW;
1062 return OP_INSERTX_R8_SLOW;
1064 g_assert_not_reached ();
/*
 * Emits code for an element setter (SIMD_EMIT_SETTER). args [0] is the vector
 * (value or address, resolved with load_simd_vreg), args [1] the new element
 * value, and intrinsic->opcode carries the element index.
 *
 * Three code shapes are produced:
 *  - LLVM: a single OP_INSERT_* chosen by element type;
 *  - element size 2, 4 or 8: a single "slow" insert op, with a spill var for
 *    R4 / R8 elements;
 *  - otherwise: OP_EXTRACTX_U2 of the 16 bit word at index / 2 followed by
 *    OP_INSERTX_U1_SLOW.
 * A trailing OP_STOREX_MEMBASE writes to args [0]->dreg; presumably it is
 * only emitted when load_simd_vreg reported an indirect source - the guarding
 * condition is not visible in this excerpt.
 */
1069 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1072 MonoMethodSignature *sig = mono_method_signature (cmethod);
/* Element size in bytes, taken from the type of the value parameter. */
1077 size = mono_type_size (sig->params [0], &align);
1079 if (COMPILE_LLVM (cfg)) {
1080 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1081 ins->klass = cmethod->klass;
1082 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1083 ins->sreg2 = args [1]->dreg;
1084 ins->inst_c0 = intrinsic->opcode;
1085 MONO_ADD_INS (cfg->cbb, ins);
1086 } else if (size == 2 || size == 4 || size == 8) {
1087 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1088 ins->klass = cmethod->klass;
1089 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1090 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1091 ins->sreg2 = args [1]->dreg;
1092 ins->inst_c0 = intrinsic->opcode;
1093 if (sig->params [0]->type == MONO_TYPE_R4)
1094 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1095 else if (sig->params [0]->type == MONO_TYPE_R8)
1096 ins->backend.spill_var = get_double_spill_area (cfg);
1097 MONO_ADD_INS (cfg->cbb, ins);
/* Remaining sizes: first pull out the 16 bit word that contains the target element. */
1101 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1102 ins->klass = cmethod->klass;
1103 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1104 ins->type = STACK_I4;
1105 ins->dreg = vreg = alloc_ireg (cfg);
1106 ins->inst_c0 = intrinsic->opcode / 2;
1107 MONO_ADD_INS (cfg->cbb, ins);
/* ...then insert the new byte; the sreg1/dreg wiring lines are not visible in this excerpt. */
1109 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1110 ins->klass = cmethod->klass;
1112 ins->sreg2 = args [1]->dreg;
1114 ins->inst_c0 = intrinsic->opcode;
1115 MONO_ADD_INS (cfg->cbb, ins);
/* Write the updated vector back through the address held in args [0]. */
1119 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1120 ins->klass = cmethod->klass;
1121 ins->dreg = args [0]->dreg;
1123 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits code for an element getter (SIMD_EMIT_GETTER) for elements that fit
 * in 32 bits. intrinsic->opcode carries the element index.
 *
 * Non-LLVM: if the element does not live in dword 0 (index >> shift_bits is
 * non-zero) an OP_PSHUFLED with that dword index as immediate is emitted
 * first, and the extract op then uses only the index bits inside the dword.
 * LLVM: the extract op receives the full element index.
 * For R4 results the extracted 32 bit value is converted into a float
 * register with OP_ICONV_TO_R4_RAW (when cfg->r4fp) or OP_MOVE_I4_TO_F.
 * NOTE(review): the sreg1 wiring lines and the return are not visible in
 * this excerpt.
 */
1129 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1132 MonoMethodSignature *sig = mono_method_signature (cmethod);
1133 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
/* The caller does not care whether the vector came from memory, hence NULL. */
1135 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1137 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1138 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1139 ins->klass = cmethod->klass;
1141 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1142 ins->type = STACK_VTYPE;
1143 ins->dreg = vreg = alloc_ireg (cfg);
1144 MONO_ADD_INS (cfg->cbb, ins);
1147 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1148 ins->klass = cmethod->klass;
1150 ins->type = STACK_I4;
1151 ins->dreg = vreg = alloc_ireg (cfg);
1152 if (cfg->compile_llvm)
1153 ins->inst_c0 = intrinsic->opcode;
/* Non-LLVM: keep only the position of the element inside its dword. */
1155 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1156 MONO_ADD_INS (cfg->cbb, ins);
1158 if (sig->ret->type == MONO_TYPE_R4) {
1159 MONO_INST_NEW (cfg, ins, cfg->r4fp ? OP_ICONV_TO_R4_RAW : OP_MOVE_I4_TO_F);
1160 ins->klass = mono_defaults.single_class;
1162 ins->type = cfg->r4_stack_type;
1163 ins->dreg = alloc_freg (cfg);
1164 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1165 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits code for a 64 bit element getter (SIMD_EMIT_GETTER_QWORD).
 * The return type of cmethod selects OP_EXTRACT_R8 (double) or OP_EXTRACT_I8;
 * intrinsic->opcode is passed as the element index in inst_c0.
 * Two result setups are visible: STACK_R8 in a float vreg with the double
 * spill area attached, and STACK_I8 in a long vreg. They are presumably the
 * two arms of an is_r8 test whose if/else lines are not visible in this
 * excerpt.
 */
1171 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1175 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1177 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1179 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1180 ins->klass = cmethod->klass;
1182 ins->inst_c0 = intrinsic->opcode;
1184 ins->type = STACK_R8;
1185 ins->dreg = alloc_freg (cfg);
1186 ins->backend.spill_var = get_double_spill_area (cfg);
1188 ins->type = STACK_I8;
1189 ins->dreg = alloc_lreg (cfg);
1191 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits code for a SIMD constructor (SIMD_EMIT_CTOR). args [0] is the
 * destination (either an OP_LDADDR of a local or a pointer), args [1..] are
 * the constructor arguments.
 *
 * One parameter: the scalar is expanded with the opcode of the table entry.
 * With an LDADDR destination the LDADDR is nullified and the variable's vreg
 * becomes the target; otherwise a temporary vreg is used and an
 * OP_STOREX_MEMBASE to the destination address is emitted (presumably only in
 * the pointer case - the guard is not visible in this excerpt).
 *
 * Several parameters: each argument is stored with the type's store opcode at
 * offset i * arg_size from a base address - the shared ctor spill area for
 * LDADDR destinations, or the destination pointer itself. For LDADDR
 * destinations the vector is then loaded back with OP_LOADX_MEMBASE.
 */
1197 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1199 MonoInst *ins = NULL;
1201 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1202 MonoMethodSignature *sig = mono_method_signature (cmethod);
/* All ctor parameters are assumed to share the type of the first one; i only receives the alignment here. */
1203 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1204 int arg_size = mono_type_size (sig->params [0], &i);
1206 if (sig->param_count == 1) {
1210 dreg = args [0]->inst_i0->dreg;
1211 NULLIFY_INS (args [0]);
1213 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1214 dreg = alloc_ireg (cfg);
1217 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1218 ins->klass = cmethod->klass;
1219 ins->sreg1 = args [1]->dreg;
1220 ins->type = STACK_VTYPE;
1223 MONO_ADD_INS (cfg->cbb, ins);
1224 if (sig->params [0]->type == MONO_TYPE_R4)
1225 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1226 else if (sig->params [0]->type == MONO_TYPE_R8)
1227 ins->backend.spill_var = get_double_spill_area (cfg);
1230 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1231 ins->dreg = args [0]->dreg;
1233 MONO_ADD_INS (cfg->cbb, ins);
/* Multi-argument path: choose the memory the elements are assembled in. */
1239 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1240 MONO_ADD_INS (cfg->cbb, ins);
1241 addr_reg = ins->dreg;
1243 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1244 addr_reg = args [0]->dreg;
/* Store the arguments last to first; args [i + 1] because args [0] is the destination. */
1247 for (i = sig->param_count - 1; i >= 0; --i) {
1248 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1251 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1252 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1253 NULLIFY_INS (args [0]);
1255 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1256 ins->klass = cmethod->klass;
1257 ins->sreg1 = addr_reg;
1258 ins->type = STACK_VTYPE;
1260 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits code for an explicit conversion between SIMD types (SIMD_EMIT_CAST).
 * No data is converted: an OP_XMOVE into a new vreg is emitted and tagged
 * with the class of cmethod.
 * NOTE(review): the line wiring the source vreg is not visible in this
 * excerpt.
 */
1266 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1271 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1273 //TODO macroize this
1274 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1275 ins->klass = cmethod->klass;
1276 ins->type = STACK_VTYPE;
1278 ins->dreg = alloc_ireg (cfg);
1279 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits a SIMD shift (SIMD_EMIT_SHIFT). args [0] is the vector, args [1] the
 * shift amount.
 * Constant amount (OP_ICONST): the table opcode is used, the constant becomes
 * the immediate and the ICONST instruction is nullified.
 * Variable amount: the amount is first moved into a vector register with
 * OP_ICONV_TO_X and the opcode following the table opcode is used, relying on
 * the opcode numbering convention noted below.
 * NOTE(review): the lines wiring sreg1/sreg2 are not visible in this excerpt.
 */
1284 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1287 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1289 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1291 if (args [1]->opcode != OP_ICONST) {
1292 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1293 ins->klass = mono_defaults.int32_class;
1294 ins->sreg1 = args [1]->dreg;
1295 ins->type = STACK_I4;
1296 ins->dreg = vreg2 = alloc_ireg (cfg);
1297 MONO_ADD_INS (cfg->cbb, ins);
1299 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1302 MONO_INST_NEW (cfg, ins, opcode);
1303 ins->klass = cmethod->klass;
/* The immediate is read before the ICONST is nullified. */
1307 if (args [1]->opcode == OP_ICONST) {
1308 ins->inst_imm = args [1]->inst_c0;
1309 NULLIFY_INS (args [1]);
1312 ins->type = STACK_VTYPE;
1313 ins->dreg = alloc_ireg (cfg);
1314 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Returns TRUE when op lies in the opcode range OP_PCMPEQB..OP_PCMPEQQ
 * (inclusive). This relies on those opcodes being numbered contiguously.
 */
1318 static inline gboolean
1319 mono_op_is_packed_compare (int op)
1321 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/*
 * Emits op_Equality / op_Inequality (SIMD_EMIT_EQUALITY), producing a scalar
 * boolean.
 * Sequence: vector compare with the table opcode (flags go to inst_c0),
 * OP_EXTRACT_MASK of the compare result, then a scalar compare of the mask.
 *  - packed integer compare opcodes, or any SIMD_COMP_EQ entry: the mask is
 *    compared with 0xFFFF; equality uses OP_CEQ, inequality OP_CLT_UN;
 *  - otherwise (the remaining, non-packed-compare inequality case): the mask
 *    is compared with 0 using OP_CGT_UN.
 */
1325 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1328 int left_vreg, right_vreg, tmp_vreg;
1330 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1331 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
/* NOTE(review): klass is assigned twice below; the second assignment is redundant. */
1334 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1335 ins->klass = cmethod->klass;
1336 ins->sreg1 = left_vreg;
1337 ins->sreg2 = right_vreg;
1338 ins->type = STACK_VTYPE;
1339 ins->klass = cmethod->klass;
1340 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1341 ins->inst_c0 = intrinsic->flags;
1342 MONO_ADD_INS (cfg->cbb, ins);
1344 /*FIXME the next ops are SSE specific*/
1345 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1346 ins->klass = cmethod->klass;
1347 ins->sreg1 = tmp_vreg;
1348 ins->type = STACK_I4;
1349 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1350 MONO_ADD_INS (cfg->cbb, ins);
1352 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1353 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
1354 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1355 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1357 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1358 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1360 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits a shuffle (SIMD_EMIT_SHUFFLE). The selector is the last parameter and
 * must be a literal (OP_ICONST); other selectors are not supported (the
 * bail-out statement is not visible in this excerpt).
 * With three parameters a second source vector is taken from args [1], and a
 * table opcode of OP_PSHUFLED is rewritten to OP_SHUFPS after the instruction
 * has been added.
 * NOTE(review): inst_c0 of the selector is read after NULLIFY_INS was applied
 * to it; this only works as long as NULLIFY_INS leaves inst_c0 intact -
 * simd_intrinsic_emit_shift reads the constant first.
 */
1366 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1369 int vreg, vreg2 = -1;
1370 int param_count = mono_method_signature (cmethod)->param_count;
1372 if (args [param_count - 1]->opcode != OP_ICONST) {
1373 /*TODO Shuffle with non literals is not yet supported */
1377 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1378 if (param_count == 3)
1379 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1381 NULLIFY_INS (args [param_count - 1]);
1384 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1385 ins->klass = cmethod->klass;
1388 ins->inst_c0 = args [param_count - 1]->inst_c0;
1389 ins->type = STACK_VTYPE;
1390 ins->dreg = alloc_ireg (cfg);
1391 MONO_ADD_INS (cfg->cbb, ins);
1393 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1394 ins->opcode = OP_SHUFPS;
/*
 * Emits LoadAligned (SIMD_EMIT_LOAD_ALIGNED): an OP_LOADX_ALIGNED_MEMBASE
 * whose base address is args [0], loading into a new vreg typed STACK_VTYPE.
 * The table opcode is not used.
 */
1399 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1403 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1404 ins->klass = cmethod->klass;
1405 ins->sreg1 = args [0]->dreg;
1406 ins->type = STACK_VTYPE;
1407 ins->dreg = alloc_ireg (cfg);
1408 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits a vector store (SIMD_EMIT_STORE) with the store opcode of the table
 * entry. args [0] is the destination address - it is placed in dreg, which is
 * where the membase store ops in this file keep their base register - and
 * args [1] is the SIMD value.
 * NOTE(review): the line wiring vreg as source is not visible in this
 * excerpt.
 */
1413 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1418 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1420 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1421 ins->klass = cmethod->klass;
1422 ins->dreg = args [0]->dreg;
1424 ins->type = STACK_VTYPE;
1425 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits ExtractByteMask (SIMD_EMIT_EXTRACT_MASK): an OP_EXTRACT_MASK on the
 * SIMD value in args [0], producing a STACK_I4 result in a new vreg.
 * NOTE(review): the line wiring vreg as source is not visible in this
 * excerpt.
 */
1430 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1435 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1437 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1438 ins->klass = cmethod->klass;
1440 ins->type = STACK_I4;
1441 ins->dreg = alloc_ireg (cfg);
1442 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits a prefetch (SIMD_EMIT_PREFETCH): an OP_PREFETCH_MEMBASE on the
 * address in args [0]. The prefetch mode stored in the flags of the table
 * entry is handed to the backend through backend.arg_info. No dreg is set.
 */
1448 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1452 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1453 ins->klass = cmethod->klass;
1454 ins->sreg1 = args [0]->dreg;
1455 ins->backend.arg_info = intrinsic->flags;
1456 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Maps a single SIMD_VERSION_* flag to a printable name; used by the debug
 * output of emit_intrinsics, which prints the result with "%s".
 * The switch header and the returned strings are not visible in this
 * excerpt, so the behaviour for unknown values cannot be stated here.
 */
1461 simd_version_name (guint32 version)
1464 case SIMD_VERSION_SSE1:
1466 case SIMD_VERSION_SSE2:
1468 case SIMD_VERSION_SSE3:
1470 case SIMD_VERSION_SSSE3:
1472 case SIMD_VERSION_SSE41:
1474 case SIMD_VERSION_SSE42:
1476 case SIMD_VERSION_SSE4a:
/*
 * Looks cmethod up by name in one intrinsic table and emits its replacement.
 * intrinsics / size describe the table (size is the number of entries).
 * Steps:
 *  1. binary search by method name; the handling of a miss is only partly
 *     visible (a debug message is printed);
 *  2. optional debug dump of the call and of its arguments (fsig->hasthis is
 *     counted in);
 *  3. CPU support check: an entry with non-zero simd_version_flags is
 *     rejected when none of its flags is in simd_supported_versions
 *     (the bail-out statement is not visible in this excerpt);
 *  4. dispatch on simd_emit_mode to the matching simd_intrinsic_emit_*
 *     helper; an unknown mode asserts.
 */
1483 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1485 const SimdIntrinsc *result = (const SimdIntrinsc *)mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1487 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1490 if (IS_DEBUG_ON (cfg)) {
1492 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1493 max = fsig->param_count + fsig->hasthis;
1494 for (i = 0; i < max; ++i) {
1495 printf ("param %d: ", i);
1496 mono_print_ins (args [i]);
1499 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1500 if (IS_DEBUG_ON (cfg)) {
1502 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
/*
 * NOTE(review): the loop starts at bit 1, so a flag encoded as 1 << 0 would
 * never be listed - confirm against the SIMD_VERSION_* definitions.
 */
1503 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1504 if (result->simd_version_flags & (1 << x))
1505 printf ("%s ", simd_version_name (1 << x));
1512 switch (result->simd_emit_mode) {
1513 case SIMD_EMIT_BINARY:
1514 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1515 case SIMD_EMIT_UNARY:
1516 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1517 case SIMD_EMIT_SETTER:
1518 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1519 case SIMD_EMIT_GETTER:
1520 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1521 case SIMD_EMIT_GETTER_QWORD:
1522 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1523 case SIMD_EMIT_CTOR:
1524 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1525 case SIMD_EMIT_CAST:
1526 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1527 case SIMD_EMIT_SHUFFLE:
1528 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1529 case SIMD_EMIT_SHIFT:
1530 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1531 case SIMD_EMIT_EQUALITY:
1532 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1533 case SIMD_EMIT_LOAD_ALIGNED:
1534 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1535 case SIMD_EMIT_STORE:
1536 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1537 case SIMD_EMIT_EXTRACT_MASK:
1538 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1539 case SIMD_EMIT_PREFETCH:
1540 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1542 g_assert_not_reached ();
/*
 * Emits the address computation for a vector-sized access into a managed
 * array: arr + index * element_size + offsetof (MonoArray, vector).
 * array_type is the array type whose element size is used; arr and index are
 * the instructions producing the array reference and the element index.
 * On 64 bit targets the 32 bit index is sign-extended first.
 * Two bounds checks are emitted: one for index and one for
 * index + (16 / size - 1), i.e. the last element covered by a 16 byte
 * access. They are presumably guarded by check_bounds - the condition line is
 * not visible in this excerpt.
 * callers in this file use the result as an int register number (the return
 * statement is not visible).
 */
1546 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1550 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1552 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1553 mult_reg = alloc_preg (cfg);
1554 array_reg = arr->dreg;
1555 index_reg = index->dreg;
1557 #if SIZEOF_VOID_P == 8
1558 /* The array reg is 64 bits but the index reg is only 32 */
1559 index2_reg = alloc_preg (cfg);
1560 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1562 index2_reg = index_reg;
1564 index3_reg = alloc_preg (cfg);
1567 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1568 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1569 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1572 add_reg = alloc_preg (cfg);
/* add_reg = array + index * size, then skip the array header. */
1574 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1575 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1576 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
1577 ins->type = STACK_PTR;
1578 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Intrinsics for the ArrayExtensions class, matched by method name and
 * parameter count:
 *  - GetVector / GetVectorAligned (array, index): bounds-checked address
 *    computation followed by an unaligned or aligned vector load;
 *  - SetVector / SetVectorAligned (array, value, index): bounds-checked
 *    address computation followed by OP_STOREX_MEMBASE or
 *    OP_STOREX_ALIGNED_MEMBASE_REG of the value in args [1];
 *  - IsAligned (array, index): address computation without bounds check,
 *    then (addr & 15) == 0.
 * NOTE(review): the lines wiring addr into the load/store and the return
 * statements are not visible in this excerpt.
 */
1584 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1586 if ((!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) && fsig->param_count == 2) {
1588 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1590 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1591 load->klass = cmethod->klass;
1593 load->type = STACK_VTYPE;
1594 load->dreg = alloc_ireg (cfg);
1595 MONO_ADD_INS (cfg->cbb, load);
1599 if ((!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) && fsig->param_count == 3) {
/* Note the argument order: args [1] is the vector, args [2] the index. */
1601 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1602 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1604 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1605 store->klass = cmethod->klass;
1607 store->sreg1 = vreg;
1608 MONO_ADD_INS (cfg->cbb, store);
1612 if (!strcmp ("IsAligned", cmethod->name) && fsig->param_count == 2) {
1614 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
/* The address register is reused for the masked value and for the boolean result. */
1616 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1617 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1618 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1619 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Intrinsics for the SimdRuntime class. The parameterless get_AccelMode is
 * replaced by an integer constant holding simd_supported_versions, i.e. the
 * value is fixed when the method is compiled.
 */
1627 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1629 if (!strcmp ("get_AccelMode", cmethod->name) && fsig->param_count == 0) {
1631 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
/*
 * Entry point of this file: tries to replace a call to cmethod with SIMD
 * instructions.
 * Only methods of classes in the "Mono.Simd" namespace of the "Mono.Simd"
 * assembly are considered. SimdRuntime and ArrayExtensions have dedicated
 * handlers. For VectorOperations the method must be static, and the vector
 * type is taken from the first parameter; for every other class the class
 * itself must be a SIMD type. The resulting class name then selects the
 * intrinsic table passed to emit_intrinsics.
 * cfg->uses_simd_intrinsics is set before the table lookup, so it is set even
 * when no table entry ends up matching.
 * NOTE(review): the statements executed when a filter rejects the method are
 * not visible in this excerpt.
 */
1638 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1640 const char *class_name;
1642 if (strcmp ("Mono.Simd", cmethod->klass->image->assembly->aname.name) ||
1643 strcmp ("Mono.Simd", cmethod->klass->name_space))
1646 class_name = cmethod->klass->name;
1647 if (!strcmp ("SimdRuntime", class_name))
1648 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1650 if (!strcmp ("ArrayExtensions", class_name))
1651 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1653 if (!strcmp ("VectorOperations", class_name)) {
1654 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1656 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1657 } else if (!cmethod->klass->simd_type)
1660 cfg->uses_simd_intrinsics = 1;
/* Each table's entry count is computed from the array size. */
1661 if (!strcmp ("Vector2d", class_name))
1662 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1663 if (!strcmp ("Vector4f", class_name))
1664 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1665 if (!strcmp ("Vector2ul", class_name))
1666 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1667 if (!strcmp ("Vector2l", class_name))
1668 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1669 if (!strcmp ("Vector4ui", class_name))
1670 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1671 if (!strcmp ("Vector4i", class_name))
1672 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1673 if (!strcmp ("Vector8us", class_name))
1674 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1675 if (!strcmp ("Vector8s", class_name))
1676 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1677 if (!strcmp ("Vector16b", class_name))
1678 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1679 if (!strcmp ("Vector16sb", class_name))
1680 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));
1685 #endif /* DISABLE_JIT */
1686 #endif /* MONO_ARCH_SIMD_INTRINSICS */