2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
15 #include "mono/utils/bsearch.h"
16 #include <mono/metadata/abi-details.h>
19 General notes on SIMD intrinsics
21 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
22 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
23 TODO extend op_to_op_dest_membase to handle simd ops
24 TODO add support for indexed versions of simd ops
25 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
26 TODO make sure locals, arguments and spills are properly aligned.
27 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
28 TODO add stuff to man pages
29 TODO document this under /docs
30 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
31 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
32 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
33 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
34 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
35 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
36 TODO check if we need to init the SSE control word with better precision.
37 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
38 TODO make SimdRuntime.get_AccelMode work under AOT
39 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding a indirection reduction pass after the dce pass.
40 TODO extend bounds checking code to support range checking.
42 General notes for SIMD intrinsics.
44 -Bad extractor and constructor performance
45 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
46 It will be loaded in the FP stack just to be pushed on the call stack.
48 A similar thing happens with the Vector4f constructor that requires float vars to be
50 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
51 trip to the FP stack is desirable.
53 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
57 -Promote OP_EXTRACT_I4 to a STORE op
58 The advantage of this change is that it could have a _membase version and promote further optimizations.
60 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
/*
 * Debug helpers for the SIMD intrinsics pass.  IS_DEBUG_ON is true when the
 * method being compiled has verbose_level >= 3; the commented-out definition
 * is a quick way to force the logging off.  Note that DEBUG(a) expands
 * IS_DEBUG_ON(cfg) and therefore relies on a variable named `cfg` being in
 * scope at every expansion site.
 */
64 #ifdef MONO_ARCH_SIMD_INTRINSICS
66 //#define IS_DEBUG_ON(cfg) (0)
68 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
69 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
75 SIMD_EMIT_GETTER_QWORD,
81 SIMD_EMIT_LOAD_ALIGNED,
83 SIMD_EMIT_EXTRACT_MASK,
/*
 * Method-name table.  simd-methods.h is an X-macro list of
 * SIMD_METHOD(string, name) entries that is included several times with
 * different definitions of SIMD_METHOD.
 * NOTE(review): this chunk is missing several lines of the original
 * machinery (#undefs, closing braces, the #else) so the fragments below do
 * not form a complete translation unit on their own.
 */
87 #ifdef HAVE_ARRAY_ELEM_INIT
88 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
89 #define MSGSTRFIELD1(line) str##line
/* First expansion: one char array per method string, all packed into a
 * single struct so each name can be addressed by byte offset. */
90 static const struct msgstr_t {
91 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
92 #include "simd-methods.h"
/* Second expansion: the string literals that initialize that struct. */
95 #define SIMD_METHOD(str,name) str,
96 #include "simd-methods.h"
/* Third expansion: each SN_* constant becomes the byte offset of its string
 * within msgstr_t (offsetof-based, so no pointer relocations are needed). */
101 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
102 #include "simd-methods.h"
104 #define method_name(idx) ((const char*)&method_names + (idx))
/* Fallback path: a plain array of string pointers indexed by SN_* value. */
107 #define SIMD_METHOD(str,name) str,
108 static const char * const method_names [] = {
113 #define SIMD_METHOD(str,name) name,
115 #include "simd-methods.h"
119 #define method_name(idx) (method_names [(idx)])
/* Bitmask of SIMD_VERSION_* flags: the instruction-set levels that provide
 * this intrinsic (third initializer in the tables below). */
126 guint8 simd_version_flags;
/* One of the SIMD_EMIT_* modes (fourth initializer) selecting how the
 * intrinsic is lowered; 4 bits are sufficient for the enum's range. */
127 guint8 simd_emit_mode : 4;
/*
 * Intrinsics accepted for Mono.Simd.Vector4f (four packed 32-bit floats).
 * Row layout mirrors SimdIntrinsc: { method name id, opcode or per-method
 * data (the element index for getters/setters, 0 when unused), minimum
 * SIMD_VERSION_* flags, SIMD_EMIT_* mode, optional flags (the immediate
 * comparison kind for OP_COMPPS rows, the hint for prefetch rows) }.
 * NOTE(review): the table appears sorted by method name, presumably for a
 * bsearch-based lookup (bsearch.h is included above) -- keep it sorted.
 */
131 static const SimdIntrinsc vector4f_intrinsics[] = {
132 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
133 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
134 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
135 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
136 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
137 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
138 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
139 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
140 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
141 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
142 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
143 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
144 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
145 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
146 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
147 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
148 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
149 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
150 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
151 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
152 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
153 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
154 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
155 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
156 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
157 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
158 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
159 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
160 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
161 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
162 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
163 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
164 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
165 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
166 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
167 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
168 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
169 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
170 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
171 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
172 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
173 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
174 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
175 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
176 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
177 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
178 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
179 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
180 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
181 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
182 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
/*
 * Intrinsics accepted for Mono.Simd.Vector2d (two packed doubles).
 * Same row layout as the other tables: { name id, opcode/data, minimum
 * SIMD_VERSION_* flags, SIMD_EMIT_* mode, optional flags }.  Getters use
 * SIMD_EMIT_GETTER_QWORD because each element is 64 bits wide.
 * NOTE(review): appears sorted by method name for bsearch lookup -- keep
 * it sorted when adding entries.
 */
185 static const SimdIntrinsc vector2d_intrinsics[] = {
186 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
187 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
188 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
189 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
190 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
191 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
192 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
193 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
194 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
195 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
196 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
197 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
198 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
199 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
200 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
201 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
202 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
203 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
204 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
205 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
206 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
207 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
208 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
209 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
210 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
211 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
212 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
213 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
214 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
215 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
216 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
217 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
219 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
220 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
221 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
222 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
223 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
224 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
225 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
226 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
229 static const SimdIntrinsc vector2ul_intrinsics[] = {
230 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
231 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
232 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
233 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
234 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
235 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
236 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
237 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
238 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
239 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
240 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
241 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
242 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
243 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
244 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
245 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
246 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
247 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
248 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
249 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
250 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
251 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
252 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
253 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsics accepted for Mono.Simd.Vector2l (two packed signed 64-bit
 * integers).  Row layout: { name id, opcode/data, minimum SIMD_VERSION_*
 * flags, SIMD_EMIT_* mode, optional flags }.  There is no op_RightShift
 * entry because SSE has no arithmetic 64-bit right shift; only the logical
 * variant (OP_PSHRQ) is exposed via LogicalRightShift.
 * NOTE(review): appears sorted by method name for bsearch lookup.
 */
256 static const SimdIntrinsc vector2l_intrinsics[] = {
257 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
258 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
259 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
260 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
261 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
262 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
263 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
264 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
265 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
266 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
267 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
268 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
269 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
270 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
271 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
272 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
273 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
274 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
275 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
276 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
277 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
278 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
279 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
280 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
281 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsics accepted for Mono.Simd.Vector4ui (four packed unsigned 32-bit
 * integers).  Row layout: { name id, opcode/data, minimum SIMD_VERSION_*
 * flags, SIMD_EMIT_* mode, optional flags }.  Signed-flavored operations
 * (ArithmeticRightShift, SignedPack*) are exposed here under explicit names
 * -- presumably deliberate in the managed API; confirm against Mono.Simd.
 * NOTE(review): appears sorted by method name for bsearch lookup.
 */
284 static const SimdIntrinsc vector4ui_intrinsics[] = {
285 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
286 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
287 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
288 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
289 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
290 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
291 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
292 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
293 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
294 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
295 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
296 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
297 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
298 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
299 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
300 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
301 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
302 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
303 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
304 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
305 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
306 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
307 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
308 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
309 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
310 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
311 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
312 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
313 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
314 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
315 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
316 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
317 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
318 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
319 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsics accepted for Mono.Simd.Vector4i (four packed signed 32-bit
 * integers).  Row layout: { name id, opcode/data, minimum SIMD_VERSION_*
 * flags, SIMD_EMIT_* mode, optional flags }.  op_RightShift uses the
 * arithmetic shift (OP_PSARD); the logical variant is exposed separately
 * as LogicalRightShift (OP_PSHRD).
 * NOTE(review): appears sorted by method name for bsearch lookup.
 */
322 static const SimdIntrinsc vector4i_intrinsics[] = {
323 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
324 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
325 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
326 { SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
327 { SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
328 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
329 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
330 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
331 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
332 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
333 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
334 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
335 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
336 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
337 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
338 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
339 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
340 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
341 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
342 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
343 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
344 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
345 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
346 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
347 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
348 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
349 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
350 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
351 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
352 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
353 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
354 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
355 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
356 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
357 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
358 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
359 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
360 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
363 static const SimdIntrinsc vector8us_intrinsics[] = {
364 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
365 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
366 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
367 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
368 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
369 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
370 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
371 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
372 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
373 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
374 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
375 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
376 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
377 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
378 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
379 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
380 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
381 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
382 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
383 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
384 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
385 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
386 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
387 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
388 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
389 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
390 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
391 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
392 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
393 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
395 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
396 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
397 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
398 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
399 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
400 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
401 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
402 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
403 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
404 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
405 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
406 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
407 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
408 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
409 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
410 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
411 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
414 static const SimdIntrinsc vector8s_intrinsics[] = {
415 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
416 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
417 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
418 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
419 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
420 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
421 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
422 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
423 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
424 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
425 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
426 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
427 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
428 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
429 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
430 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
431 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
432 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
433 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
436 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
437 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
438 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
439 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
440 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
441 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
442 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
443 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
444 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
446 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
447 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
448 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
449 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
450 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
451 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
452 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
453 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
454 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
455 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
456 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
457 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
458 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
459 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
460 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
461 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
462 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsics accepted for Mono.Simd.Vector16b (sixteen packed unsigned
 * 8-bit integers).  Row layout: { name id, opcode/data (element index for
 * getters/setters), minimum SIMD_VERSION_* flags, SIMD_EMIT_* mode,
 * optional flags }.  Note the get_V*/set_V* entries are in lexicographic
 * (string) order -- V10..V15 before V2 -- consistent with a name-sorted
 * table; presumably required by a bsearch lookup, so keep that ordering.
 */
465 static const SimdIntrinsc vector16b_intrinsics[] = {
466 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
467 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
468 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
469 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
470 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
471 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
472 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
473 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
474 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
475 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
476 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
477 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
478 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
479 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
480 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
481 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
482 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
483 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
484 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
485 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
486 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
487 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
488 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
489 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
490 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
491 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
492 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
493 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
494 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
495 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
496 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
497 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
498 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
499 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
500 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
501 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
502 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
503 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
504 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
505 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
506 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
507 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
508 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
509 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
510 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
511 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
512 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
513 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
514 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
515 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
516 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
517 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
518 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
519 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
520 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
521 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
522 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector16sb (16 x signed byte).
 * Looked up via mono_binary_search with a strcmp comparator (see
 * simd_intrinsic_compare_by_name below), so entries must stay sorted
 * by method name.
 * NOTE(review): the closing "};" of this table is elided in this listing.
 */
529 static const SimdIntrinsc vector16sb_intrinsics[] = {
530 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
531 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
532 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
533 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
534 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
535 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
536 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
537 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
538 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
539 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
540 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
541 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
542 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
543 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
544 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
545 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
546 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
547 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
548 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
549 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
550 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
551 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
552 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
553 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
554 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
555 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
556 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
557 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
560 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
561 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
562 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
563 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
564 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
565 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
566 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
567 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
568 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
569 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
570 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
571 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
572 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
573 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
574 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
575 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
576 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
577 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
578 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
579 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
580 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
581 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
584 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
585 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/* Bitmask of SIMD instruction-set versions supported by the host CPU;
 * filled in once by mono_simd_intrinsics_init (). */
588 static guint32 simd_supported_versions;
590 /*TODO match using number of parameters as well*/
/* bsearch comparator used by emit_intrinsics: 'key' is the managed
 * method name (C string), 'value' points into one of the SimdIntrinsc
 * tables above.  NOTE(review): return type and braces are elided in
 * this listing. */
592 simd_intrinsic_compare_by_name (const void *key, const void *value)
594 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
/* Per-vreg state flags used by mono_simd_simplify_indirection.
 * NOTE(review): the enum header and its first member (VREG_USED,
 * assigned to vreg_flags[] below — presumably 0x01) are elided in this
 * listing; confirm against the full source. */
599 VREG_HAS_XZERO_BB0 = 0x02,
600 VREG_HAS_OTHER_OP_BB0 = 0x04,
601 VREG_SINGLE_BB_USE = 0x08,
602 VREG_MANY_BB_USE = 0x10,
/* One-time initialization: ask the backend which SIMD instruction sets
 * the CPU supports and cache the bitmask for later intrinsic checks. */
606 mono_simd_intrinsics_init (void)
608 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
609 /*TODO log the supported flags*/
/*
 * Record that 'reg' is touched by a non-XZERO instruction in the first
 * basic block: clears VREG_HAS_XZERO_BB0 and sets VREG_HAS_OTHER_OP_BB0.
 * Ignores reg == -1 (no register), regs above max_vreg, and regs not
 * tracked in vreg_flags.
 * NOTE(review): the return statements are elided in this listing —
 * presumably TRUE when the flags were updated, FALSE otherwise.
 */
612 static inline gboolean
613 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
615 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
616 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
617 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
618 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
/*
 * Track in which basic blocks (beyond bb0) an xzero'd vreg is used:
 * the first using bb is remembered in target_bb and the vreg marked
 * VREG_SINGLE_BB_USE; a use from a different bb upgrades it to
 * VREG_MANY_BB_USE.  Only vregs still flagged VREG_HAS_XZERO_BB0 are
 * considered; repeat uses from the same bb are ignored.
 * NOTE(review): the return statements are elided in this listing.
 */
624 static inline gboolean
625 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
627 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
630 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
631 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
632 vreg_flags [reg] |= VREG_MANY_BB_USE;
633 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
635 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
636 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
637 target_bb [reg] = bb;
638 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
645 This pass recalculates which vars need MONO_INST_INDIRECT.
647 We cannot do this for non SIMD vars since code like mono_get_vtable_var
648 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
651 mono_simd_simplify_indirection (MonoCompile *cfg)
654 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Pass 1: clear MONO_INST_INDIRECT on all SIMD vars and find the
 * highest SIMD dreg in use (bounds the flag arrays below). */
658 for (i = 0; i < cfg->num_varinfo; i++) {
659 MonoInst *var = cfg->varinfo [i];
660 if (var->klass->simd_type) {
661 var->flags &= ~MONO_INST_INDIRECT;
662 max_vreg = MAX (var->dreg, max_vreg);
/* Pass 2: SIMD vars whose address is taken (OP_LDADDR) must stay
 * indirect; also remember the first non-empty basic block. */
666 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
667 if (!first_bb && bb->code)
669 for (ins = bb->code; ins; ins = ins->next) {
670 if (ins->opcode == OP_LDADDR) {
671 MonoInst *var = (MonoInst*)ins->inst_p0;
672 if (var->klass->simd_type) {
673 var->flags |= MONO_INST_INDIRECT;
679 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
680 vreg_flags = g_malloc0 (max_vreg + 1);
681 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Seed the flag array: only direct, non-volatile SIMD vars are
 * candidates for xzero sinking. */
683 for (i = 0; i < cfg->num_varinfo; i++) {
684 MonoInst *var = cfg->varinfo [i];
685 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
686 vreg_flags [var->dreg] = VREG_USED;
687 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
691 /*Scan the first basic block looking for xzeros not otherwise used*/
692 for (ins = first_bb->code; ins; ins = ins->next) {
694 int sregs [MONO_MAX_SRC_REGS];
696 if (ins->opcode == OP_XZERO) {
697 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
698 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
699 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* Any other touch of the vreg in bb0 — via LDADDR target, dreg, or any
 * sreg — disqualifies it. */
703 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
705 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
707 num_sregs = mono_inst_get_src_registers (ins, sregs);
708 for (i = 0; i < num_sregs; ++i) {
709 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
714 if (IS_DEBUG_ON (cfg)) {
715 for (i = 0; i < cfg->num_varinfo; i++) {
716 MonoInst *var = cfg->varinfo [i];
717 if (var->klass->simd_type) {
718 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
719 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
720 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
721 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
726 /*TODO stop here if no var is xzero only*/
729 Scan all other bb and check if it has only one other use
730 Ideally this would be done after an extended bb formation pass
732 FIXME This pass could use dominator information to properly
733 place the XZERO on the bb that dominates all uses of the var,
734 but this will have zero effect with the current local reg alloc
736 TODO simplify the use of flags.
739 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
740 for (ins = bb->code; ins; ins = ins->next) {
742 int sregs [MONO_MAX_SRC_REGS];
744 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
746 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
748 num_sregs = mono_inst_get_src_registers (ins, sregs);
749 for (i = 0; i < num_sregs; ++i) {
750 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
751 max_vreg, vreg_flags, target_bb))
/* For each single-bb-use candidate, sink the XZERO into the first use
 * site of that bb (unless the first use is a plain def, which makes
 * the zeroing dead). */
757 for (i = 0; i < cfg->num_varinfo; i++) {
758 MonoInst *var = cfg->varinfo [i];
759 if (!var->klass->simd_type)
761 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
762 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
763 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
764 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
766 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
768 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
770 int sregs [MONO_MAX_SRC_REGS];
771 gboolean found = FALSE;
773 num_sregs = mono_inst_get_src_registers (ins, sregs);
774 for (j = 0; j < num_sregs; ++j) {
775 if (sregs [j] == var->dreg)
778 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
/* NOTE(review): "INGORING" typo in the debug string below — it is a
 * runtime string, left untouched here; fix upstream. */
779 if (ins->dreg == var->dreg && !found) {
780 DEBUG (printf ("[simd-simplify] INGORING R%d on BB %d because first op is a def", i, target_bb [var->dreg]->block_num););
783 DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i, target_bb [var->dreg]->block_num); );
785 MONO_INST_NEW (cfg, tmp, OP_XZERO);
786 tmp->dreg = var->dreg;
787 tmp->type = STACK_VTYPE;
788 tmp->klass = var->klass;
789 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Finally, remove the now-redundant XZERO from the first bb. */
795 for (ins = first_bb->code; ins; ins = ins->next) {
796 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE)) {
797 DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins->dreg); mono_print_ins(ins));
807 * This function expects that src is already a value (no load is emitted).
/* Return the SIMD vreg holding 'src': forward through an OP_XMOVE, or
 * take the dreg of a STACK_VTYPE value directly.  Anything else is a
 * JIT bug and aborts.  NOTE(review): return statements and braces are
 * elided in this listing. */
810 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
812 if (src->opcode == OP_XMOVE) {
814 } else if (src->type == STACK_VTYPE) {
817 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
818 mono_print_ins (src);
819 g_assert_not_reached ();
823 * This function will load the value if needed (unlike get_simd_vreg).
/* Resolve 'src' to a SIMD vreg: forward an OP_XMOVE, unwrap an
 * OP_LDADDR to the underlying var's dreg, use a STACK_VTYPE dreg
 * directly, or — for a pointer (STACK_PTR/STACK_MP) — emit an
 * OP_LOADX_MEMBASE from it.  'indirect' (when non-NULL, per the
 * LDADDR/pointer paths — surrounding lines elided) reports whether a
 * memory access was involved.  Unknown inputs abort. */
826 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
830 if (src->opcode == OP_XMOVE) {
832 } else if (src->opcode == OP_LDADDR) {
833 int res = ((MonoInst*)src->inst_p0)->dreg;
836 } else if (src->type == STACK_VTYPE) {
838 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
843 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
844 ins->klass = cmethod->klass;
845 ins->sreg1 = src->dreg;
846 ins->type = STACK_VTYPE;
847 ins->dreg = alloc_ireg (cfg);
848 MONO_ADD_INS (cfg->cbb, ins);
851 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
852 mono_print_ins (src);
853 g_assert_not_reached ();
856 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily create (and cache on the MonoCompile) a volatile double-sized
 * stack local used as a spill slot by R8 insert/extract ops. */
858 get_double_spill_area (MonoCompile *cfg)
860 if (!cfg->fconv_to_r8_x_var) {
861 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
862 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
864 return cfg->fconv_to_r8_x_var;
/* Lazily create (and cache) a volatile stack local of the vector type,
 * used as scratch space by the element-wise .ctor path. */
867 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
869 if (!cfg->simd_ctor_var) {
870 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
871 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
873 return cfg->simd_ctor_var;
/* Map a scalar MonoType to the OP_EXPAND_* opcode that broadcasts it
 * across a vector.  NOTE(review): the case labels and return values
 * (lines 880-896) are elided in this listing — confirm against the
 * full source.  Unknown types abort. */
877 mono_type_to_expand_op (MonoType *type)
879 switch (type->type) {
897 g_assert_not_reached ();
/*
 * For a two-operand intrinsic, return the SIMD vreg for the argument at
 * 'position'.  If the declared parameter is already a SIMD type, defer
 * to get_simd_vreg; otherwise emit an OP_EXPAND_* to broadcast the
 * scalar into a fresh vector vreg.  R4/R8 expansions need a spill slot
 * (they round-trip through memory).
 */
902 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, int position)
905 MonoMethodSignature *sig = mono_method_signature (cmethod);
908 g_assert (sig->param_count == 2);
909 g_assert (position == 0 || position == 1);
911 if (mono_class_from_mono_type (sig->params [position])->simd_type)
912 return get_simd_vreg (cfg, cmethod, src);
914 expand_op = mono_type_to_expand_op (sig->params [position]);
915 MONO_INST_NEW (cfg, ins, expand_op);
916 ins->klass = cmethod->klass;
917 ins->sreg1 = src->dreg;
918 ins->type = STACK_VTYPE;
919 ins->dreg = alloc_ireg (cfg);
920 MONO_ADD_INS (cfg->cbb, ins);
922 if (expand_op == OP_EXPAND_R4)
923 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
924 else if (expand_op == OP_EXPAND_R8)
925 ins->backend.spill_var = get_double_spill_area (cfg);
/* Emit a two-operand SIMD op (intrinsic->opcode) into a fresh vreg.
 * Scalar arguments are broadcast first; intrinsic->flags is passed
 * through in inst_c0 (used e.g. by compare ops). */
931 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
934 int left_vreg, right_vreg;
936 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [0], 0);
937 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [1], 1);
940 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
941 ins->klass = cmethod->klass;
942 ins->sreg1 = left_vreg;
943 ins->sreg2 = right_vreg;
944 ins->type = STACK_VTYPE;
945 ins->dreg = alloc_ireg (cfg);
946 ins->inst_c0 = intrinsic->flags;
947 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a one-operand SIMD op (intrinsic->opcode) into a fresh vreg.
 * NOTE(review): the line assigning sreg1 = vreg (961) is elided in
 * this listing. */
952 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
957 vreg = get_simd_vreg (cfg, cmethod, args [0]);
959 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
960 ins->klass = cmethod->klass;
962 ins->type = STACK_VTYPE;
963 ins->dreg = alloc_ireg (cfg);
964 MONO_ADD_INS (cfg->cbb, ins);
/* Map a scalar element type to the OP_EXTRACT_* opcode that pulls one
 * element out of a vector (signed/unsigned byte and word variants,
 * I4 for the wider cases).  NOTE(review): the case labels themselves
 * are elided in this listing.  Unknown types abort. */
969 mono_type_to_extract_op (MonoType *type)
971 switch (type->type) {
973 return OP_EXTRACT_I1;
975 return OP_EXTRACT_U1;
977 return OP_EXTRACT_I2;
979 return OP_EXTRACT_U2;
983 return OP_EXTRACT_I4;
985 g_assert_not_reached ();
989 /*Returns the amount to shift the element index to get the dword it belongs to*/
/* NOTE(review): the case labels and return values (lines 994-1004) are
 * elided in this listing; unknown types abort. */
991 mono_type_elements_shift_bits (MonoType *type)
993 switch (type->type) {
1005 g_assert_not_reached ();
/* Map a scalar element type to the direct OP_INSERT_* opcode used on
 * the LLVM path of the setter emitter.  NOTE(review): case labels are
 * elided in this listing; unknown types abort. */
1009 static G_GNUC_UNUSED int
1010 mono_type_to_insert_op (MonoType *type)
1012 switch (type->type) {
1015 return OP_INSERT_I1;
1018 return OP_INSERT_I2;
1021 return OP_INSERT_I4;
1024 return OP_INSERT_I8;
1026 return OP_INSERT_R4;
1028 return OP_INSERT_R8;
1030 g_assert_not_reached ();
/* Map a scalar element type to the "slow" OP_INSERTX_*_SLOW opcode used
 * on the non-LLVM setter path (I2 uses the direct insert since PINSRW
 * is available).  NOTE(review): case labels are elided in this
 * listing; unknown types abort. */
1035 mono_type_to_slow_insert_op (MonoType *type)
1037 switch (type->type) {
1040 return OP_INSERTX_U1_SLOW;
1043 return OP_INSERT_I2;
1046 return OP_INSERTX_I4_SLOW;
1049 return OP_INSERTX_I8_SLOW;
1051 return OP_INSERTX_R4_SLOW;
1053 return OP_INSERTX_R8_SLOW;
1055 g_assert_not_reached ();
/*
 * Emit code for a Vn.set_Vx element setter.  Three paths:
 *  - LLVM: a single direct insert op;
 *  - element size 2/4/8: a "slow" insert op (dreg == sreg1 encodes the
 *    read-modify-write dependency on the previous vector value);
 *  - byte elements: extract the containing word (OP_EXTRACTX_U2),
 *    merge the byte (OP_INSERTX_U1_SLOW), using intrinsic->opcode as
 *    the byte index (so /2 gives the word index).
 * Finally, when the source was accessed indirectly, the updated vector
 * is stored back (OP_STOREX_MEMBASE) — the guarding condition is on
 * elided lines; confirm against the full source.
 */
1060 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1063 MonoMethodSignature *sig = mono_method_signature (cmethod);
1068 size = mono_type_size (sig->params [0], &align);
1070 if (COMPILE_LLVM (cfg)) {
1071 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1072 ins->klass = cmethod->klass;
1073 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1074 ins->sreg2 = args [1]->dreg;
1075 ins->inst_c0 = intrinsic->opcode;
1076 MONO_ADD_INS (cfg->cbb, ins);
1077 } else if (size == 2 || size == 4 || size == 8) {
1078 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1079 ins->klass = cmethod->klass;
1080 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1081 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1082 ins->sreg2 = args [1]->dreg;
1083 ins->inst_c0 = intrinsic->opcode;
1084 if (sig->params [0]->type == MONO_TYPE_R4)
1085 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1086 else if (sig->params [0]->type == MONO_TYPE_R8)
1087 ins->backend.spill_var = get_double_spill_area (cfg);
1088 MONO_ADD_INS (cfg->cbb, ins);
/* Byte path: first pull out the 16-bit lane containing the target byte. */
1092 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1093 ins->klass = cmethod->klass;
1094 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1095 ins->type = STACK_I4;
1096 ins->dreg = vreg = alloc_ireg (cfg);
1097 ins->inst_c0 = intrinsic->opcode / 2;
1098 MONO_ADD_INS (cfg->cbb, ins);
1100 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1101 ins->klass = cmethod->klass;
1103 ins->sreg2 = args [1]->dreg;
1105 ins->inst_c0 = intrinsic->opcode;
1106 MONO_ADD_INS (cfg->cbb, ins);
/* Write the updated vector back through the original address. */
1110 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1111 ins->klass = cmethod->klass;
1112 ins->dreg = args [0]->dreg;
1114 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emit code for a Vn.get_Vx element getter.  intrinsic->opcode encodes
 * the element index; the high bits (>> shift_bits) select the dword via
 * an OP_PSHUFLED shuffle (non-LLVM only), the low bits select within it
 * for the extract op.  R4 results are moved from the integer reg to an
 * FP reg via a spill slot.
 */
1120 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1123 MonoMethodSignature *sig = mono_method_signature (cmethod);
1124 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1126 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1128 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1129 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1130 ins->klass = cmethod->klass;
1132 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1133 ins->type = STACK_VTYPE;
1134 ins->dreg = vreg = alloc_ireg (cfg);
1135 MONO_ADD_INS (cfg->cbb, ins);
1138 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1139 ins->klass = cmethod->klass;
1141 ins->type = STACK_I4;
1142 ins->dreg = vreg = alloc_ireg (cfg);
1143 if (cfg->compile_llvm)
1144 ins->inst_c0 = intrinsic->opcode;
1146 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1147 MONO_ADD_INS (cfg->cbb, ins);
/* Float getters: re-interpret the extracted I4 bits as an R4 value. */
1149 if (sig->ret->type == MONO_TYPE_R4) {
1150 MONO_INST_NEW (cfg, ins, cfg->r4fp ? OP_ICONV_TO_R4_RAW : OP_MOVE_I4_TO_F);
1151 ins->klass = mono_defaults.single_class;
1153 ins->type = cfg->r4_stack_type;
1154 ins->dreg = alloc_freg (cfg);
1155 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1156 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a 64-bit element getter: OP_EXTRACT_R8 into an FP reg (with a
 * double spill slot) when the return type is R8, otherwise
 * OP_EXTRACT_I8 into a long reg.  intrinsic->opcode is the element
 * index (inst_c0). */
1162 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1166 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1168 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1170 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1171 ins->klass = cmethod->klass;
1173 ins->inst_c0 = intrinsic->opcode;
1175 ins->type = STACK_R8;
1176 ins->dreg = alloc_freg (cfg);
1177 ins->backend.spill_var = get_double_spill_area (cfg);
1179 ins->type = STACK_I8;
1180 ins->dreg = alloc_lreg (cfg);
1182 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emit code for a vector .ctor.  Two shapes:
 *  - one scalar argument: broadcast it with intrinsic->opcode
 *    (an OP_EXPAND_*), eliminating the LDADDR when the target is a
 *    local var, else storing the result through the target pointer;
 *  - one argument per element: store each arg into a spill area (or
 *    directly through the target pointer), then OP_LOADX_MEMBASE the
 *    assembled vector back when the target was a local.
 * Several guard/else lines are elided in this listing — confirm exact
 * control flow against the full source.
 */
1188 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1190 MonoInst *ins = NULL;
1192 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1193 MonoMethodSignature *sig = mono_method_signature (cmethod);
1194 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1195 int arg_size = mono_type_size (sig->params [0], &i);
1197 if (sig->param_count == 1) {
/* Broadcast path: 'this' is a local — write straight into its vreg and
 * drop the now-dead LDADDR. */
1201 dreg = args [0]->inst_i0->dreg;
1202 NULLIFY_INS (args [0]);
1204 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1205 dreg = alloc_ireg (cfg);
1208 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1209 ins->klass = cmethod->klass;
1210 ins->sreg1 = args [1]->dreg;
1211 ins->type = STACK_VTYPE;
1214 MONO_ADD_INS (cfg->cbb, ins);
1215 if (sig->params [0]->type == MONO_TYPE_R4)
1216 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1217 else if (sig->params [0]->type == MONO_TYPE_R8)
1218 ins->backend.spill_var = get_double_spill_area (cfg);
1221 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1222 ins->dreg = args [0]->dreg;
1224 MONO_ADD_INS (cfg->cbb, ins);
/* Element-wise path: pick a scratch address — the shared ctor spill
 * area for a local target, or the caller-supplied pointer. */
1230 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1231 MONO_ADD_INS (cfg->cbb, ins);
1232 addr_reg = ins->dreg;
1234 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1235 addr_reg = args [0]->dreg;
/* Store elements last-to-first at their natural offsets. */
1238 for (i = sig->param_count - 1; i >= 0; --i) {
1239 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1242 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1243 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1244 NULLIFY_INS (args [0]);
1246 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1247 ins->klass = cmethod->klass;
1248 ins->sreg1 = addr_reg;
1249 ins->type = STACK_VTYPE;
1251 MONO_ADD_INS (cfg->cbb, ins);
/* Emit an op_Explicit vector-to-vector cast: bits are unchanged, so a
 * plain OP_XMOVE into a fresh vreg retyped to the target class
 * suffices.  NOTE(review): the sreg1 = vreg assignment (line 1268) is
 * elided in this listing. */
1257 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1262 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1264 //TODO macroize this
1265 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1266 ins->klass = cmethod->klass;
1267 ins->type = STACK_VTYPE;
1269 ins->dreg = alloc_ireg (cfg);
1270 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emit a packed shift.  A constant shift count (OP_ICONST) becomes an
 * immediate (inst_imm) and the ICONST is nullified; a variable count is
 * first moved into a vector reg (OP_ICONV_TO_X) and the opcode bumped
 * to its shift-by-register variant (always opcode + 1 by convention).
 */
1275 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1278 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1280 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1282 if (args [1]->opcode != OP_ICONST) {
1283 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1284 ins->klass = mono_defaults.int32_class;
1285 ins->sreg1 = args [1]->dreg;
1286 ins->type = STACK_I4;
1287 ins->dreg = vreg2 = alloc_ireg (cfg);
1288 MONO_ADD_INS (cfg->cbb, ins);
1290 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1293 MONO_INST_NEW (cfg, ins, opcode);
1294 ins->klass = cmethod->klass;
1298 if (args [1]->opcode == OP_ICONST) {
1299 ins->inst_imm = args [1]->inst_c0;
1300 NULLIFY_INS (args [1]);
1303 ins->type = STACK_VTYPE;
1304 ins->dreg = alloc_ireg (cfg);
1305 MONO_ADD_INS (cfg->cbb, ins);
/* TRUE when 'op' is one of the integer packed-compare opcodes (relies
 * on OP_PCMPEQB..OP_PCMPEQQ being a contiguous opcode range). */
1309 static inline gboolean
1310 mono_op_is_packed_compare (int op)
1312 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/*
 * Emit op_Equality / op_Inequality: packed-compare the two vectors,
 * collapse the lane mask with OP_EXTRACT_MASK, then test the 16-bit
 * mask.  Integer compares (and SIMD_COMP_EQ) test mask == 0xFFFF (all
 * lanes equal); FP inequality compares produce a "not equal" mask, so
 * any non-zero bit means inequality (OR semantics, CGT_UN against 0).
 */
1316 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1319 int left_vreg, right_vreg, tmp_vreg;
1321 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1322 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1325 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1326 ins->klass = cmethod->klass;
1327 ins->sreg1 = left_vreg;
1328 ins->sreg2 = right_vreg;
1329 ins->type = STACK_VTYPE;
1330 ins->klass = cmethod->klass;
1331 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1332 ins->inst_c0 = intrinsic->flags;
1333 MONO_ADD_INS (cfg->cbb, ins);
1335 /*FIXME the next ops are SSE specific*/
1336 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1337 ins->klass = cmethod->klass;
1338 ins->sreg1 = tmp_vreg;
1339 ins->type = STACK_I4;
1340 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1341 MONO_ADD_INS (cfg->cbb, ins);
1343 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1344 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
1345 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1346 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1348 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1349 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1351 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emit a shuffle intrinsic.  The last argument must be a compile-time
 * constant (OP_ICONST) encoding the lane selector; it is folded into
 * inst_c0 and nullified.  Two-source shuffles (3 params) on the
 * OP_PSHUFLED opcode are rewritten to OP_SHUFPS.
 * NOTE(review): the bail-out for non-constant selectors and the
 * sreg1/sreg2 assignments are on elided lines.
 */
1357 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1360 int vreg, vreg2 = -1;
1361 int param_count = mono_method_signature (cmethod)->param_count;
1363 if (args [param_count - 1]->opcode != OP_ICONST) {
1364 /*TODO Shuffle with non literals is not yet supported */
1368 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1369 if (param_count == 3)
1370 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1372 NULLIFY_INS (args [param_count - 1]);
1375 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1376 ins->klass = cmethod->klass;
1379 ins->inst_c0 = args [param_count - 1]->inst_c0;
1380 ins->type = STACK_VTYPE;
1381 ins->dreg = alloc_ireg (cfg);
1382 MONO_ADD_INS (cfg->cbb, ins);
1384 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1385 ins->opcode = OP_SHUFPS;
/* Emit LoadAligned: an OP_LOADX_ALIGNED_MEMBASE from the pointer in
 * args[0] into a fresh vector vreg. */
1390 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1394 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1395 ins->klass = cmethod->klass;
1396 ins->sreg1 = args [0]->dreg;
1397 ins->type = STACK_VTYPE;
1398 ins->dreg = alloc_ireg (cfg);
1399 MONO_ADD_INS (cfg->cbb, ins);
/* Emit StoreAligned (or a plain store): intrinsic->opcode is the
 * STOREX variant; dest address in args[0], vector value in args[1].
 * NOTE(review): the sreg1 = vreg assignment (line 1414) is elided in
 * this listing. */
1404 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1409 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1411 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1412 ins->klass = cmethod->klass;
1413 ins->dreg = args [0]->dreg;
1415 ins->type = STACK_VTYPE;
1416 MONO_ADD_INS (cfg->cbb, ins);
/* Emit ExtractByteMask: OP_EXTRACT_MASK collapses the vector's
 * per-lane sign bits into an I4.  NOTE(review): the sreg1 = vreg
 * assignment (line 1430) is elided in this listing. */
1421 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1426 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1428 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1429 ins->klass = cmethod->klass;
1431 ins->type = STACK_I4;
1432 ins->dreg = alloc_ireg (cfg);
1433 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a Prefetch* intrinsic: OP_PREFETCH_MEMBASE on the address in
 * args[0]; intrinsic->flags carries the SIMD_PREFETCH_MODE_* hint in
 * backend.arg_info. */
1439 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1443 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1444 ins->klass = cmethod->klass;
1445 ins->sreg1 = args [0]->dreg;
1446 ins->backend.arg_info = intrinsic->flags;
1447 MONO_ADD_INS (cfg->cbb, ins);
/* Human-readable name of a SIMD_VERSION_* flag, for debug output.
 * NOTE(review): the returned string literals (and the default case)
 * are on elided lines in this listing. */
1452 simd_version_name (guint32 version)
1455 case SIMD_VERSION_SSE1:
1457 case SIMD_VERSION_SSE2:
1459 case SIMD_VERSION_SSE3:
1461 case SIMD_VERSION_SSSE3:
1463 case SIMD_VERSION_SSE41:
1465 case SIMD_VERSION_SSE42:
1467 case SIMD_VERSION_SSE4a:
/*
 * Look up cmethod->name in a (sorted) intrinsic table and dispatch to
 * the matching emit function.  Returns NULL (on elided lines) when the
 * method is not an intrinsic or the required SIMD instruction set is
 * not supported by this CPU.
 */
1474 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1476 const SimdIntrinsc * result = mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1478 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1481 if (IS_DEBUG_ON (cfg)) {
1483 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1484 max = fsig->param_count + fsig->hasthis;
1485 for (i = 0; i < max; ++i) {
1486 printf ("param %d: ", i);
1487 mono_print_ins (args [i]);
/* Reject intrinsics that need an instruction set this CPU lacks.
 * NOTE(review): "unsuported" typo in the debug string below — runtime
 * string, left untouched here; fix upstream. */
1490 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1491 if (IS_DEBUG_ON (cfg)) {
1493 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1494 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1495 if (result->simd_version_flags & (1 << x))
1496 printf ("%s ", simd_version_name (1 << x));
1503 switch (result->simd_emit_mode) {
1504 case SIMD_EMIT_BINARY:
1505 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1506 case SIMD_EMIT_UNARY:
1507 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1508 case SIMD_EMIT_SETTER:
1509 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1510 case SIMD_EMIT_GETTER:
1511 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1512 case SIMD_EMIT_GETTER_QWORD:
1513 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1514 case SIMD_EMIT_CTOR:
1515 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1516 case SIMD_EMIT_CAST:
1517 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1518 case SIMD_EMIT_SHUFFLE:
1519 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1520 case SIMD_EMIT_SHIFT:
1521 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1522 case SIMD_EMIT_EQUALITY:
1523 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1524 case SIMD_EMIT_LOAD_ALIGNED:
1525 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1526 case SIMD_EMIT_STORE:
1527 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1528 case SIMD_EMIT_EXTRACT_MASK:
1529 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1530 case SIMD_EMIT_PREFETCH:
1531 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1533 g_assert_not_reached ();
/*
 * Compute the address of a vector-sized element inside a managed array:
 * &arr->vector [index * element_size].  On 64-bit the 32-bit index is
 * sign-extended first.  When check_bounds is set, both the first and
 * the last scalar covered by the 16-byte access are bounds-checked
 * (index and index + 16/size - 1).  Returns the address register
 * (return statement on an elided line).
 */
1537 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1541 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1543 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1544 mult_reg = alloc_preg (cfg);
1545 array_reg = arr->dreg;
1546 index_reg = index->dreg;
1548 #if SIZEOF_VOID_P == 8
1549 /* The array reg is 64 bits but the index reg is only 32 */
1550 index2_reg = alloc_preg (cfg);
1551 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1553 index2_reg = index_reg;
1555 index3_reg = alloc_preg (cfg);
1558 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1559 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1560 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1563 add_reg = alloc_preg (cfg);
1565 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1566 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1567 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
1568 ins->type = STACK_PTR;
1569 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Intrinsics on Mono.Simd.ArrayExtensions:
 *  - GetVector[Aligned](array, index): bounds-checked element address
 *    plus an (aligned) vector load;
 *  - SetVector[Aligned](array, value, index): same address computation
 *    plus an (aligned) vector store;
 *  - IsAligned(array, index): tests (addr & 15) == 0.
 * Returns NULL for anything else (return lines elided in this listing).
 */
1575 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1577 if ((!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) && fsig->param_count == 2) {
1579 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1581 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1582 load->klass = cmethod->klass;
1584 load->type = STACK_VTYPE;
1585 load->dreg = alloc_ireg (cfg);
1586 MONO_ADD_INS (cfg->cbb, load);
1590 if ((!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) && fsig->param_count == 3) {
1592 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1593 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1595 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1596 store->klass = cmethod->klass;
1598 store->sreg1 = vreg;
1599 MONO_ADD_INS (cfg->cbb, store);
1603 if (!strcmp ("IsAligned", cmethod->name) && fsig->param_count == 2) {
1605 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1607 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1608 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1609 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1610 MONO_ADD_INS (cfg->cbb, ins);
/* SimdRuntime.get_AccelMode: fold the cached capability bitmask into an
 * ICONST.  NOTE(review): this bakes the JIT-time CPU capabilities into
 * the code — see the header TODO about making it work under AOT. */
1618 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1620 if (!strcmp ("get_AccelMode", cmethod->name) && fsig->param_count == 0) {
1622 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1629 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1631 const char *class_name;
1633 if (strcmp ("Mono.Simd", cmethod->klass->image->assembly->aname.name) ||
1634 strcmp ("Mono.Simd", cmethod->klass->name_space))
1637 class_name = cmethod->klass->name;
1638 if (!strcmp ("SimdRuntime", class_name))
1639 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1641 if (!strcmp ("ArrayExtensions", class_name))
1642 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1644 if (!strcmp ("VectorOperations", class_name)) {
1645 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1647 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1648 } else if (!cmethod->klass->simd_type)
1651 cfg->uses_simd_intrinsics = 1;
1652 if (!strcmp ("Vector2d", class_name))
1653 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1654 if (!strcmp ("Vector4f", class_name))
1655 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1656 if (!strcmp ("Vector2ul", class_name))
1657 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1658 if (!strcmp ("Vector2l", class_name))
1659 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1660 if (!strcmp ("Vector4ui", class_name))
1661 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1662 if (!strcmp ("Vector4i", class_name))
1663 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1664 if (!strcmp ("Vector8us", class_name))
1665 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1666 if (!strcmp ("Vector8s", class_name))
1667 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1668 if (!strcmp ("Vector16b", class_name))
1669 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1670 if (!strcmp ("Vector16sb", class_name))
1671 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));