 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
15 #include "mono/utils/bsearch.h"
16 #include <mono/metadata/abi-details.h>
19 General notes on SIMD intrinsics
21 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
22 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
23 TODO extend op_to_op_dest_membase to handle simd ops
24 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
26 TODO make sure locals, arguments and spills are properly aligned.
27 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
28 TODO add stuff to man pages
29 TODO document this under /docs
30 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
31 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
32 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
33 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
34 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
36 TODO check if we need to init the SSE control word with better precision.
37 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
38 TODO make SimdRuntime.get_AccelMode work under AOT
TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
TODO extend bounds checking code to support range checking.
42 General notes for SIMD intrinsics.
44 -Bad extractor and constructor performance
45 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
46 It will be loaded in the FP stack just to be pushed on the call stack.
48 A similar thing happens with Vector4f constructor that require float vars to be
50 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
51 trip to the FP stack is desirable.
53 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
57 -Promote OP_EXTRACT_I4 to a STORE op
58 The advantage of this change is that it could have a _membase version and promote further optimizations.
60 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
/* All SIMD intrinsic support is compiled only for architectures that declare it. */
#ifdef MONO_ARCH_SIMD_INTRINSICS

/* Swap in this definition to silence SIMD debug output unconditionally. */
//#define IS_DEBUG_ON(cfg) (0)

/* SIMD debug tracing is enabled at JIT verbosity level 3 and above. */
#define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
/* NOTE: relies on a variable named `cfg` being in scope at every use site. */
#define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
75 SIMD_EMIT_GETTER_QWORD,
81 SIMD_EMIT_LOAD_ALIGNED,
83 SIMD_EMIT_EXTRACT_MASK,
87 #ifdef HAVE_ARRAY_ELEM_INIT
88 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
89 #define MSGSTRFIELD1(line) str##line
90 static const struct msgstr_t {
91 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
92 #include "simd-methods.h"
95 #define SIMD_METHOD(str,name) str,
96 #include "simd-methods.h"
101 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
102 #include "simd-methods.h"
104 #define method_name(idx) ((const char*)&method_names + (idx))
107 #define SIMD_METHOD(str,name) str,
108 static const char * const method_names [] = {
109 #include "simd-methods.h"
113 #define SIMD_METHOD(str,name) name,
115 #include "simd-methods.h"
119 #define method_name(idx) (method_names [(idx)])
126 guint8 simd_version_flags;
127 guint8 simd_emit_mode : 4;
/*
 * Intrinsic mapping table for Mono.Simd.Vector4f (4 x 32-bit float).
 * Each entry: { method name id, opcode-or-immediate, minimum SSE version, emit mode[, flags] }.
 * NOTE(review): entries after the .ctor appear sorted by method name —
 * presumably a bsearch-based lookup depends on this order; keep it sorted.
 */
static const SimdIntrinsc vector4f_intrinsics[] = {
{ SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
{ SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
{ SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
{ SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
{ SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
{ SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
{ SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
{ SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
{ SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
{ SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
{ SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
{ SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
{ SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
{ SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
{ SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
{ SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
{ SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
{ SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
{ SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
{ SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
{ SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
{ SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
/* For getters/setters the second field is the lane index, not an opcode. */
{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
{ SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
{ SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
{ SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
185 static const SimdIntrinsc vector2d_intrinsics[] = {
186 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
187 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
188 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
189 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
190 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
191 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
192 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
193 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
194 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
195 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
196 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
197 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
198 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
199 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
200 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
201 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
202 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
203 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
204 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
205 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
206 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
207 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
208 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
209 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
210 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
211 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
212 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
213 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
214 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
215 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
216 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
217 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
219 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
220 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
221 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
222 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
223 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
224 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
225 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
226 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
229 static const SimdIntrinsc vector2ul_intrinsics[] = {
230 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
231 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
232 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
233 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
234 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
235 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
236 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
237 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
238 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
239 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
240 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
241 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
242 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
243 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
244 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
245 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
246 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
247 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
248 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
249 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
250 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
251 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
252 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
253 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic mapping table for Mono.Simd.Vector2l (2 x 64-bit signed).
 * Each entry: { method name id, opcode-or-immediate, minimum SSE version, emit mode[, flags] }.
 * NOTE(review): entries after the .ctor appear sorted by method name —
 * presumably a bsearch-based lookup depends on this order; keep it sorted.
 */
static const SimdIntrinsc vector2l_intrinsics[] = {
{ SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
{ SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
{ SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
{ SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* For getters/setters the second field is the lane index, not an opcode. */
{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
{ SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
{ SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
{ SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic mapping table for Mono.Simd.Vector4ui (4 x 32-bit unsigned).
 * Each entry: { method name id, opcode-or-immediate, minimum SSE version, emit mode[, flags] }.
 * NOTE(review): entries after the .ctor appear sorted by method name —
 * presumably a bsearch-based lookup depends on this order; keep it sorted.
 */
static const SimdIntrinsc vector4ui_intrinsics[] = {
{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
{ SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
{ SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
{ SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
{ SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* For getters/setters the second field is the lane index, not an opcode. */
{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
{ SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic mapping table for Mono.Simd.Vector4i (4 x 32-bit signed).
 * Each entry: { method name id, opcode-or-immediate, minimum SSE version, emit mode[, flags] }.
 * NOTE(review): entries after the .ctor appear sorted by method name —
 * presumably a bsearch-based lookup depends on this order; keep it sorted.
 */
static const SimdIntrinsc vector4i_intrinsics[] = {
{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
{ SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
{ SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
{ SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
{ SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
{ SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* For getters/setters the second field is the lane index, not an opcode. */
{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
/* Signed vector: operator >> is an arithmetic shift (PSARD), unlike the
 * unsigned table which maps it to the logical PSHRD. */
{ SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
363 static const SimdIntrinsc vector8us_intrinsics[] = {
364 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
365 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
366 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
367 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
368 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
369 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
370 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
371 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
372 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
373 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
374 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
375 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
376 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
377 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
378 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
379 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
380 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
381 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
382 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
383 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
384 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
385 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
386 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
387 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
388 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
389 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
390 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
391 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
392 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
393 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
395 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
396 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
397 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
398 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
399 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
400 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
401 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
402 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
403 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
404 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
405 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
406 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
407 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
408 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
409 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
410 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
411 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
414 static const SimdIntrinsc vector8s_intrinsics[] = {
415 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
416 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
417 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
418 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
419 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
420 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
421 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
422 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
423 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
424 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
425 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
426 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
427 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
428 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
429 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
430 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
431 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
432 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
433 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
436 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
437 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
438 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
439 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
440 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
441 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
442 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
443 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
444 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
446 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
447 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
448 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
449 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
450 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
451 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
452 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
453 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
454 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
455 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
456 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
457 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
458 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
459 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
460 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
461 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
462 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic mapping table for Mono.Simd.Vector16b (16 x 8-bit unsigned).
 * Each entry: { method name id, opcode-or-immediate, minimum SSE version, emit mode[, flags] }.
 * NOTE(review): entries after the .ctor appear sorted by method name
 * (note V10..V15 sort before V2) — presumably a bsearch-based lookup
 * depends on this order; keep it sorted.
 */
static const SimdIntrinsc vector16b_intrinsics[] = {
{ SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
{ SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
{ SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
{ SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* For getters/setters the second field is the lane index, not an opcode. */
{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
{ SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
{ SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
{ SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
{ SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
529 static const SimdIntrinsc vector16sb_intrinsics[] = {
530 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
531 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
532 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
533 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
534 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
535 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
536 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
537 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
538 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
539 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
540 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
541 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
542 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
543 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
544 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
545 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
546 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
547 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
548 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
549 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
550 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
551 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
552 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
553 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
554 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
555 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
556 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
557 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
560 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
561 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
562 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
563 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
564 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
565 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
566 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
567 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
568 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
569 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
570 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
571 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
572 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
573 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
574 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
575 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
576 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
577 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
578 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
579 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
580 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
581 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
584 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
585 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
588 static guint32 simd_supported_versions;
590 /*TODO match using number of parameters as well*/
/*
 * bsearch comparator: KEY is the managed method name (const char*),
 * VALUE is a SimdIntrinsc table entry.  Requires every intrinsic table
 * to be sorted by method name.  Overloads are not disambiguated here
 * (see the TODO above about matching on parameter count too).
 */
592 simd_intrinsic_compare_by_name (const void *key, const void *value)
594 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
599 VREG_HAS_XZERO_BB0 = 0x02,
600 VREG_HAS_OTHER_OP_BB0 = 0x04,
601 VREG_SINGLE_BB_USE = 0x08,
602 VREG_MANY_BB_USE = 0x10,
606 mono_simd_intrinsics_init (void)
608 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
609 /*TODO log the supported flags*/
/*
 * apply_vreg_first_block_interference:
 *
 * Called for each register touched by an instruction in the first basic
 * block.  If REG is a tracked SIMD vreg (its vreg_flags entry is set),
 * any use other than its defining OP_XZERO demotes it: the
 * VREG_HAS_XZERO_BB0 bit is cleared and VREG_HAS_OTHER_OP_BB0 recorded.
 * NOTE(review): the return statements are elided from this chunk;
 * presumably TRUE when the flags were updated — confirm against the
 * callers at the OP_LDADDR/dreg/sreg scan below.
 */
612 static inline gboolean
613 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
615 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
616 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
617 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
618 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
/*
 * apply_vreg_following_block_interference:
 *
 * Tracks, for a vreg that was XZERO-only in bb0, how many later basic
 * blocks use it.  First use in a new bb sets VREG_SINGLE_BB_USE and
 * remembers that bb in target_bb[reg]; a use from a second, different
 * bb flips it to VREG_MANY_BB_USE.  Uses from the already-recorded bb
 * (target_bb [reg] == bb) are ignored by the early-out below.
 */
624 static inline gboolean
625 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
627 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
630 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
631 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
632 vreg_flags [reg] |= VREG_MANY_BB_USE;
633 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
635 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
636 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
637 target_bb [reg] = bb;
638 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
645 This pass recalculate which vars need MONO_INST_INDIRECT.
647 We cannot do this for non SIMD vars since code like mono_get_vtable_var
648 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
651 mono_simd_simplify_indirection (MonoCompile *cfg)
654 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
658 for (i = 0; i < cfg->num_varinfo; i++) {
659 MonoInst *var = cfg->varinfo [i];
660 if (var->klass->simd_type) {
661 var->flags &= ~MONO_INST_INDIRECT;
662 max_vreg = MAX (var->dreg, max_vreg);
666 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
667 if (!first_bb && bb->code)
669 for (ins = bb->code; ins; ins = ins->next) {
670 if (ins->opcode == OP_LDADDR) {
671 MonoInst *var = (MonoInst*)ins->inst_p0;
672 if (var->klass->simd_type) {
673 var->flags |= MONO_INST_INDIRECT;
679 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
680 vreg_flags = g_malloc0 (max_vreg + 1);
681 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
683 for (i = 0; i < cfg->num_varinfo; i++) {
684 MonoInst *var = cfg->varinfo [i];
685 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
686 vreg_flags [var->dreg] = VREG_USED;
687 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
691 /*Scan the first basic block looking xzeros not used*/
692 for (ins = first_bb->code; ins; ins = ins->next) {
694 int sregs [MONO_MAX_SRC_REGS];
696 if (ins->opcode == OP_XZERO) {
697 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
698 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
699 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
703 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
705 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
707 num_sregs = mono_inst_get_src_registers (ins, sregs);
708 for (i = 0; i < num_sregs; ++i) {
709 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
714 if (IS_DEBUG_ON (cfg)) {
715 for (i = 0; i < cfg->num_varinfo; i++) {
716 MonoInst *var = cfg->varinfo [i];
717 if (var->klass->simd_type) {
718 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
719 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
720 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
721 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
726 /*TODO stop here if no var is xzero only*/
729 Scan all other bb and check if it has only one other use
730 Ideally this would be done after an extended bb formation pass
732 FIXME This pass could use dominator information to properly
733 place the XZERO on the bb that dominates all uses of the var,
734 but this will have zero effect with the current local reg alloc
736 TODO simply the use of flags.
739 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
740 for (ins = bb->code; ins; ins = ins->next) {
742 int sregs [MONO_MAX_SRC_REGS];
744 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
746 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
748 num_sregs = mono_inst_get_src_registers (ins, sregs);
749 for (i = 0; i < num_sregs; ++i) {
750 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
751 max_vreg, vreg_flags, target_bb))
757 for (i = 0; i < cfg->num_varinfo; i++) {
758 MonoInst *var = cfg->varinfo [i];
759 if (!var->klass->simd_type)
761 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
762 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
763 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
764 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
766 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
768 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
770 int sregs [MONO_MAX_SRC_REGS];
771 gboolean found = FALSE;
773 num_sregs = mono_inst_get_src_registers (ins, sregs);
774 for (j = 0; j < num_sregs; ++j) {
/* FIX: index with the inner loop variable 'j', not 'i' — 'i' holds the
 * outer varinfo index here, so 'sregs [i]' read an unrelated (possibly
 * stale/out-of-range) slot and could miss real uses of var->dreg,
 * wrongly suppressing or misplacing the inserted OP_XZERO. */
775 if (sregs [j] == var->dreg)
778 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
779 if (ins->dreg == var->dreg && !found) {
783 MONO_INST_NEW (cfg, tmp, OP_XZERO);
784 tmp->dreg = var->dreg;
785 tmp->type = STACK_VTYPE;
786 tmp->klass = var->klass;
787 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
793 for (ins = first_bb->code; ins; ins = ins->next) {
794 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
803 * This function expects that src is already a SIMD value (no load is emitted).
806 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
/* An OP_XMOVE source is unwrapped; a STACK_VTYPE source is used directly
 * (returned vreg elided in this chunk).  Anything else is a caller bug. */
808 if (src->opcode == OP_XMOVE) {
810 } else if (src->type == STACK_VTYPE) {
/* Hard failure: unlike load_simd_vreg below, this helper never emits a
 * load, so an unexpected source shape aborts compilation. */
813 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
814 mono_print_ins (src);
815 g_assert_not_reached ();
819 * This function will load the value if needed (contrast get_simd_vreg, which requires a value).
822 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
/* Cases, in order: unwrap OP_XMOVE; follow OP_LDADDR back to the var's
 * dreg; accept a STACK_VTYPE directly; for a pointer (STACK_PTR/STACK_MP)
 * emit an OP_LOADX_MEMBASE from it.  NOTE(review): the 'indirect' out
 * parameter is presumably set on the pointer paths — those lines are
 * elided here, confirm before relying on it. */
826 if (src->opcode == OP_XMOVE) {
828 } else if (src->opcode == OP_LDADDR) {
829 int res = ((MonoInst*)src->inst_p0)->dreg;
832 } else if (src->type == STACK_VTYPE) {
834 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* Pointer source: materialize the vector with an explicit load. */
839 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
840 ins->klass = cmethod->klass;
841 ins->sreg1 = src->dreg;
842 ins->type = STACK_VTYPE;
843 ins->dreg = alloc_ireg (cfg);
844 MONO_ADD_INS (cfg->cbb, ins);
/* No recognizable source shape: abort compilation loudly. */
847 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
848 mono_print_ins (src);
849 g_assert_not_reached ();
853 get_int_to_float_spill_area (MonoCompile *cfg)
855 if (!cfg->iconv_raw_var) {
856 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
857 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
859 return cfg->iconv_raw_var;
862 /*We share the var with fconv_to_r8_x to save some stack space.*/
864 get_double_spill_area (MonoCompile *cfg)
866 if (!cfg->fconv_to_r8_x_var) {
867 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
868 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
870 return cfg->fconv_to_r8_x_var;
873 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
875 if (!cfg->simd_ctor_var) {
876 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
877 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
879 return cfg->simd_ctor_var;
883 mono_type_to_expand_op (MonoType *type)
885 switch (type->type) {
903 g_assert_not_reached ();
908 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, int position)
911 MonoMethodSignature *sig = mono_method_signature (cmethod);
914 g_assert (sig->param_count == 2);
915 g_assert (position == 0 || position == 1);
917 if (mono_class_from_mono_type (sig->params [position])->simd_type)
918 return get_simd_vreg (cfg, cmethod, src);
920 expand_op = mono_type_to_expand_op (sig->params [position]);
921 MONO_INST_NEW (cfg, ins, expand_op);
922 ins->klass = cmethod->klass;
923 ins->sreg1 = src->dreg;
924 ins->type = STACK_VTYPE;
925 ins->dreg = alloc_ireg (cfg);
926 MONO_ADD_INS (cfg->cbb, ins);
928 if (expand_op == OP_EXPAND_R4)
929 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
930 else if (expand_op == OP_EXPAND_R8)
931 ins->backend.spill_var = get_double_spill_area (cfg);
/*
 * Emit a two-operand SIMD op (intrinsic->opcode) for cmethod.
 * Either operand may be a scalar: get_simd_vreg_or_expanded_scalar
 * expands it (OP_EXPAND_*) to a full vector based on the parameter
 * type at that position.  intrinsic->flags is carried in inst_c0
 * (e.g. comparison sub-op selectors).
 */
937 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
940 int left_vreg, right_vreg;
942 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [0], 0);
943 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [1], 1);
946 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
947 ins->klass = cmethod->klass;
948 ins->sreg1 = left_vreg;
949 ins->sreg2 = right_vreg;
950 ins->type = STACK_VTYPE;
951 ins->dreg = alloc_ireg (cfg);
952 ins->inst_c0 = intrinsic->flags;
953 MONO_ADD_INS (cfg->cbb, ins);
958 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
963 vreg = get_simd_vreg (cfg, cmethod, args [0]);
965 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
966 ins->klass = cmethod->klass;
968 ins->type = STACK_VTYPE;
969 ins->dreg = alloc_ireg (cfg);
970 MONO_ADD_INS (cfg->cbb, ins);
975 mono_type_to_extract_op (MonoType *type)
977 switch (type->type) {
979 return OP_EXTRACT_I1;
981 return OP_EXTRACT_U1;
983 return OP_EXTRACT_I2;
985 return OP_EXTRACT_U2;
989 return OP_EXTRACT_I4;
991 g_assert_not_reached ();
995 /*Returns the amount to shift the element index to get the dword it belongs to*/
997 mono_type_elements_shift_bits (MonoType *type)
999 switch (type->type) {
1011 g_assert_not_reached ();
1015 static G_GNUC_UNUSED int
1016 mono_type_to_insert_op (MonoType *type)
1018 switch (type->type) {
1021 return OP_INSERT_I1;
1024 return OP_INSERT_I2;
1027 return OP_INSERT_I4;
1030 return OP_INSERT_I8;
1032 return OP_INSERT_R4;
1034 return OP_INSERT_R8;
1036 g_assert_not_reached ();
1041 mono_type_to_slow_insert_op (MonoType *type)
1043 switch (type->type) {
1046 return OP_INSERTX_U1_SLOW;
1049 return OP_INSERT_I2;
1052 return OP_INSERTX_I4_SLOW;
1055 return OP_INSERTX_I8_SLOW;
1057 return OP_INSERTX_R4_SLOW;
1059 return OP_INSERTX_R8_SLOW;
1061 g_assert_not_reached ();
1066 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1069 MonoMethodSignature *sig = mono_method_signature (cmethod);
1074 size = mono_type_size (sig->params [0], &align);
1076 if (COMPILE_LLVM (cfg)) {
1077 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1078 ins->klass = cmethod->klass;
1079 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1080 ins->sreg2 = args [1]->dreg;
1081 ins->inst_c0 = intrinsic->opcode;
1082 MONO_ADD_INS (cfg->cbb, ins);
1083 } else if (size == 2 || size == 4 || size == 8) {
1084 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1085 ins->klass = cmethod->klass;
1086 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1087 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1088 ins->sreg2 = args [1]->dreg;
1089 ins->inst_c0 = intrinsic->opcode;
1090 if (sig->params [0]->type == MONO_TYPE_R4)
1091 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1092 else if (sig->params [0]->type == MONO_TYPE_R8)
1093 ins->backend.spill_var = get_double_spill_area (cfg);
1094 MONO_ADD_INS (cfg->cbb, ins);
1098 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1099 ins->klass = cmethod->klass;
1100 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1101 ins->type = STACK_I4;
1102 ins->dreg = vreg = alloc_ireg (cfg);
1103 ins->inst_c0 = intrinsic->opcode / 2;
1104 MONO_ADD_INS (cfg->cbb, ins);
1106 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1107 ins->klass = cmethod->klass;
1109 ins->sreg2 = args [1]->dreg;
1111 ins->inst_c0 = intrinsic->opcode;
1112 MONO_ADD_INS (cfg->cbb, ins);
1116 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1117 ins->klass = cmethod->klass;
1118 ins->dreg = args [0]->dreg;
1120 MONO_ADD_INS (cfg->cbb, ins);
1126 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1129 MonoMethodSignature *sig = mono_method_signature (cmethod);
1130 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1132 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1134 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1135 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1136 ins->klass = cmethod->klass;
1138 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1139 ins->type = STACK_VTYPE;
1140 ins->dreg = vreg = alloc_ireg (cfg);
1141 MONO_ADD_INS (cfg->cbb, ins);
1144 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1145 ins->klass = cmethod->klass;
1147 ins->type = STACK_I4;
1148 ins->dreg = vreg = alloc_ireg (cfg);
1149 if (cfg->compile_llvm)
1150 ins->inst_c0 = intrinsic->opcode;
1152 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1153 MONO_ADD_INS (cfg->cbb, ins);
1155 if (sig->ret->type == MONO_TYPE_R4) {
1156 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1157 ins->klass = mono_defaults.single_class;
1159 ins->type = STACK_R8;
1160 ins->dreg = alloc_freg (cfg);
1161 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1162 MONO_ADD_INS (cfg->cbb, ins);
1168 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1172 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1174 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1176 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1177 ins->klass = cmethod->klass;
1179 ins->inst_c0 = intrinsic->opcode;
1181 ins->type = STACK_R8;
1182 ins->dreg = alloc_freg (cfg);
1183 ins->backend.spill_var = get_double_spill_area (cfg);
1185 ins->type = STACK_I8;
1186 ins->dreg = alloc_lreg (cfg);
1188 MONO_ADD_INS (cfg->cbb, ins);
1194 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1196 MonoInst *ins = NULL;
1198 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1199 MonoMethodSignature *sig = mono_method_signature (cmethod);
1200 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1201 int arg_size = mono_type_size (sig->params [0], &i);
1203 if (sig->param_count == 1) {
1207 dreg = args [0]->inst_i0->dreg;
1208 NULLIFY_INS (args [0]);
1210 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1211 dreg = alloc_ireg (cfg);
1214 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1215 ins->klass = cmethod->klass;
1216 ins->sreg1 = args [1]->dreg;
1217 ins->type = STACK_VTYPE;
1220 MONO_ADD_INS (cfg->cbb, ins);
1221 if (sig->params [0]->type == MONO_TYPE_R4)
1222 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1223 else if (sig->params [0]->type == MONO_TYPE_R8)
1224 ins->backend.spill_var = get_double_spill_area (cfg);
1227 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1228 ins->dreg = args [0]->dreg;
1230 MONO_ADD_INS (cfg->cbb, ins);
1236 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1237 MONO_ADD_INS (cfg->cbb, ins);
1238 addr_reg = ins->dreg;
1240 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1241 addr_reg = args [0]->dreg;
1244 for (i = sig->param_count - 1; i >= 0; --i) {
1245 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1248 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1249 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1250 NULLIFY_INS (args [0]);
1252 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1253 ins->klass = cmethod->klass;
1254 ins->sreg1 = addr_reg;
1255 ins->type = STACK_VTYPE;
1257 MONO_ADD_INS (cfg->cbb, ins);
1263 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1268 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1270 //TODO macroize this
1271 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1272 ins->klass = cmethod->klass;
1273 ins->type = STACK_VTYPE;
1275 ins->dreg = alloc_ireg (cfg);
1276 MONO_ADD_INS (cfg->cbb, ins);
1281 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1284 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1286 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1288 if (args [1]->opcode != OP_ICONST) {
1289 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1290 ins->klass = mono_defaults.int32_class;
1291 ins->sreg1 = args [1]->dreg;
1292 ins->type = STACK_I4;
1293 ins->dreg = vreg2 = alloc_ireg (cfg);
1294 MONO_ADD_INS (cfg->cbb, ins);
1296 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1299 MONO_INST_NEW (cfg, ins, opcode);
1300 ins->klass = cmethod->klass;
1304 if (args [1]->opcode == OP_ICONST) {
1305 ins->inst_imm = args [1]->inst_c0;
1306 NULLIFY_INS (args [1]);
1309 ins->type = STACK_VTYPE;
1310 ins->dreg = alloc_ireg (cfg);
1311 MONO_ADD_INS (cfg->cbb, ins);
/*
 * TRUE for the integer packed-compare opcodes.
 * NOTE(review): relies on OP_PCMPEQB..OP_PCMPEQQ being declared as a
 * contiguous range in the opcode enum — confirm in mini-ops.h before
 * inserting new opcodes between them.
 */
1315 static inline gboolean
1316 mono_op_is_packed_compare (int op)
1318 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/*
 * Emit op_Equality/op_Inequality: packed-compare both vectors, extract
 * the per-byte sign mask (16 bits), then reduce the mask to a boolean.
 * intrinsic->flags is SIMD_COMP_EQ or SIMD_COMP_NEQ.
 */
1322 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1325 int left_vreg, right_vreg, tmp_vreg;
1327 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1328 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1331 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1332 ins->klass = cmethod->klass;
1333 ins->sreg1 = left_vreg;
1334 ins->sreg2 = right_vreg;
1335 ins->type = STACK_VTYPE;
/* (removed a duplicate 'ins->klass = cmethod->klass;' — already set above) */
1337 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1338 ins->inst_c0 = intrinsic->flags;
1339 MONO_ADD_INS (cfg->cbb, ins);
1341 /*FIXME the next ops are SSE specific*/
1342 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1343 ins->klass = cmethod->klass;
1344 ins->sreg1 = tmp_vreg;
1345 ins->type = STACK_I4;
1346 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1347 MONO_ADD_INS (cfg->cbb, ins);
1349 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
/* Integer compares only produce "equal" masks: all-equal <=> mask == 0xFFFF.
 * EQ tests mask == 0xFFFF (CEQ); integer NEQ tests mask < 0xFFFF (CLT_UN).
 * FP NEQ compares produce a "not equal" mask directly, so any nonzero
 * bit means inequality: mask > 0 (CGT_UN). */
1350 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
1351 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1352 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1354 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1355 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1357 MONO_ADD_INS (cfg->cbb, ins);
1363 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1366 int vreg, vreg2 = -1;
1367 int param_count = mono_method_signature (cmethod)->param_count;
1369 if (args [param_count - 1]->opcode != OP_ICONST) {
1370 /*TODO Shuffle with non literals is not yet supported */
1374 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1375 if (param_count == 3)
1376 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1378 NULLIFY_INS (args [param_count - 1]);
1381 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1382 ins->klass = cmethod->klass;
1385 ins->inst_c0 = args [param_count - 1]->inst_c0;
1386 ins->type = STACK_VTYPE;
1387 ins->dreg = alloc_ireg (cfg);
1388 MONO_ADD_INS (cfg->cbb, ins);
1390 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1391 ins->opcode = OP_SHUFPS;
1396 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1400 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1401 ins->klass = cmethod->klass;
1402 ins->sreg1 = args [0]->dreg;
1403 ins->type = STACK_VTYPE;
1404 ins->dreg = alloc_ireg (cfg);
1405 MONO_ADD_INS (cfg->cbb, ins);
1410 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1415 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1417 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1418 ins->klass = cmethod->klass;
1419 ins->dreg = args [0]->dreg;
1421 ins->type = STACK_VTYPE;
1422 MONO_ADD_INS (cfg->cbb, ins);
1427 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1432 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1434 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1435 ins->klass = cmethod->klass;
1437 ins->type = STACK_I4;
1438 ins->dreg = alloc_ireg (cfg);
1439 MONO_ADD_INS (cfg->cbb, ins);
1445 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1449 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1450 ins->klass = cmethod->klass;
1451 ins->sreg1 = args [0]->dreg;
1452 ins->backend.arg_info = intrinsic->flags;
1453 MONO_ADD_INS (cfg->cbb, ins);
1458 simd_version_name (guint32 version)
1461 case SIMD_VERSION_SSE1:
1463 case SIMD_VERSION_SSE2:
1465 case SIMD_VERSION_SSE3:
1467 case SIMD_VERSION_SSSE3:
1469 case SIMD_VERSION_SSE41:
1471 case SIMD_VERSION_SSE42:
1473 case SIMD_VERSION_SSE4a:
1480 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1482 const SimdIntrinsc * result = mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1484 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1487 if (IS_DEBUG_ON (cfg)) {
1489 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1490 max = fsig->param_count + fsig->hasthis;
1491 for (i = 0; i < max; ++i) {
1492 printf ("param %d: ", i);
1493 mono_print_ins (args [i]);
1496 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1497 if (IS_DEBUG_ON (cfg)) {
1499 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1500 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1501 if (result->simd_version_flags & (1 << x))
1502 printf ("%s ", simd_version_name (1 << x));
1509 switch (result->simd_emit_mode) {
1510 case SIMD_EMIT_BINARY:
1511 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1512 case SIMD_EMIT_UNARY:
1513 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1514 case SIMD_EMIT_SETTER:
1515 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1516 case SIMD_EMIT_GETTER:
1517 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1518 case SIMD_EMIT_GETTER_QWORD:
1519 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1520 case SIMD_EMIT_CTOR:
1521 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1522 case SIMD_EMIT_CAST:
1523 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1524 case SIMD_EMIT_SHUFFLE:
1525 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1526 case SIMD_EMIT_SHIFT:
1527 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1528 case SIMD_EMIT_EQUALITY:
1529 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1530 case SIMD_EMIT_LOAD_ALIGNED:
1531 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1532 case SIMD_EMIT_STORE:
1533 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1534 case SIMD_EMIT_EXTRACT_MASK:
1535 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1536 case SIMD_EMIT_PREFETCH:
1537 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1539 g_assert_not_reached ();
1543 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1547 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1549 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1550 mult_reg = alloc_preg (cfg);
1551 array_reg = arr->dreg;
1552 index_reg = index->dreg;
1554 #if SIZEOF_VOID_P == 8
1555 /* The array reg is 64 bits but the index reg is only 32 */
1556 index2_reg = alloc_preg (cfg);
1557 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1559 index2_reg = index_reg;
1561 index3_reg = alloc_preg (cfg);
1564 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1565 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1566 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1569 add_reg = alloc_preg (cfg);
1571 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1572 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1573 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
1574 ins->type = STACK_PTR;
1575 MONO_ADD_INS (cfg->cbb, ins);
1581 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1583 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1585 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1587 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1588 load->klass = cmethod->klass;
1590 load->type = STACK_VTYPE;
1591 load->dreg = alloc_ireg (cfg);
1592 MONO_ADD_INS (cfg->cbb, load);
1596 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1598 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1599 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1601 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1602 store->klass = cmethod->klass;
1604 store->sreg1 = vreg;
1605 MONO_ADD_INS (cfg->cbb, store);
1609 if (!strcmp ("IsAligned", cmethod->name)) {
1611 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1613 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1614 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1615 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1616 MONO_ADD_INS (cfg->cbb, ins);
1624 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1626 if (!strcmp ("get_AccelMode", cmethod->name)) {
1628 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1635 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1637 const char *class_name;
1639 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1642 class_name = cmethod->klass->name;
1643 if (!strcmp ("SimdRuntime", class_name))
1644 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1646 if (!strcmp ("ArrayExtensions", class_name))
1647 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1649 if (!strcmp ("VectorOperations", class_name)) {
1650 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1652 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1653 } else if (!cmethod->klass->simd_type)
1656 cfg->uses_simd_intrinsics = 1;
1657 if (!strcmp ("Vector2d", class_name))
1658 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1659 if (!strcmp ("Vector4f", class_name))
1660 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1661 if (!strcmp ("Vector2ul", class_name))
1662 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1663 if (!strcmp ("Vector2l", class_name))
1664 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1665 if (!strcmp ("Vector4ui", class_name))
1666 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1667 if (!strcmp ("Vector4i", class_name))
1668 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1669 if (!strcmp ("Vector8us", class_name))
1670 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1671 if (!strcmp ("Vector8s", class_name))
1672 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1673 if (!strcmp ("Vector16b", class_name))
1674 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1675 if (!strcmp ("Vector16sb", class_name))
1676 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));