 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
15 #include "mono/utils/bsearch.h"
18 General notes on SIMD intrinsics
20 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
21 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
22 TODO extend op_to_op_dest_membase to handle simd ops
23 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
25 TODO make sure locals, arguments and spills are properly aligned.
26 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
27 TODO add stuff to man pages
28 TODO document this under /docs
29 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
30 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
31 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
32 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
33 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
35 TODO check if we need to init the SSE control word with better precision.
36 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
37 TODO make SimdRuntime.get_AccelMode work under AOT
TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
39 TODO extend bounds checking code to support for range checking.
41 General notes for SIMD intrinsics.
43 -Bad extractor and constructor performance
44 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
45 It will be loaded in the FP stack just to be pushed on the call stack.
47 A similar thing happens with Vector4f constructor that require float vars to be
49 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
50 trip to the FP stack is desirable.
52 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
56 -Promote OP_EXTRACT_I4 to a STORE op
57 The advantage of this change is that it could have a _membase version and promote further optimizations.
59 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
63 #ifdef MONO_ARCH_SIMD_INTRINSICS
65 //#define IS_DEBUG_ON(cfg) (0)
67 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
68 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
74 SIMD_EMIT_GETTER_QWORD,
80 SIMD_EMIT_LOAD_ALIGNED,
82 SIMD_EMIT_EXTRACT_MASK,
86 #ifdef HAVE_ARRAY_ELEM_INIT
87 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
88 #define MSGSTRFIELD1(line) str##line
89 static const struct msgstr_t {
90 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
91 #include "simd-methods.h"
94 #define SIMD_METHOD(str,name) str,
95 #include "simd-methods.h"
100 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
101 #include "simd-methods.h"
103 #define method_name(idx) ((const char*)&method_names + (idx))
106 #define SIMD_METHOD(str,name) str,
107 static const char * const method_names [] = {
108 #include "simd-methods.h"
112 #define SIMD_METHOD(str,name) name,
114 #include "simd-methods.h"
118 #define method_name(idx) (method_names [(idx)])
125 guint8 simd_version_flags;
126 guint8 simd_emit_mode : 4;
130 static const SimdIntrinsc vector4f_intrinsics[] = {
131 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
132 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
133 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
134 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
135 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
136 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
137 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
138 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
139 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
140 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
141 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
142 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
143 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
144 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
145 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
146 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
147 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
148 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
149 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
150 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
151 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
152 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
153 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
154 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
155 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
156 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
157 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
158 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
159 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
160 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
161 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
162 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
163 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
164 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
165 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
166 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
167 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
168 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
169 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
170 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
171 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
172 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
173 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
174 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
175 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
176 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
177 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
178 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
179 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
180 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
181 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
184 static const SimdIntrinsc vector2d_intrinsics[] = {
185 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
186 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
187 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
188 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
189 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
190 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
191 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
192 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
193 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
194 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
195 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
196 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
197 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
198 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
199 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
200 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
201 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
202 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
203 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
204 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
205 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
206 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
207 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
208 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
209 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
210 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
211 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
212 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
213 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
214 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
215 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
216 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
217 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
219 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
220 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
221 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
222 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
223 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
224 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
225 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
228 static const SimdIntrinsc vector2ul_intrinsics[] = {
229 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
230 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
231 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
232 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
233 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
234 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
235 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
236 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
237 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
238 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
239 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
240 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
241 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
242 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
243 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
244 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
245 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
246 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
247 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
248 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
249 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
250 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
251 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
252 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector2l (2 x long).
 * Row layout: managed method name (SN_*), JIT opcode or immediate (lane
 * index for getters/setters), minimum SSE version, emit strategy, optional
 * extra flag (prefetch mode).
 * NOTE(review): rows appear to be kept sorted by method name, presumably
 * for binary search (bsearch.h is included above) — keep sorted.
 */
static const SimdIntrinsc vector2l_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector4ui (4 x uint).
 * Row layout: managed method name (SN_*), JIT opcode or immediate (lane
 * index for getters/setters), minimum SSE version, emit strategy, optional
 * extra flag (comparison kind, prefetch mode).
 * NOTE(review): rows appear to be kept sorted by method name, presumably
 * for binary search (bsearch.h is included above) — keep sorted.
 */
static const SimdIntrinsc vector4ui_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector4i (4 x int).
 * Row layout: managed method name (SN_*), JIT opcode or immediate (lane
 * index for getters/setters), minimum SSE version, emit strategy, optional
 * extra flag (comparison kind, prefetch mode).
 * NOTE(review): rows appear to be kept sorted by method name, presumably
 * for binary search (bsearch.h is included above) — keep sorted.
 */
static const SimdIntrinsc vector4i_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
362 static const SimdIntrinsc vector8us_intrinsics[] = {
363 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
364 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
365 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
366 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
367 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
368 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
369 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
370 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
371 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
372 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
373 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
374 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
375 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
376 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
377 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
378 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
379 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
380 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
381 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
382 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
383 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
384 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
385 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
386 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
387 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
388 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
389 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
390 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
391 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
392 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
393 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
395 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
396 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
397 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
398 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
399 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
400 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
401 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
402 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
403 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
404 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
405 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
406 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
407 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
408 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
409 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
410 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
413 static const SimdIntrinsc vector8s_intrinsics[] = {
414 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
415 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
416 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
417 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
418 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
419 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
420 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
421 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
422 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
423 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
424 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
425 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
426 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
427 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
428 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
429 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
430 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
431 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
432 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
436 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
437 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
438 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
439 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
440 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
441 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
442 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
443 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
444 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
446 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
447 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
448 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
449 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
450 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
451 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
452 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
453 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
454 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
455 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
456 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
457 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
458 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
459 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
460 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
461 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector16b (16 x byte, unsigned).
 * Row layout: managed method name (SN_*), JIT opcode or immediate (lane
 * index for getters/setters), minimum SSE version, emit strategy, optional
 * extra flag (comparison kind, prefetch mode).
 * NOTE(review): rows appear to be kept sorted by method name, presumably
 * for binary search (bsearch.h is included above) — note get_V10..get_V15
 * sort before get_V2 (string order, not numeric); keep sorted.
 */
static const SimdIntrinsc vector16b_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/* Intrinsic table for Mono.Simd.Vector16sb (16 signed bytes).
 * INVARIANT: entries must stay sorted by method name, because
 * emit_intrinsics () looks them up with mono_binary_search () using a
 * strcmp-based comparator (simd_intrinsic_compare_by_name). */
528 static const SimdIntrinsc vector16sb_intrinsics[] = {
529 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
530 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
531 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
532 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
533 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
534 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
/* Min/Max of signed bytes need PMINSB/PMAXSB, which are SSE4.1-only. */
535 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
536 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
537 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
538 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
539 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
540 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
541 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
542 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
543 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
544 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* Element accessors: the second field is the element index, not an opcode. */
545 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
546 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
547 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
548 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
549 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
550 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
551 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
552 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
553 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
554 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
555 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
556 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
557 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
560 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
561 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
562 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
563 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* Equality and inequality share PCMPEQB; the flags field tells
 * simd_intrinsic_emit_equality which comparison result to synthesize. */
564 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
565 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
566 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
567 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
568 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
569 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
570 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
571 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
572 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
573 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
574 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
575 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
576 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
577 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
578 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
579 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
580 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
581 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
584 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
587 static guint32 simd_supported_versions;
589 /*TODO match using number of parameters as well*/
/* bsearch comparator used by emit_intrinsics (): 'key' is the managed
 * method name being looked up, 'value' points into one of the
 * name-sorted SimdIntrinsc tables above. */
591 simd_intrinsic_compare_by_name (const void *key, const void *value)
593 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
/* Per-vreg flags tracked by mono_simd_simplify_indirection (the enum's
 * first members, including VREG_USED referenced below, are outside the
 * visible span of this listing). */
598 VREG_HAS_XZERO_BB0 = 0x02,
599 VREG_HAS_OTHER_OP_BB0 = 0x04,
600 VREG_SINGLE_BB_USE = 0x08,
601 VREG_MANY_BB_USE = 0x10,
/* One-time startup hook: asks the architecture backend which SIMD
 * instruction-set versions (SSE1..SSE4a) the current CPU supports and
 * caches the bitmask for the version checks in emit_intrinsics (). */
605 mono_simd_intrinsics_init (void)
607 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
608 /*TODO log the supported flags*/
611 static inline gboolean
/* Record that 'reg' has some operation other than an XZERO in the first
 * basic block: clears VREG_HAS_XZERO_BB0 and sets VREG_HAS_OTHER_OP_BB0.
 * Only tracked vregs (in range and already flagged in vreg_flags) are
 * touched.  NOTE(review): the return statements fall outside the visible
 * lines; presumably TRUE when the flags were updated — confirm. */
612 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
614 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
615 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
616 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
617 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
623 static inline gboolean
/* For a vreg that was "xzero only" in bb0, track in which later basic
 * blocks it is used: the first bb that uses it marks VREG_SINGLE_BB_USE
 * and remembers the bb in target_bb; a use from a second, different bb
 * upgrades the flag to VREG_MANY_BB_USE (disqualifying the optimization). */
624 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
/* Ignore untracked vregs and repeated uses inside the already-recorded bb. */
626 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
629 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
630 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
631 vreg_flags [reg] |= VREG_MANY_BB_USE;
632 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
634 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
635 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
636 target_bb [reg] = bb;
637 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
644 This pass recalculate which vars need MONO_INST_INDIRECT.
646 We cannot do this for non SIMD vars since code like mono_get_vtable_var
647 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
/*
 * Pass outline:
 *  1) clear MONO_INST_INDIRECT on every simd-typed var, then re-set it only
 *     for vars whose address is really taken (OP_LDADDR);
 *  2) scan the first bb for vars whose only bb0 operation is an OP_XZERO;
 *  3) scan the remaining bbs to find which of those vars are used in exactly
 *     one bb (target_bb remembers which);
 *  4) for each such var, re-materialize the XZERO right before its first use
 *     in that bb — unless the first use overwrites the value anyway — and
 *     finally drop the now-redundant XZERO from bb0.
 */
650 mono_simd_simplify_indirection (MonoCompile *cfg)
653 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
657 for (i = 0; i < cfg->num_varinfo; i++) {
658 MonoInst *var = cfg->varinfo [i];
659 if (var->klass->simd_type) {
660 var->flags &= ~MONO_INST_INDIRECT;
661 max_vreg = MAX (var->dreg, max_vreg);
665 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
666 if (!first_bb && bb->code)
668 for (ins = bb->code; ins; ins = ins->next) {
669 if (ins->opcode == OP_LDADDR) {
670 MonoInst *var = (MonoInst*)ins->inst_p0;
671 if (var->klass->simd_type) {
672 var->flags |= MONO_INST_INDIRECT;
678 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
/* one flag byte per vreg, plus the bb where each single-bb var is used */
679 vreg_flags = g_malloc0 (max_vreg + 1);
680 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
682 for (i = 0; i < cfg->num_varinfo; i++) {
683 MonoInst *var = cfg->varinfo [i];
684 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
685 vreg_flags [var->dreg] = VREG_USED;
686 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
690 /*Scan the first basic block looking xzeros not used*/
691 for (ins = first_bb->code; ins; ins = ins->next) {
693 int sregs [MONO_MAX_SRC_REGS];
695 if (ins->opcode == OP_XZERO) {
696 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
697 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
698 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* any other def/use of a tracked vreg in bb0 disqualifies it */
702 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
704 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
706 num_sregs = mono_inst_get_src_registers (ins, sregs);
707 for (i = 0; i < num_sregs; ++i) {
708 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
713 if (IS_DEBUG_ON (cfg)) {
714 for (i = 0; i < cfg->num_varinfo; i++) {
715 MonoInst *var = cfg->varinfo [i];
716 if (var->klass->simd_type) {
717 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
718 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
719 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
720 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
725 /*TODO stop here if no var is xzero only*/
728 Scan all other bb and check if it has only one other use
729 Ideally this would be done after an extended bb formation pass
731 FIXME This pass could use dominator information to properly
732 place the XZERO on the bb that dominates all uses of the var,
733 but this will have zero effect with the current local reg alloc
735 TODO simply the use of flags.
738 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
739 for (ins = bb->code; ins; ins = ins->next) {
741 int sregs [MONO_MAX_SRC_REGS];
743 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
745 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
747 num_sregs = mono_inst_get_src_registers (ins, sregs);
748 for (i = 0; i < num_sregs; ++i) {
749 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
750 max_vreg, vreg_flags, target_bb))
756 for (i = 0; i < cfg->num_varinfo; i++) {
757 MonoInst *var = cfg->varinfo [i];
758 if (!var->klass->simd_type)
760 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
761 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
762 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
763 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
765 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
767 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
769 int sregs [MONO_MAX_SRC_REGS];
770 gboolean found = FALSE;
772 num_sregs = mono_inst_get_src_registers (ins, sregs);
773 for (j = 0; j < num_sregs; ++j) {
/* FIX: the loop variable here is 'j'; the previous code indexed with the
 * stale outer-loop variable 'i', so the first use of the var was never
 * reliably detected. */
774 if (sregs [j] == var->dreg)
777 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
778 if (ins->dreg == var->dreg && !found) {
782 MONO_INST_NEW (cfg, tmp, OP_XZERO);
783 tmp->dreg = var->dreg;
784 tmp->type = STACK_VTYPE;
785 tmp->klass = var->klass;
786 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* finally, delete the original bb0 XZERO for every var we moved */
792 for (ins = first_bb->code; ins; ins = ins->next) {
793 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
802 * This function expect that src be a value.
/* Returns the xmm vreg holding 'src'. Accepts either an OP_XMOVE (forward
 * to its source) or a vtype value; anything else is a JIT bug and aborts. */
805 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
807 if (src->opcode == OP_XMOVE) {
809 } else if (src->type == STACK_VTYPE) {
812 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
813 mono_print_ins (src);
814 g_assert_not_reached ();
818 * This function will load the value if needed.
/* Like get_simd_vreg, but also handles address-typed sources: an OP_LDADDR
 * is resolved to the underlying var's vreg ('indirect' reports this to the
 * caller), and a raw pointer is dereferenced by emitting OP_LOADX_MEMBASE. */
821 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
825 if (src->opcode == OP_XMOVE) {
827 } else if (src->opcode == OP_LDADDR) {
828 int res = ((MonoInst*)src->inst_p0)->dreg;
831 } else if (src->type == STACK_VTYPE) {
833 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* pointer source: emit an explicit vector load from [src->dreg] */
838 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
839 ins->klass = cmethod->klass;
840 ins->sreg1 = src->dreg;
841 ins->type = STACK_VTYPE;
842 ins->dreg = alloc_ireg (cfg);
843 MONO_ADD_INS (cfg->cbb, ins);
846 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
847 mono_print_ins (src);
848 g_assert_not_reached ();
/* Lazily creates the 4-byte stack slot used to shuffle values between the
 * integer/xmm register files (e.g. float extraction); VOLATILE keeps the
 * register allocator from caching it. */
852 get_int_to_float_spill_area (MonoCompile *cfg)
854 if (!cfg->iconv_raw_var) {
855 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
856 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
858 return cfg->iconv_raw_var;
861 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily creates the 8-byte stack slot used to spill doubles across the
 * integer/xmm boundary; shared with the fconv_to_r8_x lowering. */
863 get_double_spill_area (MonoCompile *cfg)
865 if (!cfg->fconv_to_r8_x_var) {
866 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
867 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
869 return cfg->fconv_to_r8_x_var;
/* Lazily creates a full-vector stack temporary used by the element-wise
 * .ctor path when the result cannot be built directly in its destination. */
872 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
874 if (!cfg->simd_ctor_var) {
875 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
876 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
878 return cfg->simd_ctor_var;
/* Maps a scalar element type to the OP_EXPAND_* opcode that broadcasts it
 * into every lane of an xmm register (cases elided in this listing). */
882 mono_type_to_expand_op (MonoType *type)
884 switch (type->type) {
902 g_assert_not_reached ();
/* For a binary intrinsic operand: if the parameter at 'position' is itself
 * a SIMD type just fetch its vreg; otherwise it is a scalar, so emit an
 * OP_EXPAND_* broadcasting it across the vector (this is how mixed forms
 * like op_Addition (Vector4f, float) are supported). */
907 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, int position)
910 MonoMethodSignature *sig = mono_method_signature (cmethod);
913 g_assert (sig->param_count == 2);
914 g_assert (position == 0 || position == 1);
916 if (mono_class_from_mono_type (sig->params [position])->simd_type)
917 return get_simd_vreg (cfg, cmethod, src);
919 expand_op = mono_type_to_expand_op (sig->params [position]);
920 MONO_INST_NEW (cfg, ins, expand_op);
921 ins->klass = cmethod->klass;
922 ins->sreg1 = src->dreg;
923 ins->type = STACK_VTYPE;
924 ins->dreg = alloc_ireg (cfg);
925 MONO_ADD_INS (cfg->cbb, ins);
/* float/double expansion crosses register files and needs a spill slot */
927 if (expand_op == OP_EXPAND_R4)
928 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
929 else if (expand_op == OP_EXPAND_R8)
930 ins->backend.spill_var = get_double_spill_area (cfg);
/* Emits a two-operand SIMD instruction (SIMD_EMIT_BINARY): both args are
 * coerced to xmm vregs (scalars get broadcast), then a single instruction
 * with the table's opcode is appended; flags ride along in inst_c0. */
936 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
939 int left_vreg, right_vreg;
941 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [0], 0);
942 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [1], 1);
945 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
946 ins->klass = cmethod->klass;
947 ins->sreg1 = left_vreg;
948 ins->sreg2 = right_vreg;
949 ins->type = STACK_VTYPE;
950 ins->dreg = alloc_ireg (cfg);
951 ins->inst_c0 = intrinsic->flags;
952 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a one-operand SIMD instruction (SIMD_EMIT_UNARY) using the
 * table's opcode on the vreg of args [0]. */
957 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
962 vreg = get_simd_vreg (cfg, cmethod, args [0]);
964 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
965 ins->klass = cmethod->klass;
967 ins->type = STACK_VTYPE;
968 ins->dreg = alloc_ireg (cfg);
969 MONO_ADD_INS (cfg->cbb, ins);
/* Maps an element type to the OP_EXTRACT_* opcode that pulls one lane out
 * of an xmm register into an integer register (case labels elided in this
 * listing; sub-int sizes keep signed/unsigned variants distinct). */
974 mono_type_to_extract_op (MonoType *type)
976 switch (type->type) {
978 return OP_EXTRACT_I1;
980 return OP_EXTRACT_U1;
982 return OP_EXTRACT_I2;
984 return OP_EXTRACT_U2;
988 return OP_EXTRACT_I4;
990 g_assert_not_reached ();
994 /*Returns the amount to shift the element index to get the dword it belongs to*/
/* E.g. for byte elements the shift converts a 0-15 lane index into the
 * 0-3 dword index used by PSHUFD-style getters (cases elided here). */
996 mono_type_elements_shift_bits (MonoType *type)
998 switch (type->type) {
1010 g_assert_not_reached ();
1014 static G_GNUC_UNUSED int
/* Maps an element type to the direct OP_INSERT_* opcode (single-instruction
 * lane insert); used on the LLVM path in simd_intrinsic_emit_setter. */
1015 mono_type_to_insert_op (MonoType *type)
1017 switch (type->type) {
1020 return OP_INSERT_I1;
1023 return OP_INSERT_I2;
1026 return OP_INSERT_I4;
1029 return OP_INSERT_I8;
1031 return OP_INSERT_R4;
1033 return OP_INSERT_R8;
1035 g_assert_not_reached ();
/* Maps an element type to the *_SLOW multi-instruction insert sequence used
 * on the non-LLVM path. 16-bit inserts stay OP_INSERT_I2 because a direct
 * word insert (PINSRW) is available in baseline SSE2. */
1040 mono_type_to_slow_insert_op (MonoType *type)
1042 switch (type->type) {
1045 return OP_INSERTX_U1_SLOW;
1048 return OP_INSERT_I2;
1051 return OP_INSERTX_I4_SLOW;
1054 return OP_INSERTX_I8_SLOW;
1056 return OP_INSERTX_R4_SLOW;
1058 return OP_INSERTX_R8_SLOW;
1060 g_assert_not_reached ();
/* Emits a set_Vn element store. Three strategies by element size/backend:
 *  - LLVM: direct OP_INSERT_*;
 *  - 2/4/8-byte elements: the *_SLOW insert sequence (R4/R8 also need a
 *    spill slot to cross the int/xmm register files);
 *  - 1-byte elements: extract the containing 16-bit word, patch the byte,
 *    insert the word back (two OP_*X_U1/U2 steps below).
 * If the vector came in by address (indirect), the updated register is
 * stored back through the original pointer at the end. */
1065 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1068 MonoMethodSignature *sig = mono_method_signature (cmethod);
1073 size = mono_type_size (sig->params [0], &align);
1075 if (COMPILE_LLVM (cfg)) {
1076 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1077 ins->klass = cmethod->klass;
1078 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1079 ins->sreg2 = args [1]->dreg;
1080 ins->inst_c0 = intrinsic->opcode;
1081 MONO_ADD_INS (cfg->cbb, ins);
1082 } else if (size == 2 || size == 4 || size == 8) {
1083 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1084 ins->klass = cmethod->klass;
1085 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1086 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1087 ins->sreg2 = args [1]->dreg;
1088 ins->inst_c0 = intrinsic->opcode;
1089 if (sig->params [0]->type == MONO_TYPE_R4)
1090 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1091 else if (sig->params [0]->type == MONO_TYPE_R8)
1092 ins->backend.spill_var = get_double_spill_area (cfg);
1093 MONO_ADD_INS (cfg->cbb, ins);
/* byte path: pull out the 16-bit word containing the target byte... */
1097 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1098 ins->klass = cmethod->klass;
1099 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1100 ins->type = STACK_I4;
1101 ins->dreg = vreg = alloc_ireg (cfg);
1102 ins->inst_c0 = intrinsic->opcode / 2; /* byte index -> word index */
1103 MONO_ADD_INS (cfg->cbb, ins);
/* ...then splice the new byte in and reinsert the word */
1105 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1106 ins->klass = cmethod->klass;
1108 ins->sreg2 = args [1]->dreg;
1110 ins->inst_c0 = intrinsic->opcode;
1111 MONO_ADD_INS (cfg->cbb, ins);
/* write the modified vector back through the caller's pointer */
1115 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1116 ins->klass = cmethod->klass;
1117 ins->dreg = args [0]->dreg;
1119 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a get_Vn element read. The table's "opcode" field is really the
 * element index: its high bits (>> shift_bits) select a PSHUFD shuffle that
 * brings the containing dword to lane 0 (non-LLVM only), and the low bits
 * address the element within that dword for the OP_EXTRACT_*. Float results
 * are finally re-materialized on the FP side via OP_ICONV_TO_R8_RAW. */
1128 MonoMethodSignature *sig = mono_method_signature (cmethod);
1129 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1131 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1133 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1134 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1135 ins->klass = cmethod->klass;
1137 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1138 ins->type = STACK_VTYPE;
1139 ins->dreg = vreg = alloc_ireg (cfg);
1140 MONO_ADD_INS (cfg->cbb, ins);
1143 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1144 ins->klass = cmethod->klass;
1146 ins->type = STACK_I4;
1147 ins->dreg = vreg = alloc_ireg (cfg);
1148 if (cfg->compile_llvm)
1149 ins->inst_c0 = intrinsic->opcode;
1151 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1152 MONO_ADD_INS (cfg->cbb, ins);
/* floats: route the raw bits through the spill slot back to the FP stack */
1154 if (sig->ret->type == MONO_TYPE_R4) {
1155 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1156 ins->klass = mono_defaults.single_class;
1158 ins->type = STACK_R8;
1159 ins->dreg = alloc_freg (cfg);
1160 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1161 MONO_ADD_INS (cfg->cbb, ins);
/* 64-bit element getter (SIMD_EMIT_GETTER_QWORD): extracts a long or a
 * double lane; the double variant needs the shared 8-byte spill slot. */
1167 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1171 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1173 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1175 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1176 ins->klass = cmethod->klass;
1178 ins->inst_c0 = intrinsic->opcode; /* lane index */
1180 ins->type = STACK_R8;
1181 ins->dreg = alloc_freg (cfg);
1182 ins->backend.spill_var = get_double_spill_area (cfg);
1184 ins->type = STACK_I8;
1185 ins->dreg = alloc_lreg (cfg);
1187 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a vector .ctor. Single-argument form broadcasts the scalar with the
 * table's OP_EXPAND_* opcode; the n-argument form stores each scalar into a
 * stack area (the ctor spill var, or directly through the target pointer)
 * and reloads the whole vector with OP_LOADX_MEMBASE. When 'this' arrived
 * as an OP_LDADDR of a local, the address-take is nullified so the value
 * can live in a register. */
1193 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1195 MonoInst *ins = NULL;
1197 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1198 MonoMethodSignature *sig = mono_method_signature (cmethod);
1199 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1200 int arg_size = mono_type_size (sig->params [0], &i);
1202 if (sig->param_count == 1) {
1206 dreg = args [0]->inst_i0->dreg;
1207 NULLIFY_INS (args [0]);
1209 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1210 dreg = alloc_ireg (cfg);
1213 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1214 ins->klass = cmethod->klass;
1215 ins->sreg1 = args [1]->dreg;
1216 ins->type = STACK_VTYPE;
1219 MONO_ADD_INS (cfg->cbb, ins);
/* R4/R8 expansion crosses register files and needs a spill slot */
1220 if (sig->params [0]->type == MONO_TYPE_R4)
1221 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1222 else if (sig->params [0]->type == MONO_TYPE_R8)
1223 ins->backend.spill_var = get_double_spill_area (cfg);
1226 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1227 ins->dreg = args [0]->dreg;
1229 MONO_ADD_INS (cfg->cbb, ins);
/* n-argument form: pick a staging address (spill area for locals,
 * the caller's pointer otherwise) */
1235 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1236 MONO_ADD_INS (cfg->cbb, ins);
1237 addr_reg = ins->dreg;
1239 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1240 addr_reg = args [0]->dreg;
/* store the scalars back-to-front, then reload as one vector */
1243 for (i = sig->param_count - 1; i >= 0; --i) {
1244 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1247 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1248 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1249 NULLIFY_INS (args [0]);
1251 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1252 ins->klass = cmethod->klass;
1253 ins->sreg1 = addr_reg;
1254 ins->type = STACK_VTYPE;
1256 MONO_ADD_INS (cfg->cbb, ins);
/* op_Explicit between vector types: a pure reinterpretation, emitted as an
 * OP_XMOVE with the destination class — no data conversion happens. */
1262 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1267 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1269 //TODO macroize this
1270 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1271 ins->klass = cmethod->klass;
1272 ins->type = STACK_VTYPE;
1274 ins->dreg = alloc_ireg (cfg);
1275 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a packed shift. A constant shift count becomes an immediate on the
 * table's opcode; a variable count is first moved into an xmm register via
 * OP_ICONV_TO_X and the opcode is bumped to its shift-by-register twin
 * (which by convention is the immediately following opcode value). */
1280 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1283 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1285 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1287 if (args [1]->opcode != OP_ICONST) {
1288 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1289 ins->klass = mono_defaults.int32_class;
1290 ins->sreg1 = args [1]->dreg;
1291 ins->type = STACK_I4;
1292 ins->dreg = vreg2 = alloc_ireg (cfg);
1293 MONO_ADD_INS (cfg->cbb, ins);
1295 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1298 MONO_INST_NEW (cfg, ins, opcode);
1299 ins->klass = cmethod->klass;
/* constant count: fold it into the instruction and kill the ICONST */
1303 if (args [1]->opcode == OP_ICONST) {
1304 ins->inst_imm = args [1]->inst_c0;
1305 NULLIFY_INS (args [1]);
1308 ins->type = STACK_VTYPE;
1309 ins->dreg = alloc_ireg (cfg);
1310 MONO_ADD_INS (cfg->cbb, ins);
1314 static inline gboolean
/* TRUE for the integer packed-compare opcode range; relies on the
 * OP_PCMPEQB..OP_PCMPEQQ values being contiguous in the opcode enum. */
1315 mono_op_is_packed_compare (int op)
1317 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/* Emits op_Equality / op_Inequality: packed-compare the two vectors, collapse
 * the per-lane result into a 16-bit mask with OP_EXTRACT_MASK, then turn the
 * mask into a boolean. Integer compares produce all-ones on equality, so EQ
 * tests mask == 0xFFFF; the FP compares here use a not-equal predicate, so
 * NEQ tests mask != 0 instead (OR semantics across lanes). */
1321 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1324 int left_vreg, right_vreg, tmp_vreg;
1326 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1327 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1330 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1331 ins->klass = cmethod->klass;
1332 ins->sreg1 = left_vreg;
1333 ins->sreg2 = right_vreg;
1334 ins->type = STACK_VTYPE;
/* FIX: removed a duplicated "ins->klass = cmethod->klass;" assignment that
 * appeared again here. */
1336 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1337 ins->inst_c0 = intrinsic->flags;
1338 MONO_ADD_INS (cfg->cbb, ins);
1340 /*FIXME the next ops are SSE specific*/
1341 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1342 ins->klass = cmethod->klass;
1343 ins->sreg1 = tmp_vreg;
1344 ins->type = STACK_I4;
1345 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1346 MONO_ADD_INS (cfg->cbb, ins);
1348 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1349 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
1350 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1351 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1353 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1354 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1356 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a shuffle. The literal shuffle-control argument (last parameter)
 * must be an OP_ICONST; it is folded into inst_c0 and nullified. The
 * three-argument form shuffles across two vectors, and when the table
 * opcode was the single-vector OP_PSHUFLED it is retargeted to OP_SHUFPS. */
1362 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1365 int vreg, vreg2 = -1;
1366 int param_count = mono_method_signature (cmethod)->param_count;
1368 if (args [param_count - 1]->opcode != OP_ICONST) {
1369 /*TODO Shuffle with non literals is not yet supported */
1373 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1374 if (param_count == 3)
1375 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1377 NULLIFY_INS (args [param_count - 1]);
1380 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1381 ins->klass = cmethod->klass;
1384 ins->inst_c0 = args [param_count - 1]->inst_c0;
1385 ins->type = STACK_VTYPE;
1386 ins->dreg = alloc_ireg (cfg);
1387 MONO_ADD_INS (cfg->cbb, ins);
1389 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1390 ins->opcode = OP_SHUFPS;
/* LoadAligned (ptr): emits an aligned 16-byte vector load from the
 * address in args [0]. */
1395 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1399 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1400 ins->klass = cmethod->klass;
1401 ins->sreg1 = args [0]->dreg;
1402 ins->type = STACK_VTYPE;
1403 ins->dreg = alloc_ireg (cfg);
1404 MONO_ADD_INS (cfg->cbb, ins);
/* StoreAligned (ptr, vector): stores the vector in args [1] through the
 * address in args [0]; for STOREX-style membase ops dreg is the base
 * address register. */
1409 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1414 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1416 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1417 ins->klass = cmethod->klass;
1418 ins->dreg = args [0]->dreg;
1420 ins->type = STACK_VTYPE;
1421 MONO_ADD_INS (cfg->cbb, ins);
/* ExtractByteMask: collapses the sign bits of every byte lane into an
 * int (PMOVMSKB-style) via OP_EXTRACT_MASK. */
1426 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1431 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1433 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1434 ins->klass = cmethod->klass;
1436 ins->type = STACK_I4;
1437 ins->dreg = alloc_ireg (cfg);
1438 MONO_ADD_INS (cfg->cbb, ins);
/* Prefetch* (ptr): emits OP_PREFETCH_MEMBASE; the table's flags field
 * selects the prefetch hint (T0/T1/T2/NTA) via backend.arg_info. */
1444 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1448 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1449 ins->klass = cmethod->klass;
1450 ins->sreg1 = args [0]->dreg;
1451 ins->backend.arg_info = intrinsic->flags;
1452 MONO_ADD_INS (cfg->cbb, ins);
/* Human-readable name for a single SIMD_VERSION_* flag; used only for the
 * debug diagnostics in emit_intrinsics (return strings elided in listing). */
1457 simd_version_name (guint32 version)
1460 case SIMD_VERSION_SSE1:
1462 case SIMD_VERSION_SSE2:
1464 case SIMD_VERSION_SSE3:
1466 case SIMD_VERSION_SSSE3:
1468 case SIMD_VERSION_SSE41:
1470 case SIMD_VERSION_SSE42:
1472 case SIMD_VERSION_SSE4a:
/* Central dispatcher: binary-searches the (name-sorted) per-type table for
 * cmethod's name, rejects intrinsics whose required SIMD version the CPU
 * lacks, then dispatches on simd_emit_mode to the matching emitter above. */
1479 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1481 const SimdIntrinsc * result = mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1483 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1486 if (IS_DEBUG_ON (cfg)) {
1488 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1489 max = fsig->param_count + fsig->hasthis;
1490 for (i = 0; i < max; ++i) {
1491 printf ("param %d: ", i);
1492 mono_print_ins (args [i]);
/* bail out (fall back to managed code) if the CPU lacks every ISA the
 * intrinsic can be implemented with */
1495 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1496 if (IS_DEBUG_ON (cfg)) {
1498 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1499 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1500 if (result->simd_version_flags & (1 << x))
1501 printf ("%s ", simd_version_name (1 << x));
1508 switch (result->simd_emit_mode) {
1509 case SIMD_EMIT_BINARY:
1510 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1511 case SIMD_EMIT_UNARY:
1512 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1513 case SIMD_EMIT_SETTER:
1514 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1515 case SIMD_EMIT_GETTER:
1516 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1517 case SIMD_EMIT_GETTER_QWORD:
1518 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1519 case SIMD_EMIT_CTOR:
1520 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1521 case SIMD_EMIT_CAST:
1522 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1523 case SIMD_EMIT_SHUFFLE:
1524 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1525 case SIMD_EMIT_SHIFT:
1526 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1527 case SIMD_EMIT_EQUALITY:
1528 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1529 case SIMD_EMIT_LOAD_ALIGNED:
1530 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1531 case SIMD_EMIT_STORE:
1532 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1533 case SIMD_EMIT_EXTRACT_MASK:
1534 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1535 case SIMD_EMIT_PREFETCH:
1536 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1538 g_assert_not_reached ();
/* Computes the address of array element 'index' for the vector array
 * extension methods, returning the vreg holding &arr->vector [index].
 * With check_bounds, it bounds-checks both the first element and the last
 * element the 16-byte vector access will touch (index + 16/size - 1). */
1542 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1546 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1548 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1549 mult_reg = alloc_preg (cfg);
1550 array_reg = arr->dreg;
1551 index_reg = index->dreg;
1553 #if SIZEOF_VOID_P == 8
1554 /* The array reg is 64 bits but the index reg is only 32 */
1555 index2_reg = alloc_preg (cfg);
1556 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1558 index2_reg = index_reg;
1560 index3_reg = alloc_preg (cfg);
1563 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1564 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1565 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
/* addr = array + index * size + offsetof (MonoArray, vector) */
1568 add_reg = alloc_preg (cfg);
1570 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1571 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1572 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
1573 ins->type = STACK_PTR;
1574 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsics for Mono.Simd.ArrayExtensions: GetVector[Aligned] loads a
 * vector from an array element address, SetVector[Aligned] stores one, and
 * IsAligned tests whether the element address is 16-byte aligned. */
1580 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1582 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1584 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1586 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1587 load->klass = cmethod->klass;
1589 load->type = STACK_VTYPE;
1590 load->dreg = alloc_ireg (cfg);
1591 MONO_ADD_INS (cfg->cbb, load);
1595 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1597 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1598 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1600 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1601 store->klass = cmethod->klass;
1603 store->sreg1 = vreg;
1604 MONO_ADD_INS (cfg->cbb, store);
1608 if (!strcmp ("IsAligned", cmethod->name)) {
/* no bounds check needed: only the address's low bits are inspected */
1610 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1612 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1613 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1614 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1615 MONO_ADD_INS (cfg->cbb, ins);
/* SimdRuntime.get_AccelMode: folds the CPU's supported-versions bitmask
 * into an ICONST at JIT time (note: this bakes the value into AOT images —
 * see the AOT TODO at the top of the file). */
1623 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1625 if (!strcmp ("get_AccelMode", cmethod->name)) {
1627 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1634 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1636 const char *class_name;
1638 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1641 class_name = cmethod->klass->name;
1642 if (!strcmp ("SimdRuntime", class_name))
1643 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1645 if (!strcmp ("ArrayExtensions", class_name))
1646 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1648 if (!strcmp ("VectorOperations", class_name)) {
1649 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1651 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1652 } else if (!cmethod->klass->simd_type)
1655 cfg->uses_simd_intrinsics = 1;
1656 if (!strcmp ("Vector2d", class_name))
1657 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1658 if (!strcmp ("Vector4f", class_name))
1659 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1660 if (!strcmp ("Vector2ul", class_name))
1661 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1662 if (!strcmp ("Vector2l", class_name))
1663 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1664 if (!strcmp ("Vector4ui", class_name))
1665 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1666 if (!strcmp ("Vector4i", class_name))
1667 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1668 if (!strcmp ("Vector8us", class_name))
1669 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1670 if (!strcmp ("Vector8s", class_name))
1671 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1672 if (!strcmp ("Vector16b", class_name))
1673 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1674 if (!strcmp ("Vector16sb", class_name))
1675 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));