2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
17 General notes on SIMD intrinsics
19 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
23 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
33 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
38 TODO extend bounds checking code to support for range checking.
40 General notes for SIMD intrinsics.
42 -Bad extractor and constructor performance
43 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
44 It will be loaded in the FP stack just to be pushed on the call stack.
46 A similar thing happens with Vector4f constructor that require float vars to be
48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.
51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and promote further optimizations.
58 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
62 #ifdef MONO_ARCH_SIMD_INTRINSICS
64 //#define IS_DEBUG_ON(cfg) (0)
66 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
67 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
73 SIMD_EMIT_GETTER_QWORD,
79 SIMD_EMIT_LOAD_ALIGNED,
81 SIMD_EMIT_EXTRACT_MASK,
85 #ifdef HAVE_ARRAY_ELEM_INIT
86 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
87 #define MSGSTRFIELD1(line) str##line
88 static const struct msgstr_t {
89 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
90 #include "simd-methods.h"
93 #define SIMD_METHOD(str,name) str,
94 #include "simd-methods.h"
99 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
100 #include "simd-methods.h"
102 #define method_name(idx) ((const char*)&method_names + (idx))
105 #define SIMD_METHOD(str,name) str,
106 static const char * const method_names [] = {
107 #include "simd-methods.h"
111 #define SIMD_METHOD(str,name) name,
113 #include "simd-methods.h"
117 #define method_name(idx) (method_names [(idx)])
124 guint8 simd_version_flags;
125 guint8 simd_emit_mode : 4;
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
132 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
133 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
141 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
142 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
143 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
144 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
145 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
146 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
147 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
148 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
149 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
150 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
151 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
152 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
153 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
154 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
155 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
156 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
157 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
158 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
159 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
160 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
161 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
162 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
163 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
164 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
165 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
166 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
167 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
168 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
169 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
170 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
171 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
172 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
173 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
174 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
175 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
176 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
177 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
178 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
179 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
180 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
183 static const SimdIntrinsc vector2d_intrinsics[] = {
184 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
185 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
186 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
187 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
188 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
189 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
190 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
191 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
192 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
193 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
194 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
195 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
196 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
197 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
198 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
199 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
200 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
201 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
202 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
203 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
204 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
205 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
206 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
207 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
208 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
209 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
210 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
211 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
212 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
213 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
214 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
215 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
217 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
219 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
220 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
221 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
222 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
223 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
224 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
227 static const SimdIntrinsc vector2ul_intrinsics[] = {
228 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
229 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
230 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
231 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
232 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
233 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
234 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
235 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
236 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
237 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
238 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
239 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
240 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
241 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
242 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
243 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
244 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
245 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
246 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
247 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
248 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
249 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
250 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
251 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector2l (2 x signed 64-bit).
 * Entry layout: { managed method name, opcode or immediate operand,
 * minimum required SIMD version, emit mode[, extra flags] }.
 * Keep entries sorted by method name.
 */
254 static const SimdIntrinsc vector2l_intrinsics[] = {
255 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
256 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
257 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
258 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
/* Explicit logical (zero-filling) shift; op_RightShift below maps to the same
   opcode since there is no 64-bit arithmetic shift in SSE. */
259 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
260 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
261 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
262 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
263 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
264 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
265 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
266 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
267 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
268 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
269 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
270 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
271 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
272 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
273 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
274 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
275 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
276 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
277 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
278 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
279 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector4ui (4 x unsigned 32-bit).
 * Entry layout: { managed method name, opcode or immediate operand,
 * minimum required SIMD version, emit mode[, extra flags] }.
 * Keep entries sorted by method name.
 */
282 static const SimdIntrinsc vector4ui_intrinsics[] = {
283 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
/* Explicitly-named arithmetic (sign-propagating) shift on the unsigned type;
   op_RightShift below uses the logical shift instead. */
284 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
285 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
286 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
287 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
288 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
289 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
290 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
291 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
292 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
293 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
294 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
295 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
296 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
297 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
298 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* Lane accessors: second field is the lane index (X=0, Y=1, Z=2, W=3). */
299 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
300 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
301 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
302 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
303 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
304 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
305 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
306 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
307 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
308 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
309 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
310 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
311 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
312 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
313 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
314 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
315 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
316 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
317 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector4i (4 x signed 32-bit).
 * Entry layout: { managed method name, opcode or immediate operand,
 * minimum required SIMD version, emit mode[, extra flags] }.
 * Keep entries sorted by method name.
 */
320 static const SimdIntrinsc vector4i_intrinsics[] = {
321 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
322 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
323 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
324 { SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
325 { SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
326 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
/* Explicit logical shift on the signed type; op_RightShift below uses the
   arithmetic (sign-propagating) shift. */
327 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
328 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
329 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
330 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
331 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
332 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
333 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
334 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
335 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
336 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
337 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
338 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
339 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
340 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
341 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
342 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
343 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
344 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
345 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
346 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
347 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
348 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
349 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
350 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
351 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
352 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
353 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
354 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
355 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
356 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
357 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
358 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
361 static const SimdIntrinsc vector8us_intrinsics[] = {
362 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
363 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
364 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
365 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
366 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
367 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
368 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
369 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
370 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
371 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
372 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
373 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
374 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
375 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
376 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
377 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
378 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
379 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
380 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
381 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
382 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
383 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
384 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
385 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
386 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
387 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
388 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
389 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
390 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
391 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
392 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
393 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
395 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
396 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
397 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
398 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
399 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
400 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
401 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
402 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
403 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
404 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
405 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
406 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
407 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
408 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
409 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
412 static const SimdIntrinsc vector8s_intrinsics[] = {
413 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
414 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
415 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
416 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
417 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
418 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
419 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
420 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
421 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
422 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
423 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
424 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
425 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
426 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
427 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
428 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
429 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
430 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
431 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
432 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
435 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
436 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
437 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
438 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
439 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
440 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
441 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
442 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
443 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
444 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
446 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
447 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
448 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
449 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
450 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
451 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
452 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
453 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
454 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
455 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
456 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
457 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
458 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
459 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
460 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector16b (16 x unsigned 8-bit).
 * Entry layout: { managed method name, opcode or immediate operand,
 * minimum required SIMD version, emit mode[, extra flags] }.
 * Keep entries sorted by method name (note V10-V15 sort before V2).
 */
463 static const SimdIntrinsc vector16b_intrinsics[] = {
464 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
465 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
466 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
467 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
468 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
469 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
470 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
471 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
472 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
473 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
474 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
475 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
476 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
477 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
478 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
479 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
480 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* Lane accessors: second field is the lane index (0-15); entries are in
   string-sorted (not numeric) order. */
481 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
482 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
483 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
484 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
485 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
486 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
487 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
488 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
489 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
490 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
491 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
492 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
493 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
494 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
495 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
496 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
497 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
498 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
499 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
500 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
501 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
502 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
503 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
504 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
505 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
506 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
507 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
508 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
509 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
510 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
511 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
512 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
513 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
514 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
515 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
516 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
517 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
518 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
519 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
520 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Mono.Simd.Vector16sb (16 signed bytes).
 * NOTE: entries must remain sorted by method name -- lookup is done with
 * bsearch using simd_intrinsic_compare_by_name (".ctor" sorts before letters).
 */
527 static const SimdIntrinsc vector16sb_intrinsics[] = {
528 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
529 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
530 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
531 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
532 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
533 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
/* signed-byte min/max require SSE4.1 -- presumably PMINSB/PMAXSB; confirm in the backend */
534 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
535 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
536 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
537 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
538 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
539 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
540 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
541 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
542 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
543 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* element getters: the second field is the element index, not an opcode */
544 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
545 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
546 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
547 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
548 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
549 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
550 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
551 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
552 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
553 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
554 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
555 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
556 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
557 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
/* operators: Inequality reuses PCMPEQB and inverts the result via SIMD_COMP_NEQ */
560 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
561 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
562 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
563 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
564 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
565 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
566 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
567 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* element setters: the second field is the element index */
568 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
569 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
570 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
571 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
572 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
573 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
574 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
575 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
576 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
577 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
578 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
579 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
580 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
581 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/* Bitmask of SIMD instruction-set versions available on this CPU;
 * filled in once by mono_simd_intrinsics_init. */
586 static guint32 simd_supported_versions;
588 /*TODO match using number of parameters as well*/
/* bsearch comparator: key is a method name (const char *), value points to a
 * SimdIntrinsc table entry. All intrinsic tables must stay sorted by name. */
590 simd_intrinsic_compare_by_name (const void *key, const void *value)
592 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
597 VREG_HAS_XZERO_BB0 = 0x02, /* vreg is defined by an OP_XZERO in the first bb */
598 VREG_HAS_OTHER_OP_BB0 = 0x04, /* some other op touches the vreg in the first bb */
599 VREG_SINGLE_BB_USE = 0x08, /* after bb0, the vreg is used in exactly one bb */
600 VREG_MANY_BB_USE = 0x10, /* after bb0, the vreg is used in more than one bb */
/* One-time module init: record which SIMD instruction sets the CPU supports. */
604 mono_simd_intrinsics_init (void)
606 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
607 /*TODO log the supported flags*/
/*
 * First-basic-block interference check: if 'reg' is a tracked SIMD vreg that
 * some instruction other than its XZERO touches in bb0, demote it from
 * VREG_HAS_XZERO_BB0 to VREG_HAS_OTHER_OP_BB0 so it won't be sunk.
 */
610 static inline gboolean
611 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
613 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
614 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
615 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
616 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
/*
 * Tracks in which basic blocks (after bb0) a zero-initialized SIMD vreg is
 * used: the first sighting marks VREG_SINGLE_BB_USE and records the bb in
 * target_bb; a use in a different bb upgrades the vreg to VREG_MANY_BB_USE.
 * Only vregs still flagged VREG_HAS_XZERO_BB0 are considered.
 */
622 static inline gboolean
623 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
625 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
628 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
629 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
630 vreg_flags [reg] |= VREG_MANY_BB_USE;
631 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
633 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
634 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
635 target_bb [reg] = bb;
636 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
643 This pass recalculates which vars need MONO_INST_INDIRECT.
645 We cannot do this for non-SIMD vars since code like mono_get_vtable_var
646 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
/*
 * Clears MONO_INST_INDIRECT on SIMD vars, re-adds it only for vars whose
 * address is actually taken (OP_LDADDR), then tries to sink redundant
 * OP_XZERO definitions from the first bb into the single bb that uses them.
 */
649 mono_simd_simplify_indirection (MonoCompile *cfg)
652 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Optimistically drop MONO_INST_INDIRECT from every SIMD var. */
656 for (i = 0; i < cfg->num_varinfo; i++) {
657 MonoInst *var = cfg->varinfo [i];
658 if (var->klass->simd_type) {
659 var->flags &= ~MONO_INST_INDIRECT;
660 max_vreg = MAX (var->dreg, max_vreg);
/* Re-add MONO_INST_INDIRECT for SIMD vars that really are LDADDR'ed. */
664 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
665 if (!first_bb && bb->code)
667 for (ins = bb->code; ins; ins = ins->next) {
668 if (ins->opcode == OP_LDADDR) {
669 MonoInst *var = (MonoInst*)ins->inst_p0;
670 if (var->klass->simd_type) {
671 var->flags |= MONO_INST_INDIRECT;
677 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
678 vreg_flags = g_malloc0 (max_vreg + 1);
679 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Mark candidate vregs: SIMD vars that are neither indirect nor volatile. */
681 for (i = 0; i < cfg->num_varinfo; i++) {
682 MonoInst *var = cfg->varinfo [i];
683 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
684 vreg_flags [var->dreg] = VREG_USED;
685 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
689 /*Scan the first basic block looking for unused xzeros*/
690 for (ins = first_bb->code; ins; ins = ins->next) {
692 int sregs [MONO_MAX_SRC_REGS];
694 if (ins->opcode == OP_XZERO) {
695 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
696 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
697 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
/* Any other definition or use in bb0 disqualifies the vreg from sinking. */
701 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
703 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
705 num_sregs = mono_inst_get_src_registers (ins, sregs);
706 for (i = 0; i < num_sregs; ++i) {
707 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
712 if (IS_DEBUG_ON (cfg)) {
713 for (i = 0; i < cfg->num_varinfo; i++) {
714 MonoInst *var = cfg->varinfo [i];
715 if (var->klass->simd_type) {
716 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
717 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
718 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
719 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
724 /*TODO stop here if no var is xzero only*/
727 Scan all other bb and check if it has only one other use
728 Ideally this would be done after an extended bb formation pass
730 FIXME This pass could use dominator information to properly
731 place the XZERO on the bb that dominates all uses of the var,
732 but this will have zero effect with the current local reg alloc
734 TODO simplify the use of flags.
737 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
738 for (ins = bb->code; ins; ins = ins->next) {
740 int sregs [MONO_MAX_SRC_REGS];
742 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
744 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
746 num_sregs = mono_inst_get_src_registers (ins, sregs);
747 for (i = 0; i < num_sregs; ++i) {
748 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
749 max_vreg, vreg_flags, target_bb))
/* For each single-bb candidate, re-materialize the XZERO right before its
 * first use in that bb (unless the first use overwrites the value anyway). */
755 for (i = 0; i < cfg->num_varinfo; i++) {
756 MonoInst *var = cfg->varinfo [i];
757 if (!var->klass->simd_type)
759 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
760 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
761 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
762 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
764 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
766 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
768 int sregs [MONO_MAX_SRC_REGS];
769 gboolean found = FALSE;
771 num_sregs = mono_inst_get_src_registers (ins, sregs);
772 for (j = 0; j < num_sregs; ++j) {
/* NOTE(review): this indexes with 'i' while the loop variable is 'j';
 * 'sregs [j]' looks intended -- confirm, otherwise 'found' is unreliable. */
773 if (sregs [i] == var->dreg)
776 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
777 if (ins->dreg == var->dreg && !found) {
781 MONO_INST_NEW (cfg, tmp, OP_XZERO);
782 tmp->dreg = var->dreg;
783 tmp->type = STACK_VTYPE;
784 tmp->klass = var->klass;
785 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Finally drop the now-redundant XZEROs from bb0. */
791 for (ins = first_bb->code; ins; ins = ins->next) {
792 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
801 * This function expects that src is a value.
/*
 * Returns the vreg holding the SIMD value of 'src'.
 * Accepts only an OP_XMOVE source or a STACK_VTYPE instruction;
 * anything else is a hard error (assertion failure).
 */
804 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
806 if (src->opcode == OP_XMOVE) {
808 } else if (src->type == STACK_VTYPE) {
811 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
812 mono_print_ins (src);
813 g_assert_not_reached ();
817 * This function will load the value if needed.
/*
 * Like get_simd_vreg, but also accepts an address source: for OP_LDADDR it
 * resolves to the underlying var's vreg, and for STACK_PTR/STACK_MP it emits
 * an OP_LOADX_MEMBASE. 'indirect' (may be NULL) reports whether an indirect
 * load was taken.
 */
820 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
824 if (src->opcode == OP_XMOVE) {
826 } else if (src->opcode == OP_LDADDR) {
827 int res = ((MonoInst*)src->inst_p0)->dreg;
830 } else if (src->type == STACK_VTYPE) {
832 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* pointer case: load the 16-byte value from memory into a fresh xreg */
837 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
838 ins->klass = cmethod->klass;
839 ins->sreg1 = src->dreg;
840 ins->type = STACK_VTYPE;
841 ins->dreg = alloc_ireg (cfg);
842 MONO_ADD_INS (cfg->cbb, ins);
845 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
846 mono_print_ins (src);
847 g_assert_not_reached ();
/* Lazily creates the int32 spill slot used to move values between the
 * integer/FP stacks and XMM registers (shared per-method). */
851 get_int_to_float_spill_area (MonoCompile *cfg)
853 if (!cfg->iconv_raw_var) {
854 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
855 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
857 return cfg->iconv_raw_var;
860 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily creates the double-sized spill slot used for R8 <-> XMM moves. */
862 get_double_spill_area (MonoCompile *cfg)
864 if (!cfg->fconv_to_r8_x_var) {
865 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
866 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
868 return cfg->fconv_to_r8_x_var;
/* Lazily creates a vector-sized local used as scratch space when a SIMD
 * .ctor has to build the value element-by-element in memory. */
871 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
873 if (!cfg->simd_ctor_var) {
874 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
875 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
877 return cfg->simd_ctor_var;
/* Maps a scalar MonoType to the OP_EXPAND_* opcode that splats the scalar
 * across all lanes of a SIMD register; aborts on unsupported types. */
881 mono_type_to_expand_op (MonoType *type)
883 switch (type->type) {
901 g_assert_not_reached ();
/*
 * For a binary intrinsic arg at 'position' (0 or 1): if the declared param
 * type is a SIMD type, just fetch its vreg; otherwise expand (splat) the
 * scalar into a fresh xreg. R4/R8 expansion needs a spill slot to move the
 * value out of the FP stack.
 */
905 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, int position)
908 MonoMethodSignature *sig = mono_method_signature (cmethod);
911 g_assert (sig->param_count == 2);
912 g_assert (position == 0 || position == 1);
914 if (mono_class_from_mono_type (sig->params [position])->simd_type)
915 return get_simd_vreg (cfg, cmethod, src);
917 expand_op = mono_type_to_expand_op (sig->params [position]);
918 MONO_INST_NEW (cfg, ins, expand_op);
919 ins->klass = cmethod->klass;
920 ins->sreg1 = src->dreg;
921 ins->type = STACK_VTYPE;
922 ins->dreg = alloc_ireg (cfg);
923 MONO_ADD_INS (cfg->cbb, ins);
925 if (expand_op == OP_EXPAND_R4)
926 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
927 else if (expand_op == OP_EXPAND_R8)
928 ins->backend.spill_var = get_double_spill_area (cfg);
/*
 * Emits a two-operand SIMD op (intrinsic->opcode) into a fresh xreg.
 * Either operand may be a scalar that gets expanded first; intrinsic->flags
 * is forwarded in inst_c0 for ops that need a sub-mode.
 */
934 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
937 int left_vreg, right_vreg;
939 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [0], 0);
940 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [1], 1);
943 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
944 ins->klass = cmethod->klass;
945 ins->sreg1 = left_vreg;
946 ins->sreg2 = right_vreg;
947 ins->type = STACK_VTYPE;
948 ins->dreg = alloc_ireg (cfg);
949 ins->inst_c0 = intrinsic->flags;
950 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a one-operand SIMD op (intrinsic->opcode) into a fresh xreg. */
955 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
960 vreg = get_simd_vreg (cfg, cmethod, args [0]);
962 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
963 ins->klass = cmethod->klass;
965 ins->type = STACK_VTYPE;
966 ins->dreg = alloc_ireg (cfg);
967 MONO_ADD_INS (cfg->cbb, ins);
/* Maps a scalar element type to the OP_EXTRACT_* opcode that pulls one lane
 * out of a SIMD register; aborts on unsupported types. */
972 mono_type_to_extract_op (MonoType *type)
974 switch (type->type) {
976 return OP_EXTRACT_I1;
978 return OP_EXTRACT_U1;
980 return OP_EXTRACT_I2;
982 return OP_EXTRACT_U2;
986 return OP_EXTRACT_I4;
988 g_assert_not_reached ();
991 /*Returns the amount to shift the element index to get the dword it belongs to*/
993 mono_type_elements_shift_bits (MonoType *type)
995 switch (type->type) {
1007 g_assert_not_reached ();
/* Maps a scalar element type to the single-instruction OP_INSERT_* opcode
 * (fast path, used by the LLVM backend); aborts on unsupported types. */
1010 static G_GNUC_UNUSED int
1011 mono_type_to_insert_op (MonoType *type)
1013 switch (type->type) {
1016 return OP_INSERT_I1;
1019 return OP_INSERT_I2;
1022 return OP_INSERT_I4;
1025 return OP_INSERT_I8;
1027 return OP_INSERT_R4;
1029 return OP_INSERT_R8;
1031 g_assert_not_reached ();
/* Maps a scalar element type to the multi-instruction OP_INSERTX_*_SLOW
 * opcode used when the fast single-insn insert isn't available. */
1035 mono_type_to_slow_insert_op (MonoType *type)
1037 switch (type->type) {
1040 return OP_INSERTX_U1_SLOW;
/* 16-bit lanes keep the fast opcode -- presumably PINSRW exists on all
 * supported SSE levels; confirm against the backend. */
1043 return OP_INSERT_I2;
1046 return OP_INSERTX_I4_SLOW;
1049 return OP_INSERTX_I8_SLOW;
1051 return OP_INSERTX_R4_SLOW;
1053 return OP_INSERTX_R8_SLOW;
1055 g_assert_not_reached ();
/*
 * Emits an element setter (set_Vn). intrinsic->opcode here is the element
 * index, not a mini opcode. Three code paths:
 *   - LLVM backend: single OP_INSERT_*,
 *   - 2/4/8-byte elements: the slow insert op (R4/R8 need a spill slot),
 *   - 1-byte elements without fast insert: extract the containing word,
 *     patch the byte, insert the word back.
 * If the receiver was reached indirectly, the updated vector is stored back.
 */
1059 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1062 MonoMethodSignature *sig = mono_method_signature (cmethod);
1067 size = mono_type_size (sig->params [0], &align);
1069 if (COMPILE_LLVM (cfg)) {
1070 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1071 ins->klass = cmethod->klass;
1072 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1073 ins->sreg2 = args [1]->dreg;
1074 ins->inst_c0 = intrinsic->opcode;
1075 MONO_ADD_INS (cfg->cbb, ins);
1076 } else if (size == 2 || size == 4 || size == 8) {
1077 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1078 ins->klass = cmethod->klass;
1079 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1080 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1081 ins->sreg2 = args [1]->dreg;
1082 ins->inst_c0 = intrinsic->opcode;
1083 if (sig->params [0]->type == MONO_TYPE_R4)
1084 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1085 else if (sig->params [0]->type == MONO_TYPE_R8)
1086 ins->backend.spill_var = get_double_spill_area (cfg);
1087 MONO_ADD_INS (cfg->cbb, ins);
/* byte path: extract the 16-bit word holding the byte (index / 2) ... */
1091 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1092 ins->klass = cmethod->klass;
1093 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1094 ins->type = STACK_I4;
1095 ins->dreg = vreg = alloc_ireg (cfg);
1096 ins->inst_c0 = intrinsic->opcode / 2;
1097 MONO_ADD_INS (cfg->cbb, ins);
/* ... then splice the new byte in and insert the word back at the index. */
1099 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1100 ins->klass = cmethod->klass;
1102 ins->sreg2 = args [1]->dreg;
1104 ins->inst_c0 = intrinsic->opcode;
1105 MONO_ADD_INS (cfg->cbb, ins);
/* receiver was an address: write the updated vector back to memory */
1109 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1110 ins->klass = cmethod->klass;
1111 ins->dreg = args [0]->dreg;
1113 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits an element getter (get_Vn). intrinsic->opcode encodes the element
 * index. On the non-LLVM path, elements outside dword 0 are first shuffled
 * into place (OP_PSHUFLED) before a plain extract; R4 results are routed
 * through the raw int->float spill slot onto the FP stack.
 */
1119 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1122 MonoMethodSignature *sig = mono_method_signature (cmethod);
1123 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1125 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1127 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1128 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1129 ins->klass = cmethod->klass;
1131 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1132 ins->type = STACK_VTYPE;
1133 ins->dreg = vreg = alloc_ireg (cfg);
1134 MONO_ADD_INS (cfg->cbb, ins);
1137 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1138 ins->klass = cmethod->klass;
1140 ins->type = STACK_I4;
1141 ins->dreg = vreg = alloc_ireg (cfg);
1142 if (cfg->compile_llvm)
1143 ins->inst_c0 = intrinsic->opcode;
1145 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1146 MONO_ADD_INS (cfg->cbb, ins);
/* float result: bit-copy the raw int into an FP reg via the spill slot */
1148 if (sig->ret->type == MONO_TYPE_R4) {
1149 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1150 ins->klass = mono_defaults.single_class;
1152 ins->type = STACK_R8;
1153 ins->dreg = alloc_freg (cfg);
1154 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1155 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a 64-bit element getter: OP_EXTRACT_R8 for double results (via the
 * double spill slot), OP_EXTRACT_I8 for long results. */
1161 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1165 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1167 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1169 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1170 ins->klass = cmethod->klass;
1172 ins->inst_c0 = intrinsic->opcode;
1174 ins->type = STACK_R8;
1175 ins->dreg = alloc_freg (cfg);
1176 ins->backend.spill_var = get_double_spill_area (cfg);
1178 ins->type = STACK_I8;
1179 ins->dreg = alloc_lreg (cfg);
1181 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits a SIMD .ctor. One-argument ctors expand the scalar into every lane
 * (intrinsic->opcode is the OP_EXPAND_* op; R4/R8 need a spill slot).
 * Multi-argument ctors store the scalars element-by-element into memory
 * (either the target var directly or a scratch area) and, when the target
 * was a local (OP_LDADDR), load the built vector back into its vreg so the
 * LDADDR can be eliminated.
 */
1187 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1189 MonoInst *ins = NULL;
1191 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1192 MonoMethodSignature *sig = mono_method_signature (cmethod);
1193 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1194 int arg_size = mono_type_size (sig->params [0], &i);
1196 if (sig->param_count == 1) {
/* ctor on a local var: write straight into its vreg, drop the LDADDR */
1200 dreg = args [0]->inst_i0->dreg;
1201 NULLIFY_INS (args [0]);
1203 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1204 dreg = alloc_ireg (cfg);
1207 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1208 ins->klass = cmethod->klass;
1209 ins->sreg1 = args [1]->dreg;
1210 ins->type = STACK_VTYPE;
1213 MONO_ADD_INS (cfg->cbb, ins);
1214 if (sig->params [0]->type == MONO_TYPE_R4)
1215 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1216 else if (sig->params [0]->type == MONO_TYPE_R8)
1217 ins->backend.spill_var = get_double_spill_area (cfg);
/* receiver was an address: store the expanded vector back to memory */
1220 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1221 ins->dreg = args [0]->dreg;
1223 MONO_ADD_INS (cfg->cbb, ins);
/* multi-arg ctor: pick a destination address (scratch area for locals) */
1229 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1230 MONO_ADD_INS (cfg->cbb, ins);
1231 addr_reg = ins->dreg;
1233 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1234 addr_reg = args [0]->dreg;
/* store the scalars back-to-front so arg evaluation order is preserved */
1237 for (i = sig->param_count - 1; i >= 0; --i) {
1238 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1241 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1242 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1243 NULLIFY_INS (args [0]);
1245 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1246 ins->klass = cmethod->klass;
1247 ins->sreg1 = addr_reg;
1248 ins->type = STACK_VTYPE;
1250 MONO_ADD_INS (cfg->cbb, ins);
/* Emits op_Explicit between SIMD types: a plain register move (OP_XMOVE)
 * retagged with the destination class -- no data conversion happens. */
1256 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1261 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1263 //TODO macroize this
1264 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1265 ins->klass = cmethod->klass;
1266 ins->type = STACK_VTYPE;
1268 ins->dreg = alloc_ireg (cfg);
1269 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits a packed shift. A constant shift count is folded into inst_imm;
 * a variable count is first moved into an xreg (OP_ICONV_TO_X) and the
 * opcode is bumped to its shift-by-register variant.
 */
1274 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1277 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1279 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1281 if (args [1]->opcode != OP_ICONST) {
1282 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1283 ins->klass = mono_defaults.int32_class;
1284 ins->sreg1 = args [1]->dreg;
1285 ins->type = STACK_I4;
1286 ins->dreg = vreg2 = alloc_ireg (cfg);
1287 MONO_ADD_INS (cfg->cbb, ins);
1289 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1292 MONO_INST_NEW (cfg, ins, opcode);
1293 ins->klass = cmethod->klass;
1297 if (args [1]->opcode == OP_ICONST) {
1298 ins->inst_imm = args [1]->inst_c0;
1299 NULLIFY_INS (args [1]);
1302 ins->type = STACK_VTYPE;
1303 ins->dreg = alloc_ireg (cfg);
1304 MONO_ADD_INS (cfg->cbb, ins);
/* TRUE for the packed integer compare opcodes; relies on the opcodes from
 * OP_PCMPEQB to OP_PCMPEQQ forming a contiguous range in the opcode enum. */
1308 static inline gboolean
1309 mono_op_is_packed_compare (int op)
1311 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/*
 * Emits op_Equality/op_Inequality: packed-compare the vectors, extract the
 * per-byte sign mask, then reduce it to a boolean. For equality-style
 * compares all 16 mask bits must be set (0xFFFF); for FP not-equal compares
 * a single set bit suffices (OR semantics).
 */
1315 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1318 int left_vreg, right_vreg, tmp_vreg;
1320 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1321 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1324 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1325 ins->klass = cmethod->klass;
1326 ins->sreg1 = left_vreg;
1327 ins->sreg2 = right_vreg;
1328 ins->type = STACK_VTYPE;
/* NOTE(review): redundant -- klass was already set right after MONO_INST_NEW above */
1329 ins->klass = cmethod->klass;
1330 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1331 ins->inst_c0 = intrinsic->flags;
1332 MONO_ADD_INS (cfg->cbb, ins);
1334 /*FIXME the next ops are SSE specific*/
1335 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1336 ins->klass = cmethod->klass;
1337 ins->sreg1 = tmp_vreg;
1338 ins->type = STACK_I4;
1339 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1340 MONO_ADD_INS (cfg->cbb, ins);
1342 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1343 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
1344 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1345 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1347 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1348 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1350 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Emits a shuffle. The last argument must be a compile-time constant shuffle
 * control (OP_ICONST); otherwise the intrinsic is rejected. Two-vector
 * shuffles (3 params) that would use PSHUFLED are emitted as SHUFPS instead.
 */
1356 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1359 int vreg, vreg2 = -1;
1360 int param_count = mono_method_signature (cmethod)->param_count;
1362 if (args [param_count - 1]->opcode != OP_ICONST) {
1363 /*TODO Shuffle with non literals is not yet supported */
1367 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1368 if (param_count == 3)
1369 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1371 NULLIFY_INS (args [param_count - 1]);
1374 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1375 ins->klass = cmethod->klass;
1378 ins->inst_c0 = args [param_count - 1]->inst_c0;
1379 ins->type = STACK_VTYPE;
1380 ins->dreg = alloc_ireg (cfg);
1381 MONO_ADD_INS (cfg->cbb, ins);
1383 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1384 ins->opcode = OP_SHUFPS;
/* Emits LoadAligned: a 16-byte aligned vector load from the address arg. */
1389 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1393 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1394 ins->klass = cmethod->klass;
1395 ins->sreg1 = args [0]->dreg;
1396 ins->type = STACK_VTYPE;
1397 ins->dreg = alloc_ireg (cfg);
1398 MONO_ADD_INS (cfg->cbb, ins);
/* Emits StoreAligned-style intrinsics: stores the vector in args[1] to the
 * address in args[0], using the store opcode from the intrinsic table. */
1403 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1408 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1410 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1411 ins->klass = cmethod->klass;
1412 ins->dreg = args [0]->dreg;
1414 ins->type = STACK_VTYPE;
1415 MONO_ADD_INS (cfg->cbb, ins);
/* Emits ExtractByteMask: collapses the vector's per-byte sign bits into an
 * int (OP_EXTRACT_MASK -- presumably PMOVMSKB on x86; confirm in backend). */
1420 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1425 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1427 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1428 ins->klass = cmethod->klass;
1430 ins->type = STACK_I4;
1431 ins->dreg = alloc_ireg (cfg);
1432 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a prefetch hint for the address in args[0]; intrinsic->flags selects
 * the prefetch locality mode (SIMD_PREFETCH_MODE_*). */
1438 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1442 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1443 ins->klass = cmethod->klass;
1444 ins->sreg1 = args [0]->dreg;
1445 ins->backend.arg_info = intrinsic->flags;
1446 MONO_ADD_INS (cfg->cbb, ins);
/* Returns a human-readable name for a SIMD_VERSION_* flag (debug output). */
1451 simd_version_name (guint32 version)
1454 case SIMD_VERSION_SSE1:
1456 case SIMD_VERSION_SSE2:
1458 case SIMD_VERSION_SSE3:
1460 case SIMD_VERSION_SSSE3:
1462 case SIMD_VERSION_SSE41:
1464 case SIMD_VERSION_SSE42:
1466 case SIMD_VERSION_SSE4a:
/*
 * Looks up cmethod->name in a (name-sorted) intrinsic table via bsearch and
 * dispatches to the matching emitter. Returns NULL when the method has no
 * intrinsic or the CPU lacks the required SIMD instruction-set version.
 */
1473 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1475 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1477 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1480 if (IS_DEBUG_ON (cfg)) {
1482 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1483 max = fsig->param_count + fsig->hasthis;
1484 for (i = 0; i < max; ++i) {
1485 printf ("param %d: ", i);
1486 mono_print_ins (args [i]);
/* Reject the intrinsic when none of its required SIMD versions is available. */
1489 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1490 if (IS_DEBUG_ON (cfg)) {
/* NOTE(review): "unsuported" typo in this debug string ("unsupported") */
1492 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1493 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1494 if (result->simd_version_flags & (1 << x))
1495 printf ("%s ", simd_version_name (1 << x));
/* Dispatch on the table entry's emit mode. */
1502 switch (result->simd_emit_mode) {
1503 case SIMD_EMIT_BINARY:
1504 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1505 case SIMD_EMIT_UNARY:
1506 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1507 case SIMD_EMIT_SETTER:
1508 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1509 case SIMD_EMIT_GETTER:
1510 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1511 case SIMD_EMIT_GETTER_QWORD:
1512 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1513 case SIMD_EMIT_CTOR:
1514 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1515 case SIMD_EMIT_CAST:
1516 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1517 case SIMD_EMIT_SHUFFLE:
1518 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1519 case SIMD_EMIT_SHIFT:
1520 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1521 case SIMD_EMIT_EQUALITY:
1522 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1523 case SIMD_EMIT_LOAD_ALIGNED:
1524 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1525 case SIMD_EMIT_STORE:
1526 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1527 case SIMD_EMIT_EXTRACT_MASK:
1528 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1529 case SIMD_EMIT_PREFETCH:
1530 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1532 g_assert_not_reached ();
/*
 * Emits the address of a vector-sized element span inside a managed array:
 * addr = &array->vector + index * element_size. When check_bounds is set,
 * both the first and the last element of the 16-byte span are bounds-checked.
 * Returns the dreg holding the computed address.
 */
1536 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1540 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1542 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1543 mult_reg = alloc_preg (cfg);
1544 array_reg = arr->dreg;
1545 index_reg = index->dreg;
1547 #if SIZEOF_VOID_P == 8
1548 /* The array reg is 64 bits but the index reg is only 32 */
1549 index2_reg = alloc_preg (cfg);
1550 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1552 index2_reg = index_reg;
1554 index3_reg = alloc_preg (cfg);
/* check both ends of the 16-byte span: index and index + elems_per_vector-1 */
1557 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1558 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1559 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1562 add_reg = alloc_preg (cfg);
1564 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1565 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1566 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
1567 ins->type = STACK_PTR;
1568 MONO_ADD_INS (cfg->cbb, ins);
/*
 * Intrinsics on Mono.Simd.ArrayExtensions: GetVector[Aligned] loads a vector
 * from an array, SetVector[Aligned] stores one, IsAligned tests whether the
 * element address is 16-byte aligned. Returns NULL for unmatched names.
 */
1574 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1576 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1578 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1580 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1581 load->klass = cmethod->klass;
1583 load->type = STACK_VTYPE;
1584 load->dreg = alloc_ireg (cfg);
1585 MONO_ADD_INS (cfg->cbb, load);
1589 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1591 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1592 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1594 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1595 store->klass = cmethod->klass;
1597 store->sreg1 = vreg;
1598 MONO_ADD_INS (cfg->cbb, store);
1602 if (!strcmp ("IsAligned", cmethod->name)) {
/* aligned iff (addr & 15) == 0; no bounds check needed for an address test */
1604 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1606 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1607 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1608 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1609 MONO_ADD_INS (cfg->cbb, ins);
/* SimdRuntime intrinsics: get_AccelMode is folded to the constant bitmask of
 * the CPU's supported SIMD versions (see TODO: doesn't work under AOT). */
1617 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1619 if (!strcmp ("get_AccelMode", cmethod->name)) {
1621 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1628 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1630 const char *class_name;
1632 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1635 class_name = cmethod->klass->name;
1636 if (!strcmp ("SimdRuntime", class_name))
1637 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1639 if (!strcmp ("ArrayExtensions", class_name))
1640 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1642 if (!strcmp ("VectorOperations", class_name)) {
1643 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1645 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1646 } else if (!cmethod->klass->simd_type)
1649 cfg->uses_simd_intrinsics = 1;
1650 if (!strcmp ("Vector2d", class_name))
1651 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1652 if (!strcmp ("Vector4f", class_name))
1653 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1654 if (!strcmp ("Vector2ul", class_name))
1655 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1656 if (!strcmp ("Vector2l", class_name))
1657 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1658 if (!strcmp ("Vector4ui", class_name))
1659 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1660 if (!strcmp ("Vector4i", class_name))
1661 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1662 if (!strcmp ("Vector8us", class_name))
1663 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1664 if (!strcmp ("Vector8s", class_name))
1665 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1666 if (!strcmp ("Vector16b", class_name))
1667 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1668 if (!strcmp ("Vector16sb", class_name))
1669 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));