2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
17 General notes on SIMD intrinsics
19 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
23 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
33 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
38 TODO extend bounds checking code to support range checking.
40 General notes for SIMD intrinsics.
42 -Bad extractor and constructor performance
43 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
44 It will be loaded in the FP stack just to be pushed on the call stack.
46 A similar thing happens with Vector4f constructor that require float vars to be
48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.
51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and promote further optimizations.
58 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
62 #ifdef MONO_ARCH_SIMD_INTRINSICS
/* Alternate definition, kept for quickly disabling all SIMD-pass debug output: */
64 //#define IS_DEBUG_ON(cfg) (0)
/* SIMD-pass debug logging is enabled when the JIT verbosity level is >= 3. */
66 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
/* Execute statement(s) `a` only when debugging is on; the do/while(0) wrapper
 * makes the macro safe to use as a single statement (e.g. in an if/else). */
67 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
/* NOTE(review): fragment of the simd emit-mode enumerators (the enum's opening
 * and the other SIMD_EMIT_* values are missing from this chunk); these select
 * how a matched intrinsic is lowered — see the simd_emit_mode field below. */
73 SIMD_EMIT_GETTER_QWORD,
79 SIMD_EMIT_LOAD_ALIGNED,
81 SIMD_EMIT_EXTRACT_MASK,
85 #ifdef HAVE_ARRAY_ELEM_INIT
86 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
87 #define MSGSTRFIELD1(line) str##line
88 static const struct msgstr_t {
89 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
90 #include "simd-methods.h"
93 #define SIMD_METHOD(str,name) str,
94 #include "simd-methods.h"
99 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
100 #include "simd-methods.h"
102 #define method_name(idx) ((const char*)&method_names + (idx))
105 #define SIMD_METHOD(str,name) str,
106 static const char * const method_names [] = {
107 #include "simd-methods.h"
111 #define SIMD_METHOD(str,name) name,
113 #include "simd-methods.h"
117 #define method_name(idx) (method_names [(idx)])
/* NOTE(review): tail of the SimdIntrinsc descriptor struct (its opening lines
 * are missing from this chunk).  The emit mode (SIMD_EMIT_*) and the minimum
 * required SIMD version (SIMD_VERSION_*) are packed into one byte as two
 * 4-bit fields. */
124 guint8 simd_emit_mode : 4;
125 guint8 simd_version : 4;
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, OP_EXPAND_R4, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
132 { SN_AndNot, OP_ANDNPS, SIMD_EMIT_BINARY },
133 { SN_CompareEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
141 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
142 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
143 { SN_HorizontalAdd, OP_HADDPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
144 { SN_HorizontalSub, OP_HSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
145 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_EMIT_BINARY },
146 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_EMIT_BINARY },
147 { SN_InvSqrt, OP_RSQRTPS, SIMD_EMIT_UNARY },
148 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
149 { SN_Max, OP_MAXPS, SIMD_EMIT_BINARY },
150 { SN_Min, OP_MINPS, SIMD_EMIT_BINARY },
151 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
152 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
153 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
154 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
155 { SN_Reciprocal, OP_RCPPS, SIMD_EMIT_UNARY },
156 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
157 { SN_Sqrt, OP_SQRTPS, SIMD_EMIT_UNARY },
158 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
159 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_EMIT_STORE },
160 { SN_get_W, 3, SIMD_EMIT_GETTER },
161 { SN_get_X, 0, SIMD_EMIT_GETTER },
162 { SN_get_Y, 1, SIMD_EMIT_GETTER },
163 { SN_get_Z, 2, SIMD_EMIT_GETTER },
164 { SN_op_Addition, OP_ADDPS, SIMD_EMIT_BINARY },
165 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_EMIT_BINARY },
166 { SN_op_BitwiseOr, OP_ORPS, SIMD_EMIT_BINARY },
167 { SN_op_Division, OP_DIVPS, SIMD_EMIT_BINARY },
168 { SN_op_Equality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
169 { SN_op_ExclusiveOr, OP_XORPS, SIMD_EMIT_BINARY },
170 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
171 { SN_op_Inequality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
172 { SN_op_Multiply, OP_MULPS, SIMD_EMIT_BINARY },
173 { SN_op_Subtraction, OP_SUBPS, SIMD_EMIT_BINARY },
174 { SN_set_W, 3, SIMD_EMIT_SETTER },
175 { SN_set_X, 0, SIMD_EMIT_SETTER },
176 { SN_set_Y, 1, SIMD_EMIT_SETTER },
177 { SN_set_Z, 2, SIMD_EMIT_SETTER },
180 static const SimdIntrinsc vector2d_intrinsics[] = {
181 { SN_ctor, OP_EXPAND_R8, SIMD_EMIT_CTOR },
182 { SN_AddSub, OP_ADDSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
183 { SN_AndNot, OP_ANDNPD, SIMD_EMIT_BINARY },
184 { SN_CompareEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
185 { SN_CompareLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
186 { SN_CompareLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
187 { SN_CompareNotEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
188 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
189 { SN_CompareNotLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
190 { SN_CompareOrdered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
191 { SN_CompareUnordered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
192 { SN_Duplicate, OP_DUPPD, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
193 { SN_HorizontalAdd, OP_HADDPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
194 { SN_HorizontalSub, OP_HSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
195 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_EMIT_BINARY },
196 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_EMIT_BINARY },
197 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
198 { SN_Max, OP_MAXPD, SIMD_EMIT_BINARY },
199 { SN_Min, OP_MINPD, SIMD_EMIT_BINARY },
200 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
201 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
202 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
203 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
204 { SN_Sqrt, OP_SQRTPD, SIMD_EMIT_UNARY },
205 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
206 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
207 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
208 { SN_op_Addition, OP_ADDPD, SIMD_EMIT_BINARY },
209 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_EMIT_BINARY },
210 { SN_op_BitwiseOr, OP_ORPD, SIMD_EMIT_BINARY },
211 { SN_op_Division, OP_DIVPD, SIMD_EMIT_BINARY },
212 { SN_op_ExclusiveOr, OP_XORPD, SIMD_EMIT_BINARY },
213 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
214 { SN_op_Multiply, OP_MULPD, SIMD_EMIT_BINARY },
215 { SN_op_Subtraction, OP_SUBPD, SIMD_EMIT_BINARY },
216 { SN_set_X, 0, SIMD_EMIT_SETTER },
217 { SN_set_Y, 1, SIMD_EMIT_SETTER },
220 static const SimdIntrinsc vector2ul_intrinsics[] = {
221 { SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
222 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
223 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
224 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
225 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
226 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
227 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
228 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
229 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
230 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
231 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
232 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
233 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
234 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
235 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
236 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
237 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
238 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
239 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
240 { SN_op_RightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
241 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
242 { SN_set_X, 0, SIMD_EMIT_SETTER },
243 { SN_set_Y, 1, SIMD_EMIT_SETTER },
246 static const SimdIntrinsc vector2l_intrinsics[] = {
247 { SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
248 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
249 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE42 },
250 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
251 { SN_LogicalRightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
252 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
253 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
254 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
255 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
256 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
257 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
258 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
259 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
260 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
261 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
262 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
263 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
264 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
265 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
266 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
267 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
268 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
269 { SN_set_X, 0, SIMD_EMIT_SETTER },
270 { SN_set_Y, 1, SIMD_EMIT_SETTER },
273 static const SimdIntrinsc vector4ui_intrinsics[] = {
274 { SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
275 { SN_ArithmeticRightShift, OP_PSARD, SIMD_EMIT_SHIFT },
276 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
277 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
278 { SN_Max, OP_PMAXD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
279 { SN_Min, OP_PMIND_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
280 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
281 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
282 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
283 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
284 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
285 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
286 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
287 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
288 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
289 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
290 { SN_get_W, 3, SIMD_EMIT_GETTER },
291 { SN_get_X, 0, SIMD_EMIT_GETTER },
292 { SN_get_Y, 1, SIMD_EMIT_GETTER },
293 { SN_get_Z, 2, SIMD_EMIT_GETTER },
294 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
295 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
296 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
297 { SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
298 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
299 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
300 { SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
301 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
302 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
303 { SN_op_RightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
304 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
305 { SN_set_W, 3, SIMD_EMIT_SETTER },
306 { SN_set_X, 0, SIMD_EMIT_SETTER },
307 { SN_set_Y, 1, SIMD_EMIT_SETTER },
308 { SN_set_Z, 2, SIMD_EMIT_SETTER },
311 static const SimdIntrinsc vector4i_intrinsics[] = {
312 { SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
313 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
314 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_EMIT_BINARY },
315 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
316 { SN_LogicalRightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
317 { SN_Max, OP_PMAXD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
318 { SN_Min, OP_PMIND, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
319 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
320 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
321 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
322 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
323 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
324 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
325 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
326 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
327 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
328 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
329 { SN_get_W, 3, SIMD_EMIT_GETTER },
330 { SN_get_X, 0, SIMD_EMIT_GETTER },
331 { SN_get_Y, 1, SIMD_EMIT_GETTER },
332 { SN_get_Z, 2, SIMD_EMIT_GETTER },
333 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
334 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
335 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
336 { SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
337 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
338 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
339 { SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
340 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
341 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
342 { SN_op_RightShift, OP_PSARD, SIMD_EMIT_SHIFT },
343 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
344 { SN_set_W, 3, SIMD_EMIT_SETTER },
345 { SN_set_X, 0, SIMD_EMIT_SETTER },
346 { SN_set_Y, 1, SIMD_EMIT_SETTER },
347 { SN_set_Z, 2, SIMD_EMIT_SETTER },
350 static const SimdIntrinsc vector8us_intrinsics[] = {
351 { SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
352 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
353 { SN_ArithmeticRightShift, OP_PSARW, SIMD_EMIT_SHIFT },
354 { SN_Average, OP_PAVGW_UN, SIMD_EMIT_BINARY },
355 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
356 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
357 { SN_Max, OP_PMAXW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
358 { SN_Min, OP_PMINW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
359 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_EMIT_BINARY },
360 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
361 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
362 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
363 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
364 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
365 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
366 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
367 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
368 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
369 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
370 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
371 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
372 { SN_get_V0, 0, SIMD_EMIT_GETTER },
373 { SN_get_V1, 1, SIMD_EMIT_GETTER },
374 { SN_get_V2, 2, SIMD_EMIT_GETTER },
375 { SN_get_V3, 3, SIMD_EMIT_GETTER },
376 { SN_get_V4, 4, SIMD_EMIT_GETTER },
377 { SN_get_V5, 5, SIMD_EMIT_GETTER },
378 { SN_get_V6, 6, SIMD_EMIT_GETTER },
379 { SN_get_V7, 7, SIMD_EMIT_GETTER },
380 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
381 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
382 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
383 { SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
384 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
385 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
386 { SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
387 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
388 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
389 { SN_op_RightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
390 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
391 { SN_set_V0, 0, SIMD_EMIT_SETTER },
392 { SN_set_V1, 1, SIMD_EMIT_SETTER },
393 { SN_set_V2, 2, SIMD_EMIT_SETTER },
394 { SN_set_V3, 3, SIMD_EMIT_SETTER },
395 { SN_set_V4, 4, SIMD_EMIT_SETTER },
396 { SN_set_V5, 5, SIMD_EMIT_SETTER },
397 { SN_set_V6, 6, SIMD_EMIT_SETTER },
398 { SN_set_V7, 7, SIMD_EMIT_SETTER },
401 static const SimdIntrinsc vector8s_intrinsics[] = {
402 { SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
403 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_EMIT_BINARY },
404 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
405 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_EMIT_BINARY },
406 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
407 { SN_LogicalRightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
408 { SN_Max, OP_PMAXW, SIMD_EMIT_BINARY },
409 { SN_Min, OP_PMINW, SIMD_EMIT_BINARY },
410 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_EMIT_BINARY },
411 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
412 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
413 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
414 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
415 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
416 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
417 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
418 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
419 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
420 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
421 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
422 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
423 { SN_get_V0, 0, SIMD_EMIT_GETTER },
424 { SN_get_V1, 1, SIMD_EMIT_GETTER },
425 { SN_get_V2, 2, SIMD_EMIT_GETTER },
426 { SN_get_V3, 3, SIMD_EMIT_GETTER },
427 { SN_get_V4, 4, SIMD_EMIT_GETTER },
428 { SN_get_V5, 5, SIMD_EMIT_GETTER },
429 { SN_get_V6, 6, SIMD_EMIT_GETTER },
430 { SN_get_V7, 7, SIMD_EMIT_GETTER },
431 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
432 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
433 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
434 { SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
435 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
436 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
437 { SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
438 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
439 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
440 { SN_op_RightShift, OP_PSARW, SIMD_EMIT_SHIFT },
441 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
442 { SN_set_V0, 0, SIMD_EMIT_SETTER },
443 { SN_set_V1, 1, SIMD_EMIT_SETTER },
444 { SN_set_V2, 2, SIMD_EMIT_SETTER },
445 { SN_set_V3, 3, SIMD_EMIT_SETTER },
446 { SN_set_V4, 4, SIMD_EMIT_SETTER },
447 { SN_set_V5, 5, SIMD_EMIT_SETTER },
448 { SN_set_V6, 6, SIMD_EMIT_SETTER },
449 { SN_set_V7, 7, SIMD_EMIT_SETTER },
452 static const SimdIntrinsc vector16b_intrinsics[] = {
453 { SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
454 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
455 { SN_Average, OP_PAVGB_UN, SIMD_EMIT_BINARY },
456 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
457 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
458 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
459 { SN_Max, OP_PMAXB_UN, SIMD_EMIT_BINARY },
460 { SN_Min, OP_PMINB_UN, SIMD_EMIT_BINARY },
461 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
462 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
463 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
464 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
465 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
466 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
467 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_EMIT_BINARY },
468 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
469 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
470 { SN_get_V0, 0, SIMD_EMIT_GETTER },
471 { SN_get_V1, 1, SIMD_EMIT_GETTER },
472 { SN_get_V10, 10, SIMD_EMIT_GETTER },
473 { SN_get_V11, 11, SIMD_EMIT_GETTER },
474 { SN_get_V12, 12, SIMD_EMIT_GETTER },
475 { SN_get_V13, 13, SIMD_EMIT_GETTER },
476 { SN_get_V14, 14, SIMD_EMIT_GETTER },
477 { SN_get_V15, 15, SIMD_EMIT_GETTER },
478 { SN_get_V2, 2, SIMD_EMIT_GETTER },
479 { SN_get_V3, 3, SIMD_EMIT_GETTER },
480 { SN_get_V4, 4, SIMD_EMIT_GETTER },
481 { SN_get_V5, 5, SIMD_EMIT_GETTER },
482 { SN_get_V6, 6, SIMD_EMIT_GETTER },
483 { SN_get_V7, 7, SIMD_EMIT_GETTER },
484 { SN_get_V8, 8, SIMD_EMIT_GETTER },
485 { SN_get_V9, 9, SIMD_EMIT_GETTER },
486 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
487 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
488 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
489 { SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
490 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
491 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
492 { SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
493 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
494 { SN_set_V0, 0, SIMD_EMIT_SETTER },
495 { SN_set_V1, 1, SIMD_EMIT_SETTER },
496 { SN_set_V10, 10, SIMD_EMIT_SETTER },
497 { SN_set_V11, 11, SIMD_EMIT_SETTER },
498 { SN_set_V12, 12, SIMD_EMIT_SETTER },
499 { SN_set_V13, 13, SIMD_EMIT_SETTER },
500 { SN_set_V14, 14, SIMD_EMIT_SETTER },
501 { SN_set_V15, 15, SIMD_EMIT_SETTER },
502 { SN_set_V2, 2, SIMD_EMIT_SETTER },
503 { SN_set_V3, 3, SIMD_EMIT_SETTER },
504 { SN_set_V4, 4, SIMD_EMIT_SETTER },
505 { SN_set_V5, 5, SIMD_EMIT_SETTER },
506 { SN_set_V6, 6, SIMD_EMIT_SETTER },
507 { SN_set_V7, 7, SIMD_EMIT_SETTER },
508 { SN_set_V8, 8, SIMD_EMIT_SETTER },
509 { SN_set_V9, 9, SIMD_EMIT_SETTER },
516 static const SimdIntrinsc vector16sb_intrinsics[] = {
517 { SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
518 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_EMIT_BINARY },
519 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
520 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_EMIT_BINARY },
521 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
522 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
523 { SN_Max, OP_PMAXB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
524 { SN_Min, OP_PMINB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
525 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
526 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
527 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
528 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
529 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
530 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_EMIT_BINARY },
531 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
532 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
533 { SN_get_V0, 0, SIMD_EMIT_GETTER },
534 { SN_get_V1, 1, SIMD_EMIT_GETTER },
535 { SN_get_V10, 10, SIMD_EMIT_GETTER },
536 { SN_get_V11, 11, SIMD_EMIT_GETTER },
537 { SN_get_V12, 12, SIMD_EMIT_GETTER },
538 { SN_get_V13, 13, SIMD_EMIT_GETTER },
539 { SN_get_V14, 14, SIMD_EMIT_GETTER },
540 { SN_get_V15, 15, SIMD_EMIT_GETTER },
541 { SN_get_V2, 2, SIMD_EMIT_GETTER },
542 { SN_get_V3, 3, SIMD_EMIT_GETTER },
543 { SN_get_V4, 4, SIMD_EMIT_GETTER },
544 { SN_get_V5, 5, SIMD_EMIT_GETTER },
545 { SN_get_V6, 6, SIMD_EMIT_GETTER },
546 { SN_get_V7, 7, SIMD_EMIT_GETTER },
547 { SN_get_V8, 8, SIMD_EMIT_GETTER },
548 { SN_get_V9, 9, SIMD_EMIT_GETTER },
549 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
550 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
551 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
552 { SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
553 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
554 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
555 { SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
556 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
557 { SN_set_V0, 0, SIMD_EMIT_SETTER },
558 { SN_set_V1, 1, SIMD_EMIT_SETTER },
559 { SN_set_V10, 10, SIMD_EMIT_SETTER },
560 { SN_set_V11, 11, SIMD_EMIT_SETTER },
561 { SN_set_V12, 12, SIMD_EMIT_SETTER },
562 { SN_set_V13, 13, SIMD_EMIT_SETTER },
563 { SN_set_V14, 14, SIMD_EMIT_SETTER },
564 { SN_set_V15, 15, SIMD_EMIT_SETTER },
565 { SN_set_V2, 2, SIMD_EMIT_SETTER },
566 { SN_set_V3, 3, SIMD_EMIT_SETTER },
567 { SN_set_V4, 4, SIMD_EMIT_SETTER },
568 { SN_set_V5, 5, SIMD_EMIT_SETTER },
569 { SN_set_V6, 6, SIMD_EMIT_SETTER },
570 { SN_set_V7, 7, SIMD_EMIT_SETTER },
571 { SN_set_V8, 8, SIMD_EMIT_SETTER },
572 { SN_set_V9, 9, SIMD_EMIT_SETTER },
/* Bitmask of SIMD versions supported by the host CPU; filled in by
 * mono_simd_intrinsics_init from mono_arch_cpu_enumerate_simd_versions. */
575 static guint32 simd_supported_versions;
577 /*TODO match using number of parameters as well*/
579 simd_intrinsic_compare_by_name (const void *key, const void *value)
581 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
/*
 * Per-vreg flags for the xzero-simplification pass.
 * NOTE(review): the enum header and VREG_USED (0x01) are reconstructed — the
 * original lines are missing from this chunk; VREG_USED is referenced by
 * mono_simd_simplify_indirection and 0x01 is the only free bit in the set.
 */
enum {
	VREG_USED = 0x01,
	VREG_HAS_XZERO_BB0 = 0x02,
	VREG_HAS_OTHER_OP_BB0 = 0x04,
	VREG_SINGLE_BB_USE = 0x08,
	VREG_MANY_BB_USE = 0x10,
};
593 mono_simd_intrinsics_init (void)
595 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
596 /*TODO log the supported flags*/
599 static inline gboolean
600 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
602 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
603 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
604 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
605 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
611 static inline gboolean
612 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
614 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
617 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
618 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
619 vreg_flags [reg] |= VREG_MANY_BB_USE;
620 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
622 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
623 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
624 target_bb [reg] = bb;
625 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
632 This pass recalculates which vars need MONO_INST_INDIRECT.
634 We cannot do this for non SIMD vars since code like mono_get_vtable_var
635 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
638 mono_simd_simplify_indirection (MonoCompile *cfg)
641 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
645 for (i = 0; i < cfg->num_varinfo; i++) {
646 MonoInst *var = cfg->varinfo [i];
647 if (var->klass->simd_type) {
648 var->flags &= ~MONO_INST_INDIRECT;
649 max_vreg = MAX (var->dreg, max_vreg);
653 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
654 if (!first_bb && bb->code)
656 for (ins = bb->code; ins; ins = ins->next) {
657 if (ins->opcode == OP_LDADDR) {
658 MonoInst *var = (MonoInst*)ins->inst_p0;
659 if (var->klass->simd_type) {
660 var->flags |= MONO_INST_INDIRECT;
666 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
667 vreg_flags = g_malloc0 (max_vreg + 1);
668 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
670 for (i = 0; i < cfg->num_varinfo; i++) {
671 MonoInst *var = cfg->varinfo [i];
672 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
673 vreg_flags [var->dreg] = VREG_USED;
674 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
678 /*Scan the first basic block looking xzeros not used*/
679 for (ins = first_bb->code; ins; ins = ins->next) {
681 int sregs [MONO_MAX_SRC_REGS];
683 if (ins->opcode == OP_XZERO) {
684 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
685 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
686 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
690 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
692 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
694 num_sregs = mono_inst_get_src_registers (ins, sregs);
695 for (i = 0; i < num_sregs; ++i) {
696 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
701 if (IS_DEBUG_ON (cfg)) {
702 for (i = 0; i < cfg->num_varinfo; i++) {
703 MonoInst *var = cfg->varinfo [i];
704 if (var->klass->simd_type) {
705 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
706 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
707 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
708 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
713 /*TODO stop here if no var is xzero only*/
716 Scan all other bb and check if it has only one other use
717 Ideally this would be done after an extended bb formation pass
719 FIXME This pass could use dominator information to properly
720 place the XZERO on the bb that dominates all uses of the var,
721 but this will have zero effect with the current local reg alloc
723 TODO simply the use of flags.
726 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
727 for (ins = bb->code; ins; ins = ins->next) {
729 int sregs [MONO_MAX_SRC_REGS];
731 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
733 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
735 num_sregs = mono_inst_get_src_registers (ins, sregs);
736 for (i = 0; i < num_sregs; ++i) {
737 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
738 max_vreg, vreg_flags, target_bb))
744 for (i = 0; i < cfg->num_varinfo; i++) {
745 MonoInst *var = cfg->varinfo [i];
746 if (!var->klass->simd_type)
748 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
749 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
750 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
751 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
753 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
755 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
757 int sregs [MONO_MAX_SRC_REGS];
758 gboolean found = FALSE;
760 num_sregs = mono_inst_get_src_registers (ins, sregs);
761 for (j = 0; j < num_sregs; ++j) {
762 if (sregs [i] == var->dreg)
765 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
766 if (ins->dreg == var->dreg && !found) {
770 MONO_INST_NEW (cfg, tmp, OP_XZERO);
771 tmp->dreg = var->dreg;
772 tmp->type = STACK_VTYPE;
773 tmp->klass = var->klass;
774 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
780 for (ins = first_bb->code; ins; ins = ins->next) {
781 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
790 * This function expect that src be a value.
/*
 * get_simd_vreg: infer the vreg holding the SIMD value produced by |src|.
 * Visible cases: an OP_XMOVE source and a STACK_VTYPE value (the return
 * statements for those cases are elided in this listing). Any other shape
 * is a JIT bug: warn, dump the instruction and abort.
 */
793 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
795 if (src->opcode == OP_XMOVE) {
797 } else if (src->type == STACK_VTYPE) {
800 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
801 mono_print_ins (src);
802 g_assert_not_reached ();
806 * This function will load the value if needed.
/*
 * load_simd_vreg: like get_simd_vreg, but emits a load when |src| is an
 * address rather than a value:
 *   - OP_XMOVE: forward the moved value (return elided in this listing);
 *   - OP_LDADDR: use the dreg of the variable whose address was taken;
 *   - STACK_VTYPE: the value is already in a vreg;
 *   - STACK_PTR/STACK_MP: emit OP_LOADX_MEMBASE through the pointer.
 * Anything else is a JIT bug: warn, dump the instruction and abort.
 */
809 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
811 if (src->opcode == OP_XMOVE) {
813 } else if (src->opcode == OP_LDADDR) {
814 int res = ((MonoInst*)src->inst_p0)->dreg;
817 } else if (src->type == STACK_VTYPE) {
819 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
822 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
823 ins->klass = cmethod->klass;
824 ins->sreg1 = src->dreg;
825 ins->type = STACK_VTYPE;
/* VTYPE dregs are allocated with alloc_ireg throughout this file. */
826 ins->dreg = alloc_ireg (cfg);
827 MONO_ADD_INS (cfg->cbb, ins);
830 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
831 mono_print_ins (src);
832 g_assert_not_reached ();
/*
 * get_int_to_float_spill_area: lazily create (and cache on the cfg) an
 * int32 local used as a scratch memory slot when moving raw bits between
 * integer and FP/xmm registers.
 */
836 get_int_to_float_spill_area (MonoCompile *cfg)
838 if (!cfg->iconv_raw_var) {
839 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
/* VOLATILE keeps the var in memory so it can serve as a spill slot. */
840 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
842 return cfg->iconv_raw_var;
845 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily create/cache a double-sized local used as a scratch spill slot. */
847 get_double_spill_area (MonoCompile *cfg)
849 if (!cfg->fconv_to_r8_x_var) {
850 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
851 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
853 return cfg->fconv_to_r8_x_var;
/*
 * get_simd_ctor_spill_area: lazily create/cache a local of a vector class,
 * used as a staging area when a SIMD .ctor has to build the value in memory.
 * NOTE(review): the cached var keeps the klass of the first caller;
 * presumably any 16-byte vector class is acceptable here — confirm.
 */
856 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
858 if (!cfg->simd_ctor_var) {
859 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
860 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
862 return cfg->simd_ctor_var;
/*
 * simd_intrinsic_emit_binary: emit a two-operand SIMD op
 * (dreg <- opcode (args [0], args [1])) into the current basic block.
 * intrinsic->flags is passed through in inst_c0 for opcodes that take a
 * sub-operation selector.
 */
866 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
869 int left_vreg, right_vreg;
871 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
872 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
875 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
876 ins->klass = cmethod->klass;
877 ins->sreg1 = left_vreg;
878 ins->sreg2 = right_vreg;
879 ins->type = STACK_VTYPE;
/* FIX: removed a duplicated "ins->klass = cmethod->klass;" assignment
 * that repeated the one above with the same value. */
881 ins->dreg = alloc_ireg (cfg);
882 ins->inst_c0 = intrinsic->flags;
883 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_unary: emit a one-operand SIMD op on args [0].
 * (The sreg1 assignment line is elided from this listing.)
 */
888 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
893 vreg = get_simd_vreg (cfg, cmethod, args [0]);
895 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
896 ins->klass = cmethod->klass;
898 ins->type = STACK_VTYPE;
899 ins->dreg = alloc_ireg (cfg);
900 MONO_ADD_INS (cfg->cbb, ins);
/*
 * mono_type_to_extract_op: map a scalar element type to the OP_EXTRACT_*
 * opcode that pulls one element out of an xmm value. Case labels are
 * elided in this listing; the visible returns cover I1/U1/I2/U2/I4.
 */
905 mono_type_to_extract_op (MonoType *type)
907 switch (type->type) {
909 return OP_EXTRACT_I1;
911 return OP_EXTRACT_U1;
913 return OP_EXTRACT_I2;
915 return OP_EXTRACT_U2;
919 return OP_EXTRACT_I4;
/* Any other element type is a JIT bug. */
921 g_assert_not_reached ();
924 /*Returns the amount to shift the element index to get the dword it belongs to*/
926 mono_type_elements_shift_bits (MonoType *type)
928 switch (type->type) {
/* (All case labels and their returns are elided from this listing.) */
940 g_assert_not_reached ();
/*
 * mono_type_to_slow_insert_op: map a scalar element type to the
 * OP_INSERTX_*_SLOW opcode that writes one element into an xmm value
 * ("slow" presumably meaning a multi-instruction/memory sequence —
 * confirm against the backend). Case labels are elided in this listing.
 */
944 mono_type_to_slow_insert_op (MonoType *type)
946 switch (type->type) {
949 return OP_INSERTX_U1_SLOW;
955 return OP_INSERTX_I4_SLOW;
958 return OP_INSERTX_I8_SLOW;
960 return OP_INSERTX_R4_SLOW;
962 return OP_INSERTX_R8_SLOW;
964 g_assert_not_reached ();
/*
 * simd_intrinsic_emit_setter: implement a vector set_XXX property call.
 * Elements of 2/4/8 bytes use a single INSERTX_*_SLOW; R4/R8 inserts also
 * need a memory spill slot. The fall-through path (1-byte elements,
 * judging by the sizes handled above — confirm) first extracts the
 * containing 16-bit lane with EXTRACTX_U2, then re-inserts via
 * INSERTX_U1_SLOW.
 */
968 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
971 MonoMethodSignature *sig = mono_method_signature (cmethod);
973 size = mono_type_size (sig->params [0], &align);
975 if (size == 2 || size == 4 || size == 8) {
976 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
977 ins->klass = cmethod->klass;
978 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
979 ins->dreg = ins->sreg1 = load_simd_vreg (cfg, cmethod, args [0]);
980 ins->sreg2 = args [1]->dreg;
/* intrinsic->opcode carries the element index for setters. */
981 ins->inst_c0 = intrinsic->opcode;
982 if (sig->params [0]->type == MONO_TYPE_R4)
983 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
984 else if (sig->params [0]->type == MONO_TYPE_R8)
985 ins->backend.spill_var = get_double_spill_area (cfg);
986 MONO_ADD_INS (cfg->cbb, ins);
990 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
991 ins->klass = cmethod->klass;
992 ins->sreg1 = sreg = load_simd_vreg (cfg, cmethod, args [0]);
993 ins->type = STACK_I4;
994 ins->dreg = vreg = alloc_ireg (cfg);
/* Two byte elements per 16-bit lane, hence the index divided by 2. */
995 ins->inst_c0 = intrinsic->opcode / 2;
996 MONO_ADD_INS (cfg->cbb, ins);
998 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
999 ins->klass = cmethod->klass;
1001 ins->sreg2 = args [1]->dreg;
1003 ins->inst_c0 = intrinsic->opcode;
1004 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_getter: implement a vector get_XXX property call.
 * intrinsic->opcode packs a shuffle selector (bits above shift_bits) and
 * an in-dword element index (the low shift_bits bits): when the selector
 * is non-zero a PSHUFLED first moves the element into place, then an
 * OP_EXTRACT_* pulls it into an integer register. R4 results are moved to
 * the FP stack via OP_ICONV_TO_R8_RAW using the shared int spill slot.
 */
1011 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1014 MonoMethodSignature *sig = mono_method_signature (cmethod);
1015 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1017 vreg = load_simd_vreg (cfg, cmethod, args [0]);
1019 if (intrinsic->opcode >> shift_bits) {
1020 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1021 ins->klass = cmethod->klass;
1023 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1024 ins->type = STACK_VTYPE;
1025 ins->dreg = vreg = alloc_ireg (cfg);
1026 MONO_ADD_INS (cfg->cbb, ins);
1029 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1030 ins->klass = cmethod->klass;
1032 ins->type = STACK_I4;
1033 ins->dreg = vreg = alloc_ireg (cfg);
1034 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1035 MONO_ADD_INS (cfg->cbb, ins);
1037 if (sig->ret->type == MONO_TYPE_R4) {
1038 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1039 ins->klass = mono_defaults.single_class;
1041 ins->type = STACK_R8;
1042 ins->dreg = alloc_freg (cfg);
1043 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1044 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_long_getter: getter for 8-byte elements. Extracts
 * an I8 or R8 directly; the R8 path goes through the shared double spill
 * area and an freg, the I8 path through an lreg.
 */
1050 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1054 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1056 vreg = load_simd_vreg (cfg, cmethod, args [0]);
1058 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1059 ins->klass = cmethod->klass;
1061 ins->inst_c0 = intrinsic->opcode;
1063 ins->type = STACK_R8;
1064 ins->dreg = alloc_freg (cfg);
1065 ins->backend.spill_var = get_double_spill_area (cfg);
1067 ins->type = STACK_I8;
1068 ins->dreg = alloc_lreg (cfg);
1070 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_ctor: implement a vector .ctor call.
 * One-argument ctors map to a single SIMD opcode (intrinsic->opcode, a
 * replicate/expand-style op — confirm against the intrinsics tables);
 * when 'this' came from OP_LDADDR of a local, the LDADDR is nullified so
 * the result can live in a register, otherwise the result is stored back
 * through the 'this' pointer with OP_STOREX_MEMBASE.
 * N-argument ctors store each argument into a staging area (the shared
 * ctor spill local, or directly through 'this'), and for the local-var
 * case load the assembled vector back with OP_LOADX_MEMBASE.
 */
1076 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1078 MonoInst *ins = NULL;
1080 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1081 MonoMethodSignature *sig = mono_method_signature (cmethod);
1082 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1083 int arg_size = mono_type_size (sig->params [0], &i);
1085 if (sig->param_count == 1) {
1089 dreg = args [0]->inst_i0->dreg;
1090 NULLIFY_INS (args [0]);
1092 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1093 dreg = alloc_ireg (cfg);
1096 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1097 ins->klass = cmethod->klass;
1098 ins->sreg1 = args [1]->dreg;
1099 ins->type = STACK_VTYPE;
1102 MONO_ADD_INS (cfg->cbb, ins);
/* R4/R8 single-arg ctors need a spill slot to move the scalar in. */
1103 if (sig->params [0]->type == MONO_TYPE_R4)
1104 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1105 else if (sig->params [0]->type == MONO_TYPE_R8)
1106 ins->backend.spill_var = get_double_spill_area (cfg);
1109 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
/* For membase stores, dreg carries the destination base address. */
1110 ins->dreg = args [0]->dreg;
1112 MONO_ADD_INS (cfg->cbb, ins);
1118 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1119 MONO_ADD_INS (cfg->cbb, ins);
1120 addr_reg = ins->dreg;
1122 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1123 addr_reg = args [0]->dreg;
/* Store args last-to-first; args [0] is 'this', elements start at [1]. */
1126 for (i = sig->param_count - 1; i >= 0; --i) {
1127 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1130 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1131 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1132 NULLIFY_INS (args [0]);
1134 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1135 ins->klass = cmethod->klass;
1136 ins->sreg1 = addr_reg;
1137 ins->type = STACK_VTYPE;
1139 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_cast: reinterpret-cast between vector types — a
 * plain OP_XMOVE into a fresh vreg retyped with the target class; no data
 * conversion happens. (The sreg1 assignment is elided in this listing.)
 */
1145 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1150 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1152 //TODO macroize this
1153 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1154 ins->klass = cmethod->klass;
1155 ins->type = STACK_VTYPE;
1157 ins->dreg = alloc_ireg (cfg);
1158 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_shift: emit a packed shift. A constant count uses
 * the immediate form (inst_imm, the ICONST is nullified); a variable
 * count is first moved into an xmm register via OP_ICONV_TO_X and the
 * opcode is bumped to the register-count variant.
 */
1163 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1166 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1168 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1170 if (args [1]->opcode != OP_ICONST) {
1171 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1172 ins->klass = mono_defaults.int32_class;
1173 ins->sreg1 = args [1]->dreg;
1174 ins->type = STACK_I4;
1175 ins->dreg = vreg2 = alloc_ireg (cfg);
1176 MONO_ADD_INS (cfg->cbb, ins);
1178 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1181 MONO_INST_NEW (cfg, ins, opcode);
1182 ins->klass = cmethod->klass;
1186 if (args [1]->opcode == OP_ICONST) {
1187 ins->inst_imm = args [1]->inst_c0;
1188 NULLIFY_INS (args [1]);
1191 ins->type = STACK_VTYPE;
1192 ins->dreg = alloc_ireg (cfg);
1193 MONO_ADD_INS (cfg->cbb, ins);
/* TRUE for the packed integer equality compares; relies on the
 * OP_PCMPEQB..OP_PCMPEQQ opcodes being declared contiguously. */
1197 static inline gboolean
1198 mono_op_is_packed_compare (int op)
1200 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/*
 * simd_intrinsic_emit_equality: implement op_Equality/op_Inequality.
 * Compare the two vectors element-wise, collapse the byte-wise result
 * into a 16-bit mask with OP_EXTRACT_MASK, then turn the mask into a
 * boolean with a compare + set sequence. FP compares only have an
 * unordered/not-equal form, hence the OR-semantics branch below.
 */
1204 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1207 int left_vreg, right_vreg, tmp_vreg;
1209 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1210 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1213 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1214 ins->klass = cmethod->klass;
1215 ins->sreg1 = left_vreg;
1216 ins->sreg2 = right_vreg;
1217 ins->type = STACK_VTYPE;
/* FIX: removed a duplicated "ins->klass = cmethod->klass;" assignment
 * that repeated the one above with the same value. */
1219 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1220 ins->inst_c0 = intrinsic->flags;
1221 MONO_ADD_INS (cfg->cbb, ins);
1223 /*FIXME the next ops are SSE specific*/
1224 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1225 ins->klass = cmethod->klass;
1226 ins->sreg1 = tmp_vreg;
1227 ins->type = STACK_I4;
1228 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1229 MONO_ADD_INS (cfg->cbb, ins);
1231 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1232 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
/* All 16 mask bits set <=> every byte compared equal. */
1233 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1234 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1236 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1237 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1239 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_shuffle: emit a shuffle with a compile-time-constant
 * selector. Non-constant selectors are rejected outright.
 */
1245 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1250 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
1252 if (args [1]->opcode != OP_ICONST) {
1253 g_warning ("Shuffle with non literals is not yet supported");
1254 g_assert_not_reached ();
1256 vreg = get_simd_vreg (cfg, cmethod, args [0]);
/* NOTE(review): args [1] is nullified here but its inst_c0 is read below;
 * this works only if NULLIFY_INS leaves inst_c0 intact — confirm, or move
 * the NULLIFY_INS after the read. */
1257 NULLIFY_INS (args [1]);
1259 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1260 ins->klass = cmethod->klass;
1262 ins->inst_c0 = args [1]->inst_c0;
1263 ins->type = STACK_VTYPE;
1264 ins->dreg = alloc_ireg (cfg);
1265 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_load_aligned: load a full vector from the (assumed
 * 16-byte aligned — caller's responsibility) address in args [0].
 */
1270 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1274 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1275 ins->klass = cmethod->klass;
1276 ins->sreg1 = args [0]->dreg;
1277 ins->type = STACK_VTYPE;
1278 ins->dreg = alloc_ireg (cfg);
1279 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_store: store the vector in args [1] through the
 * address in args [0], using the store opcode from the intrinsic table.
 */
1284 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1289 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1291 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1292 ins->klass = cmethod->klass;
/* For membase stores, dreg carries the destination base address. */
1293 ins->dreg = args [0]->dreg;
1295 ins->type = STACK_VTYPE;
1296 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_extract_mask: collapse the vector's byte sign bits
 * into an int via OP_EXTRACT_MASK (PMOVMSKB-style). (The sreg1 assignment
 * is elided in this listing.)
 */
1301 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1306 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1308 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1309 ins->klass = cmethod->klass;
1311 ins->type = STACK_I4;
1312 ins->dreg = alloc_ireg (cfg);
1313 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_prefetch: emit a memory prefetch for the address in
 * args [0]; the prefetch hint travels in backend.arg_info.
 */
1319 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1323 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1324 ins->klass = cmethod->klass;
1325 ins->sreg1 = args [0]->dreg;
1326 ins->backend.arg_info = intrinsic->flags;
1327 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_version_name: human-readable name for a SIMD_VERSION_* constant,
 * used in diagnostics. (The returned string literals and the default case
 * are elided from this listing.)
 */
1332 simd_version_name (guint32 version)
1335 case SIMD_VERSION_SSE1:
1337 case SIMD_VERSION_SSE2:
1339 case SIMD_VERSION_SSE3:
1341 case SIMD_VERSION_SSSE3:
1343 case SIMD_VERSION_SSE41:
1345 case SIMD_VERSION_SSE42:
1347 case SIMD_VERSION_SSE4a:
/*
 * emit_intrinsics: look up cmethod by name in the per-vector-type
 * intrinsics table (which therefore must be sorted by name for bsearch),
 * check the required SIMD instruction-set level against what the CPU
 * supports, and dispatch to the matching emitter.
 */
1354 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1356 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1358 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1361 if (IS_DEBUG_ON (cfg)) {
1363 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1364 max = fsig->param_count + fsig->hasthis;
1365 for (i = 0; i < max; ++i) {
1366 printf ("param %d: ", i);
1367 mono_print_ins (args [i]);
/* Bail out if the CPU lacks the instruction set this intrinsic needs. */
1370 if (result->simd_version && !((1 << result->simd_version) & simd_supported_versions)) {
1371 if (IS_DEBUG_ON (cfg))
1372 printf ("function %s::%s/%d requires unsuported SIMD instruction set %s \n", cmethod->klass->name, cmethod->name, fsig->param_count, simd_version_name (result->simd_version));
1376 switch (result->simd_emit_mode) {
1377 case SIMD_EMIT_BINARY:
1378 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1379 case SIMD_EMIT_UNARY:
1380 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1381 case SIMD_EMIT_SETTER:
1382 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1383 case SIMD_EMIT_GETTER:
1384 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1385 case SIMD_EMIT_GETTER_QWORD:
1386 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1387 case SIMD_EMIT_CTOR:
1388 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1389 case SIMD_EMIT_CAST:
1390 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1391 case SIMD_EMIT_SHUFFLE:
1392 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1393 case SIMD_EMIT_SHIFT:
1394 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1395 case SIMD_EMIT_EQUALITY:
1396 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1397 case SIMD_EMIT_LOAD_ALIGNED:
1398 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1399 case SIMD_EMIT_STORE:
1400 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1401 case SIMD_EMIT_EXTRACT_MASK:
1402 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1403 case SIMD_EMIT_PREFETCH:
1404 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1406 g_assert_not_reached ();
/*
 * mono_emit_vector_ldelema: compute the address of array element |index|
 * for a vector-sized access and return the resulting address vreg.
 * When check_bounds is set, both the first element and the last element
 * the 16-byte access will touch (index + 16/size - 1) are bounds-checked.
 */
1410 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1414 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1416 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1417 mult_reg = alloc_preg (cfg);
1418 array_reg = arr->dreg;
1419 index_reg = index->dreg;
1421 #if SIZEOF_VOID_P == 8
1422 /* The array reg is 64 bits but the index reg is only 32 */
1423 index2_reg = alloc_preg (cfg);
1424 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1426 index2_reg = index_reg;
1428 index3_reg = alloc_preg (cfg);
1431 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1432 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1433 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
/* addr = array + index * size + offsetof (MonoArray, vector) */
1436 add_reg = alloc_preg (cfg);
1438 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1439 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1440 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
1441 ins->type = STACK_PTR;
1442 MONO_ADD_INS (cfg->cbb, ins);
1448 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1450 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1452 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1454 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1455 load->klass = cmethod->klass;
1457 load->type = STACK_VTYPE;
1458 load->dreg = alloc_ireg (cfg);
1459 MONO_ADD_INS (cfg->cbb, load);
1463 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1465 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1466 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1468 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1469 store->klass = cmethod->klass;
1471 store->sreg1 = vreg;
1472 MONO_ADD_INS (cfg->cbb, store);
1476 if (!strcmp ("IsAligned", cmethod->name)) {
1478 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1480 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1481 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1482 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1483 MONO_ADD_INS (cfg->cbb, ins);
/*
 * emit_simd_runtime_intrinsics: SimdRuntime.get_AccelMode is folded to a
 * constant holding the supported-versions bitmask at JIT time.
 * (NOTE: this constant-folding is what breaks get_AccelMode under AOT —
 * see the TODO at the top of the file.)
 */
1491 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1493 if (!strcmp ("get_AccelMode", cmethod->name)) {
1495 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1502 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1504 const char *class_name;
1506 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1509 class_name = cmethod->klass->name;
1510 if (!strcmp ("SimdRuntime", class_name))
1511 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1513 if (!strcmp ("ArrayExtensions", class_name))
1514 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1516 if (!strcmp ("VectorOperations", class_name)) {
1517 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1519 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1520 } else if (!cmethod->klass->simd_type)
1523 cfg->uses_simd_intrinsics = 1;
1524 if (!strcmp ("Vector2d", class_name))
1525 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1526 if (!strcmp ("Vector4f", class_name))
1527 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1528 if (!strcmp ("Vector2ul", class_name))
1529 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1530 if (!strcmp ("Vector2l", class_name))
1531 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1532 if (!strcmp ("Vector4ui", class_name))
1533 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1534 if (!strcmp ("Vector4i", class_name))
1535 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1536 if (!strcmp ("Vector8us", class_name))
1537 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1538 if (!strcmp ("Vector8s", class_name))
1539 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1540 if (!strcmp ("Vector16b", class_name))
1541 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1542 if (!strcmp ("Vector16sb", class_name))
1543 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));