 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
17 General notes on SIMD intrinsics
19 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding a indirection reduction pass after the dce pass.
38 TODO extend bounds checking code to support for range checking.
40 General notes for SIMD intrinsics.
42 -Bad extractor and constructor performance
43 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
44 It will be loaded in the FP stack just to be pushed on the call stack.
46 A similar thing happens with Vector4f constructor that require float vars to be
48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.
51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and promote further optimizations.
58 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
#ifdef MONO_ARCH_SIMD_INTRINSICS

/*
 * Debug tracing for this pass: active when the method is compiled with
 * verbosity >= 3 (mono -v -v -v).  Switch to the commented-out
 * definition below to force tracing off at compile time.
 */
//#define IS_DEBUG_ON(cfg) (0)

#define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
#define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
	/* Getter that reads a 64-bit lane (used by get_X/get_Y of the
	 * Vector2d/Vector2l/Vector2ul tables below). */
	SIMD_EMIT_GETTER_QWORD,
	/* Emits an aligned 16-byte vector load (LoadAligned). */
	SIMD_EMIT_LOAD_ALIGNED,
	/* Emits a byte-mask extraction (ExtractByteMask). */
	SIMD_EMIT_EXTRACT_MASK,
/*
 * Interned method-name table.  With HAVE_ARRAY_ELEM_INIT the names are
 * packed into a single struct of char arrays; each SN_* constant becomes
 * the offsetof() of its string inside that struct, so a name is recovered
 * by adding the constant to the struct's base address.  Without it, a
 * plain array of string pointers is used and SN_* constants are indices.
 * simd-methods.h is included repeatedly with different SIMD_METHOD()
 * expansions to build each view of the same list.
 */
#ifdef HAVE_ARRAY_ELEM_INIT
#define MSGSTRFIELD(line) MSGSTRFIELD1(line)
#define MSGSTRFIELD1(line) str##line
static const struct msgstr_t {
#define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
#include "simd-methods.h"
#define SIMD_METHOD(str,name) str,
#include "simd-methods.h"
#define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
#include "simd-methods.h"
/* SN_* constant -> string: offset from the packed struct's base. */
#define method_name(idx) ((const char*)&method_names + (idx))
#define SIMD_METHOD(str,name) str,
static const char * const method_names [] = {
#include "simd-methods.h"
#define SIMD_METHOD(str,name) name,
#include "simd-methods.h"
/* SN_* constant -> string: plain array index. */
#define method_name(idx) (method_names [(idx)])
	/* Packed into one byte: the emit strategy (SIMD_EMIT_*) and the
	 * minimum required SIMD instruction-set version (SIMD_VERSION_*,
	 * 0 meaning the baseline).  These match initializer positions 3
	 * and 4 in the tables below. */
	guint8 simd_emit_mode : 4;
	guint8 simd_version : 4;
/*
 * Intrinsics recognized on Mono.Simd.Vector4f.
 * Entry layout: { name, opcode, emit mode, min SIMD version, extra flag }
 * with trailing fields defaulting to 0.  Each table MUST stay sorted by
 * method-name string: lookup compares names with strcmp
 * (simd_intrinsic_compare_by_name); ".ctor" sorts before the letters.
 */
static const SimdIntrinsc vector4f_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R4, SIMD_EMIT_CTOR },
	{ SN_AddSub, OP_ADDSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_AndNot, OP_ANDNPS, SIMD_EMIT_BINARY },
	/* All comparisons share OP_COMPPS; the trailing SIMD_COMP_* flag
	 * selects the immediate predicate. */
	{ SN_CompareEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_CompareLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
	{ SN_CompareLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
	{ SN_CompareNotEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_CompareNotLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
	{ SN_CompareNotLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
	{ SN_CompareOrdered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
	{ SN_CompareUnordered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
	{ SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
	{ SN_DuplicateLow, OP_DUPPS_LOW, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
	{ SN_HorizontalAdd, OP_HADDPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_HorizontalSub, OP_HSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_EMIT_BINARY },
	{ SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_EMIT_BINARY },
	{ SN_InvSqrt, OP_RSQRTPS, SIMD_EMIT_UNARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_MAXPS, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPS, SIMD_EMIT_BINARY },
	/* Prefetch variants: opcode unused, the trailing flag selects the
	 * prefetch locality hint. */
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_Reciprocal, OP_RCPPS, SIMD_EMIT_UNARY },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
	{ SN_Sqrt, OP_SQRTPS, SIMD_EMIT_UNARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_EMIT_STORE },
	/* For getters/setters the "opcode" field is the lane index. */
	{ SN_get_W, 3, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_ADDPS, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_ANDPS, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_ORPS, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPS, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_XORPS, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_Multiply, OP_MULPS, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPS, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_EMIT_SETTER },
/*
 * Intrinsics recognized on Mono.Simd.Vector2d (two doubles).
 * Same layout and sorted-by-name requirement as vector4f_intrinsics.
 * NOTE(review): the *PD comparisons are tagged SIMD_VERSION_SSE1 even
 * though packed-double ops are SSE2 instructions — presumably SSE1 is
 * the 0/default tag and Vector2d implies SSE2 elsewhere; confirm.
 */
static const SimdIntrinsc vector2d_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R8, SIMD_EMIT_CTOR },
	{ SN_AddSub, OP_ADDSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_AndNot, OP_ANDNPD, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_CompareLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
	{ SN_CompareLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
	{ SN_CompareNotEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_CompareNotLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
	{ SN_CompareNotLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
	{ SN_CompareOrdered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
	{ SN_CompareUnordered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
	{ SN_Duplicate, OP_DUPPD, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
	{ SN_HorizontalAdd, OP_HADDPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_HorizontalSub, OP_HSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_EMIT_BINARY },
	{ SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_MAXPD, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPD, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_Sqrt, OP_SQRTPD, SIMD_EMIT_UNARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	/* 64-bit lanes, hence the QWORD getter variant. */
	{ SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_ADDPD, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_ANDPD, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_ORPD, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPD, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_XORPD, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Multiply, OP_MULPD, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPD, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
/*
 * Intrinsics recognized on Mono.Simd.Vector2ul (two unsigned longs).
 * Same layout and sorted-by-name requirement as vector4f_intrinsics.
 * op_RightShift maps to the logical shift (OP_PSHRQ), as is correct
 * for an unsigned element type.
 */
static const SimdIntrinsc vector2ul_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
	{ SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
/*
 * Intrinsics recognized on Mono.Simd.Vector2l (two signed longs).
 * Same layout and sorted-by-name requirement as vector4f_intrinsics.
 * There is no op_RightShift entry: x86 SSE has no 64-bit arithmetic
 * right shift, so only the explicit LogicalRightShift is exposed.
 */
static const SimdIntrinsc vector2l_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE42 },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
	{ SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
/*
 * Intrinsics recognized on Mono.Simd.Vector4ui (four unsigned ints).
 * Same layout and sorted-by-name requirement as vector4f_intrinsics.
 * op_RightShift is logical (OP_PSHRD); the arithmetic shift is still
 * exposed explicitly as ArithmeticRightShift (OP_PSARD).
 */
static const SimdIntrinsc vector4ui_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
	{ SN_ArithmeticRightShift, OP_PSARD, SIMD_EMIT_SHIFT },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_Min, OP_PMIND_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
	{ SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
	{ SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_op_RightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_EMIT_SETTER },
/*
 * Intrinsics recognized on Mono.Simd.Vector4i (four signed ints).
 * Same layout and sorted-by-name requirement as vector4f_intrinsics.
 * op_RightShift is arithmetic (OP_PSARD) for the signed type; the
 * logical shift is exposed explicitly as LogicalRightShift.
 */
static const SimdIntrinsc vector4i_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTD, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
	{ SN_Max, OP_PMAXD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_Min, OP_PMIND, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_PackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
	{ SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_op_RightShift, OP_PSARD, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_EMIT_SETTER },
/*
 * Intrinsics recognized on Mono.Simd.Vector8us (eight unsigned shorts).
 * Same layout and sorted-by-name requirement as vector4f_intrinsics.
 * Saturating add/sub and min/max use the _UN (unsigned) opcode
 * variants, matching the unsigned element type.
 */
static const SimdIntrinsc vector8us_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
	{ SN_ArithmeticRightShift, OP_PSARW, SIMD_EMIT_SHIFT },
	{ SN_Average, OP_PAVGW_UN, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_Min, OP_PMINW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
	{ SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
	{ SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
	{ SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_EMIT_SETTER },
401 static const SimdIntrinsc vector8s_intrinsics[] = {
402 { SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
403 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_EMIT_BINARY },
404 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
405 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_EMIT_BINARY },
406 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
407 { SN_LogicalRightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
408 { SN_Max, OP_PMAXW, SIMD_EMIT_BINARY },
409 { SN_Min, OP_PMINW, SIMD_EMIT_BINARY },
410 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_EMIT_BINARY },
411 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
412 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
413 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
414 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
415 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
416 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
417 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
418 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
419 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
420 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
421 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
422 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
423 { SN_get_V0, 0, SIMD_EMIT_GETTER },
424 { SN_get_V1, 1, SIMD_EMIT_GETTER },
425 { SN_get_V2, 2, SIMD_EMIT_GETTER },
426 { SN_get_V3, 3, SIMD_EMIT_GETTER },
427 { SN_get_V4, 4, SIMD_EMIT_GETTER },
428 { SN_get_V5, 5, SIMD_EMIT_GETTER },
429 { SN_get_V6, 6, SIMD_EMIT_GETTER },
430 { SN_get_V7, 7, SIMD_EMIT_GETTER },
431 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
432 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
433 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
434 { SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
435 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
436 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
437 { SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
438 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
439 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
440 { SN_op_RightShift, OP_PSARW, SIMD_EMIT_SHIFT },
441 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
442 { SN_set_V0, 0, SIMD_EMIT_SETTER },
443 { SN_set_V1, 1, SIMD_EMIT_SETTER },
444 { SN_set_V2, 2, SIMD_EMIT_SETTER },
445 { SN_set_V3, 3, SIMD_EMIT_SETTER },
446 { SN_set_V4, 4, SIMD_EMIT_SETTER },
447 { SN_set_V5, 5, SIMD_EMIT_SETTER },
448 { SN_set_V6, 6, SIMD_EMIT_SETTER },
449 { SN_set_V7, 7, SIMD_EMIT_SETTER },
/*
 * Intrinsics recognized on Mono.Simd.Vector16b (sixteen unsigned bytes).
 * Same layout and sorted-by-name requirement as vector4f_intrinsics —
 * note getters/setters sort as strings, so V10..V15 come before V2.
 * Saturating add/sub and min/max use the _UN (unsigned) variants.
 */
static const SimdIntrinsc vector16b_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
	{ SN_Average, OP_PAVGB_UN, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB_UN, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINB_UN, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
	{ SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_EMIT_SETTER },
/*
 * Intrinsics recognized on Mono.Simd.Vector16sb (sixteen signed bytes).
 * Same layout and sorted-by-name requirement as vector4f_intrinsics.
 * Saturating add/sub use the signed OP_PADDB_SAT/OP_PSUBB_SAT;
 * byte min/max require SSE4.1 for the signed variants.
 */
static const SimdIntrinsc vector16sb_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTB, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_Min, OP_PMINB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_EMIT_SETTER },
/* Bitmask of SIMD_VERSION_* capabilities of the host CPU; filled in
 * once by mono_simd_intrinsics_init. */
static guint32 simd_supported_versions;

/*TODO match using number of parameters as well*/
/*
 * Comparer for looking up an intrinsic by name: @key is the managed
 * method's name (char*), @value a SimdIntrinsc entry whose interned
 * name index is resolved via method_name().  The tables above rely on
 * this strcmp ordering, which is why they must stay sorted.
 */
simd_intrinsic_compare_by_name (const void *key, const void *value)
	return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
	VREG_HAS_XZERO_BB0 = 0x02,	/* first bb defines the vreg with OP_XZERO */
	VREG_HAS_OTHER_OP_BB0 = 0x04,	/* first bb touches the vreg with another op */
	VREG_SINGLE_BB_USE = 0x08,	/* after bb0, vreg is used in exactly one bb */
	VREG_MANY_BB_USE = 0x10,	/* after bb0, vreg is used in several bbs */
/* One-time init: query the backend for the SIMD instruction-set
 * versions the current CPU supports. */
mono_simd_intrinsics_init (void)
	simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
	/*TODO log the supported flags*/
/*
 * Note that @reg is touched by a non-XZERO instruction in the first
 * basic block: clear its "has xzero" flag and mark it as having other
 * ops, which excludes it from the xzero-sinking optimization below.
 * NOTE(review): the TRUE/FALSE return (tail not visible here) appears
 * to mean "the vreg was tracked" — confirm against the full function.
 */
static inline gboolean
apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
	if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
		vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
		vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
		DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
/*
 * Track, for a vreg that is xzero-defined in bb0, how many later basic
 * blocks use it: the first using bb is remembered in @target_bb and the
 * vreg marked SINGLE_BB_USE; a use from a different bb upgrades it to
 * MANY_BB_USE.  Vregs without VREG_HAS_XZERO_BB0, out-of-range regs and
 * repeat uses from the same bb are ignored.
 * NOTE(review): return value (tail not visible here) appears to signal
 * whether the flags were updated — confirm against the full function.
 */
static inline gboolean
apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
	if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)

	if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
		/* Second bb seen for this vreg: demote to many-bb use. */
		vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
		vreg_flags [reg] |= VREG_MANY_BB_USE;
		DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
	} else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
		/* First bb seen for this vreg: remember it. */
		vreg_flags [reg] |= VREG_SINGLE_BB_USE;
		target_bb [reg] = bb;
		DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
This pass recalculates which vars need MONO_INST_INDIRECT.

We cannot do this for non-SIMD vars since code like mono_get_vtable_var
uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
638 mono_simd_simplify_indirection (MonoCompile *cfg)
641 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Optimistically clear INDIRECT on every SIMD var; it is re-added below only
 * where an OP_LDADDR proves the address is actually taken. */
645 for (i = 0; i < cfg->num_varinfo; i++) {
646 MonoInst *var = cfg->varinfo [i];
647 if (var->klass->simd_type) {
648 var->flags &= ~MONO_INST_INDIRECT;
649 max_vreg = MAX (var->dreg, max_vreg);
653 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
654 if (!first_bb && bb->code)
656 for (ins = bb->code; ins; ins = ins->next) {
657 if (ins->opcode == OP_LDADDR) {
658 MonoInst *var = (MonoInst*)ins->inst_p0;
659 if (var->klass->simd_type) {
660 var->flags |= MONO_INST_INDIRECT;
666 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
667 vreg_flags = g_malloc0 (max_vreg + 1);
668 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
/* Candidates for the XZERO-sinking optimization: SIMD vars that are neither
 * address-taken nor volatile. */
670 for (i = 0; i < cfg->num_varinfo; i++) {
671 MonoInst *var = cfg->varinfo [i];
672 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
673 vreg_flags [var->dreg] = VREG_USED;
674 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
678 /*Scan the first basic block looking xzeros not used*/
679 for (ins = first_bb->code; ins; ins = ins->next) {
681 int sregs [MONO_MAX_SRC_REGS];
683 if (ins->opcode == OP_XZERO) {
684 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
685 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
686 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
690 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
692 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
694 num_sregs = mono_inst_get_src_registers (ins, sregs);
695 for (i = 0; i < num_sregs; ++i) {
696 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
701 if (IS_DEBUG_ON (cfg)) {
702 for (i = 0; i < cfg->num_varinfo; i++) {
703 MonoInst *var = cfg->varinfo [i];
704 if (var->klass->simd_type) {
705 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
706 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
707 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
708 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
713 /*TODO stop here if no var is xzero only*/
716 Scan all other bb and check if it has only one other use
717 Ideally this would be done after an extended bb formation pass
719 FIXME This pass could use dominator information to properly
720 place the XZERO on the bb that dominates all uses of the var,
721 but this will have zero effect with the current local reg alloc
723 TODO simplify the use of flags.
726 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
727 for (ins = bb->code; ins; ins = ins->next) {
729 int sregs [MONO_MAX_SRC_REGS];
731 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
733 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
735 num_sregs = mono_inst_get_src_registers (ins, sregs);
736 for (i = 0; i < num_sregs; ++i) {
737 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
738 max_vreg, vreg_flags, target_bb))
744 for (i = 0; i < cfg->num_varinfo; i++) {
745 MonoInst *var = cfg->varinfo [i];
746 if (!var->klass->simd_type)
748 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
749 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
750 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
751 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
753 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
755 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
757 int sregs [MONO_MAX_SRC_REGS];
758 gboolean found = FALSE;
760 num_sregs = mono_inst_get_src_registers (ins, sregs);
761 for (j = 0; j < num_sregs; ++j) {
/* BUGFIX: was `sregs [i]` — `i` is the outer varinfo index, not the sreg
 * index; that both checked the wrong register and could read past the end
 * of `sregs`. The inner loop variable `j` is the correct index. */
762 if (sregs [j] == var->dreg)
765 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
766 if (ins->dreg == var->dreg && !found) {
770 MONO_INST_NEW (cfg, tmp, OP_XZERO);
771 tmp->dreg = var->dreg;
772 tmp->type = STACK_VTYPE;
773 tmp->klass = var->klass;
774 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Finally, drop the now-redundant XZERO from bb0 for single-bb-use vars. */
780 for (ins = first_bb->code; ins; ins = ins->next) {
781 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
790 * This function expects src to be a value.
/* Returns the SIMD vreg that holds `src`, without emitting a load.
 * NOTE(review): the fallthrough cases (XMOVE source reg / STACK_VTYPE dreg)
 * are not visible in this excerpt — confirm against the full file. */
793 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
795 if (src->opcode == OP_XMOVE) {
797 } else if (src->type == STACK_VTYPE) {
/* No SIMD vreg could be inferred: this is a compiler bug, so abort loudly. */
800 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
801 mono_print_ins (src);
802 g_assert_not_reached ();
806 * This function will load the value if needed.
/* Like get_simd_vreg, but when `src` is an address (LDADDR / STACK_PTR /
 * STACK_MP) it emits an OP_LOADX_MEMBASE to materialize the value.
 * `indirect`, when non-NULL, presumably reports whether a load happened —
 * TODO(review) confirm; the assignments to it are outside this excerpt. */
809 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
813 if (src->opcode == OP_XMOVE) {
815 } else if (src->opcode == OP_LDADDR) {
816 int res = ((MonoInst*)src->inst_p0)->dreg;
819 } else if (src->type == STACK_VTYPE) {
821 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* src->dreg holds a pointer: load the 16-byte value it points at. */
826 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
827 ins->klass = cmethod->klass;
828 ins->sreg1 = src->dreg;
829 ins->type = STACK_VTYPE;
830 ins->dreg = alloc_ireg (cfg);
831 MONO_ADD_INS (cfg->cbb, ins);
/* Unknown source kind: compiler bug, abort loudly. */
834 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
835 mono_print_ins (src);
836 g_assert_not_reached ();
/* Lazily creates (and caches on the cfg) a 4-byte stack local used to move
 * raw bits between the integer and FP domains for int<->float conversions. */
840 get_int_to_float_spill_area (MonoCompile *cfg)
842 if (!cfg->iconv_raw_var) {
843 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
844 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
846 return cfg->iconv_raw_var;
849 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily creates (and caches) an 8-byte double stack local used as a spill
 * slot when moving values between XMM and the FP stack. */
851 get_double_spill_area (MonoCompile *cfg)
853 if (!cfg->fconv_to_r8_x_var) {
854 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
855 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
857 return cfg->fconv_to_r8_x_var;
/* Lazily creates (and caches) a 16-byte vector-typed stack local used by the
 * slow .ctor path to assemble element arguments in memory.
 * NOTE(review): the cached var keeps the klass of the FIRST caller; assumes
 * all SIMD vector types share size/alignment — confirm against full file. */
860 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
862 if (!cfg->simd_ctor_var) {
863 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
864 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
866 return cfg->simd_ctor_var;
/* Emits a two-operand SIMD instruction (intrinsic->opcode) over args[0] and
 * args[1], producing a fresh vtype vreg; intrinsic->flags goes in inst_c0. */
870 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
873 int left_vreg, right_vreg;
875 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
876 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
879 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
880 ins->klass = cmethod->klass;
881 ins->sreg1 = left_vreg;
882 ins->sreg2 = right_vreg;
883 ins->type = STACK_VTYPE;
/* (removed a duplicate `ins->klass = cmethod->klass;` — klass is already set above) */
885 ins->dreg = alloc_ireg (cfg);
886 ins->inst_c0 = intrinsic->flags;
887 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a one-operand SIMD instruction (intrinsic->opcode) over args[0],
 * producing a fresh vtype vreg.
 * NOTE(review): the line wiring `vreg` into ins->sreg1 is not visible in
 * this excerpt — confirm against the full file. */
892 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
897 vreg = get_simd_vreg (cfg, cmethod, args [0]);
899 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
900 ins->klass = cmethod->klass;
902 ins->type = STACK_VTYPE;
903 ins->dreg = alloc_ireg (cfg);
904 MONO_ADD_INS (cfg->cbb, ins);
/* Maps an element type to the matching OP_EXTRACT_* opcode used to pull one
 * element out of an XMM register. Asserts on unsupported element types.
 * (The case labels between the returns are missing from this excerpt.) */
909 mono_type_to_extract_op (MonoType *type)
911 switch (type->type) {
913 return OP_EXTRACT_I1;
915 return OP_EXTRACT_U1;
917 return OP_EXTRACT_I2;
919 return OP_EXTRACT_U2;
923 return OP_EXTRACT_I4;
925 g_assert_not_reached ();
928 /*Returns the amount to shift the element index to get the dword it belongs to*/
/* Asserts on unsupported element types; the per-type cases are missing from
 * this excerpt. */
930 mono_type_elements_shift_bits (MonoType *type)
932 switch (type->type) {
944 g_assert_not_reached ();
/* Maps an element type to the matching OP_INSERTX_*_SLOW opcode used to
 * write one element into an XMM register via the slow (memory) path.
 * Asserts on unsupported element types; some case labels are missing from
 * this excerpt. */
948 mono_type_to_slow_insert_op (MonoType *type)
950 switch (type->type) {
953 return OP_INSERTX_U1_SLOW;
959 return OP_INSERTX_I4_SLOW;
962 return OP_INSERTX_I8_SLOW;
964 return OP_INSERTX_R4_SLOW;
966 return OP_INSERTX_R8_SLOW;
968 g_assert_not_reached ();
/* Emits code for an element setter (e.g. set_X). Element size selects the
 * strategy: 2/4/8-byte elements use a single slow-insert op; the remaining
 * (1-byte) path extracts the containing word, patches the byte, and
 * re-inserts it. If args[0] was indirect, the result is stored back. */
972 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
975 MonoMethodSignature *sig = mono_method_signature (cmethod);
980 size = mono_type_size (sig->params [0], &align);
982 if (size == 2 || size == 4 || size == 8) {
983 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
984 ins->klass = cmethod->klass;
985 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
986 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
987 ins->sreg2 = args [1]->dreg;
988 ins->inst_c0 = intrinsic->opcode;
/* R4/R8 inserts bounce through a stack spill slot. */
989 if (sig->params [0]->type == MONO_TYPE_R4)
990 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
991 else if (sig->params [0]->type == MONO_TYPE_R8)
992 ins->backend.spill_var = get_double_spill_area (cfg);
993 MONO_ADD_INS (cfg->cbb, ins);
/* Byte path: extract the 16-bit lane containing the target byte... */
997 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
998 ins->klass = cmethod->klass;
999 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1000 ins->type = STACK_I4;
1001 ins->dreg = vreg = alloc_ireg (cfg);
1002 ins->inst_c0 = intrinsic->opcode / 2; /* byte index -> word index */
1003 MONO_ADD_INS (cfg->cbb, ins);
/* ...then re-insert the patched word. */
1005 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1006 ins->klass = cmethod->klass;
1008 ins->sreg2 = args [1]->dreg;
1010 ins->inst_c0 = intrinsic->opcode;
1011 MONO_ADD_INS (cfg->cbb, ins);
/* Write-back for indirect receivers; dreg is the base address register. */
1015 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1016 ins->klass = cmethod->klass;
1017 ins->dreg = args [0]->dreg;
1019 MONO_ADD_INS (cfg->cbb, ins);
/* Emits code for an element getter (e.g. get_X). intrinsic->opcode packs the
 * element index: the high bits (>> shift_bits) select a PSHUFLED shuffle that
 * brings the element's dword into position; the low bits select the lane for
 * the extract op. R4 results are converted via a raw int->double move. */
1025 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1028 MonoMethodSignature *sig = mono_method_signature (cmethod);
1029 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1031 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
/* Only shuffle when the element is not already in lane 0. */
1033 if (intrinsic->opcode >> shift_bits) {
1034 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1035 ins->klass = cmethod->klass;
1037 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1038 ins->type = STACK_VTYPE;
1039 ins->dreg = vreg = alloc_ireg (cfg);
1040 MONO_ADD_INS (cfg->cbb, ins);
1043 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1044 ins->klass = cmethod->klass;
1046 ins->type = STACK_I4;
1047 ins->dreg = vreg = alloc_ireg (cfg);
1048 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1); /* lane within dword */
1049 MONO_ADD_INS (cfg->cbb, ins);
/* Float getters: reinterpret the raw bits on the FP stack via a spill slot. */
1051 if (sig->ret->type == MONO_TYPE_R4) {
1052 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1053 ins->klass = mono_defaults.single_class;
1055 ins->type = STACK_R8;
1056 ins->dreg = alloc_freg (cfg);
1057 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1058 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a 64-bit element getter: OP_EXTRACT_R8 for double results (spilled
 * through the shared double slot), OP_EXTRACT_I8 for long results.
 * intrinsic->opcode carries the element index in inst_c0. */
1064 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1068 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1070 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1072 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1073 ins->klass = cmethod->klass;
1075 ins->inst_c0 = intrinsic->opcode;
1077 ins->type = STACK_R8;
1078 ins->dreg = alloc_freg (cfg);
1079 ins->backend.spill_var = get_double_spill_area (cfg);
1081 ins->type = STACK_I8;
1082 ins->dreg = alloc_lreg (cfg);
1084 MONO_ADD_INS (cfg->cbb, ins);
/* Emits code for a vector .ctor. The one-argument (broadcast) form expands a
 * scalar with intrinsic->opcode directly into the destination; the N-argument
 * form stores each element to a stack area (or straight into the target) and
 * loads the assembled vector back with OP_LOADX_MEMBASE. When the receiver
 * was an LDADDR of a local, the LDADDR is nullified so the value stays in a
 * vreg. */
1090 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1092 MonoInst *ins = NULL;
1094 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1095 MonoMethodSignature *sig = mono_method_signature (cmethod);
1096 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1097 int arg_size = mono_type_size (sig->params [0], &i);
/* --- broadcast form: new Vector4f (x) --- */
1099 if (sig->param_count == 1) {
1103 dreg = args [0]->inst_i0->dreg;
1104 NULLIFY_INS (args [0]);
1106 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1107 dreg = alloc_ireg (cfg);
1110 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1111 ins->klass = cmethod->klass;
1112 ins->sreg1 = args [1]->dreg;
1113 ins->type = STACK_VTYPE;
1116 MONO_ADD_INS (cfg->cbb, ins);
1117 if (sig->params [0]->type == MONO_TYPE_R4)
1118 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1119 else if (sig->params [0]->type == MONO_TYPE_R8)
1120 ins->backend.spill_var = get_double_spill_area (cfg);
/* Non-local receiver: store the broadcast result through the pointer. */
1123 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1124 ins->dreg = args [0]->dreg;
1126 MONO_ADD_INS (cfg->cbb, ins);
/* --- element-wise form: assemble in the shared ctor spill area --- */
1132 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1133 MONO_ADD_INS (cfg->cbb, ins);
1134 addr_reg = ins->dreg;
1136 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1137 addr_reg = args [0]->dreg;
/* Store elements back-to-front; args[i + 1] is the i-th ctor argument. */
1140 for (i = sig->param_count - 1; i >= 0; --i) {
1141 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1144 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1145 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1146 NULLIFY_INS (args [0]);
1148 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1149 ins->klass = cmethod->klass;
1150 ins->sreg1 = addr_reg;
1151 ins->type = STACK_VTYPE;
1153 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a bit-preserving cast between vector types as a plain OP_XMOVE into
 * a fresh vreg retyped with the target klass.
 * NOTE(review): the line wiring `vreg` into ins->sreg1 is missing from this
 * excerpt — confirm against the full file. */
1159 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1164 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1166 //TODO macroize this
1167 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1168 ins->klass = cmethod->klass;
1169 ins->type = STACK_VTYPE;
1171 ins->dreg = alloc_ireg (cfg);
1172 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a packed shift. A constant shift count becomes an immediate operand
 * (inst_imm); a variable count is first broadcast into an XMM register with
 * OP_ICONV_TO_X and the reg-form opcode (intrinsic->opcode + 1) is used. */
1177 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1180 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1182 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1184 if (args [1]->opcode != OP_ICONST) {
1185 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1186 ins->klass = mono_defaults.int32_class;
1187 ins->sreg1 = args [1]->dreg;
1188 ins->type = STACK_I4;
1189 ins->dreg = vreg2 = alloc_ireg (cfg);
1190 MONO_ADD_INS (cfg->cbb, ins);
1192 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1195 MONO_INST_NEW (cfg, ins, opcode);
1196 ins->klass = cmethod->klass;
1200 if (args [1]->opcode == OP_ICONST) {
1201 ins->inst_imm = args [1]->inst_c0;
1202 NULLIFY_INS (args [1]); /* the constant is folded into the op */
1205 ins->type = STACK_VTYPE;
1206 ins->dreg = alloc_ireg (cfg);
1207 MONO_ADD_INS (cfg->cbb, ins);
/* TRUE for integer packed-compare opcodes (PCMPEQB..PCMPEQQ); relies on the
 * opcodes being declared contiguously in that range. */
1211 static inline gboolean
1212 mono_op_is_packed_compare (int op)
1214 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/* Emits vector (in)equality: a packed compare of both operands, an
 * OP_EXTRACT_MASK of the per-element result bytes, then a scalar compare of
 * the 16-bit mask. Integer compares and SIMD_COMP_EQ test mask == 0xFFFF
 * (all lanes equal); FP not-equal compares test mask != 0 (any lane set). */
1218 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1221 int left_vreg, right_vreg, tmp_vreg;
1223 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1224 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1227 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1228 ins->klass = cmethod->klass;
1229 ins->sreg1 = left_vreg;
1230 ins->sreg2 = right_vreg;
1231 ins->type = STACK_VTYPE;
/* (removed a duplicate `ins->klass = cmethod->klass;` — klass is already set above) */
1233 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1234 ins->inst_c0 = intrinsic->flags;
1235 MONO_ADD_INS (cfg->cbb, ins);
1237 /*FIXME the next ops are SSE specific*/
1238 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1239 ins->klass = cmethod->klass;
1240 ins->sreg1 = tmp_vreg;
1241 ins->type = STACK_I4;
1242 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1243 MONO_ADD_INS (cfg->cbb, ins);
1245 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1246 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
1247 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1248 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1250 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1251 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1253 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a shuffle with a literal control mask (args[1] must be OP_ICONST;
 * non-literal masks are rejected by the caller path). The constant is folded
 * into inst_c0 and the ICONST instruction is nullified. */
1259 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1264 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
1266 if (args [1]->opcode != OP_ICONST) {
1267 /*TODO Shuffle with non literals is not yet supported */
1270 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1271 NULLIFY_INS (args [1]);
1273 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1274 ins->klass = cmethod->klass;
1276 ins->inst_c0 = args [1]->inst_c0;
1277 ins->type = STACK_VTYPE;
1278 ins->dreg = alloc_ireg (cfg);
1279 MONO_ADD_INS (cfg->cbb, ins);
/* Emits an aligned 16-byte load from the address in args[0] into a fresh
 * vtype vreg. The caller guarantees alignment (LoadAligned API contract). */
1284 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1288 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1289 ins->klass = cmethod->klass;
1290 ins->sreg1 = args [0]->dreg;
1291 ins->type = STACK_VTYPE;
1292 ins->dreg = alloc_ireg (cfg);
1293 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a vector store of args[1] through the address in args[0] using
 * intrinsic->opcode (plain or non-temporal/aligned store variants).
 * Note: for STOREX-style membase ops dreg carries the base address. */
1298 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1303 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1305 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1306 ins->klass = cmethod->klass;
1307 ins->dreg = args [0]->dreg;
1309 ins->type = STACK_VTYPE;
1310 MONO_ADD_INS (cfg->cbb, ins);
/* Emits OP_EXTRACT_MASK (PMOVMSKB-style): collects the sign bits of the
 * vector in args[0] into an integer result.
 * NOTE(review): the line wiring `vreg` into ins->sreg1 is missing from this
 * excerpt — confirm against the full file. */
1315 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1320 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1322 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1323 ins->klass = cmethod->klass;
1325 ins->type = STACK_I4;
1326 ins->dreg = alloc_ireg (cfg);
1327 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a prefetch hint for the address in args[0]; intrinsic->flags selects
 * the locality hint (stored in backend.arg_info). Produces no value. */
1333 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1337 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1338 ins->klass = cmethod->klass;
1339 ins->sreg1 = args [0]->dreg;
1340 ins->backend.arg_info = intrinsic->flags;
1341 MONO_ADD_INS (cfg->cbb, ins);
/* Maps a SIMD_VERSION_* enum value to a human-readable name for debug
 * output. (The return-string lines are missing from this excerpt.) */
1346 simd_version_name (guint32 version)
1349 case SIMD_VERSION_SSE1:
1351 case SIMD_VERSION_SSE2:
1353 case SIMD_VERSION_SSE3:
1355 case SIMD_VERSION_SSSE3:
1357 case SIMD_VERSION_SSE41:
1359 case SIMD_VERSION_SSE42:
1361 case SIMD_VERSION_SSE4a:
/* Looks up cmethod->name in the (name-sorted) intrinsic table via bsearch,
 * rejects intrinsics whose required SIMD version isn't in
 * simd_supported_versions, and dispatches to the matching emitter.
 * `size` is the number of table entries. */
1368 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1370 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1372 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1375 if (IS_DEBUG_ON (cfg)) {
1377 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1378 max = fsig->param_count + fsig->hasthis;
1379 for (i = 0; i < max; ++i) {
1380 printf ("param %d: ", i);
1381 mono_print_ins (args [i]);
/* Bail out if the host CPU lacks the instruction set this intrinsic needs. */
1384 if (result->simd_version && !((1 << result->simd_version) & simd_supported_versions)) {
1385 if (IS_DEBUG_ON (cfg))
1386 printf ("function %s::%s/%d requires unsupported SIMD instruction set %s \n", cmethod->klass->name, cmethod->name, fsig->param_count, simd_version_name (result->simd_version)); /* typo fix: was "unsuported" */
1390 switch (result->simd_emit_mode) {
1391 case SIMD_EMIT_BINARY:
1392 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1393 case SIMD_EMIT_UNARY:
1394 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1395 case SIMD_EMIT_SETTER:
1396 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1397 case SIMD_EMIT_GETTER:
1398 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1399 case SIMD_EMIT_GETTER_QWORD:
1400 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1401 case SIMD_EMIT_CTOR:
1402 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1403 case SIMD_EMIT_CAST:
1404 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1405 case SIMD_EMIT_SHUFFLE:
1406 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1407 case SIMD_EMIT_SHIFT:
1408 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1409 case SIMD_EMIT_EQUALITY:
1410 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1411 case SIMD_EMIT_LOAD_ALIGNED:
1412 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1413 case SIMD_EMIT_STORE:
1414 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1415 case SIMD_EMIT_EXTRACT_MASK:
1416 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1417 case SIMD_EMIT_PREFETCH:
1418 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1420 g_assert_not_reached ();
/* Computes the address of array element `index` for a vector-sized access:
 * addr = &arr->vector + index * element_size. When check_bounds is set it
 * bounds-checks both the first and the LAST element touched by a 16-byte
 * access (index + 16/size - 1), so a vector read cannot run off the array.
 * Returns the instruction holding the address (dreg = add_reg, STACK_PTR). */
1424 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1428 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1430 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1431 mult_reg = alloc_preg (cfg);
1432 array_reg = arr->dreg;
1433 index_reg = index->dreg;
1435 #if SIZEOF_VOID_P == 8
1436 /* The array reg is 64 bits but the index reg is only 32 */
1437 index2_reg = alloc_preg (cfg);
1438 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1440 index2_reg = index_reg;
1442 index3_reg = alloc_preg (cfg);
1445 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1446 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1447 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1450 add_reg = alloc_preg (cfg);
1452 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1453 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1454 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
1455 ins->type = STACK_PTR;
1456 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsifies Mono.Simd.ArrayExtensions methods: GetVector[Aligned] loads a
 * vector element address computed by mono_emit_vector_ldelema,
 * SetVector[Aligned] stores one, and IsAligned tests the element address
 * against 16-byte alignment ((addr & 15) == 0). */
1462 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1464 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1466 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1468 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1469 load->klass = cmethod->klass;
1471 load->type = STACK_VTYPE;
1472 load->dreg = alloc_ireg (cfg);
1473 MONO_ADD_INS (cfg->cbb, load);
1477 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1479 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1480 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1482 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1483 store->klass = cmethod->klass;
1485 store->sreg1 = vreg;
1486 MONO_ADD_INS (cfg->cbb, store);
1490 if (!strcmp ("IsAligned", cmethod->name)) {
/* No bounds check needed: only the address is inspected, not dereferenced. */
1492 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1494 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1495 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1496 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1497 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsifies SimdRuntime.get_AccelMode: replaces the call with a constant
 * holding the JIT-detected simd_supported_versions bitmask.
 * (TODO note in the file header: this does not work under AOT.) */
1505 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1507 if (!strcmp ("get_AccelMode", cmethod->name)) {
1509 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1516 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1518 const char *class_name;
1520 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1523 class_name = cmethod->klass->name;
1524 if (!strcmp ("SimdRuntime", class_name))
1525 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1527 if (!strcmp ("ArrayExtensions", class_name))
1528 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1530 if (!strcmp ("VectorOperations", class_name)) {
1531 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1533 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1534 } else if (!cmethod->klass->simd_type)
1537 cfg->uses_simd_intrinsics = 1;
1538 if (!strcmp ("Vector2d", class_name))
1539 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1540 if (!strcmp ("Vector4f", class_name))
1541 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1542 if (!strcmp ("Vector2ul", class_name))
1543 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1544 if (!strcmp ("Vector2l", class_name))
1545 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1546 if (!strcmp ("Vector4ui", class_name))
1547 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1548 if (!strcmp ("Vector4i", class_name))
1549 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1550 if (!strcmp ("Vector8us", class_name))
1551 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1552 if (!strcmp ("Vector8s", class_name))
1553 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1554 if (!strcmp ("Vector16b", class_name))
1555 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1556 if (!strcmp ("Vector16sb", class_name))
1557 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));