2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
17 General notes on SIMD intrinsics
19 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
23 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
33 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
38 TODO extend bounds checking code to support for range checking.
40 General notes for SIMD intrinsics.
42 -Bad extractor and constructor performance
43 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
44 It will be loaded in the FP stack just to be pushed on the call stack.
46 A similar thing happens with Vector4f constructor that require float vars to be
48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.
51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and promote further optimizations.
58 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
62 #ifdef MONO_ARCH_SIMD_INTRINSICS
64 //#define IS_DEBUG_ON(cfg) (0)
66 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
67 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
73 SIMD_EMIT_GETTER_QWORD,
79 SIMD_EMIT_LOAD_ALIGNED,
81 SIMD_EMIT_EXTRACT_MASK,
85 #ifdef HAVE_ARRAY_ELEM_INIT
86 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
87 #define MSGSTRFIELD1(line) str##line
88 static const struct msgstr_t {
89 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
90 #include "simd-methods.h"
93 #define SIMD_METHOD(str,name) str,
94 #include "simd-methods.h"
99 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
100 #include "simd-methods.h"
102 #define method_name(idx) ((const char*)&method_names + (idx))
105 #define SIMD_METHOD(str,name) str,
106 static const char * const method_names [] = {
107 #include "simd-methods.h"
111 #define SIMD_METHOD(str,name) name,
113 #include "simd-methods.h"
117 #define method_name(idx) (method_names [(idx)])
124 guint8 simd_emit_mode : 4;
125 guint8 simd_version : 4;
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, OP_EXPAND_R4, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
132 { SN_AndNot, OP_ANDNPS, SIMD_EMIT_BINARY },
133 { SN_CompareEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
141 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
142 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
143 { SN_HorizontalAdd, OP_HADDPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
144 { SN_HorizontalSub, OP_HSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
145 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_EMIT_BINARY },
146 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_EMIT_BINARY },
147 { SN_InvSqrt, OP_RSQRTPS, SIMD_EMIT_UNARY },
148 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
149 { SN_Max, OP_MAXPS, SIMD_EMIT_BINARY },
150 { SN_Min, OP_MINPS, SIMD_EMIT_BINARY },
151 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
152 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
153 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
154 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
155 { SN_Reciprocal, OP_RCPPS, SIMD_EMIT_UNARY },
156 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
157 { SN_Sqrt, OP_SQRTPS, SIMD_EMIT_UNARY },
158 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
159 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_EMIT_STORE },
160 { SN_get_W, 3, SIMD_EMIT_GETTER },
161 { SN_get_X, 0, SIMD_EMIT_GETTER },
162 { SN_get_Y, 1, SIMD_EMIT_GETTER },
163 { SN_get_Z, 2, SIMD_EMIT_GETTER },
164 { SN_op_Addition, OP_ADDPS, SIMD_EMIT_BINARY },
165 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_EMIT_BINARY },
166 { SN_op_BitwiseOr, OP_ORPS, SIMD_EMIT_BINARY },
167 { SN_op_Division, OP_DIVPS, SIMD_EMIT_BINARY },
168 { SN_op_Equality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
169 { SN_op_ExclusiveOr, OP_XORPS, SIMD_EMIT_BINARY },
170 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
171 { SN_op_Inequality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
172 { SN_op_Multiply, OP_MULPS, SIMD_EMIT_BINARY },
173 { SN_op_Subtraction, OP_SUBPS, SIMD_EMIT_BINARY },
174 { SN_set_W, 3, SIMD_EMIT_SETTER },
175 { SN_set_X, 0, SIMD_EMIT_SETTER },
176 { SN_set_Y, 1, SIMD_EMIT_SETTER },
177 { SN_set_Z, 2, SIMD_EMIT_SETTER },
180 static const SimdIntrinsc vector2d_intrinsics[] = {
181 { SN_ctor, OP_EXPAND_R8, SIMD_EMIT_CTOR },
182 { SN_AddSub, OP_ADDSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
183 { SN_AndNot, OP_ANDNPD, SIMD_EMIT_BINARY },
184 { SN_CompareEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
185 { SN_CompareLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
186 { SN_CompareLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
187 { SN_CompareNotEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
188 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
189 { SN_CompareNotLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
190 { SN_CompareOrdered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
191 { SN_CompareUnordered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
192 { SN_Duplicate, OP_DUPPD, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
193 { SN_HorizontalAdd, OP_HADDPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
194 { SN_HorizontalSub, OP_HSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
195 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_EMIT_BINARY },
196 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_EMIT_BINARY },
197 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
198 { SN_Max, OP_MAXPD, SIMD_EMIT_BINARY },
199 { SN_Min, OP_MINPD, SIMD_EMIT_BINARY },
200 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
201 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
202 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
203 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
204 { SN_Sqrt, OP_SQRTPD, SIMD_EMIT_UNARY },
205 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
206 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
207 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
208 { SN_op_Addition, OP_ADDPD, SIMD_EMIT_BINARY },
209 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_EMIT_BINARY },
210 { SN_op_BitwiseOr, OP_ORPD, SIMD_EMIT_BINARY },
211 { SN_op_Division, OP_DIVPD, SIMD_EMIT_BINARY },
212 { SN_op_ExclusiveOr, OP_XORPD, SIMD_EMIT_BINARY },
213 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
214 { SN_op_Multiply, OP_MULPD, SIMD_EMIT_BINARY },
215 { SN_op_Subtraction, OP_SUBPD, SIMD_EMIT_BINARY },
216 { SN_set_X, 0, SIMD_EMIT_SETTER },
217 { SN_set_Y, 1, SIMD_EMIT_SETTER },
220 static const SimdIntrinsc vector2ul_intrinsics[] = {
221 { SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
222 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
223 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
224 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
225 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
226 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
227 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
228 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
229 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
230 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
231 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
232 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
233 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
234 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
235 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
236 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
237 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
238 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
239 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
240 { SN_op_RightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
241 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
242 { SN_set_X, 0, SIMD_EMIT_SETTER },
243 { SN_set_Y, 1, SIMD_EMIT_SETTER },
246 static const SimdIntrinsc vector2l_intrinsics[] = {
247 { SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
248 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
249 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE42 },
250 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
251 { SN_LogicalRightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
252 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
253 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
254 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
255 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
256 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
257 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
258 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
259 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
260 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
261 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
262 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
263 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
264 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
265 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
266 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
267 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
268 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
269 { SN_set_X, 0, SIMD_EMIT_SETTER },
270 { SN_set_Y, 1, SIMD_EMIT_SETTER },
273 static const SimdIntrinsc vector4ui_intrinsics[] = {
274 { SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
275 { SN_ArithmeticRightShift, OP_PSARD, SIMD_EMIT_SHIFT },
276 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
277 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
278 { SN_Max, OP_PMAXD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
279 { SN_Min, OP_PMIND_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
280 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
281 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
282 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
283 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
284 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
285 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
286 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
287 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
288 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
289 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
290 { SN_get_W, 3, SIMD_EMIT_GETTER },
291 { SN_get_X, 0, SIMD_EMIT_GETTER },
292 { SN_get_Y, 1, SIMD_EMIT_GETTER },
293 { SN_get_Z, 2, SIMD_EMIT_GETTER },
294 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
295 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
296 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
297 { SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
298 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
299 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
300 { SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
301 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
302 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
303 { SN_op_RightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
304 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
305 { SN_set_W, 3, SIMD_EMIT_SETTER },
306 { SN_set_X, 0, SIMD_EMIT_SETTER },
307 { SN_set_Y, 1, SIMD_EMIT_SETTER },
308 { SN_set_Z, 2, SIMD_EMIT_SETTER },
311 static const SimdIntrinsc vector4i_intrinsics[] = {
312 { SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
313 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
314 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_EMIT_BINARY },
315 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
316 { SN_LogicalRightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
317 { SN_Max, OP_PMAXD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
318 { SN_Min, OP_PMIND, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
319 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
320 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
321 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
322 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
323 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
324 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
325 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
326 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
327 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
328 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
329 { SN_get_W, 3, SIMD_EMIT_GETTER },
330 { SN_get_X, 0, SIMD_EMIT_GETTER },
331 { SN_get_Y, 1, SIMD_EMIT_GETTER },
332 { SN_get_Z, 2, SIMD_EMIT_GETTER },
333 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
334 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
335 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
336 { SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
337 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
338 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
339 { SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
340 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
341 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
342 { SN_op_RightShift, OP_PSARD, SIMD_EMIT_SHIFT },
343 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
344 { SN_set_W, 3, SIMD_EMIT_SETTER },
345 { SN_set_X, 0, SIMD_EMIT_SETTER },
346 { SN_set_Y, 1, SIMD_EMIT_SETTER },
347 { SN_set_Z, 2, SIMD_EMIT_SETTER },
350 static const SimdIntrinsc vector8us_intrinsics[] = {
351 { SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
352 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
353 { SN_ArithmeticRightShift, OP_PSARW, SIMD_EMIT_SHIFT },
354 { SN_Average, OP_PAVGW_UN, SIMD_EMIT_BINARY },
355 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
356 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
357 { SN_Max, OP_PMAXW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
358 { SN_Min, OP_PMINW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
359 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_EMIT_BINARY },
360 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
361 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
362 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
363 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
364 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
365 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
366 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
367 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
368 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
369 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
370 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
371 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
372 { SN_get_V0, 0, SIMD_EMIT_GETTER },
373 { SN_get_V1, 1, SIMD_EMIT_GETTER },
374 { SN_get_V2, 2, SIMD_EMIT_GETTER },
375 { SN_get_V3, 3, SIMD_EMIT_GETTER },
376 { SN_get_V4, 4, SIMD_EMIT_GETTER },
377 { SN_get_V5, 5, SIMD_EMIT_GETTER },
378 { SN_get_V6, 6, SIMD_EMIT_GETTER },
379 { SN_get_V7, 7, SIMD_EMIT_GETTER },
380 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
381 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
382 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
383 { SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
384 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
385 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
386 { SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
387 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
388 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
389 { SN_op_RightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
390 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
391 { SN_set_V0, 0, SIMD_EMIT_SETTER },
392 { SN_set_V1, 1, SIMD_EMIT_SETTER },
393 { SN_set_V2, 2, SIMD_EMIT_SETTER },
394 { SN_set_V3, 3, SIMD_EMIT_SETTER },
395 { SN_set_V4, 4, SIMD_EMIT_SETTER },
396 { SN_set_V5, 5, SIMD_EMIT_SETTER },
397 { SN_set_V6, 6, SIMD_EMIT_SETTER },
398 { SN_set_V7, 7, SIMD_EMIT_SETTER },
401 static const SimdIntrinsc vector8s_intrinsics[] = {
402 { SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
403 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_EMIT_BINARY },
404 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
405 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_EMIT_BINARY },
406 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
407 { SN_LogicalRightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
408 { SN_Max, OP_PMAXW, SIMD_EMIT_BINARY },
409 { SN_Min, OP_PMINW, SIMD_EMIT_BINARY },
410 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_EMIT_BINARY },
411 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
412 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
413 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
414 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
415 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
416 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
417 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
418 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
419 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
420 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
421 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
422 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
423 { SN_get_V0, 0, SIMD_EMIT_GETTER },
424 { SN_get_V1, 1, SIMD_EMIT_GETTER },
425 { SN_get_V2, 2, SIMD_EMIT_GETTER },
426 { SN_get_V3, 3, SIMD_EMIT_GETTER },
427 { SN_get_V4, 4, SIMD_EMIT_GETTER },
428 { SN_get_V5, 5, SIMD_EMIT_GETTER },
429 { SN_get_V6, 6, SIMD_EMIT_GETTER },
430 { SN_get_V7, 7, SIMD_EMIT_GETTER },
431 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
432 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
433 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
434 { SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
435 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
436 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
437 { SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
438 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
439 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
440 { SN_op_RightShift, OP_PSARW, SIMD_EMIT_SHIFT },
441 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
442 { SN_set_V0, 0, SIMD_EMIT_SETTER },
443 { SN_set_V1, 1, SIMD_EMIT_SETTER },
444 { SN_set_V2, 2, SIMD_EMIT_SETTER },
445 { SN_set_V3, 3, SIMD_EMIT_SETTER },
446 { SN_set_V4, 4, SIMD_EMIT_SETTER },
447 { SN_set_V5, 5, SIMD_EMIT_SETTER },
448 { SN_set_V6, 6, SIMD_EMIT_SETTER },
449 { SN_set_V7, 7, SIMD_EMIT_SETTER },
452 static const SimdIntrinsc vector16b_intrinsics[] = {
453 { SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
454 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
455 { SN_Average, OP_PAVGB_UN, SIMD_EMIT_BINARY },
456 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
457 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
458 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
459 { SN_Max, OP_PMAXB_UN, SIMD_EMIT_BINARY },
460 { SN_Min, OP_PMINB_UN, SIMD_EMIT_BINARY },
461 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
462 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
463 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
464 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
465 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
466 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
467 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_EMIT_BINARY },
468 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
469 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
470 { SN_get_V0, 0, SIMD_EMIT_GETTER },
471 { SN_get_V1, 1, SIMD_EMIT_GETTER },
472 { SN_get_V10, 10, SIMD_EMIT_GETTER },
473 { SN_get_V11, 11, SIMD_EMIT_GETTER },
474 { SN_get_V12, 12, SIMD_EMIT_GETTER },
475 { SN_get_V13, 13, SIMD_EMIT_GETTER },
476 { SN_get_V14, 14, SIMD_EMIT_GETTER },
477 { SN_get_V15, 15, SIMD_EMIT_GETTER },
478 { SN_get_V2, 2, SIMD_EMIT_GETTER },
479 { SN_get_V3, 3, SIMD_EMIT_GETTER },
480 { SN_get_V4, 4, SIMD_EMIT_GETTER },
481 { SN_get_V5, 5, SIMD_EMIT_GETTER },
482 { SN_get_V6, 6, SIMD_EMIT_GETTER },
483 { SN_get_V7, 7, SIMD_EMIT_GETTER },
484 { SN_get_V8, 8, SIMD_EMIT_GETTER },
485 { SN_get_V9, 9, SIMD_EMIT_GETTER },
486 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
487 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
488 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
489 { SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
490 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
491 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
492 { SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
493 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
494 { SN_set_V0, 0, SIMD_EMIT_SETTER },
495 { SN_set_V1, 1, SIMD_EMIT_SETTER },
496 { SN_set_V10, 10, SIMD_EMIT_SETTER },
497 { SN_set_V11, 11, SIMD_EMIT_SETTER },
498 { SN_set_V12, 12, SIMD_EMIT_SETTER },
499 { SN_set_V13, 13, SIMD_EMIT_SETTER },
500 { SN_set_V14, 14, SIMD_EMIT_SETTER },
501 { SN_set_V15, 15, SIMD_EMIT_SETTER },
502 { SN_set_V2, 2, SIMD_EMIT_SETTER },
503 { SN_set_V3, 3, SIMD_EMIT_SETTER },
504 { SN_set_V4, 4, SIMD_EMIT_SETTER },
505 { SN_set_V5, 5, SIMD_EMIT_SETTER },
506 { SN_set_V6, 6, SIMD_EMIT_SETTER },
507 { SN_set_V7, 7, SIMD_EMIT_SETTER },
508 { SN_set_V8, 8, SIMD_EMIT_SETTER },
509 { SN_set_V9, 9, SIMD_EMIT_SETTER },
516 static const SimdIntrinsc vector16sb_intrinsics[] = {
517 { SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
518 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_EMIT_BINARY },
519 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
520 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_EMIT_BINARY },
521 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
522 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
523 { SN_Max, OP_PMAXB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
524 { SN_Min, OP_PMINB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
525 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
526 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
527 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
528 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
529 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
530 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_EMIT_BINARY },
531 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
532 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
533 { SN_get_V0, 0, SIMD_EMIT_GETTER },
534 { SN_get_V1, 1, SIMD_EMIT_GETTER },
535 { SN_get_V10, 10, SIMD_EMIT_GETTER },
536 { SN_get_V11, 11, SIMD_EMIT_GETTER },
537 { SN_get_V12, 12, SIMD_EMIT_GETTER },
538 { SN_get_V13, 13, SIMD_EMIT_GETTER },
539 { SN_get_V14, 14, SIMD_EMIT_GETTER },
540 { SN_get_V15, 15, SIMD_EMIT_GETTER },
541 { SN_get_V2, 2, SIMD_EMIT_GETTER },
542 { SN_get_V3, 3, SIMD_EMIT_GETTER },
543 { SN_get_V4, 4, SIMD_EMIT_GETTER },
544 { SN_get_V5, 5, SIMD_EMIT_GETTER },
545 { SN_get_V6, 6, SIMD_EMIT_GETTER },
546 { SN_get_V7, 7, SIMD_EMIT_GETTER },
547 { SN_get_V8, 8, SIMD_EMIT_GETTER },
548 { SN_get_V9, 9, SIMD_EMIT_GETTER },
549 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
550 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
551 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
552 { SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
553 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
554 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
555 { SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
556 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
557 { SN_set_V0, 0, SIMD_EMIT_SETTER },
558 { SN_set_V1, 1, SIMD_EMIT_SETTER },
559 { SN_set_V10, 10, SIMD_EMIT_SETTER },
560 { SN_set_V11, 11, SIMD_EMIT_SETTER },
561 { SN_set_V12, 12, SIMD_EMIT_SETTER },
562 { SN_set_V13, 13, SIMD_EMIT_SETTER },
563 { SN_set_V14, 14, SIMD_EMIT_SETTER },
564 { SN_set_V15, 15, SIMD_EMIT_SETTER },
565 { SN_set_V2, 2, SIMD_EMIT_SETTER },
566 { SN_set_V3, 3, SIMD_EMIT_SETTER },
567 { SN_set_V4, 4, SIMD_EMIT_SETTER },
568 { SN_set_V5, 5, SIMD_EMIT_SETTER },
569 { SN_set_V6, 6, SIMD_EMIT_SETTER },
570 { SN_set_V7, 7, SIMD_EMIT_SETTER },
571 { SN_set_V8, 8, SIMD_EMIT_SETTER },
572 { SN_set_V9, 9, SIMD_EMIT_SETTER },
575 static guint32 simd_supported_versions;
577 /*TODO match using number of parameters as well*/
579 simd_intrinsic_compare_by_name (const void *key, const void *value)
581 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
586 VREG_HAS_XZERO_BB0 = 0x02,
587 VREG_HAS_OTHER_OP_BB0 = 0x04,
588 VREG_SINGLE_BB_USE = 0x08,
589 VREG_MANY_BB_USE = 0x10,
593 mono_simd_intrinsics_init (void)
595 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
596 /*TODO log the supported flags*/
599 static inline gboolean
600 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
602 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
603 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
604 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
605 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
611 static inline gboolean
612 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
614 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
617 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
618 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
619 vreg_flags [reg] |= VREG_MANY_BB_USE;
620 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
622 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
623 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
624 target_bb [reg] = bb;
625 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
632 This pass recalculates which vars need MONO_INST_INDIRECT.
634 We cannot do this for non SIMD vars since code like mono_get_vtable_var
635 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
638 mono_simd_simplify_indirection (MonoCompile *cfg)
641 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
645 for (i = 0; i < cfg->num_varinfo; i++) {
646 MonoInst *var = cfg->varinfo [i];
647 if (var->klass->simd_type) {
648 var->flags &= ~MONO_INST_INDIRECT;
649 max_vreg = MAX (var->dreg, max_vreg);
653 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
654 if (!first_bb && bb->code)
656 for (ins = bb->code; ins; ins = ins->next) {
657 if (ins->opcode == OP_LDADDR) {
658 MonoInst *var = (MonoInst*)ins->inst_p0;
659 if (var->klass->simd_type) {
660 var->flags |= MONO_INST_INDIRECT;
666 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
667 vreg_flags = g_malloc0 (max_vreg + 1);
668 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
670 for (i = 0; i < cfg->num_varinfo; i++) {
671 MonoInst *var = cfg->varinfo [i];
672 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
673 vreg_flags [var->dreg] = VREG_USED;
674 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
678 /*Scan the first basic block looking for unused xzeros*/
679 for (ins = first_bb->code; ins; ins = ins->next) {
681 int sregs [MONO_MAX_SRC_REGS];
683 if (ins->opcode == OP_XZERO) {
684 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
685 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
686 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
690 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
692 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
694 num_sregs = mono_inst_get_src_registers (ins, sregs);
695 for (i = 0; i < num_sregs; ++i) {
696 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
701 if (IS_DEBUG_ON (cfg)) {
702 for (i = 0; i < cfg->num_varinfo; i++) {
703 MonoInst *var = cfg->varinfo [i];
704 if (var->klass->simd_type) {
705 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
706 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
707 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
708 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
713 /*TODO stop here if no var is xzero only*/
716 Scan all other bb and check if it has only one other use
717 Ideally this would be done after an extended bb formation pass
719 FIXME This pass could use dominator information to properly
720 place the XZERO on the bb that dominates all uses of the var,
721 but this will have zero effect with the current local reg alloc
723 TODO simply the use of flags.
726 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
727 for (ins = bb->code; ins; ins = ins->next) {
729 int sregs [MONO_MAX_SRC_REGS];
731 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
733 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
735 num_sregs = mono_inst_get_src_registers (ins, sregs);
736 for (i = 0; i < num_sregs; ++i) {
737 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
738 max_vreg, vreg_flags, target_bb))
744 for (i = 0; i < cfg->num_varinfo; i++) {
745 MonoInst *var = cfg->varinfo [i];
746 if (!var->klass->simd_type)
748 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
749 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
750 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
751 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
753 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
755 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
757 int sregs [MONO_MAX_SRC_REGS];
758 gboolean found = FALSE;
760 num_sregs = mono_inst_get_src_registers (ins, sregs);
761 for (j = 0; j < num_sregs; ++j) {
762 if (sregs [i] == var->dreg)
765 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
766 if (ins->dreg == var->dreg && !found) {
770 MONO_INST_NEW (cfg, tmp, OP_XZERO);
771 tmp->dreg = var->dreg;
772 tmp->type = STACK_VTYPE;
773 tmp->klass = var->klass;
774 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
780 for (ins = first_bb->code; ins; ins = ins->next) {
781 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
/*
 * get_simd_vreg:
 *
 *   Return the vreg holding the SIMD value of SRC.
 * This function expects that src is already a value (an OP_XMOVE or a
 * STACK_VTYPE instruction); it warns and asserts if the source vreg
 * cannot be inferred.
 */
793 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
795 if (src->opcode == OP_XMOVE) {
797 } else if (src->type == STACK_VTYPE) {
800 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
801 mono_print_ins (src);
802 g_assert_not_reached ();
/*
 * load_simd_vreg:
 *
 *   Return a vreg holding the SIMD value of SRC; this function will load
 * the value if needed (emits an OP_LOADX_MEMBASE when SRC is a pointer,
 * STACK_PTR/STACK_MP).  *INDIRECT, when non-NULL, presumably is set when
 * the value came through an address -- TODO confirm against the full source.
 */
809 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
813 if (src->opcode == OP_XMOVE) {
815 } else if (src->opcode == OP_LDADDR) {
816 int res = ((MonoInst*)src->inst_p0)->dreg;
819 } else if (src->type == STACK_VTYPE) {
821 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* Pointer source: emit an explicit vector load. */
826 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
827 ins->klass = cmethod->klass;
828 ins->sreg1 = src->dreg;
829 ins->type = STACK_VTYPE;
830 ins->dreg = alloc_ireg (cfg);
831 MONO_ADD_INS (cfg->cbb, ins);
834 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
835 mono_print_ins (src);
836 g_assert_not_reached ();
/* Lazily create (and cache on the cfg) a volatile int32 local used as a
   spill slot for raw int<->float bit conversions (OP_ICONV_TO_R8_RAW and
   the R4 insert path). */
840 get_int_to_float_spill_area (MonoCompile *cfg)
842 if (!cfg->iconv_raw_var) {
843 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
844 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
846 return cfg->iconv_raw_var;
849 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily create (and cache on the cfg) a volatile double local used as a
   spill slot for R8 element extraction/insertion. */
851 get_double_spill_area (MonoCompile *cfg)
853 if (!cfg->fconv_to_r8_x_var) {
854 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
855 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
857 return cfg->fconv_to_r8_x_var;
/* Lazily create a volatile local of the vector type, used by the .ctor
   sequence as a stack staging area for the per-element stores.
   NOTE(review): the var is cached on first use, so later calls with a
   different AVECTOR_KLASS reuse the first klass's var -- presumably fine
   since all vector types are 16 bytes, but confirm. */
860 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
862 if (!cfg->simd_ctor_var) {
863 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
864 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
866 return cfg->simd_ctor_var;
/* Emit a two-operand SIMD op (intrinsic->opcode) over args [0]/args [1]. */
870 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
873 int left_vreg, right_vreg;
875 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
876 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
879 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
880 ins->klass = cmethod->klass;
881 ins->sreg1 = left_vreg;
882 ins->sreg2 = right_vreg;
883 ins->type = STACK_VTYPE;
884 ins->klass = cmethod->klass; /* NOTE(review): redundant -- klass already set above */
885 ins->dreg = alloc_ireg (cfg);
886 ins->inst_c0 = intrinsic->flags; /* extra per-intrinsic selector (e.g. compare kind) */
887 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a one-operand SIMD op (intrinsic->opcode) over args [0]. */
892 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
897 vreg = get_simd_vreg (cfg, cmethod, args [0]);
899 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
900 ins->klass = cmethod->klass;
902 ins->type = STACK_VTYPE;
903 ins->dreg = alloc_ireg (cfg);
904 MONO_ADD_INS (cfg->cbb, ins);
/* Map an element type to the matching OP_EXTRACT_* opcode used to read one
   element out of an xmm register. */
909 mono_type_to_extract_op (MonoType *type)
911 switch (type->type) {
913 return OP_EXTRACT_I1;
915 return OP_EXTRACT_U1;
917 return OP_EXTRACT_I2;
919 return OP_EXTRACT_U2;
923 return OP_EXTRACT_I4;
925 g_assert_not_reached ();
928 /*Returns the amount to shift the element index to get the dword it belongs to*/
/* Used by the getter path to split intrinsic->opcode into a shuffle selector
   (high bits) and an in-dword element index (low bits). */
930 mono_type_elements_shift_bits (MonoType *type)
932 switch (type->type) {
944 g_assert_not_reached ();
/* Map an element type to the matching OP_INSERTX_*_SLOW opcode used by the
   setter path below. */
948 mono_type_to_slow_insert_op (MonoType *type)
950 switch (type->type) {
953 return OP_INSERTX_U1_SLOW;
959 return OP_INSERTX_I4_SLOW;
962 return OP_INSERTX_I8_SLOW;
964 return OP_INSERTX_R4_SLOW;
966 return OP_INSERTX_R8_SLOW;
968 g_assert_not_reached ();
/* Emit a Vector "set_X" style property setter: insert the scalar args [1]
   into the vector args [0] at the element index encoded in
   intrinsic->opcode.  R4/R8 inserts go through a memory spill slot. */
972 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
975 MonoMethodSignature *sig = mono_method_signature (cmethod);
980 size = mono_type_size (sig->params [0], &align);
982 if (size == 2 || size == 4 || size == 8) {
983 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
984 ins->klass = cmethod->klass;
985 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
986 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
987 ins->sreg2 = args [1]->dreg;
988 ins->inst_c0 = intrinsic->opcode; /* element index */
989 if (sig->params [0]->type == MONO_TYPE_R4)
990 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
991 else if (sig->params [0]->type == MONO_TYPE_R8)
992 ins->backend.spill_var = get_double_spill_area (cfg);
993 MONO_ADD_INS (cfg->cbb, ins);
/* Byte-sized elements: extract the 16-bit word containing the byte, then
   insert the patched byte back -- presumably because there is no direct
   byte insert available; confirm against the backend. */
997 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
998 ins->klass = cmethod->klass;
999 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1000 ins->type = STACK_I4;
1001 ins->dreg = vreg = alloc_ireg (cfg);
1002 ins->inst_c0 = intrinsic->opcode / 2; /* index of the word containing the byte */
1003 MONO_ADD_INS (cfg->cbb, ins);
1005 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1006 ins->klass = cmethod->klass;
1008 ins->sreg2 = args [1]->dreg;
1010 ins->inst_c0 = intrinsic->opcode;
1011 MONO_ADD_INS (cfg->cbb, ins);
/* Write the updated vector back through the this pointer -- presumably only
   taken on the indirect path (guard line not visible here); confirm. */
1015 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1016 ins->klass = cmethod->klass;
1017 ins->dreg = args [0]->dreg;
1019 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a Vector "get_X" style property getter: optionally shuffle the wanted
   element into the low slot (OP_PSHUFLED), extract it as an int, and for R4
   results convert the raw bits back to float through a spill slot. */
1025 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1028 MonoMethodSignature *sig = mono_method_signature (cmethod);
1029 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1031 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
/* intrinsic->opcode packs a shuffle selector (high bits) and an in-dword
   element index (low shift_bits bits).  LLVM takes the whole value. */
1033 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1034 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1035 ins->klass = cmethod->klass;
1037 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1038 ins->type = STACK_VTYPE;
1039 ins->dreg = vreg = alloc_ireg (cfg);
1040 MONO_ADD_INS (cfg->cbb, ins);
1043 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1044 ins->klass = cmethod->klass;
1046 ins->type = STACK_I4;
1047 ins->dreg = vreg = alloc_ireg (cfg);
1048 if (cfg->compile_llvm)
1049 ins->inst_c0 = intrinsic->opcode;
1051 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1052 MONO_ADD_INS (cfg->cbb, ins);
/* Float results: reinterpret the extracted int bits as a float by going
   through the shared int/float spill slot. */
1054 if (sig->ret->type == MONO_TYPE_R4) {
1055 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1056 ins->klass = mono_defaults.single_class;
1058 ins->type = STACK_R8;
1059 ins->dreg = alloc_freg (cfg);
1060 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1061 MONO_ADD_INS (cfg->cbb, ins);
/* Getter for 64-bit element types: extract as R8 or I8 depending on the
   method's return type. */
1067 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1071 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1073 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1075 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1076 ins->klass = cmethod->klass;
1078 ins->inst_c0 = intrinsic->opcode; /* element index */
1080 ins->type = STACK_R8;
1081 ins->dreg = alloc_freg (cfg);
1082 ins->backend.spill_var = get_double_spill_area (cfg); /* R8 extraction spills through memory */
1084 ins->type = STACK_I8;
1085 ins->dreg = alloc_lreg (cfg);
1087 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a Vector .ctor call.  Single-argument ctors use intrinsic->opcode
   (presumably an expand/broadcast op -- confirm); otherwise every argument
   is stored to memory element by element and the vector is loaded back.
   When args [0] is an OP_LDADDR of a local, the LDADDR is eliminated and
   the result is written straight into the var. */
1093 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1095 MonoInst *ins = NULL;
1097 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1098 MonoMethodSignature *sig = mono_method_signature (cmethod);
1099 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1100 int arg_size = mono_type_size (sig->params [0], &i); /* 'i' receives alignment here */
/* Fast path: one scalar argument expanded into the whole vector. */
1102 if (sig->param_count == 1) {
1106 dreg = args [0]->inst_i0->dreg;
1107 NULLIFY_INS (args [0]); /* kill the LDADDR; we write the var directly */
1109 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1110 dreg = alloc_ireg (cfg);
1113 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1114 ins->klass = cmethod->klass;
1115 ins->sreg1 = args [1]->dreg;
1116 ins->type = STACK_VTYPE;
1119 MONO_ADD_INS (cfg->cbb, ins);
1120 if (sig->params [0]->type == MONO_TYPE_R4)
1121 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1122 else if (sig->params [0]->type == MONO_TYPE_R8)
1123 ins->backend.spill_var = get_double_spill_area (cfg);
/* Not initing a local: store the expanded vector through the this pointer. */
1126 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1127 ins->dreg = args [0]->dreg;
1129 MONO_ADD_INS (cfg->cbb, ins);
/* General path: spill all elements to a staging area (or directly to the
   object), then load the whole vector back. */
1135 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1136 MONO_ADD_INS (cfg->cbb, ins);
1137 addr_reg = ins->dreg;
1139 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1140 addr_reg = args [0]->dreg;
/* Store elements last-to-first; args [i + 1] skips the this pointer. */
1143 for (i = sig->param_count - 1; i >= 0; --i) {
1144 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1147 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1148 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1149 NULLIFY_INS (args [0]);
1151 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1152 ins->klass = cmethod->klass;
1153 ins->sreg1 = addr_reg;
1154 ins->type = STACK_VTYPE;
1156 MONO_ADD_INS (cfg->cbb, ins);
/* Reinterpret-cast between vector types: just an OP_XMOVE with the target
   klass; no data conversion is emitted. */
1162 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1167 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1169 //TODO macroize this
1170 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1171 ins->klass = cmethod->klass;
1172 ins->type = STACK_VTYPE;
1174 ins->dreg = alloc_ireg (cfg);
1175 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a vector shift.  A constant shift count uses the immediate form of
   intrinsic->opcode; a variable count is first moved into an xmm register
   (OP_ICONV_TO_X) and the reg form (opcode + 1) is used instead. */
1180 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1183 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1185 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1187 if (args [1]->opcode != OP_ICONST) {
1188 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1189 ins->klass = mono_defaults.int32_class;
1190 ins->sreg1 = args [1]->dreg;
1191 ins->type = STACK_I4;
1192 ins->dreg = vreg2 = alloc_ireg (cfg);
1193 MONO_ADD_INS (cfg->cbb, ins);
1195 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1198 MONO_INST_NEW (cfg, ins, opcode);
1199 ins->klass = cmethod->klass;
1203 if (args [1]->opcode == OP_ICONST) {
1204 ins->inst_imm = args [1]->inst_c0;
1205 NULLIFY_INS (args [1]); /* constant folded into inst_imm; drop the ICONST */
1208 ins->type = STACK_VTYPE;
1209 ins->dreg = alloc_ireg (cfg);
1210 MONO_ADD_INS (cfg->cbb, ins);
/* True if OP is one of the packed integer compare opcodes; relies on
   OP_PCMPEQB..OP_PCMPEQQ being a contiguous opcode range. */
1214 static inline gboolean
1215 mono_op_is_packed_compare (int op)
1217 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/* Emit op_Equality / op_Inequality: packed-compare the two vectors, collect
   the comparison result into an int bitmask (OP_EXTRACT_MASK) and reduce it
   to a boolean with a compare + set. */
1221 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1224 int left_vreg, right_vreg, tmp_vreg;
1226 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1227 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1230 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1231 ins->klass = cmethod->klass;
1232 ins->sreg1 = left_vreg;
1233 ins->sreg2 = right_vreg;
1234 ins->type = STACK_VTYPE;
1235 ins->klass = cmethod->klass; /* NOTE(review): redundant -- klass already set above */
1236 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1237 ins->inst_c0 = intrinsic->flags; /* SIMD_COMP_* kind */
1238 MONO_ADD_INS (cfg->cbb, ins);
1240 /*FIXME the next ops are SSE specific*/
1241 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1242 ins->klass = cmethod->klass;
1243 ins->sreg1 = tmp_vreg;
1244 ins->type = STACK_I4;
1245 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1246 MONO_ADD_INS (cfg->cbb, ins);
1248 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1249 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
/* All 16 mask bits set <=> every element compared equal. */
1250 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1251 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1253 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1254 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1256 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a shuffle with a literal selector; non-constant selectors are not
   supported. */
1262 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1267 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
/* Only an OP_ICONST selector can be encoded as an immediate. */
1269 if (args [1]->opcode != OP_ICONST) {
1270 /*TODO Shuffle with non literals is not yet supported */
1273 vreg = get_simd_vreg (cfg, cmethod, args [0]);
/* NOTE(review): inst_c0 is read from args [1] *after* NULLIFY_INS below --
   NULLIFY_INS apparently preserves inst_c0; confirm. */
1274 NULLIFY_INS (args [1]);
1276 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1277 ins->klass = cmethod->klass;
1279 ins->inst_c0 = args [1]->inst_c0; /* the literal shuffle selector */
1280 ins->type = STACK_VTYPE;
1281 ins->dreg = alloc_ireg (cfg);
1282 MONO_ADD_INS (cfg->cbb, ins);
/* Emit an aligned vector load from the address in args [0]. */
1287 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1291 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1292 ins->klass = cmethod->klass;
1293 ins->sreg1 = args [0]->dreg;
1294 ins->type = STACK_VTYPE;
1295 ins->dreg = alloc_ireg (cfg);
1296 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a vector store of args [1] to the address in args [0], using the
   store opcode from the intrinsic table (plain/aligned/temporal variants). */
1301 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1306 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1308 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1309 ins->klass = cmethod->klass;
1310 ins->dreg = args [0]->dreg; /* for STOREX ops dreg holds the destination base address */
1312 ins->type = STACK_VTYPE;
1313 MONO_ADD_INS (cfg->cbb, ins);
/* Emit OP_EXTRACT_MASK over args [0], yielding an int -- presumably a
   PMOVMSKB-style sign-bit mask of the vector elements. */
1318 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1323 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1325 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1326 ins->klass = cmethod->klass;
1328 ins->type = STACK_I4;
1329 ins->dreg = alloc_ireg (cfg);
1330 MONO_ADD_INS (cfg->cbb, ins);
/* Emit a prefetch hint for the address in args [0]; intrinsic->flags carries
   the locality hint, stored in backend.arg_info. */
1336 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1340 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1341 ins->klass = cmethod->klass;
1342 ins->sreg1 = args [0]->dreg;
1343 ins->backend.arg_info = intrinsic->flags;
1344 MONO_ADD_INS (cfg->cbb, ins);
/* Human-readable name of a SIMD_VERSION_* constant, for debug output. */
1349 simd_version_name (guint32 version)
1352 case SIMD_VERSION_SSE1:
1354 case SIMD_VERSION_SSE2:
1356 case SIMD_VERSION_SSE3:
1358 case SIMD_VERSION_SSSE3:
1360 case SIMD_VERSION_SSE41:
1362 case SIMD_VERSION_SSE42:
1364 case SIMD_VERSION_SSE4a:
/* Look up CMETHOD by name in the INTRINSICS table and dispatch to the
   matching simd_intrinsic_emit_* helper.  Bails out when the method is not
   in the table or requires a SIMD instruction set this CPU lacks. */
1371 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
/* The intrinsics table must be sorted by name for bsearch to work. */
1373 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1375 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1378 if (IS_DEBUG_ON (cfg)) {
1380 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1381 max = fsig->param_count + fsig->hasthis;
1382 for (i = 0; i < max; ++i) {
1383 printf ("param %d: ", i);
1384 mono_print_ins (args [i]);
/* Reject intrinsics whose instruction set isn't in simd_supported_versions. */
1387 if (result->simd_version && !((1 << result->simd_version) & simd_supported_versions)) {
1388 if (IS_DEBUG_ON (cfg))
/* NOTE(review): "unsuported" typo in the debug message below (runtime string,
   left unchanged here). */
1389 printf ("function %s::%s/%d requires unsuported SIMD instruction set %s \n", cmethod->klass->name, cmethod->name, fsig->param_count, simd_version_name (result->simd_version));
1393 switch (result->simd_emit_mode) {
1394 case SIMD_EMIT_BINARY:
1395 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1396 case SIMD_EMIT_UNARY:
1397 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1398 case SIMD_EMIT_SETTER:
1399 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1400 case SIMD_EMIT_GETTER:
1401 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1402 case SIMD_EMIT_GETTER_QWORD:
1403 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1404 case SIMD_EMIT_CTOR:
1405 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1406 case SIMD_EMIT_CAST:
1407 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1408 case SIMD_EMIT_SHUFFLE:
1409 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1410 case SIMD_EMIT_SHIFT:
1411 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1412 case SIMD_EMIT_EQUALITY:
1413 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1414 case SIMD_EMIT_LOAD_ALIGNED:
1415 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1416 case SIMD_EMIT_STORE:
1417 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1418 case SIMD_EMIT_EXTRACT_MASK:
1419 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1420 case SIMD_EMIT_PREFETCH:
1421 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1423 g_assert_not_reached ();
/* Compute the address of a vector-sized element access into an array:
   arr + index * element_size + offsetof (MonoArray, vector).
   When CHECK_BOUNDS, both the first and the last element touched by the
   16-byte access are bounds-checked.  Returns the vreg holding the address
   (callers use it as an int register). */
1427 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1431 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1433 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1434 mult_reg = alloc_preg (cfg);
1435 array_reg = arr->dreg;
1436 index_reg = index->dreg;
1438 #if SIZEOF_VOID_P == 8
1439 /* The array reg is 64 bits but the index reg is only 32 */
1440 index2_reg = alloc_preg (cfg);
1441 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1443 index2_reg = index_reg;
1445 index3_reg = alloc_preg (cfg);
/* Check index and index + (16 / size - 1) so a whole vector fits. */
1448 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1449 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1450 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1453 add_reg = alloc_preg (cfg);
1455 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1456 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1457 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
1458 ins->type = STACK_PTR;
1459 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsics for Mono.Simd.ArrayExtensions: GetVector[Aligned],
   SetVector[Aligned] and IsAligned, all built on mono_emit_vector_ldelema. */
1465 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1467 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1469 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1471 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1472 load->klass = cmethod->klass;
1474 load->type = STACK_VTYPE;
1475 load->dreg = alloc_ireg (cfg);
1476 MONO_ADD_INS (cfg->cbb, load);
1480 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1482 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1483 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1485 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1486 store->klass = cmethod->klass;
1488 store->sreg1 = vreg;
1489 MONO_ADD_INS (cfg->cbb, store);
/* IsAligned: true when the element address is 16-byte aligned
   (no bounds check is requested for this query). */
1493 if (!strcmp ("IsAligned", cmethod->name)) {
1495 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1497 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1498 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1499 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1500 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsics for Mono.Simd.SimdRuntime: get_AccelMode is inlined as the
   constant simd_supported_versions bitmask (see the AOT TODO in the file
   header -- the constant is baked in at JIT time). */
1508 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1510 if (!strcmp ("get_AccelMode", cmethod->name)) {
1512 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1519 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1521 const char *class_name;
1523 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1526 class_name = cmethod->klass->name;
1527 if (!strcmp ("SimdRuntime", class_name))
1528 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1530 if (!strcmp ("ArrayExtensions", class_name))
1531 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1533 if (!strcmp ("VectorOperations", class_name)) {
1534 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1536 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1537 } else if (!cmethod->klass->simd_type)
1540 cfg->uses_simd_intrinsics = 1;
1541 if (!strcmp ("Vector2d", class_name))
1542 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1543 if (!strcmp ("Vector4f", class_name))
1544 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1545 if (!strcmp ("Vector2ul", class_name))
1546 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1547 if (!strcmp ("Vector2l", class_name))
1548 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1549 if (!strcmp ("Vector4ui", class_name))
1550 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1551 if (!strcmp ("Vector4i", class_name))
1552 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1553 if (!strcmp ("Vector8us", class_name))
1554 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1555 if (!strcmp ("Vector8s", class_name))
1556 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1557 if (!strcmp ("Vector16b", class_name))
1558 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1559 if (!strcmp ("Vector16sb", class_name))
1560 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));