 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
17 General notes on SIMD intrinsics
19 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding a indirection reduction pass after the dce pass.
38 TODO extend bounds checking code to support for range checking.
40 General notes for SIMD intrinsics.
42 -Bad extractor and constructor performance
43 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
44 It will be loaded in the FP stack just to be pushed on the call stack.
A similar thing happens with the Vector4f constructor, which requires float vars to be
48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.
51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and promote further optimizations.
58 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
#ifdef MONO_ARCH_SIMD_INTRINSICS

/* Swap in this definition to compile the tracing out entirely. */
//#define IS_DEBUG_ON(cfg) (0)

/* Debug tracing for this pass is enabled at JIT verbosity >= 3 (-v -v -v). */
#define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
/* NOTE: expands `cfg' unhygienically — callers must have a local named cfg in scope. */
#define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
	SIMD_EMIT_GETTER_QWORD,	/* getter for a 64-bit element (see Vector2d/Vector2l get_X/get_Y) */
	SIMD_EMIT_LOAD_ALIGNED,	/* aligned load from memory (SN_LoadAligned) */
	SIMD_EMIT_EXTRACT_MASK,	/* per-byte sign-mask extraction (SN_ExtractByteMask) */
#ifdef HAVE_ARRAY_ELEM_INIT
/*
 * X-macro trick: simd-methods.h is included repeatedly with different
 * definitions of SIMD_METHOD to build one contiguous blob of method-name
 * strings plus SN_* constants that are byte offsets into that blob.
 * MSGSTRFIELD forces __LINE__ to expand before pasting, giving each
 * include line a unique struct field name.
 */
#define MSGSTRFIELD(line) MSGSTRFIELD1(line)
#define MSGSTRFIELD1(line) str##line
/* First expansion: one char array per method name, sized to its string. */
static const struct msgstr_t {
#define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
#include "simd-methods.h"
/* Second expansion: initialize the blob with the name strings themselves. */
#define SIMD_METHOD(str,name) str,
#include "simd-methods.h"
/* Third expansion: each SN_* constant becomes its string's offset in the blob. */
#define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
#include "simd-methods.h"
/* An SN_* value is an offset from the start of the blob to a NUL-terminated name. */
#define method_name(idx) ((const char*)&method_names + (idx))

/* Fallback path (no array-element initializers): a plain array of string
   pointers, with the SN_* constants used as indices into it. */
#define SIMD_METHOD(str,name) str,
static const char * const method_names [] = {
#include "simd-methods.h"
#define SIMD_METHOD(str,name) name,
#include "simd-methods.h"
#define method_name(idx) (method_names [(idx)])
	guint8 simd_emit_mode : 4;	/* SIMD_EMIT_* — selects how the intrinsic is lowered */
	guint8 simd_version : 4;	/* minimum SIMD_VERSION_* required; entries that omit it get 0 */
/*
 * Intrinsic table for Vector4f (ctor expands an R4, i.e. 4 x float).
 * Entry layout: { name, opcode or element index, emit mode
 *                 [, required SIMD version [, compare/prefetch immediate]] }.
 * NOTE(review): entries appear to be kept sorted by method name for a
 * strcmp-based lookup (see simd_intrinsic_compare_by_name) — keep the
 * ordering when adding entries.
 */
static const SimdIntrinsc vector4f_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R4, SIMD_EMIT_CTOR },
	{ SN_AddSub, OP_ADDSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_AndNot, OP_ANDNPS, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_CompareLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
	{ SN_CompareLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
	{ SN_CompareNotEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_CompareNotLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
	{ SN_CompareNotLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
	{ SN_CompareOrdered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
	{ SN_CompareUnordered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
	{ SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
	{ SN_DuplicateLow, OP_DUPPS_LOW, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
	{ SN_HorizontalAdd, OP_HADDPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_HorizontalSub, OP_HSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_EMIT_BINARY },
	{ SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_EMIT_BINARY },
	{ SN_InvSqrt, OP_RSQRTPS, SIMD_EMIT_UNARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_MAXPS, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPS, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_Reciprocal, OP_RCPPS, SIMD_EMIT_UNARY },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
	{ SN_Sqrt, OP_SQRTPS, SIMD_EMIT_UNARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_get_W, 3, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_ADDPS, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_ANDPS, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_ORPS, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPS, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_XORPS, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_Multiply, OP_MULPS, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPS, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Vector2d (ctor expands an R8, i.e. 2 x double).
 * Same entry layout as vector4f_intrinsics; keep sorted by method name.
 */
static const SimdIntrinsc vector2d_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R8, SIMD_EMIT_CTOR },
	{ SN_AddSub, OP_ADDSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_AndNot, OP_ANDNPD, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_CompareLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
	{ SN_CompareLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
	{ SN_CompareNotEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_CompareNotLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
	{ SN_CompareNotLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
	{ SN_CompareOrdered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
	{ SN_CompareUnordered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
	{ SN_Duplicate, OP_DUPPD, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
	{ SN_HorizontalAdd, OP_HADDPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_HorizontalSub, OP_HSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
	{ SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_EMIT_BINARY },
	{ SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_MAXPD, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPD, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_ADDPD, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_ANDPD, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_ORPD, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPD, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_XORPD, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Multiply, OP_MULPD, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPD, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Vector2ul (2 x unsigned 64-bit). Shifts use the
 * logical PSHRQ (there is no arithmetic 64-bit right shift in SSE2).
 * Keep sorted by method name.
 */
static const SimdIntrinsc vector2ul_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
	{ SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Vector2l (2 x signed 64-bit). There is no op_RightShift:
 * SSE has no arithmetic 64-bit right shift, so only the explicit
 * LogicalRightShift is exposed. Keep sorted by method name.
 */
static const SimdIntrinsc vector2l_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE42 },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
	{ SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Vector4ui (4 x unsigned 32-bit). op_RightShift is the
 * logical PSHRD (unsigned semantics); the arithmetic PSARD is exposed
 * explicitly as ArithmeticRightShift. Keep sorted by method name.
 */
static const SimdIntrinsc vector4ui_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
	{ SN_ArithmeticRightShift, OP_PSARD, SIMD_EMIT_SHIFT },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_Min, OP_PMIND_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
	{ SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
	{ SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_op_RightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Vector4i (4 x signed 32-bit). op_RightShift is the
 * arithmetic PSARD (signed semantics); the logical PSHRD is exposed
 * explicitly as LogicalRightShift. Keep sorted by method name.
 */
static const SimdIntrinsc vector4i_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTD, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
	{ SN_Max, OP_PMAXD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_Min, OP_PMIND, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_PackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
	{ SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_op_RightShift, OP_PSARD, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Vector8us (8 x unsigned 16-bit). Saturating
 * add/subtract use the *_UN (unsigned saturation) opcodes. Keep sorted
 * by method name.
 */
static const SimdIntrinsc vector8us_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
	{ SN_ArithmeticRightShift, OP_PSARW, SIMD_EMIT_SHIFT },
	{ SN_Average, OP_PAVGW_UN, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_Min, OP_PMINW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
	{ SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
	{ SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
	{ SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_EMIT_SETTER },
400 static const SimdIntrinsc vector8s_intrinsics[] = {
401 { SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
402 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_EMIT_BINARY },
403 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
404 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_EMIT_BINARY },
405 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
406 { SN_LogicalRightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
407 { SN_Max, OP_PMAXW, SIMD_EMIT_BINARY },
408 { SN_Min, OP_PMINW, SIMD_EMIT_BINARY },
409 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_EMIT_BINARY },
410 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
411 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
412 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
413 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
414 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
415 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
416 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
417 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
418 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
419 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
420 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
421 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
422 { SN_get_V0, 0, SIMD_EMIT_GETTER },
423 { SN_get_V1, 1, SIMD_EMIT_GETTER },
424 { SN_get_V2, 2, SIMD_EMIT_GETTER },
425 { SN_get_V3, 3, SIMD_EMIT_GETTER },
426 { SN_get_V4, 4, SIMD_EMIT_GETTER },
427 { SN_get_V5, 5, SIMD_EMIT_GETTER },
428 { SN_get_V6, 6, SIMD_EMIT_GETTER },
429 { SN_get_V7, 7, SIMD_EMIT_GETTER },
430 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
431 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
432 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
433 { SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
434 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
435 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
436 { SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
437 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
438 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
439 { SN_op_RightShift, OP_PSARW, SIMD_EMIT_SHIFT },
440 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
441 { SN_set_V0, 0, SIMD_EMIT_SETTER },
442 { SN_set_V1, 1, SIMD_EMIT_SETTER },
443 { SN_set_V2, 2, SIMD_EMIT_SETTER },
444 { SN_set_V3, 3, SIMD_EMIT_SETTER },
445 { SN_set_V4, 4, SIMD_EMIT_SETTER },
446 { SN_set_V5, 5, SIMD_EMIT_SETTER },
447 { SN_set_V6, 6, SIMD_EMIT_SETTER },
448 { SN_set_V7, 7, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Vector16b (16 x unsigned 8-bit). Saturating
 * add/subtract use the *_UN (unsigned saturation) opcodes. Getter/setter
 * entries are in strcmp order (V1, V10..V15, V2..) — keep that ordering.
 */
static const SimdIntrinsc vector16b_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
	{ SN_Average, OP_PAVGB_UN, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB_UN, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINB_UN, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
	{ SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_EMIT_SETTER },
/*
 * Intrinsic table for Vector16sb (16 x signed 8-bit). Saturating
 * add/subtract use the signed-saturation opcodes (contrast Vector16b's
 * *_UN variants). Keep sorted by method name (strcmp order).
 */
static const SimdIntrinsc vector16sb_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTB, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_Min, OP_PMINB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_EMIT_SETTER },
/* Bitmask of SIMD instruction-set levels supported by the CPU; filled in by
   mono_simd_intrinsics_init via mono_arch_cpu_enumerate_simd_versions. */
static guint32 simd_supported_versions;
/*TODO match using number of parameters as well*/
/* bsearch-style comparer: `key' is a method-name C string, `value' points at
   a SimdIntrinsc entry; compares the key against the entry's interned name. */
simd_intrinsic_compare_by_name (const void *key, const void *value)
	return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
	VREG_HAS_XZERO_BB0 = 0x02,	/* vreg is zeroed by an OP_XZERO in the entry bb */
	VREG_HAS_OTHER_OP_BB0 = 0x04,	/* vreg is touched by some non-XZERO op in the entry bb */
	VREG_SINGLE_BB_USE = 0x08,	/* vreg used in exactly one bb after the entry bb */
	VREG_MANY_BB_USE = 0x10,	/* vreg used across multiple bbs after the entry bb */
/* One-time initialization: cache which SIMD versions this CPU supports. */
mono_simd_intrinsics_init (void)
	simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
	/*TODO log the supported flags*/
/*
 * Mark `reg' as touched by a non-XZERO instruction in the entry basic block:
 * this disqualifies it from the "only an xzero in bb0" optimization by
 * clearing VREG_HAS_XZERO_BB0 and setting VREG_HAS_OTHER_OP_BB0.
 * Only vregs in [0, max_vreg] with a nonzero vreg_flags entry are tracked.
 */
static inline gboolean
apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
	if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
		/* Any other bb0 op on this vreg kills the xzero-only property. */
		vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
		vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
		DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
/*
 * Track in which basic blocks (other than the entry bb) `reg' is used.
 * The first use outside bb0 records its bb in target_bb and sets
 * VREG_SINGLE_BB_USE; a use in a second, different bb promotes the vreg
 * to VREG_MANY_BB_USE. Untracked vregs (out of range, or without
 * VREG_HAS_XZERO_BB0) and repeat uses in the already-recorded bb are ignored.
 */
static inline gboolean
apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
	if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
	/* Second distinct bb seen: promote from single-bb use to many-bb use. */
	if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
		vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
		vreg_flags [reg] |= VREG_MANY_BB_USE;
		DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
	} else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
		/* First use outside bb0: remember which bb it was in. */
		vreg_flags [reg] |= VREG_SINGLE_BB_USE;
		target_bb [reg] = bb;
		DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
This pass recalculates which vars need MONO_INST_INDIRECT.
633 We cannot do this for non SIMD vars since code like mono_get_vtable_var
634 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
637 mono_simd_simplify_indirection (MonoCompile *cfg)
640 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
644 for (i = 0; i < cfg->num_varinfo; i++) {
645 MonoInst *var = cfg->varinfo [i];
646 if (var->klass->simd_type) {
647 var->flags &= ~MONO_INST_INDIRECT;
648 max_vreg = MAX (var->dreg, max_vreg);
652 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
653 if (!first_bb && bb->code)
655 for (ins = bb->code; ins; ins = ins->next) {
656 if (ins->opcode == OP_LDADDR) {
657 MonoInst *var = (MonoInst*)ins->inst_p0;
658 if (var->klass->simd_type) {
659 var->flags |= MONO_INST_INDIRECT;
665 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
666 vreg_flags = g_malloc0 (max_vreg + 1);
667 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
669 for (i = 0; i < cfg->num_varinfo; i++) {
670 MonoInst *var = cfg->varinfo [i];
671 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
672 vreg_flags [var->dreg] = VREG_USED;
673 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
677 /*Scan the first basic block looking xzeros not used*/
678 for (ins = first_bb->code; ins; ins = ins->next) {
680 int sregs [MONO_MAX_SRC_REGS];
682 if (ins->opcode == OP_XZERO) {
683 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
684 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
685 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
689 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
691 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
693 num_sregs = mono_inst_get_src_registers (ins, sregs);
694 for (i = 0; i < num_sregs; ++i) {
695 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
700 if (IS_DEBUG_ON (cfg)) {
701 for (i = 0; i < cfg->num_varinfo; i++) {
702 MonoInst *var = cfg->varinfo [i];
703 if (var->klass->simd_type) {
704 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
705 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
706 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
707 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
712 /*TODO stop here if no var is xzero only*/
715 Scan all other bb and check if it has only one other use
716 Ideally this would be done after an extended bb formation pass
718 FIXME This pass could use dominator information to properly
719 place the XZERO on the bb that dominates all uses of the var,
720 but this will have zero effect with the current local reg alloc
722 TODO simply the use of flags.
725 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
726 for (ins = bb->code; ins; ins = ins->next) {
728 int sregs [MONO_MAX_SRC_REGS];
730 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
732 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
734 num_sregs = mono_inst_get_src_registers (ins, sregs);
735 for (i = 0; i < num_sregs; ++i) {
736 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
737 max_vreg, vreg_flags, target_bb))
743 for (i = 0; i < cfg->num_varinfo; i++) {
744 MonoInst *var = cfg->varinfo [i];
745 if (!var->klass->simd_type)
747 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
748 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
749 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
750 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
752 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
754 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
756 int sregs [MONO_MAX_SRC_REGS];
757 gboolean found = FALSE;
759 num_sregs = mono_inst_get_src_registers (ins, sregs);
760 for (j = 0; j < num_sregs; ++j) {
761 if (sregs [i] == var->dreg)
764 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
765 if (ins->dreg == var->dreg && !found) {
769 MONO_INST_NEW (cfg, tmp, OP_XZERO);
770 tmp->dreg = var->dreg;
771 tmp->type = STACK_VTYPE;
772 tmp->klass = var->klass;
773 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
779 for (ins = first_bb->code; ins; ins = ins->next) {
780 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
/* Returns the vreg holding the SIMD value of `src` without emitting any
 * loads; aborts when the source vreg cannot be inferred. */
789 * This function expects that src is a value.
792 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
794 if (src->opcode == OP_XMOVE) {
796 } else if (src->type == STACK_VTYPE) {
/* Neither an XMOVE nor a vtype value: we have no way to recover the vreg. */
799 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
800 mono_print_ins (src);
801 g_assert_not_reached ();
/* Returns a vreg holding the SIMD value of `src`, emitting an OP_LOADX_MEMBASE
 * when `src` is an address (STACK_PTR/STACK_MP); aborts otherwise. */
805 * This function will load the value if needed.
808 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
810 if (src->opcode == OP_XMOVE) {
812 } else if (src->opcode == OP_LDADDR) {
/* Address of a local: use the underlying variable's vreg directly. */
813 int res = ((MonoInst*)src->inst_p0)->dreg;
816 } else if (src->type == STACK_VTYPE) {
818 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* A raw pointer: load the 16-byte value it points to into a fresh vreg. */
821 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
822 ins->klass = cmethod->klass;
823 ins->sreg1 = src->dreg;
824 ins->type = STACK_VTYPE;
825 ins->dreg = alloc_ireg (cfg);
826 MONO_ADD_INS (cfg->cbb, ins);
829 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
830 mono_print_ins (src);
831 g_assert_not_reached ();
/* Lazily creates (and caches on the MonoCompile) a volatile int32 local used
 * as a spill slot for raw int<->float moves. */
835 get_int_to_float_spill_area (MonoCompile *cfg)
837 if (!cfg->iconv_raw_var) {
838 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
839 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
841 return cfg->iconv_raw_var;
844 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily creates (and caches) a volatile double local used as a spill slot
 * for double-precision SIMD extract/insert operations. */
846 get_double_spill_area (MonoCompile *cfg)
848 if (!cfg->fconv_to_r8_x_var) {
849 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
850 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
852 return cfg->fconv_to_r8_x_var;
/* Lazily creates (and caches) a volatile local of the vector type, used as
 * scratch space when building a SIMD value element by element in a ctor. */
855 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
857 if (!cfg->simd_ctor_var) {
858 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
859 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
861 return cfg->simd_ctor_var;
/* Emits a two-operand SIMD instruction (intrinsic->opcode) over args[0] and
 * args[1]; both operands must already be values (see get_simd_vreg).
 * intrinsic->flags is forwarded in inst_c0 for ops that need a sub-selector. */
865 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
868 int left_vreg, right_vreg;
870 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
871 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
874 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
875 ins->klass = cmethod->klass;
876 ins->sreg1 = left_vreg;
877 ins->sreg2 = right_vreg;
878 ins->type = STACK_VTYPE;
880 ins->dreg = alloc_ireg (cfg);
881 ins->inst_c0 = intrinsic->flags;
882 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a one-operand SIMD instruction (intrinsic->opcode) over args[0];
 * the operand must already be a value. */
887 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
892 vreg = get_simd_vreg (cfg, cmethod, args [0]);
894 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
895 ins->klass = cmethod->klass;
897 ins->type = STACK_VTYPE;
898 ins->dreg = alloc_ireg (cfg);
899 MONO_ADD_INS (cfg->cbb, ins);
/* Maps an element type to the matching OP_EXTRACT_* opcode (signed/unsigned
 * byte, word, or dword); aborts on element types with no extract op. */
904 mono_type_to_extract_op (MonoType *type)
906 switch (type->type) {
908 return OP_EXTRACT_I1;
910 return OP_EXTRACT_U1;
912 return OP_EXTRACT_I2;
914 return OP_EXTRACT_U2;
918 return OP_EXTRACT_I4;
920 g_assert_not_reached ();
923 /*Returns the amount to shift the element index to get the dword it belongs to*/
/* Aborts on element types that are not supported as vector elements. */
925 mono_type_elements_shift_bits (MonoType *type)
927 switch (type->type) {
939 g_assert_not_reached ();
/* Maps an element type to the matching OP_INSERTX_*_SLOW opcode used by the
 * element setters; aborts on element types with no insert op. */
943 mono_type_to_slow_insert_op (MonoType *type)
945 switch (type->type) {
948 return OP_INSERTX_U1_SLOW;
954 return OP_INSERTX_I4_SLOW;
957 return OP_INSERTX_I8_SLOW;
959 return OP_INSERTX_R4_SLOW;
961 return OP_INSERTX_R8_SLOW;
963 g_assert_not_reached ();
/* Emits code for an element setter (set_X etc.). 2/4/8-byte elements use a
 * single INSERTX_*_SLOW; byte-sized elements go through an extract-word +
 * byte-insert sequence since there is no direct byte insert on older SSE. */
967 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
970 MonoMethodSignature *sig = mono_method_signature (cmethod);
972 size = mono_type_size (sig->params [0], &align);
974 if (size == 2 || size == 4 || size == 8) {
975 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
976 ins->klass = cmethod->klass;
977 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
978 ins->dreg = ins->sreg1 = load_simd_vreg (cfg, cmethod, args [0]);
979 ins->sreg2 = args [1]->dreg;
980 ins->inst_c0 = intrinsic->opcode;
/* Float/double inserts need a stack spill slot to move through the xmm. */
981 if (sig->params [0]->type == MONO_TYPE_R4)
982 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
983 else if (sig->params [0]->type == MONO_TYPE_R8)
984 ins->backend.spill_var = get_double_spill_area (cfg);
985 MONO_ADD_INS (cfg->cbb, ins);
/* Byte element: extract the containing 16-bit word first... */
989 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
990 ins->klass = cmethod->klass;
991 ins->sreg1 = sreg = load_simd_vreg (cfg, cmethod, args [0]);
992 ins->type = STACK_I4;
993 ins->dreg = vreg = alloc_ireg (cfg);
994 ins->inst_c0 = intrinsic->opcode / 2;
995 MONO_ADD_INS (cfg->cbb, ins);
/* ...then re-insert the word with the new byte merged in. */
997 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
998 ins->klass = cmethod->klass;
1000 ins->sreg2 = args [1]->dreg;
1002 ins->inst_c0 = intrinsic->opcode;
1003 MONO_ADD_INS (cfg->cbb, ins);
/* Emits code for an element getter: optionally a PSHUFLED to move the wanted
 * element into the low dword, then the type-appropriate OP_EXTRACT_*.
 * For R4 results the raw bits are moved to the FP stack via ICONV_TO_R8_RAW. */
1010 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1013 MonoMethodSignature *sig = mono_method_signature (cmethod);
1014 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1016 vreg = load_simd_vreg (cfg, cmethod, args [0]);
/* The high bits of the encoded opcode hold the shuffle selector; zero means
 * the element already lives in the low dword and no shuffle is needed. */
1018 if (intrinsic->opcode >> shift_bits) {
1019 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1020 ins->klass = cmethod->klass;
1022 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1023 ins->type = STACK_VTYPE;
1024 ins->dreg = vreg = alloc_ireg (cfg);
1025 MONO_ADD_INS (cfg->cbb, ins);
1028 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1029 ins->klass = cmethod->klass;
1031 ins->type = STACK_I4;
1032 ins->dreg = vreg = alloc_ireg (cfg);
/* The low bits select the sub-dword element index. */
1033 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1034 MONO_ADD_INS (cfg->cbb, ins);
1036 if (sig->ret->type == MONO_TYPE_R4) {
1037 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1038 ins->klass = mono_defaults.single_class;
1040 ins->type = STACK_R8;
1041 ins->dreg = alloc_freg (cfg);
1042 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1043 MONO_ADD_INS (cfg->cbb, ins);
/* Emits code for a 64-bit element getter: OP_EXTRACT_R8 for double results
 * (routed through the shared double spill slot), OP_EXTRACT_I8 otherwise. */
1049 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1053 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1055 vreg = load_simd_vreg (cfg, cmethod, args [0]);
1057 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1058 ins->klass = cmethod->klass;
1060 ins->inst_c0 = intrinsic->opcode;
1062 ins->type = STACK_R8;
1063 ins->dreg = alloc_freg (cfg);
1064 ins->backend.spill_var = get_double_spill_area (cfg);
1066 ins->type = STACK_I8;
1067 ins->dreg = alloc_lreg (cfg);
1069 MONO_ADD_INS (cfg->cbb, ins);
/* Emits code for a vector constructor. Single-argument ctors use the
 * intrinsic's expand opcode directly; multi-argument ctors store each scalar
 * to memory (a spill local, or the target itself when it's addressable) and
 * load the completed vector back. When the target was an LDADDR of a local
 * the LDADDR is eliminated and the value is rebuilt in its vreg. */
1075 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1077 MonoInst *ins = NULL;
1079 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1080 MonoMethodSignature *sig = mono_method_signature (cmethod);
1081 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1082 int arg_size = mono_type_size (sig->params [0], &i);
1084 if (sig->param_count == 1) {
/* Initing a local: write straight into the variable's own vreg. */
1088 dreg = args [0]->inst_i0->dreg;
1089 NULLIFY_INS (args [0]);
1091 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1092 dreg = alloc_ireg (cfg);
1095 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1096 ins->klass = cmethod->klass;
1097 ins->sreg1 = args [1]->dreg;
1098 ins->type = STACK_VTYPE;
1101 MONO_ADD_INS (cfg->cbb, ins);
1102 if (sig->params [0]->type == MONO_TYPE_R4)
1103 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1104 else if (sig->params [0]->type == MONO_TYPE_R8)
1105 ins->backend.spill_var = get_double_spill_area (cfg);
/* Target is a raw pointer: store the expanded vector through it. */
1108 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1109 ins->dreg = args [0]->dreg;
1111 MONO_ADD_INS (cfg->cbb, ins);
/* Multi-arg ctor: pick the address we will fill element by element. */
1117 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1118 MONO_ADD_INS (cfg->cbb, ins);
1119 addr_reg = ins->dreg;
1121 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1122 addr_reg = args [0]->dreg;
/* Store the scalars back to front (order presumably aids scheduling — TODO confirm). */
1125 for (i = sig->param_count - 1; i >= 0; --i) {
1126 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1129 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1130 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1131 NULLIFY_INS (args [0]);
1133 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1134 ins->klass = cmethod->klass;
1135 ins->sreg1 = addr_reg;
1136 ins->type = STACK_VTYPE;
1138 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a bitwise reinterpret-cast between vector types: a plain OP_XMOVE
 * with the destination class — no data conversion is performed. */
1144 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1149 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1151 //TODO macroize this
1152 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1153 ins->klass = cmethod->klass;
1154 ins->type = STACK_VTYPE;
1156 ins->dreg = alloc_ireg (cfg);
1157 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a packed shift. A constant count uses the immediate form of the
 * opcode; a variable count is first broadcast into an xmm via OP_ICONV_TO_X
 * and the register form (opcode + 1) is used instead. */
1162 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1165 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1167 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1169 if (args [1]->opcode != OP_ICONST) {
1170 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1171 ins->klass = mono_defaults.int32_class;
1172 ins->sreg1 = args [1]->dreg;
1173 ins->type = STACK_I4;
1174 ins->dreg = vreg2 = alloc_ireg (cfg);
1175 MONO_ADD_INS (cfg->cbb, ins);
1177 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1180 MONO_INST_NEW (cfg, ins, opcode);
1181 ins->klass = cmethod->klass;
1185 if (args [1]->opcode == OP_ICONST) {
1186 ins->inst_imm = args [1]->inst_c0;
1187 NULLIFY_INS (args [1]);
1190 ins->type = STACK_VTYPE;
1191 ins->dreg = alloc_ireg (cfg);
1192 MONO_ADD_INS (cfg->cbb, ins);
/* TRUE if `op` is one of the packed integer equality compares; relies on the
 * opcodes OP_PCMPEQB..OP_PCMPEQQ being declared contiguously. */
1196 static inline gboolean
1197 mono_op_is_packed_compare (int op)
1199 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/* Emits op_Equality/op_Inequality for vectors: a packed compare, then
 * OP_EXTRACT_MASK to collapse the result to a scalar mask, then an integer
 * compare of the mask against all-ones (EQ) or zero (NE). */
1203 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1206 int left_vreg, right_vreg, tmp_vreg;
1208 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1209 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1212 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1213 ins->klass = cmethod->klass;
1214 ins->sreg1 = left_vreg;
1215 ins->sreg2 = right_vreg;
1216 ins->type = STACK_VTYPE;
1218 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1219 ins->inst_c0 = intrinsic->flags;
1220 MONO_ADD_INS (cfg->cbb, ins);
1222 /*FIXME the next ops are SSE specific*/
1223 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1224 ins->klass = cmethod->klass;
1225 ins->sreg1 = tmp_vreg;
1226 ins->type = STACK_I4;
1227 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1228 MONO_ADD_INS (cfg->cbb, ins);
1230 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1231 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
1232 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1233 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1235 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1236 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1238 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a shuffle of args[0] using the compile-time constant selector in
 * args[1]; non-constant selectors are not supported and abort compilation. */
1244 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1249 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
1251 if (args [1]->opcode != OP_ICONST) {
1252 g_warning ("Shuffle with non literals is not yet supported");
1253 g_assert_not_reached ();
1255 vreg = get_simd_vreg (cfg, cmethod, args [0]);
/* The selector constant is folded into inst_c0; the ICONST itself is dead. */
1256 NULLIFY_INS (args [1]);
1258 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1259 ins->klass = cmethod->klass;
1261 ins->inst_c0 = args [1]->inst_c0;
1262 ins->type = STACK_VTYPE;
1263 ins->dreg = alloc_ireg (cfg);
1264 MONO_ADD_INS (cfg->cbb, ins);
/* Emits an aligned 16-byte vector load from the address in args[0]. */
1269 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1273 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1274 ins->klass = cmethod->klass;
1275 ins->sreg1 = args [0]->dreg;
1276 ins->type = STACK_VTYPE;
1277 ins->dreg = alloc_ireg (cfg);
1278 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a vector store of args[1] to the address in args[0]; the opcode comes
 * from the intrinsic (temporal vs non-temporal variants). For membase stores
 * dreg carries the destination base address. */
1283 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1288 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1290 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1291 ins->klass = cmethod->klass;
1292 ins->dreg = args [0]->dreg;
1294 ins->type = STACK_VTYPE;
1295 MONO_ADD_INS (cfg->cbb, ins);
/* Emits OP_EXTRACT_MASK on args[0], producing an int32 — presumably the
 * per-element sign-bit mask (PMOVMSKB-style); confirm in the backend. */
1300 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1305 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1307 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1308 ins->klass = cmethod->klass;
1310 ins->type = STACK_I4;
1311 ins->dreg = alloc_ireg (cfg);
1312 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a prefetch hint for the address in args[0]; intrinsic->flags selects
 * the hint kind and is passed to the backend via backend.arg_info. */
1318 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1322 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1323 ins->klass = cmethod->klass;
1324 ins->sreg1 = args [0]->dreg;
1325 ins->backend.arg_info = intrinsic->flags;
1326 MONO_ADD_INS (cfg->cbb, ins);
/* Returns a printable name for a SIMD_VERSION_* constant; used only in
 * diagnostic output. */
1331 simd_version_name (guint32 version)
1334 case SIMD_VERSION_SSE1:
1336 case SIMD_VERSION_SSE2:
1338 case SIMD_VERSION_SSE3:
1340 case SIMD_VERSION_SSSE3:
1342 case SIMD_VERSION_SSE41:
1344 case SIMD_VERSION_SSE42:
1346 case SIMD_VERSION_SSE4a:
/* Looks up cmethod by name in the (name-sorted) intrinsics table, rejects it
 * if the required SIMD instruction set is not available on this CPU, and
 * dispatches to the matching simd_intrinsic_emit_* routine. Returns the
 * emitted instruction, or NULL when the call is not an intrinsic. */
1353 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1355 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1357 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1360 if (IS_DEBUG_ON (cfg)) {
1362 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1363 max = fsig->param_count + fsig->hasthis;
1364 for (i = 0; i < max; ++i) {
1365 printf ("param %d: ", i);
1366 mono_print_ins (args [i]);
/* Bail out when the CPU lacks the instruction set this intrinsic needs. */
1369 if (result->simd_version && !((1 << result->simd_version) & simd_supported_versions)) {
1370 if (IS_DEBUG_ON (cfg))
1371 printf ("function %s::%s/%d requires unsupported SIMD instruction set %s \n", cmethod->klass->name, cmethod->name, fsig->param_count, simd_version_name (result->simd_version));
1375 switch (result->simd_emit_mode) {
1376 case SIMD_EMIT_BINARY:
1377 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1378 case SIMD_EMIT_UNARY:
1379 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1380 case SIMD_EMIT_SETTER:
1381 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1382 case SIMD_EMIT_GETTER:
1383 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1384 case SIMD_EMIT_GETTER_QWORD:
1385 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1386 case SIMD_EMIT_CTOR:
1387 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1388 case SIMD_EMIT_CAST:
1389 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1390 case SIMD_EMIT_SHUFFLE:
1391 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1392 case SIMD_EMIT_SHIFT:
1393 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1394 case SIMD_EMIT_EQUALITY:
1395 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1396 case SIMD_EMIT_LOAD_ALIGNED:
1397 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1398 case SIMD_EMIT_STORE:
1399 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1400 case SIMD_EMIT_EXTRACT_MASK:
1401 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1402 case SIMD_EMIT_PREFETCH:
1403 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1405 g_assert_not_reached ();
/* Computes the address of a vector-sized element inside a managed array and
 * returns it in a pointer vreg. With check_bounds, both the first and the
 * last array element covered by the 16-byte access are range checked. */
1409 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1413 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1415 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1416 mult_reg = alloc_preg (cfg);
1417 array_reg = arr->dreg;
1418 index_reg = index->dreg;
1420 #if SIZEOF_VOID_P == 8
1421 /* The array reg is 64 bits but the index reg is only 32 */
1422 index2_reg = alloc_preg (cfg);
1423 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1425 index2_reg = index_reg;
1427 index3_reg = alloc_preg (cfg);
/* Check the first element, then the last one touched by the 16-byte span
 * (index + 16/size - 1), so the whole vector access is in bounds. */
1430 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1431 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1432 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
/* addr = array + index * element_size + offsetof (MonoArray, vector) */
1435 add_reg = alloc_preg (cfg);
1437 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1438 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1439 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
1440 ins->type = STACK_PTR;
1441 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsifies the Mono.Simd.ArrayExtensions helpers: GetVector[Aligned],
 * SetVector[Aligned] and IsAligned. Returns NULL for any other method. */
1447 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1449 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1451 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1453 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1454 load->klass = cmethod->klass;
1456 load->type = STACK_VTYPE;
1457 load->dreg = alloc_ireg (cfg);
1458 MONO_ADD_INS (cfg->cbb, load);
1462 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1464 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1465 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1467 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1468 store->klass = cmethod->klass;
1470 store->sreg1 = vreg;
1471 MONO_ADD_INS (cfg->cbb, store);
/* IsAligned: true iff the element address has its low 4 bits clear. */
1475 if (!strcmp ("IsAligned", cmethod->name)) {
1477 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1479 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1480 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1481 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1482 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsifies SimdRuntime.get_AccelMode by embedding the detected SIMD
 * version bitmask as a compile-time constant. */
1490 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1492 if (!strcmp ("get_AccelMode", cmethod->name)) {
1494 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1501 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1503 const char *class_name;
1505 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1508 class_name = cmethod->klass->name;
1509 if (!strcmp ("SimdRuntime", class_name))
1510 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1512 if (!strcmp ("ArrayExtensions", class_name))
1513 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1515 if (!strcmp ("VectorOperations", class_name)) {
1516 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1518 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1519 } else if (!cmethod->klass->simd_type)
1522 cfg->uses_simd_intrinsics = 1;
1523 if (!strcmp ("Vector2d", class_name))
1524 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1525 if (!strcmp ("Vector4f", class_name))
1526 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1527 if (!strcmp ("Vector2ul", class_name))
1528 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1529 if (!strcmp ("Vector2l", class_name))
1530 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1531 if (!strcmp ("Vector4ui", class_name))
1532 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1533 if (!strcmp ("Vector4i", class_name))
1534 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1535 if (!strcmp ("Vector8us", class_name))
1536 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1537 if (!strcmp ("Vector8s", class_name))
1538 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1539 if (!strcmp ("Vector16b", class_name))
1540 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1541 if (!strcmp ("Vector16sb", class_name))
1542 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));