2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
17 General notes on SIMD intrinsics
19 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
23 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
33 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
38 TODO extend bounds checking code to support range checking.
40 General notes for SIMD intrinsics.
42 -Bad extractor and constructor performance
43 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
44 It will be loaded in the FP stack just to be pushed on the call stack.
46 A similar thing happens with the Vector4f constructor, which requires float vars to be
48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.
51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and promote further optimizations.
58 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
62 #ifdef MONO_ARCH_SIMD_INTRINSICS
64 //#define IS_DEBUG_ON(cfg) (0)
66 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
67 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
73 SIMD_EMIT_GETTER_QWORD,
79 SIMD_EMIT_LOAD_ALIGNED,
81 SIMD_EMIT_EXTRACT_MASK,
85 #ifdef HAVE_ARRAY_ELEM_INIT
86 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
87 #define MSGSTRFIELD1(line) str##line
88 static const struct msgstr_t {
89 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
90 #include "simd-methods.h"
93 #define SIMD_METHOD(str,name) str,
94 #include "simd-methods.h"
99 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
100 #include "simd-methods.h"
102 #define method_name(idx) ((const char*)&method_names + (idx))
105 #define SIMD_METHOD(str,name) str,
106 static const char * const method_names [] = {
107 #include "simd-methods.h"
111 #define SIMD_METHOD(str,name) name,
113 #include "simd-methods.h"
117 #define method_name(idx) (method_names [(idx)])
124 guint8 simd_emit_mode : 4;
125 guint8 simd_version : 4;
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, OP_EXPAND_R4, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
132 { SN_AndNot, OP_ANDNPS, SIMD_EMIT_BINARY },
133 { SN_CompareEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
141 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
142 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
143 { SN_HorizontalAdd, OP_HADDPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
144 { SN_HorizontalSub, OP_HSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
145 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_EMIT_BINARY },
146 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_EMIT_BINARY },
147 { SN_InvSqrt, OP_RSQRTPS, SIMD_EMIT_UNARY },
148 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
149 { SN_Max, OP_MAXPS, SIMD_EMIT_BINARY },
150 { SN_Min, OP_MINPS, SIMD_EMIT_BINARY },
151 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
152 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
153 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
154 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
155 { SN_Reciprocal, OP_RCPPS, SIMD_EMIT_UNARY },
156 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
157 { SN_Sqrt, OP_SQRTPS, SIMD_EMIT_UNARY },
158 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
159 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_EMIT_STORE },
160 { SN_get_W, 3, SIMD_EMIT_GETTER },
161 { SN_get_X, 0, SIMD_EMIT_GETTER },
162 { SN_get_Y, 1, SIMD_EMIT_GETTER },
163 { SN_get_Z, 2, SIMD_EMIT_GETTER },
164 { SN_op_Addition, OP_ADDPS, SIMD_EMIT_BINARY },
165 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_EMIT_BINARY },
166 { SN_op_BitwiseOr, OP_ORPS, SIMD_EMIT_BINARY },
167 { SN_op_Division, OP_DIVPS, SIMD_EMIT_BINARY },
168 { SN_op_Equality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
169 { SN_op_ExclusiveOr, OP_XORPS, SIMD_EMIT_BINARY },
170 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
171 { SN_op_Inequality, OP_COMPPS, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
172 { SN_op_Multiply, OP_MULPS, SIMD_EMIT_BINARY },
173 { SN_op_Subtraction, OP_SUBPS, SIMD_EMIT_BINARY },
174 { SN_set_W, 3, SIMD_EMIT_SETTER },
175 { SN_set_X, 0, SIMD_EMIT_SETTER },
176 { SN_set_Y, 1, SIMD_EMIT_SETTER },
177 { SN_set_Z, 2, SIMD_EMIT_SETTER },
180 static const SimdIntrinsc vector2d_intrinsics[] = {
181 { SN_ctor, OP_EXPAND_R8, SIMD_EMIT_CTOR },
182 { SN_AddSub, OP_ADDSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
183 { SN_AndNot, OP_ANDNPD, SIMD_EMIT_BINARY },
184 { SN_CompareEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
185 { SN_CompareLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
186 { SN_CompareLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
187 { SN_CompareNotEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
188 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
189 { SN_CompareNotLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
190 { SN_CompareOrdered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
191 { SN_CompareUnordered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
192 { SN_Duplicate, OP_DUPPD, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
193 { SN_HorizontalAdd, OP_HADDPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
194 { SN_HorizontalSub, OP_HSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
195 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_EMIT_BINARY },
196 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_EMIT_BINARY },
197 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
198 { SN_Max, OP_MAXPD, SIMD_EMIT_BINARY },
199 { SN_Min, OP_MINPD, SIMD_EMIT_BINARY },
200 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
201 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
202 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
203 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
204 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
205 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
206 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
207 { SN_op_Addition, OP_ADDPD, SIMD_EMIT_BINARY },
208 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_EMIT_BINARY },
209 { SN_op_BitwiseOr, OP_ORPD, SIMD_EMIT_BINARY },
210 { SN_op_Division, OP_DIVPD, SIMD_EMIT_BINARY },
211 { SN_op_ExclusiveOr, OP_XORPD, SIMD_EMIT_BINARY },
212 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
213 { SN_op_Multiply, OP_MULPD, SIMD_EMIT_BINARY },
214 { SN_op_Subtraction, OP_SUBPD, SIMD_EMIT_BINARY },
215 { SN_set_X, 0, SIMD_EMIT_SETTER },
216 { SN_set_Y, 1, SIMD_EMIT_SETTER },
219 static const SimdIntrinsc vector2ul_intrinsics[] = {
220 { SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
221 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
222 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
223 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
224 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
225 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
226 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
227 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
228 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
229 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
230 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
231 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
232 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
233 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
234 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
235 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
236 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
237 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
238 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
239 { SN_op_RightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
240 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
241 { SN_set_X, 0, SIMD_EMIT_SETTER },
242 { SN_set_Y, 1, SIMD_EMIT_SETTER },
245 static const SimdIntrinsc vector2l_intrinsics[] = {
246 { SN_ctor, OP_EXPAND_I8, SIMD_EMIT_CTOR },
247 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
248 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE42 },
249 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
250 { SN_LogicalRightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
251 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
252 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
253 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
254 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
255 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
256 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
257 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
258 { SN_get_X, 0, SIMD_EMIT_GETTER_QWORD },
259 { SN_get_Y, 1, SIMD_EMIT_GETTER_QWORD },
260 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
261 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
262 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
263 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
264 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
265 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
266 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
267 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
268 { SN_set_X, 0, SIMD_EMIT_SETTER },
269 { SN_set_Y, 1, SIMD_EMIT_SETTER },
272 static const SimdIntrinsc vector4ui_intrinsics[] = {
273 { SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
274 { SN_ArithmeticRightShift, OP_PSARD, SIMD_EMIT_SHIFT },
275 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
276 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
277 { SN_Max, OP_PMAXD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
278 { SN_Min, OP_PMIND_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
279 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
280 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
281 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
282 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
283 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
284 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
285 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
286 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
287 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
288 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
289 { SN_get_W, 3, SIMD_EMIT_GETTER },
290 { SN_get_X, 0, SIMD_EMIT_GETTER },
291 { SN_get_Y, 1, SIMD_EMIT_GETTER },
292 { SN_get_Z, 2, SIMD_EMIT_GETTER },
293 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
294 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
295 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
296 { SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
297 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
298 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
299 { SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
300 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
301 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
302 { SN_op_RightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
303 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
304 { SN_set_W, 3, SIMD_EMIT_SETTER },
305 { SN_set_X, 0, SIMD_EMIT_SETTER },
306 { SN_set_Y, 1, SIMD_EMIT_SETTER },
307 { SN_set_Z, 2, SIMD_EMIT_SETTER },
310 static const SimdIntrinsc vector4i_intrinsics[] = {
311 { SN_ctor, OP_EXPAND_I4, SIMD_EMIT_CTOR },
312 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
313 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_EMIT_BINARY },
314 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
315 { SN_LogicalRightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
316 { SN_Max, OP_PMAXD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
317 { SN_Min, OP_PMIND, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
318 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
319 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
320 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
321 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
322 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
323 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
324 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
325 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
326 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
327 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
328 { SN_get_W, 3, SIMD_EMIT_GETTER },
329 { SN_get_X, 0, SIMD_EMIT_GETTER },
330 { SN_get_Y, 1, SIMD_EMIT_GETTER },
331 { SN_get_Z, 2, SIMD_EMIT_GETTER },
332 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
333 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
334 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
335 { SN_op_Equality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
336 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
337 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
338 { SN_op_Inequality, OP_PCMPEQD, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
339 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
340 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
341 { SN_op_RightShift, OP_PSARD, SIMD_EMIT_SHIFT },
342 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
343 { SN_set_W, 3, SIMD_EMIT_SETTER },
344 { SN_set_X, 0, SIMD_EMIT_SETTER },
345 { SN_set_Y, 1, SIMD_EMIT_SETTER },
346 { SN_set_Z, 2, SIMD_EMIT_SETTER },
349 static const SimdIntrinsc vector8us_intrinsics[] = {
350 { SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
351 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
352 { SN_ArithmeticRightShift, OP_PSARW, SIMD_EMIT_SHIFT },
353 { SN_Average, OP_PAVGW_UN, SIMD_EMIT_BINARY },
354 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
355 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
356 { SN_Max, OP_PMAXW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
357 { SN_Min, OP_PMINW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
358 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_EMIT_BINARY },
359 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
360 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
361 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
362 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
363 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
364 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
365 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
366 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
367 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
368 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
369 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
370 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
371 { SN_get_V0, 0, SIMD_EMIT_GETTER },
372 { SN_get_V1, 1, SIMD_EMIT_GETTER },
373 { SN_get_V2, 2, SIMD_EMIT_GETTER },
374 { SN_get_V3, 3, SIMD_EMIT_GETTER },
375 { SN_get_V4, 4, SIMD_EMIT_GETTER },
376 { SN_get_V5, 5, SIMD_EMIT_GETTER },
377 { SN_get_V6, 6, SIMD_EMIT_GETTER },
378 { SN_get_V7, 7, SIMD_EMIT_GETTER },
379 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
380 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
381 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
382 { SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
383 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
384 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
385 { SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
386 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
387 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
388 { SN_op_RightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
389 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
390 { SN_set_V0, 0, SIMD_EMIT_SETTER },
391 { SN_set_V1, 1, SIMD_EMIT_SETTER },
392 { SN_set_V2, 2, SIMD_EMIT_SETTER },
393 { SN_set_V3, 3, SIMD_EMIT_SETTER },
394 { SN_set_V4, 4, SIMD_EMIT_SETTER },
395 { SN_set_V5, 5, SIMD_EMIT_SETTER },
396 { SN_set_V6, 6, SIMD_EMIT_SETTER },
397 { SN_set_V7, 7, SIMD_EMIT_SETTER },
400 static const SimdIntrinsc vector8s_intrinsics[] = {
401 { SN_ctor, OP_EXPAND_I2, SIMD_EMIT_CTOR },
402 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_EMIT_BINARY },
403 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
404 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_EMIT_BINARY },
405 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
406 { SN_LogicalRightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
407 { SN_Max, OP_PMAXW, SIMD_EMIT_BINARY },
408 { SN_Min, OP_PMINW, SIMD_EMIT_BINARY },
409 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_EMIT_BINARY },
410 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
411 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
412 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
413 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
414 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
415 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
416 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
417 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
418 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
419 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
420 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
421 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
422 { SN_get_V0, 0, SIMD_EMIT_GETTER },
423 { SN_get_V1, 1, SIMD_EMIT_GETTER },
424 { SN_get_V2, 2, SIMD_EMIT_GETTER },
425 { SN_get_V3, 3, SIMD_EMIT_GETTER },
426 { SN_get_V4, 4, SIMD_EMIT_GETTER },
427 { SN_get_V5, 5, SIMD_EMIT_GETTER },
428 { SN_get_V6, 6, SIMD_EMIT_GETTER },
429 { SN_get_V7, 7, SIMD_EMIT_GETTER },
430 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
431 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
432 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
433 { SN_op_Equality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
434 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
435 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
436 { SN_op_Inequality, OP_PCMPEQW, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
437 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
438 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
439 { SN_op_RightShift, OP_PSARW, SIMD_EMIT_SHIFT },
440 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
441 { SN_set_V0, 0, SIMD_EMIT_SETTER },
442 { SN_set_V1, 1, SIMD_EMIT_SETTER },
443 { SN_set_V2, 2, SIMD_EMIT_SETTER },
444 { SN_set_V3, 3, SIMD_EMIT_SETTER },
445 { SN_set_V4, 4, SIMD_EMIT_SETTER },
446 { SN_set_V5, 5, SIMD_EMIT_SETTER },
447 { SN_set_V6, 6, SIMD_EMIT_SETTER },
448 { SN_set_V7, 7, SIMD_EMIT_SETTER },
451 static const SimdIntrinsc vector16b_intrinsics[] = {
452 { SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
453 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
454 { SN_Average, OP_PAVGB_UN, SIMD_EMIT_BINARY },
455 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
456 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
457 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
458 { SN_Max, OP_PMAXB_UN, SIMD_EMIT_BINARY },
459 { SN_Min, OP_PMINB_UN, SIMD_EMIT_BINARY },
460 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
461 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
462 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
463 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
464 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
465 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
466 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_EMIT_BINARY },
467 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
468 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
469 { SN_get_V0, 0, SIMD_EMIT_GETTER },
470 { SN_get_V1, 1, SIMD_EMIT_GETTER },
471 { SN_get_V10, 10, SIMD_EMIT_GETTER },
472 { SN_get_V11, 11, SIMD_EMIT_GETTER },
473 { SN_get_V12, 12, SIMD_EMIT_GETTER },
474 { SN_get_V13, 13, SIMD_EMIT_GETTER },
475 { SN_get_V14, 14, SIMD_EMIT_GETTER },
476 { SN_get_V15, 15, SIMD_EMIT_GETTER },
477 { SN_get_V2, 2, SIMD_EMIT_GETTER },
478 { SN_get_V3, 3, SIMD_EMIT_GETTER },
479 { SN_get_V4, 4, SIMD_EMIT_GETTER },
480 { SN_get_V5, 5, SIMD_EMIT_GETTER },
481 { SN_get_V6, 6, SIMD_EMIT_GETTER },
482 { SN_get_V7, 7, SIMD_EMIT_GETTER },
483 { SN_get_V8, 8, SIMD_EMIT_GETTER },
484 { SN_get_V9, 9, SIMD_EMIT_GETTER },
485 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
486 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
487 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
488 { SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
489 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
490 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
491 { SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
492 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
493 { SN_set_V0, 0, SIMD_EMIT_SETTER },
494 { SN_set_V1, 1, SIMD_EMIT_SETTER },
495 { SN_set_V10, 10, SIMD_EMIT_SETTER },
496 { SN_set_V11, 11, SIMD_EMIT_SETTER },
497 { SN_set_V12, 12, SIMD_EMIT_SETTER },
498 { SN_set_V13, 13, SIMD_EMIT_SETTER },
499 { SN_set_V14, 14, SIMD_EMIT_SETTER },
500 { SN_set_V15, 15, SIMD_EMIT_SETTER },
501 { SN_set_V2, 2, SIMD_EMIT_SETTER },
502 { SN_set_V3, 3, SIMD_EMIT_SETTER },
503 { SN_set_V4, 4, SIMD_EMIT_SETTER },
504 { SN_set_V5, 5, SIMD_EMIT_SETTER },
505 { SN_set_V6, 6, SIMD_EMIT_SETTER },
506 { SN_set_V7, 7, SIMD_EMIT_SETTER },
507 { SN_set_V8, 8, SIMD_EMIT_SETTER },
508 { SN_set_V9, 9, SIMD_EMIT_SETTER },
515 static const SimdIntrinsc vector16sb_intrinsics[] = {
516 { SN_ctor, OP_EXPAND_I1, SIMD_EMIT_CTOR },
517 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_EMIT_BINARY },
518 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
519 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_EMIT_BINARY },
520 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
521 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
522 { SN_Max, OP_PMAXB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
523 { SN_Min, OP_PMINB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
524 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
525 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
526 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
527 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
528 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
529 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_EMIT_BINARY },
530 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
531 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
532 { SN_get_V0, 0, SIMD_EMIT_GETTER },
533 { SN_get_V1, 1, SIMD_EMIT_GETTER },
534 { SN_get_V10, 10, SIMD_EMIT_GETTER },
535 { SN_get_V11, 11, SIMD_EMIT_GETTER },
536 { SN_get_V12, 12, SIMD_EMIT_GETTER },
537 { SN_get_V13, 13, SIMD_EMIT_GETTER },
538 { SN_get_V14, 14, SIMD_EMIT_GETTER },
539 { SN_get_V15, 15, SIMD_EMIT_GETTER },
540 { SN_get_V2, 2, SIMD_EMIT_GETTER },
541 { SN_get_V3, 3, SIMD_EMIT_GETTER },
542 { SN_get_V4, 4, SIMD_EMIT_GETTER },
543 { SN_get_V5, 5, SIMD_EMIT_GETTER },
544 { SN_get_V6, 6, SIMD_EMIT_GETTER },
545 { SN_get_V7, 7, SIMD_EMIT_GETTER },
546 { SN_get_V8, 8, SIMD_EMIT_GETTER },
547 { SN_get_V9, 9, SIMD_EMIT_GETTER },
548 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
549 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
550 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
551 { SN_op_Equality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
552 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
553 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
554 { SN_op_Inequality, OP_PCMPEQB, SIMD_EMIT_EQUALITY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
555 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
556 { SN_set_V0, 0, SIMD_EMIT_SETTER },
557 { SN_set_V1, 1, SIMD_EMIT_SETTER },
558 { SN_set_V10, 10, SIMD_EMIT_SETTER },
559 { SN_set_V11, 11, SIMD_EMIT_SETTER },
560 { SN_set_V12, 12, SIMD_EMIT_SETTER },
561 { SN_set_V13, 13, SIMD_EMIT_SETTER },
562 { SN_set_V14, 14, SIMD_EMIT_SETTER },
563 { SN_set_V15, 15, SIMD_EMIT_SETTER },
564 { SN_set_V2, 2, SIMD_EMIT_SETTER },
565 { SN_set_V3, 3, SIMD_EMIT_SETTER },
566 { SN_set_V4, 4, SIMD_EMIT_SETTER },
567 { SN_set_V5, 5, SIMD_EMIT_SETTER },
568 { SN_set_V6, 6, SIMD_EMIT_SETTER },
569 { SN_set_V7, 7, SIMD_EMIT_SETTER },
570 { SN_set_V8, 8, SIMD_EMIT_SETTER },
571 { SN_set_V9, 9, SIMD_EMIT_SETTER },
574 static guint32 simd_supported_versions;
576 /*TODO match using number of parameters as well*/
578 simd_intrinsic_compare_by_name (const void *key, const void *value)
580 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
/*
 * Per-vreg flags used by mono_simd_simplify_indirection to track how a SIMD
 * variable is used in the first basic block (bb0) and across later blocks.
 * NOTE(review): the `enum {` opener and VREG_USED value were missing from the
 * garbled dump; 0x01 is the only free low bit and VREG_USED is assigned as a
 * plain value later in this file — confirm against upstream.
 */
enum {
	VREG_USED = 0x01,
	VREG_HAS_XZERO_BB0 = 0x02,
	VREG_HAS_OTHER_OP_BB0 = 0x04,
	VREG_SINGLE_BB_USE = 0x08,
	VREG_MANY_BB_USE = 0x10,
};
592 mono_simd_intrinsics_init (void)
594 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
595 /*TODO log the supported flags*/
598 static inline gboolean
599 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
601 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
602 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
603 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
604 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
610 static inline gboolean
611 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
613 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
616 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
617 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
618 vreg_flags [reg] |= VREG_MANY_BB_USE;
619 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
621 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
622 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
623 target_bb [reg] = bb;
624 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
631 This pass recalculates which vars need MONO_INST_INDIRECT.
633 We cannot do this for non SIMD vars since code like mono_get_vtable_var
634 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
637 mono_simd_simplify_indirection (MonoCompile *cfg)
640 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
644 for (i = 0; i < cfg->num_varinfo; i++) {
645 MonoInst *var = cfg->varinfo [i];
646 if (var->klass->simd_type) {
647 var->flags &= ~MONO_INST_INDIRECT;
648 max_vreg = MAX (var->dreg, max_vreg);
652 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
653 if (!first_bb && bb->code)
655 for (ins = bb->code; ins; ins = ins->next) {
656 if (ins->opcode == OP_LDADDR) {
657 MonoInst *var = (MonoInst*)ins->inst_p0;
658 if (var->klass->simd_type) {
659 var->flags |= MONO_INST_INDIRECT;
665 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
666 vreg_flags = g_malloc0 (max_vreg + 1);
667 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
669 for (i = 0; i < cfg->num_varinfo; i++) {
670 MonoInst *var = cfg->varinfo [i];
671 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
672 vreg_flags [var->dreg] = VREG_USED;
673 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
677 /*Scan the first basic block looking for xzeros that are not used*/
678 for (ins = first_bb->code; ins; ins = ins->next) {
679 if (ins->opcode == OP_XZERO) {
680 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
681 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
682 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
686 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
689 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
691 if (apply_vreg_first_block_interference (cfg, ins, ins->sreg1, max_vreg, vreg_flags))
693 if (apply_vreg_first_block_interference (cfg, ins, ins->sreg2, max_vreg, vreg_flags))
697 if (IS_DEBUG_ON (cfg)) {
698 for (i = 0; i < cfg->num_varinfo; i++) {
699 MonoInst *var = cfg->varinfo [i];
700 if (var->klass->simd_type) {
701 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
702 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
703 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
704 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
709 /*TODO stop here if no var is xzero only*/
712 Scan all other bb and check if it has only one other use
713 Ideally this would be done after an extended bb formation pass
715 FIXME This pass could use dominator information to properly
716 place the XZERO on the bb that dominates all uses of the var,
717 but this will have zero effect with the current local reg alloc
719 TODO simply the use of flags.
722 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
723 for (ins = bb->code; ins; ins = ins->next) {
725 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
727 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
729 if (apply_vreg_following_block_interference (cfg, ins, ins->sreg1, bb, max_vreg, vreg_flags, target_bb))
731 if (apply_vreg_following_block_interference (cfg, ins, ins->sreg2, bb, max_vreg, vreg_flags, target_bb))
736 for (i = 0; i < cfg->num_varinfo; i++) {
737 MonoInst *var = cfg->varinfo [i];
738 if (!var->klass->simd_type)
740 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
741 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
742 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
743 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
745 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
747 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
748 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
749 if (ins->dreg == var->dreg && ins->sreg1 != var->dreg && ins->sreg2 != var->dreg) {
751 } else if (ins->sreg1 == var->dreg || ins->sreg2 == var->dreg) {
753 MONO_INST_NEW (cfg, tmp, OP_XZERO);
754 tmp->dreg = var->dreg;
755 tmp->type = STACK_VTYPE;
756 tmp->klass = var->klass;
757 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
763 for (ins = first_bb->code; ins; ins = ins->next) {
764 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
773 * This function expects that src is a value.
/*
 * get_simd_vreg:
 *
 * Infers the SIMD vreg holding the value of SRC. Handles an OP_XMOVE
 * source and a STACK_VTYPE source (return lines are outside this excerpt);
 * anything else is a hard failure.
 */
776 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
778 if (src->opcode == OP_XMOVE) {
780 } else if (src->type == STACK_VTYPE) {
/* No usable SIMD source: dump the offending instruction and abort. */
783 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
784 mono_print_ins (src);
785 g_assert_not_reached ();
789 * This function will load the value if needed.
/*
 * load_simd_vreg:
 *
 * Like get_simd_vreg, but when SRC is a pointer (STACK_PTR/STACK_MP) it
 * emits an OP_LOADX_MEMBASE to load the vector into a fresh vreg first.
 * An OP_LDADDR source is resolved to the underlying variable's dreg.
 */
792 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
794 if (src->opcode == OP_XMOVE) {
796 } else if (src->opcode == OP_LDADDR) {
797 int res = ((MonoInst*)src->inst_p0)->dreg;
800 } else if (src->type == STACK_VTYPE) {
802 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* Pointer source: load the 16-byte value from [src->dreg]. */
805 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
806 ins->klass = cmethod->klass;
807 ins->sreg1 = src->dreg;
808 ins->type = STACK_VTYPE;
809 ins->dreg = alloc_ireg (cfg);
810 MONO_ADD_INS (cfg->cbb, ins);
/* No usable SIMD source: dump the offending instruction and abort. */
813 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
814 mono_print_ins (src);
815 g_assert_not_reached ();
/*
 * get_int_to_float_spill_area:
 *
 * Returns (lazily creating) a 32-bit stack local used to shuttle values
 * between integer/XMM registers and the FP stack.
 */
819 get_int_to_float_spill_area (MonoCompile *cfg)
821 if (!cfg->iconv_raw_var) {
822 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
/* VOLATILE keeps it out of the register allocator so it stays addressable. */
823 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
825 return cfg->iconv_raw_var;
828 /*We share the var with fconv_to_r8_x to save some stack space.*/
/*
 * get_double_spill_area:
 *
 * Returns (lazily creating) a double-sized stack local shared with
 * fconv_to_r8_x (see comment above) to save stack space.
 */
830 get_double_spill_area (MonoCompile *cfg)
832 if (!cfg->fconv_to_r8_x_var) {
833 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
/* VOLATILE keeps it out of the register allocator so it stays addressable. */
834 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
836 return cfg->fconv_to_r8_x_var;
/*
 * get_simd_ctor_spill_area:
 *
 * Returns (lazily creating) a vector-typed stack local used by the slow
 * .ctor path to assemble a vector from its scalar arguments in memory.
 * NOTE(review): the var is created once with the first AVECTOR_KLASS seen
 * and reused for later calls regardless of klass — presumably all SIMD
 * types share the same 16-byte layout; confirm before relying on it.
 */
839 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
841 if (!cfg->simd_ctor_var) {
842 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
/* VOLATILE keeps it out of the register allocator so it stays addressable. */
843 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
845 return cfg->simd_ctor_var;
/*
 * simd_intrinsic_emit_binary:
 *
 * Emits the two-operand SIMD instruction INTRINSIC->opcode over
 * args [0] and args [1], producing the result in a new vreg.
 */
849 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
852 int left_vreg, right_vreg;
854 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
855 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
858 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
859 ins->klass = cmethod->klass;
860 ins->sreg1 = left_vreg;
861 ins->sreg2 = right_vreg;
862 ins->type = STACK_VTYPE;
/* NOTE(review): klass was already set a few lines above; this duplicate
 * assignment is harmless but redundant. */
863 ins->klass = cmethod->klass;
864 ins->dreg = alloc_ireg (cfg);
/* inst_c0 carries per-intrinsic flags through to the code generator. */
865 ins->inst_c0 = intrinsic->flags;
866 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_unary:
 *
 * Emits the one-operand SIMD instruction INTRINSIC->opcode over args [0],
 * producing the result in a new vreg. (The sreg1 assignment is on a line
 * not included in this excerpt.)
 */
871 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
876 vreg = get_simd_vreg (cfg, cmethod, args [0]);
878 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
879 ins->klass = cmethod->klass;
881 ins->type = STACK_VTYPE;
882 ins->dreg = alloc_ireg (cfg);
883 MONO_ADD_INS (cfg->cbb, ins);
/*
 * mono_type_to_extract_op:
 *
 * Maps an element type to the OP_EXTRACT_* opcode that pulls one element
 * of that width/signedness out of an XMM register. (The case labels are
 * on lines not included in this excerpt.) Unsupported types abort.
 */
888 mono_type_to_extract_op (MonoType *type)
890 switch (type->type) {
892 return OP_EXTRACT_I1;
894 return OP_EXTRACT_U1;
896 return OP_EXTRACT_I2;
898 return OP_EXTRACT_U2;
902 return OP_EXTRACT_I4;
904 g_assert_not_reached ();
907 /*Returns the amount to shift the element index to get the dword it belongs to*/
909 mono_type_elements_shift_bits (MonoType *type)
911 switch (type->type) {
/* All case labels/returns fall outside this excerpt; unsupported types abort. */
923 g_assert_not_reached ();
/*
 * mono_type_to_slow_insert_op:
 *
 * Maps an element type to the OP_INSERTX_*_SLOW opcode that stores a
 * scalar of that type into one lane of an XMM register. (Case labels are
 * on lines not included in this excerpt.) Unsupported types abort.
 */
927 mono_type_to_slow_insert_op (MonoType *type)
929 switch (type->type) {
932 return OP_INSERTX_U1_SLOW;
938 return OP_INSERTX_I4_SLOW;
941 return OP_INSERTX_I8_SLOW;
943 return OP_INSERTX_R4_SLOW;
945 return OP_INSERTX_R8_SLOW;
947 g_assert_not_reached ();
/*
 * simd_intrinsic_emit_setter:
 *
 * Emits code for an element setter (set_X etc.). For 2/4/8-byte elements
 * a single slow insert op suffices; otherwise (presumably the 1-byte
 * case) the containing 16-bit word is extracted with OP_EXTRACTX_U2 and
 * the byte re-inserted with OP_INSERTX_U1_SLOW.
 */
951 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
954 MonoMethodSignature *sig = mono_method_signature (cmethod);
956 size = mono_type_size (sig->params [0], &align);
958 if (size == 2 || size == 4 || size == 8) {
/* Fast path: one insert op chosen by the element type. */
959 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
960 ins->klass = cmethod->klass;
961 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
962 ins->dreg = ins->sreg1 = load_simd_vreg (cfg, cmethod, args [0]);
963 ins->sreg2 = args [1]->dreg;
/* inst_c0 is the lane index encoded in the intrinsic table. */
964 ins->inst_c0 = intrinsic->opcode;
/* Float/double inserts bounce through a stack spill slot. */
965 if (sig->params [0]->type == MONO_TYPE_R4)
966 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
967 else if (sig->params [0]->type == MONO_TYPE_R8)
968 ins->backend.spill_var = get_double_spill_area (cfg);
969 MONO_ADD_INS (cfg->cbb, ins);
/* Slow path: read the 16-bit word that contains the target element... */
973 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
974 ins->klass = cmethod->klass;
975 ins->sreg1 = sreg = load_simd_vreg (cfg, cmethod, args [0]);
976 ins->type = STACK_I4;
977 ins->dreg = vreg = alloc_ireg (cfg);
/* Word index = element index / 2. */
978 ins->inst_c0 = intrinsic->opcode / 2;
979 MONO_ADD_INS (cfg->cbb, ins);
/* ...then write the new byte back into that lane. */
981 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
982 ins->klass = cmethod->klass;
984 ins->sreg2 = args [1]->dreg;
986 ins->inst_c0 = intrinsic->opcode;
987 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_getter:
 *
 * Emits code for an element getter. The intrinsic table packs both a
 * shuffle selector (upper bits) and an extract selector (lower
 * shift_bits bits) into intrinsic->opcode: when the shuffled part is
 * nonzero, a PSHUFLED first moves the element's dword into position,
 * then the element is extracted. R4 results are converted to the FP
 * stack via OP_ICONV_TO_R8_RAW and a spill slot.
 */
994 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
997 MonoMethodSignature *sig = mono_method_signature (cmethod);
998 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1000 vreg = load_simd_vreg (cfg, cmethod, args [0]);
1002 if (intrinsic->opcode >> shift_bits) {
/* Bring the element's dword into the low lane first. */
1003 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1004 ins->klass = cmethod->klass;
1006 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1007 ins->type = STACK_VTYPE;
1008 ins->dreg = vreg = alloc_ireg (cfg);
1009 MONO_ADD_INS (cfg->cbb, ins);
/* Extract with an opcode matched to the return type's width/signedness. */
1012 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1013 ins->klass = cmethod->klass;
1015 ins->type = STACK_I4;
1016 ins->dreg = vreg = alloc_ireg (cfg);
/* Low bits select the element within the dword. */
1017 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1018 MONO_ADD_INS (cfg->cbb, ins);
1020 if (sig->ret->type == MONO_TYPE_R4) {
/* Reinterpret the raw 32 bits as a float on the FP stack. */
1021 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1022 ins->klass = mono_defaults.single_class;
1024 ins->type = STACK_R8;
1025 ins->dreg = alloc_freg (cfg);
1026 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1027 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_long_getter:
 *
 * Emits code for a 64-bit element getter: OP_EXTRACT_R8 when the method
 * returns a double (via the double spill slot), OP_EXTRACT_I8 otherwise.
 */
1033 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1037 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1039 vreg = load_simd_vreg (cfg, cmethod, args [0]);
1041 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1042 ins->klass = cmethod->klass;
/* inst_c0 selects which 64-bit lane to extract. */
1044 ins->inst_c0 = intrinsic->opcode;
1046 ins->type = STACK_R8;
1047 ins->dreg = alloc_freg (cfg);
1048 ins->backend.spill_var = get_double_spill_area (cfg);
1050 ins->type = STACK_I8;
1051 ins->dreg = alloc_lreg (cfg);
1053 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_ctor:
 *
 * Emits code for a vector constructor. Single-argument ctors use the
 * intrinsic's own opcode (presumably a broadcast/expand op). Multi-arg
 * ctors take the slow path: store each scalar argument into a stack
 * area, then OP_LOADX_MEMBASE the assembled vector back. In both paths
 * an OP_LDADDR destination (a local being initialized) is eliminated
 * and replaced with a direct def of the var's vreg.
 */
1059 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1061 MonoInst *ins = NULL;
1063 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1064 MonoMethodSignature *sig = mono_method_signature (cmethod);
1065 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1066 int arg_size = mono_type_size (sig->params [0], &i);
1068 if (sig->param_count == 1) {
/* Initing a local: write straight into the var's vreg and drop the LDADDR. */
1072 dreg = args [0]->inst_i0->dreg;
1073 NULLIFY_INS (args [0]);
1075 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1076 dreg = alloc_ireg (cfg);
1079 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1080 ins->klass = cmethod->klass;
1081 ins->sreg1 = args [1]->dreg;
1082 ins->type = STACK_VTYPE;
1085 MONO_ADD_INS (cfg->cbb, ins);
/* Float/double scalar sources bounce through a stack spill slot. */
1086 if (sig->params [0]->type == MONO_TYPE_R4)
1087 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1088 else if (sig->params [0]->type == MONO_TYPE_R8)
1089 ins->backend.spill_var = get_double_spill_area (cfg);
/* Not a local: store the result through the destination pointer. */
1092 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1093 ins->dreg = args [0]->dreg;
1095 MONO_ADD_INS (cfg->cbb, ins);
/* Slow multi-arg path: pick a spill area (for locals) or the caller's
 * pointer as the assembly address. */
1101 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1102 MONO_ADD_INS (cfg->cbb, ins);
1103 addr_reg = ins->dreg;
1105 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1106 addr_reg = args [0]->dreg;
/* Store the scalar args back-to-front at their element offsets. */
1109 for (i = sig->param_count - 1; i >= 0; --i) {
1110 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1113 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1114 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1115 NULLIFY_INS (args [0]);
/* Load the assembled vector back into the local's vreg. */
1117 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1118 ins->klass = cmethod->klass;
1119 ins->sreg1 = addr_reg;
1120 ins->type = STACK_VTYPE;
1122 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_cast:
 *
 * Emits a vector reinterpret-cast: an OP_XMOVE retagged with the target
 * klass — no data conversion is performed.
 */
1128 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1133 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1135 //TODO macroize this
1136 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1137 ins->klass = cmethod->klass;
1138 ins->type = STACK_VTYPE;
1140 ins->dreg = alloc_ireg (cfg);
1141 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_shift:
 *
 * Emits a packed shift. A constant (OP_ICONST) shift count uses the
 * immediate form of the opcode; a variable count is first moved into an
 * XMM register with OP_ICONV_TO_X and the register variant (+1 from the
 * immediate opcode, per the comment below) is used instead.
 */
1146 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1149 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1151 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1153 if (args [1]->opcode != OP_ICONST) {
/* Variable count: materialize it in an XMM register. */
1154 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1155 ins->klass = mono_defaults.int32_class;
1156 ins->sreg1 = args [1]->dreg;
1157 ins->type = STACK_I4;
1158 ins->dreg = vreg2 = alloc_ireg (cfg);
1159 MONO_ADD_INS (cfg->cbb, ins);
1161 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1164 MONO_INST_NEW (cfg, ins, opcode);
1165 ins->klass = cmethod->klass;
1169 if (args [1]->opcode == OP_ICONST) {
/* Constant count: fold it into the instruction and kill the ICONST. */
1170 ins->inst_imm = args [1]->inst_c0;
1171 NULLIFY_INS (args [1]);
1174 ins->type = STACK_VTYPE;
1175 ins->dreg = alloc_ireg (cfg);
1176 MONO_ADD_INS (cfg->cbb, ins);
/* Returns TRUE if OP is one of the packed integer equality-compare opcodes
 * (relies on OP_PCMPEQB..OP_PCMPEQQ being contiguous in the opcode enum). */
static inline gboolean
1181 mono_op_is_packed_compare (int op)
1183 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/*
 * simd_intrinsic_emit_equality:
 *
 * Emits vector equality/inequality returning a scalar boolean: packed
 * compare, then OP_EXTRACT_MASK (SSE PMOVMSKB-style) to collapse the
 * per-lane results into a bitmask, then a scalar compare of the mask.
 */
1187 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1190 int left_vreg, right_vreg, tmp_vreg;
1192 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1193 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1196 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1197 ins->klass = cmethod->klass;
1198 ins->sreg1 = left_vreg;
1199 ins->sreg2 = right_vreg;
1200 ins->type = STACK_VTYPE;
/* NOTE(review): klass was already set above; duplicate assignment. */
1201 ins->klass = cmethod->klass;
1202 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1203 ins->inst_c0 = intrinsic->flags;
1204 MONO_ADD_INS (cfg->cbb, ins);
1206 /*FIXME the next ops are SSE specific*/
1207 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1208 ins->klass = cmethod->klass;
1209 ins->sreg1 = tmp_vreg;
1210 ins->type = STACK_I4;
1211 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1212 MONO_ADD_INS (cfg->cbb, ins);
1214 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1215 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
/* All-lanes-equal => mask == 0xFFFF; CEQ for ==, CLT_UN for != . */
1216 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1217 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
/* NEQ-style FP compare: any set lane (mask != 0) means "not equal". */
1219 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1220 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1222 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_shuffle:
 *
 * Emits a shuffle with a compile-time-constant control mask. Only literal
 * (OP_ICONST) masks are supported; anything else aborts.
 */
1228 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1233 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
1235 if (args [1]->opcode != OP_ICONST) {
1236 g_warning ("Shuffle with non literals is not yet supported");
1237 g_assert_not_reached ();
1239 vreg = get_simd_vreg (cfg, cmethod, args [0]);
/* The mask is folded into inst_c0 below, so the ICONST itself is dead. */
1240 NULLIFY_INS (args [1]);
1242 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1243 ins->klass = cmethod->klass;
1245 ins->inst_c0 = args [1]->inst_c0;
1246 ins->type = STACK_VTYPE;
1247 ins->dreg = alloc_ireg (cfg);
1248 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_load_aligned:
 *
 * Emits an aligned 16-byte vector load from the pointer in args [0].
 */
1253 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1257 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1258 ins->klass = cmethod->klass;
1259 ins->sreg1 = args [0]->dreg;
1260 ins->type = STACK_VTYPE;
1261 ins->dreg = alloc_ireg (cfg);
1262 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_store:
 *
 * Emits a vector store: the value from args [1] is written through the
 * pointer in args [0] using the intrinsic's store opcode.
 */
1267 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1272 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1274 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1275 ins->klass = cmethod->klass;
/* For store-membase ops, dreg holds the base address register. */
1276 ins->dreg = args [0]->dreg;
1278 ins->type = STACK_VTYPE;
1279 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_extract_mask:
 *
 * Emits OP_EXTRACT_MASK: collapses the sign bits of the vector in
 * args [0] into a scalar integer bitmask.
 */
1284 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1289 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1291 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1292 ins->klass = cmethod->klass;
1294 ins->type = STACK_I4;
1295 ins->dreg = alloc_ireg (cfg);
1296 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_intrinsic_emit_prefetch:
 *
 * Emits a memory prefetch hint for the address in args [0]; the hint
 * kind (temporal locality level) travels in intrinsic->flags.
 */
1302 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1306 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1307 ins->klass = cmethod->klass;
1308 ins->sreg1 = args [0]->dreg;
1309 ins->backend.arg_info = intrinsic->flags;
1310 MONO_ADD_INS (cfg->cbb, ins);
/*
 * simd_version_name:
 *
 * Maps a SIMD_VERSION_* enum value to its human-readable name, used in
 * debug output. (The return lines are not included in this excerpt.)
 */
1315 simd_version_name (guint32 version)
1318 case SIMD_VERSION_SSE1:
1320 case SIMD_VERSION_SSE2:
1322 case SIMD_VERSION_SSE3:
1324 case SIMD_VERSION_SSSE3:
1326 case SIMD_VERSION_SSE41:
1328 case SIMD_VERSION_SSE42:
1330 case SIMD_VERSION_SSE4a:
/*
 * emit_intrinsics:
 *
 * Looks up CMETHOD by name in the (sorted) INTRINSICS table via bsearch,
 * checks that the CPU supports the intrinsic's required SIMD version, and
 * dispatches to the per-shape emitter. Returns NULL (on lines outside
 * this excerpt) when no intrinsic applies.
 */
1337 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1339 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1341 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1344 if (IS_DEBUG_ON (cfg)) {
1346 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1347 max = fsig->param_count + fsig->hasthis;
1348 for (i = 0; i < max; ++i) {
1349 printf ("param %d: ", i);
1350 mono_print_ins (args [i]);
/* Bail out when the required instruction-set bit is not in the CPU's mask. */
1353 if (result->simd_version && !((1 << result->simd_version) & simd_supported_versions)) {
1354 if (IS_DEBUG_ON (cfg))
/* NOTE(review): "unsuported" is misspelled in this debug message; left
 * untouched here since changing a runtime string is a behavior change. */
1355 printf ("function %s::%s/%d requires unsuported SIMD instruction set %s \n", cmethod->klass->name, cmethod->name, fsig->param_count, simd_version_name (result->simd_version));
/* Dispatch on the emit shape recorded in the intrinsic table. */
1359 switch (result->simd_emit_mode) {
1360 case SIMD_EMIT_BINARY:
1361 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1362 case SIMD_EMIT_UNARY:
1363 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1364 case SIMD_EMIT_SETTER:
1365 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1366 case SIMD_EMIT_GETTER:
1367 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1368 case SIMD_EMIT_GETTER_QWORD:
1369 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1370 case SIMD_EMIT_CTOR:
1371 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1372 case SIMD_EMIT_CAST:
1373 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1374 case SIMD_EMIT_SHUFFLE:
1375 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1376 case SIMD_EMIT_SHIFT:
1377 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1378 case SIMD_EMIT_EQUALITY:
1379 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1380 case SIMD_EMIT_LOAD_ALIGNED:
1381 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1382 case SIMD_EMIT_STORE:
1383 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1384 case SIMD_EMIT_EXTRACT_MASK:
1385 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1386 case SIMD_EMIT_PREFETCH:
1387 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1389 g_assert_not_reached ();
/*
 * mono_emit_vector_ldelema:
 *
 * Emits the address computation for &arr [index] on a vector element
 * array access. When CHECK_BOUNDS is set, both the first element and the
 * last element of the 16-byte vector (index + 16/size - 1) are bounds
 * checked. Returns the vreg holding the computed address (the return
 * line is outside this excerpt).
 */
1393 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1397 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1399 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1400 mult_reg = alloc_preg (cfg);
1401 array_reg = arr->dreg;
1402 index_reg = index->dreg;
1404 #if SIZEOF_VOID_P == 8
1405 /* The array reg is 64 bits but the index reg is only 32 */
1406 index2_reg = alloc_preg (cfg);
1407 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1409 index2_reg = index_reg;
1411 index3_reg = alloc_preg (cfg);
/* Check both ends of the 16-byte span against the array length. */
1414 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1415 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1416 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
/* addr = array + index * size + offsetof (MonoArray, vector) */
1419 add_reg = alloc_preg (cfg);
1421 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1422 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1423 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
1424 ins->type = STACK_PTR;
1425 MONO_ADD_INS (cfg->cbb, ins);
/*
 * emit_array_extension_intrinsics:
 *
 * Handles the Mono.Simd.ArrayExtensions intrinsics: GetVector[Aligned]
 * (bounds-checked vector load from an array), SetVector[Aligned]
 * (bounds-checked vector store), and IsAligned (tests the element
 * address against a 16-byte boundary).
 */
1431 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1433 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1435 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
/* Aligned variant uses the aligned load opcode. */
1437 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1438 load->klass = cmethod->klass;
1440 load->type = STACK_VTYPE;
1441 load->dreg = alloc_ireg (cfg);
1442 MONO_ADD_INS (cfg->cbb, load);
1446 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1448 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1449 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1451 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1452 store->klass = cmethod->klass;
1454 store->sreg1 = vreg;
1455 MONO_ADD_INS (cfg->cbb, store);
1459 if (!strcmp ("IsAligned", cmethod->name)) {
/* No bounds check needed — only the address's low 4 bits matter. */
1461 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1463 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1464 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1465 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1466 MONO_ADD_INS (cfg->cbb, ins);
/*
 * emit_simd_runtime_intrinsics:
 *
 * Handles Mono.Simd.SimdRuntime intrinsics: get_AccelMode is folded to
 * an ICONST of the CPU's supported-SIMD-versions bitmask at JIT time.
 */
1474 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1476 if (!strcmp ("get_AccelMode", cmethod->name)) {
1478 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1485 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1487 const char *class_name;
1489 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1492 class_name = cmethod->klass->name;
1493 if (!strcmp ("SimdRuntime", class_name))
1494 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1496 if (!strcmp ("ArrayExtensions", class_name))
1497 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1499 if (!strcmp ("VectorOperations", class_name)) {
1500 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1502 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1503 } else if (!cmethod->klass->simd_type)
1506 cfg->uses_simd_intrinsics = 1;
1507 if (!strcmp ("Vector2d", class_name))
1508 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1509 if (!strcmp ("Vector4f", class_name))
1510 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1511 if (!strcmp ("Vector2ul", class_name))
1512 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1513 if (!strcmp ("Vector2l", class_name))
1514 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1515 if (!strcmp ("Vector4ui", class_name))
1516 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1517 if (!strcmp ("Vector4i", class_name))
1518 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1519 if (!strcmp ("Vector8us", class_name))
1520 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1521 if (!strcmp ("Vector8s", class_name))
1522 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1523 if (!strcmp ("Vector16b", class_name))
1524 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1525 if (!strcmp ("Vector16sb", class_name))
1526 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));