* simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
17 General notes on SIMD intrinsics
19 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding a indirection reduction pass after the dce pass.
38 TODO extend bounds checking code to support for range checking.
40 General notes for SIMD intrinsics.
42 -Bad extractor and constructor performance
43 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
44 It will be loaded in the FP stack just to be pushed on the call stack.
46 A similar thing happens with Vector4f constructor that require float vars to be
48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.
51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and promote further optimizations.
58 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
62 #ifdef MONO_ARCH_SIMD_INTRINSICS
64 //#define IS_DEBUG_ON(cfg) (0)
66 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
67 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
73 SIMD_EMIT_GETTER_QWORD,
79 SIMD_EMIT_LOAD_ALIGNED,
81 SIMD_EMIT_EXTRACT_MASK,
85 #ifdef HAVE_ARRAY_ELEM_INIT
86 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
87 #define MSGSTRFIELD1(line) str##line
88 static const struct msgstr_t {
89 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
90 #include "simd-methods.h"
93 #define SIMD_METHOD(str,name) str,
94 #include "simd-methods.h"
99 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
100 #include "simd-methods.h"
102 #define method_name(idx) ((const char*)&method_names + (idx))
105 #define SIMD_METHOD(str,name) str,
106 static const char * const method_names [] = {
107 #include "simd-methods.h"
111 #define SIMD_METHOD(str,name) name,
113 #include "simd-methods.h"
117 #define method_name(idx) (method_names [(idx)])
124 guint8 simd_version_flags;
125 guint8 simd_emit_mode : 4;
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
132 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
133 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
141 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
142 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
143 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
144 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
145 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
146 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
147 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
148 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
149 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
150 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
151 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
152 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
153 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
154 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
155 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
156 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
157 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
158 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
159 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
160 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
161 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
162 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
163 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
164 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
165 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
166 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
167 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
168 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
169 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
170 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
171 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
172 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
173 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
174 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
175 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
176 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
177 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
180 static const SimdIntrinsc vector2d_intrinsics[] = {
181 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
182 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
183 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
184 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
185 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
186 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
187 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
188 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
189 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
190 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
191 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
192 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
193 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
194 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
195 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
196 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
197 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
198 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
199 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
200 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
201 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
202 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
203 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
204 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
205 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
206 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
207 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
208 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
209 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
210 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
211 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
212 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
213 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
214 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
215 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
217 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
220 static const SimdIntrinsc vector2ul_intrinsics[] = {
221 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
222 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
223 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
224 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
225 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
226 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
227 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
228 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
229 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
230 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
231 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
232 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
233 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
234 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
235 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
236 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
237 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
238 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
239 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
240 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
241 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
242 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
243 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
246 static const SimdIntrinsc vector2l_intrinsics[] = {
247 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
248 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
249 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
250 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
251 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
252 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
253 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
254 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
255 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
256 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
257 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
258 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
259 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
260 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
261 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
262 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
263 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
264 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
265 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
266 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
267 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
268 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
269 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
270 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
273 static const SimdIntrinsc vector4ui_intrinsics[] = {
274 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
275 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
276 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
277 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
278 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
279 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
280 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
281 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
282 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
283 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
284 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
285 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
286 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
287 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
288 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
289 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
290 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
291 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
292 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
293 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
294 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
295 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
296 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
297 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
298 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
299 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
300 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
301 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
302 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
303 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
304 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
305 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
306 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
307 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
308 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
311 static const SimdIntrinsc vector4i_intrinsics[] = {
312 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
313 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
314 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
315 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
316 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
317 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
318 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
319 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
320 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
321 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
322 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
323 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
324 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
325 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
326 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
327 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
328 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
329 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
330 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
331 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
332 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
333 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
334 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
335 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
336 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
337 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
338 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
339 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
340 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
341 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
342 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
343 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
344 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
345 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
346 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
347 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
350 static const SimdIntrinsc vector8us_intrinsics[] = {
351 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
352 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
353 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
354 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
355 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
356 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
357 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
358 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
359 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
360 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
361 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
362 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
363 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
364 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
365 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
366 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
367 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
368 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
369 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
370 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
371 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
372 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
373 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
374 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
375 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
376 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
377 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
378 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
379 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
380 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
381 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
382 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
383 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
384 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
385 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
386 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
387 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
388 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
389 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
390 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
391 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
392 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
393 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
394 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
395 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
396 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
397 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
398 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
401 static const SimdIntrinsc vector8s_intrinsics[] = {
402 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
403 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
404 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
405 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
406 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
407 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
408 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
409 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
410 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
411 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
412 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
413 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
414 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
415 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
416 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
417 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
418 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
419 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
420 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
421 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
422 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
423 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
424 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
425 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
426 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
427 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
428 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
429 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
430 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
431 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
432 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
435 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
436 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
437 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
438 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
439 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
440 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
441 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
442 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
443 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
444 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
445 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
446 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
447 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
448 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
449 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
452 static const SimdIntrinsc vector16b_intrinsics[] = {
453 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
454 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
455 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
456 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
457 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
458 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
459 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
460 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
461 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
462 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
463 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
464 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
465 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
466 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
467 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
468 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
469 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
470 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
471 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
472 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
473 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
474 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
475 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
476 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
477 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
478 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
479 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
480 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
481 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
482 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
483 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
484 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
485 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
486 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
487 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
488 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
489 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
490 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
491 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
492 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
493 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
494 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
495 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
496 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
497 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
498 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
499 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
500 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
501 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
502 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
503 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
504 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
505 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
506 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
507 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
508 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
509 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/* Intrinsic table for Mono.Simd.Vector16sb (16 x signed byte).
 * Entries: {method-name id, opcode-or-immediate, minimum SIMD version, emit mode[, flags]}.
 * NOTE: the table must stay sorted by method name — emit_intrinsics () looks
 * entries up with bsearch () via simd_intrinsic_compare_by_name (). */
516 static const SimdIntrinsc vector16sb_intrinsics[] = {
517 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
518 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
519 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
520 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
521 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
522 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
/* Signed-byte min/max only exist from SSE4.1 on (PMINSB/PMAXSB). */
523 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
524 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
525 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
526 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
527 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
528 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
529 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
530 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
531 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
532 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
/* Element accessors: the second field is the element index, not an opcode. */
533 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
534 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
535 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
536 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
537 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
538 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
539 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
540 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
541 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
542 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
543 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
544 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
545 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
546 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
547 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
548 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
549 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
550 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
551 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
552 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
553 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
554 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
/* Inequality reuses PCMPEQB and inverts the result via the SIMD_COMP_NEQ flag. */
555 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
556 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
557 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
558 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
559 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
560 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
561 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
562 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
563 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
564 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
565 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
566 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
567 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
568 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
569 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
570 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
571 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
572 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
/* Bitmask of SIMD instruction-set versions supported by the current CPU,
 * filled in by mono_simd_intrinsics_init (). */
575 static guint32 simd_supported_versions;
577 /*TODO match using number of parameters as well*/
/* bsearch () comparator: key is the managed method name, value is a
 * SimdIntrinsc table entry. */
579 simd_intrinsic_compare_by_name (const void *key, const void *value)
581 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
/* Per-vreg tracking flags used by the simd-simplify pass below.
 * (The enum head and the VREG_USED = 0x01 member are in elided lines.) */
586 VREG_HAS_XZERO_BB0 = 0x02,
587 VREG_HAS_OTHER_OP_BB0 = 0x04,
588 VREG_SINGLE_BB_USE = 0x08,
589 VREG_MANY_BB_USE = 0x10,
/* One-time initialization: probe the CPU for the SIMD versions it supports. */
593 mono_simd_intrinsics_init (void)
595 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
596 /*TODO log the supported flags*/
/* Record that @reg is touched by a non-XZERO op in the first basic block:
 * clear its "only xzero'ed" flag and mark it as having another op in bb0.
 * Registers outside the tracked range (or never marked used) are ignored. */
599 static inline gboolean
600 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
602 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
603 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
604 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
605 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
/* Track in which basic block(s) @reg is used after bb0.  First sighting marks
 * VREG_SINGLE_BB_USE and remembers the block in @target_bb; a sighting in a
 * different block upgrades the flag to VREG_MANY_BB_USE.  Only regs that were
 * xzero-only in bb0 are considered. */
611 static inline gboolean
612 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
614 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
617 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
618 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
619 vreg_flags [reg] |= VREG_MANY_BB_USE;
620 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
622 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
623 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
624 target_bb [reg] = bb;
625 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
632 This pass recalculates which vars need MONO_INST_INDIRECT.
634 We cannot do this for non-SIMD vars since code like mono_get_vtable_var
635 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
/* Simplify indirection of SIMD locals.
 *
 * Recomputes MONO_INST_INDIRECT for SIMD-typed vars (set only when an
 * OP_LDADDR takes their address), then tries to sink redundant OP_XZERO
 * instructions from the entry block into the single block that actually
 * uses the zeroed vreg — or drop them entirely when the first use
 * overwrites the value.
 *
 * @cfg: the method being compiled.  Mutates cfg's instruction stream. */
638 mono_simd_simplify_indirection (MonoCompile *cfg)
641 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
/* Pass 1: clear INDIRECT on all SIMD vars and find the highest SIMD vreg. */
645 for (i = 0; i < cfg->num_varinfo; i++) {
646 MonoInst *var = cfg->varinfo [i];
647 if (var->klass->simd_type) {
648 var->flags &= ~MONO_INST_INDIRECT;
649 max_vreg = MAX (var->dreg, max_vreg);
/* Pass 2: re-set INDIRECT only for SIMD vars whose address is actually taken. */
653 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
654 if (!first_bb && bb->code)
656 for (ins = bb->code; ins; ins = ins->next) {
657 if (ins->opcode == OP_LDADDR) {
658 MonoInst *var = (MonoInst*)ins->inst_p0;
659 if (var->klass->simd_type) {
660 var->flags |= MONO_INST_INDIRECT;
666 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
667 vreg_flags = g_malloc0 (max_vreg + 1);
668 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
670 for (i = 0; i < cfg->num_varinfo; i++) {
671 MonoInst *var = cfg->varinfo [i];
672 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
673 vreg_flags [var->dreg] = VREG_USED;
674 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
678 /*Scan the first basic block looking for xzeros not used*/
679 for (ins = first_bb->code; ins; ins = ins->next) {
681 int sregs [MONO_MAX_SRC_REGS];
683 if (ins->opcode == OP_XZERO) {
684 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
685 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
686 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
690 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
692 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
694 num_sregs = mono_inst_get_src_registers (ins, sregs);
695 for (i = 0; i < num_sregs; ++i) {
696 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
701 if (IS_DEBUG_ON (cfg)) {
702 for (i = 0; i < cfg->num_varinfo; i++) {
703 MonoInst *var = cfg->varinfo [i];
704 if (var->klass->simd_type) {
705 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
706 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
707 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
708 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
713 /*TODO stop here if no var is xzero only*/
716 Scan all other bb and check if it has only one other use
717 Ideally this would be done after an extended bb formation pass
719 FIXME This pass could use dominator information to properly
720 place the XZERO on the bb that dominates all uses of the var,
721 but this will have zero effect with the current local reg alloc
723 TODO simplify the use of flags.
726 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
727 for (ins = bb->code; ins; ins = ins->next) {
729 int sregs [MONO_MAX_SRC_REGS];
731 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
733 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
735 num_sregs = mono_inst_get_src_registers (ins, sregs);
736 for (i = 0; i < num_sregs; ++i) {
737 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
738 max_vreg, vreg_flags, target_bb))
744 for (i = 0; i < cfg->num_varinfo; i++) {
745 MonoInst *var = cfg->varinfo [i];
746 if (!var->klass->simd_type)
748 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
749 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
750 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
751 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
753 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
755 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
757 int sregs [MONO_MAX_SRC_REGS];
758 gboolean found = FALSE;
760 num_sregs = mono_inst_get_src_registers (ins, sregs);
761 for (j = 0; j < num_sregs; ++j) {
/* FIX: the scan loops over j, but previously tested sregs [i] — i is the
 * outer varinfo index here, so 'found' was computed against the wrong
 * source register. */
762 if (sregs [j] == var->dreg)
765 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
766 if (ins->dreg == var->dreg && !found) {
770 MONO_INST_NEW (cfg, tmp, OP_XZERO);
771 tmp->dreg = var->dreg;
772 tmp->type = STACK_VTYPE;
773 tmp->klass = var->klass;
774 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
/* Finally remove the now-redundant XZEROs from the entry block. */
780 for (ins = first_bb->code; ins; ins = ins->next) {
781 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
790 * This function expects that src is a value (already in a SIMD vreg).
/* Returns the vreg holding the SIMD value in @src, unwrapping an OP_XMOVE
 * or a STACK_VTYPE value.  Anything else is a compiler bug — abort. */
793 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
795 if (src->opcode == OP_XMOVE) {
797 } else if (src->type == STACK_VTYPE) {
800 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
801 mono_print_ins (src);
802 g_assert_not_reached ();
806 * This function will load the value if needed.
/* Like get_simd_vreg () but also handles address-like sources: an OP_LDADDR
 * resolves to the addressed var's dreg (reporting *indirect when non-NULL,
 * per the elided lines), and a pointer source emits an OP_LOADX_MEMBASE to
 * materialize the vector.  Unrecognized sources abort. */
809 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
813 if (src->opcode == OP_XMOVE) {
815 } else if (src->opcode == OP_LDADDR) {
816 int res = ((MonoInst*)src->inst_p0)->dreg;
819 } else if (src->type == STACK_VTYPE) {
821 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
/* Pointer source: load the 16-byte vector from memory into a fresh vreg. */
826 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
827 ins->klass = cmethod->klass;
828 ins->sreg1 = src->dreg;
829 ins->type = STACK_VTYPE;
830 ins->dreg = alloc_ireg (cfg);
831 MONO_ADD_INS (cfg->cbb, ins);
834 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
835 mono_print_ins (src);
836 g_assert_not_reached ();
/* Lazily creates the shared int32 stack slot used to move values between
 * integer and XMM registers (e.g. for OP_ICONV_TO_R8_RAW). */
840 get_int_to_float_spill_area (MonoCompile *cfg)
842 if (!cfg->iconv_raw_var) {
843 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
844 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
846 return cfg->iconv_raw_var;
849 /*We share the var with fconv_to_r8_x to save some stack space.*/
/* Lazily creates the shared double-sized stack slot used for R8 spills. */
851 get_double_spill_area (MonoCompile *cfg)
853 if (!cfg->fconv_to_r8_x_var) {
854 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
855 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
857 return cfg->fconv_to_r8_x_var;
/* Lazily creates a vector-sized local of @avector_klass used as scratch
 * space when a SIMD .ctor has to assemble its value in memory. */
860 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
862 if (!cfg->simd_ctor_var) {
863 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
864 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
866 return cfg->simd_ctor_var;
/* Maps a scalar element type to the OP_EXPAND_* opcode that broadcasts it
 * into all lanes of a vector (case bodies are in elided lines).
 * Aborts on unsupported types. */
870 mono_type_to_expand_op (MonoType *type)
872 switch (type->type) {
890 g_assert_not_reached ();
/* For a binary intrinsic operand at @position (0 or 1): if the declared
 * parameter type is a SIMD type, return its vreg directly; otherwise emit
 * an expand op that broadcasts the scalar into a fresh vector vreg.
 * R4/R8 expansions need a spill slot to transit through memory. */
894 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, int position)
897 MonoMethodSignature *sig = mono_method_signature (cmethod);
900 g_assert (sig->param_count == 2);
901 g_assert (position == 0 || position == 1);
903 if (mono_class_from_mono_type (sig->params [position])->simd_type)
904 return get_simd_vreg (cfg, cmethod, src);
906 expand_op = mono_type_to_expand_op (sig->params [position]);
907 MONO_INST_NEW (cfg, ins, expand_op);
908 ins->klass = cmethod->klass;
909 ins->sreg1 = src->dreg;
910 ins->type = STACK_VTYPE;
911 ins->dreg = alloc_ireg (cfg);
912 MONO_ADD_INS (cfg->cbb, ins);
/* Float/double expansion goes through memory; reserve the spill slot. */
914 if (expand_op == OP_EXPAND_R4)
915 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
916 else if (expand_op == OP_EXPAND_R8)
917 ins->backend.spill_var = get_double_spill_area (cfg);
/* Emits a two-operand SIMD instruction (intrinsic->opcode) over args[0] and
 * args[1]; scalar operands are broadcast to vectors first.  The result goes
 * into a fresh vreg; intrinsic->flags is passed through in inst_c0. */
923 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
926 int left_vreg, right_vreg;
928 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [0], 0);
929 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [1], 1);
932 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
933 ins->klass = cmethod->klass;
934 ins->sreg1 = left_vreg;
935 ins->sreg2 = right_vreg;
936 ins->type = STACK_VTYPE;
937 ins->dreg = alloc_ireg (cfg);
938 ins->inst_c0 = intrinsic->flags;
939 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a one-operand SIMD instruction (intrinsic->opcode) on args[0],
 * writing the result into a fresh vreg. */
944 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
949 vreg = get_simd_vreg (cfg, cmethod, args [0]);
951 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
952 ins->klass = cmethod->klass;
954 ins->type = STACK_VTYPE;
955 ins->dreg = alloc_ireg (cfg);
956 MONO_ADD_INS (cfg->cbb, ins);
/* Maps a scalar element type to the OP_EXTRACT_* opcode that pulls one lane
 * out of a vector (the case labels are in elided lines).  Aborts on
 * unsupported types. */
961 mono_type_to_extract_op (MonoType *type)
963 switch (type->type) {
965 return OP_EXTRACT_I1;
967 return OP_EXTRACT_U1;
969 return OP_EXTRACT_I2;
971 return OP_EXTRACT_U2;
975 return OP_EXTRACT_I4;
977 g_assert_not_reached ();
980 /*Returns the amount to shift the element index to get the dword it belongs to*/
/* Case labels and return values are in elided lines; aborts on unsupported
 * types. */
982 mono_type_elements_shift_bits (MonoType *type)
984 switch (type->type) {
996 g_assert_not_reached ();
/* Maps a scalar element type to the "slow" insert opcode used to write one
 * lane of a vector.  16-bit elements use the direct OP_INSERT_I2 (PINSRW);
 * the other widths go through the *_SLOW variants.  Aborts on unsupported
 * types. */
1000 mono_type_to_slow_insert_op (MonoType *type)
1002 switch (type->type) {
1005 return OP_INSERTX_U1_SLOW;
1008 return OP_INSERT_I2;
1011 return OP_INSERTX_I4_SLOW;
1014 return OP_INSERTX_I8_SLOW;
1016 return OP_INSERTX_R4_SLOW;
1018 return OP_INSERTX_R8_SLOW;
1020 g_assert_not_reached ();
/* Emits code for a Vector set_Vn element setter.  intrinsic->opcode holds
 * the element index.  2/4/8-byte elements use a single insert op; 1-byte
 * elements do an EXTRACTX_U2 / INSERTX_U1_SLOW pair to rewrite half a word.
 * If the vector came from memory (indirect), the result is stored back. */
1024 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1027 MonoMethodSignature *sig = mono_method_signature (cmethod);
1032 size = mono_type_size (sig->params [0], &align);
1034 if (size == 2 || size == 4 || size == 8) {
1035 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1036 ins->klass = cmethod->klass;
1037 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1038 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1039 ins->sreg2 = args [1]->dreg;
1040 ins->inst_c0 = intrinsic->opcode;
/* R4/R8 inserts transit through a memory spill slot. */
1041 if (sig->params [0]->type == MONO_TYPE_R4)
1042 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1043 else if (sig->params [0]->type == MONO_TYPE_R8)
1044 ins->backend.spill_var = get_double_spill_area (cfg);
1045 MONO_ADD_INS (cfg->cbb, ins);
/* Byte element: extract the containing 16-bit word first (word index =
 * byte index / 2), then re-insert the modified byte. */
1049 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1050 ins->klass = cmethod->klass;
1051 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1052 ins->type = STACK_I4;
1053 ins->dreg = vreg = alloc_ireg (cfg);
1054 ins->inst_c0 = intrinsic->opcode / 2;
1055 MONO_ADD_INS (cfg->cbb, ins);
1057 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1058 ins->klass = cmethod->klass;
1060 ins->sreg2 = args [1]->dreg;
1062 ins->inst_c0 = intrinsic->opcode;
1063 MONO_ADD_INS (cfg->cbb, ins);
/* Indirect target: write the updated vector back to memory. */
1067 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1068 ins->klass = cmethod->klass;
1069 ins->dreg = args [0]->dreg;
1071 MONO_ADD_INS (cfg->cbb, ins);
/* Emits code for a Vector get_Vn element getter.  intrinsic->opcode packs
 * the element index; the high bits (>> shift_bits) select a PSHUFLED shuffle
 * that moves the element's dword into lane 0, the low bits address within
 * the dword.  R4 results are converted via OP_ICONV_TO_R8_RAW. */
1077 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1080 MonoMethodSignature *sig = mono_method_signature (cmethod);
1081 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1083 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
/* LLVM handles the full index itself; otherwise shuffle the wanted dword
 * into position first. */
1085 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1086 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1087 ins->klass = cmethod->klass;
1089 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1090 ins->type = STACK_VTYPE;
1091 ins->dreg = vreg = alloc_ireg (cfg);
1092 MONO_ADD_INS (cfg->cbb, ins);
1095 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1096 ins->klass = cmethod->klass;
1098 ins->type = STACK_I4;
1099 ins->dreg = vreg = alloc_ireg (cfg);
1100 if (cfg->compile_llvm)
1101 ins->inst_c0 = intrinsic->opcode;
1103 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1104 MONO_ADD_INS (cfg->cbb, ins);
/* Float getters: reinterpret the raw bits as a float on the FP stack. */
1106 if (sig->ret->type == MONO_TYPE_R4) {
1107 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1108 ins->klass = mono_defaults.single_class;
1110 ins->type = STACK_R8;
1111 ins->dreg = alloc_freg (cfg);
1112 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1113 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a 64-bit element getter (I8 or R8), using OP_EXTRACT_R8/I8 with the
 * element index in inst_c0.  R8 extraction spills through memory. */
1119 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1123 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1125 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1127 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1128 ins->klass = cmethod->klass;
1130 ins->inst_c0 = intrinsic->opcode;
1132 ins->type = STACK_R8;
1133 ins->dreg = alloc_freg (cfg);
1134 ins->backend.spill_var = get_double_spill_area (cfg);
1136 ins->type = STACK_I8;
1137 ins->dreg = alloc_lreg (cfg);
1139 MONO_ADD_INS (cfg->cbb, ins);
/* Emits code for a Vector .ctor.  Single-argument ctors expand the scalar
 * directly into a vector (intrinsic->opcode).  Multi-argument ctors store
 * each element to a memory area (the target itself, or a scratch local when
 * the target was an LDADDR'ed local) and load the assembled vector back. */
1145 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1147 MonoInst *ins = NULL;
1149 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1150 MonoMethodSignature *sig = mono_method_signature (cmethod);
1151 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1152 int arg_size = mono_type_size (sig->params [0], &i);
/* One scalar argument: broadcast it into all lanes. */
1154 if (sig->param_count == 1) {
1158 dreg = args [0]->inst_i0->dreg;
1159 NULLIFY_INS (args [0]);
1161 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1162 dreg = alloc_ireg (cfg);
1165 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1166 ins->klass = cmethod->klass;
1167 ins->sreg1 = args [1]->dreg;
1168 ins->type = STACK_VTYPE;
1171 MONO_ADD_INS (cfg->cbb, ins);
1172 if (sig->params [0]->type == MONO_TYPE_R4)
1173 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1174 else if (sig->params [0]->type == MONO_TYPE_R8)
1175 ins->backend.spill_var = get_double_spill_area (cfg);
/* Non-local target: store the expanded vector back to memory. */
1178 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1179 ins->dreg = args [0]->dreg;
1181 MONO_ADD_INS (cfg->cbb, ins);
/* Multi-argument ctor: pick the address to assemble the vector at. */
1187 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1188 MONO_ADD_INS (cfg->cbb, ins);
1189 addr_reg = ins->dreg;
1191 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1192 addr_reg = args [0]->dreg;
/* Store elements last-to-first (args [1..param_count] are the elements). */
1195 for (i = sig->param_count - 1; i >= 0; --i) {
1196 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1199 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1200 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1201 NULLIFY_INS (args [0]);
1203 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1204 ins->klass = cmethod->klass;
1205 ins->sreg1 = addr_reg;
1206 ins->type = STACK_VTYPE;
1208 MONO_ADD_INS (cfg->cbb, ins);
/* Emits an op_Explicit vector cast: a plain XMOVE with the destination
 * class — no data conversion, just a reinterpretation. */
1214 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1219 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1221 //TODO macroize this
1222 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1223 ins->klass = cmethod->klass;
1224 ins->type = STACK_VTYPE;
1226 ins->dreg = alloc_ireg (cfg);
1227 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a vector shift.  A constant shift count becomes an immediate
 * operand; a variable count is first moved into an XMM reg with
 * OP_ICONV_TO_X and the opcode is bumped to its shift-by-register twin. */
1232 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1235 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1237 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1239 if (args [1]->opcode != OP_ICONST) {
1240 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1241 ins->klass = mono_defaults.int32_class;
1242 ins->sreg1 = args [1]->dreg;
1243 ins->type = STACK_I4;
1244 ins->dreg = vreg2 = alloc_ireg (cfg);
1245 MONO_ADD_INS (cfg->cbb, ins);
1247 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1250 MONO_INST_NEW (cfg, ins, opcode);
1251 ins->klass = cmethod->klass;
/* Constant count: fold it into the instruction and kill the ICONST. */
1255 if (args [1]->opcode == OP_ICONST) {
1256 ins->inst_imm = args [1]->inst_c0;
1257 NULLIFY_INS (args [1]);
1260 ins->type = STACK_VTYPE;
1261 ins->dreg = alloc_ireg (cfg);
1262 MONO_ADD_INS (cfg->cbb, ins);
/* TRUE when @op is one of the integer packed-compare opcodes
 * (the PCMPEQB..PCMPEQQ opcode range). */
1266 static inline gboolean
1267 mono_op_is_packed_compare (int op)
1269 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
/* Emits op_Equality / op_Inequality: compare the two vectors lane-wise,
 * extract the per-byte sign mask, then reduce the 16-bit mask to a boolean.
 * intrinsic->flags (SIMD_COMP_EQ/NEQ) selects the final comparison. */
1273 simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1276 int left_vreg, right_vreg, tmp_vreg;
1278 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
1279 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1282 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1283 ins->klass = cmethod->klass;
1284 ins->sreg1 = left_vreg;
1285 ins->sreg2 = right_vreg;
1286 ins->type = STACK_VTYPE;
1287 ins->klass = cmethod->klass;
1288 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1289 ins->inst_c0 = intrinsic->flags;
1290 MONO_ADD_INS (cfg->cbb, ins);
1292 /*FIXME the next ops are SSE specific*/
1293 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1294 ins->klass = cmethod->klass;
1295 ins->sreg1 = tmp_vreg;
1296 ins->type = STACK_I4;
1297 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1298 MONO_ADD_INS (cfg->cbb, ins);
1300 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1301 if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
/* All 16 mask bits set <=> every lane compared equal. */
1302 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1303 NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1305 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1306 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1308 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a shuffle intrinsic.  Only constant shuffle masks are supported;
 * the ICONST mask is folded into inst_c0 and the constant is nullified. */
1314 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1319 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
1321 if (args [1]->opcode != OP_ICONST) {
1322 /*TODO Shuffle with non literals is not yet supported */
1325 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1326 NULLIFY_INS (args [1]);
1328 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1329 ins->klass = cmethod->klass;
1331 ins->inst_c0 = args [1]->inst_c0;
1332 ins->type = STACK_VTYPE;
1333 ins->dreg = alloc_ireg (cfg);
1334 MONO_ADD_INS (cfg->cbb, ins);
/* Emits LoadAligned: an aligned 16-byte vector load from the address in
 * args[0] into a fresh vreg. */
1339 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1343 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1344 ins->klass = cmethod->klass;
1345 ins->sreg1 = args [0]->dreg;
1346 ins->type = STACK_VTYPE;
1347 ins->dreg = alloc_ireg (cfg);
1348 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a vector store (intrinsic->opcode, aligned or not) of the vector
 * in args[1] to the address in args[0]. */
1353 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1358 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1360 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1361 ins->klass = cmethod->klass;
1362 ins->dreg = args [0]->dreg;
1364 ins->type = STACK_VTYPE;
1365 MONO_ADD_INS (cfg->cbb, ins);
/* Emits ExtractByteMask: collects the sign bit of each byte lane into an
 * int (OP_EXTRACT_MASK). */
1370 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1375 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1377 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1378 ins->klass = cmethod->klass;
1380 ins->type = STACK_I4;
1381 ins->dreg = alloc_ireg (cfg);
1382 MONO_ADD_INS (cfg->cbb, ins);
/* Emits a prefetch hint for the address in args[0]; the temporal-locality
 * mode (SIMD_PREFETCH_MODE_*) travels in backend.arg_info. */
1388 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1392 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1393 ins->klass = cmethod->klass;
1394 ins->sreg1 = args [0]->dreg;
1395 ins->backend.arg_info = intrinsic->flags;
1396 MONO_ADD_INS (cfg->cbb, ins);
/* Returns a printable name for a single SIMD_VERSION_* flag (the string
 * returns live in elided lines); used only for diagnostics. */
1401 simd_version_name (guint32 version)
1404 case SIMD_VERSION_SSE1:
1406 case SIMD_VERSION_SSE2:
1408 case SIMD_VERSION_SSE3:
1410 case SIMD_VERSION_SSSE3:
1412 case SIMD_VERSION_SSE41:
1414 case SIMD_VERSION_SSE42:
1416 case SIMD_VERSION_SSE4a:
/* Looks @cmethod up in the (name-sorted) @intrinsics table and dispatches
 * to the matching simd_intrinsic_emit_* routine.  Returns NULL (in elided
 * lines) when the method is not an intrinsic or requires a SIMD version
 * the CPU lacks. */
1423 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1425 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1427 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1430 if (IS_DEBUG_ON (cfg)) {
1432 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1433 max = fsig->param_count + fsig->hasthis;
1434 for (i = 0; i < max; ++i) {
1435 printf ("param %d: ", i);
1436 mono_print_ins (args [i]);
/* Bail out when the entry needs an instruction set the CPU doesn't have. */
1439 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1440 if (IS_DEBUG_ON (cfg)) {
/* FIX: message previously misspelled "unsuported". */
1442 printf ("function %s::%s/%d requires one of unsupported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1443 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1444 if (result->simd_version_flags & (1 << x))
1445 printf ("%s ", simd_version_name (1 << x));
1452 switch (result->simd_emit_mode) {
1453 case SIMD_EMIT_BINARY:
1454 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1455 case SIMD_EMIT_UNARY:
1456 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1457 case SIMD_EMIT_SETTER:
1458 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1459 case SIMD_EMIT_GETTER:
1460 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1461 case SIMD_EMIT_GETTER_QWORD:
1462 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1463 case SIMD_EMIT_CTOR:
1464 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1465 case SIMD_EMIT_CAST:
1466 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1467 case SIMD_EMIT_SHUFFLE:
1468 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1469 case SIMD_EMIT_SHIFT:
1470 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1471 case SIMD_EMIT_EQUALITY:
1472 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1473 case SIMD_EMIT_LOAD_ALIGNED:
1474 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1475 case SIMD_EMIT_STORE:
1476 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1477 case SIMD_EMIT_EXTRACT_MASK:
1478 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1479 case SIMD_EMIT_PREFETCH:
1480 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1482 g_assert_not_reached ();
/* Computes the address of arr [index] for a vector-sized array access and
 * returns the register holding it (return in elided lines).  When
 * @check_bounds is set, range-checks both the first and the last element
 * the 16-byte access will touch. */
1486 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1490 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1492 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1493 mult_reg = alloc_preg (cfg);
1494 array_reg = arr->dreg;
1495 index_reg = index->dreg;
1497 #if SIZEOF_VOID_P == 8
1498 /* The array reg is 64 bits but the index reg is only 32 */
1499 index2_reg = alloc_preg (cfg);
1500 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1502 index2_reg = index_reg;
1504 index3_reg = alloc_preg (cfg);
/* Check the first element and the last element of the 16-byte span. */
1507 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1508 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1509 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
/* addr = array + index * element_size + offsetof (MonoArray, vector) */
1512 add_reg = alloc_preg (cfg);
1514 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1515 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1516 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
1517 ins->type = STACK_PTR;
1518 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsics on Mono.Simd.ArrayExtensions: GetVector[Aligned] loads a
 * vector from an array element, SetVector[Aligned] stores one, and
 * IsAligned tests whether the element address is 16-byte aligned. */
1524 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1526 if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
1528 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1530 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1531 load->klass = cmethod->klass;
1533 load->type = STACK_VTYPE;
1534 load->dreg = alloc_ireg (cfg);
1535 MONO_ADD_INS (cfg->cbb, load);
1539 if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
1541 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1542 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1544 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1545 store->klass = cmethod->klass;
1547 store->sreg1 = vreg;
1548 MONO_ADD_INS (cfg->cbb, store);
1552 if (!strcmp ("IsAligned", cmethod->name)) {
1554 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
/* Aligned <=> low 4 address bits are zero. */
1556 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1557 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1558 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1559 MONO_ADD_INS (cfg->cbb, ins);
/* Intrinsics on Mono.Simd.SimdRuntime: get_AccelMode is folded into the
 * constant bitmask of CPU-supported SIMD versions. */
1567 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1569 if (!strcmp ("get_AccelMode", cmethod->name)) {
1571 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1578 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1580 const char *class_name;
1582 if (strcmp ("Mono.Simd", cmethod->klass->name_space))
1585 class_name = cmethod->klass->name;
1586 if (!strcmp ("SimdRuntime", class_name))
1587 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1589 if (!strcmp ("ArrayExtensions", class_name))
1590 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1592 if (!strcmp ("VectorOperations", class_name)) {
1593 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1595 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1596 } else if (!cmethod->klass->simd_type)
1599 cfg->uses_simd_intrinsics = 1;
1600 if (!strcmp ("Vector2d", class_name))
1601 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1602 if (!strcmp ("Vector4f", class_name))
1603 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1604 if (!strcmp ("Vector2ul", class_name))
1605 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1606 if (!strcmp ("Vector2l", class_name))
1607 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1608 if (!strcmp ("Vector4ui", class_name))
1609 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1610 if (!strcmp ("Vector4i", class_name))
1611 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1612 if (!strcmp ("Vector8us", class_name))
1613 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1614 if (!strcmp ("Vector8s", class_name))
1615 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1616 if (!strcmp ("Vector16b", class_name))
1617 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1618 if (!strcmp ("Vector16sb", class_name))
1619 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));