2009-01-19 Rodrigo Kumpera <rkumpera@novell.com>
[mono.git] / mono / mini / mini-x86.c
1 /*
2  * mini-x86.c: x86 backend for the Mono code generator
3  *
4  * Authors:
5  *   Paolo Molaro (lupus@ximian.com)
6  *   Dietmar Maurer (dietmar@ximian.com)
7  *   Patrik Torstensson
8  *
9  * (C) 2003 Ximian, Inc.
10  */
11 #include "mini.h"
12 #include <string.h>
13 #include <math.h>
14 #ifdef HAVE_UNISTD_H
15 #include <unistd.h>
16 #endif
17
18 #include <mono/metadata/appdomain.h>
19 #include <mono/metadata/debug-helpers.h>
20 #include <mono/metadata/threads.h>
21 #include <mono/metadata/profiler-private.h>
22 #include <mono/metadata/mono-debug.h>
23 #include <mono/utils/mono-math.h>
24 #include <mono/utils/mono-counters.h>
25
26 #include "trace.h"
27 #include "mini-x86.h"
28 #include "cpu-x86.h"
29 #include "ir-emit.h"
30
/* TLS slot offsets for fast per-thread lookups from generated code.
 * On windows, these hold the key returned by TlsAlloc () */
static gint lmf_tls_offset = -1;        /* presumably the current thread's MonoLMF — set elsewhere, verify */
static gint lmf_addr_tls_offset = -1;   /* presumably the address of the LMF pointer */
static gint appdomain_tls_offset = -1;  /* presumably the current appdomain */
static gint thread_tls_offset = -1;     /* presumably the current thread object */

#ifdef MONO_XEN_OPT
/* When built with Xen support this can be toggled at runtime;
 * otherwise it is compiled out as a constant 0. */
static gboolean optimize_for_xen = TRUE;
#else
#define optimize_for_xen 0
#endif

#ifdef PLATFORM_WIN32
static gboolean is_win32 = TRUE;
#else
static gboolean is_win32 = FALSE;
#endif

/* This mutex protects architecture specific caches */
#define mono_mini_arch_lock() EnterCriticalSection (&mini_arch_mutex)
#define mono_mini_arch_unlock() LeaveCriticalSection (&mini_arch_mutex)
static CRITICAL_SECTION mini_arch_mutex;

/* Round VAL up to the next multiple of ALIGN (ALIGN must be a power of two). */
#define ALIGN_TO(val,align) ((((guint64)val) + ((align) - 1)) & ~((align) - 1))

/* Offset from EBP to the first incoming argument: saved EBP + return address. */
#define ARGS_OFFSET 8

#ifdef PLATFORM_WIN32
/* Under windows, the default pinvoke calling convention is stdcall */
#define CALLCONV_IS_STDCALL(sig) ((((sig)->call_convention) == MONO_CALL_STDCALL) || ((sig)->pinvoke && ((sig)->call_convention) == MONO_CALL_DEFAULT))
#else
#define CALLCONV_IS_STDCALL(sig) (((sig)->call_convention) == MONO_CALL_STDCALL)
#endif

/* Breakpoint descriptors shared with the debugging infrastructure. */
MonoBreakpointInfo
mono_breakpoint_info [MONO_BREAKPOINT_ARRAY_SIZE];
67
/*
 * mono_arch_regname:
 *
 *   Return the AT&T-style name of integer register REG, or "unknown"
 * for values outside the eight general purpose registers.
 */
const char*
mono_arch_regname (int reg)
{
	/* Indexed by the hardware register encoding (X86_EAX == 0 ... X86_EDI == 7). */
	static const char * const int_reg_names [] = {
		"%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi"
	};

	if (reg >= 0 && reg < 8)
		return int_reg_names [reg];
	return "unknown";
}
83
/*
 * mono_arch_fregname:
 *
 *   Return the name of x87 floating point stack slot REG (0-7),
 * or "unknown" for any other value.
 */
const char*
mono_arch_fregname (int reg)
{
	static const char * const fp_reg_names [] = {
		"%fr0", "%fr1", "%fr2", "%fr3", "%fr4", "%fr5", "%fr6", "%fr7"
	};

	if (reg >= 0 && reg < 8)
		return fp_reg_names [reg];
	return "unknown";
}
108
/*
 * mono_arch_xregname:
 *
 *   Return the name of SSE register REG (0-7), or "unknown" for any
 * other value.
 */
const char *
mono_arch_xregname (int reg)
{
	static const char * const xmm_reg_names [] = {
		"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
	};

	if (reg >= 0 && reg < 8)
		return xmm_reg_names [reg];
	return "unknown";
}
133
134
/* Where an argument or return value lives, as decided by get_call_info (). */
typedef enum {
	ArgInIReg,
	ArgInFloatSSEReg,	/* dead on x86: FLOAT_PARAM_REGS is 0, see add_float () */
	ArgInDoubleSSEReg,	/* dead on x86 for the same reason */
	ArgOnStack,
	ArgValuetypeInReg,	/* small struct returned in registers/fp stack; see pair_storage */
	ArgOnFloatFpStack,	/* float returned on the x87 stack */
	ArgOnDoubleFpStack,	/* double returned on the x87 stack */
	ArgNone			/* no storage (void return, unused pair slot) */
} ArgStorage;

/* Storage assignment for a single argument or return value. */
typedef struct {
	gint16 offset;		/* byte offset into the outgoing argument area (for ArgOnStack) */
	gint8  reg;		/* register number, for the register storage kinds */
	ArgStorage storage;

	/* Only if storage == ArgValuetypeInReg */
	ArgStorage pair_storage [2];	/* storage of the low/high halves of the struct */
	gint8 pair_regs [2];		/* registers for the low/high halves */
} ArgInfo;

/* Full call layout computed by get_call_info (). */
typedef struct {
	int nargs;
	guint32 stack_usage;		/* total bytes of stack used for arguments */
	guint32 reg_usage;		/* integer registers consumed */
	guint32 freg_usage;		/* float registers consumed */
	gboolean need_stack_align;	/* extra padding required for frame alignment */
	guint32 stack_align_amount;	/* size of that padding in bytes */
	ArgInfo ret;
	ArgInfo sig_cookie;		/* location of the vararg signature cookie */
	ArgInfo args [1];		/* flexible: one entry per this-pointer + parameter */
} CallInfo;
167
/* On x86 no integer arguments are passed in registers: everything goes on the stack. */
#define PARAM_REGS 0

/* Likewise, no floating point arguments are passed in registers. */
#define FLOAT_PARAM_REGS 0

/* Placeholder: never indexed because PARAM_REGS is 0. */
static X86_Reg_No param_regs [] = { 0 };

#if defined(PLATFORM_WIN32) || defined(__APPLE__) || defined(__FreeBSD__)
/* On these platforms small structs are returned in EAX (and EDX for the high word). */
#define SMALL_STRUCTS_IN_REGS
static X86_Reg_No return_regs [] = { X86_EAX, X86_EDX };
#endif
178
179 static void inline
180 add_general (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo)
181 {
182     ainfo->offset = *stack_size;
183
184     if (*gr >= PARAM_REGS) {
185                 ainfo->storage = ArgOnStack;
186                 (*stack_size) += sizeof (gpointer);
187     }
188     else {
189                 ainfo->storage = ArgInIReg;
190                 ainfo->reg = param_regs [*gr];
191                 (*gr) ++;
192     }
193 }
194
195 static void inline
196 add_general_pair (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo)
197 {
198         ainfo->offset = *stack_size;
199
200         g_assert (PARAM_REGS == 0);
201         
202         ainfo->storage = ArgOnStack;
203         (*stack_size) += sizeof (gpointer) * 2;
204 }
205
206 static void inline
207 add_float (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo, gboolean is_double)
208 {
209     ainfo->offset = *stack_size;
210
211     if (*gr >= FLOAT_PARAM_REGS) {
212                 ainfo->storage = ArgOnStack;
213                 (*stack_size) += is_double ? 8 : 4;
214     }
215     else {
216                 /* A double register */
217                 if (is_double)
218                         ainfo->storage = ArgInDoubleSSEReg;
219                 else
220                         ainfo->storage = ArgInFloatSSEReg;
221                 ainfo->reg = *gr;
222                 (*gr) += 1;
223     }
224 }
225
226
227 static void
228 add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type,
229                gboolean is_return,
230                guint32 *gr, guint32 *fr, guint32 *stack_size)
231 {
232         guint32 size;
233         MonoClass *klass;
234
235         klass = mono_class_from_mono_type (type);
236         size = mini_type_stack_size_full (gsctx, &klass->byval_arg, NULL, sig->pinvoke);
237
238 #ifdef SMALL_STRUCTS_IN_REGS
239         if (sig->pinvoke && is_return) {
240                 MonoMarshalType *info;
241
242                 /*
243                  * the exact rules are not very well documented, the code below seems to work with the 
244                  * code generated by gcc 3.3.3 -mno-cygwin.
245                  */
246                 info = mono_marshal_load_type_info (klass);
247                 g_assert (info);
248
249                 ainfo->pair_storage [0] = ainfo->pair_storage [1] = ArgNone;
250
251                 /* Special case structs with only a float member */
252                 if ((info->native_size == 8) && (info->num_fields == 1) && (info->fields [0].field->type->type == MONO_TYPE_R8)) {
253                         ainfo->storage = ArgValuetypeInReg;
254                         ainfo->pair_storage [0] = ArgOnDoubleFpStack;
255                         return;
256                 }
257                 if ((info->native_size == 4) && (info->num_fields == 1) && (info->fields [0].field->type->type == MONO_TYPE_R4)) {
258                         ainfo->storage = ArgValuetypeInReg;
259                         ainfo->pair_storage [0] = ArgOnFloatFpStack;
260                         return;
261                 }               
262                 if ((info->native_size == 1) || (info->native_size == 2) || (info->native_size == 4) || (info->native_size == 8)) {
263                         ainfo->storage = ArgValuetypeInReg;
264                         ainfo->pair_storage [0] = ArgInIReg;
265                         ainfo->pair_regs [0] = return_regs [0];
266                         if (info->native_size > 4) {
267                                 ainfo->pair_storage [1] = ArgInIReg;
268                                 ainfo->pair_regs [1] = return_regs [1];
269                         }
270                         return;
271                 }
272         }
273 #endif
274
275         ainfo->offset = *stack_size;
276         ainfo->storage = ArgOnStack;
277         *stack_size += ALIGN_TO (size, sizeof (gpointer));
278 }
279
/*
 * get_call_info:
 *
 *  Obtain information about a call according to the calling convention.
 * For x86 ELF, see the "System V Application Binary Interface Intel386 
 * Architecture Processor Supplement, Fourth Edition" document for more
 * information.
 * For x86 win32, see the Microsoft documentation on the cdecl/stdcall
 * calling conventions.
 *
 * The result is allocated from MP when given, otherwise from the heap
 * (the caller must g_free it). NOTE(review): the is_pinvoke parameter
 * is not referenced in this function — confirm whether it is still needed.
 */
static CallInfo*
get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke)
{
	guint32 i, gr, fr;
	MonoType *ret_type;
	int n = sig->hasthis + sig->param_count;
	guint32 stack_size = 0;
	CallInfo *cinfo;

	/* CallInfo already embeds one ArgInfo; reserve space for the rest. */
	if (mp)
		cinfo = mono_mempool_alloc0 (mp, sizeof (CallInfo) + (sizeof (ArgInfo) * n));
	else
		cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));

	gr = 0;
	fr = 0;

	/* return value */
	{
		ret_type = mini_type_get_underlying_type (gsctx, sig->ret);
		switch (ret_type->type) {
		case MONO_TYPE_BOOLEAN:
		case MONO_TYPE_I1:
		case MONO_TYPE_U1:
		case MONO_TYPE_I2:
		case MONO_TYPE_U2:
		case MONO_TYPE_CHAR:
		case MONO_TYPE_I4:
		case MONO_TYPE_U4:
		case MONO_TYPE_I:
		case MONO_TYPE_U:
		case MONO_TYPE_PTR:
		case MONO_TYPE_FNPTR:
		case MONO_TYPE_CLASS:
		case MONO_TYPE_OBJECT:
		case MONO_TYPE_SZARRAY:
		case MONO_TYPE_ARRAY:
		case MONO_TYPE_STRING:
			/* Pointer-sized integers and references come back in EAX. */
			cinfo->ret.storage = ArgInIReg;
			cinfo->ret.reg = X86_EAX;
			break;
		case MONO_TYPE_U8:
		case MONO_TYPE_I8:
			/* 64 bit values come back in EAX:EDX; only the low reg is recorded. */
			cinfo->ret.storage = ArgInIReg;
			cinfo->ret.reg = X86_EAX;
			break;
		case MONO_TYPE_R4:
			/* Floating point returns use the x87 stack. */
			cinfo->ret.storage = ArgOnFloatFpStack;
			break;
		case MONO_TYPE_R8:
			cinfo->ret.storage = ArgOnDoubleFpStack;
			break;
		case MONO_TYPE_GENERICINST:
			if (!mono_type_generic_inst_is_valuetype (sig->ret)) {
				/* Reference-type instantiation: treated like an object. */
				cinfo->ret.storage = ArgInIReg;
				cinfo->ret.reg = X86_EAX;
				break;
			}
			/* Fall through */
		case MONO_TYPE_VALUETYPE: {
			guint32 tmp_gr = 0, tmp_fr = 0, tmp_stacksize = 0;

			add_valuetype (gsctx, sig, &cinfo->ret, sig->ret, TRUE, &tmp_gr, &tmp_fr, &tmp_stacksize);
			if (cinfo->ret.storage == ArgOnStack)
				/* The caller passes the address where the value is stored */
				add_general (&gr, &stack_size, &cinfo->ret);
			break;
		}
		case MONO_TYPE_TYPEDBYREF:
			/* Same as a valuetype with size 24: returned through a hidden pointer. */
			add_general (&gr, &stack_size, &cinfo->ret);
			;
			break;
		case MONO_TYPE_VOID:
			cinfo->ret.storage = ArgNone;
			break;
		default:
			g_error ("Can't handle as return value 0x%x", sig->ret->type);
		}
	}

	/* this */
	if (sig->hasthis)
		add_general (&gr, &stack_size, cinfo->args + 0);

	/* Vararg call with no fixed arguments: the cookie comes first. */
	if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == 0)) {
		gr = PARAM_REGS;
		fr = FLOAT_PARAM_REGS;
		
		/* Emit the signature cookie just before the implicit arguments */
		add_general (&gr, &stack_size, &cinfo->sig_cookie);
	}

	for (i = 0; i < sig->param_count; ++i) {
		ArgInfo *ainfo = &cinfo->args [sig->hasthis + i];
		MonoType *ptype;

		if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos)) {
			/* We always pass the sig cookie on the stack for simplicity */
			/* 
			 * Prevent implicit arguments + the sig cookie from being passed 
			 * in registers.
			 */
			gr = PARAM_REGS;
			fr = FLOAT_PARAM_REGS;

			/* Emit the signature cookie just before the implicit arguments */
			add_general (&gr, &stack_size, &cinfo->sig_cookie);
		}

		/* Byref arguments are just pointers, whatever the pointee type. */
		if (sig->params [i]->byref) {
			add_general (&gr, &stack_size, ainfo);
			continue;
		}
		ptype = mini_type_get_underlying_type (gsctx, sig->params [i]);
		switch (ptype->type) {
		case MONO_TYPE_BOOLEAN:
		case MONO_TYPE_I1:
		case MONO_TYPE_U1:
			add_general (&gr, &stack_size, ainfo);
			break;
		case MONO_TYPE_I2:
		case MONO_TYPE_U2:
		case MONO_TYPE_CHAR:
			add_general (&gr, &stack_size, ainfo);
			break;
		case MONO_TYPE_I4:
		case MONO_TYPE_U4:
			add_general (&gr, &stack_size, ainfo);
			break;
		case MONO_TYPE_I:
		case MONO_TYPE_U:
		case MONO_TYPE_PTR:
		case MONO_TYPE_FNPTR:
		case MONO_TYPE_CLASS:
		case MONO_TYPE_OBJECT:
		case MONO_TYPE_STRING:
		case MONO_TYPE_SZARRAY:
		case MONO_TYPE_ARRAY:
			add_general (&gr, &stack_size, ainfo);
			break;
		case MONO_TYPE_GENERICINST:
			if (!mono_type_generic_inst_is_valuetype (sig->params [i])) {
				add_general (&gr, &stack_size, ainfo);
				break;
			}
			/* Fall through */
		case MONO_TYPE_VALUETYPE:
			add_valuetype (gsctx, sig, ainfo, sig->params [i], FALSE, &gr, &fr, &stack_size);
			break;
		case MONO_TYPE_TYPEDBYREF:
			/* NOTE(review): unlike the other stack cases, ainfo->offset is
			 * not set here — confirm no caller relies on it for typedbyref. */
			stack_size += sizeof (MonoTypedRef);
			ainfo->storage = ArgOnStack;
			break;
		case MONO_TYPE_U8:
		case MONO_TYPE_I8:
			add_general_pair (&gr, &stack_size, ainfo);
			break;
		case MONO_TYPE_R4:
			add_float (&fr, &stack_size, ainfo, FALSE);
			break;
		case MONO_TYPE_R8:
			add_float (&fr, &stack_size, ainfo, TRUE);
			break;
		default:
			g_error ("unexpected type 0x%x", ptype->type);
			g_assert_not_reached ();
		}
	}

	/* Vararg call where all fixed arguments precede the sentinel:
	 * the cookie goes after them. */
	if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n > 0) && (sig->sentinelpos == sig->param_count)) {
		gr = PARAM_REGS;
		fr = FLOAT_PARAM_REGS;
		
		/* Emit the signature cookie just before the implicit arguments */
		add_general (&gr, &stack_size, &cinfo->sig_cookie);
	}

	/* Pad the argument area so ESP stays frame-aligned at the call. */
	if (mono_do_x86_stack_align && (stack_size % MONO_ARCH_FRAME_ALIGNMENT) != 0) {
		cinfo->need_stack_align = TRUE;
		cinfo->stack_align_amount = MONO_ARCH_FRAME_ALIGNMENT - (stack_size % MONO_ARCH_FRAME_ALIGNMENT);
		stack_size += cinfo->stack_align_amount;
	}

	cinfo->stack_usage = stack_size;
	cinfo->reg_usage = gr;
	cinfo->freg_usage = fr;
	return cinfo;
}
478
479 /*
480  * mono_arch_get_argument_info:
481  * @csig:  a method signature
482  * @param_count: the number of parameters to consider
483  * @arg_info: an array to store the result infos
484  *
485  * Gathers information on parameters such as size, alignment and
486  * padding. arg_info should be large enought to hold param_count + 1 entries. 
487  *
488  * Returns the size of the argument area on the stack.
489  */
490 int
491 mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJitArgumentInfo *arg_info)
492 {
493         int k, args_size = 0;
494         int size, pad;
495         guint32 align;
496         int offset = 8;
497         CallInfo *cinfo;
498
499         cinfo = get_call_info (NULL, NULL, csig, FALSE);
500
501         if (MONO_TYPE_ISSTRUCT (csig->ret) && (cinfo->ret.storage == ArgOnStack)) {
502                 args_size += sizeof (gpointer);
503                 offset += 4;
504         }
505
506         arg_info [0].offset = offset;
507
508         if (csig->hasthis) {
509                 args_size += sizeof (gpointer);
510                 offset += 4;
511         }
512
513         arg_info [0].size = args_size;
514
515         for (k = 0; k < param_count; k++) {
516                 size = mini_type_stack_size_full (NULL, csig->params [k], &align, csig->pinvoke);
517
518                 /* ignore alignment for now */
519                 align = 1;
520
521                 args_size += pad = (align - (args_size & (align - 1))) & (align - 1);   
522                 arg_info [k].pad = pad;
523                 args_size += size;
524                 arg_info [k + 1].pad = 0;
525                 arg_info [k + 1].size = size;
526                 offset += pad;
527                 arg_info [k + 1].offset = offset;
528                 offset += size;
529         }
530
531         if (mono_do_x86_stack_align && !CALLCONV_IS_STDCALL (csig))
532                 align = MONO_ARCH_FRAME_ALIGNMENT;
533         else
534                 align = 4;
535         args_size += pad = (align - (args_size & (align - 1))) & (align - 1);
536         arg_info [k].pad = pad;
537
538         g_free (cinfo);
539
540         return args_size;
541 }
542
/*
 * Machine code for a small cdecl helper equivalent to:
 *   void cpuid_impl (int id, int *p_eax, int *p_ebx, int *p_ecx, int *p_edx)
 * It executes CPUID with EAX = id and stores the four result registers
 * through the given pointers. Kept as data so it can be copied into
 * executable memory at runtime (see cpuid () below); the byte values
 * must not be altered.
 */
static const guchar cpuid_impl [] = {
	0x55,                           /* push   %ebp */
	0x89, 0xe5,                     /* mov    %esp,%ebp */
	0x53,                           /* push   %ebx */
	0x8b, 0x45, 0x08,               /* mov    0x8(%ebp),%eax */
	0x0f, 0xa2,                     /* cpuid   */
	0x50,                           /* push   %eax */
	0x8b, 0x45, 0x10,               /* mov    0x10(%ebp),%eax */
	0x89, 0x18,                     /* mov    %ebx,(%eax) */
	0x8b, 0x45, 0x14,               /* mov    0x14(%ebp),%eax */
	0x89, 0x08,                     /* mov    %ecx,(%eax) */
	0x8b, 0x45, 0x18,               /* mov    0x18(%ebp),%eax */
	0x89, 0x10,                     /* mov    %edx,(%eax) */
	0x58,                           /* pop    %eax */
	0x8b, 0x55, 0x0c,               /* mov    0xc(%ebp),%edx */
	0x89, 0x02,                     /* mov    %eax,(%edx) */
	0x5b,                           /* pop    %ebx */
	0xc9,                           /* leave   */
	0xc3,                           /* ret     */
};

/* Signature of the generated cpuid_impl code above. */
typedef void (*CpuidFunc) (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx);
565
/*
 * cpuid:
 *
 *   Execute the CPUID instruction with EAX = ID, storing the resulting
 * EAX/EBX/ECX/EDX through the given pointers. Returns 1 on success,
 * 0 when the CPU does not support CPUID. Support is detected by trying
 * to toggle the ID bit (0x200000) in EFLAGS: if the change sticks,
 * CPUID is available.
 */
static int 
cpuid (int id, int* p_eax, int* p_ebx, int* p_ecx, int* p_edx)
{
	int have_cpuid = 0;
#ifndef _MSC_VER
	/* GCC flavor of the EFLAGS.ID toggle test. */
	__asm__  __volatile__ (
		"pushfl\n"
		"popl %%eax\n"
		"movl %%eax, %%edx\n"
		"xorl $0x200000, %%eax\n"
		"pushl %%eax\n"
		"popfl\n"
		"pushfl\n"
		"popl %%eax\n"
		"xorl %%edx, %%eax\n"
		"andl $0x200000, %%eax\n"
		"movl %%eax, %0"
		: "=r" (have_cpuid)
		:
		: "%eax", "%edx"
	);
#else
	/* MSVC flavor of the same test. */
	__asm {
		pushfd
		pop eax
		mov edx, eax
		xor eax, 0x200000
		push eax
		popfd
		pushfd
		pop eax
		xor eax, edx
		and eax, 0x200000
		mov have_cpuid, eax
	}
#endif
	if (have_cpuid) {
		/* Have to use the code manager to get around WinXP DEP */
		static CpuidFunc func = NULL;
		void *ptr;
		if (!func) {
			/* Copy cpuid_impl into executable memory and call it. */
			ptr = mono_global_codeman_reserve (sizeof (cpuid_impl));
			memcpy (ptr, cpuid_impl, sizeof (cpuid_impl));
			func = (CpuidFunc)ptr;
		}
		func (id, p_eax, p_ebx, p_ecx, p_edx);

		/*
		 * We use this approach because of issues with gcc and pic code, see:
		 * http://gcc.gnu.org/cgi-bin/gnatsweb.pl?cmd=view%20audit-trail&database=gcc&pr=7329
		__asm__ __volatile__ ("cpuid"
			: "=a" (*p_eax), "=b" (*p_ebx), "=c" (*p_ecx), "=d" (*p_edx)
			: "a" (id));
		*/
		return 1;
	}
	return 0;
}
624
/*
 * Initialize the cpu to execute managed code.
 */
void
mono_arch_cpu_init (void)
{
	/* spec compliance requires running with double precision */
#ifndef _MSC_VER
	guint16 fpcw;

	/* Read the x87 control word, set the precision field to double
	 * (53 bit mantissa), and write it back. */
	__asm__  __volatile__ ("fnstcw %0\n": "=m" (fpcw));
	fpcw &= ~X86_FPCW_PRECC_MASK;
	fpcw |= X86_FPCW_PREC_DOUBLE;
	__asm__  __volatile__ ("fldcw %0\n": : "m" (fpcw));
	__asm__  __volatile__ ("fnstcw %0\n": "=m" (fpcw));
#else
	/* MSVC: same effect via the C runtime. */
	_control87 (_PC_53, MCW_PC);
#endif
}
644
/*
 * Initialize architecture specific code.
 * Currently only sets up the mutex guarding the arch-specific caches.
 */
void
mono_arch_init (void)
{
	InitializeCriticalSection (&mini_arch_mutex);
}
653
/*
 * Cleanup architecture specific code.
 * Releases the mutex created by mono_arch_init ().
 */
void
mono_arch_cleanup (void)
{
	DeleteCriticalSection (&mini_arch_mutex);
}
662
663 /*
664  * This function returns the optimizations supported on this cpu.
665  */
666 guint32
667 mono_arch_cpu_optimizazions (guint32 *exclude_mask)
668 {
669         int eax, ebx, ecx, edx;
670         guint32 opts = 0;
671         
672         *exclude_mask = 0;
673         /* Feature Flags function, flags returned in EDX. */
674         if (cpuid (1, &eax, &ebx, &ecx, &edx)) {
675                 if (edx & (1 << 15)) {
676                         opts |= MONO_OPT_CMOV;
677                         if (edx & 1)
678                                 opts |= MONO_OPT_FCMOV;
679                         else
680                                 *exclude_mask |= MONO_OPT_FCMOV;
681                 } else
682                         *exclude_mask |= MONO_OPT_CMOV;
683                 if (edx & (1 << 26))
684                         opts |= MONO_OPT_SSE2;
685                 else
686                         *exclude_mask |= MONO_OPT_SSE2;
687
688 #ifdef MONO_ARCH_SIMD_INTRINSICS
689                 /*SIMD intrinsics require at least SSE2.*/
690                 if (!(opts & MONO_OPT_SSE2))
691                         *exclude_mask |= MONO_OPT_SIMD;
692 #endif
693         }
694         return opts;
695 }
696
697 /*
698  * This function test for all SSE functions supported.
699  *
700  * Returns a bitmask corresponding to all supported versions.
701  * 
702  * TODO detect other versions like SSE4a.
703  */
704 guint32
705 mono_arch_cpu_enumerate_simd_versions (void)
706 {
707         int eax, ebx, ecx, edx;
708         guint32 sse_opts = 0;
709
710         if (cpuid (1, &eax, &ebx, &ecx, &edx)) {
711                 if (edx & (1 << 25))
712                         sse_opts |= 1 << SIMD_VERSION_SSE1;
713                 if (edx & (1 << 26))
714                         sse_opts |= 1 << SIMD_VERSION_SSE2;
715                 if (ecx & (1 << 0))
716                         sse_opts |= 1 << SIMD_VERSION_SSE3;
717                 if (ecx & (1 << 9))
718                         sse_opts |= 1 << SIMD_VERSION_SSSE3;
719                 if (ecx & (1 << 19))
720                         sse_opts |= 1 << SIMD_VERSION_SSE41;
721                 if (ecx & (1 << 20))
722                         sse_opts |= 1 << SIMD_VERSION_SSE42;
723         }
724         return sse_opts;        
725 }
726
/*
 * Determine whether the trap whose info is in SIGINFO is caused by
 * integer overflow (as opposed to division by zero, which raises the
 * same hardware fault on x86).
 */
gboolean
mono_arch_is_int_overflow (void *sigctx, void *info)
{
	MonoContext ctx;
	guint8* ip;

	mono_arch_sigctx_to_monoctx (sigctx, &ctx);

	ip = (guint8*)ctx.eip;

	/* 0xf7 with mod == 3 and reg == 7 is "idiv r32": a signed division
	 * faulting on a register operand. */
	if ((ip [0] == 0xf7) && (x86_modrm_mod (ip [1]) == 0x3) && (x86_modrm_reg (ip [1]) == 0x7)) {
		gint32 reg;

		/* idiv REG: fetch the runtime value of the divisor register. */
		switch (x86_modrm_rm (ip [1])) {
		case X86_EAX:
			reg = ctx.eax;
			break;
		case X86_ECX:
			reg = ctx.ecx;
			break;
		case X86_EDX:
			reg = ctx.edx;
			break;
		case X86_EBX:
			reg = ctx.ebx;
			break;
		case X86_ESI:
			reg = ctx.esi;
			break;
		case X86_EDI:
			reg = ctx.edi;
			break;
		default:
			g_assert_not_reached ();
			reg = -1;	/* not reached; keeps the compiler quiet */
		}

		/* A faulting division with divisor -1 is presumably the
		 * INT_MIN / -1 overflow case, since -1 cannot cause a
		 * divide-by-zero fault. */
		if (reg == -1)
			return TRUE;
	}
			
	return FALSE;
}
775
776 GList *
777 mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
778 {
779         GList *vars = NULL;
780         int i;
781
782         for (i = 0; i < cfg->num_varinfo; i++) {
783                 MonoInst *ins = cfg->varinfo [i];
784                 MonoMethodVar *vmv = MONO_VARINFO (cfg, i);
785
786                 /* unused vars */
787                 if (vmv->range.first_use.abs_pos >= vmv->range.last_use.abs_pos)
788                         continue;
789
790                 if ((ins->flags & (MONO_INST_IS_DEAD|MONO_INST_VOLATILE|MONO_INST_INDIRECT)) || 
791                     (ins->opcode != OP_LOCAL && ins->opcode != OP_ARG))
792                         continue;
793
794                 /* we dont allocate I1 to registers because there is no simply way to sign extend 
795                  * 8bit quantities in caller saved registers on x86 */
796                 if (mono_is_regsize_var (ins->inst_vtype) && (ins->inst_vtype->type != MONO_TYPE_I1)) {
797                         g_assert (MONO_VARINFO (cfg, i)->reg == -1);
798                         g_assert (i == vmv->idx);
799                         vars = g_list_prepend (vars, vmv);
800                 }
801         }
802
803         vars = mono_varlist_sort (cfg, vars, 0);
804
805         return vars;
806 }
807
808 GList *
809 mono_arch_get_global_int_regs (MonoCompile *cfg)
810 {
811         GList *regs = NULL;
812
813         /* we can use 3 registers for global allocation */
814         regs = g_list_prepend (regs, (gpointer)X86_EBX);
815         regs = g_list_prepend (regs, (gpointer)X86_ESI);
816         regs = g_list_prepend (regs, (gpointer)X86_EDI);
817
818         return regs;
819 }
820
821 /*
822  * mono_arch_regalloc_cost:
823  *
824  *  Return the cost, in number of memory references, of the action of 
825  * allocating the variable VMV into a register during global register
826  * allocation.
827  */
828 guint32
829 mono_arch_regalloc_cost (MonoCompile *cfg, MonoMethodVar *vmv)
830 {
831         MonoInst *ins = cfg->varinfo [vmv->idx];
832
833         if (cfg->method->save_lmf)
834                 /* The register is already saved */
835                 return (ins->opcode == OP_ARG) ? 1 : 0;
836         else
837                 /* push+pop+possible load if it is an argument */
838                 return (ins->opcode == OP_ARG) ? 3 : 2;
839 }
840  
/*
 * Set var information according to the calling convention. X86 version.
 * The locals var stuff should most likely be split in another method.
 *
 * Assigns a stack slot (OP_REGOFFSET relative to EBP) or a register
 * (OP_REGVAR) to the return value, every local and every argument, and
 * records the total frame size in cfg->stack_offset.
 */
void
mono_arch_allocate_vars (MonoCompile *cfg)
{
	MonoMethodSignature *sig;
	MonoMethodHeader *header;
	MonoInst *inst;
	guint32 locals_stack_size, locals_stack_align;
	int i, offset;
	gint32 *offsets;
	CallInfo *cinfo;

	header = mono_method_get_header (cfg->method);
	sig = mono_method_signature (cfg->method);

	cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);

	/* 'offset' counts bytes below the frame pointer; slots get negative
	 * EBP-relative offsets. */
	cfg->frame_reg = X86_EBP;
	offset = 0;

	/* Reserve space to save LMF and caller saved registers */

	if (cfg->method->save_lmf) {
		offset += sizeof (MonoLMF);
	} else {
		/* Only the callee-saved registers actually used need a slot. */
		if (cfg->used_int_regs & (1 << X86_EBX)) {
			offset += 4;
		}

		if (cfg->used_int_regs & (1 << X86_EDI)) {
			offset += 4;
		}

		if (cfg->used_int_regs & (1 << X86_ESI)) {
			offset += 4;
		}
	}

	switch (cinfo->ret.storage) {
	case ArgValuetypeInReg:
		/* Allocate a local to hold the result, the epilog will copy it to the correct place */
		offset += 8;
		cfg->ret->opcode = OP_REGOFFSET;
		cfg->ret->inst_basereg = X86_EBP;
		cfg->ret->inst_offset = - offset;
		break;
	default:
		break;
	}

	/* Allocate locals */
	offsets = mono_allocate_stack_slots (cfg, &locals_stack_size, &locals_stack_align);
	if (locals_stack_align) {
		/* Round the running offset up to the locals' required alignment. */
		offset += (locals_stack_align - 1);
		offset &= ~(locals_stack_align - 1);
	}
	/*
	 * EBP is at alignment 8 % MONO_ARCH_FRAME_ALIGNMENT, so if we
	 * have locals larger than 8 bytes we need to make sure that
	 * they have the appropriate offset.
	 */
	if (MONO_ARCH_FRAME_ALIGNMENT > 8 && locals_stack_align > 8)
		offset += MONO_ARCH_FRAME_ALIGNMENT - sizeof (gpointer) * 2;
	for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
		if (offsets [i] != -1) {
			MonoInst *inst = cfg->varinfo [i];
			inst->opcode = OP_REGOFFSET;
			inst->inst_basereg = X86_EBP;
			/* Locals grow downwards from EBP, hence the negated offset. */
			inst->inst_offset = - (offset + offsets [i]);
			//printf ("allocated local %d to ", i); mono_print_tree_nl (inst);
		}
	}
	offset += locals_stack_size;


	/*
	 * Allocate arguments+return value
	 */

	switch (cinfo->ret.storage) {
	case ArgOnStack:
		if (MONO_TYPE_ISSTRUCT (sig->ret)) {
			/* 
			 * In the new IR, the cfg->vret_addr variable represents the
			 * vtype return value.
			 */
			cfg->vret_addr->opcode = OP_REGOFFSET;
			cfg->vret_addr->inst_basereg = cfg->frame_reg;
			cfg->vret_addr->inst_offset = cinfo->ret.offset + ARGS_OFFSET;
			if (G_UNLIKELY (cfg->verbose_level > 1)) {
				printf ("vret_addr =");
				mono_print_ins (cfg->vret_addr);
			}
		} else {
			cfg->ret->opcode = OP_REGOFFSET;
			cfg->ret->inst_basereg = X86_EBP;
			cfg->ret->inst_offset = cinfo->ret.offset + ARGS_OFFSET;
		}
		break;
	case ArgValuetypeInReg:
		/* Already handled above by allocating a dedicated local. */
		break;
	case ArgInIReg:
		cfg->ret->opcode = OP_REGVAR;
		cfg->ret->inst_c0 = cinfo->ret.reg;
		cfg->ret->dreg = cinfo->ret.reg;
		break;
	case ArgNone:
	case ArgOnFloatFpStack:
	case ArgOnDoubleFpStack:
		break;
	default:
		g_assert_not_reached ();
	}

	if (sig->call_convention == MONO_CALL_VARARG) {
		g_assert (cinfo->sig_cookie.storage == ArgOnStack);
		cfg->sig_cookie = cinfo->sig_cookie.offset + ARGS_OFFSET;
	}

	for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
		ArgInfo *ainfo = &cinfo->args [i];
		inst = cfg->args [i];
		if (inst->opcode != OP_REGVAR) {
			inst->opcode = OP_REGOFFSET;
			inst->inst_basereg = X86_EBP;
		}
		/* Incoming args live above the saved EBP and return address
		 * (ARGS_OFFSET), so their offsets are positive. */
		inst->inst_offset = ainfo->offset + ARGS_OFFSET;
	}

	/* Round the total frame size up to MONO_ARCH_FRAME_ALIGNMENT. */
	offset += (MONO_ARCH_FRAME_ALIGNMENT - 1);
	offset &= ~(MONO_ARCH_FRAME_ALIGNMENT - 1);

	cfg->stack_offset = offset;
}
978
979 void
980 mono_arch_create_vars (MonoCompile *cfg)
981 {
982         MonoMethodSignature *sig;
983         CallInfo *cinfo;
984
985         sig = mono_method_signature (cfg->method);
986
987         cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
988
989         if (cinfo->ret.storage == ArgValuetypeInReg)
990                 cfg->ret_var_is_local = TRUE;
991         if ((cinfo->ret.storage != ArgValuetypeInReg) && MONO_TYPE_ISSTRUCT (sig->ret)) {
992                 cfg->vret_addr = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_ARG);
993         }
994 }
995
996 /*
997  * It is expensive to adjust esp for each individual fp argument pushed on the stack
998  * so we try to do it just once when we have multiple fp arguments in a row.
999  * We don't use this mechanism generally because for int arguments the generated code
1000  * is slightly bigger and new generation cpus optimize away the dependency chains
1001  * created by push instructions on the esp value.
1002  * fp_arg_setup is the first argument in the execution sequence where the esp register
1003  * is modified.
1004  */
1005 static G_GNUC_UNUSED int
1006 collect_fp_stack_space (MonoMethodSignature *sig, int start_arg, int *fp_arg_setup)
1007 {
1008         int fp_space = 0;
1009         MonoType *t;
1010
1011         for (; start_arg < sig->param_count; ++start_arg) {
1012                 t = mini_type_get_underlying_type (NULL, sig->params [start_arg]);
1013                 if (!t->byref && t->type == MONO_TYPE_R8) {
1014                         fp_space += sizeof (double);
1015                         *fp_arg_setup = start_arg;
1016                 } else {
1017                         break;
1018                 }
1019         }
1020         return fp_space;
1021 }
1022
/*
 * emit_sig_cookie:
 *
 *   Push the "signature cookie" used by vararg calls: a MonoMethodSignature*
 * describing only the arguments at and after the sentinel position, pushed
 * as an immediate so mono_ArgIterator_Setup can walk the variable part.
 */
static void
emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
{
	MonoMethodSignature *tmp_sig;

	/* FIXME: Add support for signature tokens to AOT */
	cfg->disable_aot = TRUE;

	/*
	 * mono_ArgIterator_Setup assumes the signature cookie is 
	 * passed first and all the arguments which were before it are
	 * passed on the stack after the signature. So compensate by 
	 * passing a different signature.
	 */
	tmp_sig = mono_metadata_signature_dup (call->signature);
	tmp_sig->param_count -= call->signature->sentinelpos;
	tmp_sig->sentinelpos = 0;
	memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));

	MONO_EMIT_NEW_BIALU_IMM (cfg, OP_X86_PUSH_IMM, -1, -1, tmp_sig);
}
1044
/*
 * Emit the IR for setting up the outgoing arguments of CALL according to the
 * x86 calling convention: everything is pushed on the stack, last argument
 * first, with special handling for vtypes, fp values, 64-bit pairs and the
 * vararg signature cookie.  Records the callee-popped stack usage in
 * call->stack_usage.
 */
void
mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
{
	MonoInst *arg, *in;
	MonoMethodSignature *sig;
	int i, n;
	CallInfo *cinfo;
	int sentinelpos = 0;

	sig = call->signature;
	n = sig->param_count + sig->hasthis;

	cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);

	/* For managed varargs, the sentinel index is relative to the full
	 * argument list including the implicit 'this'. */
	if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG))
		sentinelpos = sig->sentinelpos + (sig->hasthis ? 1 : 0);

	if (cinfo->need_stack_align) {
		/* Pad ESP so the outgoing area ends up properly aligned. */
		MONO_INST_NEW (cfg, arg, OP_SUB_IMM);
		arg->dreg = X86_ESP;
		arg->sreg1 = X86_ESP;
		arg->inst_imm = cinfo->stack_align_amount;
		MONO_ADD_INS (cfg->cbb, arg);
	}

	if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret)) {
		MonoInst *vtarg;

		if (cinfo->ret.storage == ArgValuetypeInReg) {
			if (cinfo->ret.pair_storage [0] == ArgInIReg && cinfo->ret.pair_storage [1] == ArgNone) {
				/*
				 * Tell the JIT to use a more efficient calling convention: call using
				 * OP_CALL, compute the result location after the call, and save the 
				 * result there.
				 */
				call->vret_in_reg = TRUE;
			} else {
				/*
				 * The valuetype is in EAX:EDX after the call, needs to be copied to
				 * the stack. Save the address here, so the call instruction can
				 * access it.
				 */
				MONO_INST_NEW (cfg, vtarg, OP_X86_PUSH);
				vtarg->sreg1 = call->vret_var->dreg;
				MONO_ADD_INS (cfg->cbb, vtarg);
			}
		}
	}

	/* Handle the case where there are no implicit arguments */
	if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sentinelpos)) {
		emit_sig_cookie (cfg, call, cinfo);
	}

	/* Arguments are pushed in the reverse order */
	for (i = n - 1; i >= 0; i --) {
		ArgInfo *ainfo = cinfo->args + i;
		MonoType *t;

		if (i >= sig->hasthis)
			t = sig->params [i - sig->hasthis];
		else
			/* The implicit 'this' argument is pointer-sized. */
			t = &mono_defaults.int_class->byval_arg;
		t = mini_type_get_underlying_type (cfg->generic_sharing_context, t);

		MONO_INST_NEW (cfg, arg, OP_X86_PUSH);

		in = call->args [i];
		arg->cil_code = in->cil_code;
		arg->sreg1 = in->dreg;
		arg->type = in->type;

		g_assert (in->dreg != -1);

		if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(t))) {
			guint32 align;
			guint32 size;

			g_assert (in->klass);

			if (t->type == MONO_TYPE_TYPEDBYREF) {
				size = sizeof (MonoTypedRef);
				align = sizeof (gpointer);
			}
			else {
				size = mini_type_stack_size_full (cfg->generic_sharing_context, &in->klass->byval_arg, &align, sig->pinvoke);
			}

			/* Zero-sized vtypes are not pushed at all. */
			if (size > 0) {
				arg->opcode = OP_OUTARG_VT;
				arg->sreg1 = in->dreg;
				arg->klass = in->klass;
				arg->backend.size = size;

				MONO_ADD_INS (cfg->cbb, arg);
			}
		}
		else {
			switch (ainfo->storage) {
			case ArgOnStack:
				arg->opcode = OP_X86_PUSH;
				if (!t->byref) {
					if (t->type == MONO_TYPE_R4) {
						/* R4/R8 can't be pushed from an fp vreg;
						 * make room and store instead. */
						MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 4);
						arg->opcode = OP_STORER4_MEMBASE_REG;
						arg->inst_destbasereg = X86_ESP;
						arg->inst_offset = 0;
					} else if (t->type == MONO_TYPE_R8) {
						MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
						arg->opcode = OP_STORER8_MEMBASE_REG;
						arg->inst_destbasereg = X86_ESP;
						arg->inst_offset = 0;
					} else if (t->type == MONO_TYPE_I8 || t->type == MONO_TYPE_U8) {
						/* 64-bit value: push the high half (dreg + 2)
						 * first, then the low half (dreg + 1) below it. */
						arg->sreg1 ++;
						MONO_EMIT_NEW_UNALU (cfg, OP_X86_PUSH, -1, in->dreg + 2);
					}
				}
				break;
			default:
				g_assert_not_reached ();
			}
			
			MONO_ADD_INS (cfg->cbb, arg);
		}

		if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sentinelpos)) {
			/* Emit the signature cookie just before the implicit arguments */
			emit_sig_cookie (cfg, call, cinfo);
		}
	}

	if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret)) {
		MonoInst *vtarg;

		if (cinfo->ret.storage == ArgValuetypeInReg) {
			/* Already done */
		}
		else if (cinfo->ret.storage == ArgInIReg) {
			NOT_IMPLEMENTED;
			/* The return address is passed in a register */
			MONO_INST_NEW (cfg, vtarg, OP_MOVE);
			vtarg->sreg1 = call->inst.dreg;
			vtarg->dreg = mono_alloc_ireg (cfg);
			MONO_ADD_INS (cfg->cbb, vtarg);
				
			mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
		} else {
			/* Push the address of the vtype return buffer as a hidden
			 * first argument. */
			MonoInst *vtarg;
			MONO_INST_NEW (cfg, vtarg, OP_X86_PUSH);
			vtarg->type = STACK_MP;
			vtarg->sreg1 = call->vret_var->dreg;
			MONO_ADD_INS (cfg->cbb, vtarg);
		}

		/* if the function returns a struct, the called method already does a ret $0x4 */
		cinfo->stack_usage -= 4;
	}

	call->stack_usage = cinfo->stack_usage;
}
1205
1206 void
1207 mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
1208 {
1209         MonoInst *arg;
1210         int size = ins->backend.size;
1211
1212         if (size <= 4) {
1213                 MONO_INST_NEW (cfg, arg, OP_X86_PUSH_MEMBASE);
1214                 arg->sreg1 = src->dreg;
1215
1216                 MONO_ADD_INS (cfg->cbb, arg);
1217         } else if (size <= 20) {        
1218                 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, ALIGN_TO (size, 4));
1219                 mini_emit_memcpy (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
1220         } else {
1221                 MONO_INST_NEW (cfg, arg, OP_X86_PUSH_OBJ);
1222                 arg->inst_basereg = src->dreg;
1223                 arg->inst_offset = 0;
1224                 arg->inst_imm = size;
1225                                         
1226                 MONO_ADD_INS (cfg->cbb, arg);
1227         }
1228 }
1229
1230 void
1231 mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val)
1232 {
1233         MonoType *ret = mini_type_get_underlying_type (cfg->generic_sharing_context, mono_method_signature (method)->ret);
1234
1235         if (!ret->byref) {
1236                 if (ret->type == MONO_TYPE_R4) {
1237                         /* Nothing to do */
1238                         return;
1239                 } else if (ret->type == MONO_TYPE_R8) {
1240                         /* Nothing to do */
1241                         return;
1242                 } else if (ret->type == MONO_TYPE_I8 || ret->type == MONO_TYPE_U8) {
1243                         MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, X86_EAX, val->dreg + 1);
1244                         MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, X86_EDX, val->dreg + 2);
1245                         return;
1246                 }
1247         }
1248                         
1249         MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, cfg->ret->dreg, val->dreg);
1250 }
1251
/*
 * Allow tracing to work with this interface (with an optional argument)
 *
 * Emits machine code at P that calls FUNC (cfg->method, saved-EBP) on
 * method entry.  Returns the updated code pointer.
 */
void*
mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments)
{
	guchar *code = p;

	/* Keep the stack aligned across the call: (FRAME_ALIGNMENT - 8) of
	 * padding + push EBP (4) + push method (4) = FRAME_ALIGNMENT bytes,
	 * all undone by the single ADD below. */
	g_assert (MONO_ARCH_FRAME_ALIGNMENT >= 8);
	x86_alu_reg_imm (code, X86_SUB, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - 8);

	/* if some args are passed in registers, we need to save them here */
	x86_push_reg (code, X86_EBP);

	if (cfg->compile_aot) {
		/* No absolute-address patches under AOT: load the target into
		 * EAX and call indirectly. */
		x86_push_imm (code, cfg->method);
		x86_mov_reg_imm (code, X86_EAX, func);
		x86_call_reg (code, X86_EAX);
	} else {
		mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_METHODCONST, cfg->method);
		x86_push_imm (code, cfg->method);
		mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_ABS, func);
		x86_call_code (code, 0);
	}
	x86_alu_reg_imm (code, X86_ADD, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT);

	return code;
}
1280
/* How mono_arch_instrument_epilog () must preserve the return value around
 * the call to the tracing function. */
enum {
	SAVE_NONE,
	SAVE_STRUCT,	/* vtype returned through a hidden pointer argument */
	SAVE_EAX,	/* 32-bit integer/pointer result */
	SAVE_EAX_EDX,	/* 64-bit result in the EAX:EDX pair */
	SAVE_FP		/* floating point result on the x87 stack */
};
1288
1289 void*
1290 mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments)
1291 {
1292         guchar *code = p;
1293         int arg_size = 0, save_mode = SAVE_NONE;
1294         MonoMethod *method = cfg->method;
1295         
1296         switch (mini_type_get_underlying_type (cfg->generic_sharing_context, mono_method_signature (method)->ret)->type) {
1297         case MONO_TYPE_VOID:
1298                 /* special case string .ctor icall */
1299                 if (strcmp (".ctor", method->name) && method->klass == mono_defaults.string_class)
1300                         save_mode = SAVE_EAX;
1301                 else
1302                         save_mode = SAVE_NONE;
1303                 break;
1304         case MONO_TYPE_I8:
1305         case MONO_TYPE_U8:
1306                 save_mode = SAVE_EAX_EDX;
1307                 break;
1308         case MONO_TYPE_R4:
1309         case MONO_TYPE_R8:
1310                 save_mode = SAVE_FP;
1311                 break;
1312         case MONO_TYPE_GENERICINST:
1313                 if (!mono_type_generic_inst_is_valuetype (mono_method_signature (method)->ret)) {
1314                         save_mode = SAVE_EAX;
1315                         break;
1316                 }
1317                 /* Fall through */
1318         case MONO_TYPE_VALUETYPE:
1319                 save_mode = SAVE_STRUCT;
1320                 break;
1321         default:
1322                 save_mode = SAVE_EAX;
1323                 break;
1324         }
1325
1326         switch (save_mode) {
1327         case SAVE_EAX_EDX:
1328                 x86_push_reg (code, X86_EDX);
1329                 x86_push_reg (code, X86_EAX);
1330                 if (enable_arguments) {
1331                         x86_push_reg (code, X86_EDX);
1332                         x86_push_reg (code, X86_EAX);
1333                         arg_size = 8;
1334                 }
1335                 break;
1336         case SAVE_EAX:
1337                 x86_push_reg (code, X86_EAX);
1338                 if (enable_arguments) {
1339                         x86_push_reg (code, X86_EAX);
1340                         arg_size = 4;
1341                 }
1342                 break;
1343         case SAVE_FP:
1344                 x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);
1345                 x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE);
1346                 if (enable_arguments) {
1347                         x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);
1348                         x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE);
1349                         arg_size = 8;
1350                 }
1351                 break;
1352         case SAVE_STRUCT:
1353                 if (enable_arguments) {
1354                         x86_push_membase (code, X86_EBP, 8);
1355                         arg_size = 4;
1356                 }
1357                 break;
1358         case SAVE_NONE:
1359         default:
1360                 break;
1361         }
1362
1363         if (cfg->compile_aot) {
1364                 x86_push_imm (code, method);
1365                 x86_mov_reg_imm (code, X86_EAX, func);
1366                 x86_call_reg (code, X86_EAX);
1367         } else {
1368                 mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_METHODCONST, method);
1369                 x86_push_imm (code, method);
1370                 mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_ABS, func);
1371                 x86_call_code (code, 0);
1372         }
1373         x86_alu_reg_imm (code, X86_ADD, X86_ESP, arg_size + 4);
1374
1375         switch (save_mode) {
1376         case SAVE_EAX_EDX:
1377                 x86_pop_reg (code, X86_EAX);
1378                 x86_pop_reg (code, X86_EDX);
1379                 break;
1380         case SAVE_EAX:
1381                 x86_pop_reg (code, X86_EAX);
1382                 break;
1383         case SAVE_FP:
1384                 x86_fld_membase (code, X86_ESP, 0, TRUE);
1385                 x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
1386                 break;
1387         case SAVE_NONE:
1388         default:
1389                 break;
1390         }
1391
1392         return code;
1393 }
1394
/*
 * EMIT_COND_BRANCH:
 *
 *   Emit a conditional branch to INS's target (a label when
 * MONO_INST_BRLABEL is set, otherwise a basic block).  If the target has
 * already been emitted, branch to it directly; otherwise record a patch
 * and, when MONO_OPT_BRANCH is on and the estimated displacement fits in
 * 8 bits, use the short branch encoding.
 * NOTE: relies on 'cpos' being in scope at the expansion site.
 */
#define EMIT_COND_BRANCH(ins,cond,sign) \
if (ins->flags & MONO_INST_BRLABEL) { \
        if (ins->inst_i0->inst_c0) { \
                x86_branch (code, cond, cfg->native_code + ins->inst_i0->inst_c0, sign); \
        } else { \
                mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_LABEL, ins->inst_i0); \
                if ((cfg->opt & MONO_OPT_BRANCH) && \
                    x86_is_imm8 (ins->inst_i0->inst_c1 - cpos)) \
                        x86_branch8 (code, cond, 0, sign); \
                else \
                        x86_branch32 (code, cond, 0, sign); \
        } \
} else { \
        if (ins->inst_true_bb->native_offset) { \
                x86_branch (code, cond, cfg->native_code + ins->inst_true_bb->native_offset, sign); \
        } else { \
                mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_true_bb); \
                if ((cfg->opt & MONO_OPT_BRANCH) && \
                    x86_is_imm8 (ins->inst_true_bb->max_offset - cpos)) \
                        x86_branch8 (code, cond, 0, sign); \
                else \
                        x86_branch32 (code, cond, 0, sign); \
        } \
}
1419
/*
 * EMIT_COND_SYSTEM_EXCEPTION:
 *
 *   Emit a conditional branch raising the exception EXC_NAME when COND
 * holds.  If mono_branch_optimize_exception_target () finds an existing
 * throw site for this exception, branch directly to it; otherwise record
 * an EXC patch to be resolved later.
 */
#define EMIT_COND_SYSTEM_EXCEPTION(cond,signed,exc_name)            \
        do {                                                        \
                MonoInst *tins = mono_branch_optimize_exception_target (cfg, bb, exc_name); \
                if (tins == NULL) {                                                                             \
                        mono_add_patch_info (cfg, code - cfg->native_code,   \
                                        MONO_PATCH_INFO_EXC, exc_name);  \
                        x86_branch32 (code, cond, 0, signed);               \
                } else {        \
                        EMIT_COND_BRANCH (tins, cond, signed);  \
                }                       \
        } while (0); 
1435
/* Compare the top two entries of the x87 fp stack (popping both) and move
 * the FPU status word into AX so the condition bits can be tested. */
#define EMIT_FPCOMPARE(code) do { \
        x86_fcompp (code); \
        x86_fnstsw (code); \
} while (0); 
1440
1441
1442 static guint8*
1443 emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer data)
1444 {
1445         mono_add_patch_info (cfg, code - cfg->native_code, patch_type, data);
1446         x86_call_code (code, 0);
1447
1448         return code;
1449 }
1450
/* TRUE if OPCODE does not consume the carry flag, i.e. it is none of the
 * add/subtract-with-carry variants. */
#define INST_IGNORES_CFLAGS(opcode) (!(((opcode) == OP_ADC) || ((opcode) == OP_IADC) || ((opcode) == OP_ADC_IMM) || ((opcode) == OP_IADC_IMM) || ((opcode) == OP_SBB) || ((opcode) == OP_ISBB) || ((opcode) == OP_SBB_IMM) || ((opcode) == OP_ISBB_IMM)))
1452
/*
 * mono_peephole_pass_1:
 *
 *   Perform peephole opts which should/can be performed before local regalloc
 */
void
mono_arch_peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
{
	MonoInst *ins, *n;

	MONO_BB_FOR_EACH_INS_SAFE (bb, n, ins) {
		MonoInst *last_ins = ins->prev;

		switch (ins->opcode) {
		case OP_IADD_IMM:
		case OP_ADD_IMM:
			if ((ins->sreg1 < MONO_MAX_IREGS) && (ins->dreg >= MONO_MAX_IREGS)) {
				/* 
				 * X86_LEA is like ADD, but doesn't have the
				 * sreg1==dreg restriction.
				 */
				ins->opcode = OP_X86_LEA_MEMBASE;
				ins->inst_basereg = ins->sreg1;
			} else if ((ins->inst_imm == 1) && (ins->dreg == ins->sreg1))
				/* add reg, 1 -> inc reg (shorter encoding) */
				ins->opcode = OP_X86_INC_REG;
			break;
		case OP_SUB_IMM:
		case OP_ISUB_IMM:
			/* As above, but LEA with a negated immediate. */
			if ((ins->sreg1 < MONO_MAX_IREGS) && (ins->dreg >= MONO_MAX_IREGS)) {
				ins->opcode = OP_X86_LEA_MEMBASE;
				ins->inst_basereg = ins->sreg1;
				ins->inst_imm = -ins->inst_imm;
			} else if ((ins->inst_imm == 1) && (ins->dreg == ins->sreg1))
				/* sub reg, 1 -> dec reg (shorter encoding) */
				ins->opcode = OP_X86_DEC_REG;
			break;
		case OP_COMPARE_IMM:
		case OP_ICOMPARE_IMM:
			/* OP_COMPARE_IMM (reg, 0) 
			 * --> 
			 * OP_X86_TEST_NULL (reg) 
			 */
			if (!ins->inst_imm)
				ins->opcode = OP_X86_TEST_NULL;
			break;
		case OP_X86_COMPARE_MEMBASE_IMM:
			/* 
			 * OP_STORE_MEMBASE_REG reg, offset(basereg)
			 * OP_X86_COMPARE_MEMBASE_IMM offset(basereg), imm
			 * -->
			 * OP_STORE_MEMBASE_REG reg, offset(basereg)
			 * OP_COMPARE_IMM reg, imm
			 *
			 * Note: if imm = 0 then OP_COMPARE_IMM replaced with OP_X86_TEST_NULL
			 */
			if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG) &&
			    ins->inst_basereg == last_ins->inst_destbasereg &&
			    ins->inst_offset == last_ins->inst_offset) {
					ins->opcode = OP_COMPARE_IMM;
					ins->sreg1 = last_ins->sreg1;

					/* check if we can remove cmp reg,0 with test null */
					if (!ins->inst_imm)
						ins->opcode = OP_X86_TEST_NULL;
				}

			break;			
		case OP_X86_PUSH_MEMBASE:
			/* The value just stored to the slot can be pushed from its
			 * register directly, avoiding the memory load. */
			if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG ||
					 last_ins->opcode == OP_STORE_MEMBASE_REG) &&
			    ins->inst_basereg == last_ins->inst_destbasereg &&
			    ins->inst_offset == last_ins->inst_offset) {
				    ins->opcode = OP_X86_PUSH;
				    ins->sreg1 = last_ins->sreg1;
			}
			break;
		}

		mono_peephole_ins (bb, ins);
	}
}
1533
1534 void
1535 mono_arch_peephole_pass_2 (MonoCompile *cfg, MonoBasicBlock *bb)
1536 {
1537         MonoInst *ins, *n;
1538
1539         MONO_BB_FOR_EACH_INS_SAFE (bb, n, ins) {
1540                 switch (ins->opcode) {
1541                 case OP_ICONST:
1542                         /* reg = 0 -> XOR (reg, reg) */
1543                         /* XOR sets cflags on x86, so we cant do it always */
1544                         if (ins->inst_c0 == 0 && (!ins->next || (ins->next && INST_IGNORES_CFLAGS (ins->next->opcode)))) {
1545                                 MonoInst *ins2;
1546
1547                                 ins->opcode = OP_IXOR;
1548                                 ins->sreg1 = ins->dreg;
1549                                 ins->sreg2 = ins->dreg;
1550
1551                                 /* 
1552                                  * Convert succeeding STORE_MEMBASE_IMM 0 ins to STORE_MEMBASE_REG 
1553                                  * since it takes 3 bytes instead of 7.
1554                                  */
1555                                 for (ins2 = ins->next; ins2; ins2 = ins2->next) {
1556                                         if ((ins2->opcode == OP_STORE_MEMBASE_IMM) && (ins2->inst_imm == 0)) {
1557                                                 ins2->opcode = OP_STORE_MEMBASE_REG;
1558                                                 ins2->sreg1 = ins->dreg;
1559                                         }
1560                                         else if ((ins2->opcode == OP_STOREI4_MEMBASE_IMM) && (ins2->inst_imm == 0)) {
1561                                                 ins2->opcode = OP_STOREI4_MEMBASE_REG;
1562                                                 ins2->sreg1 = ins->dreg;
1563                                         }
1564                                         else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM)) {
1565                                                 /* Continue iteration */
1566                                         }
1567                                         else
1568                                                 break;
1569                                 }
1570                         }
1571                         break;
1572                 case OP_IADD_IMM:
1573                 case OP_ADD_IMM:
1574                         if ((ins->inst_imm == 1) && (ins->dreg == ins->sreg1))
1575                                 ins->opcode = OP_X86_INC_REG;
1576                         break;
1577                 case OP_ISUB_IMM:
1578                 case OP_SUB_IMM:
1579                         if ((ins->inst_imm == 1) && (ins->dreg == ins->sreg1))
1580                                 ins->opcode = OP_X86_DEC_REG;
1581                         break;
1582                 }
1583
1584                 mono_peephole_ins (bb, ins);
1585         }
1586 }
1587
1588 /*
1589  * mono_arch_lowering_pass:
1590  *
1591  *  Converts complex opcodes into simpler ones so that each IR instruction
1592  * corresponds to one machine instruction.
1593  */
1594 void
1595 mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
1596 {
1597         MonoInst *ins, *next;
1598
1599         /*
1600          * FIXME: Need to add more instructions, but the current machine 
1601          * description can't model some parts of the composite instructions like
1602          * cdq.
1603          */
1604         MONO_BB_FOR_EACH_INS_SAFE (bb, next, ins) {
1605                 switch (ins->opcode) {
1606                 case OP_IREM_IMM:
1607                 case OP_IDIV_IMM:
1608                 case OP_IDIV_UN_IMM:
1609                 case OP_IREM_UN_IMM:
1610                         /* 
1611                          * Keep the cases where we could generated optimized code, otherwise convert
1612                          * to the non-imm variant.
1613                          */
1614                         if ((ins->opcode == OP_IREM_IMM) && mono_is_power_of_two (ins->inst_imm) >= 0)
1615                                 break;
1616                         mono_decompose_op_imm (cfg, bb, ins);
1617                         break;
1618                 default:
1619                         break;
1620                 }
1621         }
1622
1623         bb->max_vreg = cfg->next_vreg;
1624 }
1625
/*
 * Maps the branch opcodes to x86 condition codes.  The three rows appear to
 * cover the signed, unsigned and overflow/carry variants of the branches --
 * NOTE(review): the exact index base is not visible here; verify against the
 * opcode table before relying on the ordering.
 */
static const int 
branch_cc_table [] = {
	X86_CC_EQ, X86_CC_GE, X86_CC_GT, X86_CC_LE, X86_CC_LT,
	X86_CC_NE, X86_CC_GE, X86_CC_GT, X86_CC_LE, X86_CC_LT,
	X86_CC_O, X86_CC_NO, X86_CC_C, X86_CC_NC
};

/* Maps CMP_... constants to X86_CC_... constants */
static const int
cc_table [] = {
	X86_CC_EQ, X86_CC_NE, X86_CC_LE, X86_CC_GE, X86_CC_LT, X86_CC_GT,
	X86_CC_LE, X86_CC_GE, X86_CC_LT, X86_CC_GT
};

/*
 * For each cc_table entry, whether the comparison is signed (TRUE) or
 * unsigned (FALSE); used to select the signed/unsigned form of the x86
 * condition code.
 */
static const int
cc_signed_table [] = {
	TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
	FALSE, FALSE, FALSE, FALSE
};
1645
/*
 * emit_float_to_int:
 *
 *   Emit code which converts the value on top of the x87 FP stack to an
 * integer of SIZE bytes, placing the result in DREG.  IS_SIGNED selects
 * sign/zero extension for the 1 and 2 byte sizes.  Returns the updated
 * code pointer.
 */
static unsigned char*
emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int size, gboolean is_signed)
{
#define XMM_TEMP_REG 0
	/*This SSE2 optimization must not be done with OPT_SIMD in place as it clobbers xmm0.*/
	/*The xmm pass decomposes OP_FCONV_ ops anyway.*/
	if (cfg->opt & MONO_OPT_SSE2 && size < 8 && !(cfg->opt & MONO_OPT_SIMD)) {
		/* optimize by assigning a local var for this use so we avoid
		 * the stack manipulations */
		/* Spill the x87 value, reload it into xmm0 and use cvttsd2si,
		 * which truncates, so no control-word change is needed. */
		x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);
		x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE);
		x86_movsd_reg_membase (code, XMM_TEMP_REG, X86_ESP, 0);
		x86_cvttsd2si (code, dreg, XMM_TEMP_REG);
		x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
		if (size == 1)
			x86_widen_reg (code, dreg, dreg, is_signed, FALSE);
		else if (size == 2)
			x86_widen_reg (code, dreg, dreg, is_signed, TRUE);
		return code;
	}
	/* x87 path: save the FPU control word and set the rounding-control
	 * bits (0xc00 == RC field) to "round toward zero" as required for
	 * C-style float->int conversion. */
	x86_alu_reg_imm (code, X86_SUB, X86_ESP, 4);
	x86_fnstcw_membase(code, X86_ESP, 0);
	x86_mov_reg_membase (code, dreg, X86_ESP, 0, 2);
	x86_alu_reg_imm (code, X86_OR, dreg, 0xc00);
	x86_mov_membase_reg (code, X86_ESP, 2, dreg, 2);
	x86_fldcw_membase (code, X86_ESP, 2);
	if (size == 8) {
		x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);
		x86_fist_pop_membase (code, X86_ESP, 0, TRUE);
		x86_pop_reg (code, dreg);
		/* FIXME: need the high register 
		 * x86_pop_reg (code, dreg_high);
		 */
	} else {
		x86_push_reg (code, X86_EAX); // SP = SP - 4
		x86_fist_pop_membase (code, X86_ESP, 0, FALSE);
		x86_pop_reg (code, dreg);
	}
	/* Restore the original FPU control word */
	x86_fldcw_membase (code, X86_ESP, 0);
	x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);

	if (size == 1)
		x86_widen_reg (code, dreg, dreg, is_signed, FALSE);
	else if (size == 2)
		x86_widen_reg (code, dreg, dreg, is_signed, TRUE);
	return code;
}
1693
/*
 * mono_emit_stack_alloc:
 *
 *   Emit native code for localloc: subtract sreg1 bytes from ESP,
 * touching the stack one page at a time where the platform requires it
 * (guard-page based stack growth), and zero the allocated area when the
 * MONO_INST_INIT flag is set.  Returns the updated code pointer.
 */
static unsigned char*
mono_emit_stack_alloc (guchar *code, MonoInst* tree)
{
	int sreg = tree->sreg1;
	int need_touch = FALSE;

#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
	need_touch = TRUE;
#endif

	if (need_touch) {
		guint8* br[5];

		/*
		 * Under Windows:
		 * If requested stack size is larger than one page,
		 * perform stack-touch operation
		 */
		/*
		 * Generate stack probe code.
		 * Under Windows, it is necessary to allocate one page at a time,
		 * "touching" stack after each successful sub-allocation. This is
		 * because of the way stack growth is implemented - there is a
		 * guard page before the lowest stack page that is currently commited.
		 * Stack normally grows sequentially so OS traps access to the
		 * guard page and commits more pages when needed.
		 */
		/* Sizes that fit in one page skip the probe loop entirely */
		x86_test_reg_imm (code, sreg, ~0xFFF);
		br[0] = code; x86_branch8 (code, X86_CC_Z, 0, FALSE);

		br[2] = code; /* loop */
		x86_alu_reg_imm (code, X86_SUB, X86_ESP, 0x1000);
		/* touch the new page so the guard page is hit in order */
		x86_test_membase_reg (code, X86_ESP, 0, X86_ESP);

		/* 
		 * By the end of the loop, sreg2 is smaller than 0x1000, so the init routine
		 * that follows only initializes the last part of the area.
		 */
		/* Same as the init code below with size==0x1000 */
		if (tree->flags & MONO_INST_INIT) {
			x86_push_reg (code, X86_EAX);
			x86_push_reg (code, X86_ECX);
			x86_push_reg (code, X86_EDI);
			/* rep stosl: ECX = dword count, EAX = 0, EDI = area start */
			x86_mov_reg_imm (code, X86_ECX, (0x1000 >> 2));
			x86_alu_reg_reg (code, X86_XOR, X86_EAX, X86_EAX);
			/* +12 skips the three registers saved just above */
			x86_lea_membase (code, X86_EDI, X86_ESP, 12);
			x86_cld (code);
			x86_prefix (code, X86_REP_PREFIX);
			x86_stosl (code);
			x86_pop_reg (code, X86_EDI);
			x86_pop_reg (code, X86_ECX);
			x86_pop_reg (code, X86_EAX);
		}

		/* Loop while at least one whole page remains to be allocated */
		x86_alu_reg_imm (code, X86_SUB, sreg, 0x1000);
		x86_alu_reg_imm (code, X86_CMP, sreg, 0x1000);
		br[3] = code; x86_branch8 (code, X86_CC_AE, 0, FALSE);
		x86_patch (br[3], br[2]);
		/* Allocate the sub-page remainder, if any */
		x86_test_reg_reg (code, sreg, sreg);
		br[4] = code; x86_branch8 (code, X86_CC_Z, 0, FALSE);
		x86_alu_reg_reg (code, X86_SUB, X86_ESP, sreg);

		br[1] = code; x86_jump8 (code, 0);

		/* size <= one page: a single ESP adjustment is enough */
		x86_patch (br[0], code);
		x86_alu_reg_reg (code, X86_SUB, X86_ESP, sreg);
		x86_patch (br[1], code);
		x86_patch (br[4], code);
	}
	else
		x86_alu_reg_reg (code, X86_SUB, X86_ESP, tree->sreg1);

	if (tree->flags & MONO_INST_INIT) {
		int offset = 0;
		/*
		 * Save EAX/ECX/EDI unless they hold the destination or the size;
		 * offset tracks how far the saves moved ESP so the lea below
		 * still points at the start of the allocated area.
		 */
		if (tree->dreg != X86_EAX && sreg != X86_EAX) {
			x86_push_reg (code, X86_EAX);
			offset += 4;
		}
		if (tree->dreg != X86_ECX && sreg != X86_ECX) {
			x86_push_reg (code, X86_ECX);
			offset += 4;
		}
		if (tree->dreg != X86_EDI && sreg != X86_EDI) {
			x86_push_reg (code, X86_EDI);
			offset += 4;
		}
		
		/* rep stosl zeroes size/4 dwords starting at the allocated area */
		x86_shift_reg_imm (code, X86_SHR, sreg, 2);
		if (sreg != X86_ECX)
			x86_mov_reg_reg (code, X86_ECX, sreg, 4);
		x86_alu_reg_reg (code, X86_XOR, X86_EAX, X86_EAX);
				
		x86_lea_membase (code, X86_EDI, X86_ESP, offset);
		x86_cld (code);
		x86_prefix (code, X86_REP_PREFIX);
		x86_stosl (code);
		
		if (tree->dreg != X86_EDI && sreg != X86_EDI)
			x86_pop_reg (code, X86_EDI);
		if (tree->dreg != X86_ECX && sreg != X86_ECX)
			x86_pop_reg (code, X86_ECX);
		if (tree->dreg != X86_EAX && sreg != X86_EAX)
			x86_pop_reg (code, X86_EAX);
	}
	return code;
}
1800
1801
/*
 * emit_move_return_value:
 *
 *   Emit code to move the result of the call instruction INS from the
 * fixed return location to its IR destination: EAX -> dreg for scalar
 * calls, register pair -> destination memory for valuetypes returned in
 * registers, and a spill/reload for R4 returns to drop excess x87
 * precision.  Returns the updated code pointer.
 */
static guint8*
emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
{
	CallInfo *cinfo;
	int quad;

	/* Move return value to the target register */
	switch (ins->opcode) {
	case OP_CALL:
	case OP_CALL_REG:
	case OP_CALL_MEMBASE:
		/* Scalar integer results come back in EAX */
		if (ins->dreg != X86_EAX)
			x86_mov_reg_reg (code, ins->dreg, X86_EAX, 4);
		break;
	case OP_VCALL:
	case OP_VCALL_REG:
	case OP_VCALL_MEMBASE:
	case OP_VCALL2:
	case OP_VCALL2_REG:
	case OP_VCALL2_MEMBASE:
		/* cinfo is mempool-allocated, no need to free it */
		cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, ((MonoCallInst*)ins)->signature, FALSE);
		if (cinfo->ret.storage == ArgValuetypeInReg) {
			/* Pop the destination address from the stack */
			x86_pop_reg (code, X86_ECX);
			
			/* Store each returned quad into the destination memory */
			for (quad = 0; quad < 2; quad ++) {
				switch (cinfo->ret.pair_storage [quad]) {
				case ArgInIReg:
					/* ECX holds the destination address, so the pair regs must not clash with it */
					g_assert (cinfo->ret.pair_regs [quad] != X86_ECX);
					x86_mov_membase_reg (code, X86_ECX, (quad * sizeof (gpointer)), cinfo->ret.pair_regs [quad], sizeof (gpointer));
					break;
				case ArgNone:
					break;
				default:
					g_assert_not_reached ();
				}
			}
		}
		break;
	case OP_FCALL: {
		MonoCallInst *call = (MonoCallInst*)ins;
		if (call->method && !mono_method_signature (call->method)->ret->byref && mono_method_signature (call->method)->ret->type == MONO_TYPE_R4) {
			/* Avoid some precision issues by saving/reloading the return value */
			/* (storing as a 32 bit float rounds the 80 bit x87 value to R4 precision) */
			x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);
			x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE);
			x86_fld_membase (code, X86_ESP, 0, FALSE);
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
		}
		break;
	}
	default:
		break;
	}

	return code;
}
1858
/*
 * mono_x86_emit_tls_get:
 * @code: buffer to store code to
 * @dreg: hard register where to place the result
 * @tls_offset: offset info
 *
 * mono_x86_emit_tls_get emits in @code the native code that puts in
 * the dreg register the item in the thread local storage identified
 * by tls_offset.
 *
 * Returns: a pointer to the end of the stored code
 */
guint8*
mono_x86_emit_tls_get (guint8* code, int dreg, int tls_offset)
{
#ifdef PLATFORM_WIN32
	/* 
	 * See the Under the Hood article in the May 1996 issue of Microsoft Systems 
	 * Journal and/or a disassembly of the TlsGet () function.
	 */
	/* Only the first 64 TLS slots live at a fixed offset inside the TEB */
	g_assert (tls_offset < 64);
	/* FS:[0x18] holds the TEB self pointer */
	x86_prefix (code, X86_FS_PREFIX);
	x86_mov_reg_mem (code, dreg, 0x18, 4);
	/*
	 * TlsGetValue () also contains this store; it zeroes a TEB field at
	 * offset 0x34.  NOTE(review): kept for parity with the OS
	 * implementation -- exact purpose unconfirmed.
	 */
	x86_alu_membase_imm (code, X86_AND, dreg, 0x34, 0);
	/* 3600 == 0xE10, presumably the TlsSlots array offset in the TEB -- verify */
	x86_mov_reg_membase (code, dreg, dreg, 3600 + (tls_offset * 4), 4);
#else
	if (optimize_for_xen) {
		/*
		 * NOTE(review): under Xen the direct gs-relative access is
		 * avoided; load the thread base from gs:[0] and index off it.
		 */
		x86_prefix (code, X86_GS_PREFIX);
		x86_mov_reg_mem (code, dreg, 0, 4);
		x86_mov_reg_membase (code, dreg, dreg, tls_offset, 4);
	} else {
		/* Direct gs-relative load of the TLS slot */
		x86_prefix (code, X86_GS_PREFIX);
		x86_mov_reg_mem (code, dreg, tls_offset, 4);
	}
#endif
	return code;
}
1897
1898 /*
1899  * emit_load_volatile_arguments:
1900  *
1901  *  Load volatile arguments from the stack to the original input registers.
1902  * Required before a tail call.
1903  */
1904 static guint8*
1905 emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
1906 {
1907         MonoMethod *method = cfg->method;
1908         MonoMethodSignature *sig;
1909         MonoInst *inst;
1910         CallInfo *cinfo;
1911         guint32 i;
1912
1913         /* FIXME: Generate intermediate code instead */
1914
1915         sig = mono_method_signature (method);
1916
1917         cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
1918         
1919         /* This is the opposite of the code in emit_prolog */
1920
1921         for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
1922                 ArgInfo *ainfo = cinfo->args + i;
1923                 MonoType *arg_type;
1924                 inst = cfg->args [i];
1925
1926                 if (sig->hasthis && (i == 0))
1927                         arg_type = &mono_defaults.object_class->byval_arg;
1928                 else
1929                         arg_type = sig->params [i - sig->hasthis];
1930
1931                 /*
1932                  * On x86, the arguments are either in their original stack locations, or in
1933                  * global regs.
1934                  */
1935                 if (inst->opcode == OP_REGVAR) {
1936                         g_assert (ainfo->storage == ArgOnStack);
1937                         
1938                         x86_mov_membase_reg (code, X86_EBP, inst->inst_offset, inst->dreg, 4);
1939                 }
1940         }
1941
1942         return code;
1943 }
1944
/*
 * REAL_PRINT_REG:
 *
 *   Emit debugging code which calls printf (text " %d %p\n", reg, <value of reg>),
 * preserving EAX/EDX/ECX around the call.  The register value is pushed
 * first, then the register number, so %d prints the register index and %p
 * its runtime contents.  NOTE: multi-statement macro not wrapped in
 * do {} while (0) -- only use where a plain statement sequence is valid.
 */
#define REAL_PRINT_REG(text,reg) \
mono_assert (reg >= 0); \
x86_push_reg (code, X86_EAX); \
x86_push_reg (code, X86_EDX); \
x86_push_reg (code, X86_ECX); \
x86_push_reg (code, reg); \
x86_push_imm (code, reg); \
x86_push_imm (code, text " %d %p\n"); \
x86_mov_reg_imm (code, X86_EAX, printf); \
x86_call_reg (code, X86_EAX); \
x86_alu_reg_imm (code, X86_ADD, X86_ESP, 3*4); \
x86_pop_reg (code, X86_ECX); \
x86_pop_reg (code, X86_EDX); \
x86_pop_reg (code, X86_EAX);

/* benchmark and set based on cpu */
#define LOOP_ALIGNMENT 8
#define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
1963
1964 void
1965 mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
1966 {
1967         MonoInst *ins;
1968         MonoCallInst *call;
1969         guint offset;
1970         guint8 *code = cfg->native_code + cfg->code_len;
1971         int max_len, cpos;
1972
1973         if (cfg->opt & MONO_OPT_LOOP) {
1974                 int pad, align = LOOP_ALIGNMENT;
1975                 /* set alignment depending on cpu */
1976                 if (bb_is_loop_start (bb) && (pad = (cfg->code_len & (align - 1)))) {
1977                         pad = align - pad;
1978                         /*g_print ("adding %d pad at %x to loop in %s\n", pad, cfg->code_len, cfg->method->name);*/
1979                         x86_padding (code, pad);
1980                         cfg->code_len += pad;
1981                         bb->native_offset = cfg->code_len;
1982                 }
1983         }
1984
1985         if (cfg->verbose_level > 2)
1986                 g_print ("Basic block %d starting at offset 0x%x\n", bb->block_num, bb->native_offset);
1987
1988         cpos = bb->max_offset;
1989
1990         if (cfg->prof_options & MONO_PROFILE_COVERAGE) {
1991                 MonoProfileCoverageInfo *cov = cfg->coverage_info;
1992                 g_assert (!cfg->compile_aot);
1993                 cpos += 6;
1994
1995                 cov->data [bb->dfn].cil_code = bb->cil_code;
1996                 /* this is not thread save, but good enough */
1997                 x86_inc_mem (code, &cov->data [bb->dfn].count); 
1998         }
1999
2000         offset = code - cfg->native_code;
2001
2002         mono_debug_open_block (cfg, bb, offset);
2003
2004         MONO_BB_FOR_EACH_INS (bb, ins) {
2005                 offset = code - cfg->native_code;
2006
2007                 max_len = ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
2008
2009                 if (G_UNLIKELY (offset > (cfg->code_size - max_len - 16))) {
2010                         cfg->code_size *= 2;
2011                         cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
2012                         code = cfg->native_code + offset;
2013                         mono_jit_stats.code_reallocs++;
2014                 }
2015
2016                 if (cfg->debug_info)
2017                         mono_debug_record_line_number (cfg, ins, offset);
2018
2019                 switch (ins->opcode) {
2020                 case OP_BIGMUL:
2021                         x86_mul_reg (code, ins->sreg2, TRUE);
2022                         break;
2023                 case OP_BIGMUL_UN:
2024                         x86_mul_reg (code, ins->sreg2, FALSE);
2025                         break;
2026                 case OP_X86_SETEQ_MEMBASE:
2027                 case OP_X86_SETNE_MEMBASE:
2028                         x86_set_membase (code, ins->opcode == OP_X86_SETEQ_MEMBASE ? X86_CC_EQ : X86_CC_NE,
2029                                          ins->inst_basereg, ins->inst_offset, TRUE);
2030                         break;
2031                 case OP_STOREI1_MEMBASE_IMM:
2032                         x86_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, ins->inst_imm, 1);
2033                         break;
2034                 case OP_STOREI2_MEMBASE_IMM:
2035                         x86_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, ins->inst_imm, 2);
2036                         break;
2037                 case OP_STORE_MEMBASE_IMM:
2038                 case OP_STOREI4_MEMBASE_IMM:
2039                         x86_mov_membase_imm (code, ins->inst_destbasereg, ins->inst_offset, ins->inst_imm, 4);
2040                         break;
2041                 case OP_STOREI1_MEMBASE_REG:
2042                         x86_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 1);
2043                         break;
2044                 case OP_STOREI2_MEMBASE_REG:
2045                         x86_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 2);
2046                         break;
2047                 case OP_STORE_MEMBASE_REG:
2048                 case OP_STOREI4_MEMBASE_REG:
2049                         x86_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 4);
2050                         break;
2051                 case OP_STORE_MEM_IMM:
2052                         x86_mov_mem_imm (code, ins->inst_p0, ins->inst_c0, 4);
2053                         break;
2054                 case OP_LOADU4_MEM:
2055                         x86_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
2056                         break;
2057                 case OP_LOAD_MEM:
2058                 case OP_LOADI4_MEM:
2059                         /* These are created by the cprop pass so they use inst_imm as the source */
2060                         x86_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
2061                         break;
2062                 case OP_LOADU1_MEM:
2063                         x86_widen_mem (code, ins->dreg, ins->inst_imm, FALSE, FALSE);
2064                         break;
2065                 case OP_LOADU2_MEM:
2066                         x86_widen_mem (code, ins->dreg, ins->inst_imm, FALSE, TRUE);
2067                         break;
2068                 case OP_LOAD_MEMBASE:
2069                 case OP_LOADI4_MEMBASE:
2070                 case OP_LOADU4_MEMBASE:
2071                         x86_mov_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, 4);
2072                         break;
2073                 case OP_LOADU1_MEMBASE:
2074                         x86_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, FALSE, FALSE);
2075                         break;
2076                 case OP_LOADI1_MEMBASE:
2077                         x86_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, TRUE, FALSE);
2078                         break;
2079                 case OP_LOADU2_MEMBASE:
2080                         x86_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, FALSE, TRUE);
2081                         break;
2082                 case OP_LOADI2_MEMBASE:
2083                         x86_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, TRUE, TRUE);
2084                         break;
2085                 case OP_ICONV_TO_I1:
2086                 case OP_SEXT_I1:
2087                         x86_widen_reg (code, ins->dreg, ins->sreg1, TRUE, FALSE);
2088                         break;
2089                 case OP_ICONV_TO_I2:
2090                 case OP_SEXT_I2:
2091                         x86_widen_reg (code, ins->dreg, ins->sreg1, TRUE, TRUE);
2092                         break;
2093                 case OP_ICONV_TO_U1:
2094                         x86_widen_reg (code, ins->dreg, ins->sreg1, FALSE, FALSE);
2095                         break;
2096                 case OP_ICONV_TO_U2:
2097                         x86_widen_reg (code, ins->dreg, ins->sreg1, FALSE, TRUE);
2098                         break;
2099                 case OP_COMPARE:
2100                 case OP_ICOMPARE:
2101                         x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
2102                         break;
2103                 case OP_COMPARE_IMM:
2104                 case OP_ICOMPARE_IMM:
2105                         x86_alu_reg_imm (code, X86_CMP, ins->sreg1, ins->inst_imm);
2106                         break;
2107                 case OP_X86_COMPARE_MEMBASE_REG:
2108                         x86_alu_membase_reg (code, X86_CMP, ins->inst_basereg, ins->inst_offset, ins->sreg2);
2109                         break;
2110                 case OP_X86_COMPARE_MEMBASE_IMM:
2111                         x86_alu_membase_imm (code, X86_CMP, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
2112                         break;
2113                 case OP_X86_COMPARE_MEMBASE8_IMM:
2114                         x86_alu_membase8_imm (code, X86_CMP, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
2115                         break;
2116                 case OP_X86_COMPARE_REG_MEMBASE:
2117                         x86_alu_reg_membase (code, X86_CMP, ins->sreg1, ins->sreg2, ins->inst_offset);
2118                         break;
2119                 case OP_X86_COMPARE_MEM_IMM:
2120                         x86_alu_mem_imm (code, X86_CMP, ins->inst_offset, ins->inst_imm);
2121                         break;
2122                 case OP_X86_TEST_NULL:
2123                         x86_test_reg_reg (code, ins->sreg1, ins->sreg1);
2124                         break;
2125                 case OP_X86_ADD_MEMBASE_IMM:
2126                         x86_alu_membase_imm (code, X86_ADD, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
2127                         break;
2128                 case OP_X86_ADD_REG_MEMBASE:
2129                         x86_alu_reg_membase (code, X86_ADD, ins->sreg1, ins->sreg2, ins->inst_offset);
2130                         break;
2131                 case OP_X86_SUB_MEMBASE_IMM:
2132                         x86_alu_membase_imm (code, X86_SUB, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
2133                         break;
2134                 case OP_X86_SUB_REG_MEMBASE:
2135                         x86_alu_reg_membase (code, X86_SUB, ins->sreg1, ins->sreg2, ins->inst_offset);
2136                         break;
2137                 case OP_X86_AND_MEMBASE_IMM:
2138                         x86_alu_membase_imm (code, X86_AND, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
2139                         break;
2140                 case OP_X86_OR_MEMBASE_IMM:
2141                         x86_alu_membase_imm (code, X86_OR, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
2142                         break;
2143                 case OP_X86_XOR_MEMBASE_IMM:
2144                         x86_alu_membase_imm (code, X86_XOR, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
2145                         break;
2146                 case OP_X86_ADD_MEMBASE_REG:
2147                         x86_alu_membase_reg (code, X86_ADD, ins->inst_basereg, ins->inst_offset, ins->sreg2);
2148                         break;
2149                 case OP_X86_SUB_MEMBASE_REG:
2150                         x86_alu_membase_reg (code, X86_SUB, ins->inst_basereg, ins->inst_offset, ins->sreg2);
2151                         break;
2152                 case OP_X86_AND_MEMBASE_REG:
2153                         x86_alu_membase_reg (code, X86_AND, ins->inst_basereg, ins->inst_offset, ins->sreg2);
2154                         break;
2155                 case OP_X86_OR_MEMBASE_REG:
2156                         x86_alu_membase_reg (code, X86_OR, ins->inst_basereg, ins->inst_offset, ins->sreg2);
2157                         break;
2158                 case OP_X86_XOR_MEMBASE_REG:
2159                         x86_alu_membase_reg (code, X86_XOR, ins->inst_basereg, ins->inst_offset, ins->sreg2);
2160                         break;
2161                 case OP_X86_INC_MEMBASE:
2162                         x86_inc_membase (code, ins->inst_basereg, ins->inst_offset);
2163                         break;
2164                 case OP_X86_INC_REG:
2165                         x86_inc_reg (code, ins->dreg);
2166                         break;
2167                 case OP_X86_DEC_MEMBASE:
2168                         x86_dec_membase (code, ins->inst_basereg, ins->inst_offset);
2169                         break;
2170                 case OP_X86_DEC_REG:
2171                         x86_dec_reg (code, ins->dreg);
2172                         break;
2173                 case OP_X86_MUL_REG_MEMBASE:
2174                         x86_imul_reg_membase (code, ins->sreg1, ins->sreg2, ins->inst_offset);
2175                         break;
2176                 case OP_X86_AND_REG_MEMBASE:
2177                         x86_alu_reg_membase (code, X86_AND, ins->sreg1, ins->sreg2, ins->inst_offset);
2178                         break;
2179                 case OP_X86_OR_REG_MEMBASE:
2180                         x86_alu_reg_membase (code, X86_OR, ins->sreg1, ins->sreg2, ins->inst_offset);
2181                         break;
2182                 case OP_X86_XOR_REG_MEMBASE:
2183                         x86_alu_reg_membase (code, X86_XOR, ins->sreg1, ins->sreg2, ins->inst_offset);
2184                         break;
2185                 case OP_BREAK:
2186                         x86_breakpoint (code);
2187                         break;
2188                 case OP_RELAXED_NOP:
2189                         x86_prefix (code, X86_REP_PREFIX);
2190                         x86_nop (code);
2191                         break;
2192                 case OP_HARD_NOP:
2193                         x86_nop (code);
2194                         break;
2195                 case OP_NOP:
2196                 case OP_DUMMY_USE:
2197                 case OP_DUMMY_STORE:
2198                 case OP_NOT_REACHED:
2199                 case OP_NOT_NULL:
2200                         break;
2201                 case OP_ADDCC:
2202                 case OP_IADDCC:
2203                 case OP_IADD:
2204                         x86_alu_reg_reg (code, X86_ADD, ins->sreg1, ins->sreg2);
2205                         break;
2206                 case OP_ADC:
2207                 case OP_IADC:
2208                         x86_alu_reg_reg (code, X86_ADC, ins->sreg1, ins->sreg2);
2209                         break;
2210                 case OP_ADDCC_IMM:
2211                 case OP_ADD_IMM:
2212                 case OP_IADD_IMM:
2213                         x86_alu_reg_imm (code, X86_ADD, ins->dreg, ins->inst_imm);
2214                         break;
2215                 case OP_ADC_IMM:
2216                 case OP_IADC_IMM:
2217                         x86_alu_reg_imm (code, X86_ADC, ins->dreg, ins->inst_imm);
2218                         break;
2219                 case OP_SUBCC:
2220                 case OP_ISUBCC:
2221                 case OP_ISUB:
2222                         x86_alu_reg_reg (code, X86_SUB, ins->sreg1, ins->sreg2);
2223                         break;
2224                 case OP_SBB:
2225                 case OP_ISBB:
2226                         x86_alu_reg_reg (code, X86_SBB, ins->sreg1, ins->sreg2);
2227                         break;
2228                 case OP_SUBCC_IMM:
2229                 case OP_SUB_IMM:
2230                 case OP_ISUB_IMM:
2231                         x86_alu_reg_imm (code, X86_SUB, ins->dreg, ins->inst_imm);
2232                         break;
2233                 case OP_SBB_IMM:
2234                 case OP_ISBB_IMM:
2235                         x86_alu_reg_imm (code, X86_SBB, ins->dreg, ins->inst_imm);
2236                         break;
2237                 case OP_IAND:
2238                         x86_alu_reg_reg (code, X86_AND, ins->sreg1, ins->sreg2);
2239                         break;
2240                 case OP_AND_IMM:
2241                 case OP_IAND_IMM:
2242                         x86_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_imm);
2243                         break;
2244                 case OP_IDIV:
2245                 case OP_IREM:
2246                         /* 
2247                          * The code is the same for div/rem, the allocator will allocate dreg
2248                          * to EAX/EDX as appropriate.
2249                          */
2250                         if (ins->sreg2 == X86_EDX) {
2251                                 /* cdq clobbers this */
2252                                 x86_push_reg (code, ins->sreg2);
2253                                 x86_cdq (code);
2254                                 x86_div_membase (code, X86_ESP, 0, TRUE);
2255                                 x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);                            
2256                         } else {
2257                                 x86_cdq (code);
2258                                 x86_div_reg (code, ins->sreg2, TRUE);
2259                         }
2260                         break;
2261                 case OP_IDIV_UN:
2262                 case OP_IREM_UN:
2263                         if (ins->sreg2 == X86_EDX) {
2264                                 x86_push_reg (code, ins->sreg2);
2265                                 x86_alu_reg_reg (code, X86_XOR, X86_EDX, X86_EDX);
2266                                 x86_div_membase (code, X86_ESP, 0, FALSE);
2267                                 x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);                            
2268                         } else {
2269                                 x86_alu_reg_reg (code, X86_XOR, X86_EDX, X86_EDX);
2270                                 x86_div_reg (code, ins->sreg2, FALSE);
2271                         }
2272                         break;
2273                 case OP_DIV_IMM:
2274                         x86_mov_reg_imm (code, ins->sreg2, ins->inst_imm);
2275                         x86_cdq (code);
2276                         x86_div_reg (code, ins->sreg2, TRUE);
2277                         break;
2278                 case OP_IREM_IMM: {
2279                         int power = mono_is_power_of_two (ins->inst_imm);
2280
2281                         g_assert (ins->sreg1 == X86_EAX);
2282                         g_assert (ins->dreg == X86_EAX);
2283                         g_assert (power >= 0);
2284
2285                         if (power == 1) {
2286                                 /* Based on http://compilers.iecc.com/comparch/article/93-04-079 */
2287                                 x86_cdq (code);
2288                                 x86_alu_reg_imm (code, X86_AND, X86_EAX, 1);
2289                                 /* 
2290                                  * If the dividend is >= 0, this does nothing. If it is negative,
2291                                  * it transforms %eax=0 into %eax=0, and %eax=1 into %eax=-1.
2292                                  */
2293                                 x86_alu_reg_reg (code, X86_XOR, X86_EAX, X86_EDX);
2294                                 x86_alu_reg_reg (code, X86_SUB, X86_EAX, X86_EDX);
2295                         } else {
2296                                 /* Based on gcc code */
2297
2298                                 /* Add compensation for negative dividends */
2299                                 x86_cdq (code);
2300                                 x86_shift_reg_imm (code, X86_SHR, X86_EDX, 32 - power);
2301                                 x86_alu_reg_reg (code, X86_ADD, X86_EAX, X86_EDX);
2302                                 /* Compute remainder */
2303                                 x86_alu_reg_imm (code, X86_AND, X86_EAX, (1 << power) - 1);
2304                                 /* Remove compensation */
2305                                 x86_alu_reg_reg (code, X86_SUB, X86_EAX, X86_EDX);
2306                         }
2307                         break;
2308                 }
2309                 case OP_IOR:
2310                         x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
2311                         break;
2312                 case OP_OR_IMM:
2313                 case OP_IOR_IMM:
2314                         x86_alu_reg_imm (code, X86_OR, ins->sreg1, ins->inst_imm);
2315                         break;
2316                 case OP_IXOR:
2317                         x86_alu_reg_reg (code, X86_XOR, ins->sreg1, ins->sreg2);
2318                         break;
2319                 case OP_XOR_IMM:
2320                 case OP_IXOR_IMM:
2321                         x86_alu_reg_imm (code, X86_XOR, ins->sreg1, ins->inst_imm);
2322                         break;
2323                 case OP_ISHL:
2324                         g_assert (ins->sreg2 == X86_ECX);
2325                         x86_shift_reg (code, X86_SHL, ins->dreg);
2326                         break;
2327                 case OP_ISHR:
2328                         g_assert (ins->sreg2 == X86_ECX);
2329                         x86_shift_reg (code, X86_SAR, ins->dreg);
2330                         break;
2331                 case OP_SHR_IMM:
2332                 case OP_ISHR_IMM:
2333                         x86_shift_reg_imm (code, X86_SAR, ins->dreg, ins->inst_imm);
2334                         break;
2335                 case OP_SHR_UN_IMM:
2336                 case OP_ISHR_UN_IMM:
2337                         x86_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_imm);
2338                         break;
2339                 case OP_ISHR_UN:
2340                         g_assert (ins->sreg2 == X86_ECX);
2341                         x86_shift_reg (code, X86_SHR, ins->dreg);
2342                         break;
2343                 case OP_SHL_IMM:
2344                 case OP_ISHL_IMM:
2345                         x86_shift_reg_imm (code, X86_SHL, ins->dreg, ins->inst_imm);
2346                         break;
2347                 case OP_LSHL: {
2348                         guint8 *jump_to_end;
2349
2350                         /* handle shifts below 32 bits */
2351                         x86_shld_reg (code, ins->backend.reg3, ins->sreg1);
2352                         x86_shift_reg (code, X86_SHL, ins->sreg1);
2353
2354                         x86_test_reg_imm (code, X86_ECX, 32);
2355                         jump_to_end = code; x86_branch8 (code, X86_CC_EQ, 0, TRUE);
2356
2357                         /* handle shift over 32 bit */
2358                         x86_mov_reg_reg (code, ins->backend.reg3, ins->sreg1, 4);
2359                         x86_clear_reg (code, ins->sreg1);
2360                         
2361                         x86_patch (jump_to_end, code);
2362                         }
2363                         break;
2364                 case OP_LSHR: {
2365                         guint8 *jump_to_end;
2366
2367                         /* handle shifts below 32 bits */
2368                         x86_shrd_reg (code, ins->sreg1, ins->backend.reg3);
2369                         x86_shift_reg (code, X86_SAR, ins->backend.reg3);
2370
2371                         x86_test_reg_imm (code, X86_ECX, 32);
2372                         jump_to_end = code; x86_branch8 (code, X86_CC_EQ, 0, FALSE);
2373
2374                         /* handle shifts over 31 bits */
2375                         x86_mov_reg_reg (code, ins->sreg1, ins->backend.reg3, 4);
2376                         x86_shift_reg_imm (code, X86_SAR, ins->backend.reg3, 31);
2377                         
2378                         x86_patch (jump_to_end, code);
2379                         }
2380                         break;
2381                 case OP_LSHR_UN: {
2382                         guint8 *jump_to_end;
2383
2384                         /* handle shifts below 32 bits */
2385                         x86_shrd_reg (code, ins->sreg1, ins->backend.reg3);
2386                         x86_shift_reg (code, X86_SHR, ins->backend.reg3);
2387
2388                         x86_test_reg_imm (code, X86_ECX, 32);
2389                         jump_to_end = code; x86_branch8 (code, X86_CC_EQ, 0, FALSE);
2390
2391                         /* handle shifts over 31 bits */
2392                         x86_mov_reg_reg (code, ins->sreg1, ins->backend.reg3, 4);
2393                         x86_clear_reg (code, ins->backend.reg3);
2394                         
2395                         x86_patch (jump_to_end, code);
2396                         }
2397                         break;
2398                 case OP_LSHL_IMM:
2399                         if (ins->inst_imm >= 32) {
2400                                 x86_mov_reg_reg (code, ins->backend.reg3, ins->sreg1, 4);
2401                                 x86_clear_reg (code, ins->sreg1);
2402                                 x86_shift_reg_imm (code, X86_SHL, ins->backend.reg3, ins->inst_imm - 32);
2403                         } else {
2404                                 x86_shld_reg_imm (code, ins->backend.reg3, ins->sreg1, ins->inst_imm);
2405                                 x86_shift_reg_imm (code, X86_SHL, ins->sreg1, ins->inst_imm);
2406                         }
2407                         break;
2408                 case OP_LSHR_IMM:
2409                         if (ins->inst_imm >= 32) {
2410                                 x86_mov_reg_reg (code, ins->sreg1, ins->backend.reg3,  4);
2411                                 x86_shift_reg_imm (code, X86_SAR, ins->backend.reg3, 0x1f);
2412                                 x86_shift_reg_imm (code, X86_SAR, ins->sreg1, ins->inst_imm - 32);
2413                         } else {
2414                                 x86_shrd_reg_imm (code, ins->sreg1, ins->backend.reg3, ins->inst_imm);
2415                                 x86_shift_reg_imm (code, X86_SAR, ins->backend.reg3, ins->inst_imm);
2416                         }
2417                         break;
2418                 case OP_LSHR_UN_IMM:
2419                         if (ins->inst_imm >= 32) {
2420                                 x86_mov_reg_reg (code, ins->sreg1, ins->backend.reg3, 4);
2421                                 x86_clear_reg (code, ins->backend.reg3);
2422                                 x86_shift_reg_imm (code, X86_SHR, ins->sreg1, ins->inst_imm - 32);
2423                         } else {
2424                                 x86_shrd_reg_imm (code, ins->sreg1, ins->backend.reg3, ins->inst_imm);
2425                                 x86_shift_reg_imm (code, X86_SHR, ins->backend.reg3, ins->inst_imm);
2426                         }
2427                         break;
2428                 case OP_INOT:
2429                         x86_not_reg (code, ins->sreg1);
2430                         break;
2431                 case OP_INEG:
2432                         x86_neg_reg (code, ins->sreg1);
2433                         break;
2434
2435                 case OP_IMUL:
2436                         x86_imul_reg_reg (code, ins->sreg1, ins->sreg2);
2437                         break;
2438                 case OP_MUL_IMM:
2439                 case OP_IMUL_IMM:
2440                         switch (ins->inst_imm) {
2441                         case 2:
2442                                 /* MOV r1, r2 */
2443                                 /* ADD r1, r1 */
2444                                 if (ins->dreg != ins->sreg1)
2445                                         x86_mov_reg_reg (code, ins->dreg, ins->sreg1, 4);
2446                                 x86_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
2447                                 break;
2448                         case 3:
2449                                 /* LEA r1, [r2 + r2*2] */
2450                                 x86_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
2451                                 break;
2452                         case 5:
2453                                 /* LEA r1, [r2 + r2*4] */
2454                                 x86_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
2455                                 break;
2456                         case 6:
2457                                 /* LEA r1, [r2 + r2*2] */
2458                                 /* ADD r1, r1          */
2459                                 x86_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
2460                                 x86_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
2461                                 break;
2462                         case 9:
2463                                 /* LEA r1, [r2 + r2*8] */
2464                                 x86_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 3);
2465                                 break;
2466                         case 10:
2467                                 /* LEA r1, [r2 + r2*4] */
2468                                 /* ADD r1, r1          */
2469                                 x86_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
2470                                 x86_alu_reg_reg (code, X86_ADD, ins->dreg, ins->dreg);
2471                                 break;
2472                         case 12:
2473                                 /* LEA r1, [r2 + r2*2] */
2474                                 /* SHL r1, 2           */
2475                                 x86_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 1);
2476                                 x86_shift_reg_imm (code, X86_SHL, ins->dreg, 2);
2477                                 break;
2478                         case 25:
2479                                 /* LEA r1, [r2 + r2*4] */
2480                                 /* LEA r1, [r1 + r1*4] */
2481                                 x86_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
2482                                 x86_lea_memindex (code, ins->dreg, ins->dreg, 0, ins->dreg, 2);
2483                                 break;
2484                         case 100:
2485                                 /* LEA r1, [r2 + r2*4] */
2486                                 /* SHL r1, 2           */
2487                                 /* LEA r1, [r1 + r1*4] */
2488                                 x86_lea_memindex (code, ins->dreg, ins->sreg1, 0, ins->sreg1, 2);
2489                                 x86_shift_reg_imm (code, X86_SHL, ins->dreg, 2);
2490                                 x86_lea_memindex (code, ins->dreg, ins->dreg, 0, ins->dreg, 2);
2491                                 break;
2492                         default:
2493                                 x86_imul_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_imm);
2494                                 break;
2495                         }
2496                         break;
2497                 case OP_IMUL_OVF:
2498                         x86_imul_reg_reg (code, ins->sreg1, ins->sreg2);
2499                         EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
2500                         break;
2501                 case OP_IMUL_OVF_UN: {
2502                         /* the mul operation and the exception check should most likely be split */
2503                         int non_eax_reg, saved_eax = FALSE, saved_edx = FALSE;
2504                         /*g_assert (ins->sreg2 == X86_EAX);
2505                         g_assert (ins->dreg == X86_EAX);*/
2506                         if (ins->sreg2 == X86_EAX) {
2507                                 non_eax_reg = ins->sreg1;
2508                         } else if (ins->sreg1 == X86_EAX) {
2509                                 non_eax_reg = ins->sreg2;
2510                         } else {
2511                                 /* no need to save since we're going to store to it anyway */
2512                                 if (ins->dreg != X86_EAX) {
2513                                         saved_eax = TRUE;
2514                                         x86_push_reg (code, X86_EAX);
2515                                 }
2516                                 x86_mov_reg_reg (code, X86_EAX, ins->sreg1, 4);
2517                                 non_eax_reg = ins->sreg2;
2518                         }
2519                         if (ins->dreg == X86_EDX) {
2520                                 if (!saved_eax) {
2521                                         saved_eax = TRUE;
2522                                         x86_push_reg (code, X86_EAX);
2523                                 }
2524                         } else if (ins->dreg != X86_EAX) {
2525                                 saved_edx = TRUE;
2526                                 x86_push_reg (code, X86_EDX);
2527                         }
2528                         x86_mul_reg (code, non_eax_reg, FALSE);
2529                         /* save before the check since pop and mov don't change the flags */
2530                         if (ins->dreg != X86_EAX)
2531                                 x86_mov_reg_reg (code, ins->dreg, X86_EAX, 4);
2532                         if (saved_edx)
2533                                 x86_pop_reg (code, X86_EDX);
2534                         if (saved_eax)
2535                                 x86_pop_reg (code, X86_EAX);
2536                         EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
2537                         break;
2538                 }
2539                 case OP_ICONST:
2540                         x86_mov_reg_imm (code, ins->dreg, ins->inst_c0);
2541                         break;
2542                 case OP_AOTCONST:
2543                         g_assert_not_reached ();
2544                         mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0);
2545                         x86_mov_reg_imm (code, ins->dreg, 0);
2546                         break;
2547                 case OP_JUMP_TABLE:
2548                         mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0);
2549                         x86_mov_reg_imm (code, ins->dreg, 0);
2550                         break;
2551                 case OP_LOAD_GOTADDR:
2552                         x86_call_imm (code, 0);
2553                         /* 
2554                          * The patch needs to point to the pop, since the GOT offset needs 
2555                          * to be added to that address.
2556                          */
2557                         mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_GOT_OFFSET, NULL);
2558                         x86_pop_reg (code, ins->dreg);
2559                         x86_alu_reg_imm (code, X86_ADD, ins->dreg, 0xf0f0f0f0);
2560                         break;
2561                 case OP_GOT_ENTRY:
2562                         mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_right->inst_i1, ins->inst_right->inst_p0);
2563                         x86_mov_reg_membase (code, ins->dreg, ins->inst_basereg, 0xf0f0f0f0, 4);
2564                         break;
2565                 case OP_X86_PUSH_GOT_ENTRY:
2566                         mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_right->inst_i1, ins->inst_right->inst_p0);
2567                         x86_push_membase (code, ins->inst_basereg, 0xf0f0f0f0);
2568                         break;
2569                 case OP_MOVE:
2570                         x86_mov_reg_reg (code, ins->dreg, ins->sreg1, 4);
2571                         break;
2572                 case OP_JMP: {
2573                         /*
2574                          * Note: this 'frame destruction' logic is useful for tail calls, too.
2575                          * Keep in sync with the code in emit_epilog.
2576                          */
2577                         int pos = 0;
2578
2579                         /* FIXME: no tracing support... */
2580                         if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
2581                                 code = mono_arch_instrument_epilog (cfg, mono_profiler_method_leave, code, FALSE);
2582                         /* reset offset to make max_len work */
2583                         offset = code - cfg->native_code;
2584
2585                         g_assert (!cfg->method->save_lmf);
2586
2587                         code = emit_load_volatile_arguments (cfg, code);
2588
2589                         if (cfg->used_int_regs & (1 << X86_EBX))
2590                                 pos -= 4;
2591                         if (cfg->used_int_regs & (1 << X86_EDI))
2592                                 pos -= 4;
2593                         if (cfg->used_int_regs & (1 << X86_ESI))
2594                                 pos -= 4;
2595                         if (pos)
2596                                 x86_lea_membase (code, X86_ESP, X86_EBP, pos);
2597         
2598                         if (cfg->used_int_regs & (1 << X86_ESI))
2599                                 x86_pop_reg (code, X86_ESI);
2600                         if (cfg->used_int_regs & (1 << X86_EDI))
2601                                 x86_pop_reg (code, X86_EDI);
2602                         if (cfg->used_int_regs & (1 << X86_EBX))
2603                                 x86_pop_reg (code, X86_EBX);
2604         
2605                         /* restore ESP/EBP */
2606                         x86_leave (code);
2607                         offset = code - cfg->native_code;
2608                         mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_METHOD_JUMP, ins->inst_p0);
2609                         x86_jump32 (code, 0);
2610
2611                         cfg->disable_aot = TRUE;
2612                         break;
2613                 }
2614                 case OP_CHECK_THIS:
2615                         /* ensure ins->sreg1 is not NULL
2616                          * note that cmp DWORD PTR [eax], eax is one byte shorter than
2617                          * cmp DWORD PTR [eax], 0
2618                          */
2619                         x86_alu_membase_reg (code, X86_CMP, ins->sreg1, 0, ins->sreg1);
2620                         break;
2621                 case OP_ARGLIST: {
2622                         int hreg = ins->sreg1 == X86_EAX? X86_ECX: X86_EAX;
2623                         x86_push_reg (code, hreg);
2624                         x86_lea_membase (code, hreg, X86_EBP, cfg->sig_cookie);
2625                         x86_mov_membase_reg (code, ins->sreg1, 0, hreg, 4);
2626                         x86_pop_reg (code, hreg);
2627                         break;
2628                 }
2629                 case OP_FCALL:
2630                 case OP_LCALL:
2631                 case OP_VCALL:
2632                 case OP_VCALL2:
2633                 case OP_VOIDCALL:
2634                 case OP_CALL:
2635                         call = (MonoCallInst*)ins;
2636                         if (ins->flags & MONO_INST_HAS_METHOD)
2637                                 code = emit_call (cfg, code, MONO_PATCH_INFO_METHOD, call->method);
2638                         else
2639                                 code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, call->fptr);
2640                         if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature)) {
2641                                 /* a pop is one byte, while an add reg, imm is 3. So if there are 4 or 8
2642                                  * bytes to pop, we want to use pops. GCC does this (note it won't happen
2643                                  * for P4 or i686 because gcc will avoid using pop push at all. But we aren't
2644                                  * smart enough to do that optimization yet
2645                                  *
2646                                  * It turns out that on my P4, doing two pops for 8 bytes on the stack makes
2647                                  * mcs bootstrap slow down. However, doing 1 pop for 4 bytes creates a small
2648                                  * speedup (most likely from locality benefits). People with other processors
2649                                  * should check on theirs to see what happens.
2650                                  */
2651                                 if (call->stack_usage == 4) {
2652                                         /* we want to use registers that won't get used soon, so use
2653                                          * ecx, as eax will get allocated first. edx is used by long calls,
2654                                          * so we can't use that.
2655                                          */
2656                                         
2657                                         x86_pop_reg (code, X86_ECX);
2658                                 } else {
2659                                         x86_alu_reg_imm (code, X86_ADD, X86_ESP, call->stack_usage);
2660                                 }
2661                         }
2662                         code = emit_move_return_value (cfg, ins, code);
2663                         break;
2664                 case OP_FCALL_REG:
2665                 case OP_LCALL_REG:
2666                 case OP_VCALL_REG:
2667                 case OP_VCALL2_REG:
2668                 case OP_VOIDCALL_REG:
2669                 case OP_CALL_REG:
2670                         call = (MonoCallInst*)ins;
2671                         x86_call_reg (code, ins->sreg1);
2672                         if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature)) {
2673                                 if (call->stack_usage == 4)
2674                                         x86_pop_reg (code, X86_ECX);
2675                                 else
2676                                         x86_alu_reg_imm (code, X86_ADD, X86_ESP, call->stack_usage);
2677                         }
2678                         code = emit_move_return_value (cfg, ins, code);
2679                         break;
2680                 case OP_FCALL_MEMBASE:
2681                 case OP_LCALL_MEMBASE:
2682                 case OP_VCALL_MEMBASE:
2683                 case OP_VCALL2_MEMBASE:
2684                 case OP_VOIDCALL_MEMBASE:
2685                 case OP_CALL_MEMBASE:
2686                         call = (MonoCallInst*)ins;
2687                         x86_call_membase (code, ins->sreg1, ins->inst_offset);
2688                         if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature)) {
2689                                 if (call->stack_usage == 4)
2690                                         x86_pop_reg (code, X86_ECX);
2691                                 else
2692                                         x86_alu_reg_imm (code, X86_ADD, X86_ESP, call->stack_usage);
2693                         }
2694                         code = emit_move_return_value (cfg, ins, code);
2695                         break;
2696                 case OP_X86_PUSH:
2697                         x86_push_reg (code, ins->sreg1);
2698                         break;
2699                 case OP_X86_PUSH_IMM:
2700                         x86_push_imm (code, ins->inst_imm);
2701                         break;
2702                 case OP_X86_PUSH_MEMBASE:
2703                         x86_push_membase (code, ins->inst_basereg, ins->inst_offset);
2704                         break;
2705                 case OP_X86_PUSH_OBJ: 
2706                         x86_alu_reg_imm (code, X86_SUB, X86_ESP, ins->inst_imm);
2707                         x86_push_reg (code, X86_EDI);
2708                         x86_push_reg (code, X86_ESI);
2709                         x86_push_reg (code, X86_ECX);
2710                         if (ins->inst_offset)
2711                                 x86_lea_membase (code, X86_ESI, ins->inst_basereg, ins->inst_offset);
2712                         else
2713                                 x86_mov_reg_reg (code, X86_ESI, ins->inst_basereg, 4);
2714                         x86_lea_membase (code, X86_EDI, X86_ESP, 12);
2715                         x86_mov_reg_imm (code, X86_ECX, (ins->inst_imm >> 2));
2716                         x86_cld (code);
2717                         x86_prefix (code, X86_REP_PREFIX);
2718                         x86_movsd (code);
2719                         x86_pop_reg (code, X86_ECX);
2720                         x86_pop_reg (code, X86_ESI);
2721                         x86_pop_reg (code, X86_EDI);
2722                         break;
2723                 case OP_X86_LEA:
2724                         x86_lea_memindex (code, ins->dreg, ins->sreg1, ins->inst_imm, ins->sreg2, ins->backend.shift_amount);
2725                         break;
2726                 case OP_X86_LEA_MEMBASE:
2727                         x86_lea_membase (code, ins->dreg, ins->sreg1, ins->inst_imm);
2728                         break;
2729                 case OP_X86_XCHG:
2730                         x86_xchg_reg_reg (code, ins->sreg1, ins->sreg2, 4);
2731                         break;
2732                 case OP_LOCALLOC:
2733                         /* keep alignment */
2734                         x86_alu_reg_imm (code, X86_ADD, ins->sreg1, MONO_ARCH_LOCALLOC_ALIGNMENT - 1);
2735                         x86_alu_reg_imm (code, X86_AND, ins->sreg1, ~(MONO_ARCH_LOCALLOC_ALIGNMENT - 1));
2736                         code = mono_emit_stack_alloc (code, ins);
2737                         x86_mov_reg_reg (code, ins->dreg, X86_ESP, 4);
2738                         break;
2739                 case OP_LOCALLOC_IMM: {
2740                         guint32 size = ins->inst_imm;
2741                         size = (size + (MONO_ARCH_FRAME_ALIGNMENT - 1)) & ~ (MONO_ARCH_FRAME_ALIGNMENT - 1);
2742
2743                         if (ins->flags & MONO_INST_INIT) {
2744                                 /* FIXME: Optimize this */
2745                                 x86_mov_reg_imm (code, ins->dreg, size);
2746                                 ins->sreg1 = ins->dreg;
2747
2748                                 code = mono_emit_stack_alloc (code, ins);
2749                                 x86_mov_reg_reg (code, ins->dreg, X86_ESP, 4);
2750                         } else {
2751                                 x86_alu_reg_imm (code, X86_SUB, X86_ESP, size);
2752                                 x86_mov_reg_reg (code, ins->dreg, X86_ESP, 4);
2753                         }
2754                         break;
2755                 }
2756                 case OP_THROW: {
2757                         x86_push_reg (code, ins->sreg1);
2758                         code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
2759                                                           (gpointer)"mono_arch_throw_exception");
2760                         break;
2761                 }
2762                 case OP_RETHROW: {
2763                         x86_push_reg (code, ins->sreg1);
2764                         code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
2765                                                           (gpointer)"mono_arch_rethrow_exception");
2766                         break;
2767                 }
		case OP_CALL_HANDLER:
			/* Invoke an EH handler block with a near call (so the handler's
			 * 'ret' — see OP_ENDFINALLY/OP_ENDFILTER — resumes here), keeping
			 * the stack aligned around the pushed return address. The call
			 * target is fixed up later via the BB patch. */
			x86_alu_reg_imm (code, X86_SUB, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - 4);
			mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_target_bb);
			x86_call_imm (code, 0);
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - 4);
			break;
		case OP_START_HANDLER: {
			/* Save ESP into this EH region's spill variable so the matching
			 * ENDFINALLY/ENDFILTER can restore it before returning. */
			MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
			x86_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, X86_ESP, 4);
			break;
		}
		case OP_ENDFINALLY: {
			/* Restore the ESP saved by OP_START_HANDLER and return to the
			 * OP_CALL_HANDLER call site. */
			MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
			x86_mov_reg_membase (code, X86_ESP, spvar->inst_basereg, spvar->inst_offset, 4);
			x86_ret (code);
			break;
		}
		case OP_ENDFILTER: {
			/* Like ENDFINALLY, but the filter's verdict is returned in EAX. */
			MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
			x86_mov_reg_membase (code, X86_ESP, spvar->inst_basereg, spvar->inst_offset, 4);
			/* The local allocator will put the result into EAX */
			x86_ret (code);
			break;
		}
2792
		case OP_LABEL:
			/* Record this label's native offset for later branch patching. */
			ins->inst_c0 = code - cfg->native_code;
			break;
		case OP_BR:
			if (ins->flags & MONO_INST_BRLABEL) {
				/* Target is a label instruction. */
				if (ins->inst_i0->inst_c0) {
					/* Backward branch: the label is already emitted, jump
					 * straight to its native offset. */
					x86_jump_code (code, cfg->native_code + ins->inst_i0->inst_c0);
				} else {
					/* Forward branch: record a patch and use the short
					 * (8-bit) jump when the estimated distance permits. */
					mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_LABEL, ins->inst_i0);
					if ((cfg->opt & MONO_OPT_BRANCH) &&
					    x86_is_imm8 (ins->inst_i0->inst_c1 - cpos))
						x86_jump8 (code, 0);
					else 
						x86_jump32 (code, 0);
				}
			} else {
				/* Target is a basic block; same backward/forward split. */
				if (ins->inst_target_bb->native_offset) {
					x86_jump_code (code, cfg->native_code + ins->inst_target_bb->native_offset); 
				} else {
					mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb);
					if ((cfg->opt & MONO_OPT_BRANCH) &&
					    x86_is_imm8 (ins->inst_target_bb->max_offset - cpos))
						x86_jump8 (code, 0);
					else 
						x86_jump32 (code, 0);
				} 
			}
			break;
		case OP_BR_REG:
			/* Indirect jump through a register. */
			x86_jump_reg (code, ins->sreg1);
			break;
		/* Materialize a comparison result: setcc the low byte of dreg from
		 * the flags of the preceding compare, then zero-extend to 32 bits.
		 * cc_table/cc_signed_table map the IL condition to an x86 condition
		 * code and its signedness. */
		case OP_CEQ:
		case OP_CLT:
		case OP_CLT_UN:
		case OP_CGT:
		case OP_CGT_UN:
		case OP_CNE:
		case OP_ICEQ:
		case OP_ICLT:
		case OP_ICLT_UN:
		case OP_ICGT:
		case OP_ICGT_UN:
			x86_set_reg (code, cc_table [mono_opcode_to_cond (ins->opcode)], ins->dreg, cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
			x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
			break;
		/* Conditional exceptions: emit (via EMIT_COND_SYSTEM_EXCEPTION) code
		 * that raises the exception named by inst_p1 when the condition from
		 * the preceding compare holds. */
		case OP_COND_EXC_EQ:
		case OP_COND_EXC_NE_UN:
		case OP_COND_EXC_LT:
		case OP_COND_EXC_LT_UN:
		case OP_COND_EXC_GT:
		case OP_COND_EXC_GT_UN:
		case OP_COND_EXC_GE:
		case OP_COND_EXC_GE_UN:
		case OP_COND_EXC_LE:
		case OP_COND_EXC_LE_UN:
		case OP_COND_EXC_IEQ:
		case OP_COND_EXC_INE_UN:
		case OP_COND_EXC_ILT:
		case OP_COND_EXC_ILT_UN:
		case OP_COND_EXC_IGT:
		case OP_COND_EXC_IGT_UN:
		case OP_COND_EXC_IGE:
		case OP_COND_EXC_IGE_UN:
		case OP_COND_EXC_ILE:
		case OP_COND_EXC_ILE_UN:
			EMIT_COND_SYSTEM_EXCEPTION (cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)], ins->inst_p1);
			break;
		/* Overflow/carry variants index branch_cc_table by the opcode's
		 * offset from the first opcode of its group. */
		case OP_COND_EXC_OV:
		case OP_COND_EXC_NO:
		case OP_COND_EXC_C:
		case OP_COND_EXC_NC:
			EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_EQ], (ins->opcode < OP_COND_EXC_NE_UN), ins->inst_p1);
			break;
		case OP_COND_EXC_IOV:
		case OP_COND_EXC_INO:
		case OP_COND_EXC_IC:
		case OP_COND_EXC_INC:
			EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_IEQ], (ins->opcode < OP_COND_EXC_INE_UN), ins->inst_p1);
			break;
		/* Conditional branches on integer compares. */
		case OP_IBEQ:
		case OP_IBNE_UN:
		case OP_IBLT:
		case OP_IBLT_UN:
		case OP_IBGT:
		case OP_IBGT_UN:
		case OP_IBGE:
		case OP_IBGE_UN:
		case OP_IBLE:
		case OP_IBLE_UN:
			EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
			break;

		/* Conditional moves; two-address form, so dreg must alias sreg1. */
		case OP_CMOV_IEQ:
		case OP_CMOV_IGE:
		case OP_CMOV_IGT:
		case OP_CMOV_ILE:
		case OP_CMOV_ILT:
		case OP_CMOV_INE_UN:
		case OP_CMOV_IGE_UN:
		case OP_CMOV_IGT_UN:
		case OP_CMOV_ILE_UN:
		case OP_CMOV_ILT_UN:
			g_assert (ins->dreg == ins->sreg1);
			x86_cmov_reg (code, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)], ins->dreg, ins->sreg2);
			break;
2898
		/* floating point opcodes */
		case OP_R8CONST: {
			double d = *(double *)ins->inst_p0;

			/* +0.0 and 1.0 have dedicated one-instruction loads; the
			 * signbit check keeps -0.0 from being folded into fldz. */
			if ((d == 0.0) && (mono_signbit (d) == 0)) {
				x86_fldz (code);
			} else if (d == 1.0) {
				x86_fld1 (code);
			} else {
				if (cfg->compile_aot) {
					/* AOT: materialize the constant on the stack so no
					 * data-address relocation is needed.
					 * NOTE(review): type-punning via pointer cast. */
					guint32 *val = (guint32*)&d;
					x86_push_imm (code, val [1]);
					x86_push_imm (code, val [0]);
					x86_fld_membase (code, X86_ESP, 0, TRUE);
					x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
				}
				else {
					/* JIT: load from the constant's address, fixed up
					 * through an R8 patch. */
					mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_R8, ins->inst_p0);
					x86_fld (code, NULL, TRUE);
				}
			}
			break;
		}
		case OP_R4CONST: {
			/* Single-precision analogue of OP_R8CONST. */
			float f = *(float *)ins->inst_p0;

			if ((f == 0.0) && (mono_signbit (f) == 0)) {
				x86_fldz (code);
			} else if (f == 1.0) {
				x86_fld1 (code);
			} else {
				if (cfg->compile_aot) {
					guint32 val = *(guint32*)&f;
					x86_push_imm (code, val);
					x86_fld_membase (code, X86_ESP, 0, FALSE);
					x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
				}
				else {
					mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_R4, ins->inst_p0);
					x86_fld (code, NULL, FALSE);
				}
			}
			break;
		}
		case OP_STORER8_MEMBASE_REG:
			/* Store ST(0) as a double and pop the fp stack. */
			x86_fst_membase (code, ins->inst_destbasereg, ins->inst_offset, TRUE, TRUE);
			break;
		case OP_LOADR8_SPILL_MEMBASE:
			/* Reload a spilled double, then swap with ST(1) to restore the
			 * expected fp-stack order. */
			x86_fld_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
			x86_fxch (code, 1);
			break;
		case OP_LOADR8_MEMBASE:
			x86_fld_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
			break;
		case OP_STORER4_MEMBASE_REG:
			/* Single-precision store (rounds ST(0) to float) and pop. */
			x86_fst_membase (code, ins->inst_destbasereg, ins->inst_offset, FALSE, TRUE);
			break;
		case OP_LOADR4_MEMBASE:
			x86_fld_membase (code, ins->inst_basereg, ins->inst_offset, FALSE);
			break;
		case OP_ICONV_TO_R4: /* FIXME: change precision */
		case OP_ICONV_TO_R8:
			/* Signed int -> fp: go through memory since fild has no
			 * register form. */
			x86_push_reg (code, ins->sreg1);
			x86_fild_membase (code, X86_ESP, 0, FALSE);
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
			break;
		case OP_ICONV_TO_R_UN:
			/* Unsigned int -> fp: zero-extend to 64 bits on the stack and
			 * fild as a signed 64-bit value, which is always in range. */
			x86_push_imm (code, 0);
			x86_push_reg (code, ins->sreg1);
			x86_fild_membase (code, X86_ESP, 0, TRUE);
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
			break;
		case OP_X86_FP_LOAD_I8:
			x86_fild_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
			break;
		case OP_X86_FP_LOAD_I4:
			x86_fild_membase (code, ins->inst_basereg, ins->inst_offset, FALSE);
			break;
		case OP_FCONV_TO_R4:
			/* FIXME: nothing to do ?? */
			break;
		/* Narrowing fp -> int conversions go through the shared helper;
		 * args: destination reg, size in bytes, signedness. */
		case OP_FCONV_TO_I1:
			code = emit_float_to_int (cfg, code, ins->dreg, 1, TRUE);
			break;
		case OP_FCONV_TO_U1:
			code = emit_float_to_int (cfg, code, ins->dreg, 1, FALSE);
			break;
		case OP_FCONV_TO_I2:
			code = emit_float_to_int (cfg, code, ins->dreg, 2, TRUE);
			break;
		case OP_FCONV_TO_U2:
			code = emit_float_to_int (cfg, code, ins->dreg, 2, FALSE);
			break;
		case OP_FCONV_TO_I4:
		case OP_FCONV_TO_I:
			code = emit_float_to_int (cfg, code, ins->dreg, 4, TRUE);
			break;
		case OP_FCONV_TO_I8:
			/* fp -> int64: save the FPU control word, OR in 0xC00 to set
			 * round-toward-zero, fistp the 64-bit result, pop the low word
			 * into dreg and the high word into backend.reg3, then restore
			 * the original control word. */
			x86_alu_reg_imm (code, X86_SUB, X86_ESP, 4);
			x86_fnstcw_membase(code, X86_ESP, 0);
			x86_mov_reg_membase (code, ins->dreg, X86_ESP, 0, 2);
			x86_alu_reg_imm (code, X86_OR, ins->dreg, 0xc00);
			x86_mov_membase_reg (code, X86_ESP, 2, ins->dreg, 2);
			x86_fldcw_membase (code, X86_ESP, 2);
			x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);
			x86_fist_pop_membase (code, X86_ESP, 0, TRUE);
			x86_pop_reg (code, ins->dreg);
			x86_pop_reg (code, ins->backend.reg3);
			x86_fldcw_membase (code, X86_ESP, 0);
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
			break;
		case OP_LCONV_TO_R8_2:
			/* int64 (sreg1 = low word, sreg2 = high word) -> double. */
			x86_push_reg (code, ins->sreg2);
			x86_push_reg (code, ins->sreg1);
			x86_fild_membase (code, X86_ESP, 0, TRUE);
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
			break;
		case OP_LCONV_TO_R4_2:
			/* Same as above, but round-trip through a 4-byte slot to drop
			 * the result to single precision. */
			x86_push_reg (code, ins->sreg2);
			x86_push_reg (code, ins->sreg1);
			x86_fild_membase (code, X86_ESP, 0, TRUE);
			/* Change precision */
			x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE);
			x86_fld_membase (code, X86_ESP, 0, FALSE);
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
			break;
		case OP_LCONV_TO_R_UN:
		case OP_LCONV_TO_R_UN_2: { 
			/* mn is 2^64 encoded as an 80-bit extended double (exponent
			 * 0x403F, significand 0x8000000000000000). fild treats the
			 * 64-bit value as signed, so when the top bit is set we add
			 * 2^64 to recover the unsigned magnitude. */
			static guint8 mn[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x40 };
			guint8 *br;

			/* load 64bit integer to FP stack */
			x86_push_imm (code, 0);
			x86_push_reg (code, ins->sreg2);
			x86_push_reg (code, ins->sreg1);
			x86_fild_membase (code, X86_ESP, 0, TRUE);
			/* store as 80bit FP value */
			x86_fst80_membase (code, X86_ESP, 0);
			
			/* test if lreg is negative (high word sign bit set) */
			x86_test_reg_reg (code, ins->sreg2, ins->sreg2);
			br = code; x86_branch8 (code, X86_CC_GEZ, 0, TRUE);
	
			/* add correction constant mn */
			x86_fld80_mem (code, mn);
			x86_fld80_membase (code, X86_ESP, 0);
			x86_fp_op_reg (code, X86_FADD, 1, TRUE);
			x86_fst80_membase (code, X86_ESP, 0);

			x86_patch (br, code);

			/* Reload the (possibly corrected) value and drop the 12-byte
			 * scratch area. */
			x86_fld80_membase (code, X86_ESP, 0);
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 12);

			break;
		}
		case OP_LCONV_TO_OVF_I:
		case OP_LCONV_TO_OVF_I4_2: {
			guint8 *br [3], *label [1];
			MonoInst *tins;

			/* 
			 * Checked long -> int: the 64-bit value (sreg1 = low word,
			 * sreg2 = high word) must fit in a signed 32-bit int, i.e. the
			 * high word must be the sign extension of the low word.
			 * Valid range: 0xFFFFFFFF:80000000 .. 0x00000000:7FFFFFFF.
			 */
			x86_test_reg_reg (code, ins->sreg1, ins->sreg1);

			/* If the low word top bit is set, see if we are negative */
			br [0] = code; x86_branch8 (code, X86_CC_LT, 0, TRUE);
			/* We are not negative (no top bit set); check that the top word is zero */
			x86_test_reg_reg (code, ins->sreg2, ins->sreg2);
			br [1] = code; x86_branch8 (code, X86_CC_EQ, 0, TRUE);
			label [0] = code;

			/* throw exception */
			tins = mono_branch_optimize_exception_target (cfg, bb, "OverflowException");
			if (tins) {
				/* Branch straight to an existing handler basic block. */
				mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, tins->inst_true_bb);
				if ((cfg->opt & MONO_OPT_BRANCH) && x86_is_imm8 (tins->inst_true_bb->max_offset - cpos))
					x86_jump8 (code, 0);
				else
					x86_jump32 (code, 0);
			} else {
				mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_EXC, "OverflowException");
				x86_jump32 (code, 0);
			}
	
	
			x86_patch (br [0], code);
			/* our top bit is set, check that top word is 0xffffffff */
			x86_alu_reg_imm (code, X86_CMP, ins->sreg2, 0xffffffff);
		
			/* br [1] lands here, AFTER the cmp: ZF is still set from the
			 * earlier sreg2 test, so the JNE below falls through for the
			 * zero-high-word case — the flags are deliberately reused. */
			x86_patch (br [1], code);
			/* nope, emit exception */
			br [2] = code; x86_branch8 (code, X86_CC_NE, 0, TRUE);
			x86_patch (br [2], label [0]);

			/* In range: the result is just the low word. */
			if (ins->dreg != ins->sreg1)
				x86_mov_reg_reg (code, ins->dreg, ins->sreg1, 4);
			break;
		}
		case OP_FMOVE:
			/* Not needed on the fp stack */
			break;
		/* Binary fp ops combine ST(0) and ST(1), popping one entry. */
		case OP_FADD:
			x86_fp_op_reg (code, X86_FADD, 1, TRUE);
			break;
		case OP_FSUB:
			x86_fp_op_reg (code, X86_FSUB, 1, TRUE);
			break;		
		case OP_FMUL:
			x86_fp_op_reg (code, X86_FMUL, 1, TRUE);
			break;		
		case OP_FDIV:
			x86_fp_op_reg (code, X86_FDIV, 1, TRUE);
			break;		
		case OP_FNEG:
			x86_fchs (code);
			break;		
		case OP_SIN:
			x86_fsin (code);
			/* NOTE(review): the fldz+fadd pair after fsin/fcos/fpatan looks
			 * like a result-normalization fixup — confirm its purpose. */
			x86_fldz (code);
			x86_fp_op_reg (code, X86_FADD, 1, TRUE);
			break;		
		case OP_COS:
			x86_fcos (code);
			x86_fldz (code);
			x86_fp_op_reg (code, X86_FADD, 1, TRUE);
			break;		
		case OP_ABS:
			x86_fabs (code);
			break;		
		case OP_TAN: {
			/* 
			 * it really doesn't make sense to inline all this code,
			 * it's here just to show that things may not be as simple 
			 * as they appear.
			 */
			guchar *check_pos, *end_tan, *pop_jump;
			x86_push_reg (code, X86_EAX);
			x86_fptan (code);
			x86_fnstsw (code);
			/* C2 set in the status word means the operand was out of
			 * fptan's range: reduce it with fprem1 before retrying. */
			x86_test_reg_imm (code, X86_EAX, X86_FP_C2);
			check_pos = code;
			x86_branch8 (code, X86_CC_NE, 0, FALSE);
			x86_fstp (code, 0); /* pop the 1.0 */
			end_tan = code;
			x86_jump8 (code, 0);
			/* Out-of-range path: build the reduction modulus (presumably
			 * pi + pi = 2*pi) and run fprem1. */
			x86_fldpi (code);
			x86_fp_op (code, X86_FADD, 0);
			x86_fxch (code, 1);
			x86_fprem1 (code);
			x86_fstsw (code);
			x86_test_reg_imm (code, X86_EAX, X86_FP_C2);
			pop_jump = code;
			x86_branch8 (code, X86_CC_NE, 0, FALSE);
			x86_fstp (code, 1);
			x86_fptan (code);
			x86_patch (pop_jump, code);
			x86_fstp (code, 0); /* pop the 1.0 */
			x86_patch (check_pos, code);
			x86_patch (end_tan, code);
			x86_fldz (code);
			x86_fp_op_reg (code, X86_FADD, 1, TRUE);
			x86_pop_reg (code, X86_EAX);
			break;
		}
		case OP_ATAN:
			/* fpatan computes atan(ST(1)/ST(0)); loading 1.0 first gives
			 * plain atan of the operand. */
			x86_fld1 (code);
			x86_fpatan (code);
			x86_fldz (code);
			x86_fp_op_reg (code, X86_FADD, 1, TRUE);
			break;		
		case OP_SQRT:
			x86_fsqrt (code);
			break;
		case OP_ROUND:
			x86_frndint (code);
			break;
		/* Integer min/max: only emitted when cmov is available; all four
		 * use the two-address form (dreg aliases sreg1) and conditionally
		 * replace it with sreg2. The third x86_cmov_reg argument selects
		 * signed (TRUE) vs unsigned (FALSE) condition codes. */
		case OP_IMIN:
			g_assert (cfg->opt & MONO_OPT_CMOV);
			g_assert (ins->dreg == ins->sreg1);
			x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
			x86_cmov_reg (code, X86_CC_GT, TRUE, ins->dreg, ins->sreg2);
			break;
		case OP_IMIN_UN:
			g_assert (cfg->opt & MONO_OPT_CMOV);
			g_assert (ins->dreg == ins->sreg1);
			x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
			x86_cmov_reg (code, X86_CC_GT, FALSE, ins->dreg, ins->sreg2);
			break;
		case OP_IMAX:
			g_assert (cfg->opt & MONO_OPT_CMOV);
			g_assert (ins->dreg == ins->sreg1);
			x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
			x86_cmov_reg (code, X86_CC_LT, TRUE, ins->dreg, ins->sreg2);
			break;
		case OP_IMAX_UN:
			g_assert (cfg->opt & MONO_OPT_CMOV);
			g_assert (ins->dreg == ins->sreg1);
			x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
			x86_cmov_reg (code, X86_CC_LT, FALSE, ins->dreg, ins->sreg2);
			break;
		case OP_X86_FPOP:
			/* Discard ST(0). */
			x86_fstp (code, 0);
			break;
		case OP_X86_FXCH:
			x86_fxch (code, ins->inst_imm);
			break;
		case OP_FREM: {
			guint8 *l1, *l2;

			x86_push_reg (code, X86_EAX);
			/* we need to exchange ST(0) with ST(1) */
			x86_fxch (code, 1);

			/* this requires a loop, because fprem sometimes 
			 * returns a partial remainder */
			l1 = code;
			/* looks like MS is using fprem instead of the IEEE compatible fprem1 */
			/* x86_fprem1 (code); */
			x86_fprem (code);
			x86_fnstsw (code);
			x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_C2);
			/* l2 points just past the 2-byte Jcc emitted below, so l1 - l2
			 * is the backward displacement: repeat fprem while C2 (partial
			 * remainder) is still set. */
			l2 = code + 2;
			x86_branch8 (code, X86_CC_NE, l1 - l2, FALSE);

			/* pop result */
			x86_fstp (code, 1);

			x86_pop_reg (code, X86_EAX);
			break;
		}
		case OP_FCOMPARE:
			if (cfg->opt & MONO_OPT_FCMOV) {
				/* fcomip sets EFLAGS directly; pop the leftover operand. */
				x86_fcomip (code, 1);
				x86_fstp (code, 0);
				break;
			}
			/* this overwrites EAX */
			EMIT_FPCOMPARE(code);
			x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK);
			break;
		case OP_FCEQ:
			if (cfg->opt & MONO_OPT_FCMOV) {
				/* zeroing the register at the start results in 
				 * shorter and faster code (we can also remove the widening op)
				 */
				guchar *unordered_check;
				x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
				x86_fcomip (code, 1);
				x86_fstp (code, 0);
				/* PF set => unordered (NaN): skip the setcc, leaving 0. */
				unordered_check = code;
				x86_branch8 (code, X86_CC_P, 0, FALSE);
				x86_set_reg (code, X86_CC_EQ, ins->dreg, FALSE);
				x86_patch (unordered_check, code);
				break;
			}
			if (ins->dreg != X86_EAX) 
				x86_push_reg (code, X86_EAX);

			/* Legacy path: compare the masked FPU status word in EAX;
			 * 0x4000 is the C3 bit, set on equality. */
			EMIT_FPCOMPARE(code);
			x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK);
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0x4000);
			x86_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
			x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);

			if (ins->dreg != X86_EAX) 
				x86_pop_reg (code, X86_EAX);
			break;
		case OP_FCLT:
		case OP_FCLT_UN:
			if (cfg->opt & MONO_OPT_FCMOV) {
				/* zeroing the register at the start results in 
				 * shorter and faster code (we can also remove the widening op)
				 */
				x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
				x86_fcomip (code, 1);
				x86_fstp (code, 0);
				if (ins->opcode == OP_FCLT_UN) {
					/* The _UN variant treats unordered (NaN, PF set) as
					 * true: skip the setcc and force dreg to 1 instead. */
					guchar *unordered_check = code;
					guchar *jump_to_end;
					x86_branch8 (code, X86_CC_P, 0, FALSE);
					x86_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
					jump_to_end = code;
					x86_jump8 (code, 0);
					x86_patch (unordered_check, code);
					x86_inc_reg (code, ins->dreg);
					x86_patch (jump_to_end, code);
				} else {
					/* NOTE(review): CC_GT for a 'less than' result — the
					 * operands appear reversed on the fp stack; confirm. */
					x86_set_reg (code, X86_CC_GT, ins->dreg, FALSE);
				}
				break;
			}
			if (ins->dreg != X86_EAX) 
				x86_push_reg (code, X86_EAX);

			/* Legacy path via the FPU status word in EAX. */
			EMIT_FPCOMPARE(code);
			x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK);
			if (ins->opcode == OP_FCLT_UN) {
				/* Also accept the unordered status pattern (all CC bits
				 * set) as a match. */
				guchar *is_not_zero_check, *end_jump;
				is_not_zero_check = code;
				x86_branch8 (code, X86_CC_NZ, 0, TRUE);
				end_jump = code;
				x86_jump8 (code, 0);
				x86_patch (is_not_zero_check, code);
				x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_CC_MASK);

				x86_patch (end_jump, code);
			}
			x86_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
			x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);

			if (ins->dreg != X86_EAX) 
				x86_pop_reg (code, X86_EAX);
			break;
		case OP_FCGT:
		case OP_FCGT_UN:
			if (cfg->opt & MONO_OPT_FCMOV) {
				/* zeroing the register at the start results in 
				 * shorter and faster code (we can also remove the widening op)
				 */
				guchar *unordered_check;
				x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
				x86_fcomip (code, 1);
				x86_fstp (code, 0);
				if (ins->opcode == OP_FCGT) {
					/* Ordered variant: unordered (NaN, PF set) must yield
					 * 0, so skip the setcc on PF. */
					unordered_check = code;
					x86_branch8 (code, X86_CC_P, 0, FALSE);
					x86_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
					x86_patch (unordered_check, code);
				} else {
					/* NOTE(review): CC_LT for a 'greater than' result — the
					 * operands appear reversed on the fp stack; confirm. */
					x86_set_reg (code, X86_CC_LT, ins->dreg, FALSE);
				}
				break;
			}
			if (ins->dreg != X86_EAX) 
				x86_push_reg (code, X86_EAX);

			/* Legacy path: compare the masked status word against C0. */
			EMIT_FPCOMPARE(code);
			x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK);
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0);
			if (ins->opcode == OP_FCGT_UN) {
				/* Also accept the unordered status pattern (all CC bits
				 * set) as a match. */
				guchar *is_not_zero_check, *end_jump;
				is_not_zero_check = code;
				x86_branch8 (code, X86_CC_NZ, 0, TRUE);
				end_jump = code;
				x86_jump8 (code, 0);
				x86_patch (is_not_zero_check, code);
				x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_CC_MASK);
	
				x86_patch (end_jump, code);
			}
			x86_set_reg (code, X86_CC_EQ, ins->dreg, TRUE);
			x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);

			if (ins->dreg != X86_EAX) 
				x86_pop_reg (code, X86_EAX);
			break;
		case OP_FBEQ:
			if (cfg->opt & MONO_OPT_FCMOV) {
				/* fcomip-style compare: PF is set when the operands were
				 * unordered (NaN), so skip the equality branch in that case */
				guchar *jump = code;
				x86_branch8 (code, X86_CC_P, 0, TRUE);
				EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
				x86_patch (jump, code);
				break;
			}
			/* fnstsw path: EAX holds the masked x87 status word from the
			 * preceding FP compare; 0x4000 is C3, set when operands are equal */
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0x4000);
			EMIT_COND_BRANCH (ins, X86_CC_EQ, TRUE);
			break;
		case OP_FBNE_UN:
			/* Branch if C013 != 100 */
			if (cfg->opt & MONO_OPT_FCMOV) {
				/* branch if !ZF or (PF|CF) */
				EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
				EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
				EMIT_COND_BRANCH (ins, X86_CC_B, FALSE);
				break;
			}
			/* C3 alone means "equal": branch on anything else (includes NaN) */
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C3);
			EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
			break;
		case OP_FBLT:
			if (cfg->opt & MONO_OPT_FCMOV) {
				/* NOTE(review): GT here suggests the compare emitter pushes the
				 * operands in reversed order for fcomip — confirm against the
				 * float compare lowering */
				EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE);
				break;
			}
			EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
			break;
		case OP_FBLT_UN:
			if (cfg->opt & MONO_OPT_FCMOV) {
				/* unordered (PF) also counts as "less than" for the _UN form */
				EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
				EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE);
				break;
			}
			if (ins->opcode == OP_FBLT_UN) {
				/* NOTE(review): this test is always true inside this case; it
				 * mirrors the shared shape of the OP_FBGT/OP_FBGT_UN handler */
				guchar *is_not_zero_check, *end_jump;
				/* if some condition bit is set, re-compare against the full
				 * CC mask so an unordered result (all bits set) also branches */
				is_not_zero_check = code;
				x86_branch8 (code, X86_CC_NZ, 0, TRUE);
				end_jump = code;
				x86_jump8 (code, 0);
				x86_patch (is_not_zero_check, code);
				x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_CC_MASK);

				x86_patch (end_jump, code);
			}
			EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
			break;
		case OP_FBGT:
		case OP_FBGT_UN:
			if (cfg->opt & MONO_OPT_FCMOV) {
				if (ins->opcode == OP_FBGT) {
					guchar *br1;

					/* skip branch if C1=1 */
					br1 = code;
					x86_branch8 (code, X86_CC_P, 0, FALSE);
					/* branch if (C0 | C3) = 1 */
					EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
					x86_patch (br1, code);
				} else {
					EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE);
				}
				break;
			}
			/* fnstsw path: C0 set means "greater than" */
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0);
			if (ins->opcode == OP_FBGT_UN) {
				/* for the unordered form, also branch when all CC bits are
				 * set (NaN); same zero-check/re-compare shape as OP_FBLT_UN */
				guchar *is_not_zero_check, *end_jump;
				is_not_zero_check = code;
				x86_branch8 (code, X86_CC_NZ, 0, TRUE);
				end_jump = code;
				x86_jump8 (code, 0);
				x86_patch (is_not_zero_check, code);
				x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_CC_MASK);

				x86_patch (end_jump, code);
			}
			EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
			break;
		case OP_FBGE:
			/* Branch if C013 == 100 or 001 */
			if (cfg->opt & MONO_OPT_FCMOV) {
				guchar *br1;

				/* skip branch if C1=1 */
				br1 = code;
				x86_branch8 (code, X86_CC_P, 0, FALSE);
				/* branch if (C0 | C3) = 1 */
				EMIT_COND_BRANCH (ins, X86_CC_BE, FALSE);
				x86_patch (br1, code);
				break;
			}
			/* branch when the status word equals C0 ("greater") or C3 ("equal") */
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0);
			EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C3);
			EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
			break;
		case OP_FBGE_UN:
			/* Branch if C013 == 000 */
			if (cfg->opt & MONO_OPT_FCMOV) {
				EMIT_COND_BRANCH (ins, X86_CC_LE, FALSE);
				break;
			}
			EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
			break;
		case OP_FBLE:
			/* Branch if C013=000 or 100 */
			if (cfg->opt & MONO_OPT_FCMOV) {
				guchar *br1;

				/* skip branch if C1=1 */
				br1 = code;
				x86_branch8 (code, X86_CC_P, 0, FALSE);
				/* branch if C0=0 */
				EMIT_COND_BRANCH (ins, X86_CC_NB, FALSE);
				x86_patch (br1, code);
				break;
			}
			/* mask to C0|C1 and branch when both are clear */
			x86_alu_reg_imm (code, X86_AND, X86_EAX, (X86_FP_C0|X86_FP_C1));
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0);
			EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE);
			break;
		case OP_FBLE_UN:
			/* Branch if C013 != 001 */
			if (cfg->opt & MONO_OPT_FCMOV) {
				EMIT_COND_BRANCH (ins, X86_CC_P, FALSE);
				EMIT_COND_BRANCH (ins, X86_CC_GE, FALSE);
				break;
			}
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0);
			EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE);
			break;
		case OP_CKFINITE: {
			/* Throw ArithmeticException unless st(0) is a finite number.
			 * fxam classifies st(0) into the C0/C2/C3 status bits; masking
			 * with 0x4100 (C3|C0) and comparing to C0 selects the classes
			 * with C0 set and C3 clear — NaN and infinity per fxam's table
			 * (see the Intel SDM). */
			guchar *br1;
			x86_push_reg (code, X86_EAX);
			x86_fxam (code);
			x86_fnstsw (code);
			x86_alu_reg_imm (code, X86_AND, X86_EAX, 0x4100);
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0);
			/* pop and fstp below don't touch EFLAGS, so the EQ/NE result of
			 * the CMP above is still valid at the exception check */
			x86_pop_reg (code, X86_EAX);

			/* Have to clean up the fp stack before throwing the exception */
			br1 = code;
			x86_branch8 (code, X86_CC_NE, 0, FALSE);

			x86_fstp (code, 0);			
			EMIT_COND_SYSTEM_EXCEPTION (X86_CC_EQ, FALSE, "ArithmeticException");

			x86_patch (br1, code);
			break;
		}
		case OP_TLS_GET: {
			/* Load the thread-local slot at inst_offset into dreg */
			code = mono_x86_emit_tls_get (code, ins->dreg, ins->inst_offset);
			break;
		}
		case OP_MEMORY_BARRIER: {
			/* Not needed on x86 */
			break;
		}
		case OP_ATOMIC_ADD_I4: {
			/* lock xadd: atomically add sreg2 to [basereg+offset]; the
			 * register operand receives the OLD memory value, which is the
			 * result of this opcode. */
			int dreg = ins->dreg;

			/* xadd clobbers its register operand, so if dreg aliases the
			 * base register, work in sreg2 instead (saved/restored below) */
			if (dreg == ins->inst_basereg) {
				x86_push_reg (code, ins->sreg2);
				dreg = ins->sreg2;
			} 
			
			/* put the addend into the xadd register operand; skipped when
			 * dreg was just redirected to sreg2 (value already there) */
			if (dreg != ins->sreg2)
				x86_mov_reg_reg (code, ins->dreg, ins->sreg2, 4);

			x86_prefix (code, X86_LOCK_PREFIX);
			x86_xadd_membase_reg (code, ins->inst_basereg, ins->inst_offset, dreg, 4);

			/* if we worked in a scratch register, move the old value into
			 * the real destination and restore the scratch */
			if (dreg != ins->dreg) {
				x86_mov_reg_reg (code, ins->dreg, dreg, 4);
				x86_pop_reg (code, dreg);
			}

			break;
		}
		case OP_ATOMIC_ADD_NEW_I4: {
			/* Like OP_ATOMIC_ADD_I4, but dreg receives the NEW value
			 * (old memory value + sreg2). */
			int dreg = ins->dreg;

			/* hack: limit in regalloc, dreg != sreg1 && dreg != sreg2 */
			/* pick a scratch register that clashes with neither sreg2 nor
			 * the base register, trying EBX/EDI/ESI in turn */
			if (ins->sreg2 == dreg) {
				if (dreg == X86_EBX) {
					dreg = X86_EDI;
					if (ins->inst_basereg == X86_EDI)
						dreg = X86_ESI;
				} else {
					dreg = X86_EBX;
					if (ins->inst_basereg == X86_EBX)
						dreg = X86_EDI;
				}
			} else if (ins->inst_basereg == dreg) {
				if (dreg == X86_EBX) {
					dreg = X86_EDI;
					if (ins->sreg2 == X86_EDI)
						dreg = X86_ESI;
				} else {
					dreg = X86_EBX;
					if (ins->sreg2 == X86_EBX)
						dreg = X86_EDI;
				}
			}

			/* scratch register is callee-visible state: save it */
			if (dreg != ins->dreg) {
				x86_push_reg (code, dreg);
			}

			x86_mov_reg_reg (code, dreg, ins->sreg2, 4);
			x86_prefix (code, X86_LOCK_PREFIX);
			x86_xadd_membase_reg (code, ins->inst_basereg, ins->inst_offset, dreg, 4);
			/* dreg contains the old value, add with sreg2 value */
			x86_alu_reg_reg (code, X86_ADD, dreg, ins->sreg2);
			
			if (ins->dreg != dreg) {
				x86_mov_reg_reg (code, ins->dreg, dreg, 4);
				x86_pop_reg (code, dreg);
			}

			break;
		}
		case OP_ATOMIC_EXCHANGE_I4:
		case OP_ATOMIC_CAS_IMM_I4: {
			guchar *br[2];
			int sreg2 = ins->sreg2;
			int breg = ins->inst_basereg;

			/* cmpxchg uses eax as comperand, need to make sure we can use it
			 * hack to overcome limits in x86 reg allocator 
			 * (req: dreg == eax and sreg2 != eax and breg != eax) 
			 */
			g_assert (ins->dreg == X86_EAX);
			
			/* We need the EAX reg for the cmpxchg */
			/* borrow EDX/ESI (saved below) when sreg2 or breg aliases EAX */
			if (ins->sreg2 == X86_EAX) {
				x86_push_reg (code, X86_EDX);
				x86_mov_reg_reg (code, X86_EDX, X86_EAX, 4);
				sreg2 = X86_EDX;
			}

			if (breg == X86_EAX) {
				x86_push_reg (code, X86_ESI);
				x86_mov_reg_reg (code, X86_ESI, X86_EAX, 4);
				breg = X86_ESI;
			}

			if (ins->opcode == OP_ATOMIC_CAS_IMM_I4) {
				/* CAS: the comperand is the immediate in backend.data, so a
				 * single lock cmpxchg suffices; EAX ends up holding the old
				 * memory value either way */
				x86_mov_reg_imm (code, X86_EAX, ins->backend.data);

				x86_prefix (code, X86_LOCK_PREFIX);
				x86_cmpxchg_membase_reg (code, breg, ins->inst_offset, sreg2);
			} else {
				/* exchange: load the current value into EAX, then retry the
				 * lock cmpxchg until it succeeds (NE = memory changed under us,
				 * loop back to br[0]) */
				x86_mov_reg_membase (code, X86_EAX, breg, ins->inst_offset, 4);

				br [0] = code; x86_prefix (code, X86_LOCK_PREFIX);
				x86_cmpxchg_membase_reg (code, breg, ins->inst_offset, sreg2);
				br [1] = code; x86_branch8 (code, X86_CC_NE, -1, FALSE);
				x86_patch (br [1], br [0]);
			}

			/* restore any register borrowed above (conditions mirror the pushes) */
			if (breg != ins->inst_basereg)
				x86_pop_reg (code, X86_ESI);

			if (ins->sreg2 != sreg2)
				x86_pop_reg (code, X86_EDX);

			break;
		}
#ifdef MONO_ARCH_SIMD_INTRINSICS
		/*
		 * SIMD intrinsics.  Binary ops are emitted with (sreg1, sreg2) and
		 * overwrite sreg1; unary ops take (dreg, sreg1).
		 * NOTE(review): binary ops never mention dreg — presumably the
		 * regalloc constrains dreg == sreg1 for these opcodes; confirm in
		 * the machine description.  The _ps/_pd/_ss/_sd helpers pick the
		 * SSE encoding prefix; the _sse41 helpers emit SSE4.1 encodings.
		 */
		/* packed-single float arithmetic */
		case OP_ADDPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_ADD, ins->sreg1, ins->sreg2);
			break;
		case OP_DIVPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_DIV, ins->sreg1, ins->sreg2);
			break;
		case OP_MULPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_MUL, ins->sreg1, ins->sreg2);
			break;
		case OP_SUBPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_SUB, ins->sreg1, ins->sreg2);
			break;
		case OP_MAXPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_MAX, ins->sreg1, ins->sreg2);
			break;
		case OP_MINPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_MIN, ins->sreg1, ins->sreg2);
			break;
		case OP_COMPPS:
			/* cmpps: inst_c0 is the 3-bit comparison predicate immediate */
			g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
			x86_sse_alu_ps_reg_reg_imm (code, X86_SSE_COMP, ins->sreg1, ins->sreg2, ins->inst_c0);
			break;
		case OP_ANDPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_AND, ins->sreg1, ins->sreg2);
			break;
		case OP_ANDNPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_ANDN, ins->sreg1, ins->sreg2);
			break;
		case OP_ORPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_OR, ins->sreg1, ins->sreg2);
			break;
		case OP_XORPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_XOR, ins->sreg1, ins->sreg2);
			break;
		/* unary packed-single ops: (dreg, sreg1) */
		case OP_SQRTPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_SQRT, ins->dreg, ins->sreg1);
			break;
		case OP_RSQRTPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_RSQRT, ins->dreg, ins->sreg1);
			break;
		case OP_RCPPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_RCP, ins->dreg, ins->sreg1);
			break;
		/* SSE3 horizontal/addsub ops (note: emitted via the sd-prefix helper) */
		case OP_ADDSUBPS:
			x86_sse_alu_sd_reg_reg (code, X86_SSE_ADDSUB, ins->sreg1, ins->sreg2);
			break;
		case OP_HADDPS:
			x86_sse_alu_sd_reg_reg (code, X86_SSE_HADD, ins->sreg1, ins->sreg2);
			break;
		case OP_HSUBPS:
			x86_sse_alu_sd_reg_reg (code, X86_SSE_HSUB, ins->sreg1, ins->sreg2);
			break;
		/* SSE3 element duplication (movshdup/movsldup, ss-prefix encoding) */
		case OP_DUPPS_HIGH:
			x86_sse_alu_ss_reg_reg (code, X86_SSE_MOVSHDUP, ins->dreg, ins->sreg1);
			break;
		case OP_DUPPS_LOW:
			x86_sse_alu_ss_reg_reg (code, X86_SSE_MOVSLDUP, ins->dreg, ins->sreg1);
			break;

		/* word/dword shuffles; inst_c0 is the 8-bit shuffle control */
		case OP_PSHUFLEW_HIGH:
			g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
			x86_pshufw_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0, 1);
			break;
		case OP_PSHUFLEW_LOW:
			g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
			x86_pshufw_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0, 0);
			break;
		case OP_PSHUFLED:
			g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
			x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->sreg1, ins->inst_c0);
			break;

		/* packed-double float arithmetic (mirrors the _PS group above) */
		case OP_ADDPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_ADD, ins->sreg1, ins->sreg2);
			break;
		case OP_DIVPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_DIV, ins->sreg1, ins->sreg2);
			break;
		case OP_MULPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_MUL, ins->sreg1, ins->sreg2);
			break;
		case OP_SUBPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_SUB, ins->sreg1, ins->sreg2);
			break;
		case OP_MAXPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_MAX, ins->sreg1, ins->sreg2);
			break;
		case OP_MINPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_MIN, ins->sreg1, ins->sreg2);
			break;
		case OP_COMPPD:
			/* cmppd: inst_c0 is the 3-bit comparison predicate immediate */
			g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
			x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_COMP, ins->sreg1, ins->sreg2, ins->inst_c0);
			break;
		case OP_ANDPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_AND, ins->sreg1, ins->sreg2);
			break;
		case OP_ANDNPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_ANDN, ins->sreg1, ins->sreg2);
			break;
		case OP_ORPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_OR, ins->sreg1, ins->sreg2);
			break;
		case OP_XORPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_XOR, ins->sreg1, ins->sreg2);
			break;
		case OP_ADDSUBPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_ADDSUB, ins->sreg1, ins->sreg2);
			break;
		case OP_HADDPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_HADD, ins->sreg1, ins->sreg2);
			break;
		case OP_HSUBPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_HSUB, ins->sreg1, ins->sreg2);
			break;
		case OP_DUPPD:
			x86_sse_alu_sd_reg_reg (code, X86_SSE_MOVDDUP, ins->dreg, ins->sreg1);
			break;
			
		/* pmovmskb: byte sign-mask into a general-purpose register */
		case OP_EXTRACT_MASK:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMOVMSKB, ins->dreg, ins->sreg1);
			break;
	
		/* packed integer bitwise ops */
		case OP_PAND:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PAND, ins->sreg1, ins->sreg2);
			break;
		case OP_POR:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_POR, ins->sreg1, ins->sreg2);
			break;
		case OP_PXOR:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PXOR, ins->sreg1, ins->sreg2);
			break;

		/* packed integer add/sub, per element width */
		case OP_PADDB:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDB, ins->sreg1, ins->sreg2);
			break;
		case OP_PADDW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDW, ins->sreg1, ins->sreg2);
			break;
		case OP_PADDD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDD, ins->sreg1, ins->sreg2);
			break;
		case OP_PADDQ:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDQ, ins->sreg1, ins->sreg2);
			break;

		case OP_PSUBB:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBB, ins->sreg1, ins->sreg2);
			break;
		case OP_PSUBW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBW, ins->sreg1, ins->sreg2);
			break;
		case OP_PSUBD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBD, ins->sreg1, ins->sreg2);
			break;
		case OP_PSUBQ:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBQ, ins->sreg1, ins->sreg2);
			break;

		/* unsigned max (pmaxub is SSE2; word/dword variants need SSE4.1) */
		case OP_PMAXB_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMAXUB, ins->sreg1, ins->sreg2);
			break;
		case OP_PMAXW_UN:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMAXUW, ins->sreg1, ins->sreg2);
			break;
		case OP_PMAXD_UN:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMAXUD, ins->sreg1, ins->sreg2);
			break;
		
		/* signed max (pmaxsw is SSE2; byte/dword variants need SSE4.1) */
		case OP_PMAXB:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMAXSB, ins->sreg1, ins->sreg2);
			break;
		case OP_PMAXW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMAXSW, ins->sreg1, ins->sreg2);
			break;
		case OP_PMAXD:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMAXSD, ins->sreg1, ins->sreg2);
			break;

		/* unsigned rounding average */
		case OP_PAVGB_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PAVGB, ins->sreg1, ins->sreg2);
			break;
		case OP_PAVGW_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PAVGW, ins->sreg1, ins->sreg2);
			break;

		/* unsigned min (pminub is SSE2; word/dword variants need SSE4.1) */
		case OP_PMINB_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMINUB, ins->sreg1, ins->sreg2);
			break;
		case OP_PMINW_UN:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMINUW, ins->sreg1, ins->sreg2);
			break;
		case OP_PMIND_UN:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMINUD, ins->sreg1, ins->sreg2);
			break;

		/* signed min (pminsw is SSE2; byte/dword variants need SSE4.1) */
		case OP_PMINB:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMINSB, ins->sreg1, ins->sreg2);
			break;
		case OP_PMINW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMINSW, ins->sreg1, ins->sreg2);
			break;
		case OP_PMIND:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMINSD, ins->sreg1, ins->sreg2);
			break;

		/* packed integer equality compare (pcmpeqq needs SSE4.1) */
		case OP_PCMPEQB:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPEQB, ins->sreg1, ins->sreg2);
			break;
		case OP_PCMPEQW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPEQW, ins->sreg1, ins->sreg2);
			break;
		case OP_PCMPEQD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPEQD, ins->sreg1, ins->sreg2);
			break;
		case OP_PCMPEQQ:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PCMPEQQ, ins->sreg1, ins->sreg2);
			break;

		/* packed signed greater-than compare (pcmpgtq uses the sse41-style
		 * encoding helper; the instruction itself is SSE4.2 per the SDM —
		 * NOTE(review): verify the cpuid gating elsewhere in the backend) */
		case OP_PCMPGTB:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPGTB, ins->sreg1, ins->sreg2);
			break;
		case OP_PCMPGTW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPGTW, ins->sreg1, ins->sreg2);
			break;
		case OP_PCMPGTD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPGTD, ins->sreg1, ins->sreg2);
			break;
		case OP_PCMPGTQ:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PCMPGTQ, ins->sreg1, ins->sreg2);
			break;

		/* psadbw: sum of absolute byte differences */
		case OP_PSUM_ABS_DIFF:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSADBW, ins->sreg1, ins->sreg2);
			break;

		/* interleave low halves of the two operands */
		case OP_UNPACK_LOWB:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLBW, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_LOWW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLWD, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_LOWD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLDQ, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_LOWQ:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLQDQ, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_LOWPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_UNPCKL, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_LOWPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_UNPCKL, ins->sreg1, ins->sreg2);
			break;

		/* interleave high halves of the two operands */
		case OP_UNPACK_HIGHB:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHBW, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_HIGHW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHWD, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_HIGHD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHDQ, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_HIGHQ:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHQDQ, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_HIGHPS:
			x86_sse_alu_ps_reg_reg (code, X86_SSE_UNPCKH, ins->sreg1, ins->sreg2);
			break;
		case OP_UNPACK_HIGHPD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_UNPCKH, ins->sreg1, ins->sreg2);
			break;

		/* narrowing packs with signed/unsigned saturation (packusdw is SSE4.1) */
		case OP_PACKW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PACKSSWB, ins->sreg1, ins->sreg2);
			break;
		case OP_PACKD:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PACKSSDW, ins->sreg1, ins->sreg2);
			break;
		case OP_PACKW_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PACKUSWB, ins->sreg1, ins->sreg2);
			break;
		case OP_PACKD_UN:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PACKUSDW, ins->sreg1, ins->sreg2);
			break;

		/* saturating unsigned add/sub */
		case OP_PADDB_SAT_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDUSB, ins->sreg1, ins->sreg2);
			break;
		case OP_PSUBB_SAT_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBUSB, ins->sreg1, ins->sreg2);
			break;
		case OP_PADDW_SAT_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDUSW, ins->sreg1, ins->sreg2);
			break;
		case OP_PSUBW_SAT_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBUSW, ins->sreg1, ins->sreg2);
			break;

		/* saturating signed add/sub */
		case OP_PADDB_SAT:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDSB, ins->sreg1, ins->sreg2);
			break;
		case OP_PSUBB_SAT:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBSB, ins->sreg1, ins->sreg2);
			break;
		case OP_PADDW_SAT:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDSW, ins->sreg1, ins->sreg2);
			break;
		case OP_PSUBW_SAT:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBSW, ins->sreg1, ins->sreg2);
			break;
			
		/* packed multiplies (pmulld is SSE4.1; pmuludq is a 32x32->64 multiply) */
		case OP_PMULW:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULLW, ins->sreg1, ins->sreg2);
			break;
		case OP_PMULD:
			x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMULLD, ins->sreg1, ins->sreg2);
			break;
		case OP_PMULQ:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULUDQ, ins->sreg1, ins->sreg2);
			break;
		case OP_PMULW_HIGH_UN:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULHUW, ins->sreg1, ins->sreg2);
			break;
		case OP_PMULW_HIGH:
			x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULHW, ins->sreg1, ins->sreg2);
			break;

		/* word shifts: immediate-count and register-count forms */
		case OP_PSHRW:
			x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SHR, ins->dreg, ins->inst_imm);
			break;
		case OP_PSHRW_REG:
			x86_sse_shift_reg_reg (code, X86_SSE_PSRLW_REG, ins->dreg, ins->sreg2);
			break;

		case OP_PSARW:
			x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SAR, ins->dreg, ins->inst_imm);
			break;
		case OP_PSARW_REG:
			x86_sse_shift_reg_reg (code, X86_SSE_PSRAW_REG, ins->dreg, ins->sreg2);
			break;

3972                 case OP_PSHLW:
3973                         x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SHL, ins->dreg, ins->inst_imm);
3974                         break;
3975                 case OP_PSHLW_REG:
3976                         x86_sse_shift_reg_reg (code, X86_SSE_PSLLW_REG, ins->dreg, ins->sreg2);
3977                         break;
3978
3979                 case OP_PSHRD:
3980                         x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTD, X86_SSE_SHR, ins->dreg, ins->inst_imm);
3981                         break;
3982                 case OP_PSHRD_REG:
3983                         x86_sse_shift_reg_reg (code, X86_SSE_PSRLD_REG, ins->dreg, ins->sreg2);
3984                         break;
3985
3986                 case OP_PSARD:
3987                         x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTD, X86_SSE_SAR, ins->dreg, ins->inst_imm);
3988                         break;
3989                 case OP_PSARD_REG:
3990                         x86_sse_shift_reg_reg (code, X86_SSE_PSRAD_REG, ins->dreg, ins->sreg2);
3991                         break;
3992
3993                 case OP_PSHLD:
3994                         x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTD, X86_SSE_SHL, ins->dreg, ins->inst_imm);
3995                         break;
3996                 case OP_PSHLD_REG:
3997                         x86_sse_shift_reg_reg (code, X86_SSE_PSLLD_REG, ins->dreg, ins->sreg2);
3998                         break;
3999
4000                 case OP_PSHRQ:
4001                         x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTQ, X86_SSE_SHR, ins->dreg, ins->inst_imm);
4002                         break;
4003                 case OP_PSHRQ_REG:
4004                         x86_sse_shift_reg_reg (code, X86_SSE_PSRLQ_REG, ins->dreg, ins->sreg2);
4005                         break;
4006
4007                 case OP_PSHLQ:
4008                         x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTQ, X86_SSE_SHL, ins->dreg, ins->inst_imm);
4009                         break;
4010                 case OP_PSHLQ_REG:
4011                         x86_sse_shift_reg_reg (code, X86_SSE_PSLLQ_REG, ins->dreg, ins->sreg2);
4012                         break;          
4013                         
4014                 case OP_ICONV_TO_X:
4015                         x86_movd_xreg_reg (code, ins->dreg, ins->sreg1);
4016                         break;
4017                 case OP_EXTRACT_I4:
4018                         x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
4019                         break;
4020                 case OP_EXTRACT_I1:
4021                 case OP_EXTRACT_U1:
4022                         x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
4023                         if (ins->inst_c0)
4024                                 x86_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8);
4025                         x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE);
4026                         break;
4027                 case OP_EXTRACT_I2:
4028                 case OP_EXTRACT_U2:
4029                         x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
4030                         if (ins->inst_c0)
4031                                 x86_shift_reg_imm (code, X86_SHR, ins->dreg, 16);
4032                         x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE);
4033                         break;
4034                 case OP_EXTRACT_R8:
4035                         if (ins->inst_c0)
4036                                 x86_sse_alu_pd_membase_reg (code, X86_SSE_MOVHPD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1);
4037                         else
4038                                 x86_sse_alu_sd_membase_reg (code, X86_SSE_MOVSD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1);
4039                         x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE);
4040                         break;
4041
4042                 case OP_INSERT_I2:
4043                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->sreg1, ins->sreg2, ins->inst_c0);
4044                         break;
4045                 case OP_EXTRACTX_U2:
4046                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PEXTRW, ins->dreg, ins->sreg1, ins->inst_c0);
4047                         break;
4048                 case OP_INSERTX_U1_SLOW:
4049                         /*sreg1 is the extracted ireg (scratch)
4050                         /sreg2 is the to be inserted ireg (scratch)
4051                         /dreg is the xreg to receive the value*/
4052
4053                         /*clear the bits from the extracted word*/
4054                         x86_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_c0 & 1 ? 0x00FF : 0xFF00);
4055                         /*shift the value to insert if needed*/
4056                         if (ins->inst_c0 & 1)
4057                                 x86_shift_reg_imm (code, X86_SHL, ins->sreg2, 8);
4058                         /*join them together*/
4059                         x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
4060                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, ins->inst_c0 / 2);
4061                         break;
4062                 case OP_INSERTX_I4_SLOW:
4063                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2);
4064                         x86_shift_reg_imm (code, X86_SHR, ins->sreg2, 16);
4065                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1);
4066                         break;
4067
4068                 case OP_INSERTX_R4_SLOW:
4069                         x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE);
4070                         /*TODO if inst_c0 == 0 use movss*/
4071                         x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 0, ins->inst_c0 * 2);
4072                         x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 2, ins->inst_c0 * 2 + 1);
4073                         break;
4074                 case OP_INSERTX_R8_SLOW:
4075                         x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
4076                         if (ins->inst_c0)
4077                                 x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVHPD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
4078                         else
4079                                 x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVSD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
4080                         break;
4081
4082                 case OP_STOREX_MEMBASE_REG:
4083                 case OP_STOREX_MEMBASE:
4084                         x86_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
4085                         break;
4086                 case OP_LOADX_MEMBASE:
4087                         x86_movups_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
4088                         break;
4089                 case OP_LOADX_ALIGNED_MEMBASE:
4090                         x86_movaps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
4091                         break;
4092                 case OP_STOREX_ALIGNED_MEMBASE_REG:
4093                         x86_movaps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
4094                         break;
4095                 case OP_STOREX_NTA_MEMBASE_REG:
4096                         x86_sse_alu_reg_membase (code, X86_SSE_MOVNTPS, ins->dreg, ins->sreg1, ins->inst_offset);
4097                         break;
4098                 case OP_PREFETCH_MEMBASE:
4099                         x86_sse_alu_reg_membase (code, X86_SSE_PREFETCH, ins->backend.arg_info, ins->sreg1, ins->inst_offset);
4100
4101                         break;
4102                 case OP_XMOVE:
4103                         /*FIXME the peephole pass should have killed this*/
4104                         if (ins->dreg != ins->sreg1)
4105                                 x86_movaps_reg_reg (code, ins->dreg, ins->sreg1);
4106                         break;          
4107                 case OP_XZERO:
4108                         x86_sse_alu_pd_reg_reg (code, X86_SSE_PXOR, ins->dreg, ins->dreg);
4109                         break;
4110                 case OP_ICONV_TO_R8_RAW:
4111                         x86_mov_membase_reg (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1, 4);
4112                         x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE);
4113                         break;
4114
4115                 case OP_FCONV_TO_R8_X:
4116                         x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
4117                         x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
4118                         break;
4119
4120                 case OP_XCONV_R8_TO_I4:
4121                         x86_cvttsd2si (code, ins->dreg, ins->sreg1);
4122                         switch (ins->backend.source_opcode) {
4123                         case OP_FCONV_TO_I1:
4124                                 x86_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE);
4125                                 break;
4126                         case OP_FCONV_TO_U1:
4127                                 x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
4128                                 break;
4129                         case OP_FCONV_TO_I2:
4130                                 x86_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE);
4131                                 break;
4132                         case OP_FCONV_TO_U2:
4133                                 x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE);
4134                                 break;
4135                         }                       
4136                         break;
4137
4138                 case OP_EXPAND_I1:
4139                         /*FIXME this causes a partial register stall, maybe it would not be that bad to use shift + mask + or*/
4140                         /*The +4 is to get a mov ?h, ?l over the same reg.*/
4141                         x86_mov_reg_reg (code, ins->dreg + 4, ins->dreg, 1);
4142                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0);
4143                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1);
4144                         x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
4145                         break;
4146                 case OP_EXPAND_I2:
4147                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0);
4148                         x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1);
4149                         x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
4150                         break;
4151                 case OP_EXPAND_I4:
4152                         x86_movd_xreg_reg (code, ins->dreg, ins->sreg1);
4153                         x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
4154                         break;
4155                 case OP_EXPAND_R4:
4156                         x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE);
4157                         x86_movd_xreg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
4158                         x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
4159                         break;
4160                 case OP_EXPAND_R8:
4161                         x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
4162                         x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
4163                         x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0x44);
4164                         break;
4165 #endif
4166                 default:
4167                         g_warning ("unknown opcode %s\n", mono_inst_name (ins->opcode));
4168                         g_assert_not_reached ();
4169                 }
4170
4171                 if (G_UNLIKELY ((code - cfg->native_code - offset) > max_len)) {
4172                         g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %d)",
4173                                    mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset);
4174                         g_assert_not_reached ();
4175                 }
4176                
4177                 cpos += max_len;
4178         }
4179
4180         cfg->code_len = code - cfg->native_code;
4181 }
4182
/*
 * mono_arch_register_lowlevel_calls:
 *
 *   Hook for registering architecture specific low-level runtime calls.
 * The x86 backend has none to register, so this is intentionally empty.
 */
void
mono_arch_register_lowlevel_calls (void)
{
}
4187
4188 void
4189 mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, MonoJumpInfo *ji, gboolean run_cctors)
4190 {
4191         MonoJumpInfo *patch_info;
4192         gboolean compile_aot = !run_cctors;
4193
4194         for (patch_info = ji; patch_info; patch_info = patch_info->next) {
4195                 unsigned char *ip = patch_info->ip.i + code;
4196                 const unsigned char *target;
4197
4198                 target = mono_resolve_patch_target (method, domain, code, patch_info, run_cctors);
4199
4200                 if (compile_aot) {
4201                         switch (patch_info->type) {
4202                         case MONO_PATCH_INFO_BB:
4203                         case MONO_PATCH_INFO_LABEL:
4204                                 break;
4205                         default:
4206                                 /* No need to patch these */
4207                                 continue;
4208                         }
4209                 }
4210
4211                 switch (patch_info->type) {
4212                 case MONO_PATCH_INFO_IP:
4213                         *((gconstpointer *)(ip)) = target;
4214                         break;
4215                 case MONO_PATCH_INFO_CLASS_INIT: {
4216                         guint8 *code = ip;
4217                         /* Might already been changed to a nop */
4218                         x86_call_code (code, 0);
4219                         x86_patch (ip, target);
4220                         break;
4221                 }
4222                 case MONO_PATCH_INFO_ABS:
4223                 case MONO_PATCH_INFO_METHOD:
4224                 case MONO_PATCH_INFO_METHOD_JUMP:
4225                 case MONO_PATCH_INFO_INTERNAL_METHOD:
4226                 case MONO_PATCH_INFO_BB:
4227                 case MONO_PATCH_INFO_LABEL:
4228                 case MONO_PATCH_INFO_RGCTX_FETCH:
4229                 case MONO_PATCH_INFO_GENERIC_CLASS_INIT:
4230                 case MONO_PATCH_INFO_MONITOR_ENTER:
4231                 case MONO_PATCH_INFO_MONITOR_EXIT:
4232                         x86_patch (ip, target);
4233                         break;
4234                 case MONO_PATCH_INFO_NONE:
4235                         break;
4236                 default: {
4237                         guint32 offset = mono_arch_get_patch_offset (ip);
4238                         *((gconstpointer *)(ip + offset)) = target;
4239                         break;
4240                 }
4241                 }
4242         }
4243 }
4244
/*
 * mono_arch_emit_prolog:
 *
 *   Emit the native prolog for CFG->method: sets up the EBP frame, saves
 * the LMF or the used callee-saved registers, allocates the stack frame,
 * and loads register-allocated arguments from the stack. Returns the
 * updated code pointer; cfg->native_code/cfg->code_len are updated too.
 */
guint8 *
mono_arch_emit_prolog (MonoCompile *cfg)
{
	MonoMethod *method = cfg->method;
	MonoBasicBlock *bb;
	MonoMethodSignature *sig;
	MonoInst *inst;
	int alloc_size, pos, max_offset, i;
	guint8 *code;

	/* Initial native code buffer estimate; grown later if needed */
	cfg->code_size =  MAX (mono_method_get_header (method)->code_size * 4, 10240);

	if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
		cfg->code_size += 512;

	code = cfg->native_code = g_malloc (cfg->code_size);

	/* Standard frame: push %ebp; mov %esp -> %ebp */
	x86_push_reg (code, X86_EBP);
	x86_mov_reg_reg (code, X86_EBP, X86_ESP, 4);

	alloc_size = cfg->stack_offset;
	pos = 0;

	if (method->wrapper_type == MONO_WRAPPER_NATIVE_TO_MANAGED) {
		/* Might need to attach the thread to the JIT  or change the domain for the callback */
		if (appdomain_tls_offset != -1 && lmf_tls_offset != -1) {
			guint8 *buf, *no_domain_branch;

			/* Fast path: if the TLS domain matches and an LMF exists,
			 * the thread is already attached and we skip the call. */
			code = mono_x86_emit_tls_get (code, X86_EAX, appdomain_tls_offset);
			x86_alu_reg_imm (code, X86_CMP, X86_EAX, GPOINTER_TO_UINT (cfg->domain));
			no_domain_branch = code;
			x86_branch8 (code, X86_CC_NE, 0, 0);
			code = mono_x86_emit_tls_get ( code, X86_EAX, lmf_tls_offset);
			x86_test_reg_reg (code, X86_EAX, X86_EAX);
			buf = code;
			x86_branch8 (code, X86_CC_NE, 0, 0);
			/* Slow path: call mono_jit_thread_attach (domain) */
			x86_patch (no_domain_branch, code);
			x86_push_imm (code, cfg->domain);
			code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_jit_thread_attach");
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
			x86_patch (buf, code);
#ifdef PLATFORM_WIN32
			/* The TLS key actually contains a pointer to the MonoJitTlsData structure */
			/* FIXME: Add a separate key for LMF to avoid this */
			x86_alu_reg_imm (code, X86_ADD, X86_EAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf));
#endif
		}
		else {
			/* No usable TLS offsets: always call the attach helper */
			g_assert (!cfg->compile_aot);
			x86_push_imm (code, cfg->domain);
			code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_jit_thread_attach");
			x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
		}
	}

	if (method->save_lmf) {
		/* The MonoLMF is built in-place on the stack via pushes below */
		pos += sizeof (MonoLMF);

		/* save the current IP */
		mono_add_patch_info (cfg, code + 1 - cfg->native_code, MONO_PATCH_INFO_IP, NULL);
		x86_push_imm_template (code);

		/* save all caller saved regs */
		x86_push_reg (code, X86_EBP);
		x86_push_reg (code, X86_ESI);
		x86_push_reg (code, X86_EDI);
		x86_push_reg (code, X86_EBX);

		if ((lmf_tls_offset != -1) && !is_win32 && !optimize_for_xen) {
			/*
			 * Optimized version which uses the mono_lmf TLS variable instead of indirection
			 * through the mono_lmf_addr TLS variable.
			 */
			/* %eax = previous_lmf */
			x86_prefix (code, X86_GS_PREFIX);
			x86_mov_reg_mem (code, X86_EAX, lmf_tls_offset, 4);
			/* skip esp + method_info + lmf */
			x86_alu_reg_imm (code, X86_SUB, X86_ESP, 12);
			/* push previous_lmf */
			x86_push_reg (code, X86_EAX);
			/* new lmf = ESP */
			x86_prefix (code, X86_GS_PREFIX);
			x86_mov_mem_reg (code, lmf_tls_offset, X86_ESP, 4);
		} else {
			/* get the address of lmf for the current thread */
			/* 
			 * This is performance critical so we try to use some tricks to make
			 * it fast.
			 */									

			if (lmf_addr_tls_offset != -1) {
				/* Load lmf quicky using the GS register */
				code = mono_x86_emit_tls_get (code, X86_EAX, lmf_addr_tls_offset);
#ifdef PLATFORM_WIN32
				/* The TLS key actually contains a pointer to the MonoJitTlsData structure */
				/* FIXME: Add a separate key for LMF to avoid this */
				x86_alu_reg_imm (code, X86_ADD, X86_EAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf));
#endif
			} else {
				/* Fallback: helper call clobbers nothing we need; result in %eax */
				code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_get_lmf_addr");
			}

			/* Skip esp + method info */
			x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);

			/* push lmf */
			x86_push_reg (code, X86_EAX); 
			/* push *lfm (previous_lmf) */
			x86_push_membase (code, X86_EAX, 0);
			/* *(lmf) = ESP */
			x86_mov_membase_reg (code, X86_EAX, 0, X86_ESP, 4);
		}
	} else {
		/* No LMF: just save the callee-saved registers this method uses */

		if (cfg->used_int_regs & (1 << X86_EBX)) {
			x86_push_reg (code, X86_EBX);
			pos += 4;
		}

		if (cfg->used_int_regs & (1 << X86_EDI)) {
			x86_push_reg (code, X86_EDI);
			pos += 4;
		}

		if (cfg->used_int_regs & (1 << X86_ESI)) {
			x86_push_reg (code, X86_ESI);
			pos += 4;
		}
	}

	/* 'pos' bytes were already pushed above; allocate only the remainder */
	alloc_size -= pos;

	/* the original alloc_size is already aligned: there is %ebp and retip pushed, so realign */
	if (mono_do_x86_stack_align) {
		int tot = alloc_size + pos + 4 + 4; /* ret ip + ebp */
		tot &= MONO_ARCH_FRAME_ALIGNMENT - 1;
		alloc_size += MONO_ARCH_FRAME_ALIGNMENT - tot;
	}

	if (alloc_size) {
		/* See mono_emit_stack_alloc */
#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
		/* Allocate in 4K steps, touching each page via the test so the
		 * stack is grown one guard page at a time. */
		guint32 remaining_size = alloc_size;
		while (remaining_size >= 0x1000) {
			x86_alu_reg_imm (code, X86_SUB, X86_ESP, 0x1000);
			x86_test_membase_reg (code, X86_ESP, 0, X86_ESP);
			remaining_size -= 0x1000;
		}
		if (remaining_size)
			x86_alu_reg_imm (code, X86_SUB, X86_ESP, remaining_size);
#else
		x86_alu_reg_imm (code, X86_SUB, X86_ESP, alloc_size);
#endif
	}

	/* Native->managed entry points can arrive with an unaligned stack;
	 * force alignment here. */
	if (cfg->method->wrapper_type == MONO_WRAPPER_NATIVE_TO_MANAGED ||
			cfg->method->wrapper_type == MONO_WRAPPER_RUNTIME_INVOKE) {
		x86_alu_reg_imm (code, X86_AND, X86_ESP, -MONO_ARCH_FRAME_ALIGNMENT);
	}

#if DEBUG_STACK_ALIGNMENT
	/* check the stack is aligned */
	if (method->wrapper_type == MONO_WRAPPER_NONE) {
		x86_mov_reg_reg (code, X86_ECX, X86_ESP, 4);
		x86_alu_reg_imm (code, X86_AND, X86_ECX, MONO_ARCH_FRAME_ALIGNMENT - 1);
		x86_alu_reg_imm (code, X86_CMP, X86_ECX, 0);
		x86_branch_disp (code, X86_CC_EQ, 3, FALSE);
		x86_breakpoint (code);
	}
#endif

	/* compute max_offset in order to use short forward jumps */
	max_offset = 0;
	if (cfg->opt & MONO_OPT_BRANCH) {
		for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
			MonoInst *ins;
			bb->max_offset = max_offset;

			if (cfg->prof_options & MONO_PROFILE_COVERAGE)
				max_offset += 6;
			/* max alignment for loops */
			if ((cfg->opt & MONO_OPT_LOOP) && bb_is_loop_start (bb))
				max_offset += LOOP_ALIGNMENT;

			/* Accumulate the worst-case encoded length of each instruction */
			MONO_BB_FOR_EACH_INS (bb, ins) {
				if (ins->opcode == OP_LABEL)
					ins->inst_c1 = max_offset;
				
				max_offset += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
			}
		}
	}

	/* store runtime generic context */
	if (cfg->rgctx_var) {
		g_assert (cfg->rgctx_var->opcode == OP_REGOFFSET && cfg->rgctx_var->inst_basereg == X86_EBP);

		x86_mov_membase_reg (code, X86_EBP, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, 4);
	}

	if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
		code = mono_arch_instrument_prolog (cfg, mono_trace_enter_method, code, TRUE);

	/* load arguments allocated to register from the stack */
	sig = mono_method_signature (method);
	pos = 0;

	for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
		inst = cfg->args [pos];
		if (inst->opcode == OP_REGVAR) {
			x86_mov_reg_membase (code, inst->dreg, X86_EBP, inst->inst_offset, 4);
			if (cfg->verbose_level > 2)
				g_print ("Argument %d assigned to register %s\n", pos, mono_arch_regname (inst->dreg));
		}
		pos++;
	}

	cfg->code_len = code - cfg->native_code;

	/* The initial estimate must have been large enough */
	g_assert (cfg->code_len < cfg->code_size);

	return code;
}
4468
4469 void
4470 mono_arch_emit_epilog (MonoCompile *cfg)
4471 {
4472         MonoMethod *method = cfg->method;
4473         MonoMethodSignature *sig = mono_method_signature (method);
4474         int quad, pos;
4475         guint32 stack_to_pop;
4476         guint8 *code;
4477         int max_epilog_size = 16;
4478         CallInfo *cinfo;
4479         
4480         if (cfg->method->save_lmf)
4481                 max_epilog_size += 128;
4482
4483         while (cfg->code_len + max_epilog_size > (cfg->code_size - 16)) {
4484                 cfg->code_size *= 2;
4485                 cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
4486                 mono_jit_stats.code_reallocs++;
4487         }
4488
4489         code = cfg->native_code + cfg->code_len;
4490
4491         if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
4492                 code = mono_arch_instrument_epilog (cfg, mono_trace_leave_method, code, TRUE);
4493
4494         /* the code restoring the registers must be kept in sync with OP_JMP */
4495         pos = 0;
4496         
4497         if (method->save_lmf) {
4498                 gint32 prev_lmf_reg;
4499                 gint32 lmf_offset = -sizeof (MonoLMF);
4500
4501                 /* check if we need to restore protection of the stack after a stack overflow */
4502                 if (mono_get_jit_tls_offset () != -1) {
4503                         guint8 *patch;
4504                         code = mono_x86_emit_tls_get (code, X86_ECX, mono_get_jit_tls_offset ());
4505                         /* we load the value in a separate instruction: this mechanism may be
4506                          * used later as a safer way to do thread interruption
4507                          */
4508                         x86_mov_reg_membase (code, X86_ECX, X86_ECX, G_STRUCT_OFFSET (MonoJitTlsData, restore_stack_prot), 4);
4509                         x86_alu_reg_imm (code, X86_CMP, X86_ECX, 0);
4510                         patch = code;
4511                         x86_branch8 (code, X86_CC_Z, 0, FALSE);
4512                         /* note that the call trampoline will preserve eax/edx */
4513                         x86_call_reg (code, X86_ECX);
4514                         x86_patch (patch, code);
4515                 } else {
4516                         /* FIXME: maybe save the jit tls in the prolog */
4517                 }
4518                 if ((lmf_tls_offset != -1) && !is_win32 && !optimize_for_xen) {
4519                         /*
4520                          * Optimized version which uses the mono_lmf TLS variable instead of indirection
4521                          * through the mono_lmf_addr TLS variable.
4522                          */
4523                         /* reg = previous_lmf */
4524                         x86_mov_reg_membase (code, X86_ECX, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 4);
4525
4526                         /* lmf = previous_lmf */
4527                         x86_prefix (code, X86_GS_PREFIX);
4528                         x86_mov_mem_reg (code, lmf_tls_offset, X86_ECX, 4);
4529                 } else {
4530                         /* Find a spare register */
4531                         switch (mini_type_get_underlying_type (cfg->generic_sharing_context, sig->ret)->type) {
4532                         case MONO_TYPE_I8:
4533                         case MONO_TYPE_U8:
4534                                 prev_lmf_reg = X86_EDI;
4535                                 cfg->used_int_regs |= (1 << X86_EDI);
4536                                 break;
4537                         default:
4538                                 prev_lmf_reg = X86_EDX;
4539                                 break;
4540                         }
4541
4542                         /* reg = previous_lmf */
4543                         x86_mov_reg_membase (code, prev_lmf_reg, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), 4);
4544
4545                         /* ecx = lmf */
4546                         x86_mov_reg_membase (code, X86_ECX, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), 4);
4547
4548                         /* *(lmf) = previous_lmf */
4549                         x86_mov_membase_reg (code, X86_ECX, 0, prev_lmf_reg, 4);
4550                 }
4551
4552                 /* restore caller saved regs */
4553                 if (cfg->used_int_regs & (1 << X86_EBX)) {
4554                         x86_mov_reg_membase (code, X86_EBX, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebx), 4);
4555                 }
4556
4557                 if (cfg->used_int_regs & (1 << X86_EDI)) {
4558                         x86_mov_reg_membase (code, X86_EDI, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, edi), 4);
4559                 }
4560                 if (cfg->used_int_regs & (1 << X86_ESI)) {
4561                         x86_mov_reg_membase (code, X86_ESI, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, esi), 4);
4562                 }
4563
4564                 /* EBP is restored by LEAVE */
4565         } else {
4566                 if (cfg->used_int_regs & (1 << X86_EBX)) {
4567                         pos -= 4;
4568                 }
4569                 if (cfg->used_int_regs & (1 << X86_EDI)) {
4570                         pos -= 4;
4571                 }
4572                 if (cfg->used_int_regs & (1 << X86_ESI)) {
4573                         pos -= 4;
4574                 }
4575
4576                 if (pos)
4577                         x86_lea_membase (code, X86_ESP, X86_EBP, pos);
4578
4579                 if (cfg->used_int_regs & (1 << X86_ESI)) {
4580                         x86_pop_reg (code, X86_ESI);
4581                 }
4582                 if (cfg->used_int_regs & (1 << X86_EDI)) {
4583                         x86_pop_reg (code, X86_EDI);
4584                 }
4585                 if (cfg->used_int_regs & (1 << X86_EBX)) {
4586                         x86_pop_reg (code, X86_EBX);
4587                 }
4588         }
4589
4590         /* Load returned vtypes into registers if needed */
4591         cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
4592         if (cinfo->ret.storage == ArgValuetypeInReg) {
4593                 for (quad = 0; quad < 2; quad ++) {
4594                         switch (cinfo->ret.pair_storage [quad]) {
4595                         case ArgInIReg:
4596                                 x86_mov_reg_membase (code, cinfo->ret.pair_regs [quad], cfg->ret->inst_basereg, cfg->ret->inst_offset + (quad * sizeof (gpointer)), 4);
4597                                 break;
4598                         case ArgOnFloatFpStack:
4599                                 x86_fld_membase (code, cfg->ret->inst_basereg, cfg->ret->inst_offset + (quad * sizeof (gpointer)), FALSE);
4600                                 break;
4601                         case ArgOnDoubleFpStack:
4602                                 x86_fld_membase (code, cfg->ret->inst_basereg, cfg->ret->inst_offset + (quad * sizeof (gpointer)), TRUE);
4603                                 break;
4604                         case ArgNone:
4605                                 break;
4606                         default:
4607                                 g_assert_not_reached ();
4608                         }
4609                 }
4610         }
4611
4612         x86_leave (code);
4613
4614         if (CALLCONV_IS_STDCALL (sig)) {
4615                 MonoJitArgumentInfo *arg_info = alloca (sizeof (MonoJitArgumentInfo) * (sig->param_count + 1));
4616
4617                 stack_to_pop = mono_arch_get_argument_info (sig, sig->param_count, arg_info);
4618         } else if (MONO_TYPE_ISSTRUCT (mono_method_signature (cfg->method)->ret) && (cinfo->ret.storage == ArgOnStack))
4619                 stack_to_pop = 4;
4620         else
4621                 stack_to_pop = 0;
4622
4623         if (stack_to_pop)
4624                 x86_ret_imm (code, stack_to_pop);
4625         else
4626                 x86_ret (code);
4627
4628         cfg->code_len = code - cfg->native_code;
4629
4630         g_assert (cfg->code_len < cfg->code_size);
4631 }
4632
/*
 * mono_arch_emit_exceptions:
 *
 *   Emit the out-of-line throw sequences for the MONO_PATCH_INFO_EXC
 * entries recorded in cfg->patch_info, and patch the in-body branches
 * which target them. Sequences for the same exception class are shared:
 * up to 16 distinct classes are cached in the local arrays below.
 */
void
mono_arch_emit_exceptions (MonoCompile *cfg)
{
	MonoJumpInfo *patch_info;
	int nthrows, i;
	guint8 *code;
	/* cache of already emitted throw sequences, keyed by exception class */
	MonoClass *exc_classes [16];
	guint8 *exc_throw_start [16], *exc_throw_end [16];
	guint32 code_size;
	int exc_count = 0;

	/* Compute needed space */
	for (patch_info = cfg->patch_info; patch_info; patch_info = patch_info->next) {
		if (patch_info->type == MONO_PATCH_INFO_EXC)
			exc_count++;
	}

	/* 
	 * make sure we have enough space for exceptions
	 * 16 is the size of two push_imm instructions and a call
	 */
	if (cfg->compile_aot)
		code_size = exc_count * 32;
	else
		code_size = exc_count * 16;

	/* grow the native code buffer until the throw code (plus slack) fits */
	while (cfg->code_len + code_size > (cfg->code_size - 16)) {
		cfg->code_size *= 2;
		cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
		mono_jit_stats.code_reallocs++;
	}

	code = cfg->native_code + cfg->code_len;

	nthrows = 0;
	for (patch_info = cfg->patch_info; patch_info; patch_info = patch_info->next) {
		switch (patch_info->type) {
		case MONO_PATCH_INFO_EXC: {
			MonoClass *exc_class;
			guint8 *buf, *buf2;
			guint32 throw_ip;

			/* patch the branch in the method body to jump to the code emitted here */
			x86_patch (patch_info->ip.i + cfg->native_code, code);

			exc_class = mono_class_from_name (mono_defaults.corlib, "System", patch_info->data.name);
			g_assert (exc_class);
			throw_ip = patch_info->ip.i;

			/* Find a throw sequence for the same exception class */
			for (i = 0; i < nthrows; ++i)
				if (exc_classes [i] == exc_class)
					break;
			if (i < nthrows) {
				/* reuse the cached sequence: only the pushed IP offset differs */
				x86_push_imm (code, (exc_throw_end [i] - cfg->native_code) - throw_ip);
				x86_jump_code (code, exc_throw_start [i]);
				patch_info->type = MONO_PATCH_INFO_NONE;
			}
			else {
				guint32 size;

				/* Compute size of code following the push <OFFSET> */
				size = 5 + 5;

				/* the IP offset push is emitted first with a placeholder value and
				 * back-patched below once the address of the call is known */
				if ((code - cfg->native_code) - throw_ip < 126 - size) {
					/* Use the shorter form */
					buf = buf2 = code;
					x86_push_imm (code, 0);
				}
				else {
					buf = code;
					x86_push_imm (code, 0xf0f0f0f0);
					buf2 = code;
				}

				if (nthrows < 16) {
					exc_classes [nthrows] = exc_class;
					exc_throw_start [nthrows] = code;
				}

				/* push the type token, then call mono_arch_throw_corlib_exception */
				x86_push_imm (code, exc_class->type_token - MONO_TOKEN_TYPE_DEF);
				patch_info->data.name = "mono_arch_throw_corlib_exception";
				patch_info->type = MONO_PATCH_INFO_INTERNAL_METHOD;
				patch_info->ip.i = code - cfg->native_code;
				x86_call_code (code, 0);
				/* back-patch the first push with the real IP offset; pad with nops
				 * if the re-emitted push is shorter than the placeholder one */
				x86_push_imm (buf, (code - cfg->native_code) - throw_ip);
				while (buf < buf2)
					x86_nop (buf);

				if (nthrows < 16) {
					exc_throw_end [nthrows] = code;
					nthrows ++;
				}
			}
			break;
		}
		default:
			/* do nothing */
			break;
		}
	}

	cfg->code_len = code - cfg->native_code;

	g_assert (cfg->code_len < cfg->code_size);
}
4738
/*
 * mono_arch_flush_icache:
 *
 *   Flush the instruction cache for the SIZE bytes of generated code at
 * CODE. A no-op on x86, where the instruction cache is kept coherent
 * with data writes by the hardware.
 */
void
mono_arch_flush_icache (guint8 *code, gint size)
{
	/* not needed */
}
4744
/*
 * mono_arch_flush_register_windows:
 *
 *   No-op on x86: the architecture has no register windows (this hook
 * exists for architectures such as sparc which do).
 */
void
mono_arch_flush_register_windows (void)
{
}
4749
/*
 * mono_arch_is_inst_imm:
 *
 *   Return whether IMM can be used as an immediate operand in generated
 * code. Always TRUE on this backend.
 */
gboolean 
mono_arch_is_inst_imm (gint64 imm)
{
	return TRUE;
}
4755
4756 /*
4757  * Support for fast access to the thread-local lmf structure using the GS
4758  * segment register on NPTL + kernel 2.6.x.
4759  */
4760
/* TRUE once mono_arch_setup_jit_tls_data () has read the TLS offsets below.
 * Note it is only set on the non-windows path, so on windows the keys are
 * deliberately re-read on every call (see the comment there). */
static gboolean tls_offset_inited = FALSE;
4762
/*
 * mono_arch_setup_jit_tls_data:
 *
 *   Initialize the cached TLS offsets/keys (appdomain, lmf, lmf_addr,
 * thread) used by the backend to emit inline TLS accesses. Setting the
 * MONO_NO_TLS environment variable disables this, leaving all offsets
 * at -1 so the slower non-TLS code paths are used instead.
 */
void
mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
{
	if (!tls_offset_inited) {
		if (!getenv ("MONO_NO_TLS")) {
#ifdef PLATFORM_WIN32
			/* 
			 * We need to init this multiple times, since when we are first called, the key might not
			 * be initialized yet.
			 */
			appdomain_tls_offset = mono_domain_get_tls_key ();
			lmf_tls_offset = mono_get_jit_tls_key ();
			thread_tls_offset = mono_thread_get_tls_key ();

			/* Only 64 tls entries can be accessed using inline code */
			if (appdomain_tls_offset >= 64)
				appdomain_tls_offset = -1;
			if (lmf_tls_offset >= 64)
				lmf_tls_offset = -1;
			if (thread_tls_offset >= 64)
				thread_tls_offset = -1;
#else
#if MONO_XEN_OPT
			/* under xen, segment-relative TLS accesses are expensive, so
			 * detect it at runtime and avoid them (see optimize_for_xen) */
			optimize_for_xen = access ("/proc/xen", F_OK) == 0;
#endif
			tls_offset_inited = TRUE;
			appdomain_tls_offset = mono_domain_get_tls_offset ();
			lmf_tls_offset = mono_get_lmf_tls_offset ();
			lmf_addr_tls_offset = mono_get_lmf_addr_tls_offset ();
			thread_tls_offset = mono_thread_get_tls_offset ();
#endif
		}
	}		
}
4797
/*
 * mono_arch_free_jit_tls_data:
 *
 *   Free arch specific per-thread JIT data. Nothing to free on x86,
 * since mono_arch_setup_jit_tls_data () allocates nothing.
 */
void
mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
{
}
4802
4803 #ifdef MONO_ARCH_HAVE_IMT
4804
4805 // Linear handler, the bsearch head compare is shorter
4806 //[2 + 4] x86_alu_reg_imm (code, X86_CMP, ins->sreg1, ins->inst_imm);
4807 //[1 + 1] x86_branch8(inst,cond,imm,is_signed)
4808 //        x86_patch(ins,target)
4809 //[1 + 5] x86_jump_mem(inst,mem)
4810
4811 #define CMP_SIZE 6
4812 #define BR_SMALL_SIZE 2
4813 #define BR_LARGE_SIZE 5
4814 #define JUMP_IMM_SIZE 6
4815 #define ENABLE_WRONG_METHOD_CHECK 0
4816
4817 static int
4818 imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target)
4819 {
4820         int i, distance = 0;
4821         for (i = start; i < target; ++i)
4822                 distance += imt_entries [i]->chunk_size;
4823         return distance;
4824 }
4825
4826 /*
4827  * LOCKING: called with the domain lock held
4828  */
4829 gpointer
4830 mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count,
4831         gpointer fail_tramp)
4832 {
4833         int i;
4834         int size = 0;
4835         guint8 *code, *start;
4836
4837         for (i = 0; i < count; ++i) {
4838                 MonoIMTCheckItem *item = imt_entries [i];
4839                 if (item->is_equals) {
4840                         if (item->check_target_idx) {
4841                                 if (!item->compare_done)
4842                                         item->chunk_size += CMP_SIZE;
4843                                 item->chunk_size += BR_SMALL_SIZE + JUMP_IMM_SIZE;
4844                         } else {
4845                                 if (fail_tramp) {
4846                                         item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + JUMP_IMM_SIZE * 2;
4847                                 } else {
4848                                         item->chunk_size += JUMP_IMM_SIZE;
4849 #if ENABLE_WRONG_METHOD_CHECK
4850                                         item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
4851 #endif
4852                                 }
4853                         }
4854                 } else {
4855                         item->chunk_size += CMP_SIZE + BR_LARGE_SIZE;
4856                         imt_entries [item->check_target_idx]->compare_done = TRUE;
4857                 }
4858                 size += item->chunk_size;
4859         }
4860         if (fail_tramp)
4861                 code = mono_method_alloc_generic_virtual_thunk (domain, size);
4862         else
4863                 code = mono_code_manager_reserve (domain->code_mp, size);
4864         start = code;
4865         for (i = 0; i < count; ++i) {
4866                 MonoIMTCheckItem *item = imt_entries [i];
4867                 item->code_target = code;
4868                 if (item->is_equals) {
4869                         if (item->check_target_idx) {
4870                                 if (!item->compare_done)
4871                                         x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->key);
4872                                 item->jmp_code = code;
4873                                 x86_branch8 (code, X86_CC_NE, 0, FALSE);
4874                                 if (fail_tramp)
4875                                         x86_jump_code (code, item->value.target_code);
4876                                 else
4877                                         x86_jump_mem (code, & (vtable->vtable [item->value.vtable_slot]));
4878                         } else {
4879                                 if (fail_tramp) {
4880                                         x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->key);
4881                                         item->jmp_code = code;
4882                                         x86_branch8 (code, X86_CC_NE, 0, FALSE);
4883                                         x86_jump_code (code, item->value.target_code);
4884                                         x86_patch (item->jmp_code, code);
4885                                         x86_jump_code (code, fail_tramp);
4886                                         item->jmp_code = NULL;
4887                                 } else {
4888                                         /* enable the commented code to assert on wrong method */
4889 #if ENABLE_WRONG_METHOD_CHECK
4890                                         x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->key);
4891                                         item->jmp_code = code;
4892                                         x86_branch8 (code, X86_CC_NE, 0, FALSE);
4893 #endif
4894                                         x86_jump_mem (code, & (vtable->vtable [item->value.vtable_slot]));
4895 #if ENABLE_WRONG_METHOD_CHECK
4896                                         x86_patch (item->jmp_code, code);
4897                                         x86_breakpoint (code);
4898                                         item->jmp_code = NULL;
4899 #endif
4900                                 }
4901                         }
4902                 } else {
4903                         x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->key);
4904                         item->jmp_code = code;
4905                         if (x86_is_imm8 (imt_branch_distance (imt_entries, i, item->check_target_idx)))
4906                                 x86_branch8 (code, X86_CC_GE, 0, FALSE);
4907                         else
4908                                 x86_branch32 (code, X86_CC_GE, 0, FALSE);
4909                 }
4910         }
4911         /* patch the branches to get to the target items */
4912         for (i = 0; i < count; ++i) {
4913                 MonoIMTCheckItem *item = imt_entries [i];
4914                 if (item->jmp_code) {
4915                         if (item->check_target_idx) {
4916                                 x86_patch (item->jmp_code, imt_entries [item->check_target_idx]->code_target);
4917                         }
4918                 }
4919         }
4920
4921         if (!fail_tramp)
4922                 mono_stats.imt_thunks_size += code - start;
4923         g_assert (code - start <= size);
4924         return start;
4925 }
4926
/*
 * mono_arch_find_imt_method:
 *
 *   Return the interface method being looked up, which the caller
 * passed in MONO_ARCH_IMT_REG (REGS holds the saved register state).
 */
MonoMethod*
mono_arch_find_imt_method (gpointer *regs, guint8 *code)
{
	return (MonoMethod*) regs [MONO_ARCH_IMT_REG];
}
4932
/*
 * mono_arch_find_this_argument:
 *
 *   Recover the 'this' argument of METHOD from the stack frame captured
 * by the generic trampoline. REGS [X86_ESP] is the stack pointer saved
 * on entry; the this argument sits above the slots pushed by the
 * trampoline, shifted by one more slot when a hidden valuetype return
 * address was pushed before it.
 */
MonoObject*
mono_arch_find_this_argument (gpointer *regs, MonoMethod *method, MonoGenericSharingContext *gsctx)
{
	MonoMethodSignature *sig = mono_method_signature (method);
	CallInfo *cinfo = get_call_info (gsctx, NULL, sig, FALSE);
	int this_argument_offset;
	MonoObject *this_argument;

	/* 
	 * this is the offset of the this arg from esp as saved at the start of 
	 * mono_arch_create_trampoline_code () in tramp-x86.c.
	 */
	this_argument_offset = 5;
	/* skip the hidden vtype return address slot pushed before 'this' */
	if (MONO_TYPE_ISSTRUCT (sig->ret) && (cinfo->ret.storage == ArgOnStack))
		this_argument_offset++;

	this_argument = * (MonoObject**) (((guint8*) regs [X86_ESP]) + this_argument_offset * sizeof (gpointer));

	g_free (cinfo);
	return this_argument;
}
4954 #endif
4955
/*
 * mono_arch_find_static_call_vtable:
 *
 *   Return the MonoVTable which the caller of a generic shared static
 * method passed in MONO_ARCH_RGCTX_REG.
 */
MonoVTable*
mono_arch_find_static_call_vtable (gpointer *regs, guint8 *code)
{
	return (MonoVTable*) regs [MONO_ARCH_RGCTX_REG];
}
4961
4962 MonoInst*
4963 mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
4964 {
4965         MonoInst *ins = NULL;
4966         int opcode = 0;
4967
4968         if (cmethod->klass == mono_defaults.math_class) {
4969                 if (strcmp (cmethod->name, "Sin") == 0) {
4970                         opcode = OP_SIN;
4971                 } else if (strcmp (cmethod->name, "Cos") == 0) {
4972                         opcode = OP_COS;
4973                 } else if (strcmp (cmethod->name, "Tan") == 0) {
4974                         opcode = OP_TAN;
4975                 } else if (strcmp (cmethod->name, "Atan") == 0) {
4976                         opcode = OP_ATAN;
4977                 } else if (strcmp (cmethod->name, "Sqrt") == 0) {
4978                         opcode = OP_SQRT;
4979                 } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
4980                         opcode = OP_ABS;
4981                 } else if (strcmp (cmethod->name, "Round") == 0 && fsig->param_count == 1 && fsig->params [0]->type == MONO_TYPE_R8) {
4982                         opcode = OP_ROUND;
4983                 }
4984                 
4985                 if (opcode) {
4986                         MONO_INST_NEW (cfg, ins, opcode);
4987                         ins->type = STACK_R8;
4988                         ins->dreg = mono_alloc_freg (cfg);
4989                         ins->sreg1 = args [0]->dreg;
4990                         MONO_ADD_INS (cfg->cbb, ins);
4991                 }
4992
4993                 if (cfg->opt & MONO_OPT_CMOV) {
4994                         int opcode = 0;
4995
4996                         if (strcmp (cmethod->name, "Min") == 0) {
4997                                 if (fsig->params [0]->type == MONO_TYPE_I4)
4998                                         opcode = OP_IMIN;
4999                         } else if (strcmp (cmethod->name, "Max") == 0) {
5000                                 if (fsig->params [0]->type == MONO_TYPE_I4)
5001                                         opcode = OP_IMAX;
5002                         }               
5003
5004                         if (opcode) {
5005                                 MONO_INST_NEW (cfg, ins, opcode);
5006                                 ins->type = STACK_I4;
5007                                 ins->dreg = mono_alloc_ireg (cfg);
5008                                 ins->sreg1 = args [0]->dreg;
5009                                 ins->sreg2 = args [1]->dreg;
5010                                 MONO_ADD_INS (cfg->cbb, ins);
5011                         }
5012                 }
5013
5014 #if 0
5015                 /* OP_FREM is not IEEE compatible */
5016                 else if (strcmp (cmethod->name, "IEEERemainder") == 0) {
5017                         MONO_INST_NEW (cfg, ins, OP_FREM);
5018                         ins->inst_i0 = args [0];
5019                         ins->inst_i1 = args [1];
5020                 }
5021 #endif
5022         }
5023
5024         return ins;
5025 }
5026
/*
 * mono_arch_print_tree:
 *
 *   Print arch specific information about TREE. Nothing arch specific
 * to print on x86, so this always returns 0 (not handled).
 */
gboolean
mono_arch_print_tree (MonoInst *tree, int arity)
{
	return 0;
}
5032
/*
 * mono_arch_get_domain_intrinsic:
 *
 *   Return an instruction which loads the current appdomain from TLS,
 * or NULL to make the caller use the generic path.
 * NOTE(review): the early return below disables the intrinsic entirely,
 * so callers always get NULL; the OP_TLS_GET code after it is currently
 * unreachable — presumably disabled on purpose, confirm before removing
 * or re-enabling it.
 */
MonoInst* mono_arch_get_domain_intrinsic (MonoCompile* cfg)
{
	MonoInst* ins;

	return NULL;

	if (appdomain_tls_offset == -1)
		return NULL;

	MONO_INST_NEW (cfg, ins, OP_TLS_GET);
	ins->inst_offset = appdomain_tls_offset;
	return ins;
}
5046
/*
 * mono_arch_get_thread_intrinsic:
 *
 *   Return an OP_TLS_GET instruction which loads the current thread
 * object directly from TLS, or NULL when the thread TLS offset is not
 * available (the caller then falls back to the generic path).
 */
MonoInst* mono_arch_get_thread_intrinsic (MonoCompile* cfg)
{
	MonoInst* ins;

	if (thread_tls_offset == -1)
		return NULL;

	MONO_INST_NEW (cfg, ins, OP_TLS_GET);
	ins->inst_offset = thread_tls_offset;
	return ins;
}
5058
5059 guint32
5060 mono_arch_get_patch_offset (guint8 *code)
5061 {
5062         if ((code [0] == 0x8b) && (x86_modrm_mod (code [1]) == 0x2))
5063                 return 2;
5064         else if ((code [0] == 0xba))
5065                 return 1;
5066         else if ((code [0] == 0x68))
5067                 /* push IMM */
5068                 return 1;
5069         else if ((code [0] == 0xff) && (x86_modrm_reg (code [1]) == 0x6))
5070                 /* push <OFFSET>(<REG>) */
5071                 return 2;
5072         else if ((code [0] == 0xff) && (x86_modrm_reg (code [1]) == 0x2))
5073                 /* call *<OFFSET>(<REG>) */
5074                 return 2;
5075         else if ((code [0] == 0xdd) || (code [0] == 0xd9))
5076                 /* fldl <ADDR> */
5077                 return 2;
5078         else if ((code [0] == 0x58) && (code [1] == 0x05))
5079                 /* pop %eax; add <OFFSET>, %eax */
5080                 return 2;
5081         else if ((code [0] >= 0x58) && (code [0] <= 0x58 + X86_NREG) && (code [1] == 0x81))
5082                 /* pop <REG>; add <OFFSET>, <REG> */
5083                 return 3;
5084         else if ((code [0] >= 0xb8) && (code [0] < 0xb8 + 8))
5085                 /* mov <REG>, imm */
5086                 return 1;
5087         else {
5088                 g_assert_not_reached ();
5089                 return -1;
5090         }
5091 }
5092
5093 /**
5094  * mono_breakpoint_clean_code:
5095  *
5096  * Copy @size bytes from @code - @offset to the buffer @buf. If the debugger inserted software
5097  * breakpoints in the original code, they are removed in the copy.
5098  *
5099  * Returns TRUE if no sw breakpoint was present.
5100  */
5101 gboolean
5102 mono_breakpoint_clean_code (guint8 *method_start, guint8 *code, int offset, guint8 *buf, int size)
5103 {
5104         int i;
5105         gboolean can_write = TRUE;
5106         /*
5107          * If method_start is non-NULL we need to perform bound checks, since we access memory
5108          * at code - offset we could go before the start of the method and end up in a different
5109          * page of memory that is not mapped or read incorrect data anyway. We zero-fill the bytes
5110          * instead.
5111          */
5112         if (!method_start || code - offset >= method_start) {
5113                 memcpy (buf, code - offset, size);
5114         } else {
5115                 int diff = code - method_start;
5116                 memset (buf, 0, size);
5117                 memcpy (buf + offset - diff, method_start, diff + size - offset);
5118         }
5119         code -= offset;
5120         for (i = 0; i < MONO_BREAKPOINT_ARRAY_SIZE; ++i) {
5121                 int idx = mono_breakpoint_info_index [i];
5122                 guint8 *ptr;
5123                 if (idx < 1)
5124                         continue;
5125                 ptr = mono_breakpoint_info [idx].address;
5126                 if (ptr >= code && ptr < code + size) {
5127                         guint8 saved_byte = mono_breakpoint_info [idx].saved_byte;
5128                         can_write = FALSE;
5129                         /*g_print ("patching %p with 0x%02x (was: 0x%02x)\n", ptr, saved_byte, buf [ptr - code]);*/
5130                         buf [ptr - code] = saved_byte;
5131                 }
5132         }
5133         return can_write;
5134 }
5135
/*
 * mono_arch_get_vcall_slot:
 *
 *   Decode the indirect call instruction ending at CODE and, using the
 * saved register state REGS, return the base address of the vtable (or
 * interface table) the call went through, storing the slot displacement
 * in DISPLACEMENT. Returns NULL when CODE does not end in a recognized
 * indirect call (e.g. a direct call).
 */
gpointer
mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
{
	guint8 buf [8];
	guint8 reg = 0;
	gint32 disp = 0;

	/* work on a copy with any debugger breakpoint bytes removed */
	mono_breakpoint_clean_code (NULL, code, 8, buf, sizeof (buf));
	code = buf + 8;

	*displacement = 0;

	/* go to the start of the call instruction
	 *
	 * address_byte = (m << 6) | (o << 3) | reg
	 * call opcode: 0xff address_byte displacement
	 * 0xff m=1,o=2 imm8
	 * 0xff m=2,o=2 imm32
	 */
	code -= 6;

	/* 
	 * A given byte sequence can match more than case here, so we have to be
	 * really careful about the ordering of the cases. Longer sequences
	 * come first.
	 */
	if ((code [-2] == 0x8b) && (x86_modrm_mod (code [-1]) == 0x2) && (code [4] == 0xff) && (x86_modrm_reg (code [5]) == 0x2) && (x86_modrm_mod (code [5]) == 0x0)) {
		/*
		 * This is an interface call
		 * 8b 80 0c e8 ff ff       mov    0xffffe80c(%eax),%eax
		 * ff 10                   call   *(%eax)
		 */
		reg = x86_modrm_rm (code [5]);
		disp = 0;
#ifdef MONO_ARCH_HAVE_IMT
	} else if ((code [-2] == 0xba) && (code [3] == 0xff) && (x86_modrm_mod (code [4]) == 1) && (x86_modrm_reg (code [4]) == 2) && ((signed char)code [5] < 0)) {
		/* IMT-based interface calls: with MONO_ARCH_IMT_REG == edx
		 * ba 14 f8 28 08          mov    $0x828f814,%edx
		 * ff 50 fc                call   *0xfffffffc(%eax)
		 */
		reg = code [4] & 0x07;
		disp = (signed char)code [5];
#endif
	} else if ((code [1] != 0xe8) && (code [3] == 0xff) && ((code [4] & 0x18) == 0x10) && ((code [4] >> 6) == 1)) {
		/* call *imm8(<REG>) */
		reg = code [4] & 0x07;
		disp = (signed char)code [5];
	} else {
		if ((code [0] == 0xff) && ((code [1] & 0x18) == 0x10) && ((code [1] >> 6) == 2)) {
			/* call *imm32(<REG>) */
			reg = code [1] & 0x07;
			disp = *((gint32*)(code + 2));
		} else if ((code [1] == 0xe8)) {
			/* direct call: no vtable slot involved */
			return NULL;
		} else if ((code [4] == 0xff) && (((code [5] >> 6) & 0x3) == 0) && (((code [5] >> 3) & 0x7) == 2)) {
			/*
			 * This is an interface call
			 * 8b 40 30   mov    0x30(%eax),%eax
			 * ff 10      call   *(%eax)
			 */
			disp = 0;
			reg = code [5] & 0x07;
		}
		else
			return NULL;
	}

	*displacement = disp;
	return regs [reg];
}
5204
5205 gpointer*
5206 mono_arch_get_vcall_slot_addr (guint8 *code, gpointer *regs)
5207 {
5208         gpointer vt;
5209         int displacement;
5210         vt = mono_arch_get_vcall_slot (code, regs, &displacement);
5211         if (!vt)
5212                 return NULL;
5213         return (gpointer*)((char*)vt + displacement);
5214 }
5215
/*
 * mono_arch_get_this_arg_from_call:
 *
 *   Extract the 'this' argument (for delegates, the delegate object)
 * from the stack frame captured by the trampoline. REGS [X86_ESP] is
 * the stack pointer saved on entry; the argument is located using the
 * offset computed by get_call_info (), past the slots pushed by the
 * trampoline as described below.
 */
gpointer
mono_arch_get_this_arg_from_call (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig,
		gssize *regs, guint8 *code)
{
	guint32 esp = regs [X86_ESP];
	CallInfo *cinfo;
	gpointer res;

	if (!gsctx && code)
		gsctx = mono_get_generic_context_from_code (code);
	cinfo = get_call_info (gsctx, NULL, sig, FALSE);

	/*
	 * The stack looks like:
	 * <other args>
	 * <this=delegate>
	 * <possible vtype return address>
	 * <return addr>
	 * <4 pointers pushed by mono_arch_create_trampoline_code ()>
	 */
	res = (((MonoObject**)esp) [5 + (cinfo->args [0].offset / 4)]);
	g_free (cinfo);
	return res;
}
5240
5241 #define MAX_ARCH_DELEGATE_PARAMS 10
5242
/*
 * mono_arch_get_delegate_invoke_impl:
 *
 *   Generate (and cache) a small trampoline implementing Delegate.Invoke for
 * SIG. Returns NULL when no optimized implementation is possible (too many
 * parameters, vtype return, or non-regsize arguments), in which case the
 * caller falls back to the generic path. The returned code is shared across
 * domains, so it is cached process-wide: one slot for the has_target case,
 * one per parameter count otherwise.
 */
gpointer
mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_target)
{
	guint8 *code, *start;

	if (sig->param_count > MAX_ARCH_DELEGATE_PARAMS)
		return NULL;

	/* FIXME: Support more cases */
	if (MONO_TYPE_ISSTRUCT (sig->ret))
		return NULL;

	/*
	 * The stack contains:
	 * <delegate>
	 * <return addr>
	 */

	if (has_target) {
		static guint8* cached = NULL;
		if (cached)
			return cached;
		
		start = code = mono_global_codeman_reserve (64);

		/* Replace the this argument with the target */
		x86_mov_reg_membase (code, X86_EAX, X86_ESP, 4, 4);
		x86_mov_reg_membase (code, X86_ECX, X86_EAX, G_STRUCT_OFFSET (MonoDelegate, target), 4);
		x86_mov_membase_reg (code, X86_ESP, 4, X86_ECX, 4);
		/* Tail-jump to the delegate's method; the callee sees target as 'this' */
		x86_jump_membase (code, X86_EAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));

		g_assert ((code - start) < 64);

		mono_debug_add_delegate_trampoline (start, code - start);

		/* Ensure the code is fully written before publishing the pointer;
		 * readers may race with us and must never see half-written code. */
		mono_memory_barrier ();

		cached = start;
	} else {
		static guint8* cache [MAX_ARCH_DELEGATE_PARAMS + 1] = {NULL};
		int i = 0;
		/* 8 for mov_reg and jump, plus 8 for each parameter */
		int code_reserve = 8 + (sig->param_count * 8);

		/* Only handle signatures whose arguments each occupy one stack slot */
		for (i = 0; i < sig->param_count; ++i)
			if (!mono_is_regsize_var (sig->params [i]))
				return NULL;

		code = cache [sig->param_count];
		if (code)
			return code;

		/*
		 * The stack contains:
		 * <args in reverse order>
		 * <delegate>
		 * <return addr>
		 *
		 * and we need:
		 * <args in reverse order>
		 * <return addr>
		 * 
		 * without unbalancing the stack.
		 * So move each arg up a spot in the stack (overwriting un-needed 'this' arg)
		 * and leaving original spot of first arg as placeholder in stack so
		 * when callee pops stack everything works.
		 */

		start = code = mono_global_codeman_reserve (code_reserve);

		/* store delegate for access to method_ptr */
		x86_mov_reg_membase (code, X86_ECX, X86_ESP, 4, 4);

		/* move args up */
		for (i = 0; i < sig->param_count; ++i) {
			x86_mov_reg_membase (code, X86_EAX, X86_ESP, (i+2)*4, 4);
			x86_mov_membase_reg (code, X86_ESP, (i+1)*4, X86_EAX, 4);
		}

		x86_jump_membase (code, X86_ECX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));

		g_assert ((code - start) < code_reserve);

		mono_debug_add_delegate_trampoline (start, code - start);

		/* Publish only after the code bytes are visible to other threads */
		mono_memory_barrier ();

		cache [sig->param_count] = start;
	}

	return start;
}
5335
5336 gpointer
5337 mono_arch_context_get_int_reg (MonoContext *ctx, int reg)
5338 {
5339         switch (reg) {
5340         case X86_ECX: return (gpointer)ctx->ecx;
5341         case X86_EDX: return (gpointer)ctx->edx;
5342         case X86_EBP: return (gpointer)ctx->ebp;
5343         case X86_ESP: return (gpointer)ctx->esp;
5344         default: return ((gpointer)(&ctx->eax)[reg]);
5345         }
5346 }
5347
5348 #ifdef MONO_ARCH_SIMD_INTRINSICS
5349
5350 static MonoInst*
5351 get_float_to_x_spill_area (MonoCompile *cfg)
5352 {
5353         if (!cfg->fconv_to_r8_x_var) {
5354                 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
5355                 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
5356         }       
5357         return cfg->fconv_to_r8_x_var;
5358 }
5359
5360 /*
5361  * Convert all fconv opts that MONO_OPT_SSE2 would get wrong. 
5362  */
/*
 * mono_arch_decompose_opts:
 *
 *   Rewrite float->int conversion opcodes into a two-instruction SSE
 * sequence (FCONV_TO_R8_X followed by XCONV_R8_TO_I4) when both SSE2 and
 * SIMD optimizations are enabled, since the plain SSE2 decomposition of
 * these opcodes would produce wrong results. Other opcodes are left alone.
 */
void
mono_arch_decompose_opts (MonoCompile *cfg, MonoInst *ins)
{
	MonoInst *fconv;
	int dreg, src_opcode;

	/* Only applies when both SSE2 and SIMD paths are active */
	if (!(cfg->opt & MONO_OPT_SSE2) || !(cfg->opt & MONO_OPT_SIMD))
		return;

	switch (src_opcode = ins->opcode) {
	case OP_FCONV_TO_I1:
	case OP_FCONV_TO_U1:
	case OP_FCONV_TO_I2:
	case OP_FCONV_TO_U2:
	case OP_FCONV_TO_I4:
	case OP_FCONV_TO_I:
		break;
	default:
		return;
	}

	/* dreg is the IREG and sreg1 is the FREG */
	/* First half: move the FP value into an XMM register via the spill area */
	MONO_INST_NEW (cfg, fconv, OP_FCONV_TO_R8_X);
	fconv->klass = NULL; /*FIXME, what can I use here as the Mono.Simd lib might not be loaded yet*/
	fconv->sreg1 = ins->sreg1;
	fconv->dreg = mono_alloc_ireg (cfg);
	fconv->type = STACK_VTYPE;
	fconv->backend.spill_var = get_float_to_x_spill_area (cfg);

	mono_bblock_insert_before_ins (cfg->cbb, ins, fconv);

	/* Second half: reuse INS in place as the XMM->int conversion.
	 * dreg must be saved before NULLIFY_INS clears the instruction. */
	dreg = ins->dreg;
	NULLIFY_INS (ins);
	ins->opcode = OP_XCONV_R8_TO_I4;

	ins->klass = mono_defaults.int32_class;
	ins->sreg1 = fconv->dreg;
	ins->dreg = dreg;
	ins->type = STACK_I4;
	/* Remember the original opcode so codegen can apply the right truncation */
	ins->backend.source_opcode = src_opcode;
}
5404
5405 void
5406 mono_arch_decompose_long_opts (MonoCompile *cfg, MonoInst *long_ins)
5407 {
5408         MonoInst *ins;
5409         int vreg;
5410         if (!(cfg->opt & MONO_OPT_SIMD))
5411                 return;
5412         
5413         /*TODO move this to simd-intrinsic.c once we support sse 4.1 dword extractors since we need the runtime caps info */ 
5414         switch (long_ins->opcode) {
5415         case OP_EXTRACT_I8:
5416                 vreg = long_ins->sreg1;
5417         
5418                 if (long_ins->inst_c0) {
5419                         MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
5420                         ins->klass = long_ins->klass;
5421                         ins->sreg1 = long_ins->sreg1;
5422                         ins->inst_c0 = 2;
5423                         ins->type = STACK_VTYPE;
5424                         ins->dreg = vreg = alloc_ireg (cfg);
5425                         MONO_ADD_INS (cfg->cbb, ins);
5426                 }
5427         
5428                 MONO_INST_NEW (cfg, ins, OP_EXTRACT_I4);
5429                 ins->klass = mono_defaults.int32_class;
5430                 ins->sreg1 = vreg;
5431                 ins->type = STACK_I4;
5432                 ins->dreg = long_ins->dreg + 1;
5433                 MONO_ADD_INS (cfg->cbb, ins);
5434         
5435                 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
5436                 ins->klass = long_ins->klass;
5437                 ins->sreg1 = long_ins->sreg1;
5438                 ins->inst_c0 = long_ins->inst_c0 ? 3 : 1;
5439                 ins->type = STACK_VTYPE;
5440                 ins->dreg = vreg = alloc_ireg (cfg);
5441                 MONO_ADD_INS (cfg->cbb, ins);
5442         
5443                 MONO_INST_NEW (cfg, ins, OP_EXTRACT_I4);
5444                 ins->klass = mono_defaults.int32_class;
5445                 ins->sreg1 = vreg;
5446                 ins->type = STACK_I4;
5447                 ins->dreg = long_ins->dreg + 2;
5448                 MONO_ADD_INS (cfg->cbb, ins);
5449         
5450                 long_ins->opcode = OP_NOP;
5451                 break;
5452         case OP_INSERTX_I8_SLOW:
5453                 MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
5454                 ins->dreg = long_ins->dreg;
5455                 ins->sreg1 = long_ins->dreg;
5456                 ins->sreg2 = long_ins->sreg2 + 1;
5457                 ins->inst_c0 = long_ins->inst_c0 * 2;
5458                 MONO_ADD_INS (cfg->cbb, ins);
5459
5460                 MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
5461                 ins->dreg = long_ins->dreg;
5462                 ins->sreg1 = long_ins->dreg;
5463                 ins->sreg2 = long_ins->sreg2 + 2;
5464                 ins->inst_c0 = long_ins->inst_c0 * 2 + 1;
5465                 MONO_ADD_INS (cfg->cbb, ins);
5466
5467                 long_ins->opcode = OP_NOP;
5468                 break;
5469         case OP_EXPAND_I8:
5470                 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
5471                 ins->dreg = long_ins->dreg;
5472                 ins->sreg1 = long_ins->sreg1 + 1;
5473                 ins->klass = long_ins->klass;
5474                 ins->type = STACK_VTYPE;
5475                 MONO_ADD_INS (cfg->cbb, ins);
5476
5477                 MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
5478                 ins->dreg = long_ins->dreg;
5479                 ins->sreg1 = long_ins->dreg;
5480                 ins->sreg2 = long_ins->sreg1 + 2;
5481                 ins->inst_c0 = 1;
5482                 ins->klass = long_ins->klass;
5483                 ins->type = STACK_VTYPE;
5484                 MONO_ADD_INS (cfg->cbb, ins);
5485
5486                 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
5487                 ins->dreg = long_ins->dreg;
5488                 ins->sreg1 = long_ins->dreg;;
5489                 ins->inst_c0 = 0x44; /*Magic number for swizzling (X,Y,X,Y)*/
5490                 ins->klass = long_ins->klass;
5491                 ins->type = STACK_VTYPE;
5492                 MONO_ADD_INS (cfg->cbb, ins);
5493
5494                 long_ins->opcode = OP_NOP;
5495                 break;
5496         }
5497 }
5498 #endif
5499