From b0e9fb6f6f4791e9c7684c2e9af17f9929ff6564 Mon Sep 17 00:00:00 2001 From: Paolo Molaro Date: Wed, 14 Nov 2012 15:03:41 +0100 Subject: [PATCH] Introduce support for unlimited trampolines. With this new code there is no longer an AOT compiler limit on the number of trampolines, as they are allocated at runtime as necessary on iOS systems. We use iOS's ability to remap a page to a different address in memory and we replicate the trampolines we generate in the binary file to a location in memory near a writable page where the trampolines can find the specific runtime data needed (basically building writable constant pools). On a big app like bcltests, this saves more than 900KB of binary size. There is still the option to use the old code, by passing the "nopagetrampolines" option to the AOT compiler (the new code is the default). --- mono/mini/aot-compiler.c | 162 ++++++++++++++++++++++++++++- mono/mini/aot-runtime.c | 215 +++++++++++++++++++++++++++++++++++++-- mono/mini/tramp-arm.c | 11 +- 3 files changed, 375 insertions(+), 13 deletions(-) diff --git a/mono/mini/aot-compiler.c b/mono/mini/aot-compiler.c index d897f13e11e..412a7cbf65f 100644 --- a/mono/mini/aot-compiler.c +++ b/mono/mini/aot-compiler.c @@ -126,6 +126,7 @@ typedef struct MonoAotOptions { gboolean direct_pinvoke; gboolean direct_icalls; gboolean no_direct_calls; + gboolean use_trampolines_page; int nthreads; int ntrampolines; int nrgctx_trampolines; @@ -907,6 +908,152 @@ arch_emit_llvm_plt_entry (MonoAotCompile *acfg, int index) #endif } +/* + * arch_emit_specific_trampoline_pages: + * + * Emits a page full of trampolines: each trampoline uses its own address to + * look up both the generic trampoline code and the data argument. + * This page can be remapped in process multiple times so we can get an + * unlimited number of trampolines. 
+ * Specifically this implementation uses the following trick: two memory pages + * are allocated, with the first containing the data and the second containing the trampolines. + * To reduce trampoline size, each trampoline jumps to the start of the page where a common + * implementation does all the lifting. + * Note that the ARM single trampoline size is 8 bytes, exactly like the data that needs to be stored + * on the arm 32 bit system. + */ +static void +arch_emit_specific_trampoline_pages (MonoAotCompile *acfg) +{ +#if defined(TARGET_ARM) + guint8 buf [128]; + guint8 *code; + guint8 *loop_start, *loop_branch_back, *loop_end_check, *imt_found_check; + int i; +#define COMMON_TRAMP_SIZE 16 + int count = (mono_pagesize () - COMMON_TRAMP_SIZE) / 8; + int imm8, rot_amount; + + if (!acfg->aot_opts.use_trampolines_page) + return; + + emit_alignment (acfg, mono_pagesize ()); + emit_global (acfg, "specific_trampolines_page", TRUE); + emit_label (acfg, "specific_trampolines_page"); + + /* emit the generic code first, the trampoline address + 8 is in the lr register */ + code = buf; + imm8 = mono_arm_is_rotated_imm8 (mono_pagesize (), &rot_amount); + ARM_SUB_REG_IMM (code, ARMREG_LR, ARMREG_LR, imm8, rot_amount); + ARM_LDR_IMM (code, ARMREG_R1, ARMREG_LR, -8); + ARM_LDR_IMM (code, ARMREG_PC, ARMREG_LR, -4); + ARM_NOP (code); + g_assert (code - buf == COMMON_TRAMP_SIZE); + + /* Emit it */ + emit_bytes (acfg, buf, code - buf); + + for (i = 0; i < count; ++i) { + code = buf; + ARM_PUSH (code, 0x5fff); + ARM_BL (code, 0); + arm_patch (code - 4, code - COMMON_TRAMP_SIZE - 8 * (i + 1)); + g_assert (code - buf == 8); + emit_bytes (acfg, buf, code - buf); + } + + /* now the rgctx trampolines: each specific trampoline puts in the ip register + * the instruction pointer address, so the generic trampoline at the start of the page + * subtracts 4096 to get to the data page and loads the values + * We again fit the generic trampoline in 16 bytes. 
+ */ + emit_global (acfg, "rgctx_trampolines_page", TRUE); + emit_label (acfg, "rgctx_trampolines_page"); + code = buf; + imm8 = mono_arm_is_rotated_imm8 (mono_pagesize (), &rot_amount); + ARM_SUB_REG_IMM (code, ARMREG_IP, ARMREG_IP, imm8, rot_amount); + ARM_LDR_IMM (code, MONO_ARCH_RGCTX_REG, ARMREG_IP, -8); + ARM_LDR_IMM (code, ARMREG_PC, ARMREG_IP, -4); + ARM_NOP (code); + g_assert (code - buf == COMMON_TRAMP_SIZE); + + /* Emit it */ + emit_bytes (acfg, buf, code - buf); + + for (i = 0; i < count; ++i) { + code = buf; + ARM_MOV_REG_REG (code, ARMREG_IP, ARMREG_PC); + ARM_B (code, 0); + arm_patch (code - 4, code - COMMON_TRAMP_SIZE - 8 * (i + 1)); + g_assert (code - buf == 8); + emit_bytes (acfg, buf, code - buf); + } + /* now the imt trampolines: each specific trampoline puts in the ip register + * the instruction pointer address, so the generic trampoline at the start of the page + * subtracts 4096 to get to the data page and loads the values + * Unlike the two above, the generic imt trampoline takes IMT_TRAMP_SIZE bytes, not 16. 
+ */ +#define IMT_TRAMP_SIZE 72 + emit_global (acfg, "imt_trampolines_page", TRUE); + emit_label (acfg, "imt_trampolines_page"); + code = buf; + /* Need at least two free registers, plus a slot for storing the pc */ + ARM_PUSH (code, (1 << ARMREG_R0)|(1 << ARMREG_R1)|(1 << ARMREG_R2)); + + imm8 = mono_arm_is_rotated_imm8 (mono_pagesize (), &rot_amount); + ARM_SUB_REG_IMM (code, ARMREG_IP, ARMREG_IP, imm8, rot_amount); + ARM_LDR_IMM (code, ARMREG_R0, ARMREG_IP, -8); + + /* The IMT method is in v5, r0 has the imt array address */ + + loop_start = code; + ARM_LDR_IMM (code, ARMREG_R1, ARMREG_R0, 0); + ARM_CMP_REG_REG (code, ARMREG_R1, ARMREG_V5); + imt_found_check = code; + ARM_B_COND (code, ARMCOND_EQ, 0); + + /* End-of-loop check */ + ARM_CMP_REG_IMM (code, ARMREG_R1, 0, 0); + loop_end_check = code; + ARM_B_COND (code, ARMCOND_EQ, 0); + + /* Loop footer */ + ARM_ADD_REG_IMM8 (code, ARMREG_R0, ARMREG_R0, sizeof (gpointer) * 2); + loop_branch_back = code; + ARM_B (code, 0); + arm_patch (loop_branch_back, loop_start); + + /* Match */ + arm_patch (imt_found_check, code); + ARM_LDR_IMM (code, ARMREG_R0, ARMREG_R0, 4); + ARM_LDR_IMM (code, ARMREG_R0, ARMREG_R0, 0); + /* Save it to the third stack slot */ + ARM_STR_IMM (code, ARMREG_R0, ARMREG_SP, 8); + /* Restore the registers and branch */ + ARM_POP (code, (1 << ARMREG_R0)|(1 << ARMREG_R1)|(1 << ARMREG_PC)); + + /* No match */ + arm_patch (loop_end_check, code); + ARM_LDR_IMM (code, ARMREG_R0, ARMREG_R0, 4); + ARM_STR_IMM (code, ARMREG_R0, ARMREG_SP, 8); + ARM_POP (code, (1 << ARMREG_R0)|(1 << ARMREG_R1)|(1 << ARMREG_PC)); + ARM_NOP (code); + + /* Emit it */ + g_assert (code - buf == IMT_TRAMP_SIZE); + emit_bytes (acfg, buf, code - buf); + + for (i = 0; i < count; ++i) { + code = buf; + ARM_MOV_REG_REG (code, ARMREG_IP, ARMREG_PC); + ARM_B (code, 0); + arm_patch (code - 4, code - IMT_TRAMP_SIZE - 8 * (i + 1)); + g_assert (code - buf == 8); + emit_bytes (acfg, buf, code - buf); + } +#endif +} + /* * 
arch_emit_specific_trampoline: * @@ -4959,7 +5106,8 @@ emit_trampolines (MonoAotCompile *acfg) * method. */ for (tramp_type = 0; tramp_type < MONO_TRAMPOLINE_NUM; ++tramp_type) { - mono_arch_create_generic_trampoline (tramp_type, &info, TRUE); + /* we overload the boolean here to indicate the slightly different trampoline needed, see mono_arch_create_generic_trampoline() */ + mono_arch_create_generic_trampoline (tramp_type, &info, acfg->aot_opts.use_trampolines_page? 2: TRUE); emit_trampoline (acfg, acfg->got_offset, info); } @@ -5119,6 +5267,8 @@ emit_trampolines (MonoAotCompile *acfg) emit_label (acfg, end_symbol); } + arch_emit_specific_trampoline_pages (acfg); + /* Reserve some entries at the end of the GOT for our use */ acfg->num_trampoline_got_entries = tramp_got_offset - acfg->got_offset; } @@ -5246,6 +5396,8 @@ mono_aot_parse_options (const char *aot_options, MonoAotOptions *opts) opts->asm_writer = TRUE; } else if (str_begins_with (arg, "nodebug")) { opts->nodebug = TRUE; + } else if (str_begins_with (arg, "nopagetrampolines")) { + opts->use_trampolines_page = FALSE; } else if (str_begins_with (arg, "ntrampolines=")) { opts->ntrampolines = atoi (arg + strlen ("ntrampolines=")); } else if (str_begins_with (arg, "nrgctx-trampolines=")) { @@ -5318,6 +5470,11 @@ mono_aot_parse_options (const char *aot_options, MonoAotOptions *opts) } } + if (opts->use_trampolines_page) { + opts->ntrampolines = 0; + opts->nrgctx_trampolines = 0; + opts->nimt_trampolines = 0; + } g_strfreev (args); } @@ -7563,6 +7720,9 @@ mono_compile_assembly (MonoAssembly *ass, guint32 opts, const char *aot_options) acfg->aot_opts.nrgctx_trampolines = 1024; acfg->aot_opts.nimt_trampolines = 128; acfg->aot_opts.llvm_path = g_strdup (""); +#if MONOTOUCH + acfg->aot_opts.use_trampolines_page = TRUE; +#endif mono_aot_parse_options (aot_options, &acfg->aot_opts); diff --git a/mono/mini/aot-runtime.c b/mono/mini/aot-runtime.c index c6866c4a9ad..524aa980262 100644 --- a/mono/mini/aot-runtime.c +++ 
b/mono/mini/aot-runtime.c @@ -126,12 +126,20 @@ typedef struct MonoAotModule { /* The first unused trampoline of each kind */ guint32 trampoline_index [MONO_AOT_TRAMP_NUM]; + gboolean use_page_trampolines; + MonoAotFileInfo info; gpointer *globals; MonoDl *sofile; } MonoAotModule; +typedef struct { + void *next; + unsigned char *trampolines; + unsigned char *trampolines_end; +} TrampolinePage; + static GHashTable *aot_modules; #define mono_aot_lock() EnterCriticalSection (&aot_mutex) #define mono_aot_unlock() LeaveCriticalSection (&aot_mutex) @@ -174,6 +182,16 @@ static gsize aot_code_high_addr = 0; static GHashTable *aot_jit_icall_hash; +#ifdef MONOTOUCH +#define USE_PAGE_TRAMPOLINES ((MonoAotModule*)mono_defaults.corlib->aot_module)->use_page_trampolines +#else +#define USE_PAGE_TRAMPOLINES 0 +#endif + +#define mono_aot_page_lock() EnterCriticalSection (&aot_page_mutex) +#define mono_aot_page_unlock() LeaveCriticalSection (&aot_page_mutex) +static CRITICAL_SECTION aot_page_mutex; + static void init_plt (MonoAotModule *info); @@ -1645,6 +1663,10 @@ load_aot_module (MonoAssembly *assembly, gpointer user_data) assembly->image->aot_module = amodule; if (mono_aot_only) { + char *code; + find_symbol (amodule->sofile, amodule->globals, "specific_trampolines_page", (gpointer *)&code); + amodule->use_page_trampolines = code != NULL; + /*g_warning ("using page trampolines: %d", amodule->use_page_trampolines);*/ if (mono_defaults.corlib) { /* The second got slot contains the mscorlib got addr */ MonoAotModule *mscorlib_amodule = mono_defaults.corlib->aot_module; @@ -1747,6 +1769,7 @@ void mono_aot_init (void) { InitializeCriticalSection (&aot_mutex); + InitializeCriticalSection (&aot_page_mutex); aot_modules = g_hash_table_new (NULL, NULL); mono_install_assembly_load_hook (load_aot_module, NULL); @@ -3882,6 +3905,167 @@ mono_aot_get_trampoline (const char *name) return mono_create_ftnptr_malloc (load_function (amodule, name)); } +#ifdef MONOTOUCH +#include <mach/mach.h> + +static 
TrampolinePage* trampoline_pages [MONO_AOT_TRAMP_NUM]; +/* these sizes are for ARM code, parametrize if porting to other architectures (see arch_emit_specific_trampoline_pages) + * trampoline size is assumed to be 8 bytes below as well (8 is the minimum for 32 bit archs, since we need to store + * two pointers for trampoline in the data page). + * the minimum for the common code must be at least sizeof(TrampolinePage), since we store the page info at the + * beginning of the data page. + */ +static const int trampolines_pages_code_offsets [MONO_AOT_TRAMP_NUM] = {16, 16, 72}; + +static unsigned char* +get_new_trampoline_from_page (int tramp_type) +{ + MonoAotModule *amodule; + MonoImage *image; + TrampolinePage *page; + int count; + void *tpage; + vm_address_t addr, taddr; + kern_return_t ret; + vm_prot_t prot, max_prot; + int psize; + unsigned char *code; + + mono_aot_page_lock (); + page = trampoline_pages [tramp_type]; + if (page && page->trampolines < page->trampolines_end) { + code = page->trampolines; + page->trampolines += 8; + mono_aot_page_unlock (); + return code; + } + mono_aot_page_unlock (); + psize = mono_pagesize (); + /* the trampoline template page is in the mscorlib module */ + image = mono_defaults.corlib; + g_assert (image); + + amodule = image->aot_module; + g_assert (amodule); + + if (tramp_type == MONO_AOT_TRAMP_SPECIFIC) + tpage = load_function (amodule, "specific_trampolines_page"); + else if (tramp_type == MONO_AOT_TRAMP_STATIC_RGCTX) + tpage = load_function (amodule, "rgctx_trampolines_page"); + else if (tramp_type == MONO_AOT_TRAMP_IMT_THUNK) + tpage = load_function (amodule, "imt_trampolines_page"); + else + g_error ("Incorrect tramp type for trampolines page"); + g_assert (tpage); + /*g_warning ("loaded trampolines page at %x", tpage);*/ + + /* avoid the unlikely case of looping forever */ + count = 40; + page = NULL; + while (page == NULL && count-- > 0) { + addr = 0; + /* allocate two contiguous pages of memory: the first page will 
contain the data (like a local constant pool) + * while the second will contain the trampolines. + */ + ret = vm_allocate (mach_task_self (), &addr, psize * 2, VM_FLAGS_ANYWHERE); + if (ret != KERN_SUCCESS) { + g_error ("Cannot allocate memory for trampolines: %d", ret); + break; + } + /*g_warning ("allocated trampoline double page at %x", addr);*/ + /* replace the second page with a remapped trampoline page */ + taddr = addr + psize; + vm_deallocate (mach_task_self (), taddr, psize); + ret = vm_remap (mach_task_self (), &taddr, psize, 0, FALSE, mach_task_self(), (vm_address_t)tpage, FALSE, &prot, &max_prot, VM_INHERIT_SHARE); + if (ret != KERN_SUCCESS) { + /* someone else got the page, try again */ + vm_deallocate (mach_task_self (), addr, psize); + continue; + } + /*g_warning ("remapped trampoline page at %x", taddr);*/ + + mono_aot_page_lock (); + page = trampoline_pages [tramp_type]; + /* some other thread already allocated, so use that to avoid wasting memory */ + if (page && page->trampolines < page->trampolines_end) { + code = page->trampolines; + page->trampolines += 8; + mono_aot_page_unlock (); + vm_deallocate (mach_task_self (), addr, psize); + vm_deallocate (mach_task_self (), taddr, psize); + return code; + } + page = (TrampolinePage*)addr; + page->next = trampoline_pages [tramp_type]; + trampoline_pages [tramp_type] = page; + page->trampolines = (void*)(taddr + trampolines_pages_code_offsets [tramp_type]); + page->trampolines_end = (void*)(taddr + psize); + code = page->trampolines; + page->trampolines += 8; + mono_aot_page_unlock (); + return code; + } + g_error ("Cannot allocate more trampoline pages: %d", ret); + return NULL; +} + +#else +static unsigned char* +get_new_trampoline_from_page (int tramp_type) +{ + g_error ("Page trampolines not supported."); + return NULL; +} +#endif + + +static gpointer +get_new_specific_trampoline_from_page (gpointer tramp, gpointer arg) +{ + void *code; + gpointer *data; + + code = get_new_trampoline_from_page 
(MONO_AOT_TRAMP_SPECIFIC); + + data = (gpointer*)((char*)code - mono_pagesize ()); + data [0] = arg; + data [1] = tramp; + /*g_warning ("new trampoline at %p for data %p, tramp %p (stored at %p)", code, arg, tramp, data);*/ + return code; + +} + +static gpointer +get_new_rgctx_trampoline_from_page (gpointer tramp, gpointer arg) +{ + void *code; + gpointer *data; + + code = get_new_trampoline_from_page (MONO_AOT_TRAMP_STATIC_RGCTX); + + data = (gpointer*)((char*)code - mono_pagesize ()); + data [0] = arg; + data [1] = tramp; + /*g_warning ("new rgctx trampoline at %p for data %p, tramp %p (stored at %p)", code, arg, tramp, data);*/ + return code; + +} + +static gpointer +get_new_imt_trampoline_from_page (gpointer arg) +{ + void *code; + gpointer *data; + + code = get_new_trampoline_from_page (MONO_AOT_TRAMP_IMT_THUNK); + + data = (gpointer*)((char*)code - mono_pagesize ()); + data [0] = arg; + /*g_warning ("new imt trampoline at %p for data %p, (stored at %p)", code, arg, data);*/ + return code; + +} + /* Return a given kind of trampoline */ static gpointer get_numerous_trampoline (MonoAotTrampoline tramp_type, int n_got_slots, MonoAotModule **out_amodule, guint32 *got_offset, guint32 *out_tramp_size) @@ -3962,10 +4146,15 @@ mono_aot_create_specific_trampoline (MonoImage *image, gpointer arg1, MonoTrampo tramp = generic_trampolines [tramp_type]; g_assert (tramp); - code = get_numerous_trampoline (MONO_AOT_TRAMP_SPECIFIC, 2, &amodule, &got_offset, &tramp_size); + if (USE_PAGE_TRAMPOLINES) { + code = get_new_specific_trampoline_from_page (tramp, arg1); + tramp_size = 8; + } else { + code = get_numerous_trampoline (MONO_AOT_TRAMP_SPECIFIC, 2, &amodule, &got_offset, &tramp_size); - amodule->got [got_offset] = tramp; - amodule->got [got_offset + 1] = arg1; + amodule->got [got_offset] = tramp; + amodule->got [got_offset + 1] = arg1; + } if (code_len) *code_len = tramp_size; @@ -3980,10 +4169,14 @@ mono_aot_get_static_rgctx_trampoline (gpointer ctx, gpointer addr) guint8 
*code; guint32 got_offset; - code = get_numerous_trampoline (MONO_AOT_TRAMP_STATIC_RGCTX, 2, &amodule, &got_offset, NULL); + if (USE_PAGE_TRAMPOLINES) { + code = get_new_rgctx_trampoline_from_page (addr, ctx); + } else { + code = get_numerous_trampoline (MONO_AOT_TRAMP_STATIC_RGCTX, 2, &amodule, &got_offset, NULL); - amodule->got [got_offset] = ctx; - amodule->got [got_offset + 1] = addr; + amodule->got [got_offset] = ctx; + amodule->got [got_offset + 1] = addr; + } /* The caller expects an ftnptr */ return mono_create_ftnptr (mono_domain_get (), code); @@ -4053,8 +4246,6 @@ mono_aot_get_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem int i, index, real_count; MonoAotModule *amodule; - code = get_numerous_trampoline (MONO_AOT_TRAMP_IMT_THUNK, 1, &amodule, &got_offset, NULL); - real_count = 0; for (i = 0; i < count; ++i) { MonoIMTCheckItem *item = imt_entries [i]; @@ -4087,7 +4278,13 @@ mono_aot_get_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem buf [(index * 2)] = NULL; buf [(index * 2) + 1] = fail_tramp; - amodule->got [got_offset] = buf; + if (USE_PAGE_TRAMPOLINES) { + code = get_new_imt_trampoline_from_page (buf); + } else { + code = get_numerous_trampoline (MONO_AOT_TRAMP_IMT_THUNK, 1, &amodule, &got_offset, NULL); + + amodule->got [got_offset] = buf; + } return code; } diff --git a/mono/mini/tramp-arm.c b/mono/mini/tramp-arm.c index 1aeeeffa6bd..aa5bd1f1124 100644 --- a/mono/mini/tramp-arm.c +++ b/mono/mini/tramp-arm.c @@ -180,13 +180,18 @@ mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInf if (aot && tramp_type != MONO_TRAMPOLINE_GENERIC_CLASS_INIT) { /* + * For page trampolines the data is in r1, so just move it, otherwise use the got slot as below. * The trampoline contains a pc-relative offset to the got slot * preceeding the got slot where the value is stored. The offset can be * found at [lr + 0]. 
*/ - ARM_LDR_IMM (code, ARMREG_V2, ARMREG_LR, 0); - ARM_ADD_REG_IMM (code, ARMREG_V2, ARMREG_V2, 4, 0); - ARM_LDR_REG_REG (code, ARMREG_V2, ARMREG_V2, ARMREG_LR); + if (aot == 2) { + ARM_MOV_REG_REG (code, ARMREG_V2, ARMREG_R1); + } else { + ARM_LDR_IMM (code, ARMREG_V2, ARMREG_LR, 0); + ARM_ADD_REG_IMM (code, ARMREG_V2, ARMREG_V2, 4, 0); + ARM_LDR_REG_REG (code, ARMREG_V2, ARMREG_V2, ARMREG_LR); + } } else { if (tramp_type != MONO_TRAMPOLINE_GENERIC_CLASS_INIT) ARM_LDR_IMM (code, ARMREG_V2, ARMREG_LR, 0); -- 2.25.1