/*
 * lock-free-alloc.c: Lock free allocator.
 *
 * (C) Copyright 2011 Novell, Inc
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
/*
 * This is a simplified version of the lock-free allocator described in
 *
 * Scalable Lock-Free Dynamic Memory Allocation
 * Maged M. Michael, PLDI 2004
 *
 * I could not get Michael's allocator working bug-free under heavy
 * stress tests.  The paper doesn't provide a correctness proof, and
 * after failing to formalize the ownership of descriptors I devised
 * this simplified, and I hope correct, version.
 *
 * Allocation within superblocks proceeds exactly like in Michael's
 * allocator.  The simplification is that a thread has to "acquire" a
 * descriptor before it can allocate from its superblock.  While it owns
 * the descriptor, no other thread can acquire it and hence allocate
 * from it.  A consequence of this is that the ABA problem cannot occur,
 * so we don't need the tag field and don't have to use 64-bit CAS.
 *
 * Descriptors are stored in two locations: the partial queue and the
 * active field.  They can be in at most one of those at any one time.
 * If a thread wants to allocate, it needs to get a descriptor.  It
 * tries the active descriptor first, CASing it to NULL.  If that
 * doesn't work, it gets a descriptor out of the partial queue.  Once it
 * has the descriptor, it owns it, because it is not referenced anymore.
 * It allocates a slot and then gives the descriptor back (unless it is
 * FULL).
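 *
 * As a sketch (this mirrors what alloc_from_active_or_partial() below
 * actually does; it is illustration, not additional API):
 *
 *	desc = heap->active;
 *	if (desc) {
 *		if (CAS (&heap->active, NULL, desc) != desc)
 *			retry;		-- somebody else got it first
 *	} else {
 *		desc = heap_get_partial (heap);
 *	}
 *	-- if desc is non-NULL, this thread now owns it exclusively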
 *
 * Note that it is still possible for a slot to be freed while an
 * allocation from the same superblock is in progress.  Ownership in
 * this case is not complicated, though.  If the block was FULL and the
 * free set it to PARTIAL, the free now owns the block (because FULL
 * blocks are referenced from neither partial nor active) and has to
 * give it back.  If the block was PARTIAL then the free doesn't own the
 * block (because it's either still referenced, or an alloc owns it).  A
 * special case of this is that the block has changed from PARTIAL to
 * EMPTY and now needs to be retired.  Technically, the free wouldn't
 * have to do anything in this case, because the first thing an alloc
 * does when it gets ownership of a descriptor is to check whether it is
 * EMPTY and retire it if that is the case.  As an optimization, our
 * free does try to acquire the descriptor (by CASing the active field,
 * which, if it is lucky, points to that descriptor) and, if it can do
 * so, retires it.  If it can't, it tries to retire other descriptors
 * from the partial queue, so that we can be sure that even if no more
 * allocations happen, descriptors are still retired.  This is analogous
 * to what Michael's allocator does.
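 *
 * Sketched out, those free-side ownership rules (mirroring what
 * mono_lock_free_free() below does; illustration only) are:
 *
 *	if (the free made the block EMPTY)
 *		try to CAS heap->active from desc to NULL;
 *		if that worked, we own desc: retire it;
 *		otherwise retire empty descriptors from the partial queue
 *	else if (the block was FULL, so the free made it PARTIAL)
 *		we own desc: make it active again or put it on the
 *		partial queue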
 *
 * Another difference from Michael's allocator is not related to
 * concurrency, however: we don't point from slots to descriptors.
 * Instead we allocate superblocks aligned, and point from the start of
 * the superblock to the descriptor, so we only need one word of
 * metadata per superblock.
 *
 * FIXME: Having more than one allocator per size class is probably
 * buggy, because it was never tested.
 */

#include <glib.h>
#include <stdlib.h>

#include <mono/utils/atomic.h>
#include <mono/utils/mono-mmap.h>
#include <mono/utils/mono-membar.h>
#include <mono/utils/hazard-pointer.h>
#include <mono/utils/lock-free-queue.h>

#include <mono/utils/lock-free-alloc.h>
//#define DESC_AVAIL_DUMMY

enum {
	STATE_FULL,
	STATE_PARTIAL,
	STATE_EMPTY
};

typedef union {
	gint32 value;
	struct {
		guint32 avail : 15;
		guint32 count : 15;
		guint32 state : 2;
	} data;
} Anchor;

typedef struct _MonoLockFreeAllocDescriptor Descriptor;
struct _MonoLockFreeAllocDescriptor {
	MonoLockFreeQueueNode node;
	MonoLockFreeAllocator *heap;
	volatile Anchor anchor;
	unsigned int slot_size;
	unsigned int max_count;
	gpointer sb;
#ifndef DESC_AVAIL_DUMMY
	Descriptor * volatile next;
#endif
	gboolean in_use;	/* used for debugging only */
};

#define NUM_DESC_BATCH	64
#define SB_SIZE		16384
#define SB_HEADER_SIZE	16
#define SB_USABLE_SIZE	(SB_SIZE - SB_HEADER_SIZE)

#define SB_HEADER_FOR_ADDR(a)	((gpointer)((gulong)(a) & ~(gulong)(SB_SIZE-1)))
#define DESCRIPTOR_FOR_ADDR(a)	(*(Descriptor**)SB_HEADER_FOR_ADDR (a))
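
/*
 * A worked example of the macros above (the address is illustrative):
 * with SB_SIZE = 16384, SB_USABLE_SIZE is 16384 - 16 = 16368 bytes.  A
 * slot at address 0x7f0000c2a0 lies in the superblock whose header is
 * at 0x7f0000c2a0 & ~(gulong)16383 = 0x7f0000c000, and the first word
 * of that header is the Descriptor pointer.
 */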
/* Taken from SGen */

static unsigned long
prot_flags_for_activate (int activate)
{
	unsigned long prot_flags = activate? MONO_MMAP_READ|MONO_MMAP_WRITE: MONO_MMAP_NONE;
	return prot_flags | MONO_MMAP_PRIVATE | MONO_MMAP_ANON;
}

static void*
mono_sgen_alloc_os_memory (size_t size, int activate)
{
	return mono_valloc (0, size, prot_flags_for_activate (activate));
}

static void
mono_sgen_free_os_memory (void *addr, size_t size)
{
	mono_vfree (addr, size);
}

/* size must be a power of 2 */
static void*
mono_sgen_alloc_os_memory_aligned (size_t size, size_t alignment, gboolean activate)
{
	return mono_valloc_aligned (size, alignment, prot_flags_for_activate (activate));
}
static gpointer
alloc_sb (Descriptor *desc)
{
	gpointer sb_header = mono_sgen_alloc_os_memory_aligned (SB_SIZE, SB_SIZE, TRUE);
	g_assert (sb_header == SB_HEADER_FOR_ADDR (sb_header));
	DESCRIPTOR_FOR_ADDR (sb_header) = desc;
	//g_print ("sb %p for %p\n", sb_header, desc);
	return (char*)sb_header + SB_HEADER_SIZE;
}

static void
free_sb (gpointer sb)
{
	gpointer sb_header = SB_HEADER_FOR_ADDR (sb);
	g_assert ((char*)sb_header + SB_HEADER_SIZE == sb);
	mono_sgen_free_os_memory (sb_header, SB_SIZE);
	//g_print ("free sb %p\n", sb_header);
}
#ifndef DESC_AVAIL_DUMMY
static Descriptor * volatile desc_avail;

static Descriptor*
desc_alloc (void)
{
	MonoThreadHazardPointers *hp = mono_hazard_pointer_get ();
	Descriptor *desc;

	for (;;) {
		gboolean success;

		desc = get_hazardous_pointer ((gpointer * volatile)&desc_avail, hp, 1);
		if (desc) {
			Descriptor *next = desc->next;
			success = (InterlockedCompareExchangePointer ((gpointer * volatile)&desc_avail, next, desc) == desc);
		} else {
			size_t desc_size = sizeof (Descriptor);
			int i;

			desc = mono_sgen_alloc_os_memory (desc_size * NUM_DESC_BATCH, TRUE);

			/* Organize into linked list. */
			for (i = 0; i < NUM_DESC_BATCH; ++i) {
				Descriptor *d = (Descriptor*)((char*)desc + i * desc_size);
				Descriptor *next = (i == (NUM_DESC_BATCH - 1)) ? NULL : (Descriptor*)((char*)desc + ((i + 1) * desc_size));
				d->next = next;
				mono_lock_free_queue_node_init (&d->node, TRUE);
			}

			mono_memory_write_barrier ();

			success = (InterlockedCompareExchangePointer ((gpointer * volatile)&desc_avail, desc->next, NULL) == NULL);

			if (!success)
				mono_sgen_free_os_memory (desc, desc_size * NUM_DESC_BATCH);
		}

		mono_hazard_pointer_clear (hp, 1);

		if (success)
			break;
	}

	g_assert (!desc->in_use);
	desc->in_use = TRUE;

	return desc;
}
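
/*
 * Note on the hazard pointer used above: desc_retire() hands
 * descriptors back to desc_avail via mono_thread_hazardous_free_or_queue(),
 * so holding desc in hazard pointer slot 1 keeps it from being
 * re-enqueued while we read desc->next.  Without it, the pop CAS could
 * succeed against a reused head with a stale next pointer (the classic
 * ABA problem).
 */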
static void
desc_enqueue_avail (gpointer _desc)
{
	Descriptor *desc = _desc;
	Descriptor *old_head;

	g_assert (desc->anchor.data.state == STATE_EMPTY);
	g_assert (!desc->in_use);

	do {
		old_head = desc_avail;
		desc->next = old_head;
		mono_memory_write_barrier ();
	} while (InterlockedCompareExchangePointer ((gpointer * volatile)&desc_avail, desc, old_head) != old_head);
}
static void
desc_retire (Descriptor *desc)
{
	g_assert (desc->anchor.data.state == STATE_EMPTY);
	g_assert (desc->in_use);
	desc->in_use = FALSE;
	free_sb (desc->sb);
	mono_thread_hazardous_free_or_queue (desc, desc_enqueue_avail, FALSE, TRUE);
}
#else
MonoLockFreeQueue available_descs;

static Descriptor*
desc_alloc (void)
{
	Descriptor *desc = (Descriptor*)mono_lock_free_queue_dequeue (&available_descs);

	if (desc)
		return desc;

	return calloc (1, sizeof (Descriptor));
}

static void
desc_retire (Descriptor *desc)
{
	free_sb (desc->sb);
	mono_lock_free_queue_enqueue (&available_descs, &desc->node);
}
#endif
static Descriptor*
list_get_partial (MonoLockFreeAllocSizeClass *sc)
{
	for (;;) {
		Descriptor *desc = (Descriptor*) mono_lock_free_queue_dequeue (&sc->partial);
		if (!desc)
			return NULL;
		if (desc->anchor.data.state != STATE_EMPTY)
			return desc;
		desc_retire (desc);
	}
}
static void
desc_put_partial (gpointer _desc)
{
	Descriptor *desc = _desc;

	g_assert (desc->anchor.data.state != STATE_FULL);

	mono_lock_free_queue_node_free (&desc->node);
	mono_lock_free_queue_enqueue (&desc->heap->sc->partial, &desc->node);
}
static void
list_put_partial (Descriptor *desc)
{
	g_assert (desc->anchor.data.state != STATE_FULL);
	mono_thread_hazardous_free_or_queue (desc, desc_put_partial, FALSE, TRUE);
}
static void
list_remove_empty_desc (MonoLockFreeAllocSizeClass *sc)
{
	int num_non_empty = 0;
	for (;;) {
		Descriptor *desc = (Descriptor*) mono_lock_free_queue_dequeue (&sc->partial);
		if (!desc)
			return;
		/*
		 * We don't need to read atomically because we're the
		 * only thread that references this descriptor.
		 */
		if (desc->anchor.data.state == STATE_EMPTY) {
			desc_retire (desc);
		} else {
			g_assert (desc->heap->sc == sc);
			mono_thread_hazardous_free_or_queue (desc, desc_put_partial, FALSE, TRUE);
			if (++num_non_empty >= 2)
				return;
		}
	}
}
static Descriptor*
heap_get_partial (MonoLockFreeAllocator *heap)
{
	return list_get_partial (heap->sc);
}

static void
heap_put_partial (Descriptor *desc)
{
	list_put_partial (desc);
}
static gboolean
set_anchor (Descriptor *desc, Anchor old_anchor, Anchor new_anchor)
{
	if (old_anchor.data.state == STATE_EMPTY)
		g_assert (new_anchor.data.state == STATE_EMPTY);

	return InterlockedCompareExchange (&desc->anchor.value, new_anchor.value, old_anchor.value) == old_anchor.value;
}
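
/*
 * Because avail, count, and state are bitfields packed into one 32-bit
 * word (the Anchor union above), this single 32-bit CAS atomically
 * updates the free-list head, the free count, and the state together.
 */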
static gpointer
alloc_from_active_or_partial (MonoLockFreeAllocator *heap)
{
	Descriptor *desc;
	Anchor old_anchor, new_anchor;
	gpointer addr;

 retry:
	desc = heap->active;
	if (desc) {
		if (InterlockedCompareExchangePointer ((gpointer * volatile)&heap->active, NULL, desc) != desc)
			goto retry;
	} else {
		desc = heap_get_partial (heap);
		if (!desc)
			return NULL;
	}

	/* Now we own the desc. */

	do {
		unsigned int next;

		new_anchor = old_anchor = *(volatile Anchor*)&desc->anchor.value;
		if (old_anchor.data.state == STATE_EMPTY) {
			/* We must free it because we own it. */
			desc_retire (desc);
			goto retry;
		}
		g_assert (old_anchor.data.state == STATE_PARTIAL);
		g_assert (old_anchor.data.count > 0);

		addr = (char*)desc->sb + old_anchor.data.avail * desc->slot_size;

		mono_memory_read_barrier ();

		next = *(unsigned int*)addr;
		g_assert (next < SB_USABLE_SIZE / desc->slot_size);

		new_anchor.data.avail = next;
		--new_anchor.data.count;

		if (new_anchor.data.count == 0)
			new_anchor.data.state = STATE_FULL;
	} while (!set_anchor (desc, old_anchor, new_anchor));

	/* If the desc is partial we have to give it back. */
	if (new_anchor.data.state == STATE_PARTIAL) {
		if (InterlockedCompareExchangePointer ((gpointer * volatile)&heap->active, desc, NULL) != NULL)
			heap_put_partial (desc);
	}

	return addr;
}
static gpointer
alloc_from_new_sb (MonoLockFreeAllocator *heap)
{
	unsigned int slot_size, count, i;
	Descriptor *desc = desc_alloc ();

	desc->sb = alloc_sb (desc);

	slot_size = desc->slot_size = heap->sc->slot_size;
	count = SB_USABLE_SIZE / slot_size;

	/* Organize blocks into linked list. */
	for (i = 1; i < count - 1; ++i)
		*(unsigned int*)((char*)desc->sb + i * slot_size) = i + 1;

	desc->heap = heap;
	/*
	 * Setting avail to 1 because 0 is the block we're allocating
	 * right away.
	 */
	desc->anchor.data.avail = 1;
	desc->slot_size = heap->sc->slot_size;
	desc->max_count = count;

	desc->anchor.data.count = desc->max_count - 1;
	desc->anchor.data.state = STATE_PARTIAL;

	mono_memory_write_barrier ();

	/* Make it active or free it again. */
	if (InterlockedCompareExchangePointer ((gpointer * volatile)&heap->active, desc, NULL) == NULL) {
		return desc->sb;
	} else {
		desc->anchor.data.state = STATE_EMPTY;
		desc_retire (desc);
		return NULL;
	}
}
gpointer
mono_lock_free_alloc (MonoLockFreeAllocator *heap)
{
	gpointer addr;

	for (;;) {
		addr = alloc_from_active_or_partial (heap);
		if (addr)
			break;

		addr = alloc_from_new_sb (heap);
		if (addr)
			break;
	}

	return addr;
}
void
mono_lock_free_free (gpointer ptr)
{
	Anchor old_anchor, new_anchor;
	Descriptor *desc;
	gpointer sb;
	MonoLockFreeAllocator *heap = NULL;

	desc = DESCRIPTOR_FOR_ADDR (ptr);
	sb = desc->sb;
	g_assert (SB_HEADER_FOR_ADDR (ptr) == SB_HEADER_FOR_ADDR (sb));

	do {
		new_anchor = old_anchor = *(volatile Anchor*)&desc->anchor.value;
		*(unsigned int*)ptr = old_anchor.data.avail;
		new_anchor.data.avail = ((char*)ptr - (char*)sb) / desc->slot_size;
		g_assert (new_anchor.data.avail < SB_USABLE_SIZE / desc->slot_size);

		if (old_anchor.data.state == STATE_FULL)
			new_anchor.data.state = STATE_PARTIAL;

		if (++new_anchor.data.count == desc->max_count) {
			heap = desc->heap;
			new_anchor.data.state = STATE_EMPTY;
		}
	} while (!set_anchor (desc, old_anchor, new_anchor));

	if (new_anchor.data.state == STATE_EMPTY) {
		g_assert (old_anchor.data.state != STATE_EMPTY);

		if (InterlockedCompareExchangePointer ((gpointer * volatile)&heap->active, NULL, desc) == desc) {
			/* We own it, so we free it. */
			desc_retire (desc);
		} else {
			/*
			 * Somebody else must free it, so we do some
			 * freeing for others.
			 */
			list_remove_empty_desc (heap->sc);
		}
	} else if (old_anchor.data.state == STATE_FULL) {
		/*
		 * Nobody owned it, now we do, so we need to give it
		 * back.
		 */
		g_assert (new_anchor.data.state == STATE_PARTIAL);

		if (InterlockedCompareExchangePointer ((gpointer * volatile)&desc->heap->active, desc, NULL) != NULL)
			heap_put_partial (desc);
	}
}
#define g_assert_OR_PRINT(c, format, ...)	do {			\
		if (print) {						\
			if (!(c))					\
				g_print ((format), ## __VA_ARGS__);	\
		} else {						\
			g_assert ((c));					\
		}							\
	} while (0)
static void
descriptor_check_consistency (Descriptor *desc, gboolean print)
{
	int count = desc->anchor.data.count;
	int max_count = SB_USABLE_SIZE / desc->slot_size;
#ifdef _MSC_VER
	gboolean* linked = alloca (max_count * sizeof (gboolean));
#else
	gboolean linked [max_count];
#endif
	int i, last = -1, index;

#ifndef DESC_AVAIL_DUMMY
	Descriptor *avail;

	for (avail = desc_avail; avail; avail = avail->next)
		g_assert_OR_PRINT (desc != avail, "descriptor is in the available list\n");
#endif

	g_assert_OR_PRINT (desc->slot_size == desc->heap->sc->slot_size, "slot size doesn't match size class\n");

	if (print)
		g_print ("descriptor %p is ", desc);

	switch (desc->anchor.data.state) {
	case STATE_FULL:
		if (print)
			g_print ("full\n");
		g_assert_OR_PRINT (count == 0, "count is not zero: %d\n", count);
		break;
	case STATE_PARTIAL:
		if (print)
			g_print ("partial\n");
		g_assert_OR_PRINT (count < max_count, "count too high: is %d but must be below %d\n", count, max_count);
		break;
	case STATE_EMPTY:
		if (print)
			g_print ("empty\n");
		g_assert_OR_PRINT (count == max_count, "count is wrong: is %d but should be %d\n", count, max_count);
		break;
	default:
		g_assert_OR_PRINT (FALSE, "invalid state\n");
	}

	for (i = 0; i < max_count; ++i)
		linked [i] = FALSE;

	index = desc->anchor.data.avail;
	for (i = 0; i < count; ++i) {
		gpointer addr = (char*)desc->sb + index * desc->slot_size;
		g_assert_OR_PRINT (index >= 0 && index < max_count,
				"index %d for %dth available slot, linked from %d, not in range [0 .. %d)\n",
				index, i, last, max_count);
		g_assert_OR_PRINT (!linked [index], "%dth available slot %d linked twice\n", i, index);
		if (linked [index])
			break;
		linked [index] = TRUE;
		last = index;
		index = *(unsigned int*)addr;
	}
}
gboolean
mono_lock_free_allocator_check_consistency (MonoLockFreeAllocator *heap)
{
	Descriptor *active = heap->active;
	Descriptor *desc;

	if (active) {
		g_assert (active->anchor.data.state == STATE_PARTIAL);
		descriptor_check_consistency (active, FALSE);
	}

	while ((desc = (Descriptor*)mono_lock_free_queue_dequeue (&heap->sc->partial))) {
		g_assert (desc->anchor.data.state == STATE_PARTIAL || desc->anchor.data.state == STATE_EMPTY);
		descriptor_check_consistency (desc, FALSE);
	}

	return TRUE;
}
void
mono_lock_free_allocator_init_size_class (MonoLockFreeAllocSizeClass *sc, unsigned int slot_size)
{
	g_assert (slot_size <= SB_USABLE_SIZE / 2);

	mono_lock_free_queue_init (&sc->partial);
	sc->slot_size = slot_size;
}

void
mono_lock_free_allocator_init_allocator (MonoLockFreeAllocator *heap, MonoLockFreeAllocSizeClass *sc)
{
	heap->sc = sc;
	heap->active = NULL;
}
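
/*
 * Usage sketch (the functions named are the public API defined in this
 * file; the surrounding code is a made-up example):
 *
 *	static MonoLockFreeAllocSizeClass size_class;
 *	static MonoLockFreeAllocator allocator;
 *
 *	mono_lock_free_allocator_init_size_class (&size_class, 32);
 *	mono_lock_free_allocator_init_allocator (&allocator, &size_class);
 *
 *	gpointer slot = mono_lock_free_alloc (&allocator);
 *	-- ... use the 32-byte slot; any thread may free it ...
 *	mono_lock_free_free (slot);
 */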