[proflog] Add integration of counters_init and counters_sample
[mono.git] / mono / profiler / proflog.c
index 58c0b98da2479de9d324a3e03c141416a5bf32c3..61a729b47e49162dc0471baac68105f4998822f4 100644 (file)
@@ -13,6 +13,9 @@
 #include <mono/metadata/threads.h>
 #include <mono/metadata/mono-gc.h>
 #include <mono/metadata/debug-helpers.h>
+#include <mono/utils/atomic.h>
+#include <mono/utils/mono-membar.h>
+#include <mono/utils/mono-counters.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
 #endif
 
 /* the architecture needs a memory fence */
-#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__))
+#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__) || defined(__arm__))
+#include <unistd.h>
+#include <sys/syscall.h>
 #include "perf_event.h"
 #define USE_PERF_EVENTS 1
-static int read_perf_mmap (MonoProfiler* prof);
+static int read_perf_mmap (MonoProfiler* prof, int cpu);
 #endif
 
 #define BUFFER_SIZE (4096 * 16)
@@ -85,6 +90,7 @@ static int sample_freq = 0;
 static int do_mono_sample = 0;
 static int in_shutdown = 0;
 static int do_debug = 0;
+static int do_counters = 0;
 
 /* For linux compile with:
  * gcc -fPIC -shared -o libmono-profiler-log.so proflog.c utils.c -Wall -g -lz `pkg-config --cflags --libs mono-2`
@@ -283,7 +289,7 @@ typedef struct _LogBuffer LogBuffer;
  *
  * type sample format
  * type: TYPE_SAMPLE
- * exinfo: one of TYPE_SAMPLE_HIT, TYPE_SAMPLE_USYM, TYPE_SAMPLE_UBIN
+ * exinfo: one of TYPE_SAMPLE_HIT, TYPE_SAMPLE_USYM, TYPE_SAMPLE_UBIN, TYPE_SAMPLE_COUNTERS_DESC, TYPE_SAMPLE_COUNTERS
  * if exinfo == TYPE_SAMPLE_HIT
  *     [sample_type: uleb128] type of sample (SAMPLE_*)
  *     [timestamp: uleb128] nanoseconds since startup (note: different from other timestamps!)
@@ -299,6 +305,23 @@ typedef struct _LogBuffer LogBuffer;
  *     [offset: uleb128] file offset of mapping (the same file can be mapped multiple times)
  *     [size: uleb128] memory size
  *     [name: string] binary name
+ * if exinfo == TYPE_SAMPLE_COUNTERS_DESC
+ *     [len: uleb128] number of counters
+ *     for i = 0 to len - 1
+ *             [section: uleb128] section of counter
+ *             [name: string] name of counter
+ *             [type: uleb128] type of counter
+ *             [unit: uleb128] unit of counter
+ *             [variance: uleb128] variance of counter
+ *             [index: uleb128] unique index of counter (indexes start at 1)
+ * if exinfo == TYPE_SAMPLE_COUNTERS
+ *     [timestamp: uleb128] sampling timestamp
+ *     while true:
+ *             [index: uleb128] unique index of counter
+ *             if index == 0:
+ *                     break
+ *             [type: uleb128] type of counter value
+ *             [value: string/uleb128/sleb128/double] counter value, can be sleb128, uleb128, string or double (determined by using counter type); integer values are the difference from the previously logged value
  *
  */
 struct _LogBuffer {
@@ -374,6 +397,8 @@ static __thread LogBuffer* tlsbuffer = NULL;
 static pthread_key_t tlsbuffer;
 #endif
 
+static void safe_dump (MonoProfiler *profiler, LogBuffer *logbuffer);
+
 static char*
 pstrdup (const char *s)
 {
@@ -508,6 +533,35 @@ emit_obj (LogBuffer *logbuffer, void *ptr)
        assert (logbuffer->data <= logbuffer->data_end);
 }
 
+/*
+ * emit_string:
+ *
+ * Writes str to logbuffer as a NUL-terminated string, reading at most size
+ * bytes from it. A terminator is appended when none is found within those
+ * size bytes; a NULL str is written as an empty string.
+ */
+static void
+emit_string (LogBuffer *logbuffer, const char *str, size_t size)
+{
+       size_t pos;
+
+       if (!str) {
+               emit_byte (logbuffer, '\0');
+               return;
+       }
+
+       for (pos = 0; pos < size; pos++) {
+               emit_byte (logbuffer, str [pos]);
+               /* the terminator has just been written: done */
+               if (str [pos] == '\0')
+                       return;
+       }
+
+       /* no terminator within the first size bytes: add one */
+       emit_byte (logbuffer, '\0');
+}
+
+/*
+ * emit_double:
+ *
+ * Writes the 8 bytes of value to logbuffer, least significant byte first:
+ * in memory order on little endian hosts, reversed on big endian ones.
+ */
+static void
+emit_double (LogBuffer *logbuffer, double value)
+{
+       unsigned char bytes [8];
+       int n;
+
+       memcpy (bytes, &value, 8);
+       for (n = 0; n < 8; n++) {
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+               emit_byte (logbuffer, bytes [7 - n]);
+#else
+               emit_byte (logbuffer, bytes [n]);
+#endif
+       }
+}
+
 static char*
 write_int16 (char *buf, int32_t value)
 {
@@ -605,10 +659,17 @@ process_requests (MonoProfiler *profiler)
                mono_gc_collect (mono_gc_max_generation ());
 }
 
+static void counters_init (MonoProfiler *profiler);
+
 static void
 runtime_initialized (MonoProfiler *profiler)
 {
        runtime_inited = 1;
+#ifndef DISABLE_HELPER_THREAD
+       counters_init (profiler); /* register the counters and log their TYPE_SAMPLE_COUNTERS_DESC description */
+#endif
+       /* ensure the main thread data and startup are available soon */
+       safe_dump (profiler, ensure_logbuf (0));
 }
 
 /*
@@ -1136,29 +1197,6 @@ thread_name (MonoProfiler *prof, uintptr_t tid, const char *name)
        EXIT_LOG (logbuffer);
 }
 
-#ifndef HOST_WIN32
-#include "mono/io-layer/atomic.h"
-#endif
-#define cmp_exchange InterlockedCompareExchangePointer
-/*#else
-static void*
-cmp_exchange (volatile void **dest, void *exch, void *comp)
-{
-       void *old;
-       __asm__ __volatile__ ("lock; "
-#ifdef __x86_64__
-               "cmpxchgq"
-#else
-               "cmpxchgl"
-#endif
-               " %2, %0"
-               : "=m" (*dest), "=a" (old)
-               : "r" (exch), "m" (*dest), "a" (comp));
-       return old;
-}
-#endif
-*/
-
 static void
 mono_sample_hit (MonoProfiler *profiler, unsigned char *ip, void *context)
 {
@@ -1196,7 +1234,7 @@ mono_sample_hit (MonoProfiler *profiler, unsigned char *ip, void *context)
        do {
                old_data = sbuf->data;
                new_data = old_data + 4;
-               data = cmp_exchange ((volatile void**)&sbuf->data, new_data, old_data);
+               data = InterlockedCompareExchangePointer ((volatile void**)&sbuf->data, new_data, old_data);
        } while (data != old_data);
        if (old_data >= sbuf->data_end)
                return; /* lost event */
@@ -1542,7 +1580,7 @@ dump_sample_hits (MonoProfiler *prof, StatBuffer *sbuf, int recurse)
                logbuffer = ensure_logbuf (20 + count * 8);
                emit_byte (logbuffer, TYPE_SAMPLE | TYPE_SAMPLE_HIT);
                emit_value (logbuffer, type);
-               emit_uvalue (logbuffer, (prof->startup_time + sample [2]) * 10000);
+               emit_uvalue (logbuffer, prof->startup_time + (uint64_t)sample [2] * (uint64_t)10000);
                emit_value (logbuffer, count);
                for (i = 0; i < count; ++i) {
                        emit_ptr (logbuffer, (void*)sample [i + 3]);
@@ -1555,15 +1593,74 @@ dump_sample_hits (MonoProfiler *prof, StatBuffer *sbuf, int recurse)
 
 #if USE_PERF_EVENTS
 #ifndef __NR_perf_event_open
+#ifdef __arm__
+#define __NR_perf_event_open 364
+#else
 #define __NR_perf_event_open 241
 #endif
+#endif
 
-static int perf_fd = -1;
-static void *mmap_base;
-static struct perf_event_mmap_page *page_desc = NULL;
-static int num_pages = 64;
+/*
+ * mono_cpu_count:
+ *
+ * Returns the number of CPUs, trying in order whatever the platform
+ * provides: the /sys "present" cpulist (Android only),
+ * sysconf (_SC_NPROCESSORS_ONLN), sysctl (HW_NCPU) and GetSystemInfo ().
+ * Falls back to 1 when none of the available methods succeeds.
+ */
+static int
+mono_cpu_count (void)
+{
+       int count = 0;
+#ifdef PLATFORM_ANDROID
+       /* Android tries really hard to save power by powering off CPUs on SMP phones which
+        * means the normal way to query cpu count returns a wrong value with userspace API.
+        * Instead we use /sys entries to query the actual hardware CPU count.
+        */
+       char buffer[8] = {'\0'};
+       int present = open ("/sys/devices/system/cpu/present", O_RDONLY);
+       /* Format of the /sys entry is a cpulist of indexes which in the case
+        * of present is always of the form "0-(n-1)" when there is more than
+        * 1 core, n being the number of CPU cores in the system. Otherwise
+        * the value is simply 0
+        */
+       /* read at most sizeof (buffer) - 1 bytes so the zero-initialized
+        * buffer always stays NUL-terminated for strtol ()
+        */
+       if (present != -1 && read (present, (char*)buffer, sizeof (buffer) - 1) > 3)
+               count = strtol (((char*)buffer) + 2, NULL, 10);
+       if (present != -1)
+               close (present);
+       /* count holds the highest cpu index here, hence the + 1 */
+       if (count > 0)
+               return count + 1;
+#endif
+#ifdef _SC_NPROCESSORS_ONLN
+       count = sysconf (_SC_NPROCESSORS_ONLN);
+       if (count > 0)
+               return count;
+#endif
+#ifdef USE_SYSCTL
+       {
+               int mib [2];
+               size_t len = sizeof (int);
+               mib [0] = CTL_HW;
+               mib [1] = HW_NCPU;
+               if (sysctl (mib, 2, &count, &len, NULL, 0) == 0)
+                       return count;
+       }
+#endif
+#ifdef HOST_WIN32
+       {
+               SYSTEM_INFO info;
+               GetSystemInfo (&info);
+               return info.dwNumberOfProcessors;
+       }
+#endif
+       /* FIXME: warn */
+       return 1;
+}
+
+typedef struct {
+       int perf_fd; /* descriptor returned by perf_event_syscall (), negative when the setup failed */
+       unsigned int prev_pos; /* position in the event data consumed so far by read_perf_mmap () */
+       void *mmap_base; /* mapping of perf_fd created in setup_perf_map () */
+       struct perf_event_mmap_page *page_desc; /* mmap_base viewed as a struct perf_event_mmap_page */
+} PerfData ;
+
+static PerfData *perf_data = NULL; /* one entry per CPU, allocated in setup_perf_event () */
+static int num_perf; /* number of entries in perf_data */
+#define PERF_PAGES_SHIFT 4 /* log2 of num_pages */
+static int num_pages = 1 << PERF_PAGES_SHIFT; /* setup_perf_map () maps num_pages + 1 pages per descriptor */
 static unsigned int mmap_mask;
-static unsigned int prev_pos = 0;
 
 typedef struct {
        struct perf_event_header h;
@@ -1584,24 +1681,25 @@ perf_event_syscall (struct perf_event_attr *attr, pid_t pid, int cpu, int group_
        return syscall(/*__NR_perf_event_open*/ 298, attr, pid, cpu, group_fd, flags);
 #elif defined(__i386__)
        return syscall(/*__NR_perf_event_open*/ 336, attr, pid, cpu, group_fd, flags);
+#elif defined(__arm__)
+       return syscall(/*__NR_perf_event_open*/ 364, attr, pid, cpu, group_fd, flags);
 #else
        return -1;
 #endif
 }
 
 static int
-setup_perf_map (void)
+setup_perf_map (PerfData *perf)
 {
-       mmap_mask = num_pages * getpagesize () - 1;
-       mmap_base = mmap (NULL, (num_pages + 1) * getpagesize (), PROT_READ|PROT_WRITE, MAP_SHARED, perf_fd, 0);
-       if (mmap_base == MAP_FAILED) {
+       perf->mmap_base = mmap (NULL, (num_pages + 1) * getpagesize (), PROT_READ|PROT_WRITE, MAP_SHARED, perf->perf_fd, 0);
+       if (perf->mmap_base == MAP_FAILED) {
                if (do_debug)
                        printf ("failed mmap\n");
                return 0;
        }
-       page_desc = mmap_base;
+       perf->page_desc = perf->mmap_base;
        if (do_debug)
-               printf ("mmap version: %d\n", page_desc->version);
+               printf ("mmap version: %d\n", perf->page_desc->version);
        return 1;
 }
 
@@ -1643,21 +1741,18 @@ dump_perf_hits (MonoProfiler *prof, void *buf, int size)
 
 /* read events from the ring buffer */
 static int
-read_perf_mmap (MonoProfiler* prof)
+read_perf_mmap (MonoProfiler* prof, int cpu)
 {
+       PerfData *perf = perf_data + cpu;
        unsigned char *buf;
-       unsigned char *data = (unsigned char*)mmap_base + getpagesize ();
-       unsigned int head = page_desc->data_head;
+       unsigned char *data = (unsigned char*)perf->mmap_base + getpagesize ();
+       unsigned int head = perf->page_desc->data_head;
        int diff, size;
        unsigned int old;
 
-#if defined(__i386__)
-       asm volatile("lock; addl $0,0(%%esp)":::"memory");
-#elif defined (__x86_64__)
-       asm volatile("lfence":::"memory");
-#endif
+       mono_memory_read_barrier ();
 
-       old = prev_pos;
+       old = perf->prev_pos;
        diff = head - old;
        if (diff < 0) {
                if (do_debug)
@@ -1681,13 +1776,13 @@ read_perf_mmap (MonoProfiler* prof)
                printf ("found bytes of events: %d\n", size);
        dump_perf_hits (prof, buf, size);
        old += size;
-       prev_pos = old;
-       page_desc->data_tail = old;
+       perf->prev_pos = old;
+       perf->page_desc->data_tail = old;
        return 0;
 }
 
 static int
-setup_perf_event (void)
+setup_perf_event_for_cpu (PerfData *perf, int cpu)
 {
        struct perf_event_attr attr;
        memset (&attr, 0, sizeof (attr));
@@ -1708,11 +1803,11 @@ setup_perf_event (void)
        attr.freq = 1;
        attr.sample_freq = sample_freq;
 
-       perf_fd = perf_event_syscall (&attr, getpid (), -1, -1, 0);
+       perf->perf_fd = perf_event_syscall (&attr, getpid (), cpu, -1, 0);
        if (do_debug)
-               printf ("perf fd: %d, freq: %d, event: %llu\n", perf_fd, sample_freq, attr.config);
-       if (perf_fd < 0) {
-               if (perf_fd == -EPERM) {
+               printf ("perf fd: %d, freq: %d, event: %llu\n", perf->perf_fd, sample_freq, attr.config);
+       if (perf->perf_fd < 0) {
+               if (perf->perf_fd == -EPERM) {
                        fprintf (stderr, "Perf syscall denied, do \"echo 1 > /proc/sys/kernel/perf_event_paranoid\" as root to enable.\n");
                } else {
                        if (do_debug)
@@ -1720,16 +1815,218 @@ setup_perf_event (void)
                }
                return 0;
        }
-       if (!setup_perf_map ()) {
-               close (perf_fd);
-               perf_fd = -1;
+       if (!setup_perf_map (perf)) {
+               close (perf->perf_fd);
+               perf->perf_fd = -1;
                return 0;
        }
        return 1;
 }
 
+/*
+ * setup_perf_event:
+ *
+ * Allocates one PerfData per CPU reported by mono_cpu_count () and tries to
+ * set up a perf event on each of them.
+ *
+ * Returns 1 if the setup worked for at least one CPU, otherwise 0 with
+ * perf_data left NULL.
+ */
+static int
+setup_perf_event (void)
+{
+       int i, count = 0;
+       mmap_mask = num_pages * getpagesize () - 1;
+       num_perf = mono_cpu_count ();
+       perf_data = calloc (num_perf, sizeof (PerfData));
+       /* the per-CPU setup below writes into the array: bail out if it could not be allocated */
+       if (!perf_data)
+               return 0;
+       for (i = 0; i < num_perf; ++i) {
+               count += setup_perf_event_for_cpu (perf_data + i, i);
+       }
+       if (count)
+               return 1;
+       free (perf_data);
+       perf_data = NULL;
+       return 0;
+}
+
 #endif /* USE_PERF_EVENTS */
 
+#ifndef DISABLE_HELPER_THREAD
+
+typedef struct MonoCounterAgent {
+       MonoCounter *counter; /* the counter this agent samples */
+       // MonoCounterAgent specific data :
+       void *value; /* last value logged for the counter (zero-filled when first allocated), NULL before the first sample */
+       size_t value_size; /* number of bytes allocated for value */
+       short index; /* index identifying the counter in the log, taken from counters_index */
+       struct MonoCounterAgent *next; /* next agent in the list, NULL for the last one */
+} MonoCounterAgent;
+
+static MonoCounterAgent* counters; /* head of the agent list; new agents are appended at the end */
+static gboolean counters_initialized = FALSE; /* set at the end of counters_init (); counters_sample () returns early until then */
+static int counters_index = 1; /* next index to hand out; 0 is emitted by counters_sample () as the end marker */
+
+/*
+ * counters_init_add_counter:
+ *
+ * Callback passed to mono_counters_foreach (): appends an agent for counter
+ * to the end of the counters list, unless the counter already has one.
+ * data is unused.
+ *
+ * Returns TRUE if the counter is registered (now or previously), FALSE if
+ * the agent could not be allocated.
+ */
+static mono_bool
+counters_init_add_counter (MonoCounter *counter, gpointer data)
+{
+       MonoCounterAgent *agent, *item;
+
+       /* each counter gets at most one agent */
+       for (agent = counters; agent; agent = agent->next) {
+               if (agent->counter == counter)
+                       return TRUE;
+       }
+
+       agent = malloc (sizeof (MonoCounterAgent));
+       if (!agent)
+               return FALSE; /* NOTE(review): presumably stops the enumeration - confirm against mono_counters_foreach () */
+       agent->counter = counter;
+       agent->value = NULL;
+       agent->value_size = 0;
+       agent->index = counters_index++;
+       agent->next = NULL;
+
+       if (!counters) {
+               counters = agent;
+       } else {
+               /* append at the tail so the list keeps registration order */
+               item = counters;
+               while (item->next)
+                       item = item->next;
+               item->next = agent;
+       }
+
+       return TRUE;
+}
+
+/*
+ * counters_init:
+ *
+ * Registers an agent for every counter enumerated by mono_counters_foreach ()
+ * and logs a TYPE_SAMPLE_COUNTERS_DESC event holding the number of agents
+ * followed by the section, name, type, unit, variance and index of each.
+ * Sets counters_initialized once the description has been logged.
+ */
+static void
+counters_init (MonoProfiler *profiler)
+{
+       MonoCounterAgent *cur;
+       LogBuffer *logbuffer;
+       int needed = 1 + 5;
+       int total = 0;
+
+       mono_counters_foreach (counters_init_add_counter, NULL);
+
+       /* reserve room for the name plus five encoded values per counter */
+       for (cur = counters; cur; cur = cur->next) {
+               needed += strlen (mono_counter_get_name (cur->counter)) + 1 + 5 * 5;
+               total += 1;
+       }
+
+       logbuffer = ensure_logbuf (needed);
+
+       ENTER_LOG (logbuffer, "counters");
+       emit_byte (logbuffer, TYPE_SAMPLE_COUNTERS_DESC | TYPE_SAMPLE);
+       emit_value (logbuffer, total);
+       cur = counters;
+       while (cur) {
+               const char *name = mono_counter_get_name (cur->counter);
+               emit_value (logbuffer, mono_counter_get_section (cur->counter));
+               emit_string (logbuffer, name, strlen (name) + 1);
+               emit_value (logbuffer, mono_counter_get_type (cur->counter));
+               emit_value (logbuffer, mono_counter_get_unit (cur->counter));
+               emit_value (logbuffer, mono_counter_get_variance (cur->counter));
+               emit_value (logbuffer, cur->index);
+               cur = cur->next;
+       }
+       EXIT_LOG (logbuffer);
+
+       counters_initialized = TRUE;
+}
+
+/*
+ * counters_sample:
+ *
+ * Samples every registered counter and logs a TYPE_SAMPLE_COUNTERS event:
+ * timestamp, then one (index, type, value) entry for each counter whose
+ * value differs from the last one logged, then a terminating index 0.
+ * Integer counters are logged as the difference from the last logged value,
+ * doubles and strings as the sampled value itself.
+ *
+ * Does nothing until counters_init () has completed. Counters whose size or
+ * value cannot be obtained, or for which memory cannot be allocated, are
+ * skipped.
+ */
+static void
+counters_sample (MonoProfiler *profiler, uint64_t timestamp)
+{
+       MonoCounterAgent *agent;
+       MonoCounter *counter;
+       LogBuffer *logbuffer;
+       int type;
+       int buffer_size;
+       int counter_size;
+       void *buffer;
+       void *grown;
+       int size;
+
+       if (!counters_initialized)
+               return;
+
+       buffer_size = 8;
+       buffer = calloc (1, buffer_size);
+       if (!buffer)
+               return;
+
+       size = 1 + 10 + 5;
+       for (agent = counters; agent; agent = agent->next) {
+               /* a negative (error) size must not shrink the reservation */
+               counter_size = mono_counter_get_size (agent->counter);
+               size += 10 * 2 + (counter_size > 0 ? counter_size : 0);
+       }
+
+       logbuffer = ensure_logbuf (size);
+
+       ENTER_LOG (logbuffer, "counters");
+       emit_byte (logbuffer, TYPE_SAMPLE_COUNTERS | TYPE_SAMPLE);
+       emit_uvalue (logbuffer, timestamp);
+       for (agent = counters; agent; agent = agent->next) {
+               counter = agent->counter;
+
+               /* kept signed so that the error check below can actually trigger */
+               counter_size = mono_counter_get_size (counter);
+               if (counter_size < 0) {
+                       continue; // FIXME error
+               } else if (counter_size > buffer_size) {
+                       grown = realloc (buffer, counter_size);
+                       if (!grown)
+                               continue; /* buffer is still valid at its old size */
+                       buffer = grown;
+                       buffer_size = counter_size;
+               }
+
+               memset (buffer, 0, buffer_size);
+
+               if (mono_counters_sample (counter, buffer, counter_size) < 0)
+                       continue; // FIXME error
+
+               type = mono_counter_get_type (counter);
+
+               if (!agent->value) {
+                       /* first sample: the reference value is all zeroes */
+                       agent->value = calloc (1, counter_size);
+                       /* only an empty string can be handled without a reference copy */
+                       if (!agent->value && (counter_size > 0 || type != MONO_COUNTER_STRING))
+                               continue;
+                       agent->value_size = counter_size;
+               } else {
+                       /* unchanged since the last logged value: nothing to emit */
+                       if (type == MONO_COUNTER_STRING) {
+                               if (strncmp (agent->value, buffer, counter_size) == 0)
+                                       continue;
+                       } else {
+                               if (agent->value_size == (size_t)counter_size && memcmp (agent->value, buffer, counter_size) == 0)
+                                       continue;
+                       }
+               }
+
+               emit_uvalue (logbuffer, agent->index);
+               emit_uvalue (logbuffer, type);
+               switch (type) {
+               case MONO_COUNTER_INT:
+#if SIZEOF_VOID_P == 4
+               case MONO_COUNTER_WORD:
+#endif
+                       emit_svalue (logbuffer, *(int*)buffer - *(int*)agent->value);
+                       break;
+               case MONO_COUNTER_UINT:
+                       emit_uvalue (logbuffer, *(guint*)buffer - *(guint*)agent->value);
+                       break;
+               case MONO_COUNTER_TIME_INTERVAL:
+               case MONO_COUNTER_LONG:
+#if SIZEOF_VOID_P == 8
+               case MONO_COUNTER_WORD:
+#endif
+                       emit_svalue (logbuffer, *(gint64*)buffer - *(gint64*)agent->value);
+                       break;
+               case MONO_COUNTER_ULONG:
+                       emit_uvalue (logbuffer, *(guint64*)buffer - *(guint64*)agent->value);
+                       break;
+               case MONO_COUNTER_DOUBLE:
+                       emit_double (logbuffer, *(double*)buffer);
+                       break;
+               case MONO_COUNTER_STRING:
+                       if (counter_size == 0)
+                               emit_string (logbuffer, "(null)", 7);
+                       else
+                               emit_string (logbuffer, (char*)buffer, counter_size);
+                       break;
+               default:
+                       assert (0);
+               }
+
+               /* grow the stored copy before overwriting it, whatever the type */
+               if ((size_t)counter_size > agent->value_size) {
+                       grown = realloc (agent->value, counter_size);
+                       if (!grown)
+                               continue; /* keep the previous copy untouched */
+                       agent->value = grown;
+                       agent->value_size = counter_size;
+               }
+
+               if (counter_size > 0)
+                       memcpy (agent->value, buffer, counter_size);
+       }
+       free (buffer);
+
+       /* index 0 marks the end of the entries */
+       emit_value (logbuffer, 0);
+       EXIT_LOG (logbuffer);
+}
+
+#endif /* DISABLE_HELPER_THREAD */
+
 static void
 log_shutdown (MonoProfiler *prof)
 {
@@ -1743,8 +2040,11 @@ log_shutdown (MonoProfiler *prof)
        }
 #endif
 #if USE_PERF_EVENTS
-       if (page_desc)
-               read_perf_mmap (prof);
+       if (perf_data) {
+               int i;
+               for (i = 0; i < num_perf; ++i)
+                       read_perf_mmap (prof, i);
+       }
 #endif
        dump_sample_hits (prof, prof->stat_buffers, 1);
        take_lock ();
@@ -1831,6 +2131,9 @@ helper_thread (void* arg)
        int len;
        char buf [64];
        MonoThread *thread = NULL;
+       GTimeVal start, now;
+
+       g_get_current_time (&start);
 
        //fprintf (stderr, "Server listening\n");
        command_socket = -1;
@@ -1850,12 +2153,21 @@ helper_thread (void* arg)
                                max_fd = command_socket;
                }
 #if USE_PERF_EVENTS
-               if (perf_fd >= 0) {
-                       FD_SET (perf_fd, &rfds);
-                       if (max_fd < perf_fd)
-                               max_fd = perf_fd;
+               if (perf_data) {
+                       int i;
+                       for ( i = 0; i < num_perf; ++i) {
+                               if (perf_data [i].perf_fd < 0)
+                                       continue;
+                               FD_SET (perf_data [i].perf_fd, &rfds);
+                               if (max_fd < perf_data [i].perf_fd)
+                                       max_fd = perf_data [i].perf_fd;
+                       }
                }
 #endif
+               g_get_current_time (&now);
+               counters_sample (prof, (uint64_t)(now.tv_sec * 1000 + now.tv_usec / 1000) - (uint64_t)(start.tv_sec * 1000 + start.tv_usec / 1000));
+               safe_dump (prof, ensure_logbuf (0));
+
                tv.tv_sec = 1;
                tv.tv_usec = 0;
                len = select (max_fd + 1, &rfds, NULL, NULL, &tv);
@@ -1886,16 +2198,30 @@ helper_thread (void* arg)
                        if (do_debug)
                                fprintf (stderr, "helper shutdown\n");
 #if USE_PERF_EVENTS
-                       if (perf_fd >= 0)
-                               read_perf_mmap (prof);
+                       if (perf_data) {
+                               int i;
+                               for ( i = 0; i < num_perf; ++i) {
+                                       if (perf_data [i].perf_fd < 0)
+                                               continue;
+                                       if (FD_ISSET (perf_data [i].perf_fd, &rfds))
+                                               read_perf_mmap (prof, i);
+                               }
+                       }
 #endif
                        safe_dump (prof, ensure_logbuf (0));
                        return NULL;
                }
 #if USE_PERF_EVENTS
-               if (perf_fd >= 0 && FD_ISSET (perf_fd, &rfds)) {
-                       read_perf_mmap (prof);
-                       safe_dump (prof, ensure_logbuf (0));
+               if (perf_data) {
+                       int i;
+                       for ( i = 0; i < num_perf; ++i) {
+                               if (perf_data [i].perf_fd < 0)
+                                       continue;
+                               if (FD_ISSET (perf_data [i].perf_fd, &rfds)) {
+                                       read_perf_mmap (prof, i);
+                                       safe_dump (prof, ensure_logbuf (0));
+                               }
+                       }
                }
 #endif
                if (command_socket >= 0 && FD_ISSET (command_socket, &rfds)) {
@@ -2038,7 +2364,7 @@ create_profiler (const char *filename)
 #if USE_PERF_EVENTS
        if (sample_type && !do_mono_sample)
                need_helper_thread = setup_perf_event ();
-       if (perf_fd < 0) {
+       if (!perf_data) {
                /* FIXME: warn if different freq or sample type */
                do_mono_sample = 1;
        }
@@ -2047,6 +2373,9 @@ create_profiler (const char *filename)
                prof->stat_buffers = create_stat_buffer ();
                need_helper_thread = 1;
        }
+       if (do_counters && !need_helper_thread) {
+               need_helper_thread = 1;
+       }
 #ifndef DISABLE_HELPER_THREAD
        if (hs_mode_ondemand || need_helper_thread) {
                if (!start_helper_thread (prof))
@@ -2072,6 +2401,7 @@ usage (int do_exit)
        printf ("\t[no]calls        enable/disable recording enter/leave method events\n");
        printf ("\theapshot[=MODE]  record heap shot info (by default at each major collection)\n");
        printf ("\t                 MODE: every XXms milliseconds, every YYgc collections, ondemand\n");
+       printf ("\tcounters         sample counters every 1s\n");
        printf ("\tsample[=TYPE]    use statistical sampling mode (by default cycles/1000)\n");
        printf ("\t                 TYPE: cycles,instr,cacherefs,cachemiss,branches,branchmiss\n");
        printf ("\t                 TYPE can be followed by /FREQUENCY\n");
@@ -2343,6 +2673,10 @@ mono_profiler_startup (const char *desc)
                        free (val);
                        continue;
                }
+               if ((opt = match_option (p, "counters", NULL)) != p) {
+                       do_counters = 1;
+                       continue;
+               }
                if (opt == p) {
                        usage (0);
                        exit (0);