doc/handbook/runtime.tex

   1 \chapter{Run Time System}
   2
   3 \section{Introduction}
   4
   5 \section{Object layout}
   6
   7 \section{Method invocation}
   8
   9 Java and the programming language Theta do not implement multiple
  10 inheritance, but single inheritance with multiple subtyping. This important
  11 difference makes object layout and method dispatch more efficient. Although
  12 the bidirectional layout was designed for a language with multiple
  13 subtyping, it has the problem that more than one {\em vtbl} pointer has to
  14 be included in objects. The CACAO JIT compiler \cite{KrGr97b} moves the
  15 dispatch header of the bidirectional layout into the class information with
  16 negative offsets from the {\em vtbl}. For each implemented interface there
  17 is a distinct interface {\em vtbl}. Unimplemented interfaces are
  18 represented by null pointers.
  19 %
  20 \begin{figure*}[htb]
  21 \begin{center}
  22 \setlength{\unitlength}{1mm}
  23 \begin{picture}(114,72)
  24 \put( 0,42){\vector(1,0){12}}
  25 \put( 0,42){\makebox(8,6){\it objptr}}
  26 \put(12,60){\makebox(24,6){\it object}}
  27 \put(48,63){\makebox(30,6){\it class}}
  28 \put(90,66){\makebox(24,6){\it code}}
  29 \put(12,42){\makebox(24,18){\it instance data}}
  30 \put(12,36){\makebox(24,6){\it class pointer}}
  31 \put(12,42){\line(1,0){24}}
  32 \put(36,39){\vector(1,0){12}}
  33 \put(48,57){\makebox(30,6){\it method pointer}}
  34 \put(48,57){\line(1,0){30}}
  35 \put(48,51){\makebox(30,6){\it method pointer}}
  36 \put(48,51){\line(1,0){30}}
  37 \put(48,45){\makebox(30,6){\it method pointer}}
  38 \put(48,45){\line(1,0){30}}
  39 \put(48,39){\makebox(30,6){\it class info}}
  40 \put(48,27){\makebox(30,6){\it interface pointer}}
  41 \put(48,33){\line(1,0){30}}
  42 \put(48,33){\makebox(30,6){\it interface pointer}}
  43 \put(48, 0){\makebox(30,6){\it method pointer}}
  44 \put(48, 6){\makebox(30,6){\it method pointer}}
  45 \put(48, 6){\line(1,0){30}}
  46 \put(48,21){\makebox(30,6){\it interfaces}}
  47 \put(90,36){\framebox(24,6){\it method code}}
  48 \put(44,15){\vector(1,0){4}}
  49 \put(44,15){\line(0,1){15}}
  50 \put(44,30){\line(1,0){4}}
  51 \put(40, 0){\vector(1,0){8}}
  52 \put(40, 0){\line(0,1){36}}
  53 \put(40,36){\line(1,0){8}}
  54 \put(78, 3){\line(1,0){8}}
  55 \put(86, 3){\line(0,1){51}}
  56 \put(78, 9){\line(1,0){4}}
  57 \put(82, 9){\line(0,1){39}}
  58 \put(78,18){\line(1,0){4}}
  59 \put(78,54){\line(1,0){8}}
  60 \put(86,36){\vector(1,0){4}}
  61 \put(90,48){\framebox(24,6){\it method code}}
  62 \put(78,48){\vector(1,0){12}}
  63 \put(90,60){\framebox(24,6){\it method code}}
  64 \put(78,60){\vector(1,0){12}}
  65 \thicklines
  66 \put(48, 0){\framebox(30,12){}}
  67 \put(48,15){\framebox(30,6){\it method pointer}}
  68 \put(12,36){\framebox(24,24){}}
  69 \put(48,27){\framebox(30,36){}}
  70 \put(48,39){\line(1,0){30}}
  71 \end{picture}
  72 \caption{CACAO object and compact class descriptor layout}
  73 \label{objectcompactlayout}
  74 \end{center}
  75 \end{figure*}
  76
  77 To call a virtual method, two memory access instructions are necessary
  78 (load the class pointer, load the method pointer) followed by the call
  79 instruction. Calling an interface method needs an additional indirection.
  80 %
  81 \begin{figure*}[htb]
  82 \begin{center}
  83 \setlength{\unitlength}{1mm}
  84 \begin{picture}(114,42)
  85 \put(0,15){\vector(1,0){12}}
  86 \put(0,15){\makebox(12,6){\it objptr}}
  87 \put(12,33){\makebox(24,6){\it object}}
  88 \put(48,36){\makebox(30,6){\it class}}
  89 \put(90,39){\makebox(24,6){\it code}}
  90 \put(12,15){\makebox(24,18){\it instance data}}
  91 \put(12,9){\makebox(24,6){\it class pointer}}
  92 \put(12,15){\line(1,0){24}}
  93 \put(36,12){\vector(1,0){12}}
  94 \put(48,30){\makebox(30,6){\it method pointer}}
  95 \put(48,30){\line(1,0){30}}
  96 \put(48,24){\makebox(30,6){\it method pointer}}
  97 \put(48,24){\line(1,0){30}}
  98 \put(48,18){\makebox(30,6){\it method pointer}}
  99 \put(48,18){\line(1,0){30}}
 100 \put(48,12){\makebox(30,6){\it class info}}
 101 \put(48,0){\makebox(30,6){\it ifmethod pointer}}
 102 \put(48,6){\line(1,0){30}}
 103 \put(48,6){\makebox(30,6){\it ifmethod pointer}}
 104 \put(90,3){\framebox(24,6){\it method code}}
 105 \put(78,3){\vector(1,0){12}}
 106 \put(82,3){\line(0,1){18}}
 107 \put(78,21){\line(1,0){4}}
 108 \put(78,27){\line(1,0){8}}
 109 \put(78,9){\line(1,0){8}}
 110 \put(86,9){\line(0,1){18}}
 111 \put(86,18){\vector(1,0){4}}
 112 \put(90,18){\framebox(24,6){\it method code}}
 113 \put(78,33){\vector(1,0){12}}
 114 \put(90,33){\framebox(24,6){\it method code}}
 115 \thicklines
 116 \put(12,9){\framebox(24,24){}}
 117 \put(48,0){\framebox(30,36){}}
 118 \put(48,12){\line(1,0){30}}
 119 \end{picture}
 120 \caption{CACAO object and fast class descriptor layout}
 121 \label{objectfastlayout}
 122 \end{center}
 123 \end{figure*}
 124
 125 In the faster scheme, we store interface methods in an additional table at
 126 negative offsets from the {\em vtbl} (see figure~\ref{objectfastlayout}).
 127 Segregating the interface virtual function table keeps the standard virtual
 128 function table small and allows interface methods to be called with just
 129 two memory accesses. The memory consumption of virtual function tables
 130 containing interface and class methods would be {\em number of (classes +
 131 interfaces) $\times$ number of distinct methods}. The memory consumption of
 132 the interface tables is only {\em number of classes which implement
 133 interfaces $\times$ number of interface methods}. Colouring can be used to
 134 reduce the number of distinct offsets for interface methods further, but
 135 complicates dynamic class loading, leading to renumbering and code
 136 patching.
 137
 138 The Jalapeno virtual machine implements an interface method invocation
 139 similar to the fast class descriptor layout of CACAO, but instead of
 140 colouring hashing of the method indices is used \cite{Alpern+01}. The table
 141 for the interface method pointers is allocated with a fixed size much
 142 smaller than the number of interface methods. When two method indices hash
 143 to the same offset, a conflict resolving stub is called instead of the
 144 interface methods directly. For conflict resolution the stub is passed the
 145 method index in a scratch register as additional argument. An interface
 146 method invocation can be executed with the following four machine
 147 instructions:
 148 %
 149 \begin{verbatim}
 150 LD  vtblptr,(obj)                  ; load vtbl pointer
 151 LD  mptr,hash(method_ptr)(vtblptr) ; load method pointer
 152 MV  mindex,idreg                   ; load method index
 153 JSR (mptr)                         ; call method (or conflict stub)
 154 \end{verbatim}
 155 %
 156 The number of machine instructions is the same as in the compact class
 157 descriptor layout of CACAO, but the indirection is eliminated, which
 158 reduces the number of cycles needed for the execution of this instruction
 159 sequence on a pipelined architecture. Compared to the old interface method
 160 invocation in the Jalapeno VM, which searched the superclass hierarchy for
 161 a matching method signature, the new method resulted in changes in the
 162 run-time from one percent slowdowns to speedups up to 51 percent.
 163
 164
 165 \section{Run time type checking}
 166
 167 Since type tests for trees are more (space) efficient than type tests for
 168 DAGs, a possible solution is to split a DAG into a tree part and the
 169 remaining graph. For languages with single inheritance and multiple
 170 subtyping this partitioning of the class hierarchy is already done in the
 171 language itself.
 172
 173 \begin{figure}[htb]
 174 \begin{center}
 175 \setlength{\unitlength}{1mm}
 176 \begin{picture}(86,26)
 177 \thicklines
 178 \put(23, 3){\circle{6}}
 179 \put(43, 3){\circle{6}}
 180 \put(63, 3){\circle{6}}
 181 \put(33,13){\circle{6}}
 182 \put(53,13){\circle{6}}
 183 \put(43,23){\circle{6}}
 184 \put(20, 0){\makebox(6,6){D}}
 185 \put(40, 0){\makebox(6,6){E}}
 186 \put(60, 0){\makebox(6,6){F}}
 187 \put(30,10){\makebox(6,6){B}}
 188 \put(50,10){\makebox(6,6){C}}
 189 \put(40,20){\makebox(6,6){A}}
 190 \put(19, 0){\makebox(0,6)[r]{\{3,0\}}}
 191 \put(47, 0){\makebox(6,6)[l]{\{4,0\}}}
 192 \put(67, 0){\makebox(0,6)[l]{\{6,0\}}}
 193 \put(29,10){\makebox(0,6)[r]{\{2,2\}}}
 194 \put(57,10){\makebox(0,6)[l]{\{5,1\}}}
 195 \put(47,20){\makebox(0,6)[l]{\{1,5\}}}
 196 \put(25, 5){\line( 1,1){6}}
 197 \put(35,15){\line( 1,1){6}}
 198 \put(41, 5){\line(-1,1){6}}
 199 \put(51,15){\line(-1,1){6}}
 200 \put(61, 5){\line(-1,1){6}}
 201 \end{picture}
 202 \caption{Relative numbering with \{{\tt baseval}, {\tt diffval}\} pairs}
 203 \label{relnumbering}
 204 \end{center}
 205 \end{figure}
 206
 207 CACAO uses a subtype test based on relative numbering for classes and a
 208 kind of matrix implementation for interfaces. Two numbers {\em low} and
 209 {\em high} are stored for each class in the class hierarchy. A depth first
 210 traversal of the hierarchy increments a counter for each class and assigns
 211 the counter to the {\em low} field when the class is first encountered and
 212 assigns the counter to the {\em high} field when the traversal leaves the
 213 class. In languages with dynamic class loading a renumbering of the
 214 hierarchy is needed whenever a class is added. A class {\em sub} is a
 215 subtype of another class {\em super}, if {\em super.low $\le$ sub.low $\le$
 216 super.high}. Since a range check is implemented more efficiently by an
 217 unsigned comparison, CACAO stores the difference between the {\em low} and
 218 {\em high} values and compares it against the difference of the {\em low}
 219 values of both classes. The code for {\tt instanceof} looks similar to:
 220
 221 \begin{verbatim}
 222 return (unsigned) (sub->vftbl->baseval - super->vftbl->baseval) <=
 223        (unsigned) (super->vftbl->diffval);
 224 \end{verbatim}
 225
 226 For leaf nodes in the class hierarchy the {\tt diffval} is 0 which results
 227 in a faster test (a simple comparison of the {\tt baseval} fields of the
 228 sub- and superclass). In general a JIT compiler can generate the faster
 229 test only for final classes. An AOT compiler or a JIT compiler which does
 230 patching of the already generated machine code may additionally replace
 231 both the {\tt baseval} and the {\tt diffval} of the superclass by a
 232 constant. Currently CACAO uses constants only when dynamic class loading is
 233 not used.
 234
 235 CACAO stores an interface table at negative offsets from the virtual method
 236 table (see figure~\ref{objectcompactlayout}). This table is needed for the
 237 invocation of interface methods. The same table is additionally used by the
 238 subtype test for interfaces. If the table is empty for the index of the
 239 superclass, the subtype test fails. The code for {\tt instanceof} looks
 240 similar to:
 241
 242 \begin{verbatim}
 243 return (sub->vftbl->interfacetable[-super->index] != NULL);
 244 \end{verbatim}
 245
 246 Both subtype tests can be implemented by very few machine code instructions
 247 without using branches which are expensive on modern processors.
 248
 249
 250 \section{Exception handling}
 251
 252 \subsection{Introduction}
 253
 254 Exceptions in Java occur either implicitly or explicitly. Typical
 255 implicit exceptions are references to the {\tt null} pointer, array
 256 index out of bounds and division by zero. Exceptions also can be
 257 raised explicitly with the {\tt throw} instruction. To handle
 258 exceptions occurring during execution, code which can raise an
 259 exception is included in a {\tt try} block. An efficient
 260 implementation of exception handling has to take care of managing {\tt
 261 try} blocks and to check for implicit exceptions efficiently .
 262
 263
 264 \subsection{Known implementation techniques}
 265
 266 Three standard methods exist for implementing exception handling:
 267
 268 \begin{itemize}
 269
 270 \item dynamically create a linked list of {\tt try} block data
 271       structures,
 272
 273 \item use static {\tt try} block tables and search these tables at run
 274       time (suggested for JavaVM interpreters),
 275
 276 \item use functions with two return values.
 277
 278 \end{itemize}
 279
 280 The first method has been used in portable implementations of
 281 exception handling for C++ \cite{Cameron+92} or Ada \cite{Giering+94}
 282 using {\tt setjmp} and {\tt longjmp}. A linked exception handling data
 283 structure is created when entering a {\tt try} block and the structure
 284 is discarded when leaving the protected block. Java requires precise
 285 exceptions. It means that all expressions evaluated before the
 286 exception raising instruction must have finished and all expressions
 287 after the raising instruction must not have been started. Therefore,
 288 in practice, some instructions may not be moved freely. In the case of
 289 subroutine calls, the callee-saved registers must be restored to their
 290 original value. The data structure can be used to store such
 291 information. The disadvantage of this method is that creating and
 292 discarding of the data structure takes some time even if an exception
 293 is never raised.
 294
 295 The second method has been suggested for an efficient exception
 296 handling implementation of C++ \cite{KoenigStroustrup90} and is used
 297 in Java implementations. For every method, the JavaVM maintains an
 298 exception table. This exception table contains the program counter of
 299 the start and the end of the {\tt try} block, the program counter of
 300 the exception handler and the type of the exception. A JavaVM
 301 interpreter can easily interpret this structure and dispatch to the
 302 corresponding handler code. If the byte code is translated to native
 303 code, the equivalent technique is more complicated.
 304
 305 To simplify restoration of the registers, the old CACAO implementation
 306 used a different scheme \cite{KrGr97b}. A method has two return
 307 values: the real return value and an exception value stored in a
 308 register. After each method call, the exception register is checked
 309 and, if it is non-zero, the exception handling code is executed. Since
 310 an exception is rarely raised, the branch is easy to predict and
 311 cheap. Entering and leaving a {\tt try} block have no associated
 312 costs. At compile time, the dispatch information contained in the
 313 exception table is translated into method dispatching native code.
 314
 315 Run time checks for null pointers and array bounds are quite frequent,
 316 but can be eliminated in many cases. It is often possible to move a
 317 loop invariant null pointer check before the loop or to eliminate a
 318 bound check. Some more sophisticated compilers use these code motion
 319 techniques.
 320
 321
 322 \subsection{Motivation for a change}
 323
 324 \begin{table*}
 325 \begin{center}
 326 \begin{tabular}[b]{|l|c|c|c|c|c|}
 327 \hline
 328                     & JavaLex & javac & espresso & Toba & java\_cup \\
 329 \hline
 330 null pointer checks &  6859   &  8197 &  11114   & 5825 &   7406    \\
 331 \hline
 332 method calls        &  3226   &  7498 &   7515   & 4401 &   5310    \\
 333 \hline
 334 try blocks          &    20   &   113 &     44   &   28 &     27    \\
 335 \hline
 336 \end{tabular}
 337 \caption{Number of pointer checks, method invocations and try blocks}
 338 \label{MethodTry}
 339 \end{center}
 340 \end{table*}
 341
 342 The old CACAO implementation was simple, but it only makes sense if
 343 the number of try blocks is high. We made an empirical study to count
 344 the numbers of static occurrences of method invocations and of {\tt
 345 try} blocks in some applications (see table \ref{MethodTry}). The
 346 number of method invocations is two magnitudes bigger than the number
 347 of {\tt try} blocks. Furthermore, an exception is rarely raised during
 348 program execution. This led us to a new implementation of exception
 349 handling.
 350
 351
 352 \subsection{The new exception handling scheme}
 353
 354 The new exception handling scheme is similar to that in a JavaVM
 355 interpreter. If an exception occurs, information in the exception
 356 table is interpreted. However native code complicates the matter.
 357
 358 The pointers to Java byte code must be replaced by pointers to native
 359 code. It requires that, during native code generation, the order of
 360 basic blocks not be allowed to change. If basic blocks are eliminated
 361 because of dead code, the information about a block can not be
 362 discarded if there is a pointer to it in the exception table.
 363
 364 A CACAO stack frame only contains copies of saved or spilled
 365 registers. There is no saved frame pointer. The size of a stack frame
 366 is only contained in the instructions which allocate and deallocate
 367 the stack. Therefore, to support exception handling, additional
 368 information has to be stored elsewhere.
 369
 370 The code for a method needs access to constants (mostly address
 371 constants). Since a global constant table would be too large for short
 372 address ranges and, because methods are compiled on demand, every
 373 method has its own constant area which is allocated directly before
 374 the start of the method code (see fig.\ \ref{methodlayout}). A
 375 register is reserved which contains the method pointer. Constants are
 376 addressed relative to the method pointer.
 377
 378 \begin{figure}[htb]
 379 \begin{center}
 380 \setlength{\unitlength}{1mm}
 381 \begin{picture}(50,18)
 382 \put(24,6){\vector(-1,0){6}}
 383 \put(24,3){\makebox(25,6){\it method pointer}}
 384 \put(0,0){\makebox(18, 6){\it constants}}
 385 \put(0,6){\makebox(18,12){\it code}}
 386 \thicklines
 387 \put(0,6){\line(1,0){18}}
 388 \put(0,0){\framebox(18,18){}}
 389 \end{picture}
 390 \caption{CACAO method layout}
 391 \label{methodlayout}
 392 \end{center}
 393 \end{figure}
 394
 395 During a method call, the method pointer of the calling method is
 396 destroyed. However the return address is stored in a register which is
 397 preserved during execution of the called method and has to be used for
 398 returning from the method. After a method return, the method pointer
 399 of the calling method is recomputed using the return address. The
 400 following code for a method call demonstrates the method calling
 401 convention:
 402
 403 \begin{verbatim}
 404 LDQ cp,(obj)     ; load class pointer
 405 LDQ mp,met(cp)   ; load method pointer
 406 JSR ra,(mp)      ; call method
 407 LDA mp=ra+offset ; recompute method pointer
 408 \end{verbatim}
 409
 410 At the beginning of the constant area, there are fields which contain
 411 the necessary information for register restoration:
 412
 413 \begin{mylist}{{\tt framesize}}
 414 \item[{\tt framesize}] the size of the stack frame
 415 \item[{\tt isleaf}]    a flag which is true if the method is a leaf
 416 method
 417 \item[{\tt intsave}]   number of saved integer registers
 418 \item[{\tt floatsave}] number of saved floating point registers
 419 \item[{\tt extable}]   the exception table -- similar to the JavaVM
 420 table
 421 \end{mylist}
 422
 423 The exception handler first checks if the current executing method has
 424 an associated handler and may dispatch to this handler. If there is no
 425 handler, it unwinds the stack and searches the parent methods for a
 426 handler. The information in the constant area is used to restore the
 427 registers and update the stack pointer. The return address and the
 428 offset in the immediately following {\tt LDA} instruction is used to
 429 recompute the method pointer.
 430
 431 \begin{table*}
 432 \begin{center}
 433 \begin{tabular}[b]{|l|c|c|c|c|c|}
 434 \hline
 435           & JavaLex & javac  & espresso & Toba  & java\_cup \\ \hline
 436 CACAO old &  61629  & 156907 &  122951  & 67602 &   87489   \\ \hline
 437 CACAO new &  37523  &  86346 &   69212  & 41315 &   52386   \\ \hline
 438 \end{tabular}
 439 \caption{Number of generated native instructions}
 440 \label{CodeSize}
 441 \end{center}
 442 \end{table*}
 443
 444 The change to the new scheme allowed us to implement the null pointer
 445 check for free. We protect the first 64 Kbyte of memory against read
 446 and write access. If an bus error is raised, we catch the signal and
 447 check if the faulting address is within the first 64K. If this is the
 448 case, we dispatch to our exception handler, otherwise we propagate the
 449 signal. This gives faster programs and reduces the work of the
 450 compiler in generating pointer checking code. As shown in table
 451 \ref{MethodTry}, the numbers of null pointer checks are quite high.
 452
 453
 454 \section{Garbage collector}
 455
 456