--- /dev/null
+\section{Alpha code generator}
--- /dev/null
+\documentclass[a4paper,twoside]{book}
+
+%\pagestyle{myheadings} \footheight 12pt \footskip 72pt
+
+%\pagestyle{empty}
+
+\pagestyle{headings}
+\setlength{\textwidth}{16cm}
+\setlength{\textheight}{23cm}
+\setlength{\topmargin}{0mm}
+\setlength{\oddsidemargin}{0cm}
+\setlength{\evensidemargin}{0cm}
+
+
+\parskip\medskipamount
+
+\newenvironment{mylist}[1]{
+ \begin{list}{}
+ {\settowidth{\labelwidth}{#1}
+ \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep}
+ \itemsep0ex \parsep0ex
+ \renewcommand{\makelabel}[1]{##1\hfill}
+ }
+ }{\end{list}}
+
+\title{The CACAO Java Virtual Machine}
+
+\author{Andreas Krall et al.}
+
+
+\date{}
+
+
+
+\begin{document}
+
+
+\maketitle
+
+\tableofcontents
+
+\include{intro}
+
+\include{overview}
+
+\include{loader}
+
+\include{runtime}
+ \include{threads}
+ \include{native}
+ \include{reflection}
+
+\include{jit}
+ \include{verification}
+ \include{loopopt}
+ \include{inlining}
+ \include{alpha}
+ \include{mips}
+ \include{powerpc}
+ \include{x86}
+
+\include{library}
+
+%\include{}
+\bibliographystyle{alpha}
+\bibliography{java}
+\end{document}
+
--- /dev/null
+\section{Method inlining}
--- /dev/null
+\chapter{Introduction}
+
+\section{Introduction}
+
+\section{History}
+
--- /dev/null
+@PROCEEDINGS{ISMM98,
+ key = "ISMM",
+ booktitle = {Proceedings of the First International Symposium on
+ Memory Management},
+ title = {Proceedings of the First International Symposium on
+ Memory Management},
+ editor = {Richard Jones},
+ address = {Vancouver},
+ series = {SIGPLAN},
+ volume = "34(3)",
+ publisher = {ACM},
+ month = oct,
+ year = 1998,
+ ISBN = {1-58113-114-3},
+ note = {ISMM is the successor to the IWMM series of
+ workshops}
+}
+
+@INPROCEEDINGS{Chan:Dadoo:Santhanam:usenix:summer:1990,
+ AUTHOR = {Paul Chan and Manoj Dadoo and Vatsa Santhanam},
+ TITLE = {Evolution of the {U-code} Compiler Intermediate
+ Language at {Hewlett-Packard}},
+ BOOKTITLE = {Proceedings of the 1990 Summer USENIX Conference},
+ MONTH = {June},
+ PAGES = {199--210},
+ YEAR = 1990
+}
+
+@INPROCEEDINGS{Taba+98,
+ AUTHOR = {Ali-Reza Adl-Tabatabai and Michal Cierniak and
+ Guei-Yuan Lueh and Vishesh M. Parikh and James
+ M. Stichnoth},
+ TITLE = {Fast, Effective Code Generation in a Just-In-Time
+ {Java} Compiler},
+ BOOKTITLE = {Conference on Programming Language Design and
+ Implementation},
+ ORGANIZATION = {ACM},
+ SERIES = {SIGPLAN},
+ VOLUME = {33(5)},
+ PAGES = {280--290},
+ ADDRESS = {Montreal},
+ YEAR = 1998
+}
+
+@INPROCEEDINGS{Bacon+98,
+ AUTHOR = {David F. Bacon and Ravi Konuru and Chet Murthy and
+ Mauricio Serrano},
+ TITLE = {Thin Locks: Featherweight Synchronization for {Java}},
+ BOOKTITLE = {Conference on Programming Language Design and
+ Implementation},
+ ORGANIZATION = {ACM},
+ SERIES = {SIGPLAN},
+ VOLUME = {33(5)},
+ PAGES = {258--268},
+ ADDRESS = {Montreal},
+ YEAR = 1998
+}
+
+@INPROCEEDINGS{Cameron+92,
+ AUTHOR = {Don Cameron and Paul Faust and Dmitry Lenkov and
+ Michey Mehta},
+ TITLE = {A Portable Implementation of {C++} Exception
+ Handling},
+ BOOKTITLE = {C++ Technical Conference},
+ ORGANIZATION = {USENIX},
+ PAGES = {225--243},
+ MONTH = {August},
+ YEAR = 1992
+}
+
+@ARTICLE{Chase94,
+ AUTHOR = {David Chase},
+ TITLE = {Implementation of Exception Handling},
+ JOURNAL = {Journal of C Language Translation},
+ VOLUME = 5,
+ NUMBER = 4,
+ PAGES = {??--??},
+ MONTH = {June},
+ YEAR = 1994
+}
+
+@INPROCEEDINGS{Giering+94,
+ AUTHOR = {E. W. Giering and Frank Mueller and T. P. Baker},
+ TITLE = {Features of the {Gnu} {Ada} Runtime Library},
+ BOOKTITLE = {TRI-Ada '94},
+ ORGANIZATION = {ACM},
+ PAGES = {93--103},
+ YEAR = 1994
+}
+
+@INPROCEEDINGS{Gunter+95,
+ AUTHOR = {Carl A. Gunter and Didier Remy and Jon G. Riecke},
+ TITLE = {A Generalization of Exceptions and Control in
+ {ML}-like languages},
+ BOOKTITLE = {Seventh International Conference on Functional
+ Programming Languages and Computer Architecture},
+ ORGANIZATION = {ACM},
+ PAGES = {12--23},
+ YEAR = 1995
+}
+
+@ARTICLE{KoenigStroustrup90,
+ AUTHOR = {Andrew Koenig and Bjarne Stroustrup},
+ TITLE = {Exception Handling for {C++}},
+ JOURNAL = {Journal of Object Oriented Programming},
+ VOLUME = 3,
+ NUMBER = 2,
+ PAGES = {16--33},
+ MONTH = {July/August},
+ YEAR = 1990
+}
+
+@BOOK{dragon86,
+ AUTHOR = {Alfred V. Aho and Ravi Sethi and Jeffrey D. Ullman},
+ TITLE = {Compilers: Principles, Techniques, and Tools},
+ PUBLISHER = {Addison-Wesley},
+ YEAR = 1986
+}
+
+
+@BOOK{appel98,
+ AUTHOR = {Andrew W. Appel},
+ TITLE = {Modern Compiler Implementation in Java},
+ PUBLISHER = {Cambridge University Press},
+ YEAR = 1998
+}
+
+
+@BOOK{Stallings95,
+ AUTHOR = {William Stallings},
+ TITLE = {Operating Systems},
+ PUBLISHER = {Prentice Hall},
+ YEAR = 1995
+}
+
+@INPROCEEDINGS{Bershad+92,
+ AUTHOR = {Brian N. Bershad and David D. Redell and John
+ R. Ellis},
+ TITLE = {Fast mutual exclusion for uniprocessors},
+ BOOKTITLE = {Annual Symposium on Architectural Support for
+ Programming Languages and Operating Systems},
+ ORGANIZATION = {ACM},
+ PAGES = {223--233},
+ MONTH = {October},
+ YEAR = 1992
+}
+
+@INPROCEEDINGS{Mueller93,
+ AUTHOR = {Frank Mueller},
+ TITLE = {A Library Implementation of {POSIX} Threads under
+ {UNIX}},
+ BOOKTITLE = {Winter USENIX},
+ PAGES = {29--41},
+ ADDRESS = {San Diego},
+ MONTH = {January},
+ YEAR = 1993
+}
+
+@TECHREPORT{Keppel93,
+ AUTHOR = {David Keppel},
+ TITLE = {Tools and Techniques for Building Fast Portable
+ Threads Packages},
+ NUMBER = {UWCSE 93-05-06},
+ INSTITUTION = {University of Washington},
+ YEAR = 1993
+}
+
+@MISC{BissAWT97,
+ AUTHOR = {Peter Mehlitz},
+ TITLE = {Biss {AWT}},
+ HOWPUBLISHED = {{\tt http://www.biss-net.com/biss-awt.html}},
+ YEAR = 1997
+}
+
+@MISC{Jigsaw97,
+ KEY = {Jigsaw97},
+ TITLE = {Jigsaw},
+ HOWPUBLISHED = {{\tt http://www.w3.org/Jigsaw/}},
+ YEAR = 1997
+}
+
+@MISC{PosixThreads96,
+ KEY = {POSIX},
+ TITLE = {Standard for Threads Interface to {POSIX}},
+ HOWPUBLISHED = {IEEE, P1003.1c},
+ YEAR = 1996
+}
+
+@MISC{SunThreads97,
+ KEY = {JavaThreads},
+ TITLE = {Java Threads Whitepaper},
+ HOWPUBLISHED = {{\tt http://java.sun.com/}},
+ INSTITUTION = {SUN},
+ YEAR = 1997
+}
+
+@INPROCEEDINGS{AltmanEbcioglu97,
+ AUTHOR = {Erik Altman and Kemal Ebcio\u{g}lu},
+ TITLE = {{DAISY}: Dynamic Compilation for 100\% Architectural
+ Compatibility},
+ BOOKTITLE = {ISCA'97 - The 24th Annual International Symposium on
+ Computer Architecture},
+ ORGANIZATION = {ACM, IEEE},
+ YEAR = 1997
+}
+
+@BOOK{javaref96,
+ AUTHOR = {Ken Arnold and James Gosling},
+ TITLE = {The {Java} Programming Language},
+ PUBLISHER = {Addison-Wesley},
+ YEAR = 1996
+}
+
+@INPROCEEDINGS{bacon&sweeny96-,
+ author = {David F. Bacon and Peter F. Sweeney},
+ title = {Fast Static Analysis of {C++} Virtual Function
+ Calls},
+ booktitle = {Conference on Object-Oriented Programming Systems,
+ Languages \& Applications (OOPSLA'96)},
+ ORGANIZATION = {ACM},
+ pages = {324--341},
+ year = 1996
+}
+
+@ARTICLE{picoJava97,
+ AUTHOR = {J. Michael O'Connor and Marc Tremblay},
+ TITLE = {{picoJava-I}: The {Java} Virtual Machine in
+ Hardware},
+ JOURNAL = {IEEE Micro},
+ VOLUME = 17,
+ NUMBER = 2,
+ PAGES = {45--53},
+ MONTH = {April},
+ YEAR = 1997
+}
+
+@ARTICLE{SunJIT97,
+ AUTHOR = {Timothy Cramer and Richard Friedman and Terrence
+ Miller and David Seberger and Robert Wilson and
+ Mario Wolczko},
+ TITLE = {Compiling {Java} Just in Time},
+ JOURNAL = {IEEE Micro},
+ VOLUME = 17,
+ NUMBER = 3,
+ PAGES = {36--43},
+ MONTH = {June},
+ YEAR = 1997
+}
+
+@ARTICLE{ssa91,
+ AUTHOR = {Ron Cytron and Jeanne Ferrante and Barry K. Rosen
+ and Mark N. Wegman and F. Kenneth Zadeck},
+ TITLE = {Efficiently Computing Static Single Assignment Form
+ and the Control Dependence Graph},
+ JOURNAL = {ACM Transactions on Programming Languages and
+ Systems},
+ VOLUME = 13,
+ NUMBER = 4,
+ PAGES = {451--490},
+ MONTH = {October},
+ YEAR = 1991
+}
+
+@INPROCEEDINGS{diwan+96-,
+ AUTHOR = {Amer Diwan and J. Eliot B. Moss and Kathryn
+ S. McKinley},
+ TITLE = {Simple and Effective Analysis of Statically-Typed
+ Object-Oriented Programs},
+ BOOKTITLE = {Conference on Object-Oriented Programming Systems,
+ Languages \& Applications (OOPSLA '96)},
+ ORGANIZATION = {ACM},
+ PAGES = {292--305},
+ YEAR = 1996
+}
+
+@INPROCEEDINGS{EbciogluAltman97,
+ AUTHOR = {Kemal Ebcio\u{g}lu and Erik Altman and Erdem
+ Hokenek},
+ TITLE = {A {Java} {ILP} Machine based on fast Dynamic
+ Compilation},
+ BOOKTITLE = {MASCOTS'97 - International Workshop on Security and
+ Efficiency Aspects of {Java}},
+ YEAR = 1997
+}
+
+@INPROCEEDINGS{Ertl92-,
+ AUTHOR = {M. Anton Ertl},
+ TITLE = {A New Approach to {Forth} native Code Generation},
+ BOOKTITLE = {EuroForth '92},
+ ADDRESS = {Southampton},
+ YEAR = 1992
+}
+
+@PHDTHESIS{Ertl96phd,
+ AUTHOR = {M. Anton Ertl},
+ TITLE = {Implementation of Stack-Based Languages on Register
+ Machines},
+ SCHOOL = {Technische Universit\"at Wien},
+ MONTH = {April},
+ YEAR = 1996
+}
+
+@INPROCEEDINGS{ErtlMaierhofer95,
+ AUTHOR = {M. Anton Ertl and Martin Maierhofer},
+ TITLE = {Translating {Forth} to native {C}},
+ BOOKTITLE = {EuroForth '95},
+ ADDRESS = {Dagstuhl},
+ YEAR = 1995
+}
+
+@INPROCEEDINGS{Ertl&Pirker97,
+ AUTHOR = {M. Anton Ertl and Christian Pirker},
+ TITLE = {The Structure of a {Forth} Native Code Compiler},
+ BOOKTITLE = {EuroForth '97 Conference Proceedings},
+ ADDRESS = {Oxford},
+ PAGES = {107--116},
+ YEAR = 1997
+}
+
+@PHDTHESIS{Franz94,
+ AUTHOR = {Michael Franz},
+ TITLE = {Code Generation On the Fly: A Key for Portable
+ Software},
+ SCHOOL = {ETH Z\"urich},
+ YEAR = 1994
+}
+
+@BOOK{smalltalk80,
+ AUTHOR = {Adele Goldberg and David Robson},
+ TITLE = {SMALLTALK-80, The Language and its Implementation},
+ PUBLISHER = {Addison-Wesley},
+ YEAR = 1983
+}
+
+@INPROCEEDINGS{Gough97,
+ AUTHOR = {K. John Gough},
+ TITLE = {Multi-language, Multi-target Compiler Development:
+ Evolution of the {Gardens Point} Compiler Project},
+ BOOKTITLE = {JMLC'97 -- Joint Modular Languages Conference},
+ EDITOR = {Hanspeter M\"ossenb\"ock},
+ PUBLISHER = {LNCS 1204},
+ ADDRESS = {Linz},
+ YEAR = 1997
+}
+
+@MASTERSTHESIS{Grafl97,
+ AUTHOR = {Reinhard Grafl},
+ TITLE = {{CACAO}: {Ein 64Bit JavaVM Just-in-Time Compiler}},
+ SCHOOL = {Technische Universit\"at Wien},
+ MONTH = {January},
+ YEAR = 1997
+}
+
+@BOOK{hennessy&patterson90,
+ AUTHOR = {John L. Hennessy and David A. Patterson},
+ TITLE = {Computer Architecture. A Quantitative Approach},
+ PUBLISHER = {Morgan Kaufman Publishers},
+ YEAR = 1990
+}
+
+@ARTICLE{Hsieh+97,
+ AUTHOR = {Cheng-Hsueh A. Hsieh and Marie T. Conte and Teresa
+ L. Johnson and John C. Gyllenhaal and {Wen-mei}
+ W. Hwu},
+ TITLE = {Optimizing NET Compilers for Improved {Java}
+ Performance},
+ JOURNAL = {IEEE Computer},
+ VOLUME = 30,
+ NUMBER = 6,
+ PAGES = {67--75},
+ MONTH = {June},
+ YEAR = 1997
+}
+
+@INPROCEEDINGS{HsiehHwu97,
+ AUTHOR = {Cheng-Hsueh A. Hsieh and John C. Gyllenhaal and
+ {Wen-mei} W. Hwu},
+ TITLE = {Java Bytecode to Native Code Translation: The
+ {Caffeine} Prototype and Preliminary Results},
+ BOOKTITLE = {29th Annual IEEE/ACM International Symposium on
+ Microarchitecture (MICRO'29)},
+ ADDRESS = {Paris},
+ YEAR = 1996
+}
+
+@ARTICLE{Holmer96,
+ AUTHOR = {Bruce K. Holmer and Barton Sano and Michael Carlton
+ and Peter van Roy and Alvin M. Despain},
+ TITLE = {Design and Analysis of Hardware for High-Performance
+ {Prolog}},
+ JOURNAL = {Journal of Logic Programming},
+ PUBLISHER = {Elsevier},
+ VOLUME = {29(1-3)},
+ PAGES = {107--139},
+ YEAR = 1996
+}
+
+@BOOK{garbagebook,
+ AUTHOR = {Richard Jones and Rafael Lins},
+ TITLE = {Garbage Collection},
+ PUBLISHER = {John Wiley \& Sons},
+ YEAR = 1996
+}
+
+@INPROCEEDINGS{Kistler97,
+ AUTHOR = {Thomas Kistler},
+ TITLE = {Dynamic Runtime Optimization},
+ BOOKTITLE = {JMLC'97 - Joint Modular Languages Conference},
+ EDITOR = {Hanspeter M\"ossenb\"ock},
+ PUBLISHER = {LNCS 1204},
+ ADDRESS = {Linz},
+ YEAR = 1997
+}
+
+@INPROCEEDINGS{KrBe95a,
+ AUTHOR = {Andreas Krall and Thomas Berger},
+ TITLE = {Incremental Global Compilation of {Prolog} with the
+ {Vienna Abstract Machine}},
+ BOOKTITLE = {Twelfth International Conference on Logic
+ Programming},
+ EDITOR = {Leon Sterling},
+ PUBLISHER = {MIT Press},
+ PAGES = {333--347},
+ ADDRESS = {Tokyo},
+ YEAR = 1995
+}
+
+@INPROCEEDINGS{KrGr97a,
+ AUTHOR = {Andreas Krall and Reinhard Grafl},
+ TITLE = {{CACAO} -- A 64 bit {JavaVM} Just-in-Time Compiler},
+ BOOKTITLE = {Workshop on Java for Science and Engineering
+ Computation},
+ EDITOR = {Geoffry C. Fox and Wei Li},
+ ORGANIZATION = {ACM},
+ MONTH = {June},
+ PAGES = {},
+ ADDRESS = {Las Vegas},
+ YEAR = 1997
+}
+
+@ARTICLE{KrGr97b,
+ AUTHOR = {Andreas Krall and Reinhard Grafl},
+ TITLE = {{CACAO} -- A 64 bit {JavaVM} Just-in-Time Compiler},
+ JOURNAL = {Concurrency: Practice and Experience},
+ PUBLISHER = {John Wiley \& Sons},
+ VOLUME = {9},
+ NUMBER = {11},
+ PAGES = {1017--1030},
+ YEAR = 1997
+}
+
+@INPROCEEDINGS{KrErGsch98,
+ AUTHOR = {Andreas Krall and Anton Ertl and Michael Gschwind},
+ TITLE = {{JavaVM} Implementations: Compilers versus Hardware},
+ BOOKTITLE = {Australian Computer Architecture Conference
+ ({ACAC'98})},
+ EDITOR = {John Morris},
+ VOLUME = {20(4)},
+ SERIES = {Australian Computer Science Communications},
+ PUBLISHER = {Springer},
+ PAGES = {101--110},
+ ADDRESS = {Perth},
+ YEAR = 1998
+}
+
+@INPROCEEDINGS{KrPr98a,
+ AUTHOR = {Andreas Krall and Mark Probst},
+ TITLE = {Monitors and Exceptions: How to implement {Java}
+ efficiently},
+ BOOKTITLE = {ACM 1998 Workshop on Java for High-Performance
+ Computing},
+ EDITOR = {Siamak Hassanzadeh and Klaus Schauser},
+ ORGANIZATION = {ACM},
+ MONTH = {March},
+ PAGES = {15--24},
+ ADDRESS = {Palo Alto},
+ YEAR = 1998
+}
+
+@ARTICLE{KrPr98b,
+ AUTHOR = {Andreas Krall and Mark Probst},
+ TITLE = {Monitors and Exceptions: How to implement {Java}
+ efficiently},
+ JOURNAL = {Concurrency: Practice and Experience},
+ PUBLISHER = {John Wiley \& Sons},
+ VOLUME = {10},
+ NUMBER = {11--13},
+ PAGES = {837--850},
+ YEAR = 1998
+}
+
+@INPROCEEDINGS{Kr98b,
+ AUTHOR = {Andreas Krall},
+ TITLE = {Efficient {JavaVM} Just-in-Time Compilation},
+ BOOKTITLE = {International Conference on Parallel Architectures
+ and Compilation Techniques},
+ EDITOR = {Jean-Luc Gaudiot},
+ ORGANIZATION = {IFIP,ACM,IEEE},
+ PUBLISHER = {North-Holland},
+ PAGES = {205--212},
+ ADDRESS = {Paris},
+ MONTH = {October},
+ YEAR = 1998
+}
+
+@INPROCEEDINGS{GrErKr01b,
+ AUTHOR = {David Gregg and M.~Anton Ertl and Andreas Krall},
+ TITLE = {Implementing an Efficient {Java} Interpreter},
+ BOOKTITLE = {HPCN'01, Java in High Performance Computing},
+ EDITOR = {Vladimir Getov and George K. Thiruvathukal},
+ SERIES = {LNCS 2110},
+ PUBLISHER = {Springer},
+ PAGES = {613--620},
+ ADDRESS = {Amsterdam},
+ MONTH = {June},
+ YEAR = 2001}
+
+@BOOK{javavm96,
+ AUTHOR = {Tim Lindholm and Frank Yellin},
+ TITLE = {The {Java} Virtual Machine Specification},
+ PUBLISHER = {Addison-Wesley},
+ YEAR = 1996
+}
+
+@INPROCEEDINGS{Muller+97,
+ AUTHOR = {Gilles Muller and Barbara Moura and Fabrice Bellard
+ and Charles Consel},
+ TITLE = {Harrisa: a Flexible and Efficient {Java} Environment
+ Mixing Bytecode and Compiled Code},
+ BOOKTITLE = {COOTS'97 - third Conference on Object-Oriented
+ Technologies and Systems},
+ YEAR = 1997
+}
+
+@PHDTHESIS{Lilith?,
+ AUTHOR = {Richard Stanley Ohran},
+ TITLE = {Lilith: a workstation computer for Modula-2},
+ SCHOOL = {ETH Z\"urich},
+ YEAR = 1984
+}
+
+@INPROCEEDINGS{PaJoSm97,
+ AUTHOR = {Subbarao Palacharla and Norman P. Jouppi and
+ J.E. Smith},
+ TITLE = {Complexity-Effective Superscalar Processors},
+ BOOKTITLE = {ISCA'97 - The 24th Annual International Symposium on
+ Computer Architecture},
+ ORGANIZATION = {ACM, IEEE},
+ YEAR = 1997
+}
+
+@BOOK{Pcode82,
+ AUTHOR = {Steven Pemberton and Martin C. Daniels},
+ TITLE = {Pascal Implementation, The P4 Compiler},
+ PUBLISHER = {Ellis Horwood},
+ YEAR = 1982
+}
+
+@BOOK{AlphaRef95,
+ AUTHOR = {R. L. Sites and R. T. Witek},
+ TITLE = {Alpha AXP Architecture Reference Manual},
+ PUBLISHER = {Digital Press},
+ YEAR = 1995
+}
+
+@INPROCEEDINGS{UNCOL61,
+ AUTHOR = {T. B. Steel},
+ TITLE = {A first version of {UNCOL}},
+ BOOKTITLE = {Proceedings of the Western Joint IRE-AIEE-ACM
+ Computer Conference},
+ YEAR = 1961,
+ PAGES = {371 -- 377}
+}
+
+@ARTICLE{AmsterdamKit83,
+ AUTHOR = {Andrew S. Tanenbaum and Hans van {Sta\-ve\-ren} and
+ E. G. Keizer and Johan W. Stevenson},
+ TITLE = {A Practical Tool Kit for making Portable Compilers},
+ JOURNAL = {Communications of the ACM},
+ VOLUME = 26,
+ NUMBER = 9,
+ PAGES = {654--660},
+ MONTH = {September},
+ YEAR = 1983
+}
+
+@ARTICLE{AmsterdamKit89,
+ AUTHOR = "A. S. Tanenbaum and M. F. Kaashoek and
+ K. G. Langendoen and C. J. H. Jacobs",
+ TITLE = "The design of very fast portable compilers",
+ JOURNAL = "ACM SIG{\-}PLAN Notices",
+ VOLUME = 24,
+ NUMBER = 11,
+ PAGES = "125--131",
+ MONTH = nov,
+ YEAR = 1989
+}
+
+@INPROCEEDINGS{Self,
+ AUTHOR = {David Ungar and Randall B. Smith},
+ TITLE = {{SELF}: The Power of Simplicity},
+ BOOKTITLE = {OOPSLA '87 Proceedings},
+ YEAR = {1987},
+ PAGES = {227--242}
+}
+
+@INPROCEEDINGS{VitekHorspool96,
+ AUTHOR = {Jan Vitek and R. Nigel Horspool},
+ TITLE = {Compact Dispatch Tables for Dynamically Typed Object
+ Oriented Languages},
+ BOOKTITLE = {6th International Conference CC '96},
+ YEAR = 1996,
+ PAGES = {307--325}
+}
+
+@INPROCEEDINGS{KrViHo97,
+ AUTHOR = {Andreas Krall and Jan Vitek and Nigel Horspool},
+ TITLE = {Near Optimal Hierarchical Encoding of Types},
+ BOOKTITLE = {11th European Conference on Object Oriented Programming ({ECOOP'97})},
+ EDITOR = {Mehmet Aksit and Satoshi Matsuoka},
+ SERIES = {LNCS 1241},
+ PUBLISHER = {Springer},
+ PAGES = {128--145},
+ ADDRESS = {Finland},
+ YEAR = 1997}
+
+@INPROCEEDINGS{ViHoKr97,
+ AUTHOR = {Jan Vitek and Nigel Horspool and Andreas Krall},
+ TITLE = {Efficient Type Inclusion Tests},
+ BOOKTITLE = {Conference on Object Oriented Programming Systems,
+ Languages \& Applications ({OOPSLA '97})},
+ EDITOR = {Toby Bloom},
+ ORGANIZATION = {ACM},
+ MONTH = {October},
+ PAGES = {142--157},
+ ADDRESS = {Atlanta},
+ YEAR = 1997
+}
+
+@INPROCEEDINGS{GehmRandScal00,
+ AUTHOR = {Sanjay Ghemawat and Keith H. Randall and Daniel
+ J. Scales},
+ TITLE = {Field Analysis: Getting Useful and Low-cost
+ Interprocedural Information},
+ BOOKTITLE = {Conference on Programming Language Design and
+ Implementation},
+ ORGANIZATION = {ACM},
+ SERIES = {SIGPLAN},
+ VOLUME = {35(5)},
+ PAGES = {334--344},
+ ADDRESS = {Vancouver},
+ YEAR = 2000
+}
+
+@INPROCEEDINGS{BodikGuptaSarkar00,
+ AUTHOR = {Rastislav Bodik and Rajiv Gupta and Vivek Sarkar},
+ TITLE = {{ABCD}: Eliminating Array Bound Checks on Demand},
+ BOOKTITLE = {Conference on Programming Language Design and
+ Implementation},
+ ORGANIZATION = {ACM},
+ SERIES = {SIGPLAN},
+ VOLUME = {35(5)},
+ PAGES = {321--333},
+ ADDRESS = {Vancouver},
+ YEAR = 2000
+}
+
+@INPROCEEDINGS{Ruf00,
+ AUTHOR = {Erik Ruf},
+ TITLE = {Effective Synchronization Removal for {Java}},
+ BOOKTITLE = {Conference on Programming Language Design and
+ Implementation},
+ ORGANIZATION = {ACM},
+ SERIES = {SIGPLAN},
+ VOLUME = {35(5)},
+ PAGES = {208--218},
+ ADDRESS = {Vancouver},
+ YEAR = 2000
+}
+
+@ARTICLE{Marmot00,
+ AUTHOR = "Robert Fitzgerald and Todd B. Knoblock and Erik Ruf and
+ Bjarne Steensgaard and David Tarditi",
+ TITLE = "Marmot: an optimizing compiler for {Java}",
+ JOURNAL = "Software -- Practice and Experience",
+ VOLUME = 30,
+ NUMBER = 3,
+ PAGES = "199--232",
+ MONTH = mar,
+ YEAR = 2000
+}
+
+@Misc{Wilkinson:97,
+ OPTkey = {},
+ author = {Tim Wilkinson},
+ title = {{KAFFE}: A free virtual machine to run {Java} code},
+ howpublished = {{\tt http://www.kaffe.org}},
+ year = {1997},
+ OPTmonth = {},
+ OPTnote = {},
+ OPTannote = {}
+}
+
+@TechReport{Proebsting+97,
+ author = {Todd A. Proebsting and Gregg Townsend and Patrick
+ Bridges and John H. Hartman and Tim Newsham and
+ Scott A. Watterson},
+ title = {Toba: {Java} for Applications},
+ institution = {University of Arizona},
+ year = {1997},
+ OPTkey = {},
+ OPTtype = {},
+ OPTnumber = {},
+ address = {Tucson, AZ},
+ OPTmonth = {},
+ OPTnote = {},
+ OPTannote = {}
+}
+
+@BOOK{GCBook96,
+ AUTHOR = {Richard Jones and Rafael Lins},
+ TITLE = {Garbage Collection},
+ PUBLISHER = {John Wiley},
+ YEAR = 1996
+}
+
+@INCOLLECTION{garbagesurvey,
+ AUTHOR = {Paul R. Wilson},
+ TITLE = {Uniprocessor Garbage Collection Techniques},
+ BOOKTITLE = {ACM Computing Surveys},
+ PUBLISHER = {ACM},
+ PAGES = {to appear},
+ YEAR = {1994}
+}
+
+@INPROCEEDINGS{Age+98,
+ AUTHOR = {Ole Agesen and David Detlefs and J. Eliot B. Moss},
+ TITLE = {Garbage Collection and Local Variable Type-Precision
+ and Liveness in {Java} Virtual Machines},
+ BOOKTITLE = {Conference on Programming Language Design and
+ Implementation},
+ ORGANIZATION = {ACM},
+ SERIES = {SIGPLAN},
+ VOLUME = {33(6)},
+ PAGES = {269--279},
+ ADDRESS = {Montreal},
+ YEAR = 1998
+}
+
+@INPROCEEDINGS{Johnstone98,
+ AUTHOR = {Mark S. Johnstone and Paul R. Wilson},
+ TITLE = {The Memory Fragmentation Problem: Solved?},
+ BOOKTITLE = {1998 International Symposium on Memory Management},
+ ORGANIZATION = {ACM},
+ PAGES = {},
+ ADDRESS = {Vancouver},
+ YEAR = 1998
+}
+
+@INPROCEEDINGS{Hicks+98,
+ AUTHOR = {Michael Hicks and Luke Hornof and Jonathan T. Moore
+ and Scott Nettles},
+ TITLE = {A Study of Large Object Spaces},
+ BOOKTITLE = {1998 International Symposium on Memory Management},
+ ORGANIZATION = {ACM},
+ PAGES = {138--145},
+ ADDRESS = {Vancouver},
+ YEAR = 1998
+}
+
+@INPROCEEDINGS{Colnet+98,
+ AUTHOR = {Dominique Colnet and Philippe Coucaud and Olivier
+ Zendra},
+ TITLE = {Compiler Support to Customize the Mark and Sweep
+ Algorithm},
+ BOOKTITLE = {1998 International Symposium on Memory Management},
+ ORGANIZATION = {ACM},
+ PAGES = {154--165},
+ ADDRESS = {Vancouver},
+ YEAR = 1998
+}
+
+@ARTICLE{Boehm88,
+ AUTHOR = {Hans-Juergen Boehm and Mark Weiser},
+ TITLE = {Garbage Collection in an Uncooperative Environment},
+ JOURNAL = {Software Practice and Experience},
+ PUBLISHER = {John Wiley \& Sons},
+ VOLUME = {18},
+ NUMBER = {9},
+ PAGES = {807--820},
+ YEAR = 1988
+}
+
+@InProceedings{HudsonMoss:1992,
+ author = {Richard L. Hudson and J. Eliot B. Moss},
+ title = {Incremental Collection of Mature Objects},
+ booktitle = {Proceedings of the International Workshop on Memory
+ Management},
+ pages = {388--403},
+ month = {September},
+ year = {1992}
+}
+
+@TECHREPORT{Petit-Bianco99,
+ AUTHOR = {Alexandre Petit-Bianco},
+ TITLE = {No Silver Bullet - Garbage Collection for {Java} in
+ Embedded Systems},
+ NUMBER = {http://sourceware.cygnus.com/java/},
+ INSTITUTION = {Cygnus Solutions},
+ MONTH = {August},
+ YEAR = 1998
+}
+
+@TECHREPORT{Ritzau99,
+ AUTHOR = {Tobias Ritzau},
+ TITLE = {No Silver Bullet - Garbage Collection for {Java} in
+ Embedded Systems},
+ NUMBER = {thesis proposal, http://www.ida.liu.se/\~{}tobri/},
+ INSTITUTION = {Link\"oping University},
+ MONTH = {August},
+ YEAR = 1998
+}
+
+@inproceedings{lim98,
+ author = {Tian F. Lim and Przemyslaw Pardyak and Brian
+ N. Bershad},
+ title = {A Memory-Efficient Real-Time Non-Copying Garbage
+ Collector},
+ pages = {118--129},
+ crossref = {ISMM98},
+ URL = "http://www.cs.washington.edu/homes/tian/ISMM98/",
+ abstract = {Garbage collectors used in operating systems such as
+ SPIN and embedded systems such as Java and Inferno
+ must operate with limited resources and minimize
+ their impact on application
+ performance. Consequently, they must maintain short
+ real-time pauses, low overhead, and a small memory
+ footprint. Most garbage collectors are not adequate
+ because they are either not real-time or they
+ require larger heaps because they trade space for
+ time. For example, Treadmill uses segregated free
+ lists to allocate and reclaim memory in constant
+ time but at the cost of under-utilizing memory. We
+ have implemented a new version of the Treadmill
+ collector and used it in the SPIN extensible
+ operating system kernel. We chose Treadmill for its
+ excellent real-time latencies and low overhead. We
+ improve its poor memory utilization by using real-
+ time page-level management techniques that reduce
+ the fragmentation caused by segregated free
+ lists. We use page-wise collection to locate pages
+ of free memory, which are dynamically reassigned
+ between free lists as needed. We use virtual memory
+ to dynamically remap free pages into continuous
+ ranges. Our experiments demonstrate that we have
+ substantially improved memory utilization without
+ compromising latency or overhead, and that the new
+ collector performs very well for SPIN's workloads.}
+}
+
+@Article{IBM:Moreira:2000,
+ author = {J. E. Moreira and S. P. Midkiff and M. Gupta and P. V. Artigas and M. Snir and R. D. Lawrence},
+ title = {Java programming for high-performance numerical computing},
+ journal = {IBM Systems Journal},
+ year = {2000},
+ OPTkey = {},
+ volume = {39},
+ number = {1},
+ pages = {21--56},
+ OPTmonth = {},
+ OPTnote = {},
+ OPTannote = {}
+}
+
+@Article{IBM:Suganuma:2000,
+ author = {T. Suganuma and T. Ogasawara and M. Takeuchi and T. Yasue and
+ M. Kawahito and K. Ishizaki and H. Komatsu and T. Nakatani},
+ title = {Overview of the {IBM} {Java} Just-in-Time Compiler},
+ journal = {IBM Systems Journal},
+ year = {2000},
+ OPTkey = {},
+ volume = {39},
+ number = {1},
+ pages = {175--193},
+ OPTmonth = {},
+ OPTnote = {},
+ OPTannote = {}
+}
+
+@TechReport{Compaq:Scales:2000,
+ author = {Daniel J. Scales and Keith H. Randall and Sanjay Ghemawat and Jeff Dean},
+ title = {The {Swift} Compiler: Design and Implementation},
+ institution = {Compaq Western Research Laboratory},
+ year = {2000},
+ OPTkey = {},
+ OPTtype = {},
+ number = {2000/2},
+ OPTaddress = {},
+ month = {April},
+ OPTnote = {},
+ OPTannote = {}
+}
+
+@Manual{NaturalBridge,
+ title = {{BulletTrainTM} optimizing compiler and runtime for {JVM} bytecode},
+ OPTkey = {},
+ OPTauthor = {},
+ organization = {NaturalBridge},
+ OPTaddress = {},
+ OPTedition = {},
+ OPTmonth = {},
+ OPTyear = {},
+ note = {{\tt http://www.naturalbridge.com}},
+ OPTannote = {}
+}
+
+
+@Manual{TowerJ,
+ title = {{TowerJ 3.0: A New Generation Native Java Compiler And Runtime Environment}},
+ OPTkey = {},
+ OPTauthor = {},
+ organization = {Tower Technologies},
+ OPTaddress = {},
+ OPTedition = {},
+ OPTmonth = {},
+ OPTyear = {},
+ note = {{\tt http://www.towerj.com}},
+ OPTannote = {}
+}
+
+@InProceedings{SGI:Smith:2000,
+ author = {Todd Smith and Suresh Srinivas and Philipp Tomsich and Jinpyo Park},
+ title = {Practical Experiences with {Java} Compilation},
+ booktitle = {Proceedings of the Intl. Conf. on High-Performance Computing},
+ OPTcrossref = {},
+ OPTkey = {},
+ OPTpages = {},
+ year = {2000},
+ OPTeditor = {},
+ volume = {1970},
+ OPTnumber = {},
+ series = {Lecture Notes in Computer Science},
+ OPTaddress = {},
+ month = {December},
+ OPTorganization = {},
+ publisher = {Springer},
+ OPTnote = {},
+ OPTannote = {}
+}
+
+@INPROCEEDINGS{DetlefsAgesen,
+ AUTHOR = "David Detlefs and Ole Agesen",
+ TITLE = "{The Case for Multiple Compilers}",
+ BOOKTITLE = "Proc. OOPSLA 1999 VM Workshop on Simplicity,
+ Performance and Portability in Virtual Machine
+ Design",
+ YEAR = 1999,
+}
+
+@InProceedings{Krall_Tomsich_1999a,
+ author = {Andreas Krall and Philipp Tomsich},
+ title = {Garbage Collection for Large Memory {Java} Applications},
+ booktitle = {Proceedings of the 7th European Conference on High-Performance Computing and Networking (HPCN Europe'99)},
+ pages = {895--907},
+ year = {1999},
+ OPTeditor = {},
+ volume = {1593},
+ series = {Lecture Notes in Computer Science},
+ month = {Apr},
+ publisher = {Springer Verlag},
+}
+
+@INPROCEEDINGS{Sable00,
+ AUTHOR = {Patrice Pominville and Feng Qian and Raja Vall{\'e}e-Rai
+ and Laurie Hendren and Clark Verbrugge},
+ TITLE = {A Framework for Optimizing {Java} Using Attributes},
+ BOOKTITLE = {CASCON},
+ ORGANIZATION = {IBM},
+ ADDRESS = {Mississauga},
+ YEAR = 2000
+}
+
+@InProceedings{Hoelzle+91,
+ author = "Urs H{\"o}lzle and Craig Chambers and David Ungar",
+ title = "Optimizing Dynamically-Typed Object-Oriented
+ Programming Languages with Polymorphic Inline Caches",
+ pages = "21--38",
+ booktitle = "Proceedings of the European Conference on Object-Oriented
+ Programming (ECOOP'91)",
+ volume = {512},
+ series = {Lecture Notes in Computer Science},
+ month = jul,
+ publisher = "Springer Verlag",
+ address = "Geneva",
+ year = "1991",
+ annote = "Abstract: Polymorphic inline caches (PICs) provide a new way to reduce the
+ overhead of polymorphic message sends by extending inline caches to include more
+ than one cached lookup result per call site. For a set of typical object-oriented
+ SELF programs, PICs achieve a median speedup of 11\%.
+ As an important side effect, PICs collect type information by recording all of the
+ receiver types actually used at a given call site. The compiler can exploit this
+ type information to generate better code when recompiling a method. An
+ experimental version of such a system achieves a median speedup of 27\% for our set
+ of SELF programs, reducing the number of non-inlined message sends by a factor of
+ two.
+ Implementations of dynamically-typed object-oriented languages have been limited
+ by the paucity of type information available to the compiler. The abundance of the
+ type information provided by PICs suggests a new compilation approach for these
+ languages, adaptive compilation. Such compilers may succeed in generating very
+ efficient code for the time-critical parts of a program without incurring
+ distracting compilation pauses."
+
+}
+
+@InProceedings{HoelzleUngar94,
+ author = "Urs H{\"o}lzle and David Ungar",
+ title = "Optimizing Dynamically-Dispatched Calls with Run-Time
+ Type Feedback",
+ pages = "326--336",
+ ISBN = "0-89791-662-X",
+ booktitle = "Proceedings of the Conference on Programming Language
+ Design and Implementation (PLDI'94)",
+ month = jun,
+ publisher = "ACM Press",
+ address = "New York, NY, USA",
+ year = "1994",
+}
+
+@InProceedings{DriesenHoelzle96,
+ author = "Karel Driesen and Urs H{\"o}lzle",
+ title = "The Direct Cost of Virtual Function Calls in {C++}",
+ booktitle = "Conference on Object-Oriented Programming Systems,
+ Languages \& Applications ({OOPSLA} '96)",
+ year = "1996",
+ pages = "306--323",
+ annote = "Studies the cost of virtual function calls on modern
+ processors, taking into account the effects of
+ out-of-order execution and caches. On their baseline
+ architecture, the standard implementation of virtual
+ function calls take 1\%--10\% of the instructions and
+ 2\%--29\% of the cycles. The thunk implementation is
+ slightly faster, for most benchmarks, and much faster
+ for a few. The relative cost of virtual function calls
+ will increase slightly in the future. The influences of
+ architectural variations like branch misprediction
+ penalties, branch prediction accuracy, issue widths,
+ and load latency. The cost per dispatch is 2--5 cycles
+ for most benchmarks (10 cycles for one benchmark) on
+ their baseline architecture (4-wide processor, 2 cycle
+ load latency, 4 cycle branch latency).",
+}
+
+@InProceedings{Oxhoj+92,
+ author = "Nicholas Oxh{\o}j and Jens Palsberg and Michael~I.
+ Schwartzbach",
+ editor = "O. Lehrmann~Madsen",
+ title = "{Making Type Inference Practical}",
+ booktitle = "Proceedings of the \mbox{ECOOP}~'92 European
+ Conference on Object-oriented Programming",
+ series = "LNCS 615",
+ pages = "329--349",
+ publisher = "Springer-Verlag",
+ address = "Utrecht, The Netherlands",
+ month = jul,
+ year = "1992",
+}
+
+@Article{IC::Palsberg1995,
+ refkey = "C1680; PN2520",
+ title = "Efficient Inference of Object Types",
+ author = "Jens Palsberg",
+ pages = "198--209",
+ journal = "Information and Computation",
+ month = dec,
+ year = "1995",
+ volume = "123",
+ number = "2",
+ abstract = "Abadi and Cardelli have recently investigated a
+ calculus of objects (Abadi and Cardelli, 1994). The
+ calculus supports a key feature of object-oriented
+ languages: an object can be emulated by another object
+ that has more refined methods. Abadi and Cardelli
+ presented four first-order type systems for the
+ calculus. The simplest one is based on finite types and
+ no subtyping, and the most powerful one has both
+ recursive types and subtyping. Open until now is the
+ question of type inference, and in the presence of
+ subtyping ``the absence of minimum typings poses
+ practical problems for type inference'' (Abadi and
+ Cardelli, 1994). \par In this paper we give an $O(n^3)$
+ algorithm for each of the four type inference problems
+ and we prove that all the problems are P-complete. We
+ also indicate how to modify the algorithms to handle
+ functions and records.",
+ }
+
+@InProceedings{TipPalsberg00,
+ author = "Frank Tip and Jens Palsberg",
+ title = "Scalable propagation-based call graph construction
+ algorithms.",
+ pages = "281--293",
+ booktitle = "Proceedings of the Conference on Object-Oriented
+ Programming, Systems, Languages and Application
+ ({OOPSLA}-00)",
+ month = oct,
+ series = "ACM Sigplan Notices",
+ volume = "35(10)",
+ publisher = "ACM Press",
+ address = "N. Y.",
+ year = "2000",
+}
+
+
+@InProceedings{Dean+95,
+ author = "Jeffrey Dean and David Grove and Craig Chambers",
+ title = "Optimization of Object-Oriented Programs Using Static
+ Class Hierarchy Analysis",
+ booktitle = "Proceedings of the 9th European Conference on Object-Oriented
+ Programming (ECOOP'95)",
+ volume = {952},
+ series = {Lecture Notes in Computer Science},
+ publisher = "Springer Verlag",
+ pages = "77--101",
+ year = "1995",
+}
+
+@InProceedings{BaconSweeney96,
+ author = "David F. Bacon and Peter F. Sweeney",
+ title = "Fast Static Analysis of {C}++ Virtual Function Calls",
+ booktitle = "Conference on Object-Oriented Programming Systems,
+ Languages \& Applications ({OOPSLA} '96)",
+ year = "1996",
+ pages = "324--341",
+ publisher = "ACM Press",
+ annote = "An empirical study of the effectiveness of
+ \emph{unique name} analysis, \emph{class hierarchy
+ analysis} and \emph{rapid type analysis}. Rapid type
+ analysis is an analysis that takes into account which
+ classes are actually instantiated. It is very fast and
+ detects a a significant percentage of monomorphic calls
+ for some benchmarks (up to 100\%).",
+}
+
+@phdthesis{Bacon97,
+ author = "David Francis Bacon",
+ title = "Fast and Effective Optimization of Statically Typed
+ Object-Oriented Languages",
+ school = "University of California, Berkeley",
+ year = "1997",
+ abstract = "In this dissertation, we show how a relatively simple
+ and extremely fast interprocedural optimization
+ algorithm can be used to optimize many of the expensive
+ features of statically typed, object-oriented languages
+ -- in particular, C++ and Java. We present a new
+ optimization algorithm, Rapid Type Analysis, and show
+ that it is fast both in theory and in practice, and
+ significantly out-performs other {"}fast{"} algorithms
+ for virtual function call resolution. We present
+ optimization algorithms for the resolution of virtual
+ function calls, conversion of virtual inheritance to
+ direct inheritance, elimination of dynamic casts and
+ dynamic type checks, and removal of object
+ synchronization. These algorithms are all presented
+ within a common framework that allows them to be driven
+ by the information collected by Rapid Type Analysis, or
+ by some other type analysis algorithm. Collectively,
+ the optimizations in this dissertation free the
+ programmer from having to sacrifice modularity and
+ extensibility for performance. Instead, the programmer
+ can freely make use of the most powerful features of
+ object-oriented programming, since the optimizer will
+ remove unnecessary extensibility from the program.",
+}
+
+@Article{Shivers88,
+ author = "Olin Shivers",
+ title = "Control-Flow Analysis in {Scheme}",
+ pages = "164--174",
+ journal = "ACM SIGPLAN Notices",
+ year = "1988",
+ month = jul,
+ volume = "23",
+ number = "7",
+ note = "Proceedings of the ACM SIGPLAN 1988 Conference on
+ Programming Language Design and Implementation.",
+}
+
+
+@PhdThesis{Shivers91,
+ author = "Olin Shivers",
+ title = "Control-Flow Analysis of Higher-Order Languages",
+ school = "Carnegie-Mellon University",
+ year = "1991",
+ month = may,
+}
+
+
+@InProceedings{Grove+97,
+ author = "David Grove and Greg DeFouw and
+ Jeffrey Dean and Craig Chambers",
+ title = "Call Graph Construction in Object-Oriented Languages",
+ pages = "108--124",
+ booktitle = "Proceedings of the {ACM} {SIGPLAN} Conference on
+ Object-Oriented Programming Systems, Languages and
+ Applications ({OOPSLA}-97)",
+ month = oct,
+ series = "ACM SIGPLAN Notices",
+ volume = "32, 10",
+ publisher = "ACM Press",
+ address = "New York",
+ year = "1997",
+}
+
+@InProceedings{DeFouw:1998:FIC,
+ author = "Greg DeFouw and David Grove and Craig Chambers",
+ title = "Fast interprocedural class analysis",
+ editor = "ACM",
+ booktitle = "Conference record of {POPL} '98: the 25th {ACM}
+ {SIGPLAN-SIGACT} Symposium on Principles of Programming
+ Languages: papers presented at the Symposium, San
+ Diego, California, 19--21 January 1998",
+ publisher = "ACM Press",
+ address = "New York",
+ year = "1998",
+ pages = "222--236",
+ year = "1998",
+ url = "http://www.acm.org:80/pubs/citations/proceedings/plan/268946/p222-defouw/",
+}
+
+@Article{Collin:1997:TIL,
+ author = "S. Collin and D. Colnet and O. Zendra",
+ title = "Type Inference for Late Binding: The {SmallEiffel}
+ Compiler",
+ journal = "Lecture Notes in Computer Science",
+ volume = "1204",
+ pages = "67--??",
+ year = "1997",
+ coden = "LNCSD9",
+ ISSN = "0302-9743",
+ bibdate = "Fri Aug 22 11:59:49 MDT 1997",
+ acknowledgement = ack-nhfb,
+}
+
+
+@InProceedings{Zendra+97,
+ author = "Olivier Zendra and Dominique Colnet and Suzanne
+ Collin",
+ title = "Efficient Dynamic Dispatch without Virtual Function
+ Tables: The Small {Eiffel} Compiler",
+ pages = "125--141",
+ ISSN = "0362-1340",
+ booktitle = "Proceedings of the {ACM} {SIGPLAN} Conference on
+ Object-Oriented Programming Systems, Languages and
+ Applications ({OOPSLA}-97)",
+ month = oct # "~5--9",
+ series = "ACM SIGPLAN Notices",
+ volume = "32, 10",
+ publisher = "ACM Press",
+ address = "New York",
+ year = "1997",
+}
+
+@InProceedings{Sundaresan+00,
+ author = "Vijay Sundaresan and Laurie J. Hendren and Chrislain
+ Razafimahefa and Raja Vall{\'e}e-Rai and Patrick Lam
+ and Etienne Gagnon and Charles Godin",
+ title = "Practical virtual method call resolution for Java.",
+ pages = "264--280",
+ booktitle = "Proceedings of the Conference on Object-Oriented
+ Programming, Systems, Languages and Application
+ ({OOPSLA}-00)",
+ month = oct # "~15--19",
+ series = "ACM Sigplan Notices",
+ volume = "35.10",
+ publisher = "ACM Press",
+ address = "N. Y.",
+ year = "2000",
+}
+
+@InProceedings{Myers95,
+ author = "Andrew C. Myers",
+ title = "Bidirectional Object Layout for Separate Compilation",
+ booktitle = "{OOPSLA}~'95 Conference Proceedings: Object-Oriented
+ Programming Systems, Languages, and Applications",
+ year = "1995",
+ publisher = "ACM Press",
+ pages = "124--139",
+ url = "http://www.acm.org/pubs/articles/proceedings/oops/217838/p124-myers/p124-myers.pdf",
+}
+
+@InProceedings{Stroustrup89,
+ author = "Bjarne Stroustrup",
+ title = "Multiple Inheritance for {C++}",
+ editor = "{USENIX} Association",
+ booktitle = "Computing Systems, Fall, 1989.",
+ publisher = "USENIX Association",
+ address = "Berkeley, CA, USA",
+ month = "Fall",
+ year = "1989",
+ volume = "2(4)",
+ pages = "367--395",
+}
+
+@InProceedings{Rose88,
+ author = "John R. Rose",
+ title = "Fast Dispatch Mechanisms for Stock Hardware",
+ editor = "Norman Meyrowitz",
+ booktitle = "{OOPSLA}'88: Object-Oriented Programming Systems,
+ Languages and Applications: Conference Proceedings",
+ year = "1988",
+ pages = "27--35",
+}
+
+@TechReport{Liskov+95,
+ author = "Barbara Liskov and Dorothy Curtis and Mark Day and
+ Sanjay Ghemawat and Robert Gruber and Paul Johnson and
+ Andrew C. Myers",
+ title = "Theta Reference Manual",
+ institution = "Massachusetts Institute of Technology, Laboratory for
+ Computer Science",
+ month = feb,
+ year = "1995",
+ number = "Programming Methodology Group Memo 88",
+ url = "http://clef.lcs.mit.edu/Thor-papers.html",
+}
+
+
+@InProceedings{Ishizaki+00,
+ author = "Kazuaki Ishizaki and Motohiro Kawahito and Toshiaki
+ Yasue and Hideaki Komatsu and Toshio Nakatani",
+ title = "A study of devirtualization techniques for a {JavaTM}
+ Just-In-Time compiler.",
+ pages = "294--310",
+ booktitle = "Proceedings of the Conference on Object-Oriented
+ Programming, Systems, Languages and Application
+ ({OOPSLA}-00)",
+ month = oct # "~15--19",
+ series = "ACM Sigplan Notices",
+ volume = "35.10",
+ publisher = "ACM Press",
+ address = "N. Y.",
+ year = "2000",
+}
+
+
+@INPROCEEDINGS{Alpern+01,
+ AUTHOR = {Bowen Alpern and Anthony Cocchi and David Grove and Derek Lieber},
+ TITLE = {Efficient Dispatch of {Java} Interface Methods},
+ BOOKTITLE = {HPCN'01, Java in High Performance Computing},
+ EDITOR = {Vladimir Getov and George K. Thiruvathukal},
+ SERIES = {LNCS 2110},
+ PUBLISHER = {Springer},
+ PAGES = {621--628},
+ ADDRESS = {Amsterdam},
+ MONTH = {June},
+ YEAR = 2001
+}
+
+
+@BOOK{AbramskyHankin87,
+ AUTHOR = {Samson Abramsky and Chris Hankin},
+ TITLE = {Abstract Interpretation of Declarative Languages},
+ PUBLISHER = {Ellis Horwood},
+ YEAR = 1987
+}
+
+@INPROCEEDINGS{Blanchet99,
+ AUTHOR = {Bruno Blanchet},
+ TITLE = {Escape Analysis for Object Oriented Languages:
+ Application to Java},
+ BOOKTITLE = {Proceedings of OOPSLA'99},
+ ORGANIZATION = {ACM},
+ ADDRESS = {Denver},
+ PAGES = {20--34},
+ MONTH = {November},
+ YEAR = 1999
+}
+
+@INPROCEEDINGS{Choi99,
+ AUTHOR = {J.-D. Choi and M. Gupta and M. Serrano and
+ V.C. Sreedhar and S. Midkiff},
+ TITLE = {Escape Analysis for Java},
+ BOOKTITLE = {Proceedings of OOPSLA'99},
+ ORGANIZATION = {ACM},
+ ADDRESS = {Denver},
+ PAGES = {1--19},
+ MONTH = {November},
+ YEAR = 1999
+}
+
+@ARTICLE{AitKaci89,
+ AUTHOR = {H. A\"{i}t-Kaci and R. Boyer and P. Lincoln and R.
+Nasr},
+ TITLE = {Efficient implementation of lattice operations},
+ JOURNAL = {Transactions on Programming Languages and Systems},
+ ORGANIZATION = {ACM},
+ VOLUME = 11,
+ NUMBER = 1,
+ PAGES = {115--146},
+ MONTH = {November},
+ YEAR = 1989
+}
+
+@INPROCEEDINGS{Caseau93,
+ AUTHOR = {Yves Caseau},
+ TITLE = {Efficient handling of multiple inheritance hierarchies},
+ BOOKTITLE = {Proceedings of OOPSLA'93},
+ ORGANIZATION = {ACM},
+ SERIES = {SIGPLAN},
+ VOLUME = {28(10)},
+ PAGES = {271--287},
+ YEAR = 1993
+}
+
+
+@ARTICLE{Cohen91,
+ AUTHOR = {N.J. Cohen},
+ TITLE = {Type-extension type tests can be performed in constant time},
+ JOURNAL = {Transactions on Programming Languages and Systems},
+ ORGANIZATION = {ACM},
+ VOLUME = 13,
+ NUMBER = 4,
+ PAGES = {626--629},
+ MONTH = {October},
+ YEAR = 1991
+}
+
+
+@ARTICLE{Dencker84,
+ AUTHOR = {P. Dencker and K. D\"{u}rre and J. Heuft},
+ TITLE = {Optimization of Parser Tables for Portable Compilers},
+ JOURNAL = {Transactions on Programming Languages and Systems},
+ ORGANIZATION = {ACM},
+ VOLUME = 6,
+ NUMBER = 4,
+ PAGES = {546--572},
+ MONTH = {April},
+ YEAR = 1984
+}
+
+
+@INPROCEEDINGS{KraViHo97,
+ AUTHOR = {A. Krall and J. Vitek and R.N. Horspool},
+ TITLE = {Near optimal hierarchical encoding of types},
+ BOOKTITLE = {European Conference on Object-Oriented Programming
+ ({ECOOP'97})},
+ PAGES = {128--145},
+ PUBLISHER = {LNCS 1241},
+ ADDRESS = {Jyv\"{a}skyl\"{a}},
+ YEAR = 1997
+}
+
+
+@INPROCEEDINGS{VitekHorspool94,
+ AUTHOR = {J. Vitek and R.N. Horspool},
+ TITLE = {Taming Message Passing: Efficient Method Look-Up for
+ Dynamically Typed Languages},
+ BOOKTITLE = {European Conference on Object-Oriented Programming
+ ({ECOOP'94})},
+ PAGES = {432--449},
+ PUBLISHER = {LNCS 821},
+ ADDRESS = {Bologna},
+ YEAR = 1994
+}
+
+@InProceedings{Agesen95,
+ title = "The {Cartesian} Product Algorithm: Simple and Precise
+ Type Inference of Parametric Polymorphism",
+ author = "Ole Agesen",
+ pages = "2--26",
+ editor = "Walter G. Olthoff",
+ booktitle = "{ECOOP}'95---Object-Oriented Programming, 9th European
+ Conference",
+ address = "{\AA}arhus, Denmark",
+ month = "7--11~" # aug,
+ year = "1995",
+ series = "Lecture Notes in Computer Science",
+ volume = "952",
+ publisher = "Springer",
+}
+
+@InProceedings{Jalapeno99,
+ author = "Bowen Alpern and Dick Attanasio and Anthony Cocchi and
+ Derek Lieber and Stephen Smith and Ton Ngo and John J.
+ Barton",
+ title = "Implementing Jalapeno in Java",
+ pages = "314--324",
+ editor = "Loren Meissner",
+ booktitle = "Proceeings of the 1999 {ACM} {SIGPLAN} Conference on
+ Object-Oriented Programming, Systems, Languages {\&}
+ Applications ({OOPSLA}`99)",
+ month = nov # "~1--5",
+ series = "ACM Sigplan Notices",
+ volume = "34.10",
+ publisher = "ACM Press",
+ address = "N. Y.",
+ year = "1999",
+}
+
+
+@Article{Jalapeno00,
+ author = "B. Alpern and C. R. Attanasio and J. J. Barton and M.
+ G. Burke and P. Cheng and J.-D. Choi and A. Cocchi and
+ S. J. Fink and D. Grove and M. Hind and S. F. Hummel
+ and D. Lieber and V. Litvinov and M. F. Mergen and T.
+ Ngo and J. R. Russell and V. Sarkar and M. J. Serrano
+ and J. C. Shepherd and S. E. Smith and V. C. Sreedhar
+ and H. Srinivasan and J. Whaley",
+ title = "The {Jalape{\~n}o} virtual machine",
+ journal = "IBM Systems Journal",
+ volume = "39",
+ number = "1",
+ pages = "211--238",
+ month = "????",
+ year = "2000",
+ coden = "IBMSA7",
+ ISSN = "0018-8670",
+ bibdate = "Mon Apr 24 15:43:02 MDT 2000",
+ url = "http://www.almaden.ibm.com/journal/sj/391/alpern.html",
+ acknowledgement = ack-nhfb,
+ keywords = "Java",
+}
+
+@InProceedings{HotSpot01,
+ author = "Michael Paleczny and Chnstopher Vick and Cliff Click",
+ title = "The {Java} {HotSpot} Server Compiler",
+ pages = "1--12",
+ booktitle = "Proceedings of the {$Java{\^{TM}}$} Virtual Machine
+ Research and Technology Symposium ({JVM}-01)",
+ month = apr # " ~23--24",
+ publisher = "USENIX Association",
+ address = "Berkley, USA",
+ year = "2001",
+}
+
--- /dev/null
+\chapter{The Just-In-Time Compiler}
+
+\section{Introduction}
+
+\section{The Java Virtual Machine}
+
+\label{chapjvm}
+
+The JavaVM is a typed stack architecture \cite{javavm96}. There are
+different instructions for integer, long integer, floating point and
+address types. Byte and character types have only special memory access
+instructions and are treated as integers for arithmetic operations. The
+main instruction set consists of arithmetic/logical and load/store/constant
+instructions. There are special instructions for array access and for
+accessing the fields of objects (memory access), for method invocation,
+and for type checking. The JavaVM has to check the program for type
+correctness and execute only correct programs. The following examples show
+some important JavaVM instructions and how a Java program is represented by
+these instructions.
+
+\begin{tabular}{ll}
+{\tt iload n}& load contents of local variable n \\
+{\tt istore n}& store stack top in local variable n \\
+{\tt imul }& product of 2 topmost stack elements \\
+{\tt isub }& difference of 2 topmost stack elements \\
+\end{tabular}
+
+The Java assignment statement {\tt a = b - c * d} is translated into
+
+\begin{tabular}{ll}
+{\tt iload b }& load contents of variable b \\
+{\tt iload c }& load contents of variable c \\
+{\tt iload d }& load contents of variable d \\
+{\tt imul }& compute c * d \\
+{\tt isub }& compute b - (c * d) \\
+{\tt istore a}& store stack top in variable a \\
+\end{tabular}
+
+Accessing the fields of objects is handled by the instructions {\tt
+getfield} and {\tt putfield}. The instruction {\tt getfield} expects an
+object reference on the stack and has an index into the constant pool as an
+operand. The index into the constant pool must be a reference to a pair
+containing the class name and a field name. The types of the classes
+referenced by the constant pool index and by the object reference must be
+compatible, a fact which is usually checked statically at load time. The
+object reference has to be different from the {\tt null} pointer, a fact
+which must usually be checked at run time.
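+
+As an illustration, loading the field {\tt f} of an object {\tt o} into a
+variable {\tt v} could be expressed as follows (the constant pool index
+{\tt \#2} is arbitrary and names the class/field pair):
+
+\begin{tabular}{ll}
+{\tt aload o }& load object reference o \\
+{\tt getfield \#2 }& fetch the field named by constant pool entry 2 \\
+{\tt istore v}& store the field value in variable v \\
+\end{tabular}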
+
+Array access is handled by the {\tt aload} and {\tt astore} instructions.
+Separate versions of these instructions exist for each of the basic types
+({\tt byte}, {\tt int}, {\tt float}, {\tt ref}, etc.). The {\tt aload}
+instruction expects a reference to an array and an index (of type {\tt
+int}) on the stack. The array reference must not be the {\tt null} pointer.
+The index must be greater than or equal to zero and less than the array
+length.
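+
+For example, reading the {\tt int} element {\tt a[i]} into a variable {\tt
+v} could be expressed as follows (using the {\tt int} version of the array
+load instruction):
+
+\begin{tabular}{ll}
+{\tt aload a }& load reference to array a \\
+{\tt iload i }& load index i \\
+{\tt iaload }& load element a[i] after null pointer and bounds checks \\
+{\tt istore v}& store the element in variable v \\
+\end{tabular}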
+
+There are special instructions for method invocation. Each method has its own
+virtual stack and an area for local variables. After the method invocation,
+the stack of the caller is empty and the arguments are copied into the
+first local variables of the called method. After execution of a {\tt
+return} instruction, the called method returns to its caller. If the called
+method is a function, it pops the return value from its own stack and
+pushes it onto the stack of the caller.
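+
+A call {\tt r = o.f(x)} could thus be expressed as follows (the constant
+pool index {\tt \#4} is arbitrary):
+
+\begin{tabular}{ll}
+{\tt aload o }& load the receiver object o \\
+{\tt iload x }& load the argument x \\
+{\tt invokevirtual \#4}& invoke method f; the arguments become the \\
+& first local variables of the callee \\
+{\tt istore r}& store the return value pushed by the callee \\
+\end{tabular}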
+
+The {\tt instanceof} and {\tt checkcast} instructions are used for subtype
+testing. Both expect a reference to an object on the stack and have an
+index into the constant pool as operand. The index must reference a class,
+array or interface type. The two instructions differ in their result and in
+their behavior if the object reference is {\tt null}: {\tt instanceof}
+pushes an {\tt int} result and yields 0 for {\tt null}, whereas {\tt
+checkcast} leaves the reference on the stack, lets {\tt null} pass and
+throws an exception if the test fails.
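+
+For example, a cast {\tt s = (String) o} could be expressed as follows
+(the constant pool index {\tt \#5} is arbitrary):
+
+\begin{tabular}{ll}
+{\tt aload o }& load object reference o \\
+{\tt checkcast \#5}& throw an exception unless o is a String or null \\
+{\tt astore s}& store the checked reference in variable s \\
+\end{tabular}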
+
+
+\section{Translation of stack code to register code}
+
+\label{chaptranslation}
+
+The architecture of a RISC processor is completely different from the stack
+architecture of the JavaVM. RISC processors have large sets of registers.
+(The Alpha has 32 integer registers and 32 floating point registers which
+are both 64 bits wide.) They execute arithmetic and logic operations only
+on values which are held in registers. Load and store instructions are
+provided to move data between memory and registers. Local variables of
+methods usually reside in registers and are saved in memory only during a
+method call or if there are too few registers. The use of registers for
+parameter passing is defined by calling conventions.
+
+\subsection{Machine code translation examples}
+
+The previous example expression {\tt a = b - c * d} would be translated
+by an optimizing C compiler to the following two Alpha instructions (the
+variables {\tt a}, {\tt b}, {\tt c} and {\tt d} reside in registers):
+
+\begin{quote}
+\begin{tabular}{ll}
+{\tt MULL c,d,tmp0 }&{\tt ; tmp0 = c * d }\\
+{\tt SUBL b,tmp0,a }&{\tt ; a = b - tmp0 }\\
+\end{tabular}
+\end{quote}
+
+If JavaVM code is translated to machine code, the stack is eliminated and
+the stack slots are represented by temporary variables usually residing in
+registers. A naive translation of the previous example would result in the
+following Alpha instructions:
+
+\begin{quote}
+\begin{tabular}{ll}
+{\tt MOVE b,t0 }&{\tt ; iload b } \\
+{\tt MOVE c,t1 }&{\tt ; iload c } \\
+{\tt MOVE d,t2 }&{\tt ; iload d } \\
+{\tt MULL t1,t2,t1}&{\tt ; imul } \\
+{\tt SUBL t0,t1,t0}&{\tt ; isub } \\
+{\tt MOVE t0,a }&{\tt ; istore a} \\
+\end{tabular}
+\end{quote}
+
+The main problems in translating JavaVM code to machine code are the
+elimination of unnecessary copy instructions and finding an efficient
+register allocation algorithm. A common but expensive technique is to do
+the naive translation and use an additional pass for copy elimination and
+coalescing.
+
+
+\section{The translation algorithm}
+
+The new translation algorithm can get by with three passes. The first pass
+determines basic blocks and builds a representation of the JavaVM
+instructions which is faster to decode. The second pass analyses the stack
+and generates a static stack structure. During stack analysis variable
+dependencies are tracked and register requirements are computed. In the
+final pass register allocation of temporary registers is combined with
+machine code generation.
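+
+In outline, the compiler driver for the three passes might look as follows
+(a sketch with illustrative function names, not the actual CACAO sources):
+
+\begin{verbatim}
+typedef struct methodinfo methodinfo; /* per-method compiler data    */
+
+void parse(methodinfo *m);            /* pass 1: basic blocks and
+                                         instruction array           */
+void analyse_stack(methodinfo *m);    /* pass 2: static stack,
+                                         dependencies, register
+                                         requirements                */
+void generate_code(methodinfo *m);    /* pass 3: register allocation
+                                         and machine code            */
+
+void compile_method(methodinfo *m)
+{
+    parse(m);
+    analyse_stack(m);
+    generate_code(m);
+}
+\end{verbatim}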
+
+The new compiler computes the exact number of objects needed, or an upper
+bound, and allocates the memory for the necessary temporary data structures
+in three big blocks (the basic block array, the instruction array and the
+stack array). Eliminating all the doubly linked lists also reduced the
+memory requirements by a factor of five.
+
+
+\subsection{Basic block determination}
+
+The first pass scans the JavaVM instructions, determines the basic blocks
+and generates an array of instructions which has fixed size and is easier
+to decode in the following passes. Each instruction contains the opcode,
+two operands and a pointer to the static stack structure after the
+instruction (see the next sections). The different JavaVM opcodes which
+fold operands into the opcode (such as {\tt iload\_0}) are represented by
+just one canonical opcode in the instruction array.
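+
+The fixed-size instruction format just described might be sketched in C as
+follows (a minimal sketch with illustrative names, not the actual CACAO
+sources):
+
+\begin{verbatim}
+struct stackelement;            /* built by the stack analysis pass */
+
+typedef struct instruction {
+    int opcode;                 /* canonical JavaVM opcode          */
+    int op1, op2;               /* the two operands                 */
+    struct stackelement *dst;   /* static stack after this
+                                   instruction                      */
+} instruction;
+\end{verbatim}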
+
+
+\subsection{Basic block interfacing convention}
+
+The handling of control flow joins was quite complicated in the old
+compiler. We therefore introduced a fixed interface at basic block
+boundaries. Every stack slot at a basic block boundary is assigned a fixed
+interface register. The stack analysis pass determines the type of the
+register and whether it has to be saved across method invocations. To
+enlarge basic blocks, method invocations do not end a basic block. To
+guide our compiler design we did some static analysis on a large
+application written in Java: the javac compiler and the libraries it uses.
+Table \ref{tab-1} shows that in more than 93\% of the cases the stack is
+empty at basic block boundaries and that the maximal stack depth is 6.
+Using this data it becomes clear that the old join handling did not improve
+the quality of the machine code.
+
+\begin{table*}
+\begin{center}
+\begin{tabular}[b]{|l|c|c|c|c|c|c|c|c|}
+\hline
+stack depth & 0 & 1 & 2 & 3 & 4 & 5 & 6 & $>$6 \\ \hline
+occurrences & 7930 & 258 & 136 & 112 & 36 & 8 & 3 & 0 \\ \hline
+\end{tabular}
+\caption{distribution of stack depth at block boundary}
+\label{tab-1}
+\end{center}
+\end{table*}
+
+
+\subsection{Copy elimination}
+
+To eliminate unnecessary copies the loading of values is delayed until the
+instruction is reached which consumes the value. To compute this
+information, the run-time stack is simulated at compile time. Instead of
+values, the compile-time stack records the type of each value, whether a
+local variable was loaded to a stack location, and similar information.
+Adl-Tabatabai \cite{Taba+98} used a dynamic stack which is changed at every
+instruction. A dynamic stack only allows information to flow from earlier
+instructions to later ones. We use a static stack structure which enables
+information flow in both directions.
+
+Fig.~\ref{Trans1} shows our instruction and stack representation. An
+instruction has a reference to the stack before the instruction and the
+stack after the instruction. The stack is represented as a linked list. The
+two stacks can be seen as the source and destination operands of an
+instruction. In the implementation only the destination stack is stored;
+the source stack is the destination stack of the previous instruction.
+
+\begin{figure}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(25,32)
+\put( 0, 0 ){\makebox(10,5){\tt b}}
+\put( 5, 2.5){\oval(10,5)}
+\put( 5, 9 ){\vector(0,-1){4}}
+\put( 0, 9 ){\makebox(10,5){\tt c}}
+\put( 5,11.5){\oval(10,5)}
+\put( 5,18 ){\vector(0,-1){4}}
+\put( 0,18 ){\makebox(10,5){\tt d}}
+\put( 5,20.5){\oval(10,5)}
+%\put( 5,27 ){\vector(0,-1){4}}
+\put(15, 9 ){\makebox(10,5){\tt c*d}}
+\put(20,11.5){\oval(10,5)}
+\put(20, 9 ){\line(0,-1){2}}
+\put( 5, 7 ){\line(1,0){15}}
+\put(10,27 ){\framebox(15,5){\tt imul}}
+\put(20,27 ){\vector(0,-1){13}}
+\put(15,27 ){\line(0,-1){6.5}}
+\put(15,20.5){\vector(-1,0){5}}
+\end{picture}
+\caption{instruction and stack representation}
+\label{Trans1}
+\end{center}
+\end{figure}
+
+This representation can easily be used for copy elimination. Each stack
+element not only contains the type of the stack slot but also the local
+variable number of which it is a copy, the argument number if it is an
+argument, and the interface register number if it is an interface slot.
+Load instructions (which push the contents of a variable onto the stack)
+and store instructions do not generate a copy machine instruction if the
+stack slot contains the same local variable. Generated machine instructions
+for arithmetic operations directly use the local variables as their
+operands.
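+
+The {\tt struct stackelement} left open in the sketch above could then
+look like this (again with illustrative names):
+
+\begin{verbatim}
+enum { LOCALVAR, ARGVAR, INTERFACEVAR, STACKVAR };
+
+struct stackelement {
+    struct stackelement *prev; /* next deeper slot: the stack is
+                                  a linked list                    */
+    int type;                  /* type of the stack slot           */
+    int flags;                 /* what the slot is a copy of       */
+    int varnum;                /* local variable, argument or
+                                  interface register number        */
+};
+\end{verbatim}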
+
+There are some pitfalls with this scheme. Take the example of
+fig.~\ref{Trans2}. The stack bottom contains the local variable {\tt a}.
+The instruction {\tt istore a} will write a new value for {\tt a},
+invalidating the later use of the copy on the stack. To avoid this we
+have to copy the local variable to a stack variable. An important
+decision is at which position the copy instruction should be inserted.
+Since there is a high number of {\tt dup} instructions in Java programs
+(around 4\%) and it is possible that a local variable resides in memory,
+the copy should be done by the {\tt load} instruction. Since the stack is
+represented as a linked list, only the destination stack has to be
+checked for occurrences of the offending variable, and these occurrences
+are replaced by a stack variable.
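+
+Continuing the illustrative {\tt stackslot} sketch from above, this scan
+could look as follows (again an assumption about the implementation, not
+CACAO's actual code):
+
+\begin{verbatim}
+/* before emitting "istore a": every slot on the destination stack that
+   is a copy of local a becomes a real stack temporary, so later uses
+   still see the old value; the copy is made at the load site */
+void invalidate_copies(stackslot *curstack, int a) {
+    for (stackslot *s = curstack; s != NULL; s = s->prev)
+        if (s->kind == LOCALVAR && s->varnum == a)
+            s->kind = TEMPVAR;
+}
+\end{verbatim}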
+
+
+\begin{figure}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(79,32)
+\put( 0,27 ){\framebox(15,5){\tt iload a}}
+\put(16,27 ){\framebox(13,5){\tt dup}}
+\put(30,27 ){\framebox(17,5){\tt iconst 1}}
+\put(48,27 ){\framebox(13,5){\tt iadd}}
+\put(62,27 ){\framebox(17,5){\tt istore a}}
+\put(10,27 ){\vector(0,-1){22}}
+\put( 5,27 ){\vector(0,-1){4}}
+\put( 5, 0 ){\makebox(10,5){\tt a}}
+\put(10, 2.5){\oval(10,5)}
+\put(20,27 ){\line(0,-1){2}}
+\put(20,25 ){\line(-1,0){10}}
+\put(25,27 ){\vector(0,-1){13}}
+\put(20, 9 ){\makebox(10,5){\tt a}}
+\put(25,11.5){\oval(10,5)}
+\put(25, 9 ){\line(0,-1){2}}
+\put(10, 7 ){\line(1,0){63}}
+\put(36,27 ){\line(0,-1){2}}
+\put(36,25 ){\line(-1,0){11}}
+\put(41,27 ){\vector(0,-1){4}}
+\put(36,18 ){\makebox(10,5){\tt 1}}
+\put(41,20.5){\oval(10,5)}
+\put(41,18 ){\line(0,-1){2}}
+\put(41,16 ){\line(-1,0){16}}
+\put(52,27 ){\line(0,-1){2}}
+\put(52,25 ){\line(-1,0){11}}
+\put(73,27 ){\line(0,-1){20}}
+\put(57,27 ){\vector(0,-1){13}}
+\put(52, 9 ){\makebox(10,5){\tt +}}
+\put(57,11.5){\oval(10,5)}
+\put(57,9 ){\line(0,-1){2}}
+\put(68,27 ){\line(0,-1){2}}
+\put(68,25 ){\line(-1,0){11}}
+\end{picture}
+\caption{anti dependence}
+\label{Trans2}
+\end{center}
+\end{figure}
+
+To answer the question of how often this could happen and how expensive
+the stack search is, we again analysed the {\tt javac} compiler. In more
+than 98\% of the cases the stack is empty (see table \ref{tab-2}). In
+only 0.2\% of the cases is the stack depth higher than 1, and the biggest
+stack depth is 3.
+
+\begin{table}[htb]
+\begin{center}
+\begin{tabular}[b]{|l|c|c|c|c|c|}
+\hline
+stack depth & 0 & 1 & 2 & 3 & $>$3 \\ \hline
+occurrences & 2167 & 31 & 1 & 3 & 0 \\ \hline
+\end{tabular}
+\caption{distribution of {\tt store} stack depth}
+\label{tab-2}
+\end{center}
+\end{table}
+
+\begin{table*}
+\begin{center}
+\begin{tabular}[b]{|l|c|c|c|c|c|c|c|c|c|c|}
+\hline
+chain length & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & $>$9 \\ \hline
+occurrences & 1892& 62 & 23 & 62 & 30 & 11 & 41 & 9 & 7 & 65 \\ \hline
+\end{tabular}
+\caption{distribution of creator-store distances}
+\label{tab-3}
+\end{center}
+\end{table*}
+
+To avoid copy instructions when executing a {\tt store}, it is necessary
+to connect the creation of a value with the store which consumes it. In
+that case a {\tt store} can conflict not only with copies of a local
+variable which result from {\tt load} instructions before the creator of
+the value, but also with {\tt load} and {\tt store} instructions which
+occur between the creation of the value and the {\tt store}. In
+fig.~\ref{Trans3} the {\tt iload a} instruction conflicts with the {\tt
+istore a} instruction.
+
+\begin{figure}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(67,27)
+\put( 0,22 ){\framebox(15,5){\tt iadd}}
+\put(16,22 ){\framebox(15,5){\tt iload a}}
+\put(32,22 ){\framebox(17,5){\tt istore b}}
+\put(50,22 ){\framebox(17,5){\tt istore a}}
+\put(10,22 ){\vector(0,-1){13}}
+\put( 5,22 ){\line(0,-1){2}}
+\put( 5,20 ){\line(-1,0){3}}
+\put( 2,20 ){\vector(0,-1){20}}
+\put(10, 4 ){\line(0,-1){2}}
+\put( 5, 4 ){\makebox(10,5){\tt +}}
+\put(10, 6.5){\oval(10,5)}
+\put(21,22 ){\line(0,-1){2}}
+\put(21,20 ){\line(-1,0){11}}
+\put(26,22 ){\vector(0,-1){4}}
+\put(21,13 ){\makebox(10,5){\tt a}}
+\put(26,15.5){\oval(10,5)}
+\put(26,13 ){\line(0,-1){2}}
+\put(10,11 ){\line(1,0){46}}
+\put(38,22 ){\line(0,-1){2}}
+\put(38,20 ){\line(-1,0){12}}
+\put(43,22 ){\line(0,-1){11}}
+\put(56,22 ){\line(0,-1){11}}
+\put(61,22 ){\line(0,-1){20}}
+\put( 2, 2 ){\line(1,0){59}}
+\end{picture}
+\caption{anti dependence}
+\label{Trans3}
+\end{center}
+\end{figure}
+
+The anti dependences are detected by checking the stack locations of the
+previous instructions for conflicts. Since the stack locations are
+allocated as one big array, only the stack elements which have a higher
+index than the current stack element have to be checked. Table
+\ref{tab-3} gives the distribution of the distance between the creation
+of a value and the corresponding store. In 86\% of the cases the distance
+is one.
+
+\begin{figure*}[htb]
+\begin{center}
+%\small
+\begin{verbatim}
+ java.io.ByteArrayOutputStream.write (int)void
+
+ Local Table:
+ 0: (adr) S15
+ 1: (int) S14
+ 2: (int) S13
+ 3: (int) S12 (adr) S12
+
+ Interface Table:
+ 0: (int) T24
+
+ [ L00] 0 ALOAD 0
+ [ T23] 1 GETFIELD 16
+ [ L02] 2 IADDCONST 1
+ [ L02] 3 NOP
+ [ ] 4 ISTORE 2
+ [ L02] 5 ILOAD 2
+ [ L00 L02] 6 ALOAD 0
+ [ T23 L02] 7 GETFIELD 8
+ [ T23 L02] 8 ARRAYLENGTH
+ [ ] 9 IF_ICMPLE L005
+ ...............
+
+ [ ] 18 IF_ICMPLT L003
+ [ ] L002:
+ [ I00] 19 ILOAD 3
+ [ I00] 20 GOTO L004
+ [ ] L003:
+ [ I00] 21 ILOAD 2
+ [ A00] L004:
+ [ L03] 22 BUILTIN1 newarray_byte
+ [ ] 23 ASTORE 3
+ [ L00] 24 ALOAD 0
+ [ A00] 25 GETFIELD 8
+ [ A01 A00] 26 ICONST 0
+ [ A02 A01 A00] 27 ALOAD 3
+ [ A03 A02 A01 A00] 28 ICONST 0
+ [ L00 A03 A02 A01 A00] 29 ALOAD 0
+ [ A04 A03 A02 A01 A00] 30 GETFIELD 16
+ [ ] 31 INVOKESTATIC java/lang/System.arraycopy
+ [ L00] 32 ALOAD 0
+ [ L03 L00] 33 ALOAD 3
+ [ ] 34 PUTFIELD 8
+ [ ] L005:
+ ...............
+
+ [ ] 45 RETURN
+\end{verbatim}
+\caption{Example: intermediate instructions and stack contents}
+\label{IntermediateStack}
+\end{center}
+\end{figure*}
+
+The output dependences are checked by storing the instruction number of
+the last store for each local variable. If a store conflicts due to these
+dependences, the creator places the value in a stack register. Additional
+dependences arise because of exceptions. The exception mechanism in Java
+is precise: {\tt store} instructions are not allowed to be executed
+before an exception raising instruction. This is checked easily by
+remembering the last instruction which could raise an exception. In
+methods which contain no exception handler this conflict can be safely
+ignored, because no exception handler can have access to these variables.
+
+
+\subsection{Register allocation}
+
+Expensive register allocation algorithms are neither suitable nor
+necessary. The {\tt javac} compiler does a coloring of the local
+variables and assigns the same number to variables which are not active
+at the same time. The stack variables have their live ranges implicitly
+encoded: when a value is pushed, the live range starts; when it is
+popped, the live range ends.
+
+Complications arise only with stack manipulation instructions like {\tt
+dup} and {\tt swap}. We therefore flag the first creation of a stack
+variable and mark a duplicated one as a copy. The register used for this
+variable can be reused only after the last copy is popped.
+
+During stack analysis stack variables are marked which have to survive a
+method invocation. These stack variables and local variables are assigned
+callee saved registers. If there are not enough registers available,
+these variables are allocated in memory.
+
+Efficient implementation of method invocation is crucial to the performance
+of Java. Therefore, we preallocate the argument registers and the return
+value in a similar way as we handle store instructions. Input arguments (in
+Java input arguments are the first variables) for leaf procedures (and
+input arguments for processors with register windows) are preassigned, too.
+
+
+\subsection{Instruction combining}
+
+Together with stack analysis we combine constant loading instructions
+with selected instructions which immediately follow them. The class of
+combinable instructions contains add, subtract, multiply and divide
+instructions, logical and shift instructions, and compare/branch
+instructions. During code generation the constant is checked to determine
+whether it lies in the range for immediate operands of the target
+architecture, and appropriate code is generated.
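+
+A minimal sketch of this code generation decision is shown below. The
+{\tt emit\_*} functions, the scratch register and the 8-bit unsigned
+immediate range (as on the Alpha) are assumptions for illustration, not
+CACAO's actual emitter interface:
+
+\begin{verbatim}
+#include <stdbool.h>
+#include <stdint.h>
+
+/* hypothetical emitter hooks */
+void emit_add_imm(int dst, int src, int32_t imm);
+void emit_load_const(int dst, int32_t c);
+void emit_add(int dst, int src1, int src2);
+enum { REG_ITMP1 = 25 };   /* assumed scratch register */
+
+static bool fits_immediate(int32_t c) {
+    return c >= 0 && c <= 255;   /* Alpha operate-format immediate */
+}
+
+void gen_iaddconst(int dst, int src, int32_t c) {
+    if (fits_immediate(c)) {
+        emit_add_imm(dst, src, c);        /* one combined instruction */
+    } else {
+        emit_load_const(REG_ITMP1, c);    /* materialize the constant */
+        emit_add(dst, src, REG_ITMP1);
+    }
+}
+\end{verbatim}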
+
+The old translator expanded some complex instructions into multiple
+instructions to avoid complex instructions in the later passes. One such
+case was the expansion of the {\tt lookup} instruction into a series of
+load constant and compare and branch instructions. Since the constants
+are usually quite small, this unnecessarily increased the size of the
+intermediate representation and of the final code. The new compiler
+delays the expansion into multiple instructions until the code generation
+pass, which reduces the size of all representations and speeds up
+compilation.
+
+
+\subsection{Example}
+
+Fig. \ref{IntermediateStack} shows the intermediate representation and
+stack information as produced by the compiler for debugging purposes. The
+{\tt Local Table} gives the types and register assignment for the local
+variables. The Java compiler reuses the same local variable slot for
+different local variables if their life ranges do not overlap. In this
+example, variable slot 3 is even used for local variables of different
+types (integer and address). The JIT compiler assigned the saved register
+12 to this variable.
+
+One interface register is used in this example when entering the basic
+block with label {\tt L004}. At the entry of the basic block the
+interface register has to be copied to the argument register {\tt A00}.
+This is one of the rare cases where a more sophisticated coalescing
+algorithm could have allocated an argument register for the interface.
+
+At instructions 2 and 3 you can see the combining of a constant with an
+arithmetic instruction. Since the instructions are allocated in an array,
+the empty slot has to be filled with a {\tt NOP} instruction. The {\tt
+IADDCONST} instruction already has the local variable {\tt L02} as its
+destination, information which comes from the later {\tt ISTORE} at
+number 4. Similarly, the {\tt INVOKESTATIC} at number 31 has marked all
+its operands as arguments. In this example all copies (besides the one to
+the interface register) have been eliminated.
+
+
+\subsection{Complexity of the algorithm}
+
+The complexity of the algorithm is mostly linear with respect to the
+number of instructions and the number of local variables plus the number
+of stack slots. There are only a few spots where it is not linear.
+
+\begin{itemize}
+\item At the beginning of a basic block the stack has to be copied to
+      separate the stacks of different basic blocks. Table \ref{tab-1}
+      shows that the stack at a basic block boundary is empty in most
+      cases. Therefore, this copying does not influence the linear
+      performance of the algorithm.
+\item A store has to check for a later use of the same variable. Table
+      \ref{tab-2} shows that this is not a problem either.
+\item A store additionally has to check for a previous use of the same
+      variable between the creation of the value and the store. The
+      distances between creation and use are small (in most cases only 1)
+      as shown by table \ref{tab-3}.
+\end{itemize}
+
+When compiling {\tt javac}, 29\% of the compile time is spent in parsing
+and basic block determination, 18\% in stack analysis, 16\% in register
+allocation and 37\% in machine code generation.
+
+
--- /dev/null
+\chapter{Class Library and Interface}
+
+\section{Introduction}
+
+\section{Header file generator}
--- /dev/null
+\chapter{Class Loader}
+
+\section{Introduction}
+
+\section{System class loader}
+
+\section{Data structures}
+
+\section{Dynamic class loader}
+
+\section{Eager - lazy class loading}
+
+\section{Linking}
+
+\section{Initialization}
--- /dev/null
+\section{Loop optimizations}
+
+\subsection{Introduction}
+
+In safe programming languages like Java, all array accesses are checked.
+It is assured that the index used is greater than or equal to zero and
+less than the array length; otherwise an ArrayIndexOutOfBoundsException
+is thrown. It is obvious that this gain in safety causes a run-time
+overhead that is especially unpleasant in loops, where these checks have
+to be performed many times. Often it is possible to remove these checks
+in loops by inserting additional tests at the beginning of the loop and
+examining the code structure and loop condition, thereby saving run time
+at each loop execution. The following algorithm performs this task for a
+special set of array accesses in loops. It should be obvious that it is
+not always possible to predict the value of an array index by inserting
+tests at the beginning of the loop. An example would be a global variable
+that is used as an array index. This variable could be changed by a
+different thread at virtually any time, so removing a bound check for
+this array access is not a good idea. The following algorithm only
+performs bound check removal when the induction variable (the variable
+used as array index) is local and is only modified by adding or
+subtracting a constant inside the loop body, or remains unchanged. If a
+constant is used as index, optimization can take place as well. When
+other variables are used to modify the induction variable or when
+different arithmetic operations are used, no optimization can take place.
+Nevertheless, the most common use of index variables in loops is their
+increment or decrement by a constant, so most array accesses should be
+candidates for optimization.
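+
+The following C-level illustration sketches the effect of the
+transformation; the real algorithm works on the intermediate
+representation, and the function names are ours:
+
+\begin{verbatim}
+/* before: every access performs the implicit bound check */
+int sum_checked(const int *a, int a_len, int n) {
+    int sum = 0;
+    for (int i = 0; i < n; i++) {
+        if (i < 0 || i >= a_len)
+            return -1;          /* ArrayIndexOutOfBoundsException */
+        sum += a[i];
+    }
+    return sum;
+}
+
+/* after: one test before loop entry selects an unchecked copy */
+int sum_optimized(const int *a, int a_len, int n) {
+    if (a != 0 && n <= a_len) {      /* static tests before the loop */
+        int sum = 0;
+        for (int i = 0; i < n; i++)
+            sum += a[i];             /* bound checks removed here */
+        return sum;
+    }
+    return sum_checked(a, a_len, n); /* fall back to original loop */
+}
+\end{verbatim}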
+
+\subsection{Initialization}
+
+Before array bound checks in loops can be eliminated, the loops have to
+be detected. The algorithm performs its analysis on the intermediate code
+level, so we cannot rely on just looking at while/for statements. A basic
+block analysis has already been completed, so the algorithm can work on
+this data structure. It uses the Lengauer-Tarjan algorithm to find
+existing loops, which is based on determining the dominator tree (a
+reference with extensive documentation for this algorithm can be found in
+\cite{appel98}). It uses a depth first search on the control flow graph
+to build a spanning tree. Then the semidominator and dominator theorems
+are utilized to extract the dominator tree, and by looking at back edges
+(a back edge is an edge whose target node dominates its source node) all
+loops can be detected. Before starting to look at each loop, the case of
+different loops that share the same header node has to be considered. The
+algorithm works on each loop without looking at any other loop and
+performs its optimization. The procedures that analyze the control flow
+of the program, build a flow graph and extract the loops can be found in
+the files graph.c (for the control flow graph) and loop.c (for the loop
+extracting algorithm). The control flow graph is simply built by looking
+at the last instruction of each basic block and then deciding which other
+blocks can be reached by that instruction. One problem can occur when
+lookup/table-switch instructions cause multiple jumps to the same node:
+it must be prevented that the target node's predecessor list contains
+that node more than once, so an additional array is needed to prevent
+double entries in the predecessor array. When the necessary flow data
+structure and the list of all loops have been built, the main algorithm
+can start.
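+
+A small sketch of the duplicate suppression mentioned above; the data
+layout is an assumption made for illustration:
+
+\begin{verbatim}
+/* add "source" to the predecessor list of "target", but at most once
+   per source block, even if a switch jumps to "target" repeatedly;
+   last_source[] remembers the source recorded last per target
+   (initialized to -1) and works because all edges of one source
+   block are processed consecutively */
+void add_predecessor(int target, int source,
+                     int **pred, int *pred_count, int *last_source) {
+    if (last_source[target] == source)
+        return;                        /* duplicate edge: skip */
+    last_source[target] = source;
+    pred[target][pred_count[target]++] = source;
+}
+\end{verbatim}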
+
+
+\subsection{Algorithm}
+
+The procedures for the main algorithm can be found in analyze.c. Before
+each loop is processed on its own, two global preprocessing steps are
+necessary.
+
+The first step deals with loops that share the same header node. This
+special case can happen when a loop body ends with an if-then/else
+statement. The loop finding algorithm then reports two different loops
+which share the same header node. If additional tests were inserted at
+the header node and another loop sharing the same header were later
+evaluated, inserting different tests, problems could arise. To prevent
+this from happening, loops sharing the same header node are simply merged
+into a bigger one by taking the union of their nodes. Because the nodes
+of each loop are already sorted by increasing basic block numbers, a
+simple merge of the nodes can be done (analyze\_double\_headers).
+
+The second step before the loop-by-loop analysis commences is the
+building of a loop hierarchy. Nested loops cause problems when variables
+that are used as array indexes elsewhere get modified. Because these
+modifications (e.g.\ an increment by a constant) can happen an unknown
+number of times, their results are unpredictable. Such modifications in
+nested loops have to be recognized and reacted upon accordingly. The
+algorithm builds a tree in which each node represents a loop, with its
+parent being the directly surrounding loop. A dummy root node,
+representing the whole function, has to be inserted as well. When loops
+have to be duplicated because of optimized array accesses, it is
+important to extend or duplicate exception entries as well. Because of
+that, exceptions are inserted into the tree as follows. Every exception
+is inserted at the node in the hierarchy that represents the loop that
+directly contains this exception. When an exception is part of a nested
+loop, it is inserted only at the node representing this nested loop,
+because by following this node's parent pointers we find all other loops
+that contain the exception (analyze\_nested). Finally, the sequential
+order of the loops is determined by a topological sort, satisfying the
+condition that all nested loops have to be optimized before their parent
+loop is processed. This can be achieved by a post-order traversal of the
+hierarchy tree, skipping the root node, as sketched below.
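+
+A minimal sketch of that traversal, under the assumption of a
+first-child/next-sibling tree representation (the names are not CACAO's):
+
+\begin{verbatim}
+struct loopnode {
+    struct loopnode *child;    /* first nested loop */
+    struct loopnode *sibling;  /* next loop at the same nesting level */
+    /* ... per-loop data ... */
+};
+
+void optimize_loop(struct loopnode *n);   /* per-loop optimizer */
+
+/* visit children first so nested loops are optimized before their
+   parents; the dummy root (the whole method) is skipped */
+void optimize_postorder(struct loopnode *n, int is_root) {
+    for (struct loopnode *c = n->child; c != NULL; c = c->sibling)
+        optimize_postorder(c, 0);
+    if (!is_root)
+        optimize_loop(n);
+}
+\end{verbatim}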
+
+After these global steps have been completed, each loop is processed.
+First the loop is checked for array accesses
+(analyze\_for\_array\_access), and the processing of the loop stops if
+none are found. Then two special cases must be considered.
+
+\begin{enumerate}
+
+\item The algorithm uses the loop condition to guarantee certain bounds
+for variables. If the loop condition consists of an or-statement, these
+guarantees cannot be upheld. If the loop statement reads
+
+\begin{verbatim}
+while ((var > 0) or true)
+\end{verbatim}
+
+the value of {\tt var} is not guaranteed to be greater than zero. When an
+or-statement is found, loop optimization is stopped.
+
+\item Variables that are used as indexes in array accesses might be
+modified in the exception handling code of try-catch statements inside
+the loop. Because code in catch blocks is not considered in the control
+flow graph, and because these modifications happen unpredictably (and
+should occur rarely), loops containing such catch blocks with index
+variable modifications are not optimized. During the scan for array
+accesses within the loop, all interesting variables are recorded. A
+variable is considered interesting when it is used as an index in an
+array access or when its value is changed (due to a store instruction)
+(analyze\_or\_exceptions and scan\_global\_list).
+
+\end{enumerate}
+
+The next step is responsible for the analysis of the loop condition
+itself. The loop condition can create a so-called dynamic constraint on a
+variable. If one side of the loop condition is constant during the
+execution of the loop, the other side can be safely assumed to be less
+than, greater than or equal to, or equal to this side (depending on the
+operator) at the start of each loop pass. It is important to notice the
+difference to so-called static constraints on variables. These mean that
+a variable can be safely assumed to stay below or above a certain
+constant during the whole loop execution by simply testing it prior to
+loop entry. This is obviously true for constants but does also hold for
+variables that are only decremented or incremented. For these, a static
+lower or upper bound can be guaranteed for the whole loop execution by
+inserting a single test prior to loop entry. Dynamic constraints, on the
+contrary, can vary in different parts of the loop: after the variable is
+tested in the loop condition, it might be changed to different values in
+different paths of execution. All variables that are never changed or
+that only get decremented/incremented have static and dynamic
+constraints; all others can only have dynamic ones (init\_constraint).
+A short illustration follows.
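+
+The following fragment illustrates the two constraint kinds (plain C for
+readability; the analysis itself runs on intermediate code):
+
+\begin{verbatim}
+void fill(int *a, int a_len, int n) {
+    int i = 0;
+    /* i is only ever incremented, so a single test "i >= 0" before
+       loop entry yields a static lower bound for the whole loop */
+    while (i < n) {     /* dynamic constraint: i < n at the loop head */
+        a[i] = 0;       /* safe once i < a_len is established as well */
+        i = i + 1;
+    }
+}
+\end{verbatim}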
+
+Now the core bound check removal procedure is started. Beginning directly
+after the loop condition, all changes to variables that are used as index
+variables somewhere in the loop are recorded. This is an iterative
+process, since different paths of execution may yield different changes
+to these variables. Especially changes in nested loops cause these
+changes to become unpredictable, so that no upper/lower bound can be
+upheld any further. After this iterative process has made all changes
+stable, each array access is examined (e.g.\ a statement like
+\begin{verbatim}
+if (x == true)
+ i++;
+else
+ i+=2;
+\end{verbatim}
+must be treated as an increase of 2 for i when control flow joins after
+the if-statement). If it is possible, by inserting additional
+static/dynamic tests as needed, to assure that the index variable is
+greater than or equal to zero and less than the array length, the bound
+checks are removed (remove\_bound\_checks). It is possible that more than
+one static test gets inserted for a single variable, when it is used in
+different array accesses (possibly with different constants added or
+subtracted). Because all tests have to hold for this variable to allow
+optimized code to be executed, only the strictest tests have to be done
+(e.g.\ if an integer array x[] is accessed by i in the statements x[i+1]
+and x[i-1], it has to be guaranteed that i $\ge$ 1 (for the second
+statement) and that i $<$ arraylength$-$1 (for the first statement)).
+Parallel to the insertion of new tests, the number of instructions needed
+for the new loop header node is increased accordingly. When optimizing
+the loop, it is important to differentiate between the loop head and the
+rest of the basic blocks, which form the body. The dynamic constraints
+can only be used for array accesses in the loop body, as the loop
+comparison is done at the end of the header node. Nevertheless, all
+static constraints can still be used to optimize array accesses in the
+header block. Because it is possible that an array reference is null
+prior to entering the loop, all array references that are loaded to
+compute an arraylength have to be checked against null.
+
+After all new tests have been determined, it is necessary to reorder the
+code and insert the new tests (insert\_static\_checks). The first step is
+the creation of a new header node. Because all jumps to the beginning of
+the old loop now have to pass through the new header first, it is more
+efficient to just replace the code in the old header block by the new
+tests. In this way only jumps within the loop need to be patched and the
+rest of the code can remain untouched. For each constraint that has been
+found in the previous step of the algorithm, two values have to be loaded
+and are then compared. Because it is possible at run time that these
+tests fail (and no guarantee can be made), a copy of the loop with the
+original, checked array accesses must exist. Depending on the outcome of
+the test cascade, a jump to the optimized or to the original loop is
+made. To copy the loop, all basic blocks that are part of the loop are
+copied and appended to the end of the global basic block list. After
+that, both the original and the copy of the loop need post-processing to
+redirect certain jump targets. All jumps to the old loop head (which now
+contains the static checks) in the original loop have to be redirected to
+the newly inserted block that holds the code for the original loop head.
+In the copied loop, all jumps inside the loop have to be redirected to
+the copied blocks. When loops are duplicated, these changes must be
+reflected in the node lists of all parent loops as well. So the hierarchy
+tree is climbed and the new nodes are added to all enclosing loops.
+Because these node lists are sorted, the new loop head needs special
+treatment: it has to be inserted into the correct place of the parent
+loops, while all other copied nodes can simply be appended to the basic
+block list.
+
+Now all exceptions have to be examined. There are three different cases
+that must be considered: an exception can be part of the loop body (i.e.\
+it is inside the loop), an exception can contain the loop, or an
+exception is in a different part of the code (and does not have to be
+considered further). To find all exceptions that are part of the loop,
+the algorithm uses the hierarchy tree and collects the exceptions of all
+children nodes. The exception handlers of these loops have to be copied,
+and jumps to the original loop have to be redirected to the copied loop.
+The start and end code pointers of the protected area have to be moved to
+the copied blocks. The exception handler code is identified by looking at
+the successor nodes starting from the handler block: as long as control
+flow does not reach a loop node again, all blocks are considered part of
+the handler code. Exceptions that surround the loop have to be handled
+differently. It is not necessary to copy the exception handler, as it is
+not part of the loop. Nevertheless it is important to extend the
+protected area to the copied blocks. So the first and last blocks
+(including copied exception handler blocks of nested loops) that got
+copied are stored. The exceptions are then extended to contain all these
+new blocks and jump to the original handler. As no code is duplicated,
+nothing needs to be patched. When climbing up the hierarchy tree by
+following the parent pointers, not all exceptions of parent nodes really
+enclose the loop: it is possible that a parent loop contains an exception
+and the loop that is being optimized right after each other. Those
+exceptions must not be handled and are ignored. Because of the layout of
+the exception table, where appropriate exceptions (which could be nested)
+are found by a linear search from the start, it is necessary to insert
+newly created exceptions right after their original ones.
+
+One performance problem still remains after these modifications. The new
+loop header that replaces the original one is located exactly where the
+old one was. A fall-through from the previous block (which could be part
+of the loop) to the loop header would have to be patched by inserting an
+additional goto instruction to the original head of the loop. Because
+this would insert an additional jump into the loop body, performance may
+deteriorate. To solve this shortcoming, the new loop head has to be moved
+before the first node of the loop. This might require moving the loop
+head to the beginning of the global basic block list. This pointer points
+to the initial basic block array and cannot easily be reassigned, so a
+new pointer is needed that temporarily holds the beginning of the basic
+block list and is assigned to the original pointer after all optimization
+steps have been finished. Any fall-through from the predecessor of the
+loop head now reaches the old loop head, which has been inserted right
+after the new loop head. After all loops have been processed, register
+allocation and code generation proceed as usual and optimized code is
+generated.
+
+\subsection{Helper functions}
+
+An important helper function is stored in tracing.c. This function is
+needed to determine the variables or constants that participate in array
+accesses or comparisons. As all work is done on Java bytecode, it is
+necessary to find the arguments of an interesting operation by examining
+the stack. Unfortunately, these values can be just temporaries that
+originate from variables loaded earlier and modified by arithmetic
+operations. To find the variables that participate in an array access,
+one has to walk back through the instructions, looking at the stack,
+until an appropriate load is found. The function tracing performs this
+task by stepping back instruction by instruction and recording the stack
+changes these instructions cause, until the correct load or push constant
+operation has been found or until it becomes clear that it is impossible
+to determine the origin (e.g.\ when a function return value is used). The
+value is then delivered in a special structure and used by the main
+algorithm.
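+
+A much reduced sketch of the idea, handling only three opcodes (the real
+tracing covers the full instruction set; the names are illustrative):
+
+\begin{verbatim}
+typedef struct insn { int op; int val; struct insn *prev; } insn;
+enum { ILOAD, ICONST, IADD };
+
+/* find the instruction that produced the value lying "depth" slots
+   below the stack top after instruction ip, or NULL if the origin
+   cannot be determined (here: any computed value) */
+insn *trace(insn *ip, int depth) {
+    for (; ip != NULL; ip = ip->prev) {
+        int pops = (ip->op == IADD) ? 2 : 0;   /* all three push 1 */
+        if (depth == 0)                        /* ip produced the slot */
+            return (ip->op == IADD) ? NULL : ip;
+        depth = depth - 1 + pops;              /* position before ip */
+    }
+    return NULL;
+}
+\end{verbatim}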
+
+\subsection{Performance - impact and gain}
+
+It is obvious that the optimization process causes additional compile
+time overhead but can yield performance gains at run time, when three
+fewer instructions are executed per array access many times in a loop.
+The additional overhead is mainly caused by the needed control flow
+analysis, which has to be done for every method and is linear in the
+number of basic blocks. Then the loop scanning algorithm tries to find
+loops in the control flow graph. This algorithm has to be started every
+time a method is compiled and runs in $O(n \cdot \mathrm{ld}\,n)$, where
+$n$ is the number of basic blocks. After the loops have been scanned for
+array accesses, the run time of the final bound check removal and loop
+copying process mainly depends on the nesting depth of the hierarchy
+tree. For every additional nesting depth, the number of basic blocks that
+must be copied and patched doubles, resulting in overhead that is
+exponential in the nesting depth. Additionally, the search for
+modifications of index variables, which is an iterative process, is
+slowed down by deeply nested loops. Things get worse when many exceptions
+are involved, because not only exceptions in children nodes have to be
+considered, but all enclosing exceptions as well. Nevertheless, those
+cases should occur rarely, and in real environments the net gain can be
+significant. Especially in tight loops, when arrays are initialized or
+their elements summed up, three fewer instructions can gain up to 30
+percent speedup when loops run many times.
+
+
+\begin{table}[htb]
+\begin{center}
+\begin{tabular}{|l|c|c|c|}
+\hline
+ & Compile time (in ms) & \multicolumn{2}{c|}{Run time (in ms)} \\
+\multicolumn{1}{|c|}{\raisebox{1ex}{Cacao options}} & javac & perf & sieve 1000 1000 \\ \hline
+\rule{0mm}{5mm}with optimization (-oloop) & 176 & 1510 & 98 \\
+without optimization & 118 & 1720 & 131 \\
+\hline
+\end{tabular}
+\caption{Compile time and run time with and without loop optimization}
+\label{tab-loopopt}
+\end{center}
+\end{table}
+
--- /dev/null
+\section{MIPS code generator}
--- /dev/null
+\section{Native interface}
--- /dev/null
+\chapter{Overview}
+
+\section{Introduction}
+
+\section{Class loader}
+
+\section{Just-in-time compiler}
+
+\section{Run time system}
+
+\section{Class library}
--- /dev/null
+\section{PowerPC code generator}
--- /dev/null
+\section{Reflection}
--- /dev/null
+\chapter{Run Time System}
+
+\section{Introduction}
+
+\section{Object layout}
+
+\section{Method invocation}
+
+Java and the programming language Theta do not implement multiple
+inheritance, but single inheritance with multiple subtyping. This important
+difference makes object layout and method dispatch more efficient. Although
+the bidirectional layout was designed for a language with multiple
+subtyping, it has the problem that more than one {\em vtbl} pointer has to
+be included in objects. The CACAO JIT compiler \cite{KrGr97b} moves the
+dispatch header of the bidirectional layout into the class information with
+negative offsets from the {\em vtbl}. For each implemented interface there
+is a distinct interface {\em vtbl}. Unimplemented interfaces are
+represented by null pointers.
+%
+\begin{figure*}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(114,72)
+\put( 0,42){\vector(1,0){12}}
+\put( 0,42){\makebox(8,6){\it objptr}}
+\put(12,60){\makebox(24,6){\it object}}
+\put(48,63){\makebox(30,6){\it class}}
+\put(90,66){\makebox(24,6){\it code}}
+\put(12,42){\makebox(24,18){\it instance data}}
+\put(12,36){\makebox(24,6){\it class pointer}}
+\put(12,42){\line(1,0){24}}
+\put(36,39){\vector(1,0){12}}
+\put(48,57){\makebox(30,6){\it method pointer}}
+\put(48,57){\line(1,0){30}}
+\put(48,51){\makebox(30,6){\it method pointer}}
+\put(48,51){\line(1,0){30}}
+\put(48,45){\makebox(30,6){\it method pointer}}
+\put(48,45){\line(1,0){30}}
+\put(48,39){\makebox(30,6){\it class info}}
+\put(48,27){\makebox(30,6){\it interface pointer}}
+\put(48,33){\line(1,0){30}}
+\put(48,33){\makebox(30,6){\it interface pointer}}
+\put(48, 0){\makebox(30,6){\it method pointer}}
+\put(48, 6){\makebox(30,6){\it method pointer}}
+\put(48, 6){\line(1,0){30}}
+\put(48,21){\makebox(30,6){\it interfaces}}
+\put(90,36){\framebox(24,6){\it method code}}
+\put(44,15){\vector(1,0){4}}
+\put(44,15){\line(0,1){15}}
+\put(44,30){\line(1,0){4}}
+\put(40, 0){\vector(1,0){8}}
+\put(40, 0){\line(0,1){36}}
+\put(40,36){\line(1,0){8}}
+\put(78, 3){\line(1,0){8}}
+\put(86, 3){\line(0,1){51}}
+\put(78, 9){\line(1,0){4}}
+\put(82, 9){\line(0,1){39}}
+\put(78,18){\line(1,0){4}}
+\put(78,54){\line(1,0){8}}
+\put(86,36){\vector(1,0){4}}
+\put(90,48){\framebox(24,6){\it method code}}
+\put(78,48){\vector(1,0){12}}
+\put(90,60){\framebox(24,6){\it method code}}
+\put(78,60){\vector(1,0){12}}
+\thicklines
+\put(48, 0){\framebox(30,12){}}
+\put(48,15){\framebox(30,6){\it method pointer}}
+\put(12,36){\framebox(24,24){}}
+\put(48,27){\framebox(30,36){}}
+\put(48,39){\line(1,0){30}}
+\end{picture}
+\caption{CACAO object and compact class descriptor layout}
+\label{objectcompactlayout}
+\end{center}
+\end{figure*}
+
+To call a virtual method, two memory access instructions are necessary
+(load the class pointer, load the method pointer) followed by the call
+instruction. Calling an interface method needs an additional indirection.
+%
+\begin{figure*}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(114,42)
+\put(0,15){\vector(1,0){12}}
+\put(0,15){\makebox(12,6){\it objptr}}
+\put(12,33){\makebox(24,6){\it object}}
+\put(48,36){\makebox(30,6){\it class}}
+\put(90,39){\makebox(24,6){\it code}}
+\put(12,15){\makebox(24,18){\it instance data}}
+\put(12,9){\makebox(24,6){\it class pointer}}
+\put(12,15){\line(1,0){24}}
+\put(36,12){\vector(1,0){12}}
+\put(48,30){\makebox(30,6){\it method pointer}}
+\put(48,30){\line(1,0){30}}
+\put(48,24){\makebox(30,6){\it method pointer}}
+\put(48,24){\line(1,0){30}}
+\put(48,18){\makebox(30,6){\it method pointer}}
+\put(48,18){\line(1,0){30}}
+\put(48,12){\makebox(30,6){\it class info}}
+\put(48,0){\makebox(30,6){\it ifmethod pointer}}
+\put(48,6){\line(1,0){30}}
+\put(48,6){\makebox(30,6){\it ifmethod pointer}}
+\put(90,3){\framebox(24,6){\it method code}}
+\put(78,3){\vector(1,0){12}}
+\put(82,3){\line(0,1){18}}
+\put(78,21){\line(1,0){4}}
+\put(78,27){\line(1,0){8}}
+\put(78,9){\line(1,0){8}}
+\put(86,9){\line(0,1){18}}
+\put(86,18){\vector(1,0){4}}
+\put(90,18){\framebox(24,6){\it method code}}
+\put(78,33){\vector(1,0){12}}
+\put(90,33){\framebox(24,6){\it method code}}
+\thicklines
+\put(12,9){\framebox(24,24){}}
+\put(48,0){\framebox(30,36){}}
+\put(48,12){\line(1,0){30}}
+\end{picture}
+\caption{CACAO object and fast class descriptor layout}
+\label{objectfastlayout}
+\end{center}
+\end{figure*}
+
+In the faster scheme, we store interface methods in an additional table at
+negative offsets from the {\em vtbl} (see figure~\ref{objectfastlayout}).
+Segregating the interface virtual function table keeps the standard virtual
+function table small and allows interface methods to be called with just
+two memory accesses. The memory consumption of virtual function tables
+containing interface and class methods would be {\em number of (classes +
+interfaces) $\times$ number of distinct methods}. The memory consumption of
+the interface tables is only {\em number of classes which implement
+interfaces $\times$ number of interface methods}. Colouring can be used to
+reduce the number of distinct offsets for interface methods further, but
+complicates dynamic class loading, leading to renumbering and code
+patching.
+
+The Jalapeno virtual machine implements interface method invocation
+similarly to the fast class descriptor layout of CACAO, but instead of
+colouring, hashing of the method indices is used \cite{Alpern+01}. The
+table for the interface method pointers is allocated with a fixed size
+much smaller than the number of interface methods. When two method
+indices hash to the same offset, a conflict resolving stub is called
+instead of the interface method directly. For conflict resolution the
+stub is passed the method index in a scratch register as an additional
+argument. An interface method invocation can be executed with the
+following four machine instructions:
+%
+\begin{verbatim}
+LD vtblptr,(obj) ; load vtbl pointer
+LD mptr,hash(method_ptr)(vtblptr) ; load method pointer
+MV mindex,idreg ; load method index
+JSR (mptr) ; call method (or conflict stub)
+\end{verbatim}
+%
+The number of machine instructions is the same as in the compact class
+descriptor layout of CACAO, but the indirection is eliminated, which
+reduces the number of cycles needed to execute this instruction sequence
+on a pipelined architecture. Compared to the old interface method
+invocation in the Jalapeno VM, which searched the superclass hierarchy
+for a matching method signature, the new method resulted in run-time
+changes ranging from a one percent slowdown to speedups of up to 51
+percent.
+
+
+\section{Run time type checking}
+
+Since type tests for trees are more (space) efficient than type tests for
+DAGs, a possible solution is to split a DAG into a tree part and the
+remaining graph. For languages with single inheritance and multiple
+subtyping this partitioning of the class hierarchy is already done in the
+language itself.
+
+\begin{figure}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(86,26)
+\thicklines
+\put(23, 3){\circle{6}}
+\put(43, 3){\circle{6}}
+\put(63, 3){\circle{6}}
+\put(33,13){\circle{6}}
+\put(53,13){\circle{6}}
+\put(43,23){\circle{6}}
+\put(20, 0){\makebox(6,6){D}}
+\put(40, 0){\makebox(6,6){E}}
+\put(60, 0){\makebox(6,6){F}}
+\put(30,10){\makebox(6,6){B}}
+\put(50,10){\makebox(6,6){C}}
+\put(40,20){\makebox(6,6){A}}
+\put(19, 0){\makebox(0,6)[r]{\{3,0\}}}
+\put(47, 0){\makebox(6,6)[l]{\{4,0\}}}
+\put(67, 0){\makebox(0,6)[l]{\{6,0\}}}
+\put(29,10){\makebox(0,6)[r]{\{2,2\}}}
+\put(57,10){\makebox(0,6)[l]{\{5,1\}}}
+\put(47,20){\makebox(0,6)[l]{\{1,5\}}}
+\put(25, 5){\line( 1,1){6}}
+\put(35,15){\line( 1,1){6}}
+\put(41, 5){\line(-1,1){6}}
+\put(51,15){\line(-1,1){6}}
+\put(61, 5){\line(-1,1){6}}
+\end{picture}
+\caption{Relative numbering with \{{\tt baseval}, {\tt diffval}\} pairs}
+\label{relnumbering}
+\end{center}
+\end{figure}
+
+CACAO uses a subtype test based on relative numbering for classes and a
+kind of matrix implementation for interfaces. Two numbers {\em low} and
+{\em high} are stored for each class in the class hierarchy. A depth first
+traversal of the hierarchy increments a counter for each class and assigns
+the counter to the {\em low} field when the class is first encountered and
+assigns the counter to the {\em high} field when the traversal leaves the
+class. In languages with dynamic class loading a renumbering of the
+hierarchy is needed whenever a class is added. A class {\em sub} is a
+subtype of another class {\em super}, if {\em super.low $\le$ sub.low $\le$
+super.high}. Since a range check is implemented more efficiently by an
+unsigned comparison, CACAO stores the difference between the {\em low} and
+{\em high} values and compares it against the difference of the {\em low}
+values of both classes. The code for {\tt instanceof} looks similar to:
+
+\begin{verbatim}
+return (unsigned) (sub->vftbl->baseval - super->vftbl->baseval) <=
+ (unsigned) (super->vftbl->diffval);
+\end{verbatim}
+
+For leaf nodes in the class hierarchy the {\tt diffval} is 0, which
+results in a faster test (a simple equality comparison of the {\tt
+baseval} fields of the sub- and superclass). In
+figure~\ref{relnumbering}, for example, E with {\tt baseval} 4 is a
+subtype of B since $4-2 \le 2$, but not of C, since $4-5$ wraps to a
+large unsigned value and the comparison against C's {\tt diffval} of 1
+fails. In general a JIT compiler can generate the faster test only for
+final classes. An AOT compiler, or a JIT compiler which patches the
+already generated machine code, may additionally replace both the {\tt
+baseval} and the {\tt diffval} of the superclass by constants. Currently
+CACAO uses constants only when dynamic class loading is not used.
+
+CACAO stores an interface table at negative offsets from the virtual method
+table (see figure~\ref{objectcompactlayout}). This table is needed for the
+invocation of interface methods. The same table is additionally used by the
+subtype test for interfaces. If the table is empty for the index of the
+superclass, the subtype test fails. The code for {\tt instanceof} looks
+similar to:
+
+\begin{verbatim}
+return (sub->vftbl->interfacetable[-super->index] != NULL);
+\end{verbatim}
+
+Both subtype tests can be implemented by very few machine code instructions
+without using branches which are expensive on modern processors.
+
+
+\section{Exception handling}
+
+\subsection{Introduction}
+
+Exceptions in Java occur either implicitly or explicitly. Typical
+implicit exceptions are dereferences of the {\tt null} pointer, array
+index out of bounds and division by zero. Exceptions can also be raised
+explicitly with the {\tt throw} instruction. To handle exceptions
+occurring during execution, code which can raise an exception is included
+in a {\tt try} block. An efficient implementation of exception handling
+has to take care of managing {\tt try} blocks and has to check for
+implicit exceptions efficiently.
+
+
+\subsection{Known implementation techniques}
+
+Three standard methods exist for implementing exception handling:
+
+\begin{itemize}
+
+\item dynamically create a linked list of {\tt try} block data
+ structures,
+
+\item use static {\tt try} block tables and search these tables at run
+ time (suggested for JavaVM interpreters),
+
+\item use functions with two return values.
+
+\end{itemize}
+
+The first method has been used in portable implementations of exception
+handling for C++ \cite{Cameron+92} or Ada \cite{Giering+94} using {\tt
+setjmp} and {\tt longjmp}. A linked exception handling data structure is
+created when entering a {\tt try} block and the structure is discarded
+when leaving the protected block. Java requires precise exceptions: all
+expressions evaluated before the exception raising instruction must have
+finished, and all expressions after the raising instruction must not
+have been started. Therefore, in practice, some instructions may not be
+moved freely. In the case of subroutine calls, the callee-saved
+registers must be restored to their original values. The data structure
+can be used to store such information. The disadvantage of this method
+is that creating and discarding the data structure takes some time even
+if an exception is never raised.
+
+The second method has been suggested for an efficient exception
+handling implementation of C++ \cite{KoenigStroustrup90} and is used
+in Java implementations. For every method, the JavaVM maintains an
+exception table. This exception table contains the program counter of
+the start and the end of the {\tt try} block, the program counter of
+the exception handler and the type of the exception. A JavaVM
+interpreter can easily interpret this structure and dispatch to the
+corresponding handler code. If the byte code is translated to native
+code, the equivalent technique is more complicated.
+
+To simplify restoration of the registers, the old CACAO implementation
+used a different scheme \cite{KrGr97b}. A method has two return
+values: the real return value and an exception value stored in a
+register. After each method call, the exception register is checked
+and, if it is non-zero, the exception handling code is executed. Since
+an exception is rarely raised, the branch is easy to predict and
+cheap. Entering and leaving a {\tt try} block have no associated
+costs. At compile time, the dispatch information contained in the
+exception table is translated into method dispatching native code.
+
+Run time checks for null pointers and array bounds are quite frequent,
+but can be eliminated in many cases. It is often possible to move a
+loop invariant null pointer check before the loop or to eliminate a
+bound check. Some more sophisticated compilers use these code motion
+techniques.
+
+
+\subsection{Motivation for a change}
+
+\begin{table*}
+\begin{center}
+\begin{tabular}[b]{|l|c|c|c|c|c|}
+\hline
+ & JavaLex & javac & espresso & Toba & java\_cup \\
+\hline
+null pointer checks & 6859 & 8197 & 11114 & 5825 & 7406 \\
+\hline
+method calls & 3226 & 7498 & 7515 & 4401 & 5310 \\
+\hline
+try blocks & 20 & 113 & 44 & 28 & 27 \\
+\hline
+\end{tabular}
+\caption{Number of pointer checks, method invocations and try blocks}
+\label{MethodTry}
+\end{center}
+\end{table*}
+
+The old CACAO implementation was simple, but it only makes sense if the
+number of {\tt try} blocks is high. We made an empirical study counting
+the static occurrences of method invocations and of {\tt try} blocks in
+some applications (see table \ref{MethodTry}). The number of method
+invocations is two orders of magnitude bigger than the number of {\tt
+try} blocks. Furthermore, an exception is rarely raised during program
+execution. This led us to a new implementation of exception handling.
+
+
+\subsection{The new exception handling scheme}
+
+The new exception handling scheme is similar to that of a JavaVM
+interpreter: if an exception occurs, the information in the exception
+table is interpreted. However, native code complicates matters.
+
+The pointers to Java byte code must be replaced by pointers to native
+code. This requires that, during native code generation, the order of
+basic blocks does not change. If basic blocks are eliminated because of
+dead code, the information about a block cannot be discarded if there is
+a pointer to it in the exception table.
+
+A CACAO stack frame only contains copies of saved or spilled
+registers. There is no saved frame pointer. The size of a stack frame
+is only contained in the instructions which allocate and deallocate
+the stack. Therefore, to support exception handling, additional
+information has to be stored elsewhere.
+
+The code for a method needs access to constants (mostly address
+constants). Since a global constant table would be too large for short
+address ranges and, because methods are compiled on demand, every
+method has its own constant area which is allocated directly before
+the start of the method code (see fig.\ \ref{methodlayout}). A
+register is reserved which contains the method pointer. Constants are
+addressed relative to the method pointer.
+
+\begin{figure}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(50,18)
+\put(24,6){\vector(-1,0){6}}
+\put(24,3){\makebox(25,6){\it method pointer}}
+\put(0,0){\makebox(18, 6){\it constants}}
+\put(0,6){\makebox(18,12){\it code}}
+\thicklines
+\put(0,6){\line(1,0){18}}
+\put(0,0){\framebox(18,18){}}
+\end{picture}
+\caption{CACAO method layout}
+\label{methodlayout}
+\end{center}
+\end{figure}
+
+During a method call, the method pointer of the calling method is
+destroyed. However, the return address is stored in a register which is
+preserved during execution of the called method and has to be used for
+returning from the method. After a method return, the method pointer of
+the calling method is recomputed using the return address. The following
+code demonstrates the method calling convention:
+
+\begin{verbatim}
+LDQ cp,(obj) ; load class pointer
+LDQ mp,met(cp) ; load method pointer
+JSR ra,(mp) ; call method
+LDA mp=ra+offset ; recompute method pointer
+\end{verbatim}
+
+At the beginning of the constant area, there are fields which contain
+the necessary information for register restoration:
+
+\begin{mylist}{{\tt framesize}}
+\item[{\tt framesize}] the size of the stack frame
+\item[{\tt isleaf}] a flag which is true if the method is a leaf
+method
+\item[{\tt intsave}] number of saved integer registers
+\item[{\tt floatsave}] number of saved floating point registers
+\item[{\tt extable}] the exception table -- similar to the JavaVM
+table
+\end{mylist}
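+
+Expressed as a C structure, this header could look as follows (a sketch;
+the actual field layout in CACAO may differ):
+
+\begin{verbatim}
+/* per-method information stored at the start of the constant area */
+typedef struct method_header {
+    int framesize;    /* size of the stack frame */
+    int isleaf;       /* true if the method calls no other methods */
+    int intsave;      /* number of saved integer registers */
+    int floatsave;    /* number of saved floating point registers */
+    /* the exception table follows, similar to the JavaVM table */
+} method_header;
+\end{verbatim}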
+
+The exception handler first checks if the currently executing method has
+an associated handler and may dispatch to this handler. If there is no
+handler, it unwinds the stack and searches the parent methods for a
+handler. The information in the constant area is used to restore the
+registers and update the stack pointer. The return address and the
+offset in the immediately following {\tt LDA} instruction are used to
+recompute the method pointer.
+
+\begin{table*}
+\begin{center}
+\begin{tabular}[b]{|l|c|c|c|c|c|}
+\hline
+ & JavaLex & javac & espresso & Toba & java\_cup \\ \hline
+CACAO old & 61629 & 156907 & 122951 & 67602 & 87489 \\ \hline
+CACAO new & 37523 & 86346 & 69212 & 41315 & 52386 \\ \hline
+\end{tabular}
+\caption{Number of generated native instructions}
+\label{CodeSize}
+\end{center}
+\end{table*}
+
+The change to the new scheme allowed us to implement the null pointer
+check for free. We protect the first 64 Kbytes of memory against read
+and write access. If a bus error is raised, we catch the signal and
+check whether the faulting address is within the first 64K. If this is
+the case, we dispatch to our exception handler; otherwise we propagate
+the signal. This gives faster programs and reduces the work of the
+compiler in generating pointer checking code. As shown in table
+\ref{MethodTry}, the number of null pointer checks is quite high.
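+
+A POSIX sketch of this technique is shown below. The handler and the
+dispatch function are illustrative assumptions; the actual CACAO
+implementation differs in detail:
+
+\begin{verbatim}
+#include <signal.h>
+
+extern void dispatch_nullpointerexception(void *ctx); /* hypothetical */
+
+static void fault_handler(int sig, siginfo_t *info, void *ctx) {
+    if ((unsigned long) info->si_addr < 64 * 1024) {
+        /* a null reference (plus field offset) was dereferenced */
+        dispatch_nullpointerexception(ctx);
+    } else {
+        signal(sig, SIG_DFL);   /* not ours: re-raise fatally */
+        raise(sig);
+    }
+}
+
+void install_null_check_handler(void) {
+    struct sigaction sa;
+    sa.sa_sigaction = fault_handler;
+    sa.sa_flags = SA_SIGINFO;
+    sigemptyset(&sa.sa_mask);
+    sigaction(SIGSEGV, &sa, NULL);  /* SIGBUS as well on some systems */
+}
+\end{verbatim}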
+
+
+\section{Garbage collector}
+
+
--- /dev/null
+\section{Multi threading}
+
+\subsection{Introduction}
+
+Threads are an integral part of the Java programming language. A Java
+Runtime Environment (JRE) has to implement threads to be compliant. A
+thread implementation includes the creation, execution, switching,
+killing and synchronization of threads. In Java the latter is provided
+by monitors. Monitors ensure that, for a given object, only one thread
+at a time can execute certain methods, known as synchronized methods,
+and blocks which are marked by the keyword \texttt{synchronized}.
+
+Monitors are usually implemented using mutexes (mutual exclusion locks).
+A mutex is a data structure which contains the necessary information to
+guarantee that only one unit of execution can perform a critical section
+at a time \cite{Stallings95}.
+
+As we show in section \ref{MonitorTest} a fast implementation of the
+synchronization mechanism is crucial for the efficiency of Java. One
+implementation produced more than 800\% overhead in one of our tests.
+
+
+\subsection{Related work}
+
+Java threads can be implemented using threads provided by the
+operating system kernel, as user-level libraries, or as a combination
+of the two. There exist a number of articles describing different
+thread implementation aspects but none captures the problem of
+efficient monitor operations for objects.
+
+Sun's first implementation of the JavaVM on Solaris was based on
+user-level threads. The current implementation uses a combination of
+kernel and user-level threads. Some of the advantages of this approach
+are outlined in \cite{SunThreads97}.
+
+The freely available JavaVM implementation {\tt kaffe} by Tim
+Wilkinson uses user-level threads \cite{Wilkinson:97}. Until version
+0.9, each object contained the complete mutex data structure. This
+enabled a fast monitor implementation but used a lot more memory than
+necessary.
+
+Apart from thread implementations used in JavaVM's there are many
+other thread standards and implementations, the most notable being the
+IEEE POSIX extension \cite{PosixThreads96}.
+
+In \cite{Mueller93}, Mueller describes a library implementation of POSIX
+threads on a standard UNIX system. This library is a user-level
+implementation which needs no support from the operating system. A very
+popular library of user-level primitives for implementing threads is
+{\em QuickThreads} by David Keppel, described in \cite{Keppel93}.
+
+Bershad et al. present a fast mechanism for mutual exclusion on
+uniprocessor systems \cite{Bershad+92}. They have a software solution
+for the implementation of an atomic test-and-set operation which is
+faster than the corresponding hardware instruction. However, their
+implementation relies on assistance from the operating system.
+
+
+\subsection{Implementation basics}
+
+A complete thread implementation for Java has to provide:
+
+\begin{itemize}
+
+\item thread creation and destruction,
+
+\item low level thread switching (usually implemented in assembly
+ language),
+
+\item thread scheduling,
+
+\item synchronization primitives,
+
+\item a thread safe non-blocking input/output library.
+
+\end{itemize}
+
+Cacao's current implementation of threads is based mainly on the
+threading code of {\tt kaffe} version 0.7, which has been released
+under a BSD-style license and can thus be used freely
+\cite{Wilkinson:97}. As mentioned above, {\tt kaffe}'s threads are
+completely user-level, which means, for example, that they cannot take
+advantage of a multiprocessor system.
+
+\begin{table*}
+\begin{center}
+\begin{tabular}{|l|l|c|c|c|c|c|c|} \hline
+ & & \multicolumn{3}{|c|}{mutex test}
+ & \multicolumn{3}{|c|}{tree test}\\ \hline
+ & & \multicolumn{2}{|c|}{run time in secs}&call cost
+ & \multicolumn{2}{|c|}{run time in secs}&call cost\\
+Machine & JavaVM & no sync & sync & in $\mu$s & no sync & sync & in $\mu$s \\ \hline\hline
+DEC 21064A 289MHz & OSF JDK port (1.0.2)& 1.20 & 4.14 & 9.8 & 8.37 & 34.69 & 9.8 \\ \hline
+DEC 21064A 289MHz & DEC JDK interpreter & 1.71 & 12.80 & 36.97 & 12.25 & 143.10 & 39.93 \\ \hline
+DEC 21064A 289MHz & DEC JDK JIT & 1.06 & 11.05 & 33.30 & 7.80 & 130.50 & 37.45 \\ \hline
+Pentium-S 166MHz & Linux JDK 1.1.3 & 0.96 & 1.69 & 2.43 & 7.93 & 16.12 & 2.50 \\ \hline
+AMD K6 200MHz & Sun JPP JIT (WinNT) & 0.57 & 0.94 & 1.23 & 2.8 & 9.8 & 2.13 \\ \hline
+AMD K6 200MHz & Navigator 4.04 (WinNT) & 0.03 & 0.77 & 2.46 & 2.3 & 9.2 & 2.11 \\ \hline
+%PowerPC 604e 200MHz & Linux JDK Beta & 1.16 & 2.05 & 2.96 & & & \\ \hline
+%Sun Ultra-1 & JDK 1.1.2 & 0.90 & 1.25 & 1.16 & & & \\ \hline
+%DEC 21064A 300MHz & Cacao unoptimized & 0.24 & 0.48 & 0.80 & 0.24 & 0.48 & 0.80 \\ \hline
+DEC 21064A 289MHz & Cacao & 0.28 & 0.40 & 0.40 & 1.46 & 4.71 & 0.99 \\ \hline
+\end{tabular}
+\caption{Overhead for calling synchronized methods (one object)}
+\label{SyncronizedOverhead}
+\end{center}
+\end{table*}
+
+There are several reasons why we chose this approach:
+
+\begin{itemize}
+
+\item Thread support differs from operating system to operating system.
+Not only do some operating systems provide different APIs than others,
+but even if they do provide the same interface (most often in the form
+of POSIX threads), the costs of various operations often differ widely
+across platforms.
+
+\item It was a complete implementation, tailored for the use in a
+JavaVM and thus made it possible for us to get thread support up and
+running quickly.
+
+\item Parts of Cacao are not yet thread-safe (most notably the
+compiler and the garbage collector), thus making it difficult to use a
+kernel-supported solution.
+
+\end{itemize}
+
+Each thread in a Java program corresponds to an instance of the
+\texttt{java.lang.Thread} class. In order for the Java run time
+environment (JRE) to associate platform specific information with such
+an instance, it has a private member called \texttt{PrivateInfo} of
+type \texttt{int}, which can be used by the JRE. Kaffe version 0.7
+used this member as a pointer to a context structure. Since pointers
+are 64-bit values on the Alpha, we use an array of context structures.
+\texttt{PrivateInfo} is then used as an index into this array.
+
+A fixed-size stack is allocated for each thread. The stack size for
+ordinary threads can be specified as a command-line parameter. Special
+threads (such as the garbage collector) have their own stack sizes. In
+order to catch stack overflows without the burden of checking the stack
+top at each method entry, we simply guard the stack top with one or more
+memory pages whose protection bits are modified so that any access
+causes a memory protection violation (see the sketch below). The number
+of guard pages can be specified on the command line.
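+
+A minimal sketch of such an allocation on a POSIX system; the function
+name and layout are assumptions, and a downward-growing stack is assumed:
+
+\begin{verbatim}
+#include <stddef.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+/* allocate a thread stack with inaccessible guard pages at the
+   growth end, so an overflow faults instead of corrupting memory */
+void *alloc_thread_stack(size_t stacksize, int guard_pages) {
+    size_t page  = (size_t) sysconf(_SC_PAGESIZE);
+    size_t guard = guard_pages * page;
+    char *base = mmap(NULL, guard + stacksize,
+                      PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (base == MAP_FAILED)
+        return NULL;
+    mprotect(base, guard, PROT_NONE);  /* stack grows down onto this */
+    return base + guard + stacksize;   /* initial stack pointer */
+}
+\end{verbatim}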
+
+Thread switching is implemented straightforwardly, namely by saving
+callee-save registers to the stack, switching to the new stack,
+restoring callee-save registers and performing a subroutine return.
+
+Scheduling is very simple in Cacao: higher-priority threads always get
+precedence over lower-priority threads, and threads of the same
+priority are scheduled with a round-robin algorithm.
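+
+The selection step of such a scheduler can be pictured as in the
+following sketch; the queue structure and helper names are
+hypothetical:
+
+\begin{verbatim}
+/* One FIFO run queue per priority level; a
+   preempted thread is re-enqueued at the tail
+   of its level, giving round-robin behaviour. */
+thread *nextThread (void) {
+    int p;
+
+    for (p = MAX_PRIORITY; p >= MIN_PRIORITY; p--)
+        if (runQueue[p].head != NULL)
+            return dequeue(&runQueue[p]);
+    return NULL;  /* no runnable thread */
+}
+\end{verbatim}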
+
+I/O in user-level threads is a problem since UNIX I/O calls are, by
+default, blocking: they would block not just the current thread but
+the whole process, which is undesirable. It is thus common practice for
+thread libraries to use non-blocking I/O and, in the case of an
+operation which would block, suspend the corresponding thread and
+perform a multiplexed I/O operation (\texttt{select(2)} in UNIX) on
+all such blocked files regularly, especially if there are no runnable
+threads available.
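+
+The shape of such a multiplexing step is sketched below, assuming
+hypothetical globals (\texttt{blockedReadSet}, \texttt{blockedWriteSet},
+\texttt{maxFd}) and a helper \texttt{resumeThreadsOn()} that wakes all
+threads whose descriptors have become ready:
+
+\begin{verbatim}
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+void checkBlockedThreads (void) {
+    fd_set rfds = blockedReadSet;   /* select()    */
+    fd_set wfds = blockedWriteSet;  /* modifies    */
+    struct timeval zero = { 0, 0 }; /* its fd sets */
+
+    if (select(maxFd + 1, &rfds, &wfds,
+               NULL, &zero) > 0)
+        resumeThreadsOn(&rfds, &wfds);
+}
+\end{verbatim}
+
+If no thread is runnable at all, the same call can be issued with a
+\texttt{NULL} timeout, putting the whole process to sleep until some
+descriptor becomes ready.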
+
+
+\subsection{Motivation}
+
+\label{MonitorTest}
+
+To optimize an implementation it is necessary to find the parts which
+are critical to performance. Therefore, we analysed the cost of
+monitors with two small test programs. The {\em mutex test} program
+simply invoked a method 300000 times, once with the method being
+declared \texttt{synchronized} and once without. The other test, {\em
+tree test}, allocated a balanced binary tree with 65535 elements and
+recursively traversed it 50 times using a method, again once declared
+\texttt{synchronized} and once not.
+
+Table \ref{SyncronizedOverhead} shows the differences in run time
+between the two versions of the programs for several JavaVMs. The table
+includes the run times for both versions of the programs in seconds.
+The column `call cost' gives the overhead of a call of a synchronized
+method compared to a call of a non-synchronized one. From these
+numbers it is obvious that calls to synchronized methods, or monitors
+in general, are much slower than ordinary method calls.
+
+\begin{table*}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+Application & Objects allocated & Objects with mutex & Monitor operations & Parallel Mutexes \\ \hline\hline
+javac & 111504 & 13695 & 840292 & 5 \\ \hline
+Biss & 84939 & 13357 & 1058901 & 12 \\ \hline
+Jigsaw & 215411 & 23804 & 855691 & 25 \\ \hline
+\end{tabular}
+\caption{Mutual exclusion statistics}
+\label{MutexStatistics}
+\end{center}
+\end{table*}
+
+The question that now arises is whether this has any influence on
+common Java programs. To answer this question, we have used a modified
+version of \texttt{kaffe} to gather statistics about monitor usage.
+The results are summarized in table \ref{MutexStatistics}.
+
+Javac is an invocation of Sun's \texttt{javac} on the Toba source
+files \cite{Proebsting+97} and is thus single-threaded. Execution of
+this program takes only a few seconds using Cacao with threads
+disabled. Biss is a more or less typical working session with the Java
+development environment of the Biss-AWT \cite{BissAWT97}. It is
+slightly multithreaded. Jigsaw invokes the HTTP server Jigsaw
+\cite{Jigsaw97} of the World Wide Web Consortium and lets it serve
+identical parallel requests from seven hosts, amounting to about one
+megabyte split across 200 files for each request. This application is
+highly multithreaded.
+
+The table contains the number of objects allocated during execution
+and the number of objects for which a monitor has been entered. The
+`Monitor operations' column gives the total number of operations
+performed on monitors, while the numbers in the `Parallel Mutexes'
+column show the greatest number of mutexes that were locked
+simultaneously.
+
+These numbers demonstrate that the performance of monitor operations
+is of paramount importance for a fast JavaVM implementation. We did
+not include the number of blocking monitor operations because there
+were just two of them, namely in the Biss application. It was only
+after we modified \texttt{kaffe} to switch to another thread before
+each \texttt{monitorenter} operation that the Biss and Jigsaw
+applications performed a few thousand blocking \texttt{monitorenter}s.
+
+
+\subsection{Mutex implementation}
+
+Our mutex data structure includes a pointer to the thread that has
+currently locked the mutex (\texttt{holder}). If the mutex is not
+locked at all, this is a null pointer. Because one thread can lock the
+same mutex multiple times, we need a count of how many lock operations
+without corresponding unlocks have been performed on the mutex
+(\texttt{count}). When it goes down to zero, the mutex is not locked
+by any thread. Furthermore, we need to keep track of the threads which
+have tried to lock the mutex but were blocked and are now waiting for
+it to become unlocked (\texttt{waiters}).
+
+The data structure is defined as follows:
+
+\begin{verbatim}
+struct mutex {
+    int     count;    /* lock nesting depth, 0 = unlocked  */
+    thread *holder;   /* thread holding the mutex, or NULL */
+    thread *waiters;  /* list of threads blocked on it     */
+};
+\end{verbatim}
+
+The locking of a mutex can now be specified as in
+figure~\ref{lockMutex}.
+
+\begin{figure}
+\begin{verbatim}
+void lockMutex (struct mutex *mux) {
+    disablePreemption();
+
+    if (mux->holder == NULL) {       /* mutex is free */
+        mux->holder = currentThread;
+        mux->count = 1;
+    } else if (mux->holder == currentThread) {
+        mux->count++;                /* nested lock */
+    } else {                         /* held by another thread */
+        blockUntilMutexIsNotLocked(mux);
+        mux->holder = currentThread;
+        mux->count = 1;
+    }
+
+    enablePreemption();
+}
+\end{verbatim}
+\caption{Code for \texttt{lockMutex()}}
+\label{lockMutex}
+\end{figure}
+
+The macro \texttt{disablePreemption()} simply sets a global flag
+indicating that a critical section is currently being executed and
+that preemption must not take place. \texttt{enablePreemption()}
+unsets the flag and checks whether a thread switch is necessary (see
+below). On a multiprocessor system this would need to be implemented
+using a test-and-set instruction, or some equivalent.
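+
+On a uniprocessor the two macros can be as simple as the following
+sketch; the flag names and the deferred-switch helper
+(\texttt{performThreadSwitch()}) are our own placeholders:
+
+\begin{verbatim}
+static int preemptionDisabled = 0;
+static int switchRequested = 0;
+
+#define disablePreemption() \
+    (preemptionDisabled = 1)
+
+#define enablePreemption()               \
+    do {                                 \
+        preemptionDisabled = 0;          \
+        if (switchRequested) {           \
+            switchRequested = 0;         \
+            performThreadSwitch();       \
+        }                                \
+    } while (0)
+\end{verbatim}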
+
+The inverse operation, namely the unlocking of the mutex, can be
+expressed as in figure~\ref{unlockMutex}.
+
+\begin{figure}
+\begin{verbatim}
+void unlockMutex (struct mutex *mux) {
+    disablePreemption();
+
+    --mux->count;
+    if (mux->count == 0) {          /* last nested unlock */
+        mux->holder = NULL;
+        if (mux->waiters != NULL) { /* wake first waiter */
+            thread *next = mux->waiters;
+            mux->waiters = next->next;
+            resumeThread(next);
+        }
+    }
+
+    enablePreemption();
+}
+\end{verbatim}
+\caption{Code for \texttt{unlockMutex()}}
+\label{unlockMutex}
+\end{figure}
+
+The function \texttt{resumeThread()} sets a flag which results in a
+thread switch to the given thread after preemption has been
+re-enabled.
+
+These algorithms are simple, straightforward and reasonably efficient.
+
+
+\subsection{Object-Mutex relation}
+\label{objectmutex}
+
+Since the JavaVM specification states that each object has a mutex
+associated with it, the obvious solution seems to be to include the
+mutex structure in each object. The mutex structure could be reduced
+to a length of eight bytes if we used thread numbers instead of
+pointers. But, using such a solution, the javac application would
+allocate nearly one megabyte of mutex data, just for a few seconds of
+execution. This is unacceptable.
+
+On the other hand, the figures show that it is very seldom that more
+than 20 mutexes are locked at the same time. This suggests that a
+hash table, indexed by the address of the object for which a monitor
+operation is to be performed, could be very efficient. This is exactly
+what Cacao does.
+
+Our hash function uses the $2n$ least significant bits of the
+address, where $2^n$ is the size of the hash table. This function
+can be implemented in four RISC instructions. Since we ran into
+performance problems with overflow handling by internal chaining, we
+now use external chaining with a preallocated pool of overflow entries
+managed by a free list.
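+
+One function with these properties (not necessarily the exact one
+Cacao uses) folds the second-lowest $n$ bits of the address onto the
+lowest $n$ bits with an exclusive or; the shift, the XOR, the AND and
+the final address computation map onto four RISC instructions. Here
+\texttt{hashtable} stands for the (hypothetical) array of the entry
+structures described below:
+
+\begin{verbatim}
+#define HASHBITS 9  /* table size = 2^9 = 512 */
+
+#define HASH(o) \
+    ((((unsigned long) (o) >> HASHBITS)  \
+      ^ (unsigned long) (o))             \
+     & ((1UL << HASHBITS) - 1))
+
+#define firstChainEntry(o) \
+    (&hashtable[HASH(o)])
+\end{verbatim}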
+
+An entry in the hash table has the following structure:
+
+\begin{verbatim}
+struct entry {
+    object       *obj;   /* object this mutex is assigned to */
+    struct mutex  mux;   /* the mutex itself, not a pointer  */
+    struct entry *next;  /* external chaining on collisions  */
+};
+\end{verbatim}
+
+In order to minimize the overhead of the locking/unlocking operations,
+we should strive for code sequences small enough to be inlined, yet
+powerful enough to handle the common case. We have chosen to handle
+the first entry in the collision chain slightly differently from the
+rest by not destroying the associated mutex when the count goes down
+to zero. Instead, the decision is deferred until a mutex with the
+same hash code gets locked and would thus reuse this location. The
+major benefit of this approach is that the code to lock the mutex need
+not (in the common case) check whether the location is free: each
+hash table location is free only at the beginning of the program and
+is never freed again afterwards.
+The unlocking code is simplified by the fact that the code need not
+check whether the hash table location should be freed. Based on the
+experience that a recently locked mutex is likely to be locked again
+in the near future, this technique also improves overall performance.
+
+\begin{figure}
+\begin{verbatim}
+ 1 void monitorenter (object *o)
+ 2 {
+ 3 entry *e;
+ 4 disablePreemption();
+ 5
+ 6 e = firstChainEntry(o);
+ 7 if (e->obj == o
+ 8 && e->mux.holder
+ 9 == currentThread)
+10 e->mux.count++;
+11 else
+12 lockMutexForObject(e,o);
+13
+14 enablePreemption();
+15 }
+\end{verbatim}
+\caption{Code of \texttt{monitorenter()}}
+\label{monitorenter}
+\end{figure}
+
+\begin{figure}
+\begin{verbatim}
+ 1 void monitorexit (object *o)
+ 2 {
+ 3 entry *e;
+ 4 disablePreemption();
+ 5
+ 6 e = firstChainEntry(o);
+ 7 if (e->obj == o)
+ 8 e->mux.count--;
+ 9 else
+10 unlockMutexForObject(e,o);
+11
+12 enablePreemption();
+13 }
+\end{verbatim}
+\caption{Code of \texttt{monitorexit()}}
+\label{monitorexit}
+\end{figure}
+
+
+\begin{table*}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|r|} % monitorEnter
+\hline
+Program & Line 6 & Line 10 & count 0 & Line 12 & Ratio $12/6$ \\ \hline\hline
+javac & 420147 & 405978 & 381019 & 14169 & 3.372 \% \\ \hline
+Biss & 384350 & 370171 & 425038 & 14179 & 3.689 \% \\ \hline
+Jigsaw & 426695 & 391680 & 294957 & 35015 & 8.206 \% \\ \hline
+\end{tabular}
+\caption{Execution statistics for \texttt{monitorenter()}}
+\label{monitorEnter}
+\end{center}
+\end{table*}
+
+
+\begin{figure}
+\begin{verbatim}
+ 1 void lockMutexForObject (entry *e,
+ 2 object *o) {
+ 3 disablePreemption();
+ 4
+ 5 if (e->obj != NULL)
+ 6         if (e->mux.count == 0)
+ 7 claimEntry(e,o);
+ 8 else
+ 9 while (e->obj != o) {
+10 if (e->next == NULL) {
+11 e = e->next = allocEntry(o);
+12 break;
+13 }
+14 e = e->next;
+15 }
+16 else
+17 e->obj = o;
+18
+19 lockMutex(&e->mux);
+20
+21 enablePreemption();
+22 }
+\end{verbatim}
+\caption{Code for \texttt{lockMutexForObject()}}
+\label{lockMutexForObject}
+\end{figure}
+
+\begin{figure}
+\begin{verbatim}
+ 1 void unlockMutexForObject (entry *e,
+ 2 object *o) {
+ 3 disablePreemption();
+ 4
+ 5 if (e->obj == o)
+ 6 unlockMutex(&e->mux);
+ 7 else {
+ 8 /* Assuming entry is there */
+ 9 while (e->next->obj != o)
+10 e = e->next;
+11 unlockMutex(&e->next->mux);
+12 if (e->next->mux.count == 0)
+13 e->next = freeEntry(e->next);
+14 }
+15
+16 enablePreemption();
+17 }
+\end{verbatim}
+\caption{Code for \texttt{unlockMutexForObject()}}
+\label{unlockMutexForObject}
+\end{figure}
+
+See figures~\ref{monitorenter} and~\ref{monitorexit} for the code of
+the corresponding JavaVM instructions. These functions handle (as we
+show below) the most common case and depend on the two functions for
+the general case presented in figures~\ref{lockMutexForObject}
+and~\ref{unlockMutexForObject} (we now assume that
+\texttt{enablePreemption()}/\texttt{disablePreemption()} pairs can be
+nested).
+
+Even if they are not short enough to be inlined in every synchronized
+method, these functions are certainly small enough to make the effort
+of coding them as specially tailored assembly-language functions
+worthwhile. This bypasses the standard subroutine linkage conventions,
+gaining a little extra speed.
+
+\subsection{Results}
+
+\begin{table}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} % monitorExit
+\hline
+Program & Line 6 & Line 8 & Line 10 & Ratio $10/6$ \\ \hline\hline
+javac & 420146 & 419815 & 331 & 0.078 \% \\ \hline
+Biss & 384368 & 383281 & 1087 & 0.282 \% \\ \hline
+Jigsaw & 428997 & 416890 & 12107 & 2.822 \% \\ \hline
+\end{tabular}
+\caption{Execution statistics for \texttt{monitorexit()}}
+\label{monitorExit}
+\end{center}
+\end{table}
+
+To demonstrate that nearly all cases are indeed handled by these small
+routines, we have written a small application which simulates the
+locking and unlocking operations of the three applications we used
+above (tables~\ref{monitorEnter} and \ref{monitorExit}). As can be
+seen, only a small percentage of cases need to be handled in the
+general routines \texttt{lockMutexForObject()} and
+\texttt{unlockMutexForObject()}.
+
+The column `count 0' gives the number of mutex locks where the {\tt count}
+of the mutex before locking is zero. We present this number to evaluate an
+implementation where {\tt monitorenter()} checks the {\tt count} instead of
+the {\tt holder} to determine if the mutex can be locked immediately. The
+numbers seem to favour checking {\tt holder} but at least indicate
+that the difference is negligible. Furthermore, locking the mutex after
+only having checked {\tt count} needs an additional assignment (setting
+{\tt holder}).
+
+\begin{table*}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+ & \multicolumn{4}{|c|}{miss rates by size of cache} \\
+Application & 1 Element & 2 Elements & 4 Elements & 8 Elements \\
+\hline\hline
+javac & 15.076 \% & 9.757 \% & 4.931 \% & 3.193 \% \\ \hline
+Biss & 13.488 \% & 8.349 \% & 4.765 \% & 3.141 \% \\ \hline
+Jigsaw & 43.694 \% & 37.700 \% & 22.680 \% & 5.182 \% \\ \hline
+\end{tabular}
+\caption{Results of cache simulation}
+\label{CacheResults}
+\end{center}
+\end{table*}
+
+We have also considered the possibility of using a cache of recently
+used mutexes to improve performance, similar to the
+translation-lookaside buffer of a microprocessor, which caches the
+mapping between virtual and physical memory pages. To evaluate whether
+this would be worthwhile, we have simulated caches with one, two, four
+and eight elements using the three applications as test candidates. We
+have used least-recently-used as the cache replacement strategy.
+Though this is not easily implemented in software, it provides a good
+estimate of the best hit rate that can be achieved with an efficient
+implementation. Table~\ref{CacheResults} summarizes the results.
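+
+The core of such a simulation is a few lines of code; the following
+sketch (all names ours) keeps the cache as a small array in
+least-recently-used order and counts the misses:
+
+\begin{verbatim}
+#define CACHESIZE 4    /* simulated cache size */
+
+static void *cache[CACHESIZE]; /* [0] = newest */
+static long hits, misses;
+
+void reference (void *obj) {
+    int i;
+
+    for (i = 0; i < CACHESIZE; i++)
+        if (cache[i] == obj) {  /* hit: to front */
+            hits++;
+            for (; i > 0; i--)
+                cache[i] = cache[i - 1];
+            cache[0] = obj;
+            return;
+        }
+    misses++;        /* miss: evict LRU entry */
+    for (i = CACHESIZE - 1; i > 0; i--)
+        cache[i] = cache[i - 1];
+    cache[0] = obj;
+}
+\end{verbatim}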
+
+\begin{table*}
+\begin{center}
+\begin{tabular}[b]{|l|c|c|c|c|c|}
+\hline
+ & JavaLex & javac & espresso & Toba & java\_cup \\ \hline
+run time without threads (secs) & 2.82 & 4.91 & 3.23 & 4.32 & 1.35 \\ \hline
+run time with threads (secs) & 3.89 & 5.40 & 3.23 & 5.53 & 1.56 \\ \hline
+overhead (optimized impl.) & 37 \% & 10 \% & 0 \% & 28 \% & 15 \% \\ \hline
+number of lock/unlock pairs & 1818889 & 420146 & 2837 & 1558370 & 124956 \\ \hline
+\end{tabular}
+\caption{Overhead of monitor operations}
+\label{CACAOMutexCost}
+\end{center}
+\end{table*}
+
+Using an implementation with the monitor routines discussed in
+section~\ref{objectmutex} and an implementation without thread
+support, we have run several applications on a 300MHz DEC 21064A (see
+table~\ref{CACAOMutexCost}). For these single threaded applications,
+the overhead introduced by monitor operations ranges from 0\% to 37\%,
+depending on the number of monitor operations in the applications.
+Note, however, that this cannot be compared to the overhead figures
+given in table \ref{SyncronizedOverhead}, since these applications do
+more than just entering and exiting a monitor.
+
+Using the implementation described, the {\em mutex test} application
+of table \ref{SyncronizedOverhead} took 0.40 seconds to complete with a
+synchronized method and 0.28 seconds with an ordinary one. In
+this program the time spent on locking/unlocking a mutex is 0.40
+microseconds.
+the {\em tree test} is that this test violates the locality of monitor
+operations. Overall, these numbers compare very favorably with the
+other implementations.
+
+For most single-threaded applications, the monitor overhead can be
+eliminated completely. If it is possible to determine statically that
+the dynamic class-loader and the \texttt{java.lang.Thread} class are
+not used, synchronization code need not be generated.
+
+
+\subsection{Thin locks}
+
+
--- /dev/null
+\section{Verification}
--- /dev/null
+\section{X86 code generator}