From: cacao Date: Mon, 27 Oct 2003 17:34:01 +0000 (+0000) Subject: framework for handbook added X-Git-Url: http://wien.tomnetworks.com/gitweb/?p=cacao.git;a=commitdiff_plain;h=3f796a984a3b8387184a56b639fa76798f7c1242 framework for handbook added --- diff --git a/doc/handbook/alpha.tex b/doc/handbook/alpha.tex new file mode 100644 index 000000000..d6d0fc8b5 --- /dev/null +++ b/doc/handbook/alpha.tex @@ -0,0 +1 @@ +\section{Alpha code generator} diff --git a/doc/handbook/cacao.tex b/doc/handbook/cacao.tex new file mode 100644 index 000000000..f5206af8e --- /dev/null +++ b/doc/handbook/cacao.tex @@ -0,0 +1,68 @@ +\documentclass[a4paper,twoside]{book} + +%\pagestyle{myheadings} \footheight 12pt \footskip 72pt + +%\pagestyle{empty} + +\pagestyle{headings} +\setlength{\textwidth}{16cm} +\setlength{\textheight}{23cm} +\setlength{\topmargin}{0mm} +\setlength{\oddsidemargin}{0cm} +\setlength{\evensidemargin}{0cm} + + +\parskip\medskipamount + +\newenvironment{mylist}[1]{ + \begin{list}{} + {\settowidth{\labelwidth}{#1} + \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep} + \itemsep0ex \parsep0ex + \renewcommand{\makelabel}[1]{##1\hfill} + } + }{\end{list}} + +\title{The CACAO Java Virtual Machine} + +\author{Andreas Krall et al.} + + +\date{} + + + +\begin{document} + + +\maketitle + +\tableofcontents + +\include{intro} + +\include{overview} + +\include{loader} + +\include{runtime} + \include{threads} + \include{native} + \include{reflection} + +\include{jit} + \include{verification} + \include{loopopt} + \include{inlining} + \include{alpha} + \include{mips} + \include{powerpc} + \include{x86} + +\include{library} + +%\include{} +\bibliography{java} +\bibliographystyle{alpha} +\end{document} + diff --git a/doc/handbook/inlining.tex b/doc/handbook/inlining.tex new file mode 100644 index 000000000..af7931016 --- /dev/null +++ b/doc/handbook/inlining.tex @@ -0,0 +1 @@ +\section{Method inlining} diff --git a/doc/handbook/intro.tex b/doc/handbook/intro.tex new file mode 100644 index 000000000..8bb03c10e --- /dev/null +++ b/doc/handbook/intro.tex @@ -0,0 +1,6 @@ +\chapter{Introduction} + +\section{Introduction} + +\section{History} + diff --git a/doc/handbook/java.bib b/doc/handbook/java.bib new file mode 100644 index 000000000..4542255c1 --- /dev/null +++ b/doc/handbook/java.bib @@ -0,0 +1,1535 @@ +Ishizaki+00@proceedings{ISMM98, + key = "ISMM", + booktitle = {Proceedings of the First International Symposium on + Memory Management}, + title = {Proceedings of the First International Symposium on + Memory Management}, + editor = {Richard Jones}, + address = {Vancouver}, + series = SIGPLAN, + volume = "34(3)", + publisher = ACM, + month = oct, + year = 1998, + ISBN = {1-58113-114-3}, + note = {ISMM is the successor to the IWMM series of + workshops} +} + +@INPROCEEDINGS{Chan:Dadoo:Santhanam:usenix:summer:1990, + AUTHOR = {Paul Chan and Manoj Dadoo and Vatsa Santhanam}, + TITLE = {Evolution of the {U-code} Compiler Intermediate + Language at {Hewlett-Packard}}, + BOOKTITLE = {Proceedings of the 1990 Summer USENIX Conference}, + MONTH = {June}, + PAGES = {199--210}, + YEAR = 1990 +} + +@INPROCEEDINGS{Taba+98, + AUTHOR = {Ali-Reza Adl-Tabatabai and Michal Ciernak and + Guei-Yuan Lueh and Vishesh M. Parikh and James + M. 
Stichnoth}, + TITLE = {Fast, Effective Code Generation in a Just-In-Time + {Java} Compiler}, + BOOKTITLE = {Conference on Programming Language Design and + Implementation}, + ORGANIZATION = {ACM}, + SERIES = {SIGPLAN}, + VOLUME = {33(5)}, + PAGES = {280--290}, + ADDRESS = {Montreal}, + YEAR = 1998 +} + +@INPROCEEDINGS{Bacon+98, + AUTHOR = {David F. Bacon and Ravi Konuru and Chet Mruthy and + Mauricio Serrano}, + TITLE = {Thin Locks: Featherweight Synchronization for {Java}}, + BOOKTITLE = {Conference on Programming Language Design and + Implementation}, + ORGANIZATION = {ACM}, + SERIES = {SIGPLAN}, + VOLUME = {33(5)}, + PAGES = {258--268}, + ADDRESS = {Montreal}, + YEAR = 1998 +} + +@INPROCEEDINGS{Cameron+92, + AUTHOR = {Don Cameron and Paul Faust and Dmitry Lenkov and + Michey Mehta}, + TITLE = {A Portable Implementation of {C++} Exception + Handling}, + BOOKTITLE = {C++ Technical Conference}, + ORGANIZATION = {USENIX}, + PAGES = {225--243}, + MONTH = {August}, + YEAR = 1992 +} + +@ARTICLE{Chase94, + AUTHOR = {David Chase}, + TITLE = {Implementation of Exception Handling}, + JOURNAL = {Journal of C Language Translation}, + VOLUME = 5, + NUMBER = 4, + PAGES = {??--??}, + MONTH = {June}, + YEAR = 1994 +} + +@INPROCEEDINGS{Giering+94, + AUTHOR = {E. W. Giering and Frank Mueller and T. P. Baker}, + TITLE = {Features of the {Gnu} {Ada} Runtime Library}, + BOOKTITLE = {TRI-Ada '94}, + ORGANIZATION = {ACM}, + PAGES = {93--103}, + YEAR = 1994 +} + +@INPROCEEDINGS{Gunter+95, + AUTHOR = {Carl A. Gunter and Didier Remy and Jon G. Riecke}, + TITLE = {A Generalization of Exceptions and Control in + {ML}-like languages}, + BOOKTITLE = {Seventh International Conference on Functional + Programming Languages and Computer Architecture}, + ORGANIZATION = {ACM}, + PAGES = {12--23}, + YEAR = 1995 +} + +@ARTICLE{KoenigStroustrup90, + AUTHOR = {Andrew Koenig and Bjarne Stroustrup}, + TITLE = {Exception Handling for {C++}}, + JOURNAL = {Journal of Object Oriented Programming}, + VOLUME = 3, + NUMBER = 2, + PAGES = {16--33}, + MONTH = {July/August}, + YEAR = 1990 +} + +@BOOK{dragon86, + AUTHOR = {Alfred V. Aho and Ravi Sethi and Jeffrey D. Ullman}, + TITLE = {Compilers: Principles, Techniques, and Tools}, + PUBLISHER = {Addison-Wesley}, + YEAR = 1986 +} + + +@BOOK{appel98, + AUTHOR = {Appel, A.W.}, + TITLE = {Modern Compiler Implementation in Java}, + PUBLISHER = {Cambridge University Press}, + YEAR = 1998 +} + + +@BOOK{Stallings95, + AUTHOR = {William Stallings}, + TITLE = {Operating Systems}, + PUBLISHER = {Prentice Hall}, + YEAR = 1995 +} + +@INPROCEEDINGS{Bershad+92, + AUTHOR = {Brian N. Bershad and David D. Redell and John + R. Ellis}, + TITLE = {Fast mutual exclusion for uniprocessors}, + BOOKTITLE = {Annual Symposium on Architectural Support for + Programming Languages and Operating Systems}, + ORGANIZATION = {ACM}, + PAGES = {223--233}, + MONTH = {October}, + YEAR = 1992 +} + +@INPROCEEDINGS{Mueller93, + AUTHOR = {Frank Mueller}, + TITLE = {A Library Implementation of {POSIX} Threads under + {UNIX}}, + BOOKTITLE = {Winter USENIX}, + PAGES = {29--41}, + ADDRESS = {San Diego}, + MONTH = {January}, + YEAR = 1993 +} + +@TECHREPORT{Keppel93, + AUTHOR = {David Keppel}, + TITLE = {Tools and Techniques for Building Fast Portable + Threads Packages}, + NUMBER = {UWCSE 93-05-06}, + INSTITUTION = {University of Washington}, + YEAR = 1993 +} + +@MISC{BissAWT97, + AUTHOR = {Peter Mehlitz}, + TITLE = {Biss {AWT}}, + HOWPUBLISHED = {{\tt http://www. 
biss-net.com/biss-awt.html}}, + YEAR = 1997 +} + +@MISC{Jigsaw97, + KEY = {Jigsaw97}, + TITLE = {Jigsaw}, + HOWPUBLISHED = {{\tt http://www.w3.org/Jigsaw/}}, + YEAR = 1997 +} + +@MISC{PosixThreads96, + KEY = {POSIX}, + TITLE = {Standard for Threads Interface to {POSIX}}, + HOWPUBLISHED = {IEEE, P1003.1c}, + YEAR = 1996 +} + +@MISC{SunThreads97, + KEY = {JavaThreads}, + TITLE = {Java Threads Whitepaper}, + HOWPUBLISHED = {{\tt http://java. sun.com/}}, + INSTITUTION = {SUN}, + YEAR = 1997 +} + +@INPROCEEDINGS{AltmanEbcioglu97, + AUTHOR = {Erik Altman and Kemal Ebcio\u{g}lu}, + TITLE = {{DAISY}: Dynamic Compilation for 100\% Architectural + Compatibility}, + BOOKTITLE = {ISCA'97 - The 24th Annual International Symposium on + Computer Architecture}, + ORGANIZATION = {ACM, IEEE}, + YEAR = 1997 +} + +@BOOK{javaref96, + AUTHOR = {Ken Arnold and James Gosling}, + TITLE = {The {Java} Programming Language}, + PUBLISHER = {Addison-Wesley}, + YEAR = 1996 +} + +@INPROCEEDINGS{bacon&sweeny96-, + author = {David F. Bacon and Peter F. Sweeney}, + title = {Fast Static Analysis of {C++} Virtual Function + Calls}, + booktitle = {Conference on Object-Oriented Programming Systems, + Languages \& Applications (OOPSLA'96)}, + ORGANIZATION = {ACM}, + pages = {324--341}, + year = 1996 +} + +@ARTICLE{picoJava97, + AUTHOR = {J. Michael O'Connor and Marc Tremblay}, + TITLE = {{picoJava-I}: The {Java} Virtual Machine in + Hardware}, + JOURNAL = {IEEE Micro}, + VOLUME = 17, + NUMBER = 2, + PAGES = {45--53}, + MONTH = {April}, + YEAR = 1997 +} + +@ARTICLE{SunJIT97, + AUTHOR = {Timothy Cramer and Richard Friedman and Terrence + Miller and David Seberger and Robert Wilson and + Mario Wolczko}, + TITLE = {Compiling {Java} Just in Time}, + JOURNAL = {IEEE Micro}, + VOLUME = 17, + NUMBER = 3, + PAGES = {36--43}, + MONTH = {June}, + YEAR = 1997 +} + +@ARTICLE{ssa91, + AUTHOR = {Ron Cytron and Jeanne Ferrante and Barry K. Rosen + and Mark N. Wegman and F. Kenneth Zadeck}, + TITLE = {Efficiently computing Static Single Assignment Form + and the Control Flow Graph}, + JOURNAL = {ACM Transactions on Programming Languages and + Systems}, + VOLUME = 13, + NUMBER = 4, + PAGES = {451--490}, + MONTH = {October}, + YEAR = 1991 +} + +@INPROCEEDINGS{diwan+96-, + AUTHOR = {Amer Diwan and J. Eliot B. Moss and Kathryn + S. McKinley}, + TITLE = {Simple and Effective Analysis of Statically-Typed + Object-Oriented Programs}, + BOOKTITLE = {Conference on Object-Oriented Programming Systems, + Languages \& Applications (OOPSLA '96)}, + ORGANIZATION = {ACM}, + PAGES = {292--305}, + YEAR = 1996 +} + +@INPROCEEDINGS{EbciogluAltman97, + AUTHOR = {Kemal Ebcio\u{g}lu and Erik Altman and Erdem + Hokenek}, + TITLE = {A {Java} {ILP} Machine based on fast Dynamic + Compilation}, + BOOKTITLE = {MASCOTS'97 - International Workshop on Security and + Efficiency Aspects of {Java}}, + YEAR = 1997 +} + +@INPROCEEDINGS{Ertl92-, + AUTHOR = {M. Anton Ertl}, + TITLE = {A New Approach to {Forth} native Code Generation}, + BOOKTITLE = {EuroForth '92}, + ADDRESS = {Southampton}, + YEAR = 1992 +} + +@PHDTHESIS{Ertl96phd, + AUTHOR = {M. Anton Ertl}, + TITLE = {Implementation of Stack-Based Languages on Register + Machines}, + SCHOOL = {Technische Universit\"at Wien}, + MONTH = {April}, + YEAR = 1996 +} + +@INPROCEEDINGS{ErtlMaierhofer95, + AUTHOR = {M. Anton Ertl and Martin Maierhofer}, + TITLE = {Translating {Forth} to native {C}}, + BOOKTITLE = {EuroForth '95}, + ADDRES = {Dagstuhl}, + YEAR = 1995 +} + +@INPROCEEDINGS{Ertl&Pirker97, + AUTHOR = {M. 
Anton Ertl and Christian Pirker}, + TITLE = {The Structure of a {Forth} Native Code Compiler}, + BOOKTITLE = {EuroForth '97 Conference Proceedings}, + ADDRES = {Oxford}, + PAGES = {107--116}, + YEAR = 1997 +} + +} +@PHDTHESIS{Franz94, + AUTHOR = {Michael Franz}, + TITLE = {Code Generation On the Fly: A Key for Portable + Software}, + SCHOOL = {ETH Z\"urich}, + YEAR = 1994 +} + +@BOOK{smalltalk80, + AUTHOR = {Adele Goldberg and David Robson}, + TITLE = {SMALLTALK-80, The Language and its Implementation}, + PUBLISHER = {Addison-Wesley}, + YEAR = 1983 +} + +@INPROCEEDINGS{Gough97, + AUTHOR = {K. John Gough}, + TITLE = {Multi-language, Multi-target Compiler Development: + Evolution of the {Gardens Point} Compiler Project}, + BOOKTITLE = {JMLC'97 --Joint Modular Languages Conference}, + EDITOR = {Hanspeter M\"ossenb\"ock}, + PUBLISHER = {LNCS 1204}, + ADDRESS = {Linz}, + YEAR = 1997 +} + +@MASTERSTHESIS{Grafl97, + AUTHOR = {Reinhard Grafl}, + TITLE = {{CACAO}: {Ein 64Bit JavaVM Just-in-Time Compiler}}, + SCHOOL = {Technische Universit\"at Wien}, + MONTH = {January}, + YEAR = 1997 +} + +@BOOK{hennessy&patterson90, + AUTHOR = {John L. Hennessy and David A. Patterson}, + TITLE = {Computer Architecture. A Quantitative Approach}, + PUBLISHER = {Morgan Kaufman Publishers}, + YEAR = 1990 +} + +@ARTICLE{Hsieh+97, + AUTHOR = {Cheng-Hsueh A. Hsieh and Maria T. Conte and Teresa + L. Johnson and John C. Gyllenhaal and {Wen-mei} + W. Hwu}, + TITLE = {Optimizing NET Compilers for Improved {Java} + Performance}, + JOURNAL = {IEEE Computer}, + VOLUME = 30, + NUMBER = 6, + PAGES = {67--75}, + MONTH = {June}, + YEAR = 1997 +} + +@INPROCEEDINGS{HsiehHwu97, + AUTHOR = {Cheng-Hsueh A. Hsieh and John C. Gyllenhaal and + {Wen-mei} W. Hwu}, + TITLE = {Java Bytecode to Native Code Translation: The + {Caffeine} Prototype and Preliminary Results}, + BOOKTITLE = {29th Annual IEEE/ACM International Symposium on + Microarchitecture (MICRO'29)}, + ADDRES = {Paris}, + YEAR = 1996 +} + +@ARTICLE{Holmer96, + AUTHOR = {Bruce K. Holmer and Barton Sano and Michael Carlton + and Peter van Roy and Alvin M. Despain}, + TITLE = {Design and Analysis of Hardware for High-Performance + {Prolog}}, + JOURNAL = {Journal of Logic Programming}, + PUBLISHER = {Elsevier}, + VOLUME = {29(1-3)}, + PAGES = {107--139}, + YEAR = 1996 +} + +@BOOK{garbagebook, + AUTHOR = {Richard Jones and Rafael Lins}, + TITLE = {Garbage Collection}, + PUBLISHER = {John Wiley \& Sons}, + YEAR = 1996 +} + +@INPROCEEDINGS{Kistler97, + AUTHOR = {Thomas Kistler}, + TITLE = {Dynamic Runtime Optimization}, + BOOKTITLE = {JMLC'97 - Joint Modular Languages Conference}, + EDITOR = {Hanspeter M\"ossenb\"ock}, + PUBLISHER = {LNCS 1204}, + ADDRESS = {Linz}, + YEAR = 1997 +} + +@INPROCEEDINGS{KrBe95a, + AUTHOR = {Andreas Krall and Thomas Berger}, + TITLE = {Incremental Global Compilation of {Prolog} with the + {Vienna Abstract Machine}}, + BOOKTITLE = {Twelfth International Conference on Logic + Programming}, + EDITOR = {Leon Sterling}, + PUBLISHER = {MIT Press}, + PAGES = {333--347}, + ADDRESS = {Tokyo}, + YEAR = 1995 +} + +@INPROCEEDINGS{KrGr97a, + AUTHOR = {Andreas Krall and Reinhard Grafl}, + TITLE = {{CACAO} -- A 64 bit {JavaVM} Just-in-Time Compiler}, + BOOKTITLE = {Workshop on Java for Science and Engineering + Computation}, + EDITOR = {Geoffry C. 
Fox and Wei Li}, + ORGANIZATION = {ACM}, + MONTH = {June}, + PAGES = {}, + ADDRESS = {Las Vegas}, + YEAR = 1997 +} + +@ARTICLE{KrGr97b, + AUTHOR = {Andreas Krall and Reinhard Grafl}, + TITLE = {{CACAO} -- A 64 bit {JavaVM} Just-in-Time Compiler}, + JOURNAL = {Concurrency: Practice and Experience}, + PUBLISHER = {John Wiley \& Sons}, + VOLUME = {9}, + NUMBER = {11}, + PAGES = {1017--1030}, + YEAR = 1997 +} + +@INPROCEEDINGS{KrErGsch98, + AUTHOR = {Andreas Krall and Anton Ertl and Michael Gschwind}, + TITLE = {{JavaVM} Implementations: Compilers versus Hardware}, + BOOKTITLE = {Australian Computer Architecture Conference + ({ACAC'98})}, + EDITOR = {John Morris}, + VOLUME = {20(4)}, + SERIES = {Australian Computer Science Communications}, + PUBLISHER = {Springer}, + PAGES = {101--110}, + ADDRESS = {Perth}, + YEAR = 1998 +} + +@INPROCEEDINGS{KrPr98a, + AUTHOR = {Andreas Krall and Mark Probst}, + TITLE = {Monitors and Exceptions: How to implement {Java} + efficiently}, + BOOKTITLE = {ACM 1998 Workshop on Java for High-Performance + Computing}, + EDITOR = {Siamak Hassanzadeh and Klaus Schauser}, + ORGANIZATION = {ACM}, + MONTH = {March}, + PAGES = {15--24}, + ADDRESS = {Palo Alto}, + YEAR = 1998 +} + +@ARTICLE{KrPr98b, + AUTHOR = {Andreas Krall and Mark Probst}, + TITLE = {Monitors and Exceptions: How to implement {Java} + efficiently}, + JOURNAL = {Concurrency: Practice and Experience}, + PUBLISHER = {John Wiley \& Sons}, + VOLUME = {10}, + NUMBER = {11--13}, + PAGES = {837--850}, + YEAR = 1998 +} + +@INPROCEEDINGS{Kr98b, + AUTHOR = {Andreas Krall}, + TITLE = {Efficient {JavaVM} Just-in-Time Compilation}, + BOOKTITLE = {International Conference on Parallel Architectures + and Compilation Techniques}, + EDITOR = {Jean-Luc Gaudiot}, + ORGANIZATION = {IFIP,ACM,IEEE}, + PUBLISHER = {North-Holland}, + PAGES = {205--212}, + ADDRESS = {Paris}, + MONTH = {October}, + YEAR = 1998 +} + +@INPROCEEDINGS{GrErKr01b, + AUTHOR = {David Gregg and M.~Anton Ertl and Andreas Krall}, + TITLE = {Implementing an Efficient {Java} Interpreter}, + BOOKTITLE = {HPCN'01, Java in High Performance Computing}, + EDITOR = {Vladimir Getov and George K. Thiruvathukal}, + SERIES = {LNCS 2110}, + PUBLISHER = {Springer}, + PAGES = {613--620}, + ADDRESS = {Amsterdam}, + MONTH = {June}, + YEAR = 2001} + +@BOOK{javavm96, + AUTHOR = {Tim Lindholm and Frank Yellin}, + TITLE = {The {Java} Virtual Machine Specification}, + PUBLISHER = {Addison-Wesley}, + YEAR = 1996 +} + +@INPROCEEDINGS{Muller+97, + AUTHOR = {Gilles Muller and Barbara Moura and Fabrice Bellard + and Charles Consel}, + TITLE = {Harrisa: a Flexible and Efficient {Java} Environment + Mixing Bytecode and Compiled Code}, + BOOKTITLE = {COOTS'97 - third Conference on Object-Oriented + Technologies and Systems}, + YEAR = 1997 +} + +@PHDTHESIS{Lilith?, + AUTHOR = {Richard Stanley Ohran}, + TITLE = {Lilith: a workstation computer for Modula-2}, + SCHOOL = {ETH Z\"urich}, + YEAR = 1984 +} + +@INPROCEEDINGS{PaJoSm97, + AUTHOR = {Subbaro Palacharia and Norman P. Jouppi and + J.E. Smith}, + TITLE = {Complexity-Effective Superscalar Processors}, + BOOKTITLE = {ISCA'97 - The 24th Annual International Symposium on + Computer Architecture}, + ORGANIZATION = {ACM, IEEE}, + YEAR = 1997 +} + +@BOOK{Pcode82, + AUTHOR = {Steven Pemberton and Martin C. Daniels}, + TITLE = {Pascal Implementation, The P4 Compiler}, + PUBLISHER = {Ellis Horwood}, + YEAR = 1982 +} + +@BOOK{AlphaRef95, + AUTHOR = {R.L. Sites, R.T. 
Witek}, + TITLE = {Alpha AXP Architecture Reference Manual}, + PUBLISHER = {Digital Press}, + YEAR = 1995 +} + +@INPROCEEDINGS{UNCOL61, + AUTHOR = {T. B. Steel}, + TITLE = {A first version of {UNCOL}}, + BOOKTITLE = {Proceedings of the Western Joint IRE-AIEE-ACM + Computer Conference}, + YEAR = 1961, + PAGES = {371 -- 377} +} + +@ARTICLE{AmsterdamKit83, + AUTHOR = {Andrew S. Tanenbaum and Hans van {Sta\-ve\-ren} and + E. G. Keizer and Johan W. Stevenson}, + TITLE = {A Practical Tool Kit for making Portable Compilers}, + JOURNAL = {Communications of the ACM}, + VOLUME = 16, + NUMBER = 9, + PAGES = {654--660}, + MONTH = {September}, + YEAR = 1983 +} + +@ARTICLE{AmsterdamKit89, + AUTHOR = "A. S. Tanenbaum and M. F. Kaashoek and + K. G. Langendoen and C. J. H. Jacobs", + TITLE = "The design of very fast portable compilers", + JOURNAL = "ACM SIG{\-}PLAN Notices", + VOLUME = 24, + NUMBER = 11, + PAGES = "125--131", + MONTH = nov, + YEAR = 1989 +} + +@INPROCEEDINGS{Self, + AUTHOR = {David Ungar and Randall B. Smith}, + TITLE = {{SELF}: The Power of Simplicity}, + BOOKTITLE = {OOPSLA '87 Proceedings}, + YEAR = {1987}, + PAGES = {227--242} +} + +@INPROCEEDINGS{VitekHorspool96, + AUTHOR = {Jan Vitek and R. Nigel Horspool}, + TITLE = {Compact Dispatch Tables for Dynamically Typed Object + Oriented Languages}, + BOOKTITLE = {6th International Conference CC '96}, + YEAR = 1996, + PAGES = {307--325} +} + +@INPROCEEDINGS{KrViHo97, + AUTHOR = {Andreas Krall and Jan Vitek and Nigel Horspool}, + TITLE = {Near Optimal Hierarchical Encoding of Types}, + BOOKTITLE = {11th European Conference on Object Oriented Programming ({ECOOP'97})}, + EDITOR = {Mehmet Aksit and Satoshi Matsuoka}, + SERIES = {LNCS 1241}, + PUBLISHER = {Springer}, + PAGES = {128--145}, + ADDRESS = {Finland}, + YEAR = 1997} + +@INPROCEEDINGS{ViHoKr97, + AUTHOR = {Jan Vitek and Nigel Horspool and Andreas Krall}, + TITLE = {Efficient Type Inclusion Tests}, + BOOKTITLE = {Conference on Object Oriented Programming Systems, + Languages \& Applications ({OOPSLA '97})}, + EDITOR = {Toby Bloom}, + ORGANIZATION = {ACM}, + MONTH = {October}, + PAGES = {142--157}, + ADDRESS = {Atlanta}, + YEAR = 1997 +} + +@INPROCEEDINGS{GehmRandScal00, + AUTHOR = {Sanjay Gehmawat and Keith H. Randall and Daniel + J. Scales}, + TITLE = {Field Analysis: Getting Useful and Low-cost + Interprocedural Information}, + BOOKTITLE = {Conference on Programming Language Design and + Implementation}, + ORGANIZATION = {ACM}, + SERIES = {SIGPLAN}, + VOLUME = {35(5)}, + PAGES = {334--344}, + ADDRESS = {Vancouver}, + YEAR = 2000 +} + +@INPROCEEDINGS{BodikGuptaSarkar00, + AUTHOR = {Rastislav Bodik and Rajiv Gupta and Vivek Sarkar}, + TITLE = {{ABCD}: Eliminating Array Bound Checks on Demand}, + BOOKTITLE = {Conference on Programming Language Design and + Implementation}, + ORGANIZATION = {ACM}, + SERIES = {SIGPLAN}, + VOLUME = {35(5)}, + PAGES = {321--333}, + ADDRESS = {Vancouver}, + YEAR = 2000 +} + +@INPROCEEDINGS{Ruf00, + AUTHOR = {Erik Ruf}, + TITLE = {Effective Synchronization Removal for {Java}}, + BOOKTITLE = {Conference on Programming Language Design and + Implementation}, + ORGANIZATION = {ACM}, + SERIES = {SIGPLAN}, + VOLUME = {35(5)}, + PAGES = {208--218}, + ADDRESS = {Vancouver}, + YEAR = 2000 +} + +@ARTICLE{Marmot00, + AUTHOR = "Robert Fitzgerald and Todd B. 
Knoblock and Erik Ruf and + Bjarne Steensgaard and David Tarditi", + TITLE = "Marmot: an optimizing compiler for {Java}", + JOURNAL = "Software -- Practice and Experience", + VOLUME = 30, + NUMBER = 3, + PAGES = "199--232", + MONTH = nov, + YEAR = 2000 +} + +@Misc{Wilkinson:97, + OPTkey = {}, + author = {Tim Wilkinson}, + title = {{KAFFE}: A free virtual machine to run {Java} code}, + howpublished = {{\tt http: //www.kaffe.org}}, + year = {1997}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} + +@TechReport{Proebsting+97, + author = {Todd A. Proebsting and Gregg Townsend and Patrick + Bridges and John H. Hartman and Tim Newsham and + Scott A. Watterson}, + title = {Toba: {Java} for Applications}, + institution = {University of Arizona}, + year = {1997}, + OPTkey = {}, + OPTtype = {}, + OPTnumber = {}, + address = {Tucson, AZ}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} + +@BOOK{GCBook96, + AUTHOR = {Richard Jones and Rafael Lins}, + TITLE = {Garbage Collection}, + PUBLISHER = {John Wiley}, + YEAR = 1996 +} + +@INCOLLECTION{garbagesurvey, + AUTHOR = {Paul R. Wilson}, + TITLE = {Uniprocessor Garbage Collection Techniques}, + BOOKTITLE = {ACM Computing Surveys}, + PUBLISHER = {ACM}, + PAGES = {to apear}, + YEAR = {1994} +} + +@INPROCEEDINGS{Age+98, + AUTHOR = {Ole Agesen and David Detlefs and J. Eliot B. Moss}, + TITLE = {Garbage Collection and Local Variable Type-Precision + and Liveness in {Java} Virtual Machines}, + BOOKTITLE = {Conference on Programming Language Design and + Implementation}, + ORGANIZATION = {ACM}, + SERIES = {SIGPLAN}, + VOLUME = {33(6)}, + PAGES = {269--279}, + ADDRESS = {Montreal}, + YEAR = 1998 +} + +@INPROCEEDINGS{Johnstone98, + AUTHOR = {Mark S. Johnstone and Paul R. Wilson}, + TITLE = {The Memory Fragmentation Problem: Solved?}, + BOOKTITLE = {1998 International Symposium on Memory Management}, + ORGANIZATION = {ACM}, + PAGES = {}, + ADDRESS = {Vancouver}, + YEAR = 1998 +} + +@INPROCEEDINGS{Hicks+98, + AUTHOR = {Michael Hicks and Luke Hornof and Jonathan T. Moore + and Scott Nettles}, + TITLE = {A Study of Large Object Spaces}, + BOOKTITLE = {1998 International Symposium on Memory Management}, + ORGANIZATION = {ACM}, + PAGES = {138--145}, + ADDRESS = {Vancouver}, + YEAR = 1998 +} + +@INPROCEEDINGS{Colnet+98, + AUTHOR = {Dominique Colnet and Philippe Coucaud and Olivier + Zendra}, + TITLE = {Compiler Support to Customize the Mark and Sweep + Algorithm}, + BOOKTITLE = {1998 International Symposium on Memory Management}, + ORGANIZATION = {ACM}, + PAGES = {154--165}, + ADDRESS = {Vancouver}, + YEAR = 1998 +} + +@ARTICLE{Boehm88, + AUTHOR = {Hans-Juergen Boehm and Mark Weiser}, + TITLE = {Garbage Collection in an Uncooperative Environment}, + JOURNAL = {Software Practice and Experience}, + PUBLISHER = {John Wiley \& Sons}, + VOLUME = {18}, + NUMBER = {9}, + PAGES = {807--820}, + YEAR = 1988 +} + +@InProceedings{HudsonMoss:1992, + author = {Richard L. Hudson and J. Eliot B. 
+                  Moss},
+  title = 	 {Incremental Collection of Mature Objects},
+  booktitle = 	 {Proceedings of the International Workshop on Memory
+                  Management},
+  pages =	 {388--403},
+  month =	 {September},
+  year =	 {1992}
+}
+
+@TECHREPORT{Petit-Bianco99,
+  AUTHOR =      {Alexandre Petit-Bianco},
+  TITLE =       {No Silver Bullet - Garbage Collection for {Java} in
+                 Embedded Systems},
+  NUMBER =      {http://sourceware.cygnus.com/java/},
+  INSTITUTION = {Cygnus Solutions},
+  MONTH =       {August},
+  YEAR =        1998
+}
+
+@TECHREPORT{Ritzau99,
+  AUTHOR =      {Tobias Ritzau},
+  TITLE =       {No Silver Bullet - Garbage Collection for {Java} in
+                 Embedded Systems},
+  NUMBER =      {thesis proposal, http://www.ida.liu.se/~tobri/},
+  INSTITUTION = {Link\"oping University},
+  MONTH =       {August},
+  YEAR =        1998
+}
+
+@inproceedings{lim98,
+  author =      {Tian F. Lim and Przemyslaw Pardyak and Brian
+                 N. Bershad},
+  title =       {A Memory-Efficient Real-Time Non-Copying Garbage
+                 Collector},
+  pages =       {118--129},
+  crossref =    {ISMM98},
+  URL =         "http://www.cs.washington.edu/homes/tian/ISMM98/",
+  abstract =    {Garbage collectors used in operating systems such as
+                 SPIN and embedded systems such as Java and Inferno
+                 must operate with limited resources and minimize
+                 their impact on application
+                 performance. Consequently, they must maintain short
+                 real-time pauses, low overhead, and a small memory
+                 footprint. Most garbage collectors are not adequate
+                 because they are either not real-time or they
+                 require larger heaps because they trade space for
+                 time. For example, Treadmill uses segregated free
+                 lists to allocate and reclaim memory in constant
+                 time but at the cost of under-utilizing memory. We
+                 have implemented a new version of the Treadmill
+                 collector and used it in the SPIN extensible
+                 operating system kernel. We chose Treadmill for its
+                 excellent real-time latencies and low overhead. We
+                 improve its poor memory utilization by using
+                 real-time page-level management techniques that
+                 reduce the fragmentation caused by segregated free
+                 lists. We use page-wise collection to locate pages
+                 of free memory, which are dynamically reassigned
+                 between free lists as needed. We use virtual memory
+                 to dynamically remap free pages into continuous
+                 ranges. Our experiments demonstrate that we have
+                 substantially improved memory utilization without
+                 compromising latency or overhead, and that the new
+                 collector performs very well for SPIN's workloads.}
+}
+
+@Article{IBM:Moreira:2000,
+  author =      {J. E. Moreira and S. P. Midkiff and M. Gupta and P. V. Artigas and M. Snir and R. D. Lawrence},
+  title =       {Java programming for high-performance numerical computing},
+  journal =     {IBM Systems Journal},
+  year =        {2000},
+  OPTkey =      {},
+  volume =      {39},
+  number =      {1},
+  pages =       {21--56},
+  OPTmonth =    {},
+  OPTnote =     {},
+  OPTannote =   {}
+}
+
+@Article{IBM:Suganuma:2000,
+  author =      {T. Suganuma and T. Ogasawara and M. Takeuchi and T. Yasue and M. Kawahito and K. Ishizaki and H. Komatsu and T. Nakatani},
+  title =       {Overview of the {IBM} {Java} Just-in-Time Compiler},
+  journal =     {IBM Systems Journal},
+  year =        {2000},
+  OPTkey =      {},
+  volume =      {39},
+  number =      {1},
+  pages =       {175--193},
+  OPTmonth =    {},
+  OPTnote =     {},
+  OPTannote =   {}
+}
+
+@TechReport{Compaq:Scales:2000,
+  author =      {Daniel J. Scales and Keith H.
Randall and Sanjay Ghemawat and Jeff Dean}, + title = {The {Swift} Compiler: Design and Implementation}, + institution = {Compaq Western Research Laboratory}, + year = {2000}, + OPTkey = {}, + OPTtype = {}, + number = {2000/2}, + OPTaddress = {}, + month = {April}, + OPTnote = {}, + OPTannote = {} +} + +@Manual{NaturalBridge, + title = {{BulletTrainTM} optimizing compiler and runtime for {JVM} bytecode}, + OPTkey = {}, + OPTauthor = {}, + organization = {NaturalBridge}, + OPTaddress = {}, + OPTedition = {}, + OPTmonth = {}, + OPTyear = {}, + note = {{\tt http://www.naturalbridge.com}}, + OPTannote = {} +} + + +@Manual{TowerJ, + title = {{TowerJ 3.0: A New Generation Native Java Compiler And Runtime Environment}}, + OPTkey = {}, + OPTauthor = {}, + organization = {Tower Technologies}, + OPTaddress = {}, + OPTedition = {}, + OPTmonth = {}, + OPTyear = {}, + note = {{\tt http://www.towerj.com}}, + OPTannote = {} +} + +@InProceedings{SGI:Smith:2000, + author = {Todd Smith and Suresh Srinivas and Philipp Tomsich and Jinpyo Park}, + title = {Practical Experiences with {Java} Compilation}, + booktitle = {Proceedings of the Intl. Conf. on High-Performance Computing}, + OPTcrossref = {}, + OPTkey = {}, + OPTpages = {}, + year = {2000}, + OPTeditor = {}, + volume = {1970}, + OPTnumber = {}, + series = {Lecture Notes in Computer Science}, + OPTaddress = {}, + month = {December}, + OPTorganization = {}, + publisher = {Springer}, + OPTnote = {}, + OPTannote = {} +} + +@INPROCEEDINGS{DetlefsAgesen, + AUTHOR = "David Detlefs and Ole Agesen", + TITLE = "{The Case for Multiple Compilers}", + BOOKTITLE = "Proc. OOPSLA 1999 VM Workshop on Simplicity, + Performance and Portability in Virtual Machine + Design", + YEAR = 1997, +} + +@InProceedings{Krall_Tomsich_1999a, + author = {Andreas Krall and Philipp Tomsich}, + title = {Garbage Collection for Large Memory {Java} Applications}, + booktitle = {Proceedings of the 7th European Conference on High-Performance Computing and Networking (HPCN Europe'99)}, + pages = {895--907}, + year = {1999}, + OPTeditor = {}, + volume = {1593}, + series = {Lecture Notes in Computer Science}, + month = {Apr}, + publisher = {Springer Verlag}, +} + +@INPROCEEDINGS{Sable00, + AUTHOR = {Patrice Pominville and Feng Quian and Raja Vallee-Rai + and Laurie Hendren and Clark Verbrugge}, + TITLE = {A Framework for Optimizing {Java} Using Attrributes}, + BOOKTITLE = {CASCON}, + ORGANIZATION = {IBM}, + ADDRESS = {Mississauga}, + YEAR = 2000 +} + +@InProceedings{Hoelzle+91, + author = "Urs H{\"o}lzle and Craig Chambers and David Ungar", + title = "Optimizing Dynamically-Typed Object-Oriented + Programming Languages with Polymorphic Inline Caches", + pages = "21--38", + booktitle = "Proceedings of the European Conference on Object-Oriented + Programming (ECOOP'91)", + volume = {512}, + series = {Lecture Notes in Computer Science}, + month = jul, + publisher = "Springer Verlag", + address = "Geneva", + year = "1991", + annote = "Abstract: Polymorphic inline caches (PICs) provide a new way to reduce the + overhead of polymorphic message sends by extending inline caches to include more + than one cached lookup result per call site. For a set of typical object-oriented + SELF programs, PICs achieve a median speedup of 11%. + As an important side effect, PICs collect type information by recording all of the + receiver types actually used at a given call site. The compiler can exploit this + type information to generate better code when recompiling a method. 
An + experimental version of such a system achieves a median speedup of 27% for our set + of SELF programs, reducing the number of non-inlined message sends by a factor of + two. + Implementations of dynamically-typed object-oriented languages have been limited + by the paucity of type information available to the compiler. The abundance of the + type information provided by PICs suggests a new compilation approach for these + languages, adaptive compilation. Such compilers may succeed in generating very + efficient code for the time-critical parts of a program without incurring + distracting compilation pauses." + +} + +@InProceedings{HoelzleUngar94, + author = "Urs H{\"o}lzle and David Ungar", + title = "Optimizing Dynamically-Dispatched Calls with Run-Time + Type Feedback", + pages = "326--336", + ISBN = "0-89791-662-X", + booktitle = "Proceedings of the Conference on Programming Language + Design and Implementation (PLDI'94)", + month = jun, + publisher = "ACM Press", + address = "New York, NY, USA", + year = "1994", +} + +@InProceedings{DriesenHoelzle96, + author = "Karel Driesen and Urs H{\"o}lzle", + title = "The Direct Cost of Virtual Function Calls in {C++}", + booktitle = "Conference on Object-Oriented Programming Systems, + Languages \& Applications ({OOPSLA} '96)", + year = "1996", + pages = "306--323", + annote = "Studies the cost of virtual function calls on modern + processors, taking into account the effects of + out-of-order execution and caches. On their baseline + architecture, the standard implementation of virtual + function calls take 1\%--10\% of the instructions and + 2\%--29\% of the cycles. The thunk implementation is + slightly faster, for most benchmarks, and much faster + for a few. The relative cost of virtual function calls + will increase slightly in the future. The influences of + architectural variations like branch misprediction + penalties, branch prediction accuracy, issue widths, + and load latency. The cost per dispatch is 2--5 cycles + for most benchmarks (10 cycles for one benchmark) on + their baseline architecture (4-wide processor, 2 cycle + load latency, 4 cycle branch latency).", +} + +@InProceedings{Oxhoj+92, + author = "Nicholas Oxh{\o}j and Jens Palsberg and Michael~I. + Schwartzbach", + editor = "O. Lehrmann~Madsen", + title = "{Making Type Inference Practical}", + booktitle = "Proceedings of the \mbox{ECOOP}~'92 European + Conference on Object-oriented Programming", + series = "LNCS 615", + pages = "329--349", + publisher = "Springer-Verlag", + address = "Utrecht, The Netherlands", + month = jul, + year = "1992", +} + +@Article{IC::Palsberg1995, + refkey = "C1680; PN2520", + title = "Efficient Inference of Object Types", + author = "Jens Palsberg", + pages = "198--209", + journal = "Information and Computation", + month = dec, + year = "1995", + volume = "123", + number = "2", + abstract = "Abadi and Cardelli have recently investigated a + calculus of objects (Abadi and Cardelli, 1994). The + calculus supports a key feature of object-oriented + languages: an object can be emulated by another object + that has more refined methods. Abadi and Cardelli + presented four first-order type systems for the + calculus. The simplest one is based on finite types and + no subtyping, and the most powerful one has both + recursive types and subtyping. Open until now is the + question of type inference, and in the presence of + subtyping ``the absence of minimum typings poses + practical problems for type inference'' (Abadi and + Cardelli, 1994). 
\par In this paper we give an $O(n^3)$ + algorithm for each of the four type inference problems + and we prove that all the problems are P-complete. We + also indicate how to modify the algorithms to handle + functions and records.", + } + +@InProceedings{TipPalsberg00, + author = "Frank Tip and Jens Palsberg", + title = "Scalable propagation-based call graph construction + algorithms.", + pages = "281--293", + booktitle = "Proceedings of the Conference on Object-Oriented + Programming, Systems, Languages and Application + ({OOPSLA}-00)", + month = oct, + series = "ACM Sigplan Notices", + volume = "35(10)", + publisher = "ACM Press", + address = "N. Y.", + year = "2000", +} + + +@InProceedings{Dean+95, + author = "Jeffrey Dean and David Grove and Craig Chambers", + title = "Optimization of Object-Oriented Programs Using Static + Class Hierarchy Analysis", + booktitle = "Proceedings of the 9th European Conference on Object-Oriented + Programming (ECOOP'95)", + volume = {952}, + series = {Lecture Notes in Computer Science}, + publisher = "Springer Verlag", + pages = "77--101", + year = "1995", +} + +@InProceedings{BaconSweeney96, + author = "David F. Bacon and Peter F. Sweeney", + title = "Fast Static Analysis of {C}++ Virtual Function Calls", + booktitle = "Conference on Object-Oriented Programming Systems, + Languages \& Applications ({OOPSLA} '96)", + year = "1996", + pages = "324--341", + publisher = "ACM Press", + annote = "An empirical study of the effectiveness of + \emph{unique name} analysis, \emph{class hierarchy + analysis} and \emph{rapid type analysis}. Rapid type + analysis is an analysis that takes into account which + classes are actually instantiated. It is very fast and + detects a a significant percentage of monomorphic calls + for some benchmarks (up to 100\%).", +} + +@phdthesis{Bacon97, + author = "David Francis Bacon", + title = "Fast and Effective Optimization of Statically Typed + Object-Oriented Languages", + school = "University of California, Berkeley", + year = "1997", + abstract = "In this dissertation, we show how a relatively simple + and extremely fast interprocedural optimization + algorithm can be used to optimize many of the expensive + features of statically typed, object-oriented languages + -- in particular, C++ and Java. We present a new + optimization algorithm, Rapid Type Analysis, and show + that it is fast both in theory and in practice, and + significantly out-performs other {"}fast{"} algorithms + for virtual function call resolution. We present + optimization algorithms for the resolution of virtual + function calls, conversion of virtual inheritance to + direct inheritance, elimination of dynamic casts and + dynamic type checks, and removal of object + synchronization. These algorithms are all presented + within a common framework that allows them to be driven + by the information collected by Rapid Type Analysis, or + by some other type analysis algorithm. Collectively, + the optimizations in this dissertation free the + programmer from having to sacrifice modularity and + extensibility for performance. 
Instead, the programmer + can freely make use of the most powerful features of + object-oriented programming, since the optimizer will + remove unnecessary extensibility from the program.", +} + +@Article{Shivers88, + author = "Olin Shivers", + title = "Control-Flow Analysis in {Scheme}", + pages = "164--174", + journal = "ACM SIGPLAN Notices", + year = "1988", + month = jul, + volume = "23", + number = "7", + note = "Proceedings of the ACM SIGPLAN 1988 Conference on + Programming Language Design and Implementation.", +} + + +@PhdThesis{Shivers91, + author = "Olin Shivers", + title = "Control-Flow Analysis of Higher-Order Languages", + school = "Carnegie-Mellon University", + year = "1991", + month = may, +} + + +@InProceedings{Grove+97, + author = "David Grove and Greg DeFouw and + Jeffrey Dean and Craig Chambers", + title = "Call Graph Construction in Object-Oriented Languages", + pages = "108--124", + booktitle = "Proceedings of the {ACM} {SIGPLAN} Conference on + Object-Oriented Programming Systems, Languages and + Applications ({OOPSLA}-97)", + month = oct, + series = "ACM SIGPLAN Notices", + volume = "32, 10", + publisher = "ACM Press", + address = "New York", + year = "1997", +} + +@InProceedings{DeFouw:1998:FIC, + author = "Greg DeFouw and David Grove and Craig Chambers", + title = "Fast interprocedural class analysis", + editor = "ACM", + booktitle = "Conference record of {POPL} '98: the 25th {ACM} + {SIGPLAN-SIGACT} Symposium on Principles of Programming + Languages: papers presented at the Symposium, San + Diego, California, 19--21 January 1998", + publisher = "ACM Press", + address = "New York", + year = "1998", + pages = "222--236", + year = "1998", + url = "http://www.acm.org:80/pubs/citations/proceedings/plan/268946/p222-defouw/", +} + +@Article{Collin:1997:TIL, + author = "S. Collin and D. Colnet and O. Zendra", + title = "Type Inference for Late Binding: The {SmallEiffel} + Compiler", + journal = "Lecture Notes in Computer Science", + volume = "1204", + pages = "67--??", + year = "1997", + coden = "LNCSD9", + ISSN = "0302-9743", + bibdate = "Fri Aug 22 11:59:49 MDT 1997", + acknowledgement = ack-nhfb, +} + + +@InProceedings{Zendra+97, + author = "Olivier Zendra and Dominique Colnet and Suzanne + Collin", + title = "Efficient Dynamic Dispatch without Virtual Function + Tables: The Small {Eiffel} Compiler", + pages = "125--141", + ISSN = "0362-1340", + booktitle = "Proceedings of the {ACM} {SIGPLAN} Conference on + Object-Oriented Programming Systems, Languages and + Applications ({OOPSLA}-97)", + month = oct # "~5--9", + series = "ACM SIGPLAN Notices", + volume = "32, 10", + publisher = "ACM Press", + address = "New York", + year = "1997", +} + +@InProceedings{Sundaresan+00, + author = "Vijay Sundaresan and Laurie J. Hendren and Chrislain + Razafimahefa and Raja Vall{\'e}e-Rai and Patrick Lam + and Etienne Gagnon and Charles Godin", + title = "Practical virtual method call resolution for Java.", + pages = "264--280", + booktitle = "Proceedings of the Conference on Object-Oriented + Programming, Systems, Languages and Application + ({OOPSLA}-00)", + month = oct # "~15--19", + series = "ACM Sigplan Notices", + volume = "35.10", + publisher = "ACM Press", + address = "N. Y.", + year = "2000", +} + +@InProceedings{Myers95, + author = "Andrew C. 
Myers", + title = "Bidirectional Object Layout for Separate Compilation", + booktitle = "{OOPSLA}~'95 Conference Proceedings: Object-Oriented + Programming Systems, Languages, and Applications", + year = "1995", + publisher = "ACM Press", + pages = "124--139", + url = "http://www.acm.org/pubs/articles/proceedings/oops/217838/p124-myers/p124-myers.pdf", +} + +@InProceedings{Stroustrup89, + author = "Bjarne Stroustrup", + title = "Multiple Inheritance for {C++}", + editor = "{USENIX} Association", + booktitle = "Computing Systems, Fall, 1989.", + publisher = "USENIX Association", + address = "Berkeley, CA, USA", + month = "Fall", + year = "1989", + volume = "2(4)", + pages = "367--395", +} + +@InProceedings{Rose88, + author = "John R. Rose", + title = "Fast Dispatch Mechanisms for Stock Hardware", + editor = "Norman Meyrowitz", + booktitle = "{OOPSLA}'88: Object-Oriented Programming Systems, + Languages and Applications: Conference Proceedings", + year = "1988", + pages = "27--35", +} + +@TechReport{Liskov+95, + author = "Barbara Liskov and Dorothy Curtis and Mark Day and + Sanjay Ghemawat and Robert Gruber and Paul Johnson and + Andrew C. Myers", + title = "Theta Reference Manual", + institution = "Massachusetts Institute of Technology, Laboratory for + Computer Science", + month = feb, + year = "1995", + number = "Programming Methodology Group Memo 88", + url = "http://clef.lcs.mit.edu/Thor-papers.html", +} + + +@InProceedings{Ishizaki+00, + author = "Kazuaki Ishizaki and Motohiro Kawahito and Toshiaki + Yasue and Hideaki Komatsu and Toshio Nakatani", + title = "A study of devirtualization techniques for a {JavaTM} + Just-In-Time compiler.", + pages = "294--310", + booktitle = "Proceedings of the Conference on Object-Oriented + Programming, Systems, Languages and Application + ({OOPSLA}-00)", + month = oct # "~15--19", + series = "ACM Sigplan Notices", + volume = "35.10", + publisher = "ACM Press", + address = "N. Y.", + year = "2000", +} + + +@INPROCEEDINGS{Alpern+01, + AUTHOR = {Bowen Alpern and Anthony Cocchi and David Grove and Derek Lieber}, + TITLE = {Efficient Dispatch of {Java} Interface Methods}, + BOOKTITLE = {HPCN'01, Java in High Performance Computing}, + EDITOR = {Vladimir Getov and George K. Thiruvathukal}, + SERIES = {LNCS 2110}, + PUBLISHER = {Springer}, + PAGES = {621--628}, + ADDRESS = {Amsterdam}, + MONTH = {June}, + YEAR = 2001 +} + + +@BOOK{AbramskyHankin87, + AUTHOR = {Samson Abramsky and Chris Hankin}, + TITLE = {Abstract Interpretation of Declarative Languages}, + PUBLISHER = {Ellis Horwood}, + YEAR = 1987 +} + +@INPROCEEDINGS{Blanchet99, + AUTHOR = {Bruno Blanchet}, + TITLE = {Escape Analysis for Object Oriented Languages: + Application to Java}, + BOOKTITLE = {Proceedings of OOPSLA'99}, + ORGANIZATION = {ACM}, + ADDRESS = {Denver}, + PAGES = {20--34}, + MONTH = {November}, + YEAR = 1999 +} + +@INPROCEEDINGS{Choi99, + AUTHOR = {J.-G. Choi and M. Gupta and M. Serrano and + V.C. Sreedhar and S. Midkiff}, + TITLE = {Escape Analysis for Java}, + BOOKTITLE = {Proceedings of OOPSLA'99}, + ORGANIZATION = {ACM}, + ADDRESS = {Denver}, + PAGES = {1-19}, + MONTH = {November}, + YEAR = 1999 +} + +@ARTICLE{AitKaci89, + AUTHOR = {H. A\"{i}t-Kaci and R. Boyer and P. Lincoln and R. 
+Nasr}, + TITLE = {Efficient implementation of lattice operations}, + JOURNAL = {Transactions on Programming Languages and Systems}, + ORGANIZATION = {ACM}, + VOLUME = 11, + NUMBER = 1, + PAGES = {115-146}, + MONTH = {November}, + YEAR = 1989 +} + +@INPROCEEDINGS{Caseau93, + AUTHOR = {Yves Caseau}, + TITLE = {Efficient handling of multiple inheritance hierarchies}, + BOOKTITLE = {Proceedings of OOPSLA'93}, + ORGANIZATION = {ACM}, + SERIES = {SIGPLAN}, + VOLUME = {28(10)}, + PAGES = {271--287}, + YEAR = 1993 +} + + +@ARTICLE{Cohen91, + AUTHOR = {N.J. Cohen}, + TITLE = {Type-extension type tests can be performed in constant time}, + JOURNAL = {Transactions on Programming Languages and Systems}, + ORGANIZATION = {ACM}, + VOLUME = 13, + NUMBER = 4, + PAGES = {626--629}, + MONTH = {April}, + YEAR = 1991 +} + + +@ARTICLE{Dencker84, + AUTHOR = {P. Dencker and K. D\"{u}rre and J. Heuft}, + TITLE = {Optimization of Parser Tables for Portable Compilers}, + JOURNAL = {Transactions on Programming Languages and Systems}, + ORGANIZATION = {ACM}, + VOLUME = 6, + NUMBER = 4, + PAGES = {546--572}, + MONTH = {April}, + YEAR = 1984 +} + + +@ARTICLE{KraViHo97, + AUTHOR = {A. Krall and J. Vitek and R.N. Horspool}, + TITLE = {Near optimal hierarchical encoding of types}, + BOOKTITLE = {European Conference on Object-Oriented Programming + ({ECOOP'97})}, + PAGES = {128-145}, + PUBLISHER = {LNCS 1241}, + ADDRESS = {Jyv\"{a}skyl\"{a}}, + YEAR = 1997 +} + + +@ARTICLE{VitekHorspool94, + AUTHOR = {J. Vitek and R.N. Horspool}, + TITLE = {Taming Message Passing: Efficient Method Look-Up for + Dynamically Typed Languages}, + BOOKTITLE = {European Conference on Object-Oriented Programming + ({ECOOP'94})}, + PAGES = {432--449}, + PUBLISHER = {LNCS 821}, + ADDRESS = {Bologna}, + YEAR = 1994 +} + +@InProceedings{Agesen95, + title = "The {Cartesian} Product Algorithm: Simple and Precise + Type Inference of Parametric Polymorphism", + author = "Ole Agesen", + pages = "2--26", + editor = "Walter G. Olthoff", + booktitle = "{ECOOP}'95---Object-Oriented Programming, 9th European + Conference", + address = "{\AA}arhus, Denmark", + month = "7--11~" # aug, + year = "1995", + series = "Lecture Notes in Computer Science", + volume = "952", + publisher = "Springer", +} + +@InProceedings{Jalapeno99, + author = "Bowen Alpern and Dick Attanasio and Anthony Cocchi and + Derek Lieber and Stephen Smith and Ton Ngo and John J. + Barton", + title = "Implementing Jalapeno in Java", + pages = "314--324", + editor = "Loren Meissner", + booktitle = "Proceeings of the 1999 {ACM} {SIGPLAN} Conference on + Object-Oriented Programming, Systems, Languages {\&} + Applications ({OOPSLA}`99)", + month = nov # "~1--5", + series = "ACM Sigplan Notices", + volume = "34.10", + publisher = "ACM Press", + address = "N. Y.", + year = "1999", +} + + +@Article{Jalapeno00, + author = "B. Alpern and C. R. Attanasio and J. J. Barton and M. + G. Burke and P. Cheng and J.-D. Choi and A. Cocchi and + S. J. Fink and D. Grove and M. Hind and S. F. Hummel + and D. Lieber and V. Litvinov and M. F. Mergen and T. + Ngo and J. R. Russell and V. Sarkar and M. J. Serrano + and J. C. Shepherd and S. E. Smith and V. C. Sreedhar + and H. Srinivasan and J. 
+                 Whaley",
+  title =        "The {Jalape{\~n}o} virtual machine",
+  journal =      "IBM Systems Journal",
+  volume =       "39",
+  number =       "1",
+  pages =        "211--238",
+  month =        "????",
+  year =         "2000",
+  coden =        "IBMSA7",
+  ISSN =         "0018-8670",
+  bibdate =      "Mon Apr 24 15:43:02 MDT 2000",
+  url =          "http://www.almaden.ibm.com/journal/sj/391/alpern.html",
+  acknowledgement = ack-nhfb,
+  keywords =     "Java",
+}
+
+@InProceedings{HotSpot01,
+  author =       "Michael Paleczny and Christopher Vick and Cliff Click",
+  title =        "The {Java} {HotSpot} Server Compiler",
+  pages =        "1--12",
+  booktitle =    "Proceedings of the {Java\texttrademark} Virtual Machine
+                  Research and Technology Symposium ({JVM}-01)",
+  month =        apr # "~23--24",
+  publisher =    "USENIX Association",
+  address =      "Berkeley, USA",
+  year =         "2001",
+}
+
diff --git a/doc/handbook/jit.tex b/doc/handbook/jit.tex
new file mode 100644
index 000000000..e65136df5
--- /dev/null
+++ b/doc/handbook/jit.tex
@@ -0,0 +1,531 @@
+\chapter{The Just-In-Time Compiler}
+
+\section{Introduction}
+
+\section{The Java Virtual Machine}
+
+\label{chapjvm}
+
+The JavaVM is a typed stack architecture \cite{javavm96}. There are
+different instructions for integer, long integer, floating point and
+address types. Byte and character types have only special memory access
+instructions and are treated as integers for arithmetic operations. The
+main instruction set consists of arithmetic/logical and load/store/constant
+instructions. There are special instructions for array access and for
+accessing the fields of objects (memory access), for method invocation,
+and for type checking. A JavaVM has to check the program for type
+correctness and execute only correct programs. The following examples show
+some important JavaVM instructions and how a Java program is represented by
+these instructions.
+
+\begin{tabular}{ll}
+{\tt iload n}&  load contents of local variable n \\
+{\tt istore n}& store stack top in local variable n \\
+{\tt imul }&    product of 2 topmost stack elements \\
+{\tt isub }&    difference of 2 topmost stack elements \\
+\end{tabular}
+
+The Java assignment statement {\tt a = b - c * d} is translated into
+
+\begin{tabular}{ll}
+{\tt iload b }& load contents of variable b \\
+{\tt iload c }& load contents of variable c \\
+{\tt iload d }& load contents of variable d \\
+{\tt imul }&    compute c * d \\
+{\tt isub }&    compute b - (c * d) \\
+{\tt istore a}& store stack top in variable a \\
+\end{tabular}
+
+Accessing the fields of objects is handled by the instructions {\tt
+getfield} and {\tt putfield}. The instruction {\tt getfield} expects an
+object reference on the stack and has an index into the constant pool as an
+operand. The index into the constant pool must be a reference to a pair
+containing the class name and a field name. The types of the classes
+referenced by the constant pool index and by the object reference must be
+compatible, a fact which is usually checked statically at load time. The
+object reference has to be different from the {\tt null} pointer, a fact
+which must usually be checked at run time.
+
+Array access is handled by the {\tt aload} and {\tt astore} instructions.
+Separate versions of these instructions exist for each of the basic types
+({\tt byte}, {\tt int}, {\tt float}, {\tt ref}, etc.). The {\tt aload}
+instruction expects a reference to an array and an index (of type {\tt
+int}) on the stack. The array reference must not be the {\tt null} pointer.
+The index must be greater than or equal to zero and less than the array
+length.
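+
+The run-time checks implied by such an array access can be summarized in
+C. The following is purely an illustrative sketch: the structure layout
+and the names of the exception-raising functions are invented for this
+example and are not CACAO's actual interfaces.
+
+\begin{verbatim}
+/* Illustrative sketch of the checks implied by an int-array
+   load; all names are invented for this example.            */
+typedef struct java_intarray {
+    int length;                    /* array length            */
+    int data[1];                   /* variable-sized payload  */
+} java_intarray;
+
+/* exception-raising helpers; they do not return */
+extern void throw_nullpointerexception(void);
+extern void throw_arrayindexoutofboundsexception(void);
+
+int iaload_checked(java_intarray *a, int i)
+{
+    if (a == NULL)                 /* null check              */
+        throw_nullpointerexception();
+    if (i < 0 || i >= a->length)   /* bounds check            */
+        throw_arrayindexoutofboundsexception();
+    return a->data[i];             /* the actual load         */
+}
+\end{verbatim}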
+
+There are special instructions for method invocation. Each method has its
+own virtual stack and an area for local variables. After the method
+invocation, the stack of the caller is empty and the arguments are copied
+into the first local variables of the called method. After execution of a
+{\tt return} instruction, the called method returns to its caller. If the
+called method is a function, it pops the return value from its own stack
+and pushes it onto the stack of the caller.
+
+The {\tt instanceof} and {\tt checkcast} instructions are used for subtype
+testing. Both expect a reference to an object on the stack and have an
+index into the constant pool as an operand. The index must reference a
+class, array or interface type. The two instructions differ in their result
+and in their behavior if the object reference is {\tt null}.
+
+
+\section{Translation of stack code to register code}
+
+\label{chaptranslation}
+
+The architecture of a RISC processor is completely different from the stack
+architecture of the JavaVM. RISC processors have large sets of registers.
+(The Alpha has 32 integer registers and 32 floating point registers which
+are both 64 bits wide.) They execute arithmetic and logic operations only
+on values which are held in registers. Load and store instructions are
+provided to move data between memory and registers. Local variables of
+methods usually reside in registers and are saved in memory only during a
+method call or if there are too few registers. The use of registers for
+parameter passing is defined by calling conventions.
+
+\subsection{Machine code translation examples}
+
+The previous example expression {\tt a = b - c * d} would be translated
+by an optimizing C compiler to the following two Alpha instructions (the
+variables {\tt a}, {\tt b}, {\tt c} and {\tt d} reside in registers):
+
+\begin{quote}
+\begin{tabular}{ll}
+{\tt MULL c,d,tmp0 }&{\tt ; tmp0 = c * d }\\
+{\tt SUBL b,tmp0,a }&{\tt ; a = b - tmp0 }\\
+\end{tabular}
+\end{quote}
+
+If JavaVM code is translated to machine code, the stack is eliminated and
+the stack slots are represented by temporary variables usually residing in
+registers. A naive translation of the previous example would result in the
+following Alpha instructions:
+
+\begin{quote}
+\begin{tabular}{ll}
+{\tt MOVE b,t0    }&{\tt ; iload b } \\
+{\tt MOVE c,t1    }&{\tt ; iload c } \\
+{\tt MOVE d,t2    }&{\tt ; iload d } \\
+{\tt MULL t1,t2,t1}&{\tt ; imul    } \\
+{\tt SUBL t0,t1,t0}&{\tt ; isub    } \\
+{\tt MOVE t0,a    }&{\tt ; istore a} \\
+\end{tabular}
+\end{quote}
+
+The main problems in translating JavaVM code to machine code are the
+elimination of the unnecessary copy instructions and finding an efficient
+register allocation algorithm. A common but expensive technique is to do
+the naive translation and use an additional pass for copy elimination and
+coalescing.
+
+
+\section{The translation algorithm}
+
+The new translation algorithm can get by with three passes. The first pass
+determines basic blocks and builds a representation of the JavaVM
+instructions which is faster to decode. The second pass analyses the stack
+and generates a static stack structure. During stack analysis variable
+dependencies are tracked and register requirements are computed. In the
+final pass, register allocation of temporary registers is combined with
+machine code generation.
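+
+In outline, the compilation of a single method then looks as follows.
+This is only a sketch of the pass structure just described; the function
+and type names are illustrative, not CACAO's actual ones:
+
+\begin{verbatim}
+/* Sketch of the three-pass translation; names illustrative. */
+typedef struct methodinfo methodinfo;
+
+extern void parse(methodinfo *m);         /* pass 1: determine
+                                             basic blocks, build
+                                             instruction array   */
+extern void analyse_stack(methodinfo *m); /* pass 2: build static
+                                             stack structure and
+                                             compute register
+                                             requirements        */
+extern void generate_code(methodinfo *m); /* pass 3: allocate
+                                             temporary registers
+                                             and emit machine
+                                             code                */
+
+void compile_method(methodinfo *m)
+{
+    parse(m);
+    analyse_stack(m);
+    generate_code(m);
+}
+\end{verbatim}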
+
+The new compiler computes the exact number of objects needed, or an upper
+bound, and allocates the memory for the necessary temporary data
+structures in three big blocks (the basic block array, the instruction
+array and the stack array). Eliminating all the doubly linked lists also
+reduced the memory requirements by a factor of five.
+
+
+\subsection{Basic block determination}
+
+The first pass scans the JavaVM instructions, determines the basic blocks
+and generates an array of instructions which has fixed size and is easier
+to decode in the following passes. Each instruction contains the opcode,
+two operands and a pointer to the static stack structure after the
+instruction (see the next sections). The different opcodes of JavaVM
+instructions which fold operands into the opcode are represented by just
+one opcode in the instruction array.
+
+
+\subsection{Basic block interfacing convention}
+
+The handling of control flow joins was quite complicated in the old
+compiler. We therefore introduced a fixed interface at basic block
+boundaries. Every stack slot at a basic block boundary is assigned a fixed
+interface register. The stack analysis pass determines the type of the
+register and whether it has to be saved across method invocations. To
+enlarge the size of basic blocks, method invocations do not end basic
+blocks. To guide our compiler design we did some static analysis on a large
+application written in Java: the javac compiler and the libraries it uses.
+Table \ref{tab-1} shows that in more than 93\% of the cases the stack is
+empty at basic block boundaries and that the maximal stack depth is 6.
+Using this data it becomes clear that the old join handling did not improve
+the quality of the machine code.
+
+\begin{table*}
+\begin{center}
+\begin{tabular}[b]{|l|c|c|c|c|c|c|c|c|}
+\hline
+stack depth & 0    & 1   & 2   & 3   & 4  & 5 & 6 & $>$6 \\ \hline
+occurrences & 7930 & 258 & 136 & 112 & 36 & 8 & 3 & 0    \\ \hline
+\end{tabular}
+\caption{distribution of stack depth at block boundary}
+\label{tab-1}
+\end{center}
+\end{table*}
+
+
+\subsection{Copy elimination}
+
+To eliminate unnecessary copies, the loading of values is delayed until the
+instruction is reached which consumes the value. To compute this
+information, the run-time stack is simulated at compile time. Instead of
+values, the compile-time stack contains the type of each value, whether a
+local variable was loaded to a stack location, and similar information.
+Adl-Tabatabai \cite{Taba+98} used a dynamic stack which is changed at every
+instruction. A dynamic stack only gives the possibility to move information
+from earlier instructions to later instructions. We use a static stack
+structure which enables information flow in both directions.
+
+Fig.~\ref{Trans1} shows our instruction and stack representation. An
+instruction has a reference to the stack before the instruction and the
+stack after the instruction. The stack is represented as a linked list. The
+two stacks can be seen as the source and destination operands of an
+instruction. In the implementation only the destination stack is stored;
+the source stack is the destination stack of the previous instruction.
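+
+In C, such a static stack structure could be declared roughly as follows.
+The sketch is illustrative only and does not show CACAO's actual
+declarations:
+
+\begin{verbatim}
+/* One stack slot; slots are linked from the top of the stack
+   downwards, so a stack is simply a pointer to its top slot. */
+typedef struct stackelement {
+    int                  type;    /* int, long, float, double,
+                                     address                   */
+    int                  varkind; /* copy of a local, argument,
+                                     interface register or
+                                     stack temporary           */
+    int                  varnum;  /* number within that kind   */
+    struct stackelement *prev;    /* next deeper stack slot    */
+} stackelement;
+
+/* One intermediate instruction; only the destination stack is
+   stored, the source stack is the dst of the previous
+   instruction.                                                */
+typedef struct instruction {
+    int           opc;            /* opcode                    */
+    int           op1, op2;       /* operands                  */
+    stackelement *dst;            /* stack after instruction   */
+} instruction;
+\end{verbatim}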
+\begin{figure}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(25,32)
+\put( 0, 0   ){\makebox(10,5){\tt b}}
+\put( 5, 2.5 ){\oval(10,5)}
+\put( 5, 9   ){\vector(0,-1){4}}
+\put( 0, 9   ){\makebox(10,5){\tt c}}
+\put( 5,11.5 ){\oval(10,5)}
+\put( 5,18   ){\vector(0,-1){4}}
+\put( 0,18   ){\makebox(10,5){\tt d}}
+\put( 5,20.5 ){\oval(10,5)}
+%\put( 5,27   ){\vector(0,-1){4}}
+\put(15, 9   ){\makebox(10,5){\tt c*d}}
+\put(20,11.5 ){\oval(10,5)}
+\put(20, 9   ){\line(0,-1){2}}
+\put( 5, 7   ){\line(1,0){15}}
+\put(10,27   ){\framebox(15,5){\tt imul}}
+\put(20,27   ){\vector(0,-1){13}}
+\put(15,27   ){\line(0,-1){6.5}}
+\put(15,20.5 ){\vector(-1,0){5}}
+\end{picture}
+\caption{instruction and stack representation}
+\label{Trans1}
+\end{center}
+\end{figure}
+
+This representation can easily be used for copy elimination. Each stack
+element not only contains the type of the stack slot but also the local
+variable number of which it is a copy, the argument number if it is an
+argument, and the interface register number if it is an interface slot.
+Load instructions (which push the contents of a variable onto the stack)
+and store instructions do not generate a copy machine instruction if the
+stack slot contains the same local variable. Generated machine instructions
+for arithmetic operations directly use the local variables as their
+operands.
+
+There are some pitfalls with this scheme. Take the example of
+fig.~\ref{Trans2}. The stack bottom contains the local variable {\tt a}.
+The instruction {\tt istore a} will write a new value for {\tt a} and will
+make a later use of this variable invalid. To avoid this we have to copy
+the local variable to a stack variable. An important decision is at which
+position the copy instruction should be inserted. Since there is a high
+number of {\tt dup} instructions in Java programs (around 4\%) and it is
+possible that a local variable resides in memory, the copy should be done
+with the {\tt load} instruction. Since the stack is represented as a linked
+list, only the destination stack has to be checked for occurrences of the
+offending variable, and these occurrences are replaced by a stack variable.
+
+
+\begin{figure}[htb]
+\begin{center}
+\setlength{\unitlength}{1mm}
+\begin{picture}(79,32)
+\put( 0,27   ){\framebox(15,5){\tt iload a}}
+\put(16,27   ){\framebox(13,5){\tt dup}}
+\put(30,27   ){\framebox(17,5){\tt iconst 1}}
+\put(48,27   ){\framebox(13,5){\tt iadd}}
+\put(62,27   ){\framebox(17,5){\tt istore a}}
+\put(10,27   ){\vector(0,-1){22}}
+\put( 5,27   ){\vector(0,-1){4}}
+\put( 5, 0   ){\makebox(10,5){\tt a}}
+\put(10, 2.5 ){\oval(10,5)}
+\put(20,27   ){\line(0,-1){2}}
+\put(20,25   ){\line(-1,0){10}}
+\put(25,27   ){\vector(0,-1){13}}
+\put(20, 9   ){\makebox(10,5){\tt a}}
+\put(25,11.5 ){\oval(10,5)}
+\put(25, 9   ){\line(0,-1){2}}
+\put(10, 7   ){\line(1,0){63}}
+\put(36,27   ){\line(0,-1){2}}
+\put(36,25   ){\line(-1,0){11}}
+\put(41,27   ){\vector(0,-1){4}}
+\put(36,18   ){\makebox(10,5){\tt 1}}
+\put(41,20.5 ){\oval(10,5)}
+\put(41,18   ){\line(0,-1){2}}
+\put(41,16   ){\line(-1,0){16}}
+\put(52,27   ){\line(0,-1){2}}
+\put(52,25   ){\line(-1,0){11}}
+\put(73,27   ){\line(0,-1){20}}
+\put(57,27   ){\vector(0,-1){13}}
+\put(52, 9   ){\makebox(10,5){\tt +}}
+\put(57,11.5 ){\oval(10,5)}
+\put(57, 9   ){\line(0,-1){2}}
+\put(68,27   ){\line(0,-1){2}}
+\put(68,25   ){\line(-1,0){11}}
+\end{picture}
+\caption{anti dependence}
+\label{Trans2}
+\end{center}
+\end{figure}
+
+To answer the question of how often this could happen and how expensive
+the stack search is, we analysed the {\tt javac} compiler again.
In more than 98\% of the cases the stack is empty (see table \ref{tab-2}).
In only 0.2\% of the cases the stack depth is higher than 1, and the
biggest stack depth is 3.

\begin{table}[htb]
\begin{center}
\begin{tabular}[b]{|l|c|c|c|c|c|}
\hline
stack depth & 0 & 1 & 2 & 3 & $>$3 \\ \hline
occurrences & 2167 & 31 & 1 & 3 & 0 \\ \hline
\end{tabular}
\caption{distribution of {\tt store} stack depth}
\label{tab-2}
\end{center}
\end{table}

\begin{table*}
\begin{center}
\begin{tabular}[b]{|l|c|c|c|c|c|c|c|c|c|c|}
\hline
chain length & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & $>$9 \\ \hline
occurrences  & 1892& 62 & 23 & 62 & 30 & 11 & 41 & 9 & 7 & 65 \\ \hline
\end{tabular}
\caption{distribution of creator-store distances}
\label{tab-3}
\end{center}
\end{table*}

To avoid copy instructions when executing a {\tt store} it is necessary to
connect the creation of a value with the store which consumes it. In that
case a {\tt store} can not only conflict with copies of a local variable
which result from {\tt load} instructions before the creator of the value,
but also with {\tt load} and {\tt store} instructions which exist between
the creation of the value and the {\tt store}. In fig.~\ref{Trans3} the {\tt
iload a} instruction conflicts with the {\tt istore a} instruction.

\begin{figure}[htb]
\begin{center}
\setlength{\unitlength}{1mm}
\begin{picture}(67,27)
\put( 0,22 ){\framebox(15,5){\tt iadd}}
\put(16,22 ){\framebox(15,5){\tt iload a}}
\put(32,22 ){\framebox(17,5){\tt istore b}}
\put(50,22 ){\framebox(17,5){\tt istore a}}
\put(10,22 ){\vector(0,-1){13}}
\put( 5,22 ){\line(0,-1){2}}
\put( 5,20 ){\line(-1,0){3}}
\put( 2,20 ){\vector(0,-1){20}}
\put(10, 4 ){\line(0,-1){2}}
\put( 5, 4 ){\makebox(10,5){\tt +}}
\put(10, 6.5){\oval(10,5)}
\put(21,22 ){\line(0,-1){2}}
\put(21,20 ){\line(-1,0){11}}
\put(26,22 ){\vector(0,-1){4}}
\put(21,13 ){\makebox(10,5){\tt a}}
\put(26,15.5){\oval(10,5)}
\put(26,13 ){\line(0,-1){2}}
\put(10,11 ){\line(1,0){46}}
\put(38,22 ){\line(0,-1){2}}
\put(38,20 ){\line(-1,0){12}}
\put(43,22 ){\line(0,-1){11}}
\put(56,22 ){\line(0,-1){11}}
\put(61,22 ){\line(0,-1){20}}
\put( 2, 2 ){\line(1,0){59}}
\end{picture}
\caption{anti dependence}
\label{Trans3}
\end{center}
\end{figure}

The anti dependences are detected by checking the stack locations of the
previous instructions for conflicts. Since the stack locations are
allocated as one big array, just the stack elements which have a higher
index than the current stack element have to be checked. Table \ref{tab-3}
gives the distribution of the distance between the creation of the value
and the corresponding store. In 86\% of the cases the distance is one.

\begin{figure*}[htb]
\begin{center}
%\small
\begin{verbatim}
  java.io.ByteArrayOutputStream.write (int)void

  Local Table:
    0:  (adr) S15
    1:  (int) S14
    2:  (int) S13
    3:  (int) S12  (adr) S12

  Interface Table:
    0:  (int) T24

  [                 L00]   0  ALOAD        0
  [                 T23]   1  GETFIELD     16
  [                 L02]   2  IADDCONST    1
  [                 L02]   3  NOP
  [                    ]   4  ISTORE       2
  [                 L02]   5  ILOAD        2
  [             L00 L02]   6  ALOAD        0
  [             T23 L02]   7  GETFIELD     8
  [             T23 L02]   8  ARRAYLENGTH
  [                    ]   9  IF_ICMPLE    L005
              ...............

  [                    ]  18  IF_ICMPLT    L003
  [                    ]      L002:
  [                 I00]  19  ILOAD        3
  [                 I00]  20  GOTO         L004
  [                    ]      L003:
  [                 I00]  21  ILOAD        2
  [                 A00]      L004:
  [                 L03]  22  BUILTIN1     newarray_byte
  [                    ]  23  ASTORE       3
  [                 L00]  24  ALOAD        0
  [                 A00]  25  GETFIELD     8
  [             A01 A00]  26  ICONST       0
  [         A02 A01 A00]  27  ALOAD        3
  [     A03 A02 A01 A00]  28  ICONST       0
  [ L00 A03 A02 A01 A00]  29  ALOAD        0
  [ A04 A03 A02 A01 A00]  30  GETFIELD     16
  [                    ]  31  INVOKESTATIC java/lang/System.arraycopy
  [                 L00]  32  ALOAD        0
  [             L03 L00]  33  ALOAD        3
  [                    ]  34  PUTFIELD     8
  [                    ]      L005:
              ...............

  [                    ]  45  RETURN
\end{verbatim}
\caption{Example: intermediate instructions and stack contents}
\label{IntermediateStack}
\end{center}
\end{figure*}

The output dependences are checked by storing the instruction number of the
last store in each local variable. If a store conflicts due to dependences,
the creator places the value in a stack register. Additional dependences
arise because of exceptions. The exception mechanism in Java is precise.
Therefore {\tt store} instructions are not allowed to be executed before
an exception raising instruction. This is checked easily by remembering
the last instruction which could raise an exception. In methods which contain
no exception handler this conflict can be safely ignored because no
exception handler can have access to these variables.


\subsection{Register allocation}

Expensive register allocation algorithms are neither suitable nor necessary.
The {\tt javac} compiler does a coloring of the local variables and assigns
the same number to variables which are not active at the same time. The
stack variables have implicitly encoded their live ranges. When a value is
pushed, the live range starts. When a value is popped, the live range ends.

Complications arise only with stack manipulation instructions like {\tt dup}
and {\tt swap}. We therefore flag the first creation of a stack variable and
mark a duplicated one as a copy. The register used for this variable can
be reused only after the last copy is popped.

During stack analysis, stack variables which have to survive a method
invocation are marked. These stack variables and local variables are
assigned callee saved registers. If there are not enough registers
available, these variables are allocated in memory.

Efficient implementation of method invocation is crucial to the performance
of Java. Therefore, we preallocate the argument registers and the return
value in a similar way as we handle store instructions. Input arguments (in
Java input arguments are the first variables) for leaf procedures (and
input arguments for processors with register windows) are preassigned, too.


\subsection{Instruction combining}

Together with stack analysis we combine constant loading instructions with
selected instructions which immediately follow them. The class of
combinable instructions comprises add, subtract, multiply and divide
instructions, logical and shift instructions, and compare/branch
instructions. During code generation the constant is checked to see whether
it lies in the range for immediate operands of the target architecture, and
appropriate code is generated.

The old translator expanded some complex instructions into multiple simpler
instructions to avoid complex instructions in the later passes. One such
case was the expansion of the {\tt lookup} instruction into a series of
load constant and compare and branch instructions.
Since the constants are usually quite small, this unnecessarily increased
the size of the intermediate representation and the final code. The new
compiler delays the expansion into multiple instructions until the code
generation pass, which keeps all representations small and speeds up the
compilation.


\subsection{Example}

Fig. \ref{IntermediateStack} shows the intermediate representation and
stack information as produced by the compiler for debugging purposes. The
{\tt Local Table} gives the types and register assignment for the local
variables. The Java compiler reuses the same local variable slot for
different local variables if their live ranges do not overlap. In this
example the variable slot 3 is even used for local variables of different
types (integer and address). The JIT-compiler assigned the saved register
12 to this variable.

One interface register is used in this example entering the basic block
with label {\tt L004}. At the entry of the basic block the interface
register has to be copied to the argument register {\tt A00}. This is one
of the rare cases where a more sophisticated coalescing algorithm could
have allocated an argument register for the interface.

At instructions 2 and 3 you can see the combining of a constant with an
arithmetic instruction. Since the instructions are allocated in an array,
the empty slot has to be filled with a {\tt NOP} instruction. The {\tt
IADDCONST} instruction already has the local variable {\tt L02} as
destination, information which comes from the later {\tt ISTORE} at
number 4. Similarly the {\tt INVOKESTATIC} at number 31 has marked all its
operands as arguments. In this example all copies (besides the one to the
interface register) have been eliminated.


\subsection{Complexity of the algorithm}

The complexity of the algorithm is mostly linear with respect to the number
of instructions and the number of local variables plus the number of stack
slots. There are only a small number of spots where it is not linear.

\begin{itemize}
\item At the beginning of a basic block the stack has to be copied to
      separate the stacks of different basic blocks. Table \ref{tab-1}
      shows that the stack at the boundary of a basic block is empty in
      most cases. Therefore, this copying does not influence the linear
      performance of the algorithm.
\item A store has to check for a later use of the same variable. Table
      \ref{tab-2} shows that this is not a problem either.
\item A store additionally has to check for a previous use of the same
      variable between the creation of the value and the store. The
      distances between the creation and the use are small (in most cases
      only 1) as shown by table \ref{tab-3}.
\end{itemize}

When compiling {\tt javac}, 29\% of the compile time is spent in parsing
and basic block determination, 18\% in stack analysis, 16\% in register
allocation and 37\% in machine code generation.


diff --git a/doc/handbook/library.tex b/doc/handbook/library.tex
new file mode 100644
index 000000000..85be1c128
--- /dev/null
+++ b/doc/handbook/library.tex
@@ -0,0 +1,5 @@
+\chapter{Class Library and Interface}
+
+\section{Introduction}
+
+\section{Header file generator}
\ No newline at end of file
diff --git a/doc/handbook/loader.tex b/doc/handbook/loader.tex
new file mode 100644
index 000000000..82acd8ee3
--- /dev/null
+++ b/doc/handbook/loader.tex
@@ -0,0 +1,15 @@
+\chapter{Class Loader}
+
+\section{Introduction}
+
+\section{System class loader}
+
+\section{Data structures}
+
+\section{Dynamic class loader}
+
+\section{Eager - lazy class loading}
+
+\section{Linking}
+
+\section{Initialization}
diff --git a/doc/handbook/loopopt.tex b/doc/handbook/loopopt.tex
new file mode 100644
index 000000000..83371e0e1
--- /dev/null
+++ b/doc/handbook/loopopt.tex
@@ -0,0 +1,347 @@
+\section{Loop optimizations}
+
+\subsection{Introduction}
+
In safe programming languages like Java, all array accesses are
checked. It is assured that the used index is greater than or equal to
zero and less than the array length; otherwise an
ArrayIndexOutOfBoundsException is thrown. It is obvious that this
gain in safety causes a run-time overhead that is especially
unpleasant in loops, where these checks have to be performed many
times. Often it is possible to remove these checks in loops by
inserting additional tests at the beginning of the loop and
examining the code structure and loop condition, thereby saving run
time at each loop execution. The following algorithm performs
this task for a special set of array accesses in loops. It should
be obvious that it is not always possible to predict the value of
an array index by inserting tests at the beginning of the loop. An
example would be a global variable that is used as an array index.
This variable could be changed by a different thread at virtually any
time, so removing a bound check for this array access is not a
good idea. The following algorithm only performs bound check
removal when the induction variable (the variable used as array
index) is local and is only modified by adding/subtracting a
constant inside the loop body or remains unchanged (is constant).
If a constant is used as index, optimization can take place as
well. When other variables are used to modify the induction
variable or when different arithmetic operations are used, no
optimization can take place. Nevertheless, the most common use of
index variables in loops is their increment or decrement by a
constant, so most of the array accesses should be considered for
optimization.

\subsection{Initialization}

Before array bound checks in loops can be eliminated, the loops
have to be detected. The algorithm performs its analysis on the
intermediate code level, so we cannot rely on just looking at
while/for statements. A basic block analysis has already been completed,
so the algorithm can work on this data structure. It uses the
Lengauer-Tarjan algorithm to find existing loops, which is based on
determining the dominator tree (a reference with extensive
documentation for this algorithm can be found in \cite{appel98}). It uses a
depth first search on the control flow graph to build a spanning
tree. Then the semidominator and dominator theorems are utilized to
extract the dominator tree, and by looking at back edges (a back
edge is an edge where the target node dominates the source node)
all loops can be detected, as the sketch below illustrates.
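The test for a back edge is a dominance query on the dominator tree. A
minimal C sketch, assuming immediate dominators have already been computed
into an {\tt idom} array (names and representation are ours, not the
actual code in loop.c):

\begin{verbatim}
#include <stdbool.h>

/* Does block a dominate block b?  Convention: idom[root] == root. */
static bool dominates(const int *idom, int a, int b) {
    while (b != idom[b]) {          /* climb b's dominator chain */
        if (b == a)
            return true;
        b = idom[b];
    }
    return b == a;                  /* reached the root          */
}

/* An edge src -> dst is a back edge iff dst dominates src;
   dst is then the header of a loop containing src.            */
static bool is_back_edge(const int *idom, int src, int dst) {
    return dominates(idom, dst, src);
}
\end{verbatim}
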
Before starting to look at each loop,
the case of different loops that share the same header node has to
be considered. The algorithm works on each loop without looking at any
other loop and performs its optimization. The procedures that
analyze the control flow of the program, build a flow graph and
extract the loops can be found in the files graph.c (for the control
flow graph) and loop.c (for the loop extracting algorithm). The
control flow graph is simply built by looking at the last
instruction of each basic block and then deciding which other
blocks can be reached by that instruction. One problem can occur
when lookup/table-switch instructions cause multiple jumps to the
same node. It must be prevented that the target node's predecessor
list contains that node more than once, so an additional array
is needed to prevent double entries in the predecessor array. When
the necessary flow data structure and the list of all loops have
been built, the main algorithm can start.


\subsection{Algorithm}

The procedures for the main algorithm can be found in analyze.c.
Before each loop is processed on its own, two global
preprocessing steps are necessary.

The first step deals with loops that share the same header node.
This special case can happen when a loop body ends with an if-
then/else statement. The loop finding algorithm then reports two
different loops which share the same header node. When additional
tests are inserted at the header node and another loop sharing the
same header is later evaluated, inserting different tests,
problems could arise. To prevent this from happening, loops
sharing the same header node are simply merged into a bigger one
by taking the union of their nodes. Because the nodes of the loop are
already sorted by increasing basic block numbers, a simple merge of the
nodes can be done (analyze\_double\_headers).

The second step before the loop-by-loop analysis commences is the
building of a loop hierarchy. Nested loops cause problems when
variables that are used as array indexes elsewhere get modified.
Because these modifications (e.g.\ an increment by a constant) can
happen an unknown number of times, their results are
unpredictable. These modifications in nested loops have to be
recognized and reacted upon accordingly. The algorithm builds a
tree where each node represents a loop, with its parent being the
directly surrounding loop. A dummy root node, representing the
whole function, has to be inserted as well. When loops have to be
duplicated because of optimized array accesses, it is important to
extend or duplicate exception entries as well. Because of that,
exceptions are inserted into the tree as follows. Every exception
is inserted at the node in the hierarchy that represents the
loop that directly contains this exception. When an exception is
part of a nested loop, it is inserted only at the node
representing this nested loop, because by following this node's
parent pointers, we find all other loops that contain the
exception (analyze\_nested). Finally, the sequential order of the
loops is determined by a topological sort of the loops, satisfying
the condition that all nested loops have to be optimized before
their parent loop is processed. This can be achieved by a post-order
traversal of the hierarchy tree, skipping the root node.

After these global steps have been completed, each loop is
processed.
Then the loops are checked for array accesses
(analyze\_for\_array\_access) and the processing of a loop stops if none
are found. Additionally, two special cases must be considered.

\begin{enumerate}

\item The algorithm uses the loop condition to guarantee certain
bounds for variables. If the loop condition consists of an or-
statement, these guarantees cannot be upheld. If the loop
statement reads

\begin{verbatim}
while ((var > 0) or true)
\end{verbatim}

var's value is not guaranteed to be greater than zero. When an or-statement
is found, loop optimization is stopped.

\item Variables that are used as indexes in array accesses might be
modified in the exception handling code of try-catch statements inside
the loop. Because code in catch blocks is not considered in the
control flow graph and because these modifications happen
unpredictably (and should occur rarely), loops containing such
catch blocks with index variable modifications are not optimized.
During the scan for array accesses within the loop, all
interesting variables are recorded. A variable is considered
interesting when it is used as an index in an array access or when
its value is changed (due to a store instruction)
(analyze\_or\_exceptions and scan\_global\_list).

\end{enumerate}

The next step is responsible for the analysis of the loop
condition itself. The loop condition can create a so-called
dynamic constraint on a variable. If one side of the loop
condition is constant during the execution of the loop, the other
side can be safely assumed to be less than, less than or equal to,
greater than, or greater than or equal to this side (depending on the
operator) at the start of each loop pass. It is important to notice the
difference to so-called static constraints on variables. Such
variables can be safely assumed to stay below or above a
certain constant during the whole loop execution by simply testing them
prior to loop entry. This is obviously true for constants but does
also hold for variables that are only decremented or incremented.
For these, a static lower or upper bound can be guaranteed for the
whole loop execution by simply inserting a single test prior to
loop entry. Dynamic constraints, on the contrary, can vary in
different parts of the loop. After the variable is tested in the
loop condition, it might be changed to different values in
different paths of execution. All variables that are never
changed or that get only decremented/incremented have static and
dynamic constraints; all others can only have dynamic ones
(init\_constraint).

Now the core bound check removal procedure is started. Beginning
directly after the loop condition, all changes of variables that
are used as index variables somewhere in the loop are recorded.
This is an iterative process, since different paths of execution
may yield different changes to these variables. Especially changes
in nested loops cause these changes to become unpredictable, and
therefore no upper/lower bound can be upheld any further. After
this iterative process has made all changes stable, each
array access is examined (e.g.\ a statement like
\begin{verbatim}
if (x == true)
    i++;
else
    i+=2;
\end{verbatim}
must result in an increase of 2 for i when control
flow joins after the if-statement). If it is possible, by
inserting additional static/dynamic tests as needed, to assure
that the index variable is greater than or equal to zero and less than
the array length, the bound checks are removed
(remove\_bound\_checks).
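To make the effect of the transformation concrete, the following C-style
sketch (our illustration, not code from the implementation) shows a
typical summation loop after the rewrite: the static tests inserted in
the new loop header select either an unchecked copy of the loop or the
original loop with all run-time checks. Here {\tt len} stands for the
array length taken from the array header, and {\tt checked\_load} stands
in for the normal checked access path:

\begin{verbatim}
#include <stddef.h>
#include <stdlib.h>

/* Stand-in for the normal checked access path (null pointer and
   bound check); a failure would raise the Java exception.        */
static int checked_load(const int *a, int len, int i) {
    if (a == NULL || i < 0 || i >= len)
        abort();
    return a[i];
}

int sum_array(const int *a, int len, int n) {
    int i, sum = 0;
    if (a != NULL && n <= len) {             /* static tests       */
        for (i = 0; i < n; i++)
            sum += a[i];                     /* optimized copy:
                                                checks removed     */
    } else {
        for (i = 0; i < n; i++)
            sum += checked_load(a, len, i);  /* original loop      */
    }
    return sum;
}
\end{verbatim}
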
It is possible that more than one static
test gets inserted for a single variable, when it is used in
different array accesses (possibly with different constants added or
subtracted). Because all tests have to hold for this variable to
allow the optimized code to be executed, only the strictest tests have
to be performed (e.g.\ if an integer array x[] is accessed by i in the
statements x[i+1] and x[i-1], it has to be guaranteed that i $\ge$ 1
(for the second statement) and that i $<$ arraylength-1 (for the first
statement)). Parallel to the insertion of new tests, the number of
instructions needed for the new loop header node is increased
accordingly. When optimizing the loop, it is important to
differentiate between the loop head and the rest of the basic
blocks forming the body. The dynamic constraints can only be used
for array accesses in the loop body, as the loop comparison is
done at the end of the header node. Nevertheless, all static
constraints can still be used to optimize array accesses in the
header block. Because it is possible that an array reference is
null prior to entering the loop, all array references that are
loaded to compute an arraylength have to be checked against null.

After all new tests have been determined, it is necessary to
reorder the code and insert the new tests (insert\_static\_checks).
The first step is the creation of a new header node. Because all
jumps to the beginning of the old loop now have to go through the new
header first, it is more efficient to just replace the code in the
old header block by the new tests. So only jumps within the loop
need to be patched and the rest of the code can remain untouched.
For each constraint that has been found in the previous step of
the algorithm, two values have to be loaded and are then compared.
Because it is possible that these tests fail at run time (and
no guarantee can be made), a copy of the loop with the original,
checked array accesses must exist. Depending on the outcome of the
test cascade, a jump to the optimized or the original loop is made. To
copy the loop, all basic blocks that are part of the loop are
copied and appended to the end of the global basic block list.
After that, both the original and the copy of the loop need post-processing
to redirect certain jump targets. All jumps to the old
loop head (which now contains the static checks) in the original
loop have to be redirected to the newly inserted block that holds
the code for the original loop head. In the copied loop, all jumps
inside the loop have to be redirected to jumps to the copied
blocks. When loops are duplicated, these changes must be reflected
in the node lists of all parent loops as well. So the hierarchy
tree is climbed and the new nodes are added to all enclosing
loops. Because these node lists are sorted, it is necessary to
deal with the new loop head in a different way. The loop head has
to be inserted into the correct place of the parent loops, while
all other copied nodes can simply be appended to the basic block
list.

Now all exceptions have to be examined. There are three different
cases that must be considered. An exception can be part of the
loop body (i.e.\ it is inside the loop), an exception can contain the
loop, or an exception is in a different part of the code (and does
not have to be considered further). To be able to find all
exceptions that are part of the loop, the algorithm uses the
hierarchy tree and gets all exceptions of all children nodes.
The
exception handlers of these loops have to be copied and jumps to the
original loop have to be redirected to the copied loop. The
start and end code pointers of the protected area have to be moved
to the copied blocks. The exception handler code is identified by
looking at the successor nodes starting from the handler block. As
long as control flow does not reach a loop node again, all blocks
are considered as part of the handler code. Exceptions that
surround the loop have to be handled differently. It is not
necessary to copy the exception handler, as it is not part of the
loop. Nevertheless, it is important to extend the protected area to
the copied blocks. So the first and last blocks (including copied
exception handler blocks of nested loops) that got copied are
stored. The exceptions are then extended to contain all these new
blocks and jump to the original handler. As no code is duplicated,
nothing needs to be patched. When climbing up the hierarchy tree
by following the parent pointer, not all exceptions of parent
nodes really enclose the loop. It is possible that a parent loop
contains an exception and the loop that is optimized right after
each other. Those exceptions must not be handled and are ignored.
Because of the layout of the exception table, where appropriate
exceptions (which could be nested) are found by a linear search
from the start, it is necessary to insert newly created exceptions
right after their original ones.

One performance problem remains after these modifications.
The new loop header that replaces the original one is located
exactly where the old one was. A fall-through from the previous
block (which could be part of the loop) to the loop header must be
patched by inserting an additional goto instruction to the
original head of the loop. Because this would insert an additional
jump into the loop body, performance may deteriorate. To solve
this shortcoming, the new loop head has to be moved before the
first node of the loop. This might cause the problem of moving the
loop head to the beginning of the global basic block list. This
pointer points to the initial basic block array and cannot be
easily reassigned. A new pointer is needed that temporarily holds
the beginning of the basic block list and is assigned to the original
pointer after all optimization steps have finished. Any fall-through
from the predecessor of the loop head now reaches the old
loop head, which has been inserted right after the new loop head.
After all loops have been processed, register allocation and code
generation proceed as usual and optimized code is generated.

\subsection{Helper functions}

An important helper function is stored in tracing.c. This
function is needed to determine the variables/constants that
participate in array accesses or comparisons. As all work is done
on Java bytecode, it is necessary to find the arguments of an
interesting operation by examining the stack. Unfortunately these
values can be just temporaries that originate in variables loaded
earlier and were modified by arithmetic operations. To find the
variables that participate in an array access, one has to walk
back through the instructions, looking at the stack, until an appropriate
load is found.
The function tracing performs this task by stepping
back instruction by instruction and recording the stack changes
these instructions cause, until the correct load or push constant
operation has been found or until it becomes clear that it is
impossible to determine the origin (e.g.\ when a function return
value is used). The value is then delivered in a special structure
and used by the main algorithm.

\subsection{Performance - impact and gain}

It is obvious that the optimization process causes additional
compile time overhead but can give performance gains during
run time, when three fewer instructions are executed per array access
many times in a loop. The additional overhead is mainly caused by
the needed control flow analysis, which has to be done for every
method and is linear in the number of basic blocks. Then the loop
scanning algorithm tries to find loops in the control flow graph.
This algorithm has to be started every time a method is compiled
and runs in O(n*ld(n)), where n is the number of basic blocks.
After the loops have been scanned for array accesses, the run time of
the final bound check removal and loop copying process mainly
depends on the nesting depth of the hierarchy tree. For every
additional nesting depth, the number of basic blocks that must be
copied and patched is doubled, resulting in overhead exponential
in the nesting depth. Additionally, the search for
modifications of index variables, which is an iterative process,
is slowed down by deeply nested loops. Things get worse when
many exceptions are involved, because not only exceptions in
children nodes have to be considered, but all enclosing exceptions
as well. Nevertheless, those cases should rarely occur, and in real
environments the net gain can be significant. Especially in tight
loops, when arrays are initialized or their elements summed up,
three fewer instructions can gain up to 30 percent speedup when loops run
many times.
+ + +\vspace{4mm} + +\begin{tabular}{|l|c|c|c|} + +\hline + +& Compile time (in ms) & \multicolumn{2}{c|}{Run time (in ms) } \\ + +\multicolumn{1}{|c|}{\raisebox{1ex}{Cacao Options}} & javac & perf & sieve 1000 1000\\ \hline + +\rule{0mm}{5mm}with optimization (-oloop) & 176 & 1510 & 98 \\ + +without optimization & 118 & 1720 & 131 \\ + +\hline + +\end{tabular} + diff --git a/doc/handbook/mips.tex b/doc/handbook/mips.tex new file mode 100644 index 000000000..97e464178 --- /dev/null +++ b/doc/handbook/mips.tex @@ -0,0 +1 @@ +\section{MIPS code generator} diff --git a/doc/handbook/native.tex b/doc/handbook/native.tex new file mode 100644 index 000000000..c88c64e09 --- /dev/null +++ b/doc/handbook/native.tex @@ -0,0 +1 @@ +\section{Native interface} diff --git a/doc/handbook/overview.tex b/doc/handbook/overview.tex new file mode 100644 index 000000000..80fec0909 --- /dev/null +++ b/doc/handbook/overview.tex @@ -0,0 +1,11 @@ +\chapter{Overview} + +\section{Introduction} + +\section{Class loader} + +\section{Just-in-time compiler} + +\section{Run time system} + +\section{Class library} diff --git a/doc/handbook/powerpc.tex b/doc/handbook/powerpc.tex new file mode 100644 index 000000000..59718c7d7 --- /dev/null +++ b/doc/handbook/powerpc.tex @@ -0,0 +1 @@ +\section{PowerPC code generator} diff --git a/doc/handbook/reflection.tex b/doc/handbook/reflection.tex new file mode 100644 index 000000000..0d44e42c1 --- /dev/null +++ b/doc/handbook/reflection.tex @@ -0,0 +1 @@ +\section{Reflection} diff --git a/doc/handbook/runtime.tex b/doc/handbook/runtime.tex new file mode 100644 index 000000000..2cb1db8d7 --- /dev/null +++ b/doc/handbook/runtime.tex @@ -0,0 +1,456 @@ +\chapter{Run Time System} + +\section{Introduction} + +\section{Object layout} + +\section{Method invocation} + +Java and the programming language Theta do not implement multiple +inheritance, but single inheritance with multiple subtyping. This important +difference makes object layout and method dispatch more efficient. Although +the bidirectional layout was designed for a language with multiple +subtyping, it has the problem that more than one {\em vtbl} pointer has to +be included in objects. The CACAO JIT compiler \cite{KrGr97b} moves the +dispatch header of the bidirectional layout into the class information with +negative offsets from the {\em vtbl}. For each implemented interface there +is a distinct interface {\em vtbl}. Unimplemented interfaces are +represented by null pointers. 
+% +\begin{figure*}[htb] +\begin{center} +\setlength{\unitlength}{1mm} +\begin{picture}(114,72) +\put( 0,42){\vector(1,0){12}} +\put( 0,42){\makebox(8,6){\it objptr}} +\put(12,60){\makebox(24,6){\it object}} +\put(48,63){\makebox(30,6){\it class}} +\put(90,66){\makebox(24,6){\it code}} +\put(12,42){\makebox(24,18){\it instance data}} +\put(12,36){\makebox(24,6){\it class pointer}} +\put(12,42){\line(1,0){24}} +\put(36,39){\vector(1,0){12}} +\put(48,57){\makebox(30,6){\it method pointer}} +\put(48,57){\line(1,0){30}} +\put(48,51){\makebox(30,6){\it method pointer}} +\put(48,51){\line(1,0){30}} +\put(48,45){\makebox(30,6){\it method pointer}} +\put(48,45){\line(1,0){30}} +\put(48,39){\makebox(30,6){\it class info}} +\put(48,27){\makebox(30,6){\it interface pointer}} +\put(48,33){\line(1,0){30}} +\put(48,33){\makebox(30,6){\it interface pointer}} +\put(48, 0){\makebox(30,6){\it method pointer}} +\put(48, 6){\makebox(30,6){\it method pointer}} +\put(48, 6){\line(1,0){30}} +\put(48,21){\makebox(30,6){\it interfaces}} +\put(90,36){\framebox(24,6){\it method code}} +\put(44,15){\vector(1,0){4}} +\put(44,15){\line(0,1){15}} +\put(44,30){\line(1,0){4}} +\put(40, 0){\vector(1,0){8}} +\put(40, 0){\line(0,1){36}} +\put(40,36){\line(1,0){8}} +\put(78, 3){\line(1,0){8}} +\put(86, 3){\line(0,1){51}} +\put(78, 9){\line(1,0){4}} +\put(82, 9){\line(0,1){39}} +\put(78,18){\line(1,0){4}} +\put(78,54){\line(1,0){8}} +\put(86,36){\vector(1,0){4}} +\put(90,48){\framebox(24,6){\it method code}} +\put(78,48){\vector(1,0){12}} +\put(90,60){\framebox(24,6){\it method code}} +\put(78,60){\vector(1,0){12}} +\thicklines +\put(48, 0){\framebox(30,12){}} +\put(48,15){\framebox(30,6){\it method pointer}} +\put(12,36){\framebox(24,24){}} +\put(48,27){\framebox(30,36){}} +\put(48,39){\line(1,0){30}} +\end{picture} +\caption{CACAO object and compact class descriptor layout} +\label{objectcompactlayout} +\end{center} +\end{figure*} + +To call a virtual method, two memory access instructions are necessary +(load the class pointer, load the method pointer) followed by the call +instruction. Calling an interface method needs an additional indirection. 
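The two dispatch sequences can be pictured in C as follows. This is a
minimal sketch under assumed structure and field names; in particular,
the real layout places the interface tables at negative offsets from the
{\em vtbl}, which plain C cannot express directly:

\begin{verbatim}
typedef struct object object;
typedef void (*methodptr)(object *self);

typedef struct vftbl {
    methodptr  *table;           /* class methods                  */
    methodptr **interfacetable;  /* one method table per interface */
} vftbl;

struct object { vftbl *vftbl; };

/* virtual call: load class pointer, load method pointer, call */
static void call_virtual(object *obj, int m_index) {
    obj->vftbl->table[m_index](obj);
}

/* interface call: one additional indirection through the
   per-interface table (compact layout)                        */
static void call_interface(object *obj, int if_index, int m_index) {
    obj->vftbl->interfacetable[if_index][m_index](obj);
}
\end{verbatim}
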
%
\begin{figure*}[htb]
\begin{center}
\setlength{\unitlength}{1mm}
\begin{picture}(114,42)
\put(0,15){\vector(1,0){12}}
\put(0,15){\makebox(12,6){\it objptr}}
\put(12,33){\makebox(24,6){\it object}}
\put(48,36){\makebox(30,6){\it class}}
\put(90,39){\makebox(24,6){\it code}}
\put(12,15){\makebox(24,18){\it instance data}}
\put(12,9){\makebox(24,6){\it class pointer}}
\put(12,15){\line(1,0){24}}
\put(36,12){\vector(1,0){12}}
\put(48,30){\makebox(30,6){\it method pointer}}
\put(48,30){\line(1,0){30}}
\put(48,24){\makebox(30,6){\it method pointer}}
\put(48,24){\line(1,0){30}}
\put(48,18){\makebox(30,6){\it method pointer}}
\put(48,18){\line(1,0){30}}
\put(48,12){\makebox(30,6){\it class info}}
\put(48,0){\makebox(30,6){\it ifmethod pointer}}
\put(48,6){\line(1,0){30}}
\put(48,6){\makebox(30,6){\it ifmethod pointer}}
\put(90,3){\framebox(24,6){\it method code}}
\put(78,3){\vector(1,0){12}}
\put(82,3){\line(0,1){18}}
\put(78,21){\line(1,0){4}}
\put(78,27){\line(1,0){8}}
\put(78,9){\line(1,0){8}}
\put(86,9){\line(0,1){18}}
\put(86,18){\vector(1,0){4}}
\put(90,18){\framebox(24,6){\it method code}}
\put(78,33){\vector(1,0){12}}
\put(90,33){\framebox(24,6){\it method code}}
\thicklines
\put(12,9){\framebox(24,24){}}
\put(48,0){\framebox(30,36){}}
\put(48,12){\line(1,0){30}}
\end{picture}
\caption{CACAO object and fast class descriptor layout}
\label{objectfastlayout}
\end{center}
\end{figure*}

In the faster scheme, we store interface methods in an additional table at
negative offsets from the {\em vtbl} (see figure~\ref{objectfastlayout}).
Segregating the interface virtual function table keeps the standard virtual
function table small and allows interface methods to be called with just
two memory accesses. The memory consumption of virtual function tables
containing interface and class methods would be {\em number of (classes +
interfaces) $\times$ number of distinct methods}. The memory consumption of
the interface tables is only {\em number of classes which implement
interfaces $\times$ number of interface methods}. Colouring can be used to
reduce the number of distinct offsets for interface methods further, but it
complicates dynamic class loading, leading to renumbering and code
patching.

The Jalapeno virtual machine implements interface method invocation in a
way similar to the fast class descriptor layout of CACAO, but instead of
colouring it uses hashing of the method indices \cite{Alpern+01}. The table
for the interface method pointers is allocated with a fixed size much
smaller than the number of interface methods. When two method indices hash
to the same offset, a conflict resolving stub is called instead of the
interface method directly. For conflict resolution the stub is passed the
method index in a scratch register as an additional argument. An interface
method invocation can be executed with the following four machine
instructions:
%
\begin{verbatim}
LD  vtblptr,(obj)                  ; load vtbl pointer
LD  mptr,hash(method_ptr)(vtblptr) ; load method pointer
MV  mindex,idreg                   ; load method index
JSR (mptr)                         ; call method (or conflict stub)
\end{verbatim}
%
The number of machine instructions is the same as in the compact class
descriptor layout of CACAO, but the indirection is eliminated, which
reduces the number of cycles needed for the execution of this instruction
sequence on a pipelined architecture.
Compared to the old interface method
invocation in the Jalapeno VM, which searched the superclass hierarchy for
a matching method signature, the new method resulted in run-time changes
ranging from one percent slowdowns to speedups of up to 51 percent.


\section{Run time type checking}

Since type tests for trees are more (space) efficient than type tests for
DAGs, a possible solution is to split a DAG into a tree part and the
remaining graph. For languages with single inheritance and multiple
subtyping this partitioning of the class hierarchy is already done in the
language itself.

\begin{figure}[htb]
\begin{center}
\setlength{\unitlength}{1mm}
\begin{picture}(86,26)
\thicklines
\put(23, 3){\circle{6}}
\put(43, 3){\circle{6}}
\put(63, 3){\circle{6}}
\put(33,13){\circle{6}}
\put(53,13){\circle{6}}
\put(43,23){\circle{6}}
\put(20, 0){\makebox(6,6){D}}
\put(40, 0){\makebox(6,6){E}}
\put(60, 0){\makebox(6,6){F}}
\put(30,10){\makebox(6,6){B}}
\put(50,10){\makebox(6,6){C}}
\put(40,20){\makebox(6,6){A}}
\put(19, 0){\makebox(0,6)[r]{\{3,0\}}}
\put(47, 0){\makebox(6,6)[l]{\{4,0\}}}
\put(67, 0){\makebox(0,6)[l]{\{6,0\}}}
\put(29,10){\makebox(0,6)[r]{\{2,2\}}}
\put(57,10){\makebox(0,6)[l]{\{5,1\}}}
\put(47,20){\makebox(0,6)[l]{\{1,5\}}}
\put(25, 5){\line( 1,1){6}}
\put(35,15){\line( 1,1){6}}
\put(41, 5){\line(-1,1){6}}
\put(51,15){\line(-1,1){6}}
\put(61, 5){\line(-1,1){6}}
\end{picture}
\caption{Relative numbering with \{{\tt baseval}, {\tt diffval}\} pairs}
\label{relnumbering}
\end{center}
\end{figure}

CACAO uses a subtype test based on relative numbering for classes and a
kind of matrix implementation for interfaces. Two numbers {\em low} and
{\em high} are stored for each class in the class hierarchy. A depth first
traversal of the hierarchy increments a counter for each class and assigns
the counter to the {\em low} field when the class is first encountered and
assigns the counter to the {\em high} field when the traversal leaves the
class. In languages with dynamic class loading a renumbering of the
hierarchy is needed whenever a class is added. A class {\em sub} is a
subtype of another class {\em super}, if {\em super.low $\le$ sub.low $\le$
super.high}. Since a range check is implemented more efficiently by an
unsigned comparison, CACAO stores the difference between the {\em low} and
{\em high} values and compares it against the difference of the {\em low}
values of both classes. The code for {\tt instanceof} looks similar to:

\begin{verbatim}
return (unsigned) (sub->vftbl->baseval - super->vftbl->baseval) <=
       (unsigned) (super->vftbl->diffval);
\end{verbatim}

For leaf nodes in the class hierarchy the {\tt diffval} is 0, which results
in a faster test (a simple comparison of the {\tt baseval} fields of the
sub- and superclass). In general a JIT compiler can generate the faster
test only for final classes. An AOT compiler, or a JIT compiler which
patches the already generated machine code, may additionally replace
both the {\tt baseval} and the {\tt diffval} of the superclass by
constants. Currently CACAO uses constants only when dynamic class loading is
not used.

CACAO stores an interface table at negative offsets from the virtual method
table (see figure~\ref{objectcompactlayout}). This table is needed for the
invocation of interface methods. The same table is additionally used by the
subtype test for interfaces. If the table is empty for the index of the
superclass, the subtype test fails.
The code for {\tt instanceof} looks
similar to:

\begin{verbatim}
return (sub->vftbl->interfacetable[-super->index] != NULL);
\end{verbatim}

Both subtype tests can be implemented by very few machine code instructions
without using branches, which are expensive on modern processors.


\section{Exception handling}

\subsection{Introduction}

Exceptions in Java occur either implicitly or explicitly. Typical
implicit exceptions are dereferencing a {\tt null} pointer, array
index out of bounds and division by zero. Exceptions can also be
raised explicitly with the {\tt throw} instruction. To handle
exceptions occurring during execution, code which can raise an
exception is included in a {\tt try} block. An efficient
implementation of exception handling has to take care of managing {\tt
try} blocks and has to check for implicit exceptions efficiently.


\subsection{Known implementation techniques}

Three standard methods exist for implementing exception handling:

\begin{itemize}

\item dynamically create a linked list of {\tt try} block data
      structures,

\item use static {\tt try} block tables and search these tables at run
      time (suggested for JavaVM interpreters),

\item use functions with two return values.

\end{itemize}

The first method has been used in portable implementations of
exception handling for C++ \cite{Cameron+92} or Ada \cite{Giering+94}
using {\tt setjmp} and {\tt longjmp}. A linked exception handling data
structure is created when entering a {\tt try} block and the structure
is discarded when leaving the protected block. Java requires precise
exceptions. This means that all expressions evaluated before the
exception raising instruction must have finished and all expressions
after the raising instruction must not have been started. Therefore,
in practice, some instructions may not be moved freely. In the case of
subroutine calls, the callee-saved registers must be restored to their
original values. The data structure can be used to store such
information. The disadvantage of this method is that creating and
discarding the data structure takes some time even if an exception
is never raised.

The second method has been suggested for an efficient exception
handling implementation of C++ \cite{KoenigStroustrup90} and is used
in Java implementations. For every method, the JavaVM maintains an
exception table. This exception table contains the program counter of
the start and the end of the {\tt try} block, the program counter of
the exception handler and the type of the exception. A JavaVM
interpreter can easily interpret this structure and dispatch to the
corresponding handler code. If the byte code is translated to native
code, the equivalent technique is more complicated.

To simplify restoration of the registers, the old CACAO implementation
used a different scheme \cite{KrGr97b}. A method has two return
values: the real return value and an exception value stored in a
register. After each method call, the exception register is checked
and, if it is non-zero, the exception handling code is executed. Since
an exception is rarely raised, the branch is easy to predict and
cheap. Entering and leaving a {\tt try} block have no associated
costs. At compile time, the dispatch information contained in the
exception table is translated into method dispatching native code.

Run time checks for null pointers and array bounds are quite frequent,
but can be eliminated in many cases.
It is often possible to move a
loop invariant null pointer check before the loop or to eliminate a
bound check. Some more sophisticated compilers use these code motion
techniques.


\subsection{Motivation for a change}

\begin{table*}
\begin{center}
\begin{tabular}[b]{|l|c|c|c|c|c|}
\hline
 & JavaLex & javac & espresso & Toba & java\_cup \\
\hline
null pointer checks & 6859 & 8197 & 11114 & 5825 & 7406 \\
\hline
method calls & 3226 & 7498 & 7515 & 4401 & 5310 \\
\hline
try blocks & 20 & 113 & 44 & 28 & 27 \\
\hline
\end{tabular}
\caption{Number of pointer checks, method invocations and try blocks}
\label{MethodTry}
\end{center}
\end{table*}

The old CACAO implementation was simple, but it only makes sense if
the number of try blocks is high. We made an empirical study to count
the number of static occurrences of method invocations and of {\tt
try} blocks in some applications (see table \ref{MethodTry}). The
number of method invocations is two orders of magnitude bigger than the
number of {\tt try} blocks. Furthermore, an exception is rarely raised
during program execution. This led us to a new implementation of exception
handling.


\subsection{The new exception handling scheme}

The new exception handling scheme is similar to that of a JavaVM
interpreter. If an exception occurs, the information in the exception
table is interpreted. However, native code complicates the matter.

The pointers to Java byte code must be replaced by pointers to native
code. This requires that, during native code generation, the order of
basic blocks is not allowed to change. If basic blocks are eliminated
because of dead code, the information about a block can not be
discarded if there is a pointer to it in the exception table.

A CACAO stack frame only contains copies of saved or spilled
registers. There is no saved frame pointer. The size of a stack frame
is only contained in the instructions which allocate and deallocate
the stack. Therefore, to support exception handling, additional
information has to be stored elsewhere.

The code for a method needs access to constants (mostly address
constants). Since a global constant table would be too large for short
address ranges and because methods are compiled on demand, every
method has its own constant area which is allocated directly before
the start of the method code (see fig.\ \ref{methodlayout}). A
register is reserved which contains the method pointer. Constants are
addressed relative to the method pointer.

\begin{figure}[htb]
\begin{center}
\setlength{\unitlength}{1mm}
\begin{picture}(50,18)
\put(24,6){\vector(-1,0){6}}
\put(24,3){\makebox(25,6){\it method pointer}}
\put(0,0){\makebox(18, 6){\it constants}}
\put(0,6){\makebox(18,12){\it code}}
\thicklines
\put(0,6){\line(1,0){18}}
\put(0,0){\framebox(18,18){}}
\end{picture}
\caption{CACAO method layout}
\label{methodlayout}
\end{center}
\end{figure}

During a method call, the method pointer of the calling method is
destroyed. However, the return address is stored in a register which is
preserved during execution of the called method and has to be used for
returning from the method. After a method return, the method pointer
of the calling method is recomputed using the return address.
The
following code for a method call demonstrates the method calling
convention:

\begin{verbatim}
LDQ cp,(obj)     ; load class pointer
LDQ mp,met(cp)   ; load method pointer
JSR ra,(mp)      ; call method
LDA mp=ra+offset ; recompute method pointer
\end{verbatim}

At the beginning of the constant area, there are fields which contain
the necessary information for register restoration:

\begin{mylist}{{\tt framesize}}
\item[{\tt framesize}] the size of the stack frame
\item[{\tt isleaf}] a flag which is true if the method is a leaf
method
\item[{\tt intsave}] number of saved integer registers
\item[{\tt floatsave}] number of saved floating point registers
\item[{\tt extable}] the exception table -- similar to the JavaVM
table
\end{mylist}

The exception handler first checks if the currently executing method has
an associated handler and may dispatch to this handler. If there is no
handler, it unwinds the stack and searches the parent methods for a
handler. The information in the constant area is used to restore the
registers and update the stack pointer. The return address and the
offset in the immediately following {\tt LDA} instruction are used to
recompute the method pointer.

\begin{table*}
\begin{center}
\begin{tabular}[b]{|l|c|c|c|c|c|}
\hline
          & JavaLex & javac  & espresso & Toba  & java\_cup \\ \hline
CACAO old & 61629   & 156907 & 122951   & 67602 & 87489     \\ \hline
CACAO new & 37523   & 86346  & 69212    & 41315 & 52386     \\ \hline
\end{tabular}
\caption{Number of generated native instructions}
\label{CodeSize}
\end{center}
\end{table*}

The change to the new scheme allowed us to implement the null pointer
check for free. We protect the first 64 Kbyte of memory against read
and write access. If a bus error is raised, we catch the signal and
check if the faulting address is within the first 64K. If this is the
case, we dispatch to our exception handler, otherwise we propagate the
signal. This gives faster programs and reduces the work of the
compiler in generating pointer checking code. As shown in table
\ref{MethodTry}, the number of null pointer checks is quite high.


\section{Garbage collector}


diff --git a/doc/handbook/threads.tex b/doc/handbook/threads.tex
new file mode 100644
index 000000000..d9fc76060
--- /dev/null
+++ b/doc/handbook/threads.tex
@@ -0,0 +1,603 @@
+\section{Multi threading}
+
+\subsection{Introduction}
+
Threads are an integral part of the Java programming language. A Java
Runtime Environment (JRE) has to implement threads to be compliant. A
thread implementation includes the creation, execution, switching,
killing and synchronization of threads. In Java the latter is provided
by monitors. Monitors ensure that, for a given object, only one thread
at a time can execute certain methods, known as synchronized methods,
and blocks which are marked by the keyword \texttt{synchronized}.

Monitors are usually implemented using a mutex (mutual exclusion lock). A
mutex is a data structure which contains the necessary information to
guarantee that only one unit of execution can perform a critical
section at the same time \cite{Stallings95}.

As we show in section \ref{MonitorTest}, a fast implementation of the
synchronization mechanism is crucial for the efficiency of Java. One
implementation produced more than 800\% overhead in one of our tests.


\subsection{Related work}

Java threads can be implemented using threads provided by the
operating system kernel, as user-level libraries, or as a combination
of the two.
There exist a number of articles describing different
thread implementation aspects, but none addresses the problem of
efficient monitor operations for objects.

Sun's first implementation of the JavaVM on Solaris was based on
user-level threads. The current implementation uses a combination of
kernel and user-level threads. Some of the advantages of this approach
are outlined in \cite{SunThreads97}.

The freely available JavaVM implementation {\tt kaffe} by Tim
Wilkinson uses user-level threads \cite{Wilkinson:97}. Until version
0.9, each object contained the complete mutex data structure. This
enabled a fast monitor implementation but used a lot more memory than
necessary.

Apart from the thread implementations used in JavaVMs, there are many
other thread standards and implementations, the most notable being the
IEEE POSIX extension \cite{PosixThreads96}.

In \cite{Mueller93}, Mueller describes a library implementation of
POSIX threads on a standard UNIX system. This library is a user-level
implementation which needs no support from the operating system. A very
popular library of user-level primitives for implementing threads is
{\em QuickThreads} by David Keppel, described in \cite{Keppel93}.

Bershad et al.\ present a fast mechanism for mutual exclusion on
uniprocessor systems \cite{Bershad+92}. They have a software solution
for the implementation of an atomic test-and-set operation which is
faster than the corresponding hardware instruction. However, their
implementation relies on assistance from the operating system.


\subsection{Implementation basics}

A complete thread implementation for Java has to provide:

\begin{itemize}

\item thread creation and destruction,

\item low level thread switching (usually implemented in assembly
      language),

\item thread scheduling,

\item synchronization primitives,

\item a thread safe non-blocking input/output library.

\end{itemize}

Cacao's current implementation of threads is based mainly on the
threading code of {\tt kaffe} version 0.7, which has been released
under a BSD-style license and can thus be used freely
\cite{Wilkinson:97}. As mentioned above, {\tt kaffe}'s threads are
completely user-level, which means, for example, that they cannot take
advantage of a multiprocessor system.

\begin{table*}
\begin{center}
\begin{tabular}{|l|l|c|c|c|c|c|c|} \hline
 & & \multicolumn{3}{|c|}{mutex test}
 & \multicolumn{3}{|c|}{tree test}\\ \hline
 & & \multicolumn{2}{|c|}{run time in secs}&call cost
 & \multicolumn{2}{|c|}{run time in secs}&call cost\\
Machine & JavaVM & no sync & sync & in $\mu$s & no sync & sync & in $\mu$s \\ \hline\hline
DEC 21064A 289MHz & OSF JDK port (1.0.2)& 1.20 & 4.14 & 9.8 & 8.37 & 34.69 & 9.8 \\ \hline
DEC 21064A 289MHz & DEC JDK interpreter & 1.71 & 12.80 & 36.97 & 12.25 & 143.10 & 39.93 \\ \hline
DEC 21064A 289MHz & DEC JDK JIT & 1.06 & 11.05 & 33.30 & 7.80 & 130.50 & 37.45 \\ \hline
Pentium-S 166MHz & Linux JDK 1.1.3 & 0.96 & 1.69 & 2.43 & 7.93 & 16.12 & 2.50 \\ \hline
AMD K6 200MHz & Sun JPP JIT (WinNT) & 0.57 & 0.94 & 1.23 & 2.8 & 9.8 & 2.13 \\ \hline
AMD K6 200MHz & Navigator 4.04 (WinNT) & 0.03 & 0.77 & 2.46 & 2.3 & 9.2 & 2.11 \\ \hline
%PowerPC 604e 200MHz & Linux JDK Beta & 1.16 & 2.05 & 2.96 & & & \\ \hline
%Sun Ultra-1 & JDK 1.1.2 & 0.90 & 1.25 & 1.16 & & & \\ \hline
%DEC 21064A 300MHz & Cacao unoptimized & 0.24 & 0.48 & 0.80 & 0.24 & 0.48 & 0.80 \\ \hline
DEC 21064A 289MHz & Cacao & 0.28 & 0.40 & 0.40 & 1.46 & 4.71 & 0.99 \\ \hline
\end{tabular}
\caption{Overhead for calling synchronized methods (one object)}
\label{SyncronizedOverhead}
\end{center}
\end{table*}

There are several reasons why we chose this approach:

\begin{itemize}

\item Thread support differs from operating system to operating
system. Not only do some operating systems provide different APIs from
other systems, but even if they do provide the same interface (most
often in the form of POSIX threads), the costs of various operations
are often very different across platforms.

\item It was a complete implementation, tailored for use in a
JavaVM, which made it possible for us to get thread support up and
running quickly.

\item Parts of Cacao are not yet thread-safe (most notably the
compiler and the garbage collector), thus making it difficult to use a
kernel-supported solution.

\end{itemize}

Each thread in a Java program corresponds to an instance of the
\texttt{java.lang.Thread} class. In order for the Java run time
environment (JRE) to associate platform-specific information with such
an instance, it has a private member called \texttt{PrivateInfo} of
type \texttt{int}, which can be used by the JRE. Kaffe version 0.7
used this member as a pointer to a context structure. Since pointers
are 64-bit values on the Alpha, we use an array of context structures.
\texttt{PrivateInfo} is then used as an index into this array.

A fixed-size stack is allocated for each thread. The stack size for
ordinary threads can be specified as a command-line parameter. Special
threads (such as the garbage collector) have their own stack sizes. In
order to catch stack overflows without the burden of checking the
stack top at each method entry, we simply guard the stack top with one
or more memory pages. The memory protection bits of these pages are
modified to cause a memory protection violation when accessed. The
number of guard pages can be specified on the command-line.

Thread switching is implemented straightforwardly, namely by saving
callee-save registers to the stack, switching to the new stack,
restoring callee-save registers and performing a subroutine return. A
portable approximation of such a switch is sketched below.
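Cacao's real switch is a short assembly routine; the following C sketch
approximates the same idea portably using the POSIX \texttt{ucontext}
API (the function and variable names are ours):

\begin{verbatim}
#include <stdio.h>
#include <stdlib.h>
#include <ucontext.h>

static ucontext_t main_ctx, thread_ctx;

static void thread_body(void) {
    puts("in user-level thread");
    swapcontext(&thread_ctx, &main_ctx);   /* switch back to main */
}

int main(void) {
    char *stack = malloc(64 * 1024);       /* the thread's stack  */
    getcontext(&thread_ctx);
    thread_ctx.uc_stack.ss_sp   = stack;
    thread_ctx.uc_stack.ss_size = 64 * 1024;
    thread_ctx.uc_link          = &main_ctx;
    makecontext(&thread_ctx, thread_body, 0);
    swapcontext(&main_ctx, &thread_ctx);   /* first switch        */
    puts("back in main");
    free(stack);
    return 0;
}
\end{verbatim}
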

Scheduling is very simple in Cacao: higher priority threads always get
precedence over lower priority threads, and threads of the same
priority are scheduled with a round-robin algorithm.

I/O in user-level threads is a problem since UNIX I/O calls are, by
default, blocking. They would block not just the current thread but
the whole process. This is undesirable. It is thus common practice for
thread libraries to use non-blocking I/O and, in the case of an
operation which would block, suspend the corresponding thread and
perform a multiplexed I/O operation (\texttt{select(2)} in UNIX) on
all such blocked files regularly, especially if there are no runnable
threads available.


\subsection{Motivation}

\label{MonitorTest}

To optimize an implementation it is necessary to find the parts which
are critical to performance. Therefore, we analysed the cost of
monitors with two small test programs. The {\em mutex test} program
simply invoked a method 300000 times, once with the method being
declared \texttt{synchronized} and once without. The other test, {\em
tree test}, allocated a balanced binary tree with 65535 elements and
recursively traversed it 50 times using a method, again once
synchronized and once not.

Table \ref{SyncronizedOverhead} shows the differences in run time for
the two versions of the programs for several JavaVMs. The table
includes the run times for both versions of the programs in seconds.
The column `call cost' gives the overhead of a call of a synchronized
method compared to a call of a non-synchronized one. From these
numbers it is obvious that calls to synchronized methods, or monitors
in general, are much slower than ordinary method calls.

\begin{table*}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} \hline
Application & Objects allocated & Objects with mutex & Monitor operations & Parallel Mutexes \\ \hline\hline
javac  & 111504 & 13695 &  840292 &  5 \\ \hline
Biss   &  84939 & 13357 & 1058901 & 12 \\ \hline
Jigsaw & 215411 & 23804 &  855691 & 25 \\ \hline
\end{tabular}
\caption{Mutual exclusion statistics}
\label{MutexStatistics}
\end{center}
\end{table*}

The question that arises now is whether this has any influence on
common Java programs. To answer this question, we have used a modified
version of \texttt{kaffe} to gather statistics about monitor usage.
The results are summarized in table \ref{MutexStatistics}.

Javac is an invocation of Sun's \texttt{javac} on the Toba source
files \cite{Proebsting+97} and is thus single-threaded. Execution of
this program takes only a few seconds using Cacao with threads
disabled. Biss is a more or less typical working session with the Java
development environment of the Biss-AWT \cite{BissAWT97}. It is
slightly multithreaded. Jigsaw invokes the HTTP server Jigsaw
\cite{Jigsaw97} of the World Wide Web Consortium and lets it serve
identical parallel requests from seven hosts, amounting to about one
megabyte split across 200 files for each request. This application is
highly multithreaded.

The table contains the number of objects allocated during execution
and the number of objects for which a monitor has been entered. The
`Monitor operations' column gives the total number of operations
performed on monitors, while the numbers in the `Parallel Mutexes'
column show the greatest number of mutexes that were locked
simultaneously.

These numbers demonstrate that the performance of monitor operations
is of paramount importance for a fast JavaVM implementation. We did
not include the number of blocking monitor operations because there
were just two of them, namely in the Biss application. It was only
after we modified \texttt{kaffe} to switch to another thread before
each \texttt{monitorenter} operation that the Biss and Jigsaw
applications performed a few thousand blocking \texttt{monitorenter}s.


\subsection{Mutex implementation}

Our mutex data structure includes a pointer to the thread that has
currently locked the mutex (\texttt{holder}). If the mutex is not
locked at all, this is a null pointer. Because one thread can lock the
same mutex multiple times, we need a count of how many lock operations
without corresponding unlocks have been performed on the mutex
(\texttt{count}). When this count drops to zero, the mutex is not
locked by any thread. Furthermore, we need to keep track of the
threads which have tried to lock the mutex but were blocked and are
now waiting for it to become unlocked (\texttt{waiters}).

The data structure is defined as follows:

\begin{verbatim}
struct mutex {
    int count;
    thread *holder;
    thread *waiters;
};
\end{verbatim}

The locking of a mutex can now be specified as in
figure~\ref{lockMutex}.

\begin{figure}
\begin{verbatim}
void lockMutex (struct mutex *mux) {
    disablePreemption();

    if (mux->holder == NULL) {
        mux->holder = currentThread;
        mux->count = 1;
    } else if (mux->holder == currentThread) {
        mux->count++;
    } else {
        blockUntilMutexIsNotLocked(mux);
        mux->holder = currentThread;
        mux->count = 1;
    }

    enablePreemption();
}
\end{verbatim}
\caption{Code for \texttt{lockMutex()}}
\label{lockMutex}
\end{figure}

The macro \texttt{disablePreemption()} simply sets a global flag
indicating that a critical section is currently being executed and
that preemption must not take place. \texttt{enablePreemption()}
unsets the flag and checks whether a thread switch is necessary (see
below). On a multiprocessor system this would need to be implemented
using a test-and-set instruction or some equivalent.

The inverse operation, namely the unlocking of the mutex, can be
expressed as in figure~\ref{unlockMutex}.

\begin{figure}
\begin{verbatim}
void unlockMutex (struct mutex *mux) {
    disablePreemption();

    --mux->count;
    if (mux->count == 0) {
        mux->holder = NULL;
        if (mux->waiters != NULL) {
            thread *next = mux->waiters;
            mux->waiters = next->next;
            resumeThread(next);
        }
    }

    enablePreemption();
}
\end{verbatim}
\caption{Code for \texttt{unlockMutex()}}
\label{unlockMutex}
\end{figure}

The function \texttt{resumeThread()} sets a flag which results in a
thread switch to the given thread after preemption has been
re-enabled.

These algorithms are simple, straightforward, and reasonably
efficient.


\subsection{Object-Mutex relation}
\label{objectmutex}

Since the JavaVM specification states that each object has a mutex
associated with it, the obvious solution seems to be to include the
mutex structure in each object. The mutex structure could be reduced
to a length of eight bytes if we used thread numbers instead of
pointers. But even with such a solution, the javac application would
allocate nearly one megabyte of mutex data (111504 objects at eight
bytes each), just for a few seconds of execution. This is
unacceptable.

On the other hand, the figures show that it is very seldom that more
than 20 mutexes are locked at the same time. This suggests that using
a hash table, indexed by the address of the object for which a monitor
operation is to be performed, could be very efficient. This is exactly
what Cacao does.

We use a hash function which uses the $2n$ least significant bits of
the object's address, where $2^n$ is the size of the hash table. This
function can be implemented in four RISC instructions. Since we ran
into performance problems with overflow handling by internal chaining,
we now use external chaining with a preallocated pool of overflow
entries managed by a free list.

An entry in the hash table has the following structure:

\begin{verbatim}
struct entry {
    object *obj;
    struct mutex mux;
    struct entry *next;
};
\end{verbatim}

In order to minimize the overhead of the locking/unlocking operations,
we should strive for code sequences small enough to be inlined, yet
powerful enough to handle the common case. We have chosen to handle
the first entry in the collision chain slightly differently from the
rest: its mutex is not destroyed when the count goes down to zero.
Instead, the decision is deferred until a mutex with the same hash
code gets locked and would thus use this location. The major benefit
of this approach is that the code to lock the mutex need not (in the
common case) check whether the location is free, since each hash table
location is free only at the beginning of the program and is never
freed again once used. The unlocking code is likewise simplified,
since it need not check whether the hash table location should be
freed. Based on the experience that a recently locked mutex is likely
to be locked again in the near future, this technique also improves
overall performance.
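
The hash function itself is not spelled out above. One plausible
four-instruction realization, folding the $2n$ least significant
address bits into an $n$-bit index, is sketched below; the shift
amount, the table size and the \texttt{hashtable} array are
assumptions of this illustration, not necessarily the actual Cacao
code.

\begin{verbatim}
#define HASHTABLESIZE 64 /* 2^n entries, n = 6 */

struct entry hashtable[HASHTABLESIZE];

/* Fold the 2n least significant bits of the
   object address into an n-bit table index:
   a shift, an xor, a mask and the indexing,
   roughly four RISC instructions. */
struct entry *firstChainEntry (object *o)
{
    unsigned long a = (unsigned long) o;
    return &hashtable[(a ^ (a >> 6))
                      & (HASHTABLESIZE - 1)];
}
\end{verbatim}

This is the function assumed by the \texttt{monitorenter()} and
\texttt{monitorexit()} code shown next.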

\begin{figure}
\begin{verbatim}
 1 void monitorenter (object *o)
 2 {
 3   entry *e;
 4   disablePreemption();
 5
 6   e = firstChainEntry(o);
 7   if (e->obj == o
 8       && e->mux.holder
 9          == currentThread)
10     e->mux.count++;
11   else
12     lockMutexForObject(e,o);
13
14   enablePreemption();
15 }
\end{verbatim}
\caption{Code of \texttt{monitorenter()}}
\label{monitorenter}
\end{figure}

\begin{figure}
\begin{verbatim}
 1 void monitorexit (object *o)
 2 {
 3   entry *e;
 4   disablePreemption();
 5
 6   e = firstChainEntry(o);
 7   if (e->obj == o)
 8     e->mux.count--;
 9   else
10     unlockMutexForObject(e,o);
11
12   enablePreemption();
13 }
\end{verbatim}
\caption{Code of \texttt{monitorexit()}}
\label{monitorexit}
\end{figure}


\begin{table*}
\begin{center}
\begin{tabular}{|l|r|r|r|r|r|} % monitorEnter
\hline
Program & Line 6 & Line 10 & count 0 & Line 12 & Ratio $12/6$ \\ \hline\hline
javac & 420147 & 405978 & 381019 & 14169 & 3.372 \% \\ \hline
Biss & 384350 & 370171 & 425038 & 14179 & 3.689 \% \\ \hline
Jigsaw & 426695 & 391680 & 294957 & 35015 & 8.206 \% \\ \hline
\end{tabular}
\caption{Execution statistics for \texttt{monitorenter()}}
\label{monitorEnter}
\end{center}
\end{table*}


\begin{figure}
\begin{verbatim}
 1 void lockMutexForObject (entry *e,
 2                          object *o) {
 3   disablePreemption();
 4
 5   if (e->obj != NULL)
 6     if (e->mux.count == 0)
 7       claimEntry(e,o);
 8     else
 9       while (e->obj != o) {
10         if (e->next == NULL) {
11           e = e->next = allocEntry(o);
12           break;
13         }
14         e = e->next;
15       }
16   else
17     e->obj = o;
18
19   lockMutex(&e->mux);
20
21   enablePreemption();
22 }
\end{verbatim}
\caption{Code for \texttt{lockMutexForObject()}}
\label{lockMutexForObject}
\end{figure}

\begin{figure}
\begin{verbatim}
 1 void unlockMutexForObject (entry *e,
 2                            object *o) {
 3   disablePreemption();
 4
 5   if (e->obj == o)
 6     unlockMutex(&e->mux);
 7   else {
 8     /* Assuming entry is there */
 9     while (e->next->obj != o)
10       e = e->next;
11     unlockMutex(&e->next->mux);
12     if (e->next->mux.count == 0)
13       e->next = freeEntry(e->next);
14   }
15
16   enablePreemption();
17 }
\end{verbatim}
\caption{Code for \texttt{unlockMutexForObject()}}
\label{unlockMutexForObject}
\end{figure}

See figures~\ref{monitorenter} and~\ref{monitorexit} for the code of
the corresponding JavaVM instructions. These functions handle (as we
show below) the most common case and rely on the two functions for the
general case presented in figures~\ref{lockMutexForObject}
and~\ref{unlockMutexForObject} (we now assume that
\texttt{disablePreemption()}/\texttt{enablePreemption()} pairs can be
nested).

Even if they are not short enough to be inlined in every synchronized
method, these functions are certainly small enough to make it
worthwhile to code them as specially tailored assembly-language
functions, bypassing the standard subroutine linkage conventions and
gaining a little extra speed.
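
For the assumption that
\texttt{disablePreemption()}/\texttt{enablePreemption()} pairs nest,
the global flag described earlier has to become a counter. A minimal
sketch of such nestable macros follows; the names
\texttt{preemptionDepth}, \texttt{switchPending} and
\texttt{performThreadSwitch()} are assumptions of this illustration,
not the actual Cacao code.

\begin{verbatim}
/* Preemption stays disabled while the depth
   counter is non-zero; a pending thread switch
   is carried out only when the outermost
   enablePreemption() is reached. */
static int preemptionDepth = 0;
static int switchPending = 0; /* set by
                                 resumeThread() */

#define disablePreemption() (preemptionDepth++)

#define enablePreemption()                     \
  do {                                         \
    if (--preemptionDepth == 0                 \
        && switchPending) {                    \
      switchPending = 0;                       \
      performThreadSwitch();                   \
    }                                          \
  } while (0)
\end{verbatim}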

\subsection{Results}

\begin{table}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} % monitorExit
\hline
Program & Line 6 & Line 8 & Line 10 & Ratio $10/6$ \\ \hline\hline
javac & 420146 & 419815 & 331 & 0.078 \% \\ \hline
Biss & 384368 & 383281 & 1087 & 0.282 \% \\ \hline
Jigsaw & 428997 & 416890 & 12107 & 2.822 \% \\ \hline
\end{tabular}
\caption{Execution statistics for \texttt{monitorexit()}}
\label{monitorExit}
\end{center}
\end{table}

To demonstrate that nearly all cases are indeed handled by these small
routines, we have written a small application which simulates the
locking and unlocking operations of the three applications used above
(tables~\ref{monitorEnter} and \ref{monitorExit}). As can be seen,
only a small percentage of the cases needs to be handled by the
general routines \texttt{lockMutexForObject()} and
\texttt{unlockMutexForObject()}.

The column `count 0' gives the number of mutex locks where the
\texttt{count} of the mutex before locking is zero. We present this
number to evaluate an implementation where \texttt{monitorenter()}
checks the \texttt{count} instead of the \texttt{holder} to determine
whether the mutex can be locked immediately. The numbers seem to be in
favour of checking \texttt{holder}, but at least indicate that the
difference is negligible. Furthermore, locking the mutex after having
checked only \texttt{count} needs an additional assignment (setting
\texttt{holder}).

\begin{table*}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} \hline
 & \multicolumn{4}{|c|}{miss rates by size of cache} \\
Application & 1 Element & 2 Elements & 4 Elements & 8 Elements \\
\hline\hline
javac & 15.076 \% & 9.757 \% & 4.931 \% & 3.193 \% \\ \hline
Biss & 13.488 \% & 8.349 \% & 4.765 \% & 3.141 \% \\ \hline
Jigsaw & 43.694 \% & 37.700 \% & 22.680 \% & 5.182 \% \\ \hline
\end{tabular}
\caption{Results of cache simulation}
\label{CacheResults}
\end{center}
\end{table*}

We have also considered the possibility of using a cache of recently
used mutexes to improve performance, similar to a
translation-lookaside buffer in a microprocessor, which caches the
mapping between virtual and physical memory pages. To evaluate whether
this would be worthwhile, we have simulated caches with one, two, four
and eight elements, using the three applications as test candidates
and least-recently-used as the cache replacement strategy. Though
least-recently-used is not easily implemented in software, it provides
a good estimate of the best hit rate that can be achieved with an
efficient implementation. Table~\ref{CacheResults} summarizes the
results.

\begin{table*}
\begin{center}
\begin{tabular}[b]{|l|c|c|c|c|c|}
\hline
 & JavaLex & javac & espresso & Toba & java\_cup \\ \hline
run time without threads in secs & 2.82 & 4.91 & 3.23 & 4.32 & 1.35 \\ \hline
run time with threads in secs & 3.89 & 5.40 & 3.23 & 5.53 & 1.56 \\ \hline
overhead (optimized impl.) & 37 \% & 10 \% & 0 \% & 28 \% & 15 \% \\ \hline
number of lock/unlock pairs & 1818889 & 420146 & 2837 & 1558370 & 124956 \\ \hline
\end{tabular}
\caption{Overhead of monitor operations}
\label{CACAOMutexCost}
\end{center}
\end{table*}

Using one implementation that supports the monitor routines discussed
in section~\ref{objectmutex} and one implementation without thread
support, we have run several applications on a 300MHz DEC 21064A (see
table~\ref{CACAOMutexCost}).
For these single-threaded applications, the overhead introduced by
monitor operations ranges from 0\% to 37\%, depending on the number of
monitor operations the application performs. Note, however, that these
figures cannot be compared to the overhead figures given in table
\ref{SyncronizedOverhead}, since these applications do more than just
enter and exit monitors.

Using the implementation described, the {\em mutex test} application
of table \ref{SyncronizedOverhead} took 0.40 seconds to complete with
a synchronized method and 0.28 seconds with an ordinary one, so the
time spent on locking/unlocking the mutex is 0.40 microseconds per
call. The reason for the higher cost of mutex operations in the {\em
tree test} is that this test violates the locality of monitor
operations. Overall, these numbers compare very favourably with the
other implementations.

For most single-threaded applications, the monitor overhead can be
eliminated completely: if it is possible to determine statically that
the dynamic class loader and the \texttt{java.lang.Thread} class are
not used, no synchronization code need be generated.


\subsection{Thin locks}


diff --git a/doc/handbook/verification.tex b/doc/handbook/verification.tex
new file mode 100644
index 000000000..0108b2749
--- /dev/null
+++ b/doc/handbook/verification.tex
@@ -0,0 +1 @@
+\section{Verification}
diff --git a/doc/handbook/x86.tex b/doc/handbook/x86.tex
new file mode 100644
index 000000000..6f4ab72d7
--- /dev/null
+++ b/doc/handbook/x86.tex
@@ -0,0 +1 @@
+\section{X86 code generator}