% rjfpubs.bib

@conference{fowler77,
  author = {Fowler, R.J.},
  title = {Approaches to Multi-Dimensional Searching},
  booktitle = {Proceedings of the First Intl. Advanced Study Symposium on Topological
		Data Structures for Geographic Information Systems},
  organization = {Harvard Laboratory for Computer Graphics and Spatial Analysis},
  address = {Cambridge, MA},
  month = oct,
  year = 1977,
  note = {Republished in {\em The Harvard Papers on
		Geographic Information Systems\/}, Vol. 4.}
}
@conference{peuckerfowler78,
  author = {T.K. Peucker and R.J. Fowler and J.J. Little and D.M. Mark},
  title = {The Triangulated Irregular Network},
  booktitle = {Proceedings of the American 
	  Society of Photogrammetry Digital Terrain Model Symposium},
  address = {St. Louis, MO},
  month = may,
  year = 1978
}
@conference{fowlerlittle79,
  author = {R.J. Fowler and J.J. Little},
  title = {Automatic Extraction of Irregular Network Digital 
                  Terrain Models},
  booktitle = {Proceedings of SIGGRAPH '79},
  month = aug,
  year = 1979,
  note = {{Computer Graphics}, Vol. 13, No. 2, Aug. 1979},
  pages = {199--207}
}
@conference{tanimotofowler80,
  author = {S.L. Tanimoto and R.J. Fowler},
  title = {Covering Image Subsets with Patches},
  booktitle = { Proceedings of the International Conference on Pattern
		Recognition},
  month = dec,
  year = 1980,
  address = {Miami Beach, Fla.},
  pages = {835--839}
}
@techreport{fowlerpaterson80,
  author = {R. J. Fowler and M.S. Paterson and S.L. Tanimoto},
  title = {The Complexity of Packing and Covering in the Plane and 
		Related Intersection Graph Problems},
  institution = {Dept. of Computer Science, University of Washington},
  number = {80-05-02},
  month = may,
  year = 1980
}
@article{fowlerpaterson81,
  author = {R.J. Fowler and M.S. Paterson and S.L. Tanimoto},
  title = {Optimal Packing and Covering in the Plane are NP-Complete},
  journal = {Information Processing Letters},
  volume = 12,
  number = 3,
  month = jun,
  year = 1981,
  pages = {133--137}
}
@inproceedings{LazowskaEtal81,
  author = {E. Lazowska and H. Levy and G. Almes and M. Fischer 
                      and R. Fowler and S. Vestal},
  title = {The Architecture of the {E}den System},
  booktitle = {Proceedings of the Eighth Symposium on 
                         Operating Systems Principles},
  address = {Pacific Grove, California},
  pages = {148--159},
  month = dec,
  year = 1981,
  facility = uwash
}
@article{dolevfischer82,
  author = {D. Dolev and M.J. Fischer and R.J. Fowler and N.A. Lynch and H.R. Strong},
  title = {An Efficient Algorithm for Byzantine Agreement Without Authentication},
  journal = {Information and Control},
  volume = 52,
  month = mar,
  year = 1982,
  pages = {257--274}
}
@conference{lynchfischer82,
  author = {N.A. Lynch and M.J. Fischer and R.J. Fowler},
  title = {A Simple and Efficient Byzantine Generals Algorithm},
  booktitle = {Proceedings of the Second Symposium
	 on Reliability in Distributed Software and Database Systems},
  month = jul,
  year = 1982,
  address = {Pittsburgh, PA},
  pages = {46--52}
}
@article{fowlerstrubel82,
  author = {R.J. Fowler and A.B. Strubel and P.A. Thiemans and S.C. Vestal
           and M.J. Fischer and T.H. Kehl and E.D. Lazowska},
  title = {The {CSL} Switch: A Micro-Computer
		Controlled Multi-Computer Front End},
  journal = {The Journal of Digital Systems},
  volume = {6},
  number = {3/4 (Summer/Fall)},
  year = 1982,
  pages = {265--278}
}
@techreport{Fowler84a,
  author = {Robert J. Fowler},
  title = {The Analysis of a Simple Distributed Resource Finding Protocol},
  institution = {Department of Computer Science, University of Washington},
  year = {1984},
  number = {TR84-08-02},
  month = aug,
  address = {Seattle}
}
@phdthesis{Fowler85,
  author = {Robert J. Fowler},
  title = {Decentralized Object Finding Using Forwarding Addresses},
  school = {University of Washington},
  year = {1985},
  note = {(Department of Computer Science Technical Report TR85-12-1)},
  month = dec,
  address = {Seattle, Washington}
}
@inproceedings{Fowler86a,
  author = {Robert J. Fowler},
  title = {The Complexity of Using Forwarding Addresses for   
                Decentralized Object Finding},
  booktitle = {Proceedings of the Fifth Annual ACM Symposium on  
                Principles of Distributed Computing},
  address = {Calgary, Alberta, Canada},
  month = aug,
  year = 1986,
  pages = {108--120}
}
@inproceedings{FowlerEtal88,
  author = {Robert J. Fowler and Thomas J. LeBlanc and John M. Mellor-Crummey},
  title = {An Integrated Approach to Parallel Program Debugging and
                Performance Analysis on Large-Scale Multiprocessors},
  booktitle = {Proceedings of the SIGPLAN/SIGOPS Workshop on Parallel
                and Distributed Debugging},
  organization = {ACM},
  address = {Madison, Wisconsin},
  month = may,
  year = 1988,
  pages = {74--182}
}
@techreport{FowlerCox88a,
  author = {Robert J. Fowler and Alan L. Cox},
  title = {An Overview of {{\sf PLATINUM}}:  A Platform for Investigating
                 Non-Uniform Memory},
  institution = {Computer Science Department, University of Rochester},
  number = {TR-262},
  month = nov,
  year = 1988
}
@conference{coxfowler89,
  author = {A.L. Cox and R.J. Fowler},
  title = {The Implementation of a Coherent Memory Abstraction on a
                {NUMA} Multiprocessor: Experiences with {PLATINUM}},
  booktitle = {Proceedings of the 12th {ACM} Symposium on Operating
                Systems Principles},
  year = 1989,
  pages = {32--44},
  address = {Litchfield Park, AZ},
  month = dec
}
@techreport{CoxEtal90,
  author = {Alan L. Cox and Robert J. Fowler and Jack E. Veenstra},
  title = {Interprocessor Invocation on a {NUMA} Multiprocessor},
  institution = {Computer Science Department, University of Rochester},
  number = {TR-356},
  year = 1990,
  month = oct
}
@article{LeblancEtal90,
  author = {Thomas J. LeBlanc and John M. Mellor-Crummey and Robert J. Fowler},
  title = {Analyzing Parallel Program Executions Using Multiple Views},
  journal = {Journal of Parallel and Distributed Computing},
  volume = 9,
  pages = {203--217},
  year = 1990
}
@conference{boloskyscott91,
  author = {W.J. Bolosky and M.L. Scott and R.P. Fitzgerald and
                R.J. Fowler and A.L. Cox},
  title = {{NUMA} Policies and Their Relation to Memory Architecture},
  booktitle = {Proceedings of the 4th Symposium on Architectural Support
                for Programming Languages and Operating Systems},
  year = 1991,
  month = apr,
  pages = {212--221}
}
@conference{fowlerkontothanassis92a,
  author = {R. J. Fowler and L. I. Kontothanassis},
  title = {Supporting User-Level Exception Handling on a Multiprocessor 
		Micro-Kernel: Experiences with {PLATINUM}},
  booktitle = {Proceedings of the Third Symposium on Experiences with
		Distributed and Multiprocessor Systems (SEDMS III)},
  address = {Newport Beach, CA},
  month = mar,
  year = 1992,
  ps = {papers/sedms.ps.gz},
  pages = {217--232}
}
@techreport{fowlerkontothanassis92b,
  author = {R.J. Fowler and L.I. Kontothanassis},
  title = {Improving Processor and Cache Locality in
                Fine-Grain Parallel Computations using Object-Affinity
                Scheduling and Continuation Passing},
  institution = {University of Rochester, Department of Computer Science},
  year = 1992,
  ps = {papers/tr411.ps.gz},
  number = {TR-411}
}
@conference{veenstrafowler92,
  author = {J.E. Veenstra and R.J. Fowler},
  title = {A Performance Evaluation of
                 Optimal Hybrid Cache Coherency Protocols},
  booktitle = {Proceedings of the 5th International Conference on Architectural
		Support for Programming Languages and Operating Systems},
  year = 1992,
  pages = {149--160},
  ps = {papers/asplos92.ps.gz},
  month = sep
}
@conference{coxfowler93,
  author = {A. Cox and R. Fowler},
  title = {Adaptive Cache Coherency for Detecting Migratory Shared Data},
  booktitle = {Proceedings of the 20th Annual International Symposium
                on Computer Architecture},
  address = {San Diego, California},
  month = may,
  ps = {papers/isca93.ps.gz},
  year = 1993
}
@techreport{veenstrafowler93a,
  author = {J. E. Veenstra and R. J. Fowler},
  title = {{MINT} Tutorial and User Manual},
  institution = {University of Rochester, Department of Computer Science},
  number = {TR-452},
  month = nov,
  year = 1993
}
@conference{macariefowler93,
  author = {I. Macarie and R. J. Fowler},
  title = {A Closed-Form Approximation to an MVA
	 Multiprocessor Model and a New Component of Communication Overhead},
  booktitle = {Proceedings of ROCYCS '93, the Romanian Symposium on Computer Science},
  month = nov,
  year = 1993,
  address = {Iasi, Romania},
  ps = {papers/rocsys.ps.gz},
  pages = {335--347}
}
@conference{fowler93,
  author = {R. J. Fowler},
  title = {Architectural Convergence and the Granularity of
	    Objects in Distributed Systems},
  booktitle = {Proceedings of the ECOOP '93 Workshop on
           Object-Based Distributed Programming},
  editor = {R. Guerraoui and O. Nierstrasz and M. Riveill},
  month = jul,
  year = 1994,
  series = {Lecture Notes in Computer Science},
  publisher = {Springer-Verlag},
  volume = {791},
  ps = {papers/lncs791.ps.gz},
  pages = {36--49}
}
@conference{fowlerkontothanassis94,
  author = {R. J. Fowler and L. I. Kontothanassis},
  title = {Mercury: Object-Affinity Scheduling and Continuation 
		Passing on Multiprocessors},
  booktitle = {Proceedings of PARLE '94 (Parallel Architectures and Languages
               Europe)},
  address = {Athens, Greece},
  month = jul,
  year = 1994,
  series = {Lecture Notes in Computer Science},
  publisher = {Springer-Verlag},
  volume = {817},
  pages = {661--676},
  ps = {papers/MercuryPARLE94.ps.gz},
  xnote = {refereed conference}
}
@conference{veenstrafowler94a,
  author = {J. E. Veenstra and R. J. Fowler},
  title = { {MINT}: A Front End for Efficient
		Sequential Simulation of Multiprocessor Memory Hierarchies},
  booktitle = {Proceedings of the Second International Workshop on Modeling,
		Analysis and Simulation of  Computer and
		Telecommunication Systems (MASCOTS'94)},
  address = {Durham, North Carolina},
  month = jan,
  year = 1994,
  pages = {201--207},
  ps = {papers/MASCOTS94.ps.gz},
  xnote = {refereed conference}
}
@techreport{veenstrafowler94b,
  author = {J. E. Veenstra and R. J. Fowler},
  title = {The Prospects for On-Line
		Hybrid Coherency Protocols on Bus-Based Multiprocessors},
  institution = {University of Rochester, Computer Science Department},
  number = {TR-490},
  month = mar,
  year = 1994
}
@conference{kochfowler94a,
  author = {P. T. Koch and R. J. Fowler},
  title = {Carlsberg: A Distributed Execution Environment Providing Coherent 
            Shared Memory and Integrated Message Passing},
  booktitle = {Proceedings of the Nordic Workshop on Programming Environment
                 Research (NWPER'94)},
  editor = {B. Magnusson and G. Hedin and S. Min\"{o}r},
  month = jun,
  year = 1994,
  pages = {279--294},
  ps = {papers/NWPER94.ps.gz},
  note = {Available as Lund Institute of Technology Report LU-CS-TR:94-127}
}
@conference{kochfowler94b,
  author = {P. T. Koch and R. J. Fowler and E. B. Jul},
  title = {Message-Driven Relaxed Consistency in a Software Distributed 
		Shared Memory},
  booktitle = {Proceedings of the First Symposium on
		Operating Systems Design and Implementation (OSDI)},
  address = {Monterey, California},
  month = nov,
  year = 1994,
  pages = {75--86},
  ps = {papers/OSDI94.ps.gz},
  xnote = {refereed conference}
}
@conference{zhanglcr2k,
  author = {Kai Zhang and John Mellor-Crummey and Robert Fowler},
  title = {Compilation and Runtime Optimizations for Software
             Distributed Shared Memory},
  booktitle = {Languages, Compilers and Run-Time Systems for Scalable
            Computers, 5th International Workshop},
  address = {Rochester, New York},
  month = may,
  year = 2000,
  series = {Lecture Notes in Computer Science},
  publisher = {Springer-Verlag},
  volume = {1915},
  pages = {182--191},
  pdf = {papers/zhanglcr2k.pdf},
  ps = {papers/zhanglcr2k.ps.gz},
  abstract = {We present two novel optimizations for compiling
  High Performance Fortran~(HPF) to page-based software distributed
  shared memory systems~(SDSM).  One technique, {\em compiler-managed
  restricted consistency}, uses compiler-derived knowledge to delay the
  application of memory consistency operations to data that is provably
  not shared in the current synchronization interval, thus reducing
  false sharing. (False sharing occurs when two or more
  processors each accesses mutually disjoint sets of data elements in
  the same block.) The other technique, {\em compiler-managed shared
  buffers}, when combined with the previous optimization, eliminates
  fragmentation. (Fragmentation occurs when an entire block of
  data is communicated to transport only a small fraction of its content.)
  Together, the two techniques permit compiler-generated code to
  efficiently apply multi-dimensional computation partitioning and
  wavefront parallelism to execute efficiently on SDSM systems.}
}
@conference{broomHUG00ea,
  author = {Bradley Broom and Daniel {Chavarr\'\i{}a-Miranda} and  Guohua Jin
    and  Rob Fowler  and  Ken Kennedy and  John Mellor-Crummey},
  title = {Overpartitioning with the Rice dHPF Compiler},
  booktitle = {Proceedings of the 4th Annual HPF Users Group Meeting},
  address = {Tokyo, Japan},
  month = oct,
  year = 2000,
  ps = {papers/HUG00EA.ps.gz},
  pdf = {papers/HUG00EA.pdf},
  note = {(extended abstract)}
}
@conference{broom:kelpio,
  author = { Bradley Broom and Rob Fowler and Ken Kennedy},
  title = {{KelpIO}: A Telescope-Ready
    Domain-Specific {I/O} Library for Irregular Block-Structured
    Applications},
  booktitle = { Proceedings of the 2001 IEEE
    International Symposium on Cluster Computing and the Grid},
  address = { Brisbane, Australia },
  month = may,
  year = 2001,
  pages = {148--155},
  note = {Best Paper Award},
  ps = {papers/kiopaper.ps.gz},
  pdf = {papers/kiopaper.pdf},
  abstract = { To ameliorate the
  need to spend significant programmer time modifying parallel
  programs to achieve high-performance, while maintaining compact,
  comprehensible source codes, this paper advocates the use of
  telescoping languages technology to automatically apply, during the
  normal compilation process, high-level performance enhancing
  transformations to applications using a high-level domain-specific
  {I/O} library.  We believe that this approach will be more
  acceptable to application developers than new language extensions,
  but will be just as amenable to optimization by advanced compilers,
  effectively making it a domain-specific language extension for
  {I/O}.

  The paper describes a domain-specific {I/O} library for irregular
  block-structured applications based on the {KeLP} library, describes
  high-level transformations of the library primitives for improving
  performance, and describes how a high-level domain-specific optimizer
  for applying these transformations could be constructed using the
  telescoping languages framework.  },
  xnote = { IEEE Press # PR01010, ISBN 0-7695-1010-8}
}
@article{kennedy:telestrat,
  author = {K. Kennedy and B. Broom and K. Cooper 
             and J. Dongarra and R. Fowler and D. Gannon and
              L. Johnsson and J. Mellor-Crummey and L. Torczon},
  title = {Telescoping Languages: A Strategy for Automatic Generation of 
             Scientific Problem-Solving Systems from Annotated Libraries},
  journal = {Journal of Parallel and Distributed Computing},
  year = 2001,
  month = dec,
  volume = {61},
  pages = {1803--1826},
  abstract = {
    As machines and programs have become more complex, the process of
    programming applications that can exploit the power of
    high-performance systems has become more difficult and
    correspondingly more labor-intensive.  This has substantially
    widened the \emph{software gap}---the discrepancy between the need
    for new software and the aggregate capacity of the workforce to
    produce it.  This problem has been compounded by the slow growth
    of programming productivity, especially for high-performance
    programs, over the past two decades.

    One way to bridge this gap is to make it possible for
    end  users to develop programs in high-level domain-specific
    programming systems.  In the past,  a major impediment to the
    acceptance of 
    such systems  has been the poor performance of the
    resulting applications.  To address this problem, we are 
    developing a new compiler-based infrastructure, called \emph{TeleGen},
    that will make it practical to construct efficient 
    domain-specific high-level languages from
    annotated component libraries. These languages are called 
    \emph{telescoping languages}, because they can be nested within 
    one another.  

    For programs written in telescoping languages, high performance and
    reasonable compilation times  can be achieved by exhaustively 
    analyzing the component libraries in advance to produce a language 
    processor that recognizes and optimizes library operations as 
    primitives in the language. The key to making this strategy 
    practical is to keep compile times low by generating a custom 
    compiler with extensive built-in knowledge of the underlying libraries.
    The goal is to achieve compile times that are linearly
    proportional to the size of the program presented by the user, rather than 
    to the aggregate size of that program plus the base libraries.}
}
@article{MFMT:TJS02,
  author = {John Mellor-Crummey and Robert Fowler and 
               Gabriel Marin and Nathan Tallent},
  title = {{HPCView:} A tool for top-down analysis of node performance},
  journal = {The Journal of Supercomputing},
  volume = 23,
  pages = {81--104},
  year = 2002,
  note = {Extended version. Special Issue of
          selected papers from the 
          2001 Los Alamos Computer Science Institute Symposium}
}
@conference{MFM:LACSI01,
  author = {John Mellor-Crummey and Robert Fowler and Gabriel Marin},
  title = {{HPCView:} A tool for top-down analysis of node performance},
  booktitle = {Proceedings of the Los Alamos Computer Science 
               Institute Second Annual Symposium},
  address = {Santa Fe, NM},
  month = oct,
  year = 2001,
  note = {Distributed on CD-ROM},
  ps = {papers/hpcview-lacsi01.ps.gz},
  pdf = {papers/hpcview-lacsi01.pdf},
  abstract = {Although it is increasingly difficult for large
    scientific programs to attain a significant fraction of peak
    performance on systems based on microprocessors with substantial
    instruction level parallelism and with deep memory hierarchies,
    performance analysis and tuning tools are still not used on a
    day-to-day basis by algorithm and application designers. We
    present HPCView - a toolkit for combining multiple sets of program
    profile data, correlating the data with source code, and
    generating a database that can be analyzed portably and
    collaboratively with commodity Web browsers. We argue that HPCView
    addresses many of the issues that have limited the usability and
    the utility of most existing tools. We originally built HPCView to
    facilitate our own work on data layout and optimizing
    compilers. Now, in addition to daily use within our group, HPCView
    is being used by several code development teams in DoD and DoE
    laboratories as well as at NCSA.  }
}
@conference{MCFW:ICS2001,
  author = {John Mellor-Crummey and Robert Fowler and David Whalley},
  title = {Tools for Application-Oriented Performance Tuning},
  booktitle = {Proceedings of the International Conference on Supercomputing (ICS2001)},
  month = jun,
  year = 2001,
  address = {Sorrento, Italy},
  pages = {154--165},
  pdf = {papers/hpcview-ics01.pdf},
  ps = {papers/hpcview-ics01.ps.gz},
  abstract = { Application performance tuning is a complex process that
  requires assembling various types of information and correlating it
  with source code to pinpoint the causes of performance
  bottlenecks. Existing performance tools don't adequately support this
  process in one or more dimensions.  We discuss some of the critical
  utility and usability issues for application-level performance
  analysis tools in the context of two performance tools, \textit{MHSim}
  and \textit{HPCView}, that we built to support our own work on data
  layout and optimizing compilers.  \textit{MHSim} is a memory hierarchy
  simulator that produces source-level information not otherwise
  available about memory hierarchy utilization and the causes of cache
  conflicts.  \textit{HPCView} is a tool that combines data from
  arbitrary sets of instrumentation sources and correlates it with
  program source code.  Both tools report their results in
  scope-hierarchy views of the corresponding source code and produce
  their output as HTML databases that can be analyzed portably and
  collaboratively using a commodity browser.  In addition to daily use
  within our group, the tools are being used successfully by several
  code development teams in DoD and DoE laboratories.  }
}
@conference{JMCF:SC01,
  author = {Guohua Jin and John Mellor-Crummey and
    Robert Fowler},
  title = {Increasing Temporal Locality with Skewing
    and Recursive Blocking},
  booktitle = {Proceedings of Supercomputing
    2001},
  address = {Denver, CO},
  month = nov,
  year = 2001,
  ps = {papers/timeskew-sc01.ps.gz},
  pdf = {papers/timeskew-sc01.pdf},
  abstract = {Effective memory hierarchy
    utilization is critical for high performance on modern single- and
    multi- processor systems.  The key to performance on such systems
    is substantial temporal reuse.  This paper focuses on strategies
    for improving temporal reuse in large-scale scientific codes that
    use iterative methods.  We propose a coordinated approach for
    improving multi-level memory hierarchy utilization in such codes.
    We describe prismatic time skewing, a strategy for increasing
    temporal reuse in loop nests by applying multi-level time skewing.
    Novel aspects of this work include multi-dimensional skewing,
    handling for carried data dependences without additional storage,
    bi-directional skewing for handling periodic boundary conditions,
    and an interprocedural analysis and transformation strategy.  We
    combine prismatic skewing with recursive blocking to boost reuse
    at all levels in a memory hierarchy.  A preliminary evaluation of
    our techniques showed performance improvements ranging from 243\%
    to a factor of 15 compared to the original code.  Comparisons with
    time-skewing methods in the literature on these benchmarks show
    that the performance using our techniques is better by 12\% to
    133\%.  With an interprocedural application of our techniques, we
    were able to reduce total primary cache misses of a large
    application code by 23\% to 27\% and secondary cache misses by
    45\% to 119\%. }
}
@conference{MCFW:sigmetrics01,
  author = {John Mellor-Crummey and Robert Fowler and David Whalley},
  title = {On Producing Useful Information for Analyzing and Tuning
               Applications},
  booktitle = {Proceedings of the International Conference
                on Measurement and Modeling of Computer Systems (Sigmetrics 2001)},
  month = jun,
  year = 2001,
  address = {Cambridge, MA},
  pages = {332--333},
  ps = {papers/sigmetrics01poster.ps.gz},
  note = {(poster)}
}
@techreport{CDFMC:rr2001-45,
  author = {Daniel Chavarr\'\i{}a-Miranda and Alain Darte and Robert Fowler
              and John Mellor-Crummey},
  title = {On efficient parallelization of line-sweep
               computations},
  institution = {Laboratoire de l'Informatique
              du Parall{\'e}lisme, {\'E}cole Normale Sup{\'e}rieure de Lyon},
  number = {RR2001-45},
  month = nov,
  year = 2001,
  ps = {papers/RR2001-45.ps},
  pdf = {papers/RR2001-45.pdf},
  abstract = {
   Multipartitioning is a strategy for partitioning multi-dimensional
   arrays among a collection of processors so that line-sweep
   computations can be performed efficiently. With multipartitioning,
   computations that require solving 1-D recurrences along each dimension
   of a multidimensional array can be parallelized effectively. Previous
   techniques for multipartitioning yield efficient parallelizations over
   3D domains only when the number of processors is a perfect
   square. This paper considers the general problem of computing optimal
   multipartitionings for d-dimensional data volumes on an arbitrary
   number of processors. We describe an algorithm that computes an
   optimal multipartitioning for this general case, which enables
   efficient parallelizations of line-sweep computations under arbitrary
   conditions. Finally, we describe a prototype implementation of
   generalized multipartitioning in the Rice dHPF compiler and
   performance results obtained when using it to parallelize a line-sweep
   computation for different numbers of processors.}
}
@conference{CDFM:CPC2001,
  author = {Daniel {Chavarr\'\i{}a-Miranda} and Alain Darte and 
              Robert J. Fowler and John Mellor-Crummey},
  title = {On Efficient Parallelization of Line-Sweep Computations},
  booktitle = {Proceedings of Compilers for Parallel Computers (CPC2001)},
  month = jun,
  year = 2001,
  address = {Edinburgh, Scotland},
  note = {Also available at http://www.icsa.informatics.ed.ac.uk/cpc2001/proceedings.html},
  ps = {papers/CPC2001.ps.gz},
  abstract = {
  Multipartitioning is a strategy for partitioning multi-dimensional
  arrays among a collection of processors so that line-sweep
  computations can be performed efficiently. The principal property of a
  multipartitioned array is that for a line sweep along any array
  dimension, all processors have the same number of tiles to compute at
  each step in the sweep. This property results in full, balanced
  parallelism. A secondary benefit of multipartitionings is that they
  induce only coarse-grain communication.

  Previously, computing a $d$-dimensional multipartitioning required
  that $p^{1/(d-1)}$ be integral, where $p$ is the number of
  processors.  Here, we describe an algorithm to compute a
  $d$-dimensional multipartitioning of an array of $\rho$ dimensions for
  an arbitrary number of processors, for any $d$, $2 \leq d \leq \rho$.
  When using a multipartitioning to parallelize a line sweep
  computation, the best partitioning is the one that exploits all of the
  processors and has the smallest communication volume. To compute the
  best multipartitioning of a $\rho$-dimensional array, we describe a
  cost model for selecting $d$, the dimensionality of the best
  partitioning, and the number of cuts
  along each partitioned dimension.  In practice, our technique will
  choose a 
  $3$-dimensional 
  multipartitioning for a 
  $3$-dimensional 
  line-sweep computation, except when $p$ is a prime; previously, a
  $3$-dimensional 
  multipartitioning could be applied only when
  $\sqrt{p}$ is integral.

  We describe an implementation of multipartitioning in the Rice dHPF
  compiler 
  and performance results obtained when using it to parallelize a
  line-sweep computation on a range of different numbers of processors.
}
}
@conference{DCFM:IPDPS02,
  author = {Daniel Chavarr\'{\i}a-Miranda and Alain Darte and Robert Fowler
           and John Mellor-Crummey},
  title = {Generalized Multipartitioning for Multi-dimensional Arrays},
  booktitle = {Proceedings of the International Parallel and 
         Distributed Processing Symposium},
  address = {Fort Lauderdale, FL},
  month = apr,
  year = 2002,
  note = {Selected as a Best Paper},
  ps = {papers/multipart-ipdps02.ps},
  pdf = {papers/multipart-ipdps02.pdf},
  abstract = {Multipartitioning is a strategy for parallelizing
    computations that require solving 1D recurrences along each
    dimension of a multi-dimensional array. Previous techniques for
    multipartitioning yield efficient parallelizations over 3D domains
    only when the number of processors is a perfect square. This paper
    considers the general problem of computing multipartitionings for
    d-dimensional data volumes on an arbitrary number of
    processors. We describe an algorithm that computes an optimal
    multipartitioning onto all of the processors for this general
    case. Finally, we describe how we extended the Rice dHPF compiler
    for High Performance Fortran to generate code that exploits
    generalized multipartitioning and show that the compiler's
    generated code for the NAS SP computational fluid dynamics
    benchmark achieves scalable high performance. }
}
@article{MellorCrummeyEtAl:CPE02,
  author = {John Mellor-Crummey and Vikram Adve and Bradley
	Broom and Daniel {Chavarr\'\i{}a-Miranda} and Robert Fowler and
	Guohua Jin and Ken Kennedy and Qing Yi},
  title = {Advanced Optimization Strategies in the {Rice dHPF} Compiler},
  journal = {Concurrency: Practice and Experience},
  year = 2002,
  volume = {14},
  number = {8 \& 9},
  month = aug,
  pages = {741--768},
  abstract = {
To a large extent, today's commercially available HPF compilers have 
failed to deliver scalable parallel performance for a broad spectrum
of applications because of insufficiently powerful compiler
analysis and optimization.
Substantial restructuring and hand-optimization can be required to achieve
acceptable performance with an HPF port of an existing Fortran
application, even for regular data-parallel applications.
A key goal of the Rice dHPF compiler project has been to
develop optimization techniques that enable a wide range of 
existing scientific applications to be ported easily to
efficient HPF with minimal restructuring.
This paper describes the challenges to effective parallelization
presented by complex (but regular) data-parallel applications,
and then describes how the novel analysis and optimization technologies
in the dHPF compiler address these challenges effectively,
without major rewriting of the applications.
We illustrate the techniques by describing their use for
parallelizing the NAS SP and BT benchmarks.
The dHPF compiler generates multipartitioned parallelizations
of these codes that are approaching the scalability and efficiency
of sophisticated hand-coded parallelizations. 
}
}
@conference{FMCJQ:LACSI2002,
  author = {Robert Fowler and John Mellor-Crummey and Guohua Jin and Apan Qasem},
  title = {A Source-to-source Loop Transformation Tool (Extended poster abstract)},
  booktitle = {Proceedings of the Los Alamos Computer Science Institute 
            3rd Annual Symposium},
  month = oct,
  year = 2002,
  address = {Santa Fe, NM},
  note = {Published on CD-ROM.}
}
@conference{FCEZ:HOTOS03,
  author = {Robert Fowler and Alan Cox and Sameh Elnikety and 
             Willy Zwaenepoel},
  title = {Using Performance Reflection in Systems Software},
  booktitle = {Proceedings of USENIX Workshop on  Hot Topics 
         in Operating Systems (HOTOS IX)},
  address = {Lihue, HI},
  month = mar,
  year = 2003,
  note = {Extended abstract.}
}
@incollection{BFKKP:MIT03,
  author = {Bradley Broom and Rob Fowler and Ken Kennedy and 
        Charles Koelbel and Michael Paleczny},
  editor = {Daniel Reed},
  title = {Compiler Support for Out-Of-Core Arrays on Parallel
        Machines},
  pages = {155--174},
  booktitle = {Scalable Input/Output},
  publisher = {MIT Press},
  year = {2003},
  address = {Cambridge, MA},
  month = oct
}
@article{DarteEtAl:JPDS03,
  author = {Alain Darte and John Mellor-Crummey and Robert Fowler 
                and Daniel Chavarr\'\i{}a-Miranda},
  title = {Generalized multipartitioning of multi-dimensional arrays for 
                  parallelizing line-sweep computations},
  journal = {Journal of  Parallel and Distributed Computing},
  year = 2003,
  volume = {63},
  number = {9},
  month = sep,
  pages = {887--911},
  pdf = {http://authors.elsevier.com/sd/article/S0743731503001035},
  abstract = {
Multipartitioning is a strategy for decomposing multi-dimensional
arrays into tiles and mapping the resulting tiles onto a collection of
processors. This class of partitionings enables efficient
parallelization of "line-sweep" computations that solve
one-dimensional recurrences along each dimension of a
multi-dimensional array. Multipartitionings yield balanced parallelism
for line sweeps by assigning each processor the same number of data
tiles to compute at each step of a sweep along any array
dimension. Also, they induce only coarse-grain communication.

This paper considers the problem of computing generalized
multipartitionings, which decompose d-dimensional arrays,
$d \geq 2$, onto an arbitrary number of processors. We
describe an algorithm that computes an optimal multipartitioning onto
all of the processors for this general case. We use a cost model to
select the dimensionality of the best partitioning and the number of
cuts to make along each array dimension; then, we show how to
construct a mapping that assigns the resulting data tiles to each of
the processors. The assignment of tiles to processors induced by this
class of multipartitionings corresponds to an instance of a latin
hyper-rectangle, a natural extension of latin squares, which have been
widely studied in mathematics and statistics.

Finally, we describe how we extended the Rice dHPF compiler for High
Performance Fortran to generate code that employs our strategy for
generalized multipartitioning and show that the compiler's generated
code for the NAS SP computational fluid dynamics benchmark achieves
scalable high performance.
}
}
@conference{MEAD:AMS2004,
  author = {Robert Wilhelmson and Jay Alameda and Kelvin Droegemeier 
    and Michael Folk and  Rob Fowler and Dennis Gannon and Sara Graves 
    and Dale Haidvogel and Parry Husbands and Charles Lee {Isbell Jr.}
    and Dan Weber and Paul Woodward and  Bryant W. York and Sarah 
    Anderson and Brian Jewett and Christopher  Moore and David Nolan 
    and David Porter and Dave Semeraro and Steve Tanner},
  title = {{MEAD} (A Modeling Environment for Atmospheric Discovery)},
  booktitle = {20th International Conference on Interactive Information 
   and Processing Systems {(IIPS)} for Meteorology, Oceanography, 
   and Hydrology},
  month = jan,
  year = 2004,
  address = {Seattle, {WA}},
  url = {http://ams.confex.com/ams/84Annual/20IIPS/abstracts/73057.htm},
  note = {in conjunction with the 84th AMS annual meeting},
  abstract = {
  The goal of the MEAD Expedition is the development and adaptation of
Grid and TeraGrid-enabled cyberinfrastructure for enabling ensemble or
very large domain model simulations coupled with data handling,
analysis, data mining, and visualization services. This includes a
dynamic workflow and data management environment applicable in a
variety of fluid flow modeling environments. The specific applications
chosen for MEAD are mesoscale storm and hurricane research and
education. The MEAD Expedition is a cyberinfrastructure proving ground
that has been funded for two years by the National Computational
Science Alliance, an NSF PACI program. The MEAD project is documented
at http://www.ncsa.uiuc.edu/AboutUs/FocusAreas/MEADExpedition.html.
}
}
@article{KennedyEtAl:PIEEE05,
  author = {Ken Kennedy and Bradley Broom and Arun Chauhan and Robert Fowler and John Garvin
              and Charles Koelbel and Cheryl McCosh and John Mellor-Crummey},
  title = {Telescoping Languages: A System for Automatic Generation of Domain Languages},
  journal = {Proceedings of the {IEEE}},
  year = 2005,
  volume = {93},
  number = {2},
  month = feb,
  pages = {387--408},
  abstract = {
The software gap - the discrepancy between the need for new software
and the aggregate capacity of the workforce to produce it - is a
serious problem for scientific software. Although users appreciate the
convenience (and, thus, improved productivity) of using relatively
high-level scripting languages, the slow execution speeds of these
languages remain a problem. Lower level languages, such as C and
Fortran, provide better performance for production applications, but
at the cost of tedious programming and optimization by experts. If
applications written in scripting languages could be routinely
compiled into highly optimized machine code, a huge productivity
advantage would be possible. It is not enough, however, to simply
develop excellent compiler technologies for scripting languages (as a
number of projects have succeeded in doing for MATLAB). In practice,
scientists typically extend these languages with their own
domain-centric components, such as the MATLAB signal processing
toolbox. Doing so effectively is a challenge. Our approach calls for using a
library-preprocessing phase to extensively analyze and optimize
collections of libraries that define an extended language. Results of
this analysis are collected into annotated libraries and used to
generate a library-aware optimizer. The generated library-aware
optimizer uses the knowledge gathered during preprocessing to carry
out fast and effective optimization of high-level scripts. This
enables script optimization to benefit from the intense analysis
performed during preprocessing without repaying its price. Since
library preprocessing is performed only at infrequent
"language-generation" times, its cost is amortized over many
compilations of individual scripts that use the library. We call this
strategy "telescoping languages" because it merges knowledge of a
hierarchy of extended languages into a single library-aware
optimizer. We present our vision and plans for compiler frameworks
based on telescoping languages and report on the preliminary
research that has established the effectiveness of this approach.}
}
@conference{FMCF:ICS2005,
  author = {Nathan Froyd and John Mellor-Crummey
	and Robert Fowler},
  title = {Efficient Call-stack Profiling of
	Unmodified, Optimized Code},
  booktitle = {Proceedings of the
	International Conference on Supercomputing (ICS2005)},
  month = jun,
  year = 2005,
  address = {Cambridge, {MA}},
  pages = {81--90},
  abstract = { 
Call path profiling associates resource consumption with the calling
context in which resources were consumed.  We describe the design and
implementation of a low-overhead call path profiler based on stack
sampling.  The profiler uses a novel sample-driven strategy for
collecting frequency counts for call graph edges without instrumenting
every procedure's code to count them.  The data structures and
algorithms used are efficient enough to construct the complete calling
context tree exposed during sampling.  The profiler leverages
information recorded by compilers for debugging or exception handling
to record call path profiles even for highly-optimized code. We
describe an implementation for the Tru64/Alpha platform.  Experiments
profiling the SPEC CPU2000 benchmark suite demonstrate the low
(2\%-7\%) overhead of this profiler.  A comparison with
instrumentation-based profilers, such as {\tt xgprof}, shows that for
call-intensive programs, our sampling-based strategy for call path
profiling has over an order of magnitude lower overhead.}
}
@conference{FTMF:gcc2006,
  author = {Nathan Froyd and Nathan Tallent and John Mellor-Crummey
	and Rob Fowler},
  title = {Call path profiling for unmodified, optimized binaries},
  booktitle = {Proceedings of the
	GCC and GNU Toolchain Developers' Summit},
  month = jun,
  year = 2006,
  address = {Ottawa, Canada},
  pdf = {papers/FTMF-gcc-summit-2006.pdf},
  abstract = { 
Although gprof has long been the standard for call graph profiling in
the GNU toolchain, it suffers from several shortcomings. First, in
modern object-oriented programs, costs must be attributed to full
calling contexts because the cost of function calls may be context
dependent; gprof ignores this issue. Second, gprof uses
instrumentation in procedure prologues to gather performance
data. Gprof's instrumentation-based profiling imposes four costs: (1)
recompilation is used to add instrumentation, (2) the presence of
instrumentation distorts the measurements taken, (3) the overhead of
instrumentation can significantly increase running time, and (4) the
presence of instrumentation can weaken compiler optimization. We have
developed a call-path profiler that avoids all of these shortcomings
by measuring the performance of unmodified, fully-optimized
binaries. Rather than inserting instrumentation, we use periodic
sampling of hardware performance counters and stack unwinding to
attribute samples to calling contexts and collect frequency counts for
call graph edges. Experiments with the SPEC CPU2000 benchmark suite
yield good accuracy and low (2\%-7\% on Tru64/Alpha) overheads at 1000
samples/sec.

A call-path profiler based on stack sampling needs information to
unwind the stack at any point in the execution. In particular, it
requires precise information about procedure epilogues -- more
information than is required to handle C++ exceptions. In this paper,
we describe the changes we have made to GCC to emit such information
and we present results of experiments on the x86-64 platform. Accuracy
and overhead are comparable to the measurements made on
Tru64/Alpha. For profiling binaries that lack sufficient information,
we propose to augment binutils to collect the necessary information
using binary analysis.  
},
  url = {http://www.gccsummit.org/}
}
@inproceedings{GFR:lacsiposter2006,
  address = {Santa Fe, NM},
  author = {Todd Gamblin and Rob Fowler and Daniel A. Reed},
  booktitle = {Proceedings of the Los Alamos Computer Science Institute 
            7th Annual Symposium},
  month = oct,
  title = {Runtime Methods for Automatic Behavioral Stratification of
                 Scientific Codes},
  year = 2006,
  note = {(Best Poster Award)}
}
@article{ZFetal:scidac2007,
  author = {Y. Zhang and R. Fowler and K. Huck and A. Malony and 
A. Porterfield and D. Reed and S. Shende and V. Taylor and X. Wu},
  title = {{US QCD} Computational Performance Studies with {PERI}},
  journal = {Journal of Physics: Conference Series},
  year = 2007,
  volume = {78},
  number = {012083},
  month = aug,
  pages = {5pp}
}
@inproceedings{Gamblin2007poster,
  address = {Reno, NV},
  author = {Todd Gamblin and Prasun Ratn and Bronis R. de Supinski
           and Martin Schulz and Frank Mueller and Robert J. Fowler and Daniel A. Reed},
  booktitle = {Supercomputing 2007 {(SC'07)}},
  month = nov,
  title = {An Open Framework for Scalable, Reconfigurable Performance Analysis},
  year = {2007},
  note = {(Poster)}
}
@incollection{FGKMPRR:allgrids2007,
  author = {Robert J. Fowler and Todd Gamblin and Gopi Kandaswamy and Anirban Mandal    
                  and Allan K. Porterfield and Lavanya Ramakrishnan and Daniel A. Reed},
  editor = {Lucio Grandinetti},
  title = {Challenges of Scale: When All Computing Becomes Grid Computing},
  publisher = {IOS Press},
  month = mar,
  year = 2008,
  booktitle = {High Performance Computing and Grids in Action},
  series = {Advances in Parallel Computing},
  address = {Amsterdam},
  isbn = {978-1-58603-839-7}
}
@inproceedings{GFR:ipdps2008,
  author = {Todd Gamblin and Rob Fowler and Daniel A. Reed},
  title = {Scalable Methods for Monitoring and Detecting Behavioral
Classes in Scientific Codes},
  booktitle = {Proceedings of the International Parallel and 
                Distributed Processing Symposium 2008},
  year = {2008},
  month = apr,
  address = {Miami, FL},
  abstract = {
Emerging petascale systems will have many hundreds of thousands of
processors, while traditional task-level tracing tools already fail to
scale to much smaller systems because the I/O backbones of these systems
are not configured to handle the peak offered load of all
their cores.  Complete event traces of all processes are thus
infeasible.  To retain the benefits of detailed performance
measurement while reducing volume of collected data, we developed
AMPL, a general-purpose toolkit that reduces data volume using
stratified sampling.

We adopt a scalable sampling strategy, since the sample size required
to measure a system varies sub-linearly with process count. By
grouping, or {\em stratifying}, processes that behave similarly, we
can further reduce data overhead while also providing insight into an
application's behavior.

In this paper, we describe the AMPL toolkit and we report our
experiences using it on large-scale scientific applications.  We show
that AMPL can successfully reduce the overhead of tracing scientific
applications by an order of magnitude or more, and we show that our
tool scales sub-linearly, so the improvement will be more dramatic on
petascale machines.  Finally, we illustrate the use of AMPL to monitor
applications by performance-equivalent {\em strata}, and we show that
this technique can allow for further reductions in trace data volume
and traced execution time.}
}
@article{BLetal:CTQ2007,
  author = {David Bailey and Robert Lucas {\em et al.}},
  xxauthor = {David Bailey and Robert Lucas 
and Paul Hovland
and Boyana Norris
and Kathy Yelick
and Dan Gunter
and Bronis de Supinski
and Dan Quinlan
and Pat Worley
and Jeff Vetter
and Phil Roth
and John Mellor-Crummey
and Allan Snavely
and Jeff Hollingsworth
and Dan Reed
and Rob Fowler
and Ying Zhang
and Mary Hall
and Jacque Chame
and Jack Dongarra
and Shirley Moore},
  title = {Performance Engineering: Understanding and Improving
  the Performance of Large-Scale Codes},
  journal = {CTWatch Quarterly},
  year = {2007},
  volume = {3},
  number = {4},
  pages = {18--23},
  month = nov,
  abstract = {Achieving good performance on
  high-end computing systems is growing ever more challenging due to
  enormous scale, increasing architectural complexity, and increasing
  application complexity. To address these challenges in DOE's
  SciDAC-2 program [1], the Performance Engineering Research Institute
  (PERI) has embarked on an ambitious research plan encompassing
  performance modeling and prediction, automatic performance
  optimization and performance engineering of high profile
  applications. The principal new component is a research activity in
  automatic tuning software, which is spurred by the strong user
  preference for automatic tools.},
  pdf = {papers/BL_PERI_CT_Watch2007.pdf}
}
@inproceedings{dSFGMRAS:sthec08,
  author = {Bronis R. {de Supinski} and Rob Fowler and  Todd Gamblin
              and Frank Mueller and Prasun Ratn and Martin Schulz},
  title = {An Open Infrastructure for Scalable, Reconfigurable Analysis},
  booktitle = {International Workshop on Scalable Tools for High-End 
          Computing ({STHEC} 2008)},
  year = {2008},
  optaddress = {Kos, Greece},
  month = jul,
  organization = {ACM/SIGARCH}
}
@techreport{LFRT2008:tr0802,
  author = {Howard M. Lander and Robert J. Fowler and Lavanya
           Ramakrishnan and Steven R. Thorpe},
  title = {Stateful Grid Resource Selection for Related Asynchronous Tasks},
  institution = {RENCI},
  year = {2008},
  number = {TR-08-02},
  address = {North Carolina},
  month = apr,
  optnote = {also submitted for publication},
  abstract = {In today's grid deployments, resource selection is based
on the prior knowledge of the performance characteristics
of the application on a particular resource and on real-time
monitoring status of the resource such as load on the system,
network bandwidth, etc. Any lag between a resource
selection decision and the time the job appears in the system's
monitoring facility will cause subsequent decisions to
be based on incorrect information. If two or more jobs arrive
within this hysteresis window, the incorrect assessment
of system state can have negative consequences on job response
time and system throughput. In this paper we describe
a stateful resource selection protocol we designed
to mitigate this problem for a real time storm surge modeling
project. We present results from real experiments on
a regional grid. We use emulation to compare and study
the effect of our protocol under varying load conditions.
Based on our evaluation we argue that the enhanced protocol
should be made available as a globally-aware grid
resource selection service.},
  url = {http://www.renci.org/publications/techreports/TR0802.pdf}
}
@inproceedings{AFN:mmcs08,
  author = {Allan Porterfield and Robert Fowler and Mark Neyer},
  title = {{MAESTRO:} Dynamic Runtime Power Control},
  booktitle = {Workshop on Managed Multicore Systems ({MMCS})},
  year = {2008},
  address = {Boston, MA},
  month = jun,
  optorganization = {{ACM/IEEE}},
  abstract = {Microprocessors are increasing cores quickly enough
that for many applications the microprocessor will provide
greater computational resources than the memory system
can supply data. Performance of even simple benchmarks
is noticeably impacted by co-scheduling multiple copies on
the cores of a current microprocessor system. Runtimes,
such as MAESTRO, can use the excess computational resources
to reduce power consumption of memory bound
applications with little performance impact. By using hardware
performance counters for the shared resources from a
dedicated core, chip-wide bottlenecks are detected. When
a threshold is exceeded, the frequency (and power) of the
cores are reduced. MAESTRO is prototyped on an AMD
Phenom as a daemon and currently detects high miss rates
for the L3 shared cache and reduces the processor frequency
until the rate drops below a second threshold. On
a desktop system, running at low frequency can save up to
36\% of the total power consumed. MAESTRO allows single
core jobs to compute at full frequency and saves power
when the bottlenecks exist during parallel execution. Performance
degradation is currently 20-25\% but is expected
to fall as MAESTRO is tuned and the core counts on a chip
increase.}
}
@inproceedings{TRF:cluster08,
  author = {Jeffrey L. Tilson and Mark S.C. Reed and Robert J. Fowler},
  title = {Workflows for Performance Evaluation and Tuning},
  booktitle = {Proceedings of the 2008 {IEEE} International Conference on
               Cluster Computing (Cluster 2008)},
  pages = {8pp},
  year = 2008,
  address = {Tsukuba, Japan},
  month = sep,
  organization = {IEEE}
}
@inproceedings{GDSFR:SC08,
  author = {Todd Gamblin and Bronis R. de~Supinski and Martin Schulz
        and Rob Fowler and Daniel A. Reed},
  title = {Scalable Load-Balance Measurement for {SPMD} Codes},
  booktitle = {Proceedings of Supercomputing 2008},
  year = 2008,
  address = {Austin, TX},
  month = nov,
  organization = {{ACM/IEEE}}
}
@article{FGetal:scidac2008,
  author = {Robert J. Fowler and Todd Gamblin and Allan K. Porterfield
     and Patrick Dreher and Song Huang and Balint Joo},
  title = {Performance engineering challenges: the view from {RENCI}},
  journal = {Journal of Physics: Conference Series},
  year = 2008,
  pages = {5pp}
}
@techreport{LFRT2008:tr0807,
  author = {Allan Porterfield and Robert J. Fowler and  Anirban Mandal and Min Yeol Lim},
  title = {Performance Consistency on Multi-socket AMD Opteron Systems},
  institution = {RENCI},
  year = {2008},
  number = {TR-08-07},
  address = {North Carolina},
  month = dec,
  abstract = {Compute nodes with multiple sockets each of which has
multiple cores are starting to dominate in the area of scientific
computing clusters. Performance inconsistencies from one execution to
the next make any performance debugging or tuning difficult. The
resulting performance inconsistencies are bigger for memory-bound
applications but still noticeable for all but the most
compute-intensive applications. Memory and thread placement across
sockets has significant impact on performance of these systems. We
test overall performance and performance consistency for a number of
OpenMP and pthread benchmarks including Stream, pChase, the NAS
Parallel Benchmarks and SPEC OMP. The tests are run on a variety of
multi-socket quad-core AMD Opteron systems. We examine the benefits of
explicitly pinning each thread to a different core before any data
initialization, thus improving and reducing the variability of
performance due to data-to-thread co-location. Execution time
variability falls to less than 2\% and for one memory-bound application
peak performance increases over 40\%. For applications running on
hundreds or thousands of nodes, reducing variability will improve load
balance and total application performance. Careful memory and thread
placement is critical for the successful performance tuning of nodes
on a modern scientific compute cluster.},
  url = {http://www.renci.org/publications/techreports/TR-08-07.pdf},
  note = {(Submitted for publication)}
}
@techreport{LFRT2009:tr0901,
  author = {Allan Porterfield and Rob Fowler and Anirban Mandal 
         and Min Yeol Lim},
  title = {Empirical Evaluation of Multi-Core Memory Concurrency},
  institution = {RENCI},
  year = {2009},
  number = {TR-09-01},
  address = {Chapel Hill, North Carolina},
  month = jan,
  abstract = {Multi-socket, multi-core computers are becoming
ubiquitous, especially as nodes in compute clusters of all
sizes. Common memory benchmarks and memory performance models treat
memory as characterized by well-defined maximum bandwidth and average
latency parameters. In contrast, current and future systems are based
on deep hierarchies and NUMA memory systems, which are not easily
described this simply. Memory performance characterization of
multi-socket, multi-core systems requires measurements and models more
sophisticated than simple peak bandwidth/minimum latency
models. To investigate this issue, we performed a detailed
experimental study of the memory performance of a variety of AMD
multi-socket quad-core systems. We used the pChase benchmark to
generate memory system loads with a variable number of concurrent
memory operations in the system across a variable number of threads
pinned to specific chips in the system. While processor differences
had minor but measurable impact on bandwidth, the make-up and
structure of the memory has major impact on achievable bandwidth. Our
experiments exposed 3 different bottlenecks at different levels of the
hardware architecture: limits on the number of references outstanding
per thread; limits to the memory requests serviced by a single memory
channel; and limits on the total global memory references outstanding
were observed. We discuss the impact of these limits on constraints in
tuning code for these systems, the impact on compilers and operating
systems, and on future system implementation decisions.},
  url = {http://www.renci.org/publications/techreports/TR-09-01.pdf}
}
@inproceedings{PNF2009,
  author = {Allan Porterfield and Nassib Nassar and Rob Fowler},
  title = {Multi-Threaded Library for Many-Core Systems},
  booktitle = {Workshop on Multithreaded Architectures and Applications},
  year = 2009,
  address = {Rome, Italy},
  month = may,
  organization = {IEEE}
}
@article{Fowleretal:Scidac09,
  author = {Robert Fowler and L. Adhianto and Bronis de Supinski and
                  Michael Fagan and Todd Gamblin and Mark Krentel and John
                  Mellor-Crummey and Martin Schulz and Nathan Tallent},
  title = {Frontiers of performance analysis on leadership-class systems},
  journal = {Journal of Physics: Conference Series},
  year = {2009},
  volume = {180},
  doi = {10.1088/1742-6596/180/01241}
}
@inproceedings{BLFA2010:powermon,
  author = {Daniel Bedard and Min Yeol Lim and Robert Fowler and Allan
                  Porterfield},
  title = {{PowerMon}:  Fine-Grained and Integrated Power Monitoring
                   for Commodity Computer Systems},
  booktitle = {Proceedings of SoutheastCon 2010},
  year = 2010,
  address = {Charlotte, NC},
  month = mar,
  organization = {{IEEE}}
}
@inproceedings{MFP2010a:ISPASS,
  author = {Anirban Mandal and Rob Fowler and Allan Porterfield},
  title = {Modeling Memory Concurrency for Multi-Socket Multi-Core Systems},
  booktitle = {Proceedings of the 2010 {IEEE} International Symposium on Performance Analysis of Systems and Software {(ISPASS2010)}},
  year = {2010},
  address = {White Plains, NY},
  month = mar,
  organization = {IEEE},
  pages = {56--75}
}
@inproceedings{GDSFR2010:ICS,
  author = {Todd Gamblin and Bronis de~Supinski and Martin Schulz and Rob Fowler and Daniel Reed},
  title = {Efficiently Clustering Performance Data at Massive Scales},
  booktitle = {Proceedings of the International Conference on
             Supercomputing 2010 {(ICS2010)}},
  year = {2010},
  address = {Tsukuba, Japan},
  month = jun,
  organization = {ACM}
}
@inproceedings{softpower_hpdc_2010,
  author = {Min Yeol Lim and Allan Porterfield and Robert Fowler},
  title = {{SoftPower: Fine-Grain Power Estimations Using Performance 
Counters}},
  booktitle = {The ACM International Symposium on High Performance 
Distributed Computing (HPDC)},
  year = {2010},
  month = jul,
  publisher = {ACM},
  address = {Chicago, IL},
  note = {Best short paper award}
}
@techreport{AM:Implications,
  author = {Anirban Mandal and Min Yeol Lim and Allan Porterfield and Rob Fowler},
  title = {Effects of Multi-core Memory Concurrency Limits on Multi-threaded Applications},
  institution = {RENCI},
  number = {TR-10-03},
  address = {Chapel Hill, NC},
  year = {2010}
}
@inproceedings{lcpc10,
  author = {Anirban Mandal and Min Yeol Lim and Allan Porterfield and Robert Fowler},
  booktitle = {International Workshop on Languages and Compilers for Parallel Computing (LCPC'10)},
  note = {Poster},
  title = {Implications for Applications and Compilers of Multi-core Memory Concurrency},
  year = {2010}
}
@article{Tilson2011131,
  title = {{CI} potential energy curves for three states of {RuO2+}},
  journal = {Chemical Physics Letters},
  volume = {516},
  number = {4--6},
  pages = {131--136},
  year = {2011},
  issn = {0009-2614},
  doi = {10.1016/j.cplett.2011.09.075},
  url = {http://www.sciencedirect.com/science/article/pii/S0009261411012176},
  author = {Jeffrey L. Tilson and Walter C. Ermler and Robert J. Fowler}
}
@inproceedings{MFP12:WHIST,
  author = {Anirban Mandal and Robert Fowler and Allan Porterfield},
  title = {System-wide Introspection for Accurate Attribution of
Performance Bottlenecks},
  booktitle = {Workshop on High-performance Infrastructure for Scalable Tools (WHIST)},
  year = {2012},
  address = {Venice, Italy},
  month = jun
}
@conference{ETF:SWRMACS12,
  author = {Walter Ermler and Jeffrey Tilson and Robert J. Fowler},
  title = {Spin-orbit configuration interaction calculations of 
     electronic spectra of {RuO2+} and {OsO2+} catalytic cores},
  booktitle = {{Southwest Regional Meeting of the American 
           Chemical Society (SWRMACS 2012)}},
  address = {Baton Rouge, LA},
  year = 2012,
  abstract = {Low-lying potential energy curves of RuO2+ and OsO2+
                  catalytic core molecules are analyzed using
                  large-scale spin-orbit configuration interaction
                  (SOCI) calculations based on multireference
                  molecular wavefunctions. Relativistic effects are
                  included using effective core potentials. The large
                  spin-orbit splitting energies of the 4d and 5d
                  subshells of Ru and Os require that the spin-orbit
                  coupling operator be included when calculating the
                  electronic spectra. The ground states of both
                  molecules are triply bonded systems of 0+(1Σ+)
                  symmetry having bond lengths, harmonic frequencies
                  and dissociation energies of 1.63 and 1.70 Å, 757
                  and 772 cm-1, and 83.8 and 97.2 kcal/mol,
                  respectively. These results are consistent with
                  experimental observation that the ground state of
                  complexed RuO2+ is diamagnetic.}
}
@conference{CBFV:IPDPS13,
  author = {Jee Whan Choi and Daniel Bedard 
            and Robert Fowler and Richard Vuduc},
  title = {A roofline model of energy},
  booktitle = {Proceedings of the International Parallel and 
         Distributed Processing Symposium {(IPDPS13)}},
  address = {Boston, MA},
  month = may,
  year = 2013
}
@conference{HSMKPFB:ROSS2013,
  author = {Kevin Huck and Sameer Shende and Allen Malony and Hartmut Kaiser
      and Allan Porterfield and Rob Fowler and Ron Brightwell},
  title = {An Early Prototype of an Autonomic Performance Environment 
         for Exascale},
  booktitle = {International Workshop on Runtime and Operating Systems
         at Scale {(ROSS2013)}},
  address = {Eugene, OR},
  month = jun,
  year = 2013
}
@conference{PFBW2013:E2SC,
  author = {Allan Porterfield and Rob Fowler and Sridutt Bhalachandra
                  and Wei Wang},
  title = {{OpenMP} and {MPI} Application Energy Measurement Variation},
  year = 2013,
  month = nov,
  address = {Denver, CO},
  booktitle = {1st International Workshop on Energy Efficient
      SuperComputing {(E2SC)}}
}
@conference{PFBRDL2015,
  author = {Allan Porterfield and Rob Fowler and Sridutt Bhalachandra and 
        Barry Rountree and Diptorup Deb and Robert Lewis},
  title = {Application Runtime Variability and Power Optimization 
       for Exascale Computers},
  booktitle = {International Workshop on Runtime and Operating Systems
         at Scale {(ROSS2015)}},
  address = {Portland, OR},
  month = jun,
  year = 2015
}
@conference{HPCKMS2015:SCF,
  author = {Kevin A. Huck and Allan Porterfield and Rob Fowler and Nick Chaimov and Hartmut Kaiser and Allen D. Malony and Thomas Sterling},
  title = {An Autonomic Performance Environment for Exascale},
  booktitle = {Supercomputing Frontiers 2015},
  address = {Singapore},
  month = mar,
  year = 2015
}
@conference{RMCFTBX15,
  author = {Paul Ruth and Anirban Mandal and Claris Castillo and Robert Fowler and
        Jeff Tilson and Ilya Baldin and Yufeng Xin},
  title = {Achieving Performance Isolation on Multi-Tenant Networked Clouds Using Advanced Block Storage Mechanisms},
  address = {Portland, OR},
  year = 2015,
  month = jun,
  pages = {29--32},
  booktitle = {6th Workshop on Scientific Cloud Computing ({ScienceCloud'15})},
  doi = {10.1145/2755644.2755649}
}
@article{JSFI64,
  author = {Kevin Huck and Allan Porterfield and Nick Chaimov and Hartmut Kaiser and Allen Malony and Thomas Sterling and Rob Fowler},
  title = {An Autonomic Performance Environment for Exascale},
  journal = {Supercomputing Frontiers and Innovations},
  volume = {2},
  number = {3},
  year = {2015},
  abstract = {Exascale systems will require new approaches to
                  performance observation, analysis, and runtime
                  decision-making to optimize for performance and
                  efficiency. The standard ``first-person'' model, in
                  which multiple operating system processes and
                  threads observe themselves and record first-person
                  performance profiles or traces for offline analysis,
                  is not adequate to observe and capture interactions
                  at shared resources in highly concurrent, dynamic
                  systems. Further, it does not support mechanisms for
                  runtime adaptation. Our approach, called APEX
                  (Autonomic Performance Environment for eXascale),
                  provides mechanisms for sharing information among
                  the layers of the software stack, including
                  hardware, operating and runtime systems, and
                  application code, both new and legacy. The
                  performance measurement components share information
                  across layers, merging first-person data sets with
                  information collected by third-person tools
                  observing shared hardware and software states at
                  node- and global-levels. Critically, APEX provides a
                  policy engine designed to guide runtime adaptation
                  mechanisms to make algorithmic changes, re-allocate
                  resources, or change scheduling rules when
                  appropriate conditions occur.},
  issn = {2313-8734},
  url = {http://superfri.org/superfri/article/view/64}
}
@inproceedings{PBWF:IPDPS2016,
  author = {Allan Porterfield and Sridutt Bhalachandra and Wei Wang
        and Robert Fowler},
  title = {Variability: A Tuning Headache},
  year = {2016},
  booktitle = {IEEE International Parallel and Distributed Processing
         Symposium Workshops (IPDPSW)},
  address = {Chicago, IL},
  month = may,
  organization = {IEEE},
  abstract = {Performance tuning is an ongoing activity at most HPC
                  sites. Small performance improvements can save
                  thousands of dollars. Run-to-run performance
                  variations significantly impact performance
                  tuning. Not being able to tell which code version is
                  faster (or more energy efficient) in a single run
                  greatly increases the computational expense and
                  uncertainty for the programmer. We will show
                  examples where autotuning frameworks could easily
                  choose a sub-optimal kernel. We will also examine
                  the difficulty of optimizing a real-world HPC
                  application.},
  doi = {10.1109/IPDPSW.2016.73}
}
@inproceedings{DFP:LCPC2016,
  author = {Diptorup Deb and Robert Fowler and Allan Porterfield},
  title = {{QUARC:} An Array Programming Approach to
High Performance Computing},
  year = {2016},
  month = sep,
  booktitle = {29th International Workshop on Languages and Compilers
             for Parallel Computing {(LCPC 2016)}},
  address = {Rochester, NY},
  volume = 10136,
  doi = {10.1007/978-3-319-52709-3},
  isbn = {978-3-319-52708-6},
  publisher = {Springer International Publishing},
  series = {Lecture Notes in Computer Science},
  abstract = {We present {QUARC}, a framework for the optimized
                  compilation of domain-specific extensions to
                  C++. Driven by needs for programmer productivity and
                  portable performance for lattice {QCD}, the framework
                  focuses on stencil-like computations on arrays with
                  an arbitrary number of dimensions. {QUARC} uses a
                  template meta-programming front end to define a
                  high-level array language. Unlike approaches that
                  generate scalarized loop nests in the front end, the
                  instantiation of {QUARC} templates retains high-level
                  abstraction suitable for optimization at the object
                  (array) level.  The back end compiler {(CLANG/LLVM)}
                  is extended to implement array transformations such
                  as transposition, reshaping, and partitioning for
                  parallelism and for memory locality prior to
                  scalarization.}
}
@inproceedings{Idazaketal:CSESSP,
  author = {R. Idaszak and R. Arthur and R. Bartlett and I. Baxter and
  D.E. Bernholdt and R. Boisvert and K. Fecho and R. Fowler and S. Greenspan
  and M.A. Heroux and C. Iancu and C. Kartsaklis and D.S. Katz and Q. Koziol
  and S. Landsberg and E. Lucier and J. McGregor and T. Ndousse-Fetter and
  A. Pawlik and A.I. Reuther and W. Scarborough and W. Schroeder},
  title = {Economics of {CSE} Software Tools},
  booktitle = {Computational Science and Engineering Software Productivity and Sustainability {(CSESSP)} Challenges Workshop Report},
  year = {2016},
  month = oct,
  address = {Washington, DC}
}
@inproceedings{DFP-LLVM-HPC2017,
  author = {Diptorup Deb and Robert J. Fowler and Allan Porterfield},
  title = {{QUARC:} An Optimized {DSL} Framework using {LLVM}},
  booktitle = {The Fourth Workshop on the {LLVM} Compiler
             Infrastructure in {HPC (LLVM-HPC2017)}},
  year = {2017},
  month = nov,
  address = {Denver, CO},
  organization = {ACM SIGHPC},
  abstract = {We describe aspects of the implementation of QUARC, a
framework layered on C++ that is used for a domain-specific
language for Lattice Quantum Chromodynamics. It is built on
top of Clang/LLVM to leverage long-term support and
performance portability. QUARC implements a general array
extension to C++ with implicit data parallelism. A notable
innovation is the method for using templates to capture
and encode the high-level abstractions and to communicate
these abstractions transparently to LLVM through an
unmodified Clang. Another notable feature is a general array
transformation mechanism used to improve memory hierarchy
performance and maximize opportunities for vectorization.
This reshapes and transposes arrays of structures containing
nested complex arrays into arrays of structures of arrays. We
discuss an example for which QUARC-generated code has
performance competitive with the very best hand-optimized
libraries.}
}
@inproceedings{BPOPF17,
  author = {Sridutt Bhalachandra and Allan Porterfield and
            Stephen L. Olivier and Jan F. Prins and Robert J. Fowler},
  title = {Improving Energy Efficiency in Memory-constrained Applications Using
      Core-specific Power Control},
  booktitle = {5th International Workshop on Energy Efficient Supercomputing 
      (E2SC)},
  year = {2017},
  month = nov,
  address = {Denver, CO},
  organization = {{ACM SIGHPC}},
  abstract = {Power is increasingly the limiting factor in High
Performance Computing (HPC) at Exascale and will continue
to influence future advancements in supercomputing to mitigate
large operating costs and carbon footprints. Recent processors
equipped with on-board hardware counters allow real-time
monitoring of operating conditions such as energy and temperature,
in addition to performance measures such as instructions retired
and memory accesses. Significantly, recent processors also provide
the ability to dynamically control processor power utilization at
the per-core level.
We present an experimental memory study on modern CPU
architectures, Intel Sandybridge and Haswell, to identify
opportunities to reduce CPU frequency and save energy with minimal
performance impact. Using on-board hardware counters, we
identify a metric, TORo core, that detects bandwidth saturation
and increased latency in the memory system. This metric is then
used to construct a dynamic policy to modulate per-core power
controls on Haswell machines.
The policy is evaluated when applied at coarse- and fine-grained
levels on six MPI mini-applications. The best energy
savings with the coarse- and fine-grained application of the
dynamic policy are 32.1\% and 19.5\% respectively, with a 2\%
increase in execution time in both cases. On average, the
fine-grained dynamic policy yields a 1\% speedup, while the
coarse-grained dynamic policy yields a 3\% slowdown. Energy savings
through frequency reduction not only provide cost advantages,
they also reduce resource contention and create additional
thermal headroom for non-throttled cores that can lead to improved
performance.}
}
@inproceedings{BPOP-IPDPS17,
  author = {Sridutt Bhalachandra and Allan Porterfield and Stephen L. Olivier
                              and Jan F. Prins},
  title = {An Adaptive Core-specific Runtime for Energy
        Efficiency},
  booktitle = {31st {IEEE} International Parallel and Distributed
         Processing Symposium {(IPDPS)}},
  year = {2017},
  month = may,
  address = {Orlando, FL},
  pages = {947--956},
  organization = {{IEEE}}
}
