% -------------------------------------------------------------------------
% These BibTeX bibliographic entries for the 26th
% INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE
% (1999) were created by Rebecca Hoffman and Mark D. Hill
% from author input.
%
% These entries are correct to the best of our knowledge,
% but we accept no responsibility for the consequences of
% any errors. Email corrections to hoffman (at) cs.wisc.edu.
% (The address is spelled without an at-sign on purpose: BibTeX treats
% every at-sign outside an entry as the start of a new entry.)
% Last change: Wed Mar 24 13:38:51 CST 1999
%
% -------------------------------------------------------------------------
%
%
% Novel Architecture
%
@inproceedings{barua:maps,
  author    = {R. Barua and W. Lee and S. Amarasinghe and A. Agarwal},
  title     = {Maps: A Compiler-Managed Memory System for Raw Machines},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  pages     = {???},
  year      = {1999},
  month     = {June},
  abstract  = { Abstract not provided by the authors. }
}

@inproceedings{vajapeyam:vectors,
  author    = {Sriram Vajapeyam and P. J. Joseph and Tulika Mitra},
  title     = {Dynamic Vectorization: A Mechanism for Exploiting Far-Flung ILP in Ordinary Programs},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " Several ILP limit studies indicate the presence of considerable ILP across dynamically far-apart instructions in program execution. This paper proposes a hardware mechanism, dynamic vectorization (DV), as a tool for quickly building up a large logical instruction window. Dynamic vectorization converts repetitive dynamic instruction sequences into vector form, enabling the processing of instructions from beyond the corresponding program loop to be overlapped with the loop. This enables vector-like execution of programs with relatively complex static control flow that may not be amenable to static, compile time vectorization. 
Experimental evaluation shows that a large fraction of the dynamic instructions of four of the six SPECInt92 programs can be captured in vector form. Three of these programs exhibit significant potential for ILP improvements from dynamic vectorization, with speedups of more than a factor of 2 in a scenario of realistic branch prediction and perfect memory disambiguation. Under perfect branch prediction conditions, a fourth program also shows well over a factor of 2 speedup from DV. The speedups are due to the overlap of post-loop processing with loop processing. "
}

@inproceedings{goldstein:piperench,
  author    = {Seth C. Goldstein and Herman Schmit and Matthew Moe and Mihai Budiu and Srihari Cadambi and R. Reed Taylor and Ronald Laufer},
  title     = {PipeRench: A Coprocessor for Streaming Multimedia Acceleration},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = { Future computing workloads will emphasize an architecture's ability to perform relatively simple calculations on massive quantities of mixed-width data. This paper describes a novel reconfigurable fabric architecture, PipeRench, optimized to accelerate these types of computations. PipeRench enables fast, robust compilers, supports forward compatibility, and virtualizes configurations, thus removing the fixed size constraint present in other fabrics. For the first time we explore how the bit-width of processing elements affects performance and show how the PipeRench architecture has been optimized to balance the needs of the compiler against the realities of silicon. Finally, we demonstrate extreme performance speedup on certain computing kernels (up to 190x versus a modern RISC processor), and analyze how this acceleration translates to application speedup. }
}

%
% Value Prediction
%
@inproceedings{yoaz:scheduling,
  author    = "A. Yoaz and M. Erez and R. Ronen and S. 
Jourdan",
  title     = {Speculative Techniques for Improving Load Related Instruction Scheduling},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = { Abstract not provided by authors.}
}

@inproceedings{bekerman:predictors,
  author    = {Michael Bekerman and Stephan Jourdan and Ronny Ronen and Gilad Kirshenboim and Lihu Rappoport and Adi Yoaz and Uri Weiser},
  title     = {Correlated Load-Address Predictors},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " As microprocessors become faster, the relative performance cost of memory accesses increases. Bigger and faster caches significantly reduce the absolute load-to-use time delay. However, increase in processor operational frequencies impairs the relative load-to-use latency, measured in processor cycles (e.g. from two cycles on the Pentium\textregistered{} processor to three cycles or more in current designs). Load-address prediction techniques were introduced to partially cut the load-to-use latency. This paper focuses on advanced address-prediction schemes to further shorten program execution time. Existing address prediction schemes are capable of predicting simple address patterns, consisting mainly of constant addresses or stride-based addresses. This paper explores the characteristics of the remaining loads and suggests new enhanced techniques to improve prediction effectiveness: * Context-based prediction to tackle part of the remaining, difficult-to-predict, load instructions. * New prediction algorithms to take advantage of global correlation among different static loads. * New confidence mechanisms to increase the correct prediction rate and to eliminate costly mispredictions. 
* Mechanisms to prevent long or random address sequences from polluting the predictor data structures while providing some hysteresis behavior to the predictions. Such an enhanced address predictor accurately predicts 67\% of all loads, while keeping the misprediction rate close to 1\%. We further prove that the proposed predictor works reasonably well in a deep pipelined architecture where the predict-to-update delay may significantly impair both prediction rate and accuracy. "
}

@inproceedings{calder:svp,
  author    = {B. Calder and G. Reinman and D. Tullsen},
  title     = {Selective Value Prediction},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " Value Prediction is a relatively new technique to increase instruction-level parallelism by breaking true data dependence chains. A value prediction architecture produces values, which may be later consumed by instructions that execute speculatively using the predicted value. This paper examines selective techniques for using value prediction in the presence of predictor capacity constraints and reasonable misprediction penalties. We examine prediction and confidence mechanisms in light of these constraints, and we minimize capacity conflicts through instruction filtering. The latter technique filters which instructions put values into the value prediction table. We examine filtering techniques based on instruction type, as well as giving priority to instructions belonging to the longest data dependence path in the processor's active instruction window. We apply filtering both to the producers of predicted values and the consumers. In addition, we examine the benefit of using different confidence levels for instructions using predicted values on the longest dependence path. 
"
}

%
% Memory
%
@inproceedings{Qiu:trap,
  author    = {Xiaogang Qiu and Michel Dubois},
  title     = {Tolerating Late Memory Traps in ILP Processors},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = { ILP processors can execute a large number of instructions at the same time. Thus it becomes more and more difficult to support traps efficiently. On the other hand a current trend in architecture is to support various memory functions in software rather than hardware, usually by trapping the execution processor on a cache miss, TLB miss or a failed access to a local or remote memory. These late memory traps block the faulting instruction at the top of the active list, backing up the pipeline. Moreover the support for late memory traps may affect the performance of non-faulting memory instructions as well. In this paper we analyze the overhead caused by late memory traps in ILP processors and define several measures for this overhead. In order to tolerate late memory traps, we propose hardware prefetching of exception conditions and a tagged Store buffer to implement deferred traps on Stores. We show that, with these hardware optimizations, the overhead added by the lateness of traps is significantly reduced relative to the overhead of early traps. Because of caching effects the frequency of late memory traps usually decreases as they are taken deeper in the memory hierarchy and their overall impact on the execution time becomes negligible. }
}

@inproceedings{luk_mowry:forwarding,
  author    = "Chi-Keung Luk and Todd C. 
Mowry",
  title     = {Memory Forwarding: Enabling Aggressive Layout Optimizations by Guaranteeing the Safety of Data Relocation},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = { By optimizing data layout at run-time, we can potentially enhance the performance of caches by actively creating spatial locality, facilitating prefetching, and avoiding cache conflicts and false sharing. Unfortunately, it is extremely difficult to guarantee that such optimizations are safe in practice on today's machines, since accurately updating all pointers to an object requires perfect alias information, which is well beyond the scope of the compiler for languages such as C. To overcome this limitation, we propose a technique called memory forwarding which effectively adds a new layer of indirection within the memory system whenever necessary to guarantee that data relocation is always safe. Because actual forwarding rarely occurs (it exists as a safety net), the mechanism can be implemented as an exception in modern superscalar processors. Our experimental results demonstrate that the aggressive layout optimizations enabled by memory forwarding can result in significant speedups---more than twofold in some cases---by reducing the number of cache misses, improving the effectiveness of prefetching, and conserving memory bandwidth. }
}

@inproceedings{cho:decoupling,
  author    = {Sangyeun Cho and Pen-Chung Yew and Gyungho Lee},
  title     = {Decoupling Local Variable Accesses in a Wide-Issue Superscalar Processor},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " Providing adequate data bandwidth is extremely important for a wide-issue superscalar processor to achieve its full performance potential. 
Adding a large number of ports to a data cache, however, becomes increasingly inefficient and can add to the hardware complexity significantly. This paper takes an alternative or complementary approach for providing more data bandwidth, called the data-decoupled architecture. The approach, with support from the compiler and/or hardware, partitions the memory stream into two independent streams early in the processor pipeline, and feeds each stream to a separate memory access queue and cache. Under this model, the paper studies the potential of decoupling memory accesses to program's local variables that are allocated on the run-time stack. Using a set of integer and floating-point programs from the SPEC 95 benchmark suite, it is shown that local variable accesses constitute a large portion of all the memory references, while their reference space is very small, averaging around 7 words per (static) procedure. To service local variable accesses quickly, two optimizations, fast data forwarding and access combining, are proposed and studied. Some of the important design parameters, such as the cache size, the number of cache ports, and the degree of access combining, are studied based on simulations. The potential performance of the proposed scheme is measured using various configurations, and it is concluded that the scheme can become a viable alternative to building a single multi-ported data cache. "
}

@inproceedings{roth:jump,
  author    = {Amir Roth and Gurindar S. Sohi},
  title     = {Effective Jump Pointer Prefetching for Linked Data Structures},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " Current techniques for prefetching linked data structures (LDS) exploit the work available in one loop iteration or recursive call to overlap pointer chasing latency. 
Jump-pointers, which provide direct access to non-adjacent nodes, can be used for prefetching when loop and recursive procedure bodies are small and do not have sufficient work to overlap a long latency. This paper describes a framework for jump-pointer prefetching (JPP) that supports four prefetching idioms: queue, full, chain, and root jumping and three implementations: software-only, hardware-only, and a cooperative software/hardware technique. On a suite of pointer intensive programs, jump-pointer prefetching reduces memory stall time by 72\% for software, 83\% for cooperative and 55\% for hardware, producing speedups of 15\%, 20\% and 22\% respectively. "
}

%
% Miscellaneous
%
@inproceedings{ranganathan:imageprocessing,
  author    = {P. Ranganathan and S. Adve and N. Jouppi},
  title     = {Performance of Image Processing with General-Purpose Processors and Media ISA Extensions},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " This paper aims to provide a quantitative understanding of the performance of image and video processing applications on general-purpose processors, without and with media ISA extensions. We use detailed simulation of 12 benchmarks to study the effectiveness of current architectural features and identify future challenges for these workloads. Our results show that conventional techniques in current processors to enhance instruction-level parallelism (ILP) provide a factor of 2.3X to 4.2X performance improvement. The Sun VIS media ISA extensions provide an additional 1.1X to 4.2X performance improvement. The ILP features and media ISA extensions significantly reduce the CPU component of execution time, making 5 of the image processing benchmarks memory-bound. The memory behavior of our benchmarks is characterized by large working sets and streaming data accesses. Increasing the cache size has no impact on 8 of the benchmarks. 
The remaining benchmarks require relatively large cache sizes (dependent on the display sizes) to exploit data reuse, but derive less than 1.2X performance benefits with the larger caches. Software prefetching provides 1.4X to 2.5X performance improvement in the image processing benchmarks where memory is a significant problem. With the addition of software prefetching, all our benchmarks revert to being compute-bound. "
}

@inproceedings{merten:hotspot,
  author    = {Matthew C. Merten and Andrew R. Trick and Christopher N. George and John C. Gyllenhaal and Wen-mei W. Hwu},
  title     = {A Hardware-Driven Profiling Scheme for Identifying Program Hot Spots to Support Runtime Optimization},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " This paper presents a novel hardware-based approach for identifying, profiling, and monitoring hot spots in order to support runtime optimization of general-purpose programs. The proposed approach consists of a set of tightly coupled hardware tables and control logic modules that are placed in the retirement stage of a processor pipeline removed from the critical path. The features of the proposed design include rapid detection of program hot spots after changes in execution behavior, runtime-tunable selection criteria for hot spot detection, and negligible overhead during application execution. Experiments using several SPEC95 benchmarks, as well as several large WindowsNT applications, demonstrate the promise of the proposed design. 
"
}

%
% Coherence
%
@inproceedings{Shen:CRF,
  author    = {Xiaowei Shen and Arvind and Larry Rudolph},
  title     = {Commit-Reconcile \& Fences (CRF): A New Memory Model for Architects and Compiler Writers},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = { We present a new mechanism-oriented memory model called Commit-Reconcile \& Fences (CRF) and define it using algebraic rules. Many existing memory models can be described as restricted versions of CRF. The model has been designed so that it is both easy for architects to implement, and stable enough to serve as a target machine interface for compilers of high-level languages. The CRF model exposes a semantic notion of caches (saches), and decomposes load and store instructions into finer-grain operations. We sketch how to integrate CRF into modern microprocessors and outline an adaptive coherence protocol to implement CRF in distributed shared-memory systems. CRF offers an upward compatible way to design next generation computer systems. }
}

@inproceedings{gniady:sc-ilp,
  author    = {Chris Gniady and Babak Falsafi and T. N. Vijaykumar},
  title     = {Is SC + ILP = RC?},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " Sequential consistency (SC) is the simplest programming interface for shared-memory systems but imposes program order among all memory operations, possibly precluding high performance implementations. Release consistency (RC), however, enables the highest performance implementations but puts the burden on the programmer to specify which memory operations need to be atomic and in program order. This paper shows, for the first time, that SC implementations can perform as well as RC implementations if the hardware provides enough support for speculation. 
Both SC and RC implementations rely on reordering and overlapping memory operations for high performance. To enforce order when necessary, an RC implementation uses software guarantees, whereas an SC implementation relies on hardware speculation. Our SC implementation, called SC++, closes the performance gap because: (1) the hardware allows not just loads, as some current SC implementations do, but also stores to bypass each other speculatively to hide remote latencies, (2) the hardware provides large speculative state for not just processor, as previously proposed, but also memory to allow out-of-order memory operations, (3) the support for hardware speculation does not add excessive overheads to processor pipeline critical paths, and (4) well-behaved applications incur infrequent rollbacks of speculative execution. Using simulation, we show that SC++ achieves an RC implementation's performance in all the six applications we studied. "
}

@inproceedings{lai:msp,
  author    = {An-Chow Lai and Babak Falsafi},
  title     = {Memory Sharing Predictor: The Key to a Speculative Coherent DSM},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " Recent research advocates using general message predictors to learn and predict the coherence activity in distributed shared memory (DSM). By accurately predicting a message and timely invoking the necessary coherence actions, a DSM can hide much of the remote access latency. This paper proposes the Memory Sharing Predictors (MSPs), pattern-based predictors that significantly improve prediction accuracy and implementation cost over general message predictors. An MSP is based on the key observation that to hide the remote access latency, a predictor must accurately predict only the remote memory accesses (i.e., request messages) and not the subsequent coherence messages invoked by an access. 
Simulation results indicate that MSPs improve prediction accuracy over general message predictors from 81\% to 93\% while requiring less storage overhead. This paper also presents the first design and evaluation for a speculative coherent DSM using pattern-based predictors. We identify simple techniques and mechanisms to trigger prediction timely and perform speculation for remote read accesses. Our speculation hardware readily works with a conventional full-map write-invalidate coherence protocol without any modifications. Simulation results indicate that performing speculative read requests alone reduces execution times by 12\% in our shared-memory applications. "
}

%
% Control and ILP
%
@inproceedings{chappell:ssmt,
  author    = {Robert S. Chappell and Jared Stark and Sangwook P. Kim and Steven K. Reinhardt and Yale N. Patt},
  title     = {Simultaneous Subordinate Microthreading (SSMT)},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = { Current work in Simultaneous Multithreading provides little benefit to programs that aren't partitioned into threads. We propose Simultaneous Subordinate Microthreading (SSMT) to correct this by spawning subordinate threads that perform optimizations on behalf of the single primary thread. These threads, written in microcode, are issued and executed concurrently with the primary thread. They directly manipulate the microarchitecture to improve the primary thread's branch prediction accuracy, cache hit rate, and prefetch effectiveness. All contribute to the performance of the primary thread. This paper introduces SSMT and discusses its potential to increase performance. We illustrate its usefulness with an SSMT machine that executes subordinate microthreads to improve the branch prediction of the primary thread. We show simulation results for the SPECint 95 benchmarks. }
}

@inproceedings{black:block,
  author    = "B. Black and B. Rychlik and J. P. 
Shen",
  title     = {A Block-based Trace Cache},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = { The trace cache is a recently proposed solution to achieving high instruction fetch bandwidth by buffering and reusing dynamic instruction traces. This work presents a new block-based trace cache implementation that can achieve higher IPC performance with more efficient storage of traces. Instead of explicitly storing instructions of a trace, pointers to blocks constituting a trace are stored in a much smaller trace table. The block-based trace cache renames fetch addresses at the basic block level and stores aligned blocks in a block cache. Traces are constructed by accessing the replicated block cache using block pointers from the trace table. Performance potential of the block-based trace cache is quantified and compared with perfect branch prediction and perfect fetch schemes. Comparing to the conventional trace cache, the block-based design can achieve higher IPC, with less impact on cycle time. Results: Using the SPECint95 benchmarks, a 16-wide realistic design of a block-based trace cache improves performance 75\% over a baseline design and to within 7\% of a baseline design with perfect branch prediction. It is shown the block-based trace cache with a 1K-entry block cache achieves the same performance of the conventional trace cache with 32K entries. }
}

@inproceedings{august:predictedexecution,
  author    = {D. August and W. Hwu and S. Mahlke},
  title     = {The Program Decision Logic Approach to Utilizing Predicated Execution},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " Modern compilers must expose sufficient amounts of Instruction-Level Parallelism (ILP) to achieve the promised performance increases of superscalar and VLIW processors. 
One of the major impediments to achieving this goal has been inefficient programmatic control flow. Historically, the compiler has translated the programmer's original control structure directly into assembly code with conditional branch instructions. Eliminating inefficiencies in handling branch instructions and exploiting ILP has been the subject of much research. However, traditional branch handling techniques cannot significantly alter the program's inherent control structure. The advent of predication as a program control representation has enabled compilers to manipulate control in a form more closely related to the underlying program logic. This work takes full advantage of the predication paradigm by abstracting the program control flow into a logical form referred to as a program decision logic network. This network is modeled as a Boolean equation and minimized using modified versions of logic synthesis techniques. After minimization, the more efficient version of the program's original control flow is re-expressed in predicated code. Furthermore, this paper proposes extensions to the HPL PlayDoh predication model in support of more effective predicate decision logic network minimization. Finally, this paper shows the ability of the mechanisms presented to overcome limits on ILP previously imposed by rigid program control structure. "
}

%
% VLSI Architecture
%
@inproceedings{cuppu:dram,
  author    = {Vinodh Cuppu and Bruce Jacob and Brian Davis and Trevor Mudge},
  title     = {A Performance Comparison of Contemporary DRAM Architectures},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " In response to the growing gap between memory access time and processor speed, DRAM manufacturers have created several new DRAM architectures. This paper presents a simulation-based performance study of a representative group, each evaluated in a small-system organization. 
These small-system organizations correspond to workstation-class computers and use on the order of 10 DRAM chips. The study covers Fast Page Mode, Extended Data Out, Synchronous, Enhanced Synchronous, Synchronous Link, Rambus, and Direct Rambus designs. Our simulations reveal several things: (a) current advanced DRAM technologies are attacking the memory bandwidth problem but not the latency problem; (b) bus transmission speed will soon become a primary factor limiting memory-system performance; (c) the post-L2 address stream still contains significant locality, though it varies from application to application; and (d) as we move to wider buses, row access time becomes more prominent, making it important to investigate techniques to exploit the available locality to decrease access time. "
}

% NOTE(review): the entry below originally had no TITLE field, which is
% required for this entry type. The title was supplied from the published
% ISCA 1999 proceedings -- please confirm.
@inproceedings{reinman:ftb,
  author    = {G. Reinman and T. Austin and B. Calder},
  title     = {A Scalable Front-End Architecture for Fast Instruction Delivery},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " In the pursuit of instruction-level parallelism, significant demands are placed on a processor's instruction delivery mechanism. Delivering the performance necessary to meet future processor execution targets requires that the performance of the instruction delivery mechanism scale with the execution core. Attaining these targets is a challenging task due to I-cache misses, branch mispredictions, and taken branches in the instruction stream. To further complicate matters, a VLSI interconnect scaling trend is materializing that further limits the performance of front-end designs in future generation process technologies. To counter these challenges, we present a fetch architecture that permits a faster cycle time than previous designs and scales better with future process technologies. Our design, called the Fetch Target Buffer, is a multi-level fetch block-oriented predictor. 
We decouple the FTB from the instruction fetch and decode pipelines to afford it the fastest clock possible. Through cycle-based simulation and circuit-level delay analysis, we find that our multi-level FTB design is capable of delivering instructions 25\% faster than the best single-level BTB-based pipeline configuration. Moreover, we show that our design scales better to future process technologies than traditional single-level designs. "
}

@inproceedings{kim:cache,
  author    = {Seongwoo Kim and Arun K. Somani},
  title     = {Area Efficient Architectures for Information Integrity in Cache Memories},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " Information integrity in cache memories is a fundamental requirement for dependable computing. Conventional architectures for enhancing cache reliability using check codes make it difficult to trade between the level of data integrity and the chip area requirement. We focus on transient fault tolerance in primary cache memories and develop new architectural solutions to maximize fault coverage when the budgeted silicon area is not sufficient for the conventional configuration of an error checking code. The underlying idea is to exploit the corollary of reference locality in the organization and management of the code. A higher protection priority is dynamically assigned to the portions of the cache that are more error-prone and have a higher probability of access. The error-prone likelihood prediction is based on the access frequency. We evaluate the effectiveness of the proposed schemes using a trace-driven simulation combined with software error injection using four different fault manifestation models. From the simulation results, we show that for most benchmarks the proposed architectures are effective and area efficient for increasing the cache integrity under all four models. 
"
}

%
% Prediction 2
%
@inproceedings{nakra:vliw,
  author    = {Tarun Nakra and Rajiv Gupta and Mary Lou Soffa},
  title     = {Value Prediction in VLIW Machines},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = { The performance of VLIW architectures is dependent on the capability of the compiler to detect and exploit instruction-level parallelism during instruction scheduling. To exploit the detected parallelism, instructions are reordered to reduce the length of the code schedule and minimize the cycle count for execution. Code reordering is limited by the dependencies among instructions arising from both control flow and data flow. In this paper, we present the design of a VLIW architecture that uses value prediction to remove data dependencies and improve the instruction schedule. Our architecture consists of two execution engines, one for executing the original VLIW code, and the other for executing compensation code after a misprediction. Any code executed due to mispredictions is executed in parallel with the VLIW instructions. The instruction set and hardware of a traditional VLIW machine are modified accordingly to support this type of concurrent execution. The efficacy of the proposed architecture is demonstrated by implementing the prediction model in the Trimaran compiler infrastructure and studying the speedups that result due to the parallel execution of compensation code. }
}

@inproceedings{tullsen:rvp,
  author    = {Dean M. Tullsen and John S. Seng},
  title     = {Storageless Value Prediction Using Prior Register Values},
  booktitle = {Proceedings of the 26th Annual International Symposium on Computer Architecture},
  year      = {1999},
  pages     = {???},
  month     = {June},
  abstract  = " This paper presents a technique called register value prediction (RVP) which uses a type of locality called register-value reuse. 
By predicting that an instruction will produce the value that is already
stored in the destination register, we eliminate the need for large value
buffers to enable value prediction. Even without the large buffers,
register-value prediction can be made as or more effective than last-value
prediction, particularly with the aid of compiler management of values in
the register file. Both static and dynamic register value prediction
techniques are demonstrated to exploit register-value reuse, the former
requiring minimal instruction set architecture changes and the latter
requiring a set of small confidence counters. We show an average gain of
12% with dynamic RVP and moderate compiler assistance on a next generation
processor, and 15% on a 16-wide processor.
"}

@INPROCEEDINGS{bilas:ionic,
  AUTHOR    = "A. Bilas and C. Liao and J. P. Singh",
  TITLE     = "Using Network Interface Support to Avoid Asynchronous Protocol Processing in Shared Virtual Memory Systems",
  BOOKTITLE = "Proceedings of the 26th Annual International Symposium on Computer Architecture",
  YEAR      = "1999",
  PAGES     = "???",
  MONTH     = "June",
  ABSTRACT  = "
    The performance of page-based software shared virtual memory (SVM) is
    still far from that achieved on hardware-coherent distributed shared
    memory (DSM) systems. The interrupt cost for asynchronous protocol
    processing has been found to be a key source of performance loss and
    complexity. This paper shows that by providing simple and general
    support for asynchronous message handling in a commodity network
    interface (NI), and by altering SVM protocols appropriately, protocol
    activity can be decoupled from asynchronous message handling and the
    need for interrupts or polling can be eliminated. The NI mechanisms
    needed are generic, not SVM-dependent. They also require neither
    visibility into the node memory system nor code instrumentation to
    identify memory operations.
    We prototype the mechanisms and such a ``synchronous home-based LRC''
    protocol, called GeNIMA (GEneral-purpose Network Interface support in a
    shared Memory Abstraction), on a cluster of SMPs with a programmable
    NI, though the mechanisms are simple and do not require
    programmability. We find that the performance improvements are
    substantial, bringing performance on a small-scale SMP cluster much
    closer to that of hardware-coherent shared memory for many
    applications, and we show the value of each of the mechanisms in
    different applications. Application performance improves by about 37%
    on average for reasonably well performing applications, even on our
    relatively slow programmable NI, and more for others. We discuss the
    key remaining bottlenecks at the protocol level and use a firmware
    performance monitor in the NI to understand the interactions with and
    the implications for the communication layer.
  "}

@INPROCEEDINGS{bilir:multicast,
  AUTHOR    = "E. Bilir and R. Dickson and Y. Hu and M. Plakal and D. Sorin and M. Hill and D. Wood",
  TITLE     = "Multicast Snooping: A New Coherence Method Using a Multicast Address Network",
  BOOKTITLE = "Proceedings of the 26th Annual International Symposium on Computer Architecture",
  YEAR      = "1999",
  PAGES     = "???",
  MONTH     = "June",
  ABSTRACT  = "
    This paper proposes a new coherence method called ``multicast
    snooping'' that dynamically adapts between broadcast snooping and a
    directory protocol. Multicast snooping is unique because processors
    predict which caches should snoop each coherence transaction by
    specifying a multicast ``mask.'' Transactions are delivered with an
    ordered multicast network, such as an Isotach network, which eliminates
    the need for acknowledgment messages. Processors handle transactions as
    they would with a snooping protocol, while a simplified directory
    operates in parallel to check masks and gracefully handle incorrect
    ones (e.g., previous owner missing).
Preliminary performance numbers with mostly SPLASH-2 benchmarks running on
32 processors show that we can limit multicasts to an average of 2-6
destinations (<< 32) and we can deliver 2-5 multicasts per network cycle
(>> broadcast snooping's 1 per cycle). While these results do not include
timing, they do provide encouragement that multicast snooping can obtain
data directly (like broadcast snooping) but apply to larger systems (like
directories).
"}

@INPROCEEDINGS{jiang:applications,
  AUTHOR    = "D. Jiang and J. P. Singh",
  TITLE     = "Scaling Application Performance on a Cache-coherent Multiprocessor",
  BOOKTITLE = "Proceedings of the 26th Annual International Symposium on Computer Architecture",
  YEAR      = "1999",
  PAGES     = "???",
  MONTH     = "June",
  ABSTRACT  = "
    Hardware-coherent, distributed shared address space systems are
    increasingly successful at moderate scale. However, it is unclear
    whether, or with how much difficulty, the performance of a load-store
    shared address space programming model scales to large processor counts
    on real applications. We examine this question using an aggressive
    case-study machine, the SGI Origin2000, up to 128 processors. We show
    for the first time that scalable performance can indeed be achieved in
    this programming model on a wide range of applications, including
    challenging kernels like FFT. However, this does not come easily, even
    for applications considered to be already highly optimized, and is very
    often not simply a matter of increasing problem size. Rather,
    substantial further application restructuring is often needed, which is
    usually quite algorithmic in nature. We examine how the restructurings
    compare with those needed for performance portability to shared virtual
    memory on clusters, and we comment on common programming guidelines for
    performance portability and scalability as well as on how the
    programming difficulty compares with that of explicit message passing.
    We also examine where applications spend their time on this large
    machine, the impact of special hardware features that the machine
    provides, and the impact of mapping to the network topology.
  "}