{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,28]],"date-time":"2025-06-28T07:03:35Z","timestamp":1751094215066,"version":"3.40.4"},"publisher-location":"Berlin, Heidelberg","reference-count":21,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642364235"},{"type":"electronic","value":"9783642364242"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2013]]},"DOI":"10.1007\/978-3-642-36424-2_12","type":"book-chapter","created":{"date-parts":[[2013,2,11]],"date-time":"2013-02-11T10:56:45Z","timestamp":1360580205000},"page":"134-146","source":"Crossref","is-referenced-by-count":4,"title":["Inter-warp Instruction Temporal Locality in Deep-Multithreaded GPUs"],"prefix":"10.1007","author":[{"given":"Ahmad","family":"Lashgar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Amirali","family":"Baniasadi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ahmad","family":"Khonsari","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","reference":[{"key":"12_CR1","doi-asserted-by":"crossref","unstructured":"Bakhoda, A., Yuan, G.L., Fung, W.W.L., Wong, H., Aamodt, T.M.: Analyzing CUDA workloads using a detailed GPU simulator. In: Proc. of ISPASS 2009, pp. 163\u2013174 (2009)","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Che, S., Boyer, M., Meng, J., Tarjan, D., Sheaffer, J.W., Sang-Ha, L., Skadron, K.: Rodinia: A benchmark suite for heterogeneous computing. In: Proc. of IEEE International Symposium on Workload Characterization (IISWC), pp. 44\u201354 (2009)","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"12_CR3","unstructured":"Collagne, S.: Exploiting all forms of parallel locality in many-thread architectures. ALF Research Group Seminar, IRISA, Rennes (December 21, 2011)"},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Collange, S., Defour, D., Tisserand, A.: Power Consumption of GPUs from a Software Perspective. In: Proc. of the 9th International Conference on Computational Science (ICCS), pp. 914\u2013923 (2009)","DOI":"10.1007\/978-3-642-01970-8_92"},{"key":"12_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1007\/978-3-642-14122-5_8","volume-title":"Euro-Par 2009 \u2013 Parallel Processing Workshops","author":"S. Collange","year":"2010","unstructured":"Collange, S., Defour, D., Zhang, Y.: Dynamic Detection of Uniform and Affine Vectors in GPGPU Computations. In: Lin, H.-X., Alexander, M., Forsell, M., Kn\u00fcpfer, A., Prodan, R., Sousa, L., Streit, A. (eds.) Euro-Par 2009. LNCS, vol.\u00a06043, pp. 46\u201355. Springer, Heidelberg (2010)"},{"key":"12_CR6","unstructured":"Coon, B.W., Mills, P.C., Oberman, S.F., Siu, M.Y.: Tracking register usage during multithreaded processing using a scoreboard. United States Patent, Patent number: 7434032"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Gebhart, M., Johnson, D.R., Tarjan, D., Keckler, S.W., Dally, W.J., Lindholm, E., Skadron, K.: Energy-efficient mechanisms for managing thread context in throughput processors. In: Proc. of the 38th Annual International Symposium on Computer Architecture (ISCA), pp. 235\u2013246 (2011)","DOI":"10.1145\/2000064.2000093"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Gharaibeh, A., Ripeanu, M.: Size Matters: Space\/Time Tradeoffs to Improve GPGPU Applications Performance. In: Proc. of ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312 (2010)","DOI":"10.1109\/SC.2010.51"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Hiraki, M., Bajwa, R.S., Kojima, H., Gorny, D.J., Nitta, K., Shri, A.: Stage-skip pipeline: a low power processor architecture using a decoded instruction buffer. In: International Symposium on Low Power Electronics and Design, pp. 353\u2013358 (1996)","DOI":"10.1109\/LPE.1996.547538"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Hong, S., Kim, H.: An Integrated GPU Power and Performance Model. In: Proc. of ISCA 2010, pp. 280\u2013289 (2010)","DOI":"10.1145\/1816038.1815998"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Kasichayanula, K.K.: Power Aware Computing on GPUs. Master Thesis Dissertation, University of Tennessee, Knoxville (May 2012)","DOI":"10.1109\/SAAHPC.2012.26"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Kin, J., Gupta, M., Mangione-Smith, W.H.: The filter cache: an energy efficient memory structure. In: Proc. of MICRO 1997, pp. 184\u2013193 (1997)","DOI":"10.1109\/MICRO.1997.645809"},{"key":"12_CR13","unstructured":"Lindholm, J.E., Coon, B.W., Wierzbicki, J., Stoll, R.J., Oberman, S.F.: Credit-Based Streaming Multiprocessor Warp Scheduling. United States Patent, application number: 12\/885,299"},{"key":"12_CR14","unstructured":"Lindholm, J.E., Coon, B.W., Moy, S.S.: Across-thread out-of-order instruction dispatch in a multithreaded microprocessor. United States Patent, Patent number: 7676657"},{"key":"12_CR15","unstructured":"Liu, S., Lindholm, J.E., Siu, M.Y., Coon, B.W., Oberman, S.F.: Operand collector architecture. United States Patent, Patent number: 7834881"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Muralimanohar, N., Balasubramonian, R., Jouppi, N.: Optimizing NUCA Organizations and Wiring Alternatives for Large Caches with CACTI 6.0. In: Proc. of MICRO 2007, pp. 3\u201314 (2007)","DOI":"10.1109\/MICRO.2007.33"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Narasiman, V., Shebanow, M., Lee, C.J., Miftakhutdinov, R., Mutlu, O., Patt, Y.N.: Improving GPU performance via large warps and two-level warp scheduling. In: Proc. of MICRO 2011, pp. 308\u2013317 (2011)","DOI":"10.1145\/2155620.2155656"},{"key":"12_CR18","unstructured":"NVIDIA Corp. NVIDIA CUDA SDK 2.3"},{"key":"12_CR19","unstructured":"Stratton, J.A., Rodrigues, C., Sung, I.J., Obeid, N., Chang, L.W., Anssari, N., Liu, G.D., Hwu, W.M.W.: Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. IMPACT Technical Report (2012)"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Wong, H., Papadopoulou, M.M., Sadooghi-Alvandi, M., Moshovos, A.: Demystifying GPU microarchitecture through microbenchmarking. In: Proc. of ISPASS 2010, pp. 235\u2013246 (2010)","DOI":"10.1109\/ISPASS.2010.5452013"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Hu, Y., Li, B., Peng, L.: Performance and Power Analysis of ATI GPU: A Statistical Approach. In: 6th IEEE International Conference on Networking, Architecture and Storage (NAS), pp. 149\u2013158 (2011)","DOI":"10.1109\/NAS.2011.51"}],"container-title":["Lecture Notes in Computer Science","Architecture of Computing Systems \u2013 ARCS 2013"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-36424-2_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,29]],"date-time":"2025-04-29T20:34:38Z","timestamp":1745958878000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-36424-2_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013]]},"ISBN":["9783642364235","9783642364242"],"references-count":21,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-36424-2_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2013]]}}}