{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,8,6]],"date-time":"2023-08-06T04:03:48Z","timestamp":1691294628152},"reference-count":23,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"8","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Fundamentals"],"published-print":{"date-parts":[[2023,8,1]]},"DOI":"10.1587\/transfun.2022eap1084","type":"journal-article","created":{"date-parts":[[2023,2,9]],"date-time":"2023-02-09T22:14:51Z","timestamp":1675980891000},"page":"1043-1050","source":"Crossref","is-referenced-by-count":0,"title":["LFWS: Long-Operation First Warp Scheduling Algorithm to Effectively Hide the Latency for GPUs"],"prefix":"10.1587","volume":"E106.A","author":[{"given":"Song","family":"LIU","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Xi'an Jiaotong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"MA","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xi'an Jiaotong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenyu","family":"ZHAO","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xi'an Jiaotong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinhe","family":"WAN","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xi'an Jiaotong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weiguo","family":"WU","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xi'an Jiaotong University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] N. Melab, J. Gmys, M. Mezmaz, and D. Tuyttens, \u201cMany-core branch-and-bound for GPU accelerators and MIC coprocessors,\u201d High-Performance Simulation-Based Optimization, pp.275-291, 2020. 10.1007\/978-3-030-18764-4_12","DOI":"10.1007\/978-3-030-18764-4_12"},{"key":"2","doi-asserted-by":"publisher","unstructured":"[2] C. Yu, Y. Bai, and R. Wang, \u201cMIPSGPU: Minimizing pipeline stalls for GPUs with non-blocking execution,\u201d IEEE Trans. Comput., vol.70, no.11, pp.1804-1816, 2020. 10.1109\/tc.2020.3026043","DOI":"10.1109\/TC.2020.3026043"},{"key":"3","unstructured":"[3] C. Fan, \u201cResearch on GPU warp scheduling algorithm optimization,\u201d Master&apos;s thesis, Nanjing University, 2018."},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] J. Chen, X. Tao, Z. Yang, J.K. Peir, X. Li, and S.L. Lu, \u201cGuided region-based GPU scheduling: Utilizing multi-thread parallelism to hide memory latency,\u201d 2013 IEEE 27th International Symposium on Parallel and Distributed Processing, pp.441-451, May 2013. 10.1109\/ipdps.2013.95","DOI":"10.1109\/IPDPS.2013.95"},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] V. Narasiman, M. Shebanow, C.J. Lee, R. Miftakhutdinov, O. Mutlu, and Y.N. Patt, \u201cImproving GPU performance via large warps and two-level warp scheduling,\u201d 44th Annual IEEE\/ACM International Symposium on Microarchitecture, pp.308-317, 2011. 10.1145\/2155620.2155656","DOI":"10.1145\/2155620.2155656"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] J. Zhang, S. Gao, N.S. Kim, and M. Jung, \u201cCIAO: Cache interference-aware throughput-oriented architecture and scheduling for GPUs,\u201d 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp.149-159, 2018. 10.1109\/ipdps.2018.00025","DOI":"10.1109\/IPDPS.2018.00025"},{"key":"7","unstructured":"[7] T.G. Rogers, \u201cLocality and scheduling in the massively multithreaded era,\u201d Ph.D. thesis, University of British Columbia, 2015."},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] G.B. Kim, J.M. Kim, and C.H. Kim, \u201cDynamic selective warp scheduling for GPUs using L1 data cache locality information,\u201d International Conference on Parallel and Distributed Computing: Applications and Technologies, pp.230-239, 2018. 10.1007\/978-981-13-5907-1_24","DOI":"10.1007\/978-981-13-5907-1_24"},{"key":"9","doi-asserted-by":"publisher","unstructured":"[9] Y. Oh, K. Kim, M.K. Yoon, J.H. Park, Y. Park, M. Annavaram, and W.W. Ro, \u201cAdaptive cooperation of prefetching and warp scheduling on GPUs,\u201d IEEE Trans. Comput., vol.68, no.4, pp.609-616, 2019. 10.1109\/tc.2018.2878671","DOI":"10.1109\/TC.2018.2878671"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] T.G. Rogers, M. O&apos;Connor, and T.M. Aamodt, \u201cCache-conscious wavefront scheduling,\u201d 2012 45th Annual IEEE\/ACM International Symposium on Microarchitecture, pp.72-83, 2012. 10.1109\/micro.2012.16","DOI":"10.1109\/MICRO.2012.16"},{"key":"11","doi-asserted-by":"publisher","unstructured":"[11] A. Jog, O. Kayiran, N.N. Chidambaram, A.K. Mishra, M.T. Kandemir, O. Mutlu, R. Iyer, and C.R. Das, \u201cOWL: Cooperative thread array aware scheduling techniques for improving GPGPU performance,\u201d ACM SIGPLAN Notices, vol.48, no.4, pp.395-406, 2013. 10.1145\/2499368.2451158","DOI":"10.1145\/2499368.2451158"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] M. Gebhart, G.R. Johnson, D. Tarjan, S.W. Keckler, W.J. Dally, E. Lindholm, and K. Skadron, \u201cEnergy-efficient mechanisms for managing thread context in throughput processors,\u201d 2011 38th Annual International Symposium on Computer Architecture (ISCA), pp.235-246, 2011. 10.1145\/2000064.2000093","DOI":"10.1145\/2000064.2000093"},{"key":"13","doi-asserted-by":"publisher","unstructured":"[13] Y. Zhang, Z. Xing, C. Liu, C. Tang, and Q. Wang, \u201cLocality based warp scheduling in GPGPUs,\u201d Future Generation Computer Systems, vol.82, pp.520-527, 2018. 10.1016\/j.future.2017.02.036","DOI":"10.1016\/j.future.2017.02.036"},{"key":"14","doi-asserted-by":"publisher","unstructured":"[14] C.T. Do, H.J. Choi, S.W. Chung, and C.H. Kim, \u201cA novel warp scheduling scheme considering long-latency operations for high-performance GPUs,\u201d The Journal of Supercomputing, vol.76, no.4, pp.3043-3062, 2020. 10.1007\/s11227-019-03091-2","DOI":"10.1007\/s11227-019-03091-2"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] M. Lee, G. Kim, J. Kim, W. Seo, Y. Cho, and S. Ryu, \u201ciPAWS: Instruction-issue pattern-based adaptive warp scheduling for GPGPUs,\u201d 2016 IEEE International Symposium on High Performance Computer Architecture (HPCA), pp.370-381, 2016. 10.1109\/hpca.2016.7446079","DOI":"10.1109\/HPCA.2016.7446079"},{"key":"16","unstructured":"[16] J.P. Anantpur, \u201cEnhancing GPGPU performance through warp scheduling, divergence taming and runtime parallelizing transformations,\u201d Ph.D. thesis, Indian Institute of Science Bangalore, 2017."},{"key":"17","doi-asserted-by":"publisher","unstructured":"[17] S.Y. Lee, A. Arunkumar, and C.J. Wu, \u201cCAWA: Coordinated warp scheduling and cache prioritization for critical warp acceleration of GPGPU workloads,\u201d ACM SIGARCH Computer Architecture News, vol.43, no.3S, pp.515-527, 2015. 10.1145\/2872887.2750418","DOI":"10.1145\/2872887.2750418"},{"key":"18","unstructured":"[18] V.T. Vo and C.H. Kim, \u201cKAWS: Coordinate kernel-aware warp scheduling and warp sharing mechanism for advanced GPUs,\u201d Journal of Information Processing Systems, vol.17, no.6, pp.1157-1169, 2021. 10.3745\/JIPS.01.0084"},{"key":"19","doi-asserted-by":"publisher","unstructured":"[19] J. Fang, Z. Wei, and H. Yang, \u201cLocality-based cache management and warp scheduling for reducing cache contention in GPU,\u201d Micromachines, vol.12, no.10, p.1262, 2021. 10.3390\/mi12101262","DOI":"10.3390\/mi12101262"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] M. Khairy, Z. Shen, T.M. Aamodt, and T.G. Rogers, \u201cAccel-Sim: An extensible simulation framework for validated GPU modeling,\u201d 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA), pp.473-486, 2020. 10.1109\/isca45697.2020.00047","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"21","doi-asserted-by":"crossref","unstructured":"[21] S. Che, M. Boyer, J. Meng, D. Tarjan, J.W. Sheaffer, S.H. Lee, and K. Skadron, \u201cRodinia: A benchmark suite for heterogeneous computing,\u201d IEEE International Symposium on Workload Characterization, pp.44-54, 2009. 10.1109\/iiswc.2009.5306797","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] A. Bakhoda, G.L. Yuan, W.W. Fung, H. Wong, and T.M. Aamodt, \u201cAnalyzing CUDA workloads using a detailed GPU simulator,\u201d 2009 IEEE International Symposium on Performance Analysis of Systems and Software, pp.163-174, 2009. 10.1109\/ispass.2009.4919648","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"23","unstructured":"[23] NVIDA, CUDA SDK: http:\/\/developer.nvidia.com\/gpu-computing-sdk"}],"container-title":["IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transfun\/E106.A\/8\/E106.A_2022EAP1084\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,5]],"date-time":"2023-08-05T04:04:16Z","timestamp":1691208256000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transfun\/E106.A\/8\/E106.A_2022EAP1084\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,1]]},"references-count":23,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2023]]}},"URL":"https:\/\/doi.org\/10.1587\/transfun.2022eap1084","relation":{},"ISSN":["0916-8508","1745-1337"],"issn-type":[{"value":"0916-8508","type":"print"},{"value":"1745-1337","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,8,1]]},"article-number":"2022EAP1084"}}