{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:03:22Z","timestamp":1776931402297,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":121,"publisher":"ACM","funder":[{"name":"Gobierno de Arag\u00f3n","award":["T5820R"],"award-info":[{"award-number":["T5820R"]}]},{"DOI":"10.13039\/501100002809","name":"Generalitat de Catalunya","doi-asserted-by":"publisher","award":["2021-SGR-00763"],"award-info":[{"award-number":["2021-SGR-00763"]}],"id":[{"id":"10.13039\/501100002809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Spanish Ministry of Science, Innovation and Universities","award":["PID2023-146511NB-I00"],"award-info":[{"award-number":["PID2023-146511NB-I00"]}]},{"name":"Spanish National Research Agency","award":["PID2022-136454NB-C22"],"award-info":[{"award-number":["PID2022-136454NB-C22"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756030","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"1793-1808","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Delegato: Locality-Aware Atomic Memory Operations on Chiplets"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8337-6326","authenticated-orcid":false,"given":"V\u00edctor","family":"Soria-Pardos","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center (BSC), Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2869-668X","authenticated-orcid":false,"given":"Adri\u00e0","family":"Armejach","sequence":"additional","affiliation":[{"name":"Universitat Politecnica de Catalunya (UPC), Barcelona, Spain and Barcelona Supercomputing Center (BSC), Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6515-0312","authenticated-orcid":false,"given":"Tiago","family":"M\u00fcck","sequence":"additional","affiliation":[{"name":"Arm, Austin, Texas, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7490-4067","authenticated-orcid":false,"given":"Dar\u00edo Su\u00e1rez","family":"Gracia","sequence":"additional","affiliation":[{"name":"Universidad de Zaragoza, Zaragoza, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3571-5562","authenticated-orcid":false,"given":"Jose","family":"Joao","sequence":"additional","affiliation":[{"name":"Arm, Austin, Texas, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9848-8758","authenticated-orcid":false,"given":"Miquel","family":"Moret\u00f3","sequence":"additional","affiliation":[{"name":"Universitat Politecnica de Catalunya (UPC), Barcelona, Spain and Barcelona Supercomputing Center (BSC), Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.1997.569661"},{"key":"e_1_3_3_2_3_2","unstructured":"Arm holdings. 2020. Arm Neoverse N2 Core Technical Reference Manual. https:\/\/developer.arm.com\/documentation\/102099\/0001\/The-Neoverse-N2\u2013core. [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_4_2","unstructured":"Arm Holdings. 2021. AMBA 5 CHI Architecture Specification. https:\/\/developer.arm.com\/architectures\/system-architectures\/amba\/amba-5. [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527385"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD63220.2024.00092"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/277650.277734"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00047"},{"key":"e_1_3_3_2_9_2","unstructured":"Vignesh Balaji Dhruva Tirumala and Brandon Lucia. 2017. Flexible Support for Fast Parallel Commutative Updates. arxiv:https:\/\/arXiv.org\/abs\/1709.09491\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/1709.09491"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2015.12"},{"key":"e_1_3_3_2_11_2","unstructured":"Scott Beamer Krste Asanovi\u0107 and David Patterson. 2017. The GAP Benchmark Suite. arxiv:https:\/\/arXiv.org\/abs\/1508.03619\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/1508.03619"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.112"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218539"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454128"},{"key":"e_1_3_3_2_15_2","first-page":"141","volume-title":"Proceedings of the Seventh Workshop on Languages and Compilers for Parallel Computing","author":"Blume Bill","year":"1994","unstructured":"Bill Blume, Rudolf Eigenmann, Keith Faigin, John Grout, Jay Hoeflinger, David Padua, Paul Petersen, Bill Pottenger, Lawrence Rauchwerger, Peng Tu, et\u00a0al. 1994. Polaris: The next generation in parallelizing compilers. In Proceedings of the Seventh Workshop on Languages and Compilers for Parallel Computing. Citeseer, 141\u2013154."},{"key":"e_1_3_3_2_16_2","volume-title":"Learning OpenCV: Computervision with the OpenCV library","author":"Bradski Gary","year":"2008","unstructured":"Gary Bradski and Adrian Kaehler. 2008. Learning OpenCV: Computervision with the OpenCV library. O\u2019Reilly."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS59251.2023.10254718"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","unstructured":"Paul Caheny Lluc Alvarez Said Derradji Mateo Valero Miquel Moret\u00f3 and Marc Casas. 2018. Reducing Cache Coherence Traffic with a NUMA-Aware Runtime Approach. IEEE Transactions on Parallel and Distributed Systems 29 5 (2018) 1174\u20131187. 10.1109\/TPDS.2017.2787123","DOI":"10.1109\/TPDS.2017.2787123"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-03850-6_7"},{"key":"e_1_3_3_2_20_2","volume-title":"5th USENIX Workshop on Hot Topics in Parallelism (HotPar 13)","author":"Calciu Irina","year":"2013","unstructured":"Irina Calciu, Justin Gottschlich, and Maurice Herlihy. 2013. Using Elimination and Delegation to Implement a Scalable NUMA-Friendly Stack. In 5th USENIX Workshop on Hot Topics in Parallelism (HotPar 13). USENIX Association, San Jose, CA. https:\/\/www.usenix.org\/conference\/hotpar13\/workshop-program\/presentation\/calciu"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","unstructured":"Dimitrios Chasapis Marc Casas Miquel Moret\u00f3 Raul Vidal Eduard Ayguad\u00e9 Jes\u00fas Labarta and Mateo Valero. 2015. PARSECSs: Evaluating the Impact of Task Parallelism in the PARSEC Benchmark Suite. ACM Trans. Archit. Code Optim. 12 4 Article 41 (Dec. 2015) 22\u00a0pages. 10.1145\/2829952","DOI":"10.1145\/2829952"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2013.63"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2015.7322443"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-45550-1_24"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00056"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00058"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","unstructured":"Debendra Das\u00a0Sharma Gerald Pasdast Zhiguo Qian and Kemal Aygun. 2022. Universal Chiplet Interconnect Express (UCIe): An Open Industry Standard for Innovations With Chiplets at Package Level. IEEE Transactions on Components Packaging and Manufacturing Technology 12 9 (2022) 1423\u20131431. 10.1109\/TCPMT.2022.3207195","DOI":"10.1109\/TCPMT.2022.3207195"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Timothy\u00a0A. Davis and Yifan Hu. 2011. The University of Florida Sparse Matrix Collection. ACM Trans. Math. Softw. 38 1 Article 1 (dec 2011) 25\u00a0pages.","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","unstructured":"Jeffrey Dean and Sanjay Ghemawat. 2008. MapReduce: simplified data processing on large clusters. Commun. ACM 51 1 (jan 2008) 107\u2013113. 10.1145\/1327452.1327492","DOI":"10.1145\/1327452.1327492"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392736"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","unstructured":"M. Dubois J. Skeppstedt and P. Strenstrom. 1995. Essential Misses and Data Traffic in Coherence Protocols. J. Parallel and Distrib. Comput. 29 2 (1995) 108\u2013125. 10.1006\/jpdc.1995.1112","DOI":"10.1006\/jpdc.1995.1112"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.1992.753322"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"P ERDdS and A R&wi. 1959. On random graphs I. Publ. math. debrecen 6 290-297 (1959) 18.","DOI":"10.5486\/PMD.1959.6.3-4.12"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3357526.3357564"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/1583991.1584017"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3587135.3591413"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2001.953304"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS59251.2023.10254694"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480073"},{"key":"e_1_3_3_2_40_2","unstructured":"JR Goodman and HHJ Hum. 2004. Mesif: A two-hop cache coherency protocol for point-to-point interconnects (2009). URL: https:\/\/www. cs. auckland. ac. nz\/goodman\/TechnicalReports\/MESIF-2009. pdf (2004)."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","unstructured":"James\u00a0R. Goodman Mary\u00a0K. Vernon and Philip\u00a0J. Woest. 1989. Efficient synchronization primitives for large-scale cache-coherent multiprocessors. SIGARCH Comput. Archit. News 17 2 (April 1989) 64\u201375. 10.1145\/68182.68188","DOI":"10.1145\/68182.68188"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/285930.285983"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-48311-X_56"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/335231.335239"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC55918.2022.00015"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.1998.727289"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2001.924963"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45414-4_21"},{"key":"e_1_3_3_2_49_2","unstructured":"Arm Holdings. [n. d.]. AMBA 5 CHI Architecture Specification. https:\/\/developer.arm.com\/architectures\/system-architectures\/amba\/amba-5. [Online; 19-June-2025]."},{"key":"e_1_3_3_2_50_2","unstructured":"Arm Holdings. 2022. Armv9 Architecture Reference Manual for A-profile architecture. https:\/\/developer.arm.com\/documentation\/ddi0602\/2025-03\/. [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_51_2","unstructured":"Arm Holdings. 2022. Do near or far atomics give the best performance on Neoverse systems?https:\/\/developer.arm.com\/documentation\/ka004706\/latest\/. [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_52_2","unstructured":"Arm Holdings. 2025. The Armv9.6 architecture extension.https:\/\/developer.arm.com\/documentation\/109697\/2025_03\/Feature-descriptions\/The-Armv9-6-architecture-extension. [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","unstructured":"Libo Huang Zhiying Wang Nong Xiao Yongwen Wang and Qiang Dou. 2014. Integrated Coherence Prediction: Towards Efficient Cache Coherence on NoC-Based Multicore Architectures. ACM Trans. Des. Autom. Electron. Syst. 19 3 Article 24 (June 2014) 22\u00a0pages. 10.1145\/2611756","DOI":"10.1145\/2611756"},{"key":"e_1_3_3_2_54_2","unstructured":"Intel. 2021. Intel\u00ae 64 and IA-32 Architectures Software Developer\u2019s Manual.https:\/\/cdrdv2.intel.com\/v1\/dl\/getContent\/671200. [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815971"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","unstructured":"Aamer Jaleel Kevin\u00a0B. Theobald Simon\u00a0C. Steely and Joel Emer. 2010. High performance cache replacement using re-reference interval prediction (RRIP). SIGARCH Comput. Archit. News 38 3 (June 2010) 60\u201371. 10.1145\/1816038.1815971","DOI":"10.1145\/1816038.1815971"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3579838"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-32041-5_15"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628108"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/264107.264166"},{"key":"e_1_3_3_2_61_2","unstructured":"Nodari Kankava. 2020. Exploring the Efficiency of Multi-Word Compare-and-Swap. Master\u2019s thesis. Uppsala University Department of Information Technology."},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/CMPCON.1993.289660"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-09873-9_48"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/277830.277847"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.38"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","unstructured":"George Kurian Omer Khan and Srinivas Devadas. 2013. The locality-aware adaptive cache coherence protocol. SIGARCH Comput. Archit. News 41 3 (June 2013) 523\u2013534. 10.1145\/2508148.2485967","DOI":"10.1145\/2508148.2485967"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218661"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.1999.765949"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-16-1376-0_9"},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"publisher","DOI":"10.1145\/264107.264206"},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"crossref","unstructured":"H.\u00a0Q. Le J.\u00a0A. Van\u00a0Norstrand B.\u00a0W. Thompto J.\u00a0E. Moreira D.\u00a0Q. Nguyen D. Hrusecky M.\u00a0J. Genden and M. Kroener. 2018. IBM POWER9 processor core. IBM Journal of Research and Development 62 4\/5 (2018) 2:1\u20132:12.","DOI":"10.1147\/JRD.2018.2854039"},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"publisher","unstructured":"D. Lenoski J. Laudon K. Gharachorloo W.-D. Weber A. Gupta J. Hennessy M. Horowitz and M.S. Lam. 1992. The Stanford Dash multiprocessor. Computer 25 3 (1992) 63\u201379. 10.1109\/2.121510","DOI":"10.1109\/2.121510"},{"key":"e_1_3_3_2_73_2","doi-asserted-by":"publisher","DOI":"10.1007\/11564126_17"},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"publisher","unstructured":"Paul\u00a0T. Lin Michael\u00a0A. Heroux Richard\u00a0F. Barrett and Alan\u00a0B. Williams. 2015. Assessing a mini-application as a performance proxy for a finite element method engineering application. Concurrency and Computation: Practice and Experience 27 17 (2015) 5374\u20135389. 10.1002\/cpe.3587 arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/cpe.3587","DOI":"10.1002\/cpe.3587"},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"publisher","DOI":"10.23919\/DATE51398.2021.9474021"},{"key":"e_1_3_3_2_76_2","unstructured":"Jason Lowe-Power Abdul\u00a0Mutaal Ahmad Ayaz Akram Mohammad Alian Rico Amslinger Matteo Andreozzi Adri\u00e0 Armejach Nils Asmussen Brad Beckmann Srikant Bharadwaj Gabe Black Gedare Bloom Bobby\u00a0R. Bruce Daniel\u00a0Rodrigues Carvalho Jeronimo Castrillon Lizhong Chen Nicolas Derumigny Stephan Diestelhorst Wendy Elsasser Carlos Escuin Marjan Fariborz Amin Farmahini-Farahani Pouya Fotouhi Ryan Gambord Jayneel Gandhi Dibakar Gope Thomas Grass Anthony Gutierrez Bagus Hanindhito Andreas Hansson Swapnil Haria Austin Harris Timothy Hayes Adrian Herrera Matthew Horsnell Syed Ali\u00a0Raza Jafri Radhika Jagtap Hanhwi Jang Reiley Jeyapaul Timothy\u00a0M. Jones Matthias Jung Subash Kannoth Hamidreza Khaleghzadeh Yuetsu Kodama Tushar Krishna Tommaso Marinelli Christian Menard Andrea Mondelli Miquel Moreto Tiago M\u00fcck Omar Naji Krishnendra Nathella Hoa Nguyen Nikos Nikoleris Lena\u00a0E. Olson Marc Orr Binh Pham Pablo Prieto Trivikram Reddy Alec Roelke Mahyar Samani Andreas Sandberg Javier Setoain Boris Shingarov Matthew\u00a0D. Sinclair Tuan Ta Rahul Thakur Giacomo Travaglini Michael Upton Nilay Vaish Ilias Vougioukas William Wang Zhengrong Wang Norbert Wehn Christian Weis David\u00a0A. Wood Hongil Yoon and \u00c9der F.\u00a0Zulian. 2020. The gem5 Simulator: Version 20.0+. arxiv:https:\/\/arXiv.org\/abs\/2007.03152\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/2007.03152"},{"key":"e_1_3_3_2_77_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2016.7482086"},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"publisher","unstructured":"Lori\u00e9n L\u00f3pez-Villellas Rub\u00e9n Langarita-Ben\u00edtez Asaf Badouh V\u00edctor Soria-Pardos Quim Aguado-Puig Guillem L\u00f3pez-Parad\u00eds Max Doblas Javier Setoain Chulho Kim Makoto Ono Adri\u00e0 Armejach Santiago Marco-Sola Jes\u00fas Alastruey-Bened\u00e9 Pablo Ib\u00e1\u00f1ez and Miquel Moret\u00f3. 2024. GenArchBench: A genomics benchmark suite for arm HPC processors. Future Generation Computer Systems 157 (2024) 313\u2013329. 10.1016\/j.future.2024.03.050","DOI":"10.1016\/j.future.2024.03.050"},{"key":"e_1_3_3_2_79_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD46524.2019.00041"},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155650"},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"crossref","unstructured":"Edward\u00a0M McCreight. 1985. The dragon computer system. Microarchitecture of VLSI Computers (1985) 83\u2013101.","DOI":"10.1007\/978-94-009-5143-3_3"},{"key":"e_1_3_3_2_82_2","first-page":"509","volume-title":"Parallel and Distributed Computing and Systems","author":"McKenney Paul\u00a0E","year":"1998","unstructured":"Paul\u00a0E McKenney and John\u00a0D Slingwine. 1998. Read-copy update: Using execution history to solve concurrency problems. In Parallel and Distributed Computing and Systems , Vol.\u00a0509518. Citeseer, 509\u2013518."},{"key":"e_1_3_3_2_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/996841.996848"},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2009.22"},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2015.83"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"publisher","unstructured":"Shubhendu\u00a0S. Mukherjee and Mark\u00a0D. Hill. 1998. Using prediction to accelerate coherence protocols. SIGARCH Comput. Archit. News 26 3 (April 1998) 179\u2013190. 10.1145\/279361.279386","DOI":"10.1145\/279361.279386"},{"key":"e_1_3_3_2_87_2","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807649"},{"key":"e_1_3_3_2_88_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00014"},{"key":"e_1_3_3_2_89_2","first-page":"511","volume-title":"11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)","author":"Narula Neha","year":"2014","unstructured":"Neha Narula, Cody Cutler, Eddie Kohler, and Robert Morris. 2014. Phase Reconciliation for Contended In-Memory Transactions. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14). USENIX Association, Broomfield, CO, 511\u2013524. https:\/\/www.usenix.org\/conference\/osdi14\/technical-sessions\/presentation\/narula"},{"key":"e_1_3_3_2_90_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-68031-1_5"},{"key":"e_1_3_3_2_91_2","doi-asserted-by":"publisher","DOI":"10.4230\/LIPIcs.DISC.2019.28"},{"key":"e_1_3_3_2_92_2","doi-asserted-by":"publisher","DOI":"10.1145\/3490148.3538572"},{"key":"e_1_3_3_2_93_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-58184-7_115"},{"key":"e_1_3_3_2_94_2","unstructured":"Nvidia. 2024. NVIDIA Grace Hopper Superchip Architecture Whitepaper. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/nvidia-grace-hopper?ncid=no-ncid [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_95_2","unstructured":"Nvidia. 2025. Parallel Thread Execution ISA. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/. [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_96_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.1994.81"},{"key":"e_1_3_3_2_97_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2001.991127"},{"key":"e_1_3_3_2_98_2","doi-asserted-by":"publisher","DOI":"10.1145\/224170.224399"},{"key":"e_1_3_3_2_99_2","doi-asserted-by":"publisher","unstructured":"E. Rosti E. Smirni T.\u00a0D. Wagner A.\u00a0W. Apon and L.\u00a0W. Dowdy. 1993. The KSR1: experimentation and modeling of poststore. SIGMETRICS Perform. Eval. Rev. 21 1 (June 1993) 74\u201385. 10.1145\/166962.166985","DOI":"10.1145\/166962.166985"},{"key":"e_1_3_3_2_100_2","doi-asserted-by":"publisher","DOI":"10.1145\/237090.237144"},{"key":"e_1_3_3_2_101_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614284"},{"key":"e_1_3_3_2_102_2","doi-asserted-by":"publisher","DOI":"10.1145\/277830.277903"},{"key":"e_1_3_3_2_103_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589065"},{"key":"e_1_3_3_2_104_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSD67783.2025.00066"},{"key":"e_1_3_3_2_105_2","doi-asserted-by":"publisher","DOI":"10.5821\/dissertation-2117-367546"},{"key":"e_1_3_3_2_106_2","doi-asserted-by":"publisher","DOI":"10.1145\/36206.36199"},{"key":"e_1_3_3_2_107_2","doi-asserted-by":"publisher","DOI":"10.1145\/2145816.2145869"},{"key":"e_1_3_3_2_108_2","doi-asserted-by":"crossref","unstructured":"J\u00a0Hans Van\u00a0Hateren and Arjen van\u00a0der Schaaf. 1998. Independent component filters of natural images compared with simple cells in primary visual cortex. Proceedings of the Royal Society of London. Series B: Biological Sciences 265 1394 (1998) 359\u2013366.","DOI":"10.1098\/rspb.1998.0303"},{"key":"e_1_3_3_2_109_2","doi-asserted-by":"publisher","unstructured":"Guru Venkataramani Christopher\u00a0J. Hughes Sanjeev Kumar and Milos Prvulovic. 2011. DeFT: Design space exploration for on-the-fly detection of coherence misses. ACM Trans. Archit. Code Optim. 8 2 Article 8 (June 2011) 27\u00a0pages. 10.1145\/1970386.1970389","DOI":"10.1145\/1970386.1970389"},{"key":"e_1_3_3_2_110_2","first-page":"249","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Wang Jiawei","year":"2022","unstructured":"Jiawei Wang, Diogo Behrens, Ming Fu, Lilith Oberhauser, Jonas Oberhauser, Jitang Lei, Geng Chen, Hermann H\u00e4rtig, and Haibo Chen. 2022. BBQ: A Block-based Bounded Queue for Exchanging Data and Profiling. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA, 249\u2013262. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/wang-jiawei"},{"key":"e_1_3_3_2_111_2","unstructured":"Minjie Wang Da Zheng Zihao Ye Quan Gan Mufei Li Xiang Song Jinjing Zhou Chao Ma Lingfan Yu Yu Gai Tianjun Xiao Tong He George Karypis Jinyang Li and Zheng Zhang. 2019. Deep Graph Library: A Graph-Centric Highly-Performant Package for Graph Neural Networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.01315 (2019)."},{"key":"e_1_3_3_2_112_2","unstructured":"Andrew Waterman and Krste Asanovi\u0107. 2025. The RISC-V Instruction Set Manual Volume I: Unprivileged ISA.https:\/\/github.com\/riscv\/riscv-isa-manual\/releases\/download\/riscv-isa-release-f122839-2025-03-27\/riscv-unprivileged.pdf. [Online; accessed 19-June-2025]."},{"key":"e_1_3_3_2_113_2","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454125"},{"key":"e_1_3_3_2_114_2","doi-asserted-by":"publisher","DOI":"10.1145\/223982.223990"},{"key":"e_1_3_3_2_115_2","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155671"},{"key":"e_1_3_3_2_116_2","doi-asserted-by":"publisher","unstructured":"Chaoran Yang and John Mellor-Crummey. 2016. A wait-free queue as fast as fetch-and-add. SIGPLAN Not. 51 8 Article 16 (Feb. 2016) 13\u00a0pages. 10.1145\/3016078.2851168","DOI":"10.1145\/3016078.2851168"},{"key":"e_1_3_3_2_117_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASP-DAC58780.2024.10473961"},{"key":"e_1_3_3_2_118_2","doi-asserted-by":"publisher","DOI":"10.1145\/2591635.2667180"},{"key":"e_1_3_3_2_119_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC.2014.12"},{"key":"e_1_3_3_2_120_2","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830774"},{"key":"e_1_3_3_2_121_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2004.1302981"},{"key":"e_1_3_3_2_122_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00056"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756030","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:47:16Z","timestamp":1769464036000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756030"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":121,"alternative-id":["10.1145\/3725843.3756030","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756030","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}