{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:53:31Z","timestamp":1773194011742,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T00:00:00Z","timestamp":1750377600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2047220, 2112562, 2147946"],"award-info":[{"award-number":["2047220, 2112562, 2147946"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100016299","name":"NetApp","doi-asserted-by":"publisher","award":["Faculty Fellowship"],"award-info":[{"award-number":["Faculty Fellowship"]}],"id":[{"id":"10.13039\/100016299","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731074","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:46:17Z","timestamp":1750437977000},"page":"1311-1326","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["CORD: Low-Latency, Bandwidth-Efficient and Scalable Release Consistency via Directory Ordering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-8292-935X","authenticated-orcid":false,"given":"Yanpeng","family":"Yu","sequence":"first","affiliation":[{"name":"Yale University, New Haven, CT, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9272-0518","authenticated-orcid":false,"given":"Nicolai","family":"Oswald","sequence":"additional","affiliation":[{"name":"Nvidia, Santa Clara, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2199-6391","authenticated-orcid":false,"given":"Anurag","family":"Khandelwal","sequence":"additional","affiliation":[{"name":"Yale University, New Haven, CT, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"2024. Arm AMBA CHI specification. https:\/\/developer.arm.com\/documentation\/ihi0050\/latest\/."},{"key":"e_1_3_3_3_3_2","unstructured":"2024. PCIe specifications. https:\/\/pcisig.com\/specifications."},{"key":"e_1_3_3_3_4_2","unstructured":"2025. CCIX Consortium. https:\/\/www.ccixconsortium.com\/."},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.1988.5238"},{"key":"e_1_3_3_3_6_2","volume-title":"Proc. IEEE HPCA","author":"Agarwal Neha","unstructured":"Neha Agarwal, David Nellans, Eiman Ebrahimi, Thomas\u00a0F Wenisch, John Danskin, and Stephen\u00a0W Keckler. [n. d.]. Selective GPU caches to eliminate CPU-GPU HW cache coherence. In Proc. IEEE HPCA."},{"key":"e_1_3_3_3_7_2","volume-title":"Towards a Formalization of the HSA Memory Model in the cat Language","author":"Alglave Jade","year":"2016","unstructured":"Jade Alglave and Luc Maranget. 2016. Towards a Formalization of the HSA Memory Model in the cat Language. Technical Report. HSA Foundation. https:\/\/hsafoundation.com\/wp-content\/uploads\/2021\/02\/cat_ModelExpressions-1.1.pdf"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"crossref","unstructured":"Jade Alglave Luc Maranget and Michael Tautschnig. 2014. Herding Cats: Modelling Simulation Testing and Data Mining for Weak Memory. ACM Trans. Program. Lang. Syst. (2014).","DOI":"10.1145\/2594291.2594347"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783729"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00031"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"crossref","unstructured":"James Archibald and Jean-Loup Baer. 1986. Cache Coherence Protocols: Evaluation Using a Multiprocessor Simulation Model. ACM Trans. Comput. Syst. (1986).","DOI":"10.1145\/6513.6514"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"crossref","unstructured":"Rajeev Balasubramonian Andrew\u00a0B Kahng Naveen Muralimanohar Ali Shafiee and Vaishnav Srinivas. 2017. CACTI 7: New tools for interconnect exploration in innovative off-chip memories. ACM Transactions on Architecture and Code Optimization (TACO) (2017).","DOI":"10.1145\/3085572"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"crossref","unstructured":"Mark Batty Scott Owens Susmit Sarkar Peter Sewell and Tjark Weber. 2011. Mathematizing C++ concurrency. SIGPLAN Not. (2011).","DOI":"10.1145\/1926385.1926394"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155647"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"crossref","unstructured":"Nathan Binkert Bradford Beckmann Gabriel Black Steven\u00a0K. Reinhardt Ali Saidi Arkaprava Basu Joel Hestness Derek\u00a0R. Hower Tushar Krishna Somayeh Sardashti Rathijit Sen Korey Sewell Muhammad Shoaib Nilay Vaish Mark\u00a0D. Hill and David\u00a0A. Wood. 2011. The gem5 simulator. SIGARCH Comput. Archit. News (2011).","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/2039252.2039255"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2013.6704684"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.16"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2011.21"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/VLSITechnologyandCir46783.2024.10631462"},{"key":"e_1_3_3_3_21_2","unstructured":"NVIDIA Corporation. 2024. NVIDIA Grace Blackwell Superchip. https:\/\/nvidianews.nvidia.com\/news\/nvidia-blackwell-platform-arrives-to-power-a-new-era-of-computing."},{"key":"e_1_3_3_3_22_2","unstructured":"NVIDIA Corporation. 2024. NVIDIA Grace Hopper Superchip. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/nvidia-grace-hopper."},{"key":"e_1_3_3_3_23_2","unstructured":"NVIDIA Corporation. 2024. NVIDIA NVLink. https:\/\/www.nvidia.com\/en-us\/design-visualization\/nvlink-bridges\/."},{"key":"e_1_3_3_3_24_2","unstructured":"NVIDIA Corporation. 2024. NVIDIA NVLink-C2C. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink-c2c\/."},{"key":"e_1_3_3_3_25_2","unstructured":"CXL Consortium. 2022. CXL 3.0 Specification. https:\/\/www.computeexpresslink.org\/download-the-specification."},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"crossref","unstructured":"Debendra Das\u00a0Sharma Robert Blankenship and Daniel Berger. 2024. An Introduction to the Compute Express Link (CXL) Interconnect. ACM Comput. Surv. (2024).","DOI":"10.1145\/3669900"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-61474-5_86"},{"key":"e_1_3_3_3_28_2","volume-title":"IEEE International Solid-State Circuits Conference","author":"Dorsey J.","year":"2007","unstructured":"J. Dorsey, Shawn Searles, M. Ciraula, S. Johnson, N. Bujanos, D. Wu, M. Braganza, S. Meyers, E. Fang, and R. Kumar. 2007. An Integrated Quad-Core Opteron Processor. In IEEE International Solid-State Circuits Conference."},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"crossref","unstructured":"Andr\u00e9s Goens Soham Chakraborty Susmit Sarkar Sukarn Agarwal Nicolai Oswald and Vijay Nagarajan. 2023. Compound Memory Models. Proc. ACM Program. Lang. (2023).","DOI":"10.1145\/3591267"},{"key":"e_1_3_3_3_30_2","volume-title":"MESIF: A two-hop cache coherency protocol for point-to- point interconnects","author":"Goodman J.","year":"2004","unstructured":"J. Goodman and H. Hum. 2004. MESIF: A two-hop cache coherency protocol for point-to- point interconnects. Technical Report 2004-002. Department of Computer Science, University of Auckland."},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2017.7975269"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835930"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541981"},{"key":"e_1_3_3_3_34_2","unstructured":"Intel Corporation. 2024. An Introduction to the Intel QuickPath Interconnect. https:\/\/www.intel.ca\/content\/dam\/doc\/white-paper\/quick-path-interconnect-introduction-paper.pdf."},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538932"},{"key":"e_1_3_3_3_36_2","volume-title":"Proc. USENIX OSDI","author":"Jiang Yimin","year":"2020","unstructured":"Yimin Jiang, Yibo Zhu, Chang Lan, Bairen Yi, Yong Cui, and Chuanxiong Guo. 2020. A unified architecture for accelerating distributed { DNN} training in heterogeneous { GPU\/CPU} clusters. In Proc. USENIX OSDI."},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"crossref","unstructured":"Konstantinos Koukos Alberto Ros Erik Hagersten and Stefanos Kaxiras. 2016. Building Heterogeneous Unified Virtual Memories (UVMs) without the Overhead. ACM Trans. Archit. Code Optim. (2016).","DOI":"10.1145\/2889488"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750421"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483561"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3578835"},{"key":"e_1_3_3_3_41_2","volume-title":"Proc. 2015 IEEE International Conference on Networking, Architecture and Storage (NAS)","author":"Li Peilong","year":"2015","unstructured":"Peilong Li, Yan Luo, Ning Zhang, and Yu Cao. 2015. Heterospark: A heterogeneous cpu\/gpu spark platform for machine learning algorithms. In Proc. 2015 IEEE International Conference on Networking, Architecture and Storage (NAS)."},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/PDCAT.2012.34"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304043"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.7148\/2014-0508"},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00070"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"crossref","unstructured":"Sparsh Mittal and Jeffrey\u00a0S. Vetter. 2015. A Survey of CPU-GPU Heterogeneous Computing Techniques. ACM Comput. Surv. (2015).","DOI":"10.1145\/2788396"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-01764-3"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037715"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00030"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00077"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00061"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-03359-9_27"},{"key":"e_1_3_3_3_53_2","unstructured":"Saptadeep Pal Eiman Ebrahimi Arslan Zulfiqar Yaosheng Fu Victor Zhang Szymon Migacz David Nellans and Puneet Gupta. 2019. Optimizing Multi-GPU Parallelization Strategies for Deep Learning Training. Proc. IEEE\/ACM MICRO (2019)."},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/800015.808204"},{"key":"e_1_3_3_3_55_2","volume-title":"Proc. IEEE\/IFIP DSN","author":"Patil Adarsh","year":"2023","unstructured":"Adarsh Patil, Vijay Nagarajan, Nikos Nikoleris, and Nicolai Oswald. 2023. \u0100pta: Fault-tolerant object-granular CXL disaggregated memory for accelerating FaaS. In Proc. IEEE\/IFIP DSN."},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.5555\/77493"},{"key":"e_1_3_3_3_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540747"},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00054"},{"key":"e_1_3_3_3_59_2","doi-asserted-by":"crossref","unstructured":"Susmit Sarkar Peter Sewell Jade Alglave Luc Maranget and Derek Williams. 2011. Understanding POWER multiprocessors(Proc. ACM PLDI).","DOI":"10.1145\/1993498.1993520"},{"key":"e_1_3_3_3_60_2","volume-title":"Proc. ICPP","author":"Schieffer Gabin","year":"2024","unstructured":"Gabin Schieffer, Jacob Wahlgren, Jie Ren, Jennifer Faj, and Ivy Peng. 2024. Harnessing Integrated CPUg-GPU System Memory for HPC: a first look into Grace Hopper. In Proc. ICPP."},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","DOI":"10.1145\/3617232.3624868"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"crossref","unstructured":"Debendra\u00a0Das Sharma. 2024. PCI-Express: Evolution of a Ubiquitous Load-Store Interconnect Over Two Decades and the Path Forward for the Next Two Decades. IEEE Circuits and Systems Magazine (2024).","DOI":"10.1109\/MCAS.2024.3373556"},{"key":"e_1_3_3_3_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3636480.3637097"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00068"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"crossref","unstructured":"J. Stuecheli B. Blaner C.\u00a0R. Johns and M.\u00a0S. Siegel. 2015. CAPI: A Coherent Accelerator Processor Interface. IBM Journal of Research and Development (2015).","DOI":"10.1147\/JRD.2014.2380198"},{"key":"e_1_3_3_3_66_2","unstructured":"Yifan Sun Nicolas\u00a0Bohm Agostini Shi Dong and David\u00a0R. Kaeli. 2019. Summarizing CPU and GPU Design Trends with Product Data. CoRR (2019)."},{"key":"e_1_3_3_3_67_2","unstructured":"U.S. Department of Energy. 2014. Characterization of the DOE Mini-apps. https:\/\/portal.nersc.gov\/project\/CAL\/doe-miniapps.htm."},{"key":"e_1_3_3_3_68_2","doi-asserted-by":"publisher","DOI":"10.1109\/MCHPC56545.2022.00007"},{"key":"e_1_3_3_3_69_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00025"},{"key":"e_1_3_3_3_70_2","doi-asserted-by":"crossref","unstructured":"Chenyang Zhang Feng Zhang Xiaoguang Guo Bingsheng He Xiao Zhang and Xiaoyong Du. 2020. iMLBench: A machine learning benchmark suite for CPU-GPU integrated architectures. IEEE Transactions on Parallel and Distributed Systems (2020).","DOI":"10.1109\/TPDS.2020.3046870"},{"key":"e_1_3_3_3_71_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613135"},{"key":"e_1_3_3_3_72_2","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539177"}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"SIGARCH '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731074","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731074","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:05:57Z","timestamp":1750503957000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731074"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":71,"alternative-id":["10.1145\/3695053.3731074","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731074","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}