{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T07:51:41Z","timestamp":1780473101435,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":75,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2047516,2146873,2230944,2403088,2428108,2341378,2333899,2333895,2334273,2402942"],"award-info":[{"award-number":["2047516,2146873,2230944,2403088,2428108,2341378,2333899,2333895,2334273,2402942"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"HORIZON EUROPE","award":["101092912"],"award-info":[{"award-number":["101092912"]}]},{"name":"Commonwealth Cyber Initiative","award":["HC-3Q24-047"],"award-info":[{"award-number":["HC-3Q24-047"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3725774","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"205-220","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["TMModel: Modeling Texture Memory and Mobile GPU Performance to Accelerate DNN Computations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5274-9169","authenticated-orcid":false,"given":"Jiexiong","family":"Guan","sequence":"first","affiliation":[{"name":"University of Thessaly, Volos, Greece and William &amp; Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4892-755X","authenticated-orcid":false,"given":"Zhenqing","family":"Hu","sequence":"additional","affiliation":[{"name":"William &amp; Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6486-062X","authenticated-orcid":false,"given":"Christos D.","family":"Antonopoulos","sequence":"additional","affiliation":[{"name":"University of Thessaly, Volos, Greece"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9522-9136","authenticated-orcid":false,"given":"Nikolaos","family":"Bellas","sequence":"additional","affiliation":[{"name":"University of Thessaly, Volos, Greece"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2232-3559","authenticated-orcid":false,"given":"Spyros","family":"Lalis","sequence":"additional","affiliation":[{"name":"University of Thessaly, Volos, Greece"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8754-581X","authenticated-orcid":false,"given":"Evgenia","family":"Smirni","sequence":"additional","affiliation":[{"name":"William &amp; Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4425-9837","authenticated-orcid":false,"given":"Gang","family":"Zhou","sequence":"additional","affiliation":[{"name":"William &amp; Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2609-1428","authenticated-orcid":false,"given":"Gagan","family":"Agrawal","sequence":"additional","affiliation":[{"name":"University of Georgia, Athens, GA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4116-5237","authenticated-orcid":false,"given":"Bin","family":"Ren","sequence":"additional","affiliation":[{"name":"William &amp; Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2022. Adreno OpenCL Machine Learning SDK v3.0. https:\/\/developer.qualcomm.com\/downloads\/adreno-opencl-machine-learning-sdk-v30."},{"key":"e_1_3_3_2_3_2","first-page":"265","volume-title":"OSDI 2016","author":"Abadi Martin","year":"2016","unstructured":"Martin Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek\u00a0G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A system for large-scale machine learning. In OSDI 2016. USENIX Association, USA, 265\u2013283."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392761"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476221"},{"key":"e_1_3_3_2_6_2","unstructured":"ARM.2024. Arm Immortalis and Mali GPU OpenCL Developer Guide. (2024)."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3650200.3656630"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483553"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330355"},{"key":"e_1_3_3_2_10_2","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. { TVM} : An automated { End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00490"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651369"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Thanh\u00a0Tuan Dao Jungwon Kim Sangmin Seo Bernhard Egger and Jaejin Lee. 2014. A performance model for GPUs with caches. IEEE Transactions on Parallel and Distributed Systems 26 7 (2014) 1800\u20131813.","DOI":"10.1109\/TPDS.2014.2333526"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/PerComWorkshops53856.2022.9767442"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/781131.781159"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Michael Doggett. 2012. Texture caches. IEEE Micro 32 3 (2012) 136\u2013141.","DOI":"10.1109\/MM.2012.44"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_25"},{"key":"e_1_3_3_2_18_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Minquan Fang Jianbin Fang Weimin Zhang Haifang Zhou Jianxing Liao and Yuangang Wang. 2018. Benchmarking the GPU memory at the warp level. Parallel Comput. 71 (2018) 23\u201341.","DOI":"10.1016\/j.parco.2017.11.003"},{"key":"e_1_3_3_2_20_2","unstructured":"Trevor Gale Deepak Narayanan Cliff Young and Matei Zaharia. 2023. Megablocks: Efficient sparse training with mixture-of-experts. Proceedings of Machine Learning and Systems 5 (2023) 288\u2013304."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3539765"},{"key":"e_1_3_3_2_22_2","unstructured":"Google. 2023. Tensorflow XLA. https:\/\/www.tensorflow.org\/xla."},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/264107.264152"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00047"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Mark\u00a0D Hill and Alan\u00a0Jay Smith. 1989. Evaluating associativity in CPU caches. IEEE Trans. Comput. 38 12 (1989) 1612\u20131630.","DOI":"10.1109\/12.40842"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/285305.285321"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623779"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071051"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Byunghyun Jang Dana Schaa Perhaad Mistry and David Kaeli. 2010. Exploiting memory access patterns to improve memory performance in data-parallel architectures. IEEE Transactions on Parallel and Distributed Systems 22 1 (2010) 105\u2013118.","DOI":"10.1109\/TPDS.2010.107"},{"key":"e_1_3_3_2_31_2","unstructured":"Xiaotang Jiang Huan Wang Yiliu Chen Ziqi Wu Lichuan Wang Bin Zou Yafeng Yang Zongyang Cui Yu Cai Tianhang Yu Chengfei Lv and Zhihua Wu. 2020. MNN: A universal and efficient inference engine. Proceedings of Machine Learning and Systems 2 (2020) 1\u201313."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-11970-5_15"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"Mohsen Kiani and Amir Rajabzadeh. 2018. Efficient cache performance modeling in GPUs using reuse distance analysis. ACM Transactions on Architecture and Code Optimization (TACO) 15 4 (2018) 1\u201324.","DOI":"10.1145\/3291051"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358252"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2014.7040988"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.435"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Yann LeCun L\u00e9on Bottou Yoshua Bengio and Patrick Haffner. 1998. Gradient-based learning applied to document recognition. Proc. IEEE 86 11 (1998) 2278\u20132324.","DOI":"10.1109\/5.726791"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00072"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3495243.3517020"},{"key":"e_1_3_3_2_41_2","unstructured":"Ji Lin Ligeng Zhu Wei-Ming Chen Wei-Chen Wang Chuang Gan and Song Han. 2022. On-device training under 256kb memory. Advances in Neural Information Processing Systems 35 (2022) 22941\u201322954."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Weile Luo Ruibo Fan Zeyu Li Dayou Du Qiang Wang and Xiaowen Chu. 2024. Benchmarking and dissecting the nvidia hopper gpu architecture. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13499 (2024).","DOI":"10.1109\/IPDPS57955.2024.00064"},{"key":"e_1_3_3_2_43_2","first-page":"1273","volume-title":"Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS)","volume":"54","author":"McMahan H.\u00a0Brendan","year":"2017","unstructured":"H.\u00a0Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, and Blaise Aguera\u00a0y Arcas. 2017. Communication-Efficient Learning of Deep Networks from Decentralized Data. In Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS) , Vol.\u00a054. 1273\u20131282."},{"key":"e_1_3_3_2_44_2","unstructured":"MediaTek. 2023. MediaTek Dimensity 1100. https:\/\/www.mediatek.com\/products\/tablets\/mediatek-dimensity-1100."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Xinxin Mei and Xiaowen Chu. 2016. Dissecting GPU memory hierarchy through microbenchmarking. IEEE Transactions on Parallel and Distributed Systems 28 1 (2016) 72\u201386.","DOI":"10.1109\/TPDS.2016.2549523"},{"key":"e_1_3_3_2_46_2","unstructured":"Ji\u00a0Joong Moon Parichay Kapoor Ji\u00a0Hoon Lee Myung\u00a0Joo Ham and Hyun\u00a0Suk Lee. 2022. NNTrainer: Light-weight on-device training framework. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2206.04688 (2022)."},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2012.117"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Wei Niu Zhengang Li Xiaolong Ma Peiyan Dong Gang Zhou Xuehai Qian Xue Lin Yanzhi Wang and Bin Ren. 2021. Grim: A general real-time deep learning inference framework for mobile devices based on fine-grained structured weight sparsity. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 10 (2021) 6224\u20136239.","DOI":"10.1109\/TPAMI.2021.3089687"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651384"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835955"},{"key":"e_1_3_3_2_52_2","unstructured":"Nvidia.2024. CUDA C++ Programming Guide. (2024)."},{"key":"e_1_3_3_2_53_2","unstructured":"Nvidia.2024. Hopper Tuning Guide. (2024)."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3061639.3062320"},{"key":"e_1_3_3_2_55_2","unstructured":"Qualcomm. 2016. Snapdragon Profiler. https:\/\/developer.qualcomm.com\/software\/snapdragon-profiler."},{"key":"e_1_3_3_2_56_2","unstructured":"Qualcomm.2023. Qualcomm Snapdragon Mobile Platform OpenCL General Programming and Optimization. (2023)."},{"key":"e_1_3_3_2_57_2","unstructured":"Qualcomm. 2023. Snapdragon Gen2. https:\/\/en.wikipedia.org\/wiki\/List_of_Qualcomm_Snapdragon_systems_on_chips."},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_3_2_59_2","unstructured":"Joseph Redmon. 2013\u20132016. Darknet: Open Source Neural Networks in C. http:\/\/pjreddie.com\/darknet\/."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"crossref","unstructured":"Rafael\u00a0H Saavedra and Alan\u00a0Jay Smith. 1995. Measuring cache and TLB performance and their effect on benchmark runtimes. IEEE Trans. Comput. 44 10 (1995) 1223\u20131235.","DOI":"10.1109\/12.467697"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"crossref","unstructured":"Yuki Sugimoto Fumihiko Ino and Kenichi Hagihara. 2014. Improving cache locality for GPU-based volume rendering. Parallel Comput. 40 5-6 (2014) 59\u201369.","DOI":"10.1016\/j.parco.2014.03.013"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2011.16"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"e_1_3_3_2_65_2","volume-title":"Understanding latency hiding on GPUs","author":"Volkov Vasily","year":"2016","unstructured":"Vasily Volkov. 2016. Understanding latency hiding on GPUs. University of California, Berkeley."},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.5555\/1413370.1413402"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00085"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538928"},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"crossref","unstructured":"Samuel Williams Andrew Waterman and David Patterson. 2009. Roofline: an insightful visual performance model for multicore architectures. Commun. ACM 52 4 (2009) 65\u201376.","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2010.5452013"},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"crossref","unstructured":"Bo Wu Zhijia Zhao Eddy\u00a0Zheng Zhang Yunlian Jiang and Xipeng Shen. 2013. Complexity analysis and algorithm design for reorganizing data to minimize non-coalesced memory accesses on gpu. ACM SIGPLAN Notices 48 8 (2013) 57\u201368.","DOI":"10.1145\/2517327.2442523"},{"key":"e_1_3_3_2_73_2","first-page":"1203","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Xie Zhen","year":"2024","unstructured":"Zhen Xie, Murali Emani, Xiaodong Yu, Dingwen Tao, Xin He, Pengfei Su, Keren Zhou, and Venkatram Vishwanath. 2024. Centimani: Enabling Fast { AI} Accelerator Selection for { DNN} Training with a Novel Performance Predictor. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 1203\u20131221."},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"publisher","DOI":"10.1145\/3495243.3560545"},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid59990.2024.00031"},{"key":"e_1_3_3_2_76_2","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569674"},{"key":"e_1_3_3_2_77_2","unstructured":"Geng Yuan Xiaolong Ma Wei Niu Zhengang Li Zhenglun Kong Ning Liu Yifan Gong Zheng Zhan Chaoyang He Qing Jin et\u00a0al. 2021. Mest: Accurate and fast memory-economic sparse training framework on the edge. Advances in Neural Information Processing Systems 34 (2021) 20838\u201320850."},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458864.3467882"},{"key":"e_1_3_3_2_79_2","first-page":"863","volume-title":"14th USENIX symposium on operating systems design and implementation (OSDI 20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody\u00a0Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2020. Ansor: Generating { High-Performance} tensor programs for deep learning. In 14th USENIX symposium on operating systems design and implementation (OSDI 20). 863\u2013879."},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_3_2_81_2","first-page":"177","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Zhou Qihua","year":"2021","unstructured":"Qihua Zhou, Song Guo, Zhihao Qu, Jingcai Guo, Zhenda Xu, Jiewei Zhang, Tao Guo, Boyuan Luo, and Jingren Zhou. 2021. Octo:{ INT8} training with loss-aware compensation and backward quantization for tiny on-device learning. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 177\u2013191."},{"key":"e_1_3_3_2_82_2","first-page":"233","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. 2022. { ROLLER} : Fast and efficient tensor compilation for deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 233\u2013248."}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3725774","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3725774","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:02:47Z","timestamp":1755867767000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3725774"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":81,"alternative-id":["10.1145\/3721145.3725774","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3725774","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}