{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T04:58:53Z","timestamp":1750309133941,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,7]],"date-time":"2024-05-07T00:00:00Z","timestamp":1715040000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Wallenberg AI, Autonomous Systems and Software Program (WASP)","award":["372202-37200072"],"award-info":[{"award-number":["372202-37200072"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,7]]},"DOI":"10.1145\/3649153.3649196","type":"proceedings-article","created":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T10:21:29Z","timestamp":1719915689000},"page":"126-137","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DNNOPT: A Framework for Efficiently Selecting On-chip Memory Loop Optimizations of DNN Accelerators"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7006-2376","authenticated-orcid":false,"given":"Piyumal","family":"Ranawaka","sequence":"first","affiliation":[{"name":"Chalmers University of Technology, Gothenburg, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0477-4540","authenticated-orcid":false,"given":"Muhammad Waqar","family":"Azhar","sequence":"additional","affiliation":[{"name":"Chalmers University of Technology, Gothenburg, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7441-8245","authenticated-orcid":false,"given":"Per","family":"Stenstrom","sequence":"additional","affiliation":[{"name":"Chalmers University of Technology, Gothenburg, Sweden"}]}],"member":"320","published-online":{"date-parts":[[2024,7,2]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2023. DeepBench. https:\/\/github.com\/baidu-research\/DeepBench#inference-benchmark"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322967"},{"key":"e_1_3_2_1_3_1","volume-title":"Compilers: Principles, Techniques, & Tools. Pearson Education India.","author":"Aho Alfred V","year":"2007","unstructured":"Alfred V Aho, Monica S Lam, Ravi Sethi, and Jeffrey D Ullman. 2007. Compilers: Principles, Techniques, & Tools. Pearson Education India."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628092"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/197405.197406"},{"key":"e_1_3_2_1_6_1","unstructured":"Riyadh Baghdadi Abdelkader Nadir Debbagh Kamel Abdous Benhamida Fatima Zohra Alex Renda Jonathan Elliott Frankle Michael Carbin and Saman Amarasinghe. [n. d.]. TIRAMISU: A Polyhedral Compiler for Dense and Sparse Deep Learning. ([n. d.])."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/3314872.3314896"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2896389"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2896389"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Amirali Boroumand Saugata Ghose Berkin Akin Ravi Narayanaswami Geraldo F Oliveira Xiaoyu Ma Eric Shiu and Onur Mutlu. 2021. Google Neural Network Models for Edge Devices: Analyzing and Mitigating Machine Learning Inference Bottlenecks. In 2021 30th International Conference on Parallel Architectures and Compilation Techniques (PACT). IEEE 159--172.","DOI":"10.1109\/PACT52795.2021.00019"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3066883"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/195470.195557"},{"key":"e_1_3_2_1_15_1","volume-title":"T-YOLO: Tiny Vehicle Detection Based on YOLO and Multi-Scale Convolutional Neural Networks","author":"Carrasco Daniel Padilla","year":"2021","unstructured":"Daniel Padilla Carrasco, Hatem A Rashwan, Miguel \u00c1ngel Garc\u00eda, and Dom\u00e8nec Puig. 2021. T-YOLO: Tiny Vehicle Detection Based on YOLO and Multi-Scale Convolutional Neural Networks. IEEE Access (2021)."},{"key":"e_1_3_2_1_16_1","first-page":"1","article-title":"Marvel: A Data-centric Approach for Mapping Deep Learning Operators on Spatial Accelerators","volume":"19","author":"Chatarasi Prasanth","year":"2021","unstructured":"Prasanth Chatarasi, Hyoukjun Kwon, Angshuman Parashar, Michael Pellauer, Tushar Krishna, and Vivek Sarkar. 2021. Marvel: A Data-centric Approach for Mapping Deep Learning Operators on Spatial Accelerators. ACM Transactions on Architecture and Code Optimization (TACO) 19, 1 (2021), 1--26.","journal-title":"ACM Transactions on Architecture and Code Optimization (TACO)"},{"key":"e_1_3_2_1_17_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. {TVM}: An Automated {End-to-End} Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578--594."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3358198"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the First International Workshop on Polyhedral Compilation Techniques (IMPACT)","volume":"2011","author":"Grosser Tobias","year":"2011","unstructured":"Tobias Grosser, Hongbin Zheng, Raghesh Aloor, Andreas Simb\u00fcrger, Armin Gr\u00f6\u00dflinger, and Louis-No\u00ebl Pouchet. 2011. Polly-Polyhedral optimization in LLVM. In Proceedings of the First International Workshop on Polyhedral Compilation Techniques (IMPACT), Vol. 2011. 1."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the First International Workshop on Polyhedral Compilation Techniques (IMPACT)","volume":"2011","author":"Grosser Tobias","year":"2011","unstructured":"Tobias Grosser, Hongbin Zheng, Raghesh Aloor, Andreas Simb\u00fcrger, Armin Gr\u00f6\u00dflinger, and Louis-No\u00ebl Pouchet. 2011. Polly-Polyhedral optimization in LLVM. In Proceedings of the First International Workshop on Polyhedral Compilation Techniques (IMPACT), Vol. 2011. 1."},{"key":"e_1_3_2_1_23_1","unstructured":"Mehmet G\u00fcnel. 2016. GoogLeNet."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS55109.2022.00039"},{"key":"e_1_3_2_1_25_1","volume-title":"Mobilenets: Efficient Convolutional Neural Networks for Mobile Vision Applications. arXiv preprint arXiv:1704.04861","author":"Howard Andrew G","year":"2017","unstructured":"Andrew G Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, and Hartwig Adam. 2017. Mobilenets: Efficient Convolutional Neural Networks for Mobile Vision Applications. arXiv preprint arXiv:1704.04861 (2017)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00050"},{"key":"e_1_3_2_1_27_1","first-page":"1","article-title":"Beyond Data and Model Parallelism for Deep Neural Networks","volume":"1","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. 2019. Beyond Data and Model Parallelism for Deep Neural Networks. Proceedings of Machine Learning and Systems 1 (2019), 1--13.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3400302.3415639"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2985963"},{"key":"e_1_3_2_1_30_1","volume-title":"MAESTRO: An Open-Source Infrastructure for the Cost-Benefit Analysis of Dataflows within Deep Learning Accelerators.","author":"Kwon Hyoukjun","year":"2018","unstructured":"Hyoukjun Kwon and Tushar Krishna. 2018. MAESTRO: An Open-Source Infrastructure for the Cost-Benefit Analysis of Dataflows within Deep Learning Accelerators. (2018)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/989393.989437"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3030548"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446759"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3020078.3021736"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3059962"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578360.3580257"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925952"},{"key":"e_1_3_2_1_38_1","first-page":"28","article-title":"CACTI 6.0: A Tool to Model Large Caches","volume":"27","author":"Muralimanohar Naveen","year":"2009","unstructured":"Naveen Muralimanohar, Rajeev Balasubramonian, and Norman P Jouppi. 2009. CACTI 6.0: A Tool to Model Large Caches. HP Laboratories 27 (2009), 28.","journal-title":"HP Laboratories"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"e_1_3_2_1_40_1","volume-title":"Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 169--174","author":"Peemen Maurice","year":"2015","unstructured":"Maurice Peemen, Bart Mesman, and Henk Corporaal. 2015. Inter-tile reuse optimization applied to bandwidth constrained embedded accelerators. In 2015 Design, Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 169--174."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2021.3060509"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3150211"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2017.7927162"},{"key":"e_1_3_2_1_44_1","volume-title":"Mlperf Inference Benchmark. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). IEEE, 446--459","author":"Reddi Vijay Janapa","year":"2020","unstructured":"Vijay Janapa Reddi, Christine Cheng, David Kanter, Peter Mattson, Guenther Schmuelling, Carole-Jean Wu, Brian Anderson, Maximilien Breughe, Mark Charlebois, William Chou, et al. 2020. Mlperf Inference Benchmark. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). IEEE, 446--459."},{"key":"e_1_3_2_1_45_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. [n. d.]. Faster R-CNN: Towards Real-Time Object Detection With Region Proposal Networks. Advances in Neural Information Processing Systems 28 ([n. d.])."},{"key":"e_1_3_2_1_46_1","volume-title":"Xla: Compiling machine learning for peak performance.","author":"Sabne Amit","year":"2020","unstructured":"Amit Sabne. 2020. Xla: Compiling machine learning for peak performance. (2020)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00016"},{"key":"e_1_3_2_1_48_1","volume-title":"Memory Requirements for Convolutional Neural Network Hardware Accelerators. In 2018 IEEE International Symposium on Workload Characterization (IISWC). IEEE, 111--121","author":"Siu Kevin","year":"2018","unstructured":"Kevin Siu, Dylan Malone Stuart, Mostafa Mahmoud, and Andreas Moshovos. 2018. Memory Requirements for Convolutional Neural Network Hardware Accelerators. In 2018 IEEE International Symposium on Workload Characterization (IISWC). IEEE, 111--121."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2022.12.008"},{"key":"e_1_3_2_1_50_1","volume-title":"Optimally Scheduling CNN Convolutions for Efficient Memory Access. arXiv preprint arXiv:1902.01492","author":"Stoutchinin Arthur","year":"2019","unstructured":"Arthur Stoutchinin, Francesco Conti, and Luca Benini. 2019. Optimally Scheduling CNN Convolutions for Efficient Memory Access. arXiv preprint arXiv:1902.01492 (2019)."},{"key":"e_1_3_2_1_51_1","volume-title":"Loma: Fast Auto-scheduling on DNN Accelerators Through Loop-Order-based Memory Allocation. In 2021 IEEE 3rd International Conference on Artificial Intelligence Circuits and Systems (AICAS)","author":"Symons Arne","year":"2021","unstructured":"Arne Symons, Linyan Mei, and Marian Verhelst. 2021. Loma: Fast Auto-scheduling on DNN Accelerators Through Loop-Order-based Memory Allocation. In 2021 IEEE 3rd International Conference on Artificial Intelligence Circuits and Systems (AICAS). IEEE, 1--4."},{"key":"e_1_3_2_1_52_1","volume-title":"AI Powered Compiler Techniques for DL Code Optimization. arXiv preprint arXiv:2104.05573","author":"Tavarageri Sanket","year":"2021","unstructured":"Sanket Tavarageri, Gagandeep Goyal, Sasikanth Avancha, Bharat Kaul, and Ramakrishna Upadrasta. 2021. AI Powered Compiler Techniques for DL Code Optimization. arXiv preprint arXiv:2104.05573 (2021)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_54_1","volume-title":"Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions. arXiv preprint arXiv:1802.04730","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache, Oleksandr Zinenko, Theodoros Theodoridis, Priya Goyal, Zachary DeVito, William S Moses, Sven Verdoolaege, Andrew Adams, and Albert Cohen. 2018. Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions. arXiv preprint arXiv:1802.04730 (2018)."},{"key":"e_1_3_2_1_55_1","volume-title":"Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions. arXiv preprint arXiv:1802.04730","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache, Oleksandr Zinenko, Theodoros Theodoridis, Priya Goyal, Zachary DeVito, William S Moses, Sven Verdoolaege, Andrew Adams, and Albert Cohen. 2018. Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions. arXiv preprint arXiv:1802.04730 (2018)."},{"key":"e_1_3_2_1_56_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD45719.2019.8942127"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.683"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD45719.2019.8942149"},{"key":"e_1_3_2_1_60_1","volume-title":"Energy-efficient Scheduling Method with Cross-loop Model for Resource-limited CNN Accelerator Designs. In 2017 IEEE International Symposium on Circuits and Systems (ISCAS). IEEE, 1--4.","author":"Yang Kaiyi","year":"2017","unstructured":"Kaiyi Yang, Shihao Wang, Jianbin Zhou, and Takeshi Yoshimura. 2017. Energy-efficient Scheduling Method with Cross-loop Model for Resource-limited CNN Accelerator Designs. In 2017 IEEE International Symposium on Circuits and Systems (ISCAS). IEEE, 1--4."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378514"},{"key":"e_1_3_2_1_62_1","volume-title":"Nikhil Bhagdikar, Stephen Richardson, Shahar Kvatinsky, Jonathan Ragan-Kelley, Ardavan Pedram, and Mark Horowitz.","author":"Yang Xuan","year":"2016","unstructured":"Xuan Yang, Jing Pu, Blaine Burton Rister, Nikhil Bhagdikar, Stephen Richardson, Shahar Kvatinsky, Jonathan Ragan-Kelley, Ardavan Pedram, and Mark Horowitz. 2016. A Systematic Approach to Blocking Convolutional Neural Networks. arXiv preprint arXiv:1606.04209 (2016)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2020.2983694"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/2684746.2689060"}],"event":{"name":"CF '24: 21st ACM International Conference on Computing Frontiers","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"],"location":"Ischia Italy","acronym":"CF '24"},"container-title":["Proceedings of the 21st ACM International Conference on Computing Frontiers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649153.3649196","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649153.3649196","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:50:02Z","timestamp":1750287002000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649153.3649196"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,7]]},"references-count":64,"alternative-id":["10.1145\/3649153.3649196","10.1145\/3649153"],"URL":"https:\/\/doi.org\/10.1145\/3649153.3649196","relation":{},"subject":[],"published":{"date-parts":[[2024,5,7]]},"assertion":[{"value":"2024-07-02","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}