{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,5]],"date-time":"2026-02-05T06:17:53Z","timestamp":1770272273387,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","funder":[{"name":"Strategic Priority Research Program of Chinese Academy of Sciences","award":["No.XDB0500102"],"award-info":[{"award-number":["No.XDB0500102"]}]},{"name":"Laoshan Laboratory","award":["No.LSKJ202300305"],"award-info":[{"award-number":["No.LSKJ202300305"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3730421","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"975-990","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["CIExplorer: Microarchitecture-Aware Exploration for Tightly Integrated Custom Instruction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4115-9475","authenticated-orcid":false,"given":"Xiaoyu","family":"Hao","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6494-8502","authenticated-orcid":false,"given":"Sen","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3366-9881","authenticated-orcid":false,"given":"Liang","family":"Qiao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9729-8821","authenticated-orcid":false,"given":"Qingcai","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9888-6238","authenticated-orcid":false,"given":"Jun","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6487-3658","authenticated-orcid":false,"given":"Junshi","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and Laoshan Laboratory, Qingdao, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3900-3722","authenticated-orcid":false,"given":"Hong","family":"An","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and Laoshan Laboratory, Qingdao, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3385-2053","authenticated-orcid":false,"given":"Xulong","family":"Tang","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6161-2075","authenticated-orcid":false,"given":"Hao","family":"Shu","sequence":"additional","affiliation":[{"name":"NIO, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8429-2758","authenticated-orcid":false,"given":"Honghui","family":"Yuan","sequence":"additional","affiliation":[{"name":"NIO, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2018. SPEC CPU 2006. https:\/\/www.spec.org\/cpu2006\/. Accessed: 2024-12."},{"key":"e_1_3_3_2_3_2","unstructured":"2020. Arm Custom Instructions: Enabling Innovation and Greater Flexibility on Arm. https:\/\/armkeil.blob.core.windows.net\/developer\/Files\/pdf\/white-paper\/arm-custom-instructions-wp.pdf."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304062"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"crossref","unstructured":"Junwhan Ahn and Kiyoung Choi. 2012. Isomorphism-aware identification of custom instructions with I\/O serialization. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 32 1 (2012) 34\u201346.","DOI":"10.1109\/TCAD.2012.2214033"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/1084834.1084880"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Kubilay Atasu Wayne Luk Oskar Mencer Can Ozturan and G\u00fcnhan Dundar. 2010. FISH: Fast instruction synthesis for custom processors. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 20 1 (2010) 52\u201365.","DOI":"10.1109\/TVLSI.2010.2090543"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/775832.775897"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614289"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Paolo Bonzini and Laura Pozzi. 2008. Recurrence-aware instruction set selection for extensible embedded processors. IEEE transactions on very large scale integration (VLSI) systems 16 10 (2008) 1259\u20131267.","DOI":"10.1109\/TVLSI.2008.2001863"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Iulian Brumar Georgios Zacharopoulos Yuan Yao Saketh Rama David Brooks and Gu-Yeon Wei. 2023. Early dse and automatic generation of coarse-grained merged accelerators. ACM Transactions on Embedded Computing Systems 22 2 (2023) 1\u201329.","DOI":"10.1145\/3546070"},{"key":"e_1_3_3_2_12_2","unstructured":"Christopher Celio Palmer Dabbelt David\u00a0A Patterson and Krste Asanovi\u0107. 2016. The renewed case for the reduced instruction set computer: Avoiding isa bloat with macro-op fusion for risc-v. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1607.02318 (2016)."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2005.9"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/1176760.1176779"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2004.5"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2003.1253189"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Nathan\u00a0T Clark Hongtao Zhong and Scott\u00a0A Mahlke. 2005. Automated custom instruction generation for domain-specific processor acceleration. IEEE Trans. Comput. 54 10 (2005) 1258\u20131270.","DOI":"10.1109\/TC.2005.156"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/968280.968307"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530432"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-92990-1_24"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Brian Fields Rastislav Bodik and Mark\u00a0D Hill. 2002. Slack: Maximizing performance under technological constraints. ACM SIGARCH Computer Architecture News 30 2 (2002) 47\u201358.","DOI":"10.1145\/545214.545222"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.1996.566472"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-73625-7_30"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Emanuele Giaquinta Anadi Mishra and Laura Pozzi. 2015. Maximum convex subgraphs under I\/O constraint for automatic identification of custom instructions. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 34 3 (2015) 483\u2013494.","DOI":"10.1109\/TCAD.2014.2387375"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Vikkitharan Gnanasambandapillai Jorgen Peddersen Roshan Ragel and Sri Parameswaran. 2020. Finder: Find efficient parallel instructions for asips to improve performance of large applications. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 39 11 (2020) 3577\u20133588.","DOI":"10.1109\/TCAD.2020.3012211"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532390"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Ricardo\u00a0E Gonzalez. 2000. Xtensa: A configurable and extensible processor. IEEE micro 20 2 (2000) 60\u201370.","DOI":"10.1109\/40.848473"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Venkatraman Govindaraju Chen-Han Ho Tony Nowatzki Jatin Chhugani Nadathur Satish Karthikeyan Sankaralingam and Changkyu Kim. 2012. Dyser: Unifying functionality and parallelism specialization for energy-efficient computing. IEEE Micro 32 5 (2012) 38\u201351.","DOI":"10.1109\/MM.2012.51"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155623"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Juris Hartmanis. 1982. Computers and intractability: a guide to the theory of np-completeness (michael r. garey and david s. johnson). Siam Review 24 1 (1982) 90.","DOI":"10.1137\/1024022"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASPDAC.2014.6742893"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Eslam Hussein Bernd Waschneck and Christian Mayr. 2024. Automating application-driven customization of ASIPs: A survey. Journal of Systems Architecture (2024) 103080.","DOI":"10.1016\/j.sysarc.2024.103080"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.5555\/1204936"},{"key":"e_1_3_3_2_34_2","unstructured":"Intel Corporation. 2024. Intel\u00aeIntrinsics Guide. https:\/\/www.intel.com\/content\/www\/us\/en\/docs\/intrinsics-guide\/index.html. Accessed: 2024-06-13."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_3_2_36_2","unstructured":"Hyungyo Kim Gaohan Ye Nachuan Wang Amir Yazdanbakhsh and Nam\u00a0Sung Kim. 2024. Exploiting Intel\u00ae Advanced Matrix Extensions (AMX) for Large Language Model Inference. IEEE Computer Architecture Letters (2024)."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.59"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS49563.2019.00006"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","first-page":"330","DOI":"10.1109\/MICRO.1997.645830","volume-title":"Proceedings of 30th Annual International Symposium on Microarchitecture","author":"Lee Chunho","year":"1997","unstructured":"Chunho Lee, Miodrag Potkonjak, and William\u00a0H Mangione-Smith. 1997. Mediabench: A tool for evaluating and synthesizing multimedia and communications systems. In Proceedings of 30th Annual International Symposium on Microarchitecture. IEEE, 330\u2013335."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/1669112.1669172"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Tao Li Wu Jigang Siew-Kei Lam Thambipillai Srikanthan and Xicheng Lu. 2010. Selecting profitable custom instructions for reconfigurable processors. Journal of Systems Architecture 56 8 (2010) 340\u2013351.","DOI":"10.1016\/j.sysarc.2010.04.004"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Mahim Mishra Timothy\u00a0J Callahan Tiberiu Chelcea Girish Venkataramani Seth\u00a0C Goldstein and Mihai Budiu. 2006. Tartan: evaluating spatial computation for whole program execution. ACM SIGARCH Computer Architecture News 34 5 (2006) 163\u2013174.","DOI":"10.1145\/1168919.1168878"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/581199.581210"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Nahri Moreano Edson Borin Cid De\u00a0Souza and Guido Araujo. 2005. Efficient datapath merging for partially reconfigurable architectures. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 24 7 (2005) 969\u2013980.","DOI":"10.1109\/TCAD.2005.850844"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750380"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Tony Nowatzki Venkatraman Govindaraju and Karthikeyan Sankaralingam. 2015. A graph-based program representation for analyzing hardware specialization approaches. IEEE Computer Architecture Letters 14 2 (2015) 94\u201398.","DOI":"10.1109\/LCA.2015.2476801"},{"key":"e_1_3_3_2_48_2","unstructured":"NVIDIA Corporation. 2017. NVIDIA Tesla V100 GPU Architecture. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/VLSID.2007.40"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Laura Pozzi Kubilay Atasu and Paolo Ienne. 2006. Exact and approximate algorithms for the extension of embedded processor instruction sets. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 25 7 (2006) 1209\u20131229.","DOI":"10.1109\/TCAD.2005.855950"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/1086297.1086300"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661174"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"crossref","unstructured":"Yakun\u00a0Sophia Shao Brandon Reagen Gu-Yeon Wei and David Brooks. 2014. Aladdin: A pre-rtl power-performance accelerator simulator enabling large design space exploration of customized architectures. ACM SIGARCH Computer Architecture News 42 3 (2014) 97\u2013108.","DOI":"10.1145\/2678373.2665689"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783752"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Timothy Sherwood Erez Perelman Greg Hamerly and Brad Calder. 2002. Automatically characterizing large scale program behavior. ACM SIGPLAN Notices 37 10 (2002) 45\u201357.","DOI":"10.1145\/605432.605403"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"crossref","unstructured":"Shaojie Tan Qingcai Jiang Zhenwei Cao Xiaoyu Hao Junshi Chen and Hong An. 2024. Uncovering the performance bottleneck of modern HPC processor with static code analyzer: a case study on Kunpeng 920. CCF Transactions on High Performance Computing 6 3 (2024) 343\u2013364.","DOI":"10.1007\/s42514-023-00160-0"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480094"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/1289881.1289905"},{"key":"e_1_3_3_2_59_2","unstructured":"Shanshan Wang and Chenglong Xiao. 2023. Reinforcement Learning for Selecting Custom Instructions under Area Constraint. IEEE Transactions on Artificial Intelligence (2023)."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"crossref","unstructured":"Shanshan Wang Chenglong Xiao Wanjun Liu and Emmanuel Casseau. 2016. A comparison of heuristic algorithms for custom instruction selection. Microprocessors and Microsystems 45 (2016) 176\u2013186.","DOI":"10.1016\/j.micpro.2016.05.001"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"crossref","unstructured":"Chenglong Xiao and Emmanuel Casseau. 2012. Exact custom instruction enumeration for extensible processors. Integration 45 3 (2012) 263\u2013270.","DOI":"10.1016\/j.vlsi.2011.11.011"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"crossref","unstructured":"Chenglong Xiao Emmanuel Casseau Shanshan Wang and Wanjun Liu. 2014. Automatic custom instruction identification for application-specific instruction set processors. Microprocessors and Microsystems 38 8 (2014) 1012\u20131024.","DOI":"10.1016\/j.micpro.2014.09.001"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD46524.2019.00024"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-22047-5_21"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1109\/SASP.2008.4570779"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"crossref","unstructured":"Marcela Zuluaga and Nigel Topham. 2009. Design-space exploration of resource-sharing solutions for custom instruction set extensions. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 28 12 (2009) 1788\u20131801.","DOI":"10.1109\/TCAD.2009.2026355"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730421","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:59:16Z","timestamp":1755867556000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3730421"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":65,"alternative-id":["10.1145\/3721145.3730421","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3730421","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}