{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T04:16:49Z","timestamp":1777954609395,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No.2022YFB4501600"],"award-info":[{"award-number":["No.2022YFB4501600"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,16]]},"DOI":"10.1145\/3694906.3743309","type":"proceedings-article","created":{"date-parts":[[2025,7,16]],"date-time":"2025-07-16T16:19:56Z","timestamp":1752682796000},"page":"487-498","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["JOVS: Joint Optimization of Vectorization and Scheduling for DNN on AI DSPs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7744-4707","authenticated-orcid":false,"given":"Yaochen","family":"Han","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2258-7346","authenticated-orcid":false,"given":"Hongxu","family":"Jiang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3487-5289","authenticated-orcid":false,"given":"Runhua","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5211-1664","authenticated-orcid":false,"given":"Rui","family":"She","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,16]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems. 1004--1016","author":"Safeer Ahmad Maaz Bin","year":"2022","unstructured":"Maaz Bin Safeer Ahmad, Alexander J Root, Andrew Adams, Shoaib Kamil, and Alvin Cheung. 2022. Vector instruction selection for digital signal processors using program synthesis. In Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems. 1004--1016."},{"key":"e_1_3_2_1_2_1","unstructured":"ARM. [n. d.]. Exploring the Arm dot product instructions. https:\/\/community.arm.com\/developer\/tools-software\/tools\/b\/tools-softwareides-blog\/posts\/exploring-the-arm-dot-product-instructions."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582061"},{"key":"e_1_3_2_1_4_1","unstructured":"Cadence. [n. d.]. Vision DSPs. https:\/\/www.cadence.com\/en_US\/home\/tools\/silicon-solutions\/compute-ip\/vision-dsps.html."},{"key":"e_1_3_2_1_5_1","unstructured":"CEVA. [n. d.]. Ceva-NeuPro Studio CDNN. https:\/\/www.ceva-ip.com\/product\/ceva-deep-neural-network-cdnn\/."},{"key":"e_1_3_2_1_6_1","unstructured":"CEVA. [n.d.]. SensPro2. https:\/\/www.ceva-ip.com\/product\/ceva-senspro\/."},{"key":"e_1_3_2_1_7_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. {TVM}: An automated {End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578--594."},{"key":"e_1_3_2_1_8_1","volume-title":"Learning to optimize tensor programs. Advances in Neural Information Processing Systems 31","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Lianmin Zheng, Eddie Yan, Ziheng Jiang, Thierry Moreau, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. Learning to optimize tensor programs. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00050"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446692"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575702"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3576933"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582018"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","unstructured":"Intel. [n. d.]. Intel Advanced Matrix Extensions Overview. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/docs\/accelerator-engines\/advanced-matrix-extensions\/overview.html."},{"key":"e_1_3_2_1_16_1","volume-title":"Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351","author":"Jiao Xiaoqi","year":"2019","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2019. Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351 (2019)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579990.3580015"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640385"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579990.3580025"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00044"},{"key":"e_1_3_2_1_21_1","unstructured":"Nvidia. [n. d.]. NVIDIA Tensor Cores. https:\/\/www.nvidia.com\/en-us\/datacenter\/tensorcore\/."},{"key":"e_1_3_2_1_22_1","unstructured":"Qualcomm. [n. d.]. Hexagon NN Library. https:\/\/developer.qualcomm.com\/software\/hexagon-dsp-sdk."},{"key":"e_1_3_2_1_23_1","unstructured":"Qualcomm. [n. d.]. Snapdragon 865. https:\/\/www.qualcomm.com\/products\/snapdragon-8655g-mobile-platform."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_26_1","volume-title":"International conference on machine learning. PMLR, 6105--6114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning. PMLR, 6105--6114."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3617232.3624873"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446707"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370330"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587440"},{"key":"e_1_3_2_1_31_1","volume-title":"An evaluation of edge tpu accelerators for convolutional neural networks. arXiv preprint arXiv:2102.10423 1, 6","author":"Yazdanbakhsh Amir","year":"2021","unstructured":"Amir Yazdanbakhsh, Kiran Seshadri, Berkin Akin, James Laudon, and Ravi Narayanaswami. 2021. An evaluation of edge tpu accelerators for convolutional neural networks. arXiv preprint arXiv:2102.10423 1, 6 (2021)."},{"key":"e_1_3_2_1_32_1","volume-title":"Automatic End-to-End Joint Optimization for Kernel Compilation on DSPs. In 2023 60th ACM\/IEEE Design Automation Conference (DAC). IEEE, 1--6.","author":"Zhao Xiaolei","year":"2023","unstructured":"Xiaolei Zhao, Zhaoyun Chen, Yang Shi, Mei Wen, and Chunyun Zhang. 2023. Automatic End-to-End Joint Optimization for Kernel Compilation on DSPs. In 2023 60th ACM\/IEEE Design Automation Conference (DAC). IEEE, 1--6."},{"key":"e_1_3_2_1_33_1","volume-title":"14th USENIX symposium on operating systems design and implementation (OSDI 20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, et al. 2020. Ansor: Generating {High-Performance} tensor programs for deep learning. In 14th USENIX symposium on operating systems design and implementation (OSDI 20). 863--879."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071018"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00042"}],"event":{"name":"SPAA '25: 37th ACM Symposium on Parallelism in Algorithms and Architectures","location":"Portland OR USA","acronym":"SPAA '25","sponsor":["SIGACT ACM Special Interest Group on Algorithms and Computation Theory","SIGARCH ACM Special Interest Group on Computer Architecture","EATCS European Association for Theoretical Computer Science"]},"container-title":["Proceedings of the 37th ACM Symposium on Parallelism in Algorithms and Architectures"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3694906.3743309","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T19:20:13Z","timestamp":1777922413000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694906.3743309"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,16]]},"references-count":37,"alternative-id":["10.1145\/3694906.3743309","10.1145\/3694906"],"URL":"https:\/\/doi.org\/10.1145\/3694906.3743309","relation":{},"subject":[],"published":{"date-parts":[[2025,7,16]]},"assertion":[{"value":"2025-07-16","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}