{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T11:44:11Z","timestamp":1766231051209,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3750720.3757284","type":"proceedings-article","created":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T11:42:38Z","timestamp":1766230958000},"page":"33-43","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Hardware-aware Graph Partition for Mobile Inference Acceleration"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0471-0216","authenticated-orcid":false,"given":"Meng-Shiun","family":"Yu","sequence":"first","affiliation":[{"name":"National Tsing Hua University, Taiwan, HsinChu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0532-7009","authenticated-orcid":false,"given":"Feng-Yi","family":"Zhan","sequence":"additional","affiliation":[{"name":"National Tsing Hua University, Taiwan, HsinChu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9643-552X","authenticated-orcid":false,"given":"Ming-Zhang","family":"Huang","sequence":"additional","affiliation":[{"name":"National Tsing Hua University, Taiwan, HsinChu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3347-8161","authenticated-orcid":false,"given":"Tai-Liang","family":"Chen","sequence":"additional","affiliation":[{"name":"National Tsing Hua University, Taiwan, HsinChu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9919-6258","authenticated-orcid":false,"given":"Jenq-Kuen","family":"Lee","sequence":"additional","affiliation":[{"name":"National Tsing Hua University, Taiwan, HsinChu, Taiwan"}]}],"member":"320","published-online":{"date-parts":[[2025,12,20]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3665314.3670841"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","unstructured":"G. Alaejos A. Castell\u00f3 P. Alonso-Jord\u00e1 F.\u00a0D. Igual H. Mart\u00ednez and E.\u00a0S. Quintana-Ort\u00ed. 2024. Algorithm 1039: Automatic Generators for a Family of Matrix Multiplication Routines with Apache TVM. ACM Trans. Math. Software 50 1 Article 6 (March 2024) 34\u00a0pages. 10.1145\/3638532","DOI":"10.1145\/3638532"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3089801.3089805"},{"key":"e_1_3_3_2_5_2","unstructured":"Android Developers. 2024. Android Neural Networks API (NNAPI). https:\/\/developer.android.com\/ndk\/guides\/neuralnetworks."},{"key":"e_1_3_3_2_6_2","unstructured":"Apple. 2024. Core ML Documentation. https:\/\/developer.apple.com\/documentation\/coreml\/."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","unstructured":"M. Canesche V. Ros\u00e1rio E. Borin and F.\u00a0Q. Pereira. 2024. The Droplet Search Algorithm for Kernel Scheduling. ACM Transactions on Architecture and Code Optimization 21 2 Article 35 (May 2024) 28\u00a0pages. 10.1145\/3650109","DOI":"10.1145\/3650109"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3529538.3530001"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","unstructured":"Y.-M. Chang C.-Y. Sung Y.-C. Sheu M.-S. Yu M.-Y. Hsu and J.-K. Lee. 2021. Support NNEF execution model for NNAPI. Journal of Supercomputing 77 9 (Sept. 2021) 10065\u201310096. 10.1007\/s11227-021-03625-7","DOI":"10.1007\/s11227-021-03625-7"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","unstructured":"T.-L. Chen Y.-R. Chen M.-S. Yu and J.-K. Lee. 2021. NNBlocks: a Blockly framework for AI computing. Journal of Supercomputing 77 8 (Aug. 2021) 8622\u20138652. 10.1007\/s11227-021-03631-9","DOI":"10.1007\/s11227-021-03631-9"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3547276.3548514"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","unstructured":"X. Gao. 2023. TAS: A Temperature-Aware Scheduling for Heterogeneous Computing. IEEE Access 11 (June 2023) 54773\u201354781. 10.1109\/ACCESS.2023.3281839","DOI":"10.1109\/ACCESS.2023.3281839"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3452411.3464446"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","unstructured":"K. He X. Zhang S. Ren and J. Sun. 2015. Deep Residual Learning for Image Recognition. (2015). 10.48550\/arXiv.1512.03385 arxiv:https:\/\/arXiv.org\/abs\/1512.03385\u00a0[cs.CV]","DOI":"10.48550\/arXiv.1512.03385"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2017.101"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651383"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","unstructured":"H.-R. Huang D.-Y. Hong J.-J. Wu K.-F. Chen P. Liu and W.-C. Hsu. 2022. Accelerating Video Captioning on Heterogeneous System Architectures. ACM Transactions on Architecture and Code Optimization 19 3 Article 38 (May 2022) 25\u00a0pages. 10.1145\/3527609","DOI":"10.1145\/3527609"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378494"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","unstructured":"F.\u00a0N. Iandola S. Han M.\u00a0W. Moskewicz K. Ashraf W.\u00a0J. Dally and K. Keutzer. 2016. SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size. (2016). 10.48550\/arXiv.1602.07360 arxiv:https:\/\/arXiv.org\/abs\/1602.07360\u00a0[cs.CV]","DOI":"10.48550\/arXiv.1602.07360"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3368756.3369091"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","unstructured":"E. Jeong J. Kim and S. Ha. 2022. TensorRT-Based Framework and Optimization Methodology for Deep Learning Inference on Jetson Boards. ACM Transactions on Embedded Computing Systems 21 5 Article 51 (Oct. 2022) 26\u00a0pages. 10.1145\/3508391","DOI":"10.1145\/3508391"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","unstructured":"D. Kang J. Oh J. Choi Y. Yi and S. Ha. 2020. Scheduling of Deep Learning Applications Onto Heterogeneous Processors in an Embedded Device. IEEE Access 8 (March 2020) 43980\u201343991. 10.1109\/ACCESS.2020.2977496","DOI":"10.1109\/ACCESS.2020.2977496"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","unstructured":"A. Krizhevsky I. Sutskever and G.\u00a0E. Hinton. 2017. ImageNet classification with deep convolutional neural networks. Commun. ACM 60 6 (May 2017) 84\u201390. 10.1145\/3065386","DOI":"10.1145\/3065386"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3409390.3409393"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","unstructured":"Y. Lecun L. Bottou Y. Bengio and P. Haffner. 1998. Gradient-based learning applied to document recognition. Proc. IEEE 86 11 (Nov 1998) 2278\u20132324. 10.1109\/5.726791","DOI":"10.1109\/5.726791"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3650200.3656626"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS54860.2022.00051"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","unstructured":"Y. Liao Y. Xu H. Xu Z. Yao L. Wang and C. Qiao. 2023. Accelerating Federated Learning With Data and Model Parallelism in Edge Computing. IEEE\/ACM Transactions on Networking 32 1 (Aug. 2023) 904\u2013918. 10.1109\/TNET.2023.3299851","DOI":"10.1109\/TNET.2023.3299851"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCAI57533.2023.10201248"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3569966.3571191"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575707"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","unstructured":"R. Liu Y. Wu K. Zhao Z. Zhou X. Gao X. Lin X. Zhang X. Chen and G. Lu. 2024. Online Resource Provisioning and Batch Scheduling for AIoT Inference Serving in an XPU Edge Cloud. IEEE Transactions on Emerging Topics in Computing (May 2024) 1\u201316. 10.1109\/TETC.2024.3403874","DOI":"10.1109\/TETC.2024.3403874"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695955"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.5555\/3639940.3639991"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25063-735"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.23919\/ICMU48249.2019.9006661"},{"key":"e_1_3_3_2_37_2","unstructured":"MediaTek. 2024. MediaTek NeuroPilot AI Platform. https:\/\/neuropilot.mediatek.com\/."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","unstructured":"M. Motamedi D. Fong and S. Ghiasi. 2017. Machine Intelligence on Resource-Constrained IoT Devices: The Case of Thread Granularity Optimization for CNN Inference. ACM Transactions on Embedded Computing Systems 16 5s Article 151 (Sept. 2017) 19\u00a0pages. 10.1145\/3126555","DOI":"10.1145\/3126555"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3583120.3587045"},{"key":"e_1_3_3_2_40_2","unstructured":"Qualcomm. 2024. Qualcomm Neural Processing SDK for AI. https:\/\/www.qualcomm.com\/developer\/software\/neural-processing-sdk-for-ai."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3341162.3344849"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3583740.3628437"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","unstructured":"K. Simonyan and A. Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. (2015). 10.48550\/arXiv.1409.1556 arxiv:https:\/\/arXiv.org\/abs\/1409.1556\u00a0[cs.CV]","DOI":"10.48550\/arXiv.1409.1556"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","unstructured":"O. Spantidi G. Zervakis S. Alsalamin I. Roman-Ballesteros J. Henkel H. Amrouch and I. Anagnostopoulos. 2023. Targeting DNN Inference Via Efficient Utilization of Heterogeneous Precision DNN Accelerators. IEEE Transactions on Emerging Topics in Computing 11 1 (Jan 2023) 112\u2013125. 10.1109\/TETC.2022.3178730","DOI":"10.1109\/TETC.2022.3178730"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","unstructured":"O. Ullah H.\u00a0U. Khan Z. Halim S. Anwar and M. Waqas. 2023. On Neuroevolution of Multi-Input Compositional Pattern Producing Networks: A Case of Entertainment Computing Edge Devices and Smart Cities. ACM Transactions on Sensor Networks (Oct. 2023) 1\u201314. 10.1145\/3628430","DOI":"10.1145\/3628430"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","unstructured":"Q. Wang W. Fang L. Qian Y. Chen and N.\u00a0N. Xiong. 2024. An Intelligent Co-Scheduling Framework for Efficient Super-Resolution on Edge Platforms With Heterogeneous Processors. IEEE Internet of Things Journal 11 10 (May 2024) 17651\u201317662. 10.1109\/JIOT.2024.3357898","DOI":"10.1109\/JIOT.2024.3357898"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3597926.3598105"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530490"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3626079"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Chun-Chieh Yang Yi-Ru Chen Hui-Hsin Liao Yuan-Ming Chang and Jenq-Kuen Lee. 2023. Auto-tuning fixed-point precision with TVM on RISC-V packed SIMD extension. ACM Transactions on Design Automation of Electronic Systems 28 3 (2023) 1\u201321.","DOI":"10.1145\/3569939"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","unstructured":"M.-S. Yu C.-Y. Yuan T.-L. Chen and J.-K. Lee. 2024. Case Study: Optimization Methods With TVM Hybrid-OP on RISC-V Packed SIMD. IEEE Access 12 (May 2024) 64193\u201364211. 10.1109\/ACCESS.2024.3397195","DOI":"10.1109\/ACCESS.2024.3397195"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447993.3448628"},{"key":"e_1_3_3_2_53_2","first-page":"559","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI \u201922)","author":"Zheng L.","year":"2022","unstructured":"L. Zheng, Z. Li, H. Zhang, Y. Zhuang, Z. Chen, Y. Huang, Y. Wang, Y. Xu, D. Zhuo, E.\u00a0P. Xing, J.\u00a0E. Gonzalez, and I. Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI \u201922). USENIX Association, Carlsbad, CA, USA, 559\u2013578. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zheng-lianmin"}],"event":{"name":"ICPP Workshops '25: The 54th International Conference on Parallel Processing Workshops","location":"San Diego CA USA","acronym":"ICPP Workshops '25"},"container-title":["Workshop Proceedings of the 54th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3750720.3757284","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T11:43:07Z","timestamp":1766230987000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3750720.3757284"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":52,"alternative-id":["10.1145\/3750720.3757284","10.1145\/3750720"],"URL":"https:\/\/doi.org\/10.1145\/3750720.3757284","relation":{},"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"2025-12-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}