{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,15]],"date-time":"2025-08-15T01:06:26Z","timestamp":1755219986904,"version":"3.43.0"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T00:00:00Z","timestamp":1754524800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T00:00:00Z","timestamp":1754524800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Research and Development Center of Transport Industry of New Generation of Artificial Intelligence Technology","award":["202207H"],"award-info":[{"award-number":["202207H"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"DOI":"10.1007\/s11227-025-07706-9","type":"journal-article","created":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T17:14:01Z","timestamp":1754586841000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Batched data layout optimization for Im2col-based convolutions on CPUs"],"prefix":"10.1007","volume":"81","author":[{"given":"Hongzhi","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Xun","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Ruiyang","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Chao","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Yangyang","family":"He","sequence":"additional","affiliation":[]},{"given":"Deyang","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jinxiang","family":"Xie","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,7]]},"reference":[{"key":"7706_CR1","doi-asserted-by":"publisher","unstructured":"Liu W, Anguelov D, Erhan D et\u00a0al (2016) Ssd: single shot multibox detector. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14. Springer, pp 21\u201337. https:\/\/doi.org\/10.1007\/978-3-319-46448-0_2","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"7706_CR2","unstructured":"Jocher G, Qiu J, Chaurasia A (2023) Ultralytics yolo. https:\/\/github.com\/ultralytics\/ultralytics"},{"issue":"11","key":"7706_CR3","doi-asserted-by":"publisher","first-page":"6170","DOI":"10.1109\/JSEN.2020.2973049","volume":"20","author":"G Chen","year":"2020","unstructured":"Chen G, Hong L, Dong J et al (2020) Eddd: event-based drowsiness driving detection through facial motion analysis with neuromorphic vision sensor. IEEE Sens J 20(11):6170\u20136181. https:\/\/doi.org\/10.1109\/JSEN.2020.2973049","journal-title":"IEEE Sens J"},{"key":"7706_CR4","doi-asserted-by":"publisher","unstructured":"Qi D, Tan W, Yao Q et al (2022) Yolo5face: why reinventing a face detector. In: European Conference on Computer Vision. Springer, pp 228\u2013244. https:\/\/doi.org\/10.48550\/arXiv.2105.12931","DOI":"10.48550\/arXiv.2105.12931"},{"issue":"4","key":"7706_CR5","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1109\/MSP.2020.2985815","volume":"37","author":"G Chen","year":"2020","unstructured":"Chen G, Cao H, Conradt J et al (2020) Event-based neuromorphic vision for autonomous driving: a paradigm shift for bio-inspired visual sensing and perception. IEEE Signal Process Mag 37(4):34\u201349. https:\/\/doi.org\/10.1109\/MSP.2020.2985815","journal-title":"IEEE Signal Process Mag"},{"key":"7706_CR6","doi-asserted-by":"publisher","unstructured":"Liu L, Wang Y, Shi W (2022) Understanding time variations of dnn inference in autonomous driving. arXiv preprint arXiv:2209.05487https:\/\/doi.org\/10.48550\/arXiv.2209.05487","DOI":"10.48550\/arXiv.2209.05487"},{"issue":"2","key":"7706_CR7","doi-asserted-by":"publisher","first-page":"1112","DOI":"10.1109\/TITS.2023.3315070","volume":"25","author":"HY Yatbaz","year":"2023","unstructured":"Yatbaz HY, Dianati M, Woodman R (2023) Introspection of dnn-based perception functions in automated driving systems: state-of-the-art and open research challenges. IEEE Trans Intell Transp Syst 25(2):1112\u20131130. https:\/\/doi.org\/10.1109\/TITS.2023.3315070","journal-title":"IEEE Trans Intell Transp Syst"},{"issue":"1","key":"7706_CR8","doi-asserted-by":"publisher","first-page":"286","DOI":"10.1109\/TWC.2022.3192613","volume":"22","author":"W Shi","year":"2022","unstructured":"Shi W, Zhou S, Niu Z et al (2022) Multiuser co-inference with batch processing capable edge server. IEEE Trans Wirel Commun 22(1):286\u2013300. https:\/\/doi.org\/10.1109\/TWC.2022.3192613","journal-title":"IEEE Trans Wirel Commun"},{"key":"7706_CR9","doi-asserted-by":"publisher","unstructured":"Wei C, Jia H, Zhang Y et\u00a0al (2022) Lbbgemm: A load-balanced batch gemm framework on arm cpus. In: 2022 IEEE 24th International Conference on High Performance Computing & Communications; 8th International Conference on Data Science & Systems; 20th International Conference on Smart City; 8th International Conference on Dependability in Sensor, Cloud & Big Data Systems & Application (HPCC\/DSS\/SmartCity\/DependSys), IEEE, pp 59\u201366. https:\/\/doi.org\/10.1109\/HPCC-DSS-SmartCity-DependSys57074.2022.00042","DOI":"10.1109\/HPCC-DSS-SmartCity-DependSys57074.2022.00042"},{"key":"7706_CR10","doi-asserted-by":"publisher","unstructured":"Zhang Y, Wang Y, Mo Z et\u00a0al (2022) Accelerating small matrix multiplications by adaptive batching strategy on gpu. In: 2022 IEEE 24th International Conference on High Performance Computing & Communications; 8th International Conference on Data Science & Systems; 20th International Conference on Smart City; 8th International Conference on Dependability in Sensor, Cloud & Big Data Systems & Application (HPCC\/DSS\/SmartCity\/DependSys). IEEE, pp 882\u2013887. https:\/\/doi.org\/10.1109\/HPCC-DSS-SmartCity-DependSys57074.2022.00143","DOI":"10.1109\/HPCC-DSS-SmartCity-DependSys57074.2022.00143"},{"key":"7706_CR11","doi-asserted-by":"publisher","unstructured":"Liu Y, Wang Y, Yu R et\u00a0al (2019) Optimizing cnn model inference on cpus. In: 2019 USENIX Annual Technical Conference (USENIX ATC 19), pp 1025\u20131040. https:\/\/doi.org\/10.48550\/arXiv.1809.02697","DOI":"10.48550\/arXiv.1809.02697"},{"issue":"10","key":"7706_CR12","doi-asserted-by":"publisher","first-page":"5095","DOI":"10.1109\/TNNLS.2021.3071762","volume":"33","author":"S Mittal","year":"2021","unstructured":"Mittal S, Rajput P, Subramoney S (2021) A survey of deep learning on cpus: opportunities and co-optimizations. IEEE Trans Neural Netw Learn Syst 33(10):5095\u20135115. https:\/\/doi.org\/10.1109\/TNNLS.2021.3071762","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"7706_CR13","doi-asserted-by":"publisher","unstructured":"Chen J, Xie Z, Liang W et\u00a0al (2024) Quantization-aware optimization approach for cnns inference on cpus. In: 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC). IEEE, pp 878\u2013883. https:\/\/doi.org\/10.1109\/ASP-DAC58780.2024.10473863","DOI":"10.1109\/ASP-DAC58780.2024.10473863"},{"issue":"1","key":"7706_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3570305","volume":"20","author":"I Korostelev","year":"2023","unstructured":"Korostelev I, De Carvalho JPL, Moreira J et al (2023) Yaconv: convolution with low cache footprint. ACM Trans Archit Code Optim 20(1):1\u201318. https:\/\/doi.org\/10.1145\/3570305","journal-title":"ACM Trans Archit Code Optim"},{"key":"7706_CR15","doi-asserted-by":"publisher","unstructured":"Chellapilla K, Puri S, Simard P (2006) High performance convolutional neural networks for document processing. In: Tenth International Workshop on Frontiers in Handwriting Recognition. Suvisoft. https:\/\/doi.org\/10.1109\/ICFHR.2006.104","DOI":"10.1109\/ICFHR.2006.104"},{"key":"7706_CR16","doi-asserted-by":"publisher","unstructured":"Paszke A, Gross S, Massa F et\u00a0al (2019) Pytorch: an imperative style, high-performance deep learning library. Adv Neural Inf Process Syst 32. https:\/\/doi.org\/10.48550\/arXiv.1912.01703","DOI":"10.48550\/arXiv.1912.01703"},{"key":"7706_CR17","doi-asserted-by":"publisher","unstructured":"Abadi M, Barham P, Chen J et al (2016) Tensorflow: a system for large-scale machine learning. In: 12th USENIX symposium on operating systems design and implementation (OSDI 16), pp 265\u2013283. https:\/\/doi.org\/10.48550\/arXiv.1605.08695","DOI":"10.48550\/arXiv.1605.08695"},{"key":"7706_CR18","doi-asserted-by":"publisher","unstructured":"Chen T, Li M, Li Y, et\u00a0al (2015) Mxnet: a flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274https:\/\/doi.org\/10.48550\/arXiv.1512.01274","DOI":"10.48550\/arXiv.1512.01274"},{"key":"7706_CR19","doi-asserted-by":"publisher","unstructured":"Cho M, Brand D (2017) Mec: memory-efficient convolution for deep neural network. In: International Conference on Machine Learning, PMLR, pp 815\u2013824. https:\/\/doi.org\/10.48550\/arXiv.1706.06873","DOI":"10.48550\/arXiv.1706.06873"},{"key":"7706_CR20","doi-asserted-by":"publisher","unstructured":"Ofir A, Ben-Artzi G (2022) Smm-conv: scalar matrix multiplication with zero packing for accelerated convolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 3067\u20133075. https:\/\/doi.org\/10.1109\/CVPRW56347.2022.00346","DOI":"10.1109\/CVPRW56347.2022.00346"},{"issue":"1","key":"7706_CR21","doi-asserted-by":"publisher","first-page":"433","DOI":"10.1109\/TNNLS.2021.3095276","volume":"34","author":"T Zhao","year":"2021","unstructured":"Zhao T, Hu Q, He X et al (2021) Ecbc: efficient convolution via blocked columnizing. IEEE Trans Neural Netw Learn Syst 34(1):433\u2013445. https:\/\/doi.org\/10.1109\/TNNLS.2021.3095276","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"issue":"15","key":"7706_CR22","doi-asserted-by":"publisher","first-page":"26203","DOI":"10.1109\/JIOT.2024.3395335","volume":"11","author":"A Maci\u00e1-Lillo","year":"2024","unstructured":"Maci\u00e1-Lillo A, Barrachina S, Fabregat G et al (2024) Optimizing convolutions for deep learning inference on arm cortex-m processors. IEEE Internet Things J 11(15):26203\u201326219. https:\/\/doi.org\/10.1109\/JIOT.2024.3395335","journal-title":"IEEE Internet Things J"},{"key":"7706_CR23","doi-asserted-by":"publisher","unstructured":"Niu Q, Dinan J, Lu Q et\u00a0al (2012) Parda: a fast parallel reuse distance analysis algorithm. In: 2012 IEEE 26th International Parallel and Distributed Processing Symposium. IEEE, pp 1284\u20131294. https:\/\/doi.org\/10.1109\/IPDPS.2012.117","DOI":"10.1109\/IPDPS.2012.117"},{"key":"7706_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2022.102806","volume":"135","author":"S Barrachina","year":"2023","unstructured":"Barrachina S, Castell\u00f3 A, Dolz MF et al (2023) Reformulating the direct convolution for high-performance deep learning inference on arm processors. J Syst Architect 135:102806. https:\/\/doi.org\/10.1016\/j.sysarc.2022.102806","journal-title":"J Syst Architect"},{"key":"7706_CR25","doi-asserted-by":"publisher","unstructured":"Fu X, Zhang X, Ma J, et\u00a0al (2024) High performance im2win and direct convolutions using three tensor layouts on simd architectures. arXiv preprint arXiv:2408.00278https:\/\/doi.org\/10.48550\/arXiv.2408.00278","DOI":"10.48550\/arXiv.2408.00278"},{"key":"7706_CR26","unstructured":"Eigen Library (2024) Eigen: A c++ template library for linear algebra. Website, https:\/\/eigen.tuxfamily.org"},{"key":"7706_CR27","unstructured":"Zhang X, Martin K, Werner S et\u00a0al (2024) Openblas: an optimized blas library. Website, http:\/\/www.openblas.net\/"},{"key":"7706_CR28","doi-asserted-by":"publisher","unstructured":"Li B, Xue Q, Yuan G et\u00a0al (2022) Optimizing data layout for training deep neural networks. In: Companion Proceedings of the Web Conference, 2022m pp 548\u2013554. https:\/\/doi.org\/10.1145\/3487553.3524856","DOI":"10.1145\/3487553.3524856"},{"key":"7706_CR29","doi-asserted-by":"publisher","unstructured":"Chetlur S, Woolley C, Vandermersch P et al (2014) cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759https:\/\/doi.org\/10.48550\/arXiv.1410.0759","DOI":"10.48550\/arXiv.1410.0759"},{"key":"7706_CR30","doi-asserted-by":"publisher","unstructured":"Haidar A, Dong T, Luszczek P et al (2015) Optimization for performance and energy for batched matrix computations on gpus. In: Proceedings of the 8th Workshop on General Purpose Processing Using GPUs, pp 59\u201369. https:\/\/doi.org\/10.1145\/2716282.2716288","DOI":"10.1145\/2716282.2716288"},{"key":"7706_CR31","doi-asserted-by":"publisher","unstructured":"Zhou Y, Yang M, Guo C et al (2021) Characterizing and demystifying the implicit convolution algorithm on commercial matrix-multiplication accelerators. In: 2021 IEEE International Symposium on Workload Characterization (IISWC), IEEE, pp 214\u2013225. https:\/\/doi.org\/10.1109\/IISWC53511.2021.00029","DOI":"10.1109\/IISWC53511.2021.00029"},{"key":"7706_CR32","doi-asserted-by":"publisher","unstructured":"Kim J, Na S, Lee S et al (2023) Improving data reuse in npu on-chip memory with interleaved gradient order for dnn training. In: Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture, pp 438\u2013451. https:\/\/doi.org\/10.1145\/3613424.3614299","DOI":"10.1145\/3613424.3614299"},{"key":"7706_CR33","doi-asserted-by":"publisher","unstructured":"Kouris A, Venieris SI, Laskaridis S et al (2022) Fluid batching: Exit-aware preemptive serving of early-exit neural networks on edge npus. arXiv preprint arXiv:2209.13443https:\/\/doi.org\/10.48550\/arXiv.2209.13443","DOI":"10.48550\/arXiv.2209.13443"},{"key":"7706_CR34","doi-asserted-by":"publisher","unstructured":"Yan D, Wang W, Chu X (2020) Optimizing batched winograd convolution on gpus. In: Proceedings of the 25th ACM SIGPLAN symposium on principles and practice of parallel programming, pp 32\u201344. https:\/\/doi.org\/10.1145\/3332466.3374520","DOI":"10.1145\/3332466.3374520"},{"key":"7706_CR35","unstructured":"Intel Corporation (2024) Intel\u00ae math kernel library (intel\u00ae mkl). Website, https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/onemkl.html"},{"key":"7706_CR36","doi-asserted-by":"publisher","unstructured":"Wei C, Jia H, Zhang Y et al (2022) Iatf: An input-aware tuning framework for compact blas based on armv8 cpus. In: Proceedings of the 51st International Conference on Parallel Processing, pp 1\u201311. https:\/\/doi.org\/10.1145\/3545008.354503","DOI":"10.1145\/3545008.354503"},{"key":"7706_CR37","unstructured":"PyTorch Team (2024) Libtorch: Pytorch c++ api. Website, https:\/\/pytorch.org\/cppdocs\/"},{"key":"7706_CR38","unstructured":"Intel Corporation (2023) Intel\u00ae 64 and ia-32 architectures optimization reference manual. Website, https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/intel-sdm.html"},{"key":"7706_CR39","volume-title":"Computer architecture: a quantitative approach","author":"JL Hennessy","year":"2017","unstructured":"Hennessy JL, Patterson DA (2017) Computer architecture: a quantitative approach, 6th edn. Morgan Kaufmann, San Francisco","edition":"6"},{"key":"7706_CR40","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S et al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"7706_CR41","doi-asserted-by":"publisher","unstructured":"Sandler M, Howard A, Zhu M et al (2018) Mobilenetv2: Inverted residuals and linear bottlenecks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 4510\u20134520. https:\/\/doi.org\/10.1109\/CVPR.2018.00474","DOI":"10.1109\/CVPR.2018.00474"},{"issue":"5","key":"7706_CR42","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1109\/MM.2021.3085578","volume":"41","author":"J Xia","year":"2021","unstructured":"Xia J, Cheng C, Zhou X et al (2021) Kunpeng 920: the first 7-nm chiplet-based 64-core arm soc for cloud services. IEEE Micro 41(5):67\u201375. https:\/\/doi.org\/10.1109\/MM.2021.3085578","journal-title":"IEEE Micro"},{"key":"7706_CR43","doi-asserted-by":"publisher","unstructured":"Szegedy C, Vanhoucke V, Ioffe S et al (2016) Rethinking the inception architecture for computer vision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 2818\u20132826. https:\/\/doi.org\/10.1109\/CVPR.2016.308","DOI":"10.1109\/CVPR.2016.308"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07706-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-025-07706-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07706-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T17:14:03Z","timestamp":1754586843000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-025-07706-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,7]]},"references-count":43,"journal-issue":{"issue":"12","published-online":{"date-parts":[[2025,8]]}},"alternative-id":["7706"],"URL":"https:\/\/doi.org\/10.1007\/s11227-025-07706-9","relation":{},"ISSN":["1573-0484"],"issn-type":[{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2025,8,7]]},"assertion":[{"value":"28 July 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 August 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"The authors declare no competing interests.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"1219"}}