{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T14:51:31Z","timestamp":1742914291057,"version":"3.40.3"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031299261"},{"type":"electronic","value":"9783031299278"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-29927-8_34","type":"book-chapter","created":{"date-parts":[[2023,4,7]],"date-time":"2023-04-07T12:02:50Z","timestamp":1680868970000},"page":"441-452","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Optimizing Depthwise Convolutions on\u00a0ARMv8 Architecture"],"prefix":"10.1007","author":[{"given":"Ruochen","family":"Hao","sequence":"first","affiliation":[]},{"given":"Qinglin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shangfei","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Tianyang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Qingyang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Songzhu","family":"Mei","sequence":"additional","affiliation":[]},{"given":"Siqi","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,4,8]]},"reference":[{"key":"34_CR1","unstructured":"ARM: Compute library (2021). https:\/\/github.com\/ARM-software\/ComputeLibrary. Accessed 3 Sept 2021"},{"key":"34_CR2","unstructured":"Chen, T., et al.: MXNet: a flexible and efficient machine learning library for heterogeneous distributed systems (2015)"},{"issue":"23","key":"34_CR3","doi-asserted-by":"publisher","first-page":"e4800","DOI":"10.1002\/cpe.4800","volume":"30","author":"X Chen","year":"2018","unstructured":"Chen, X., Xie, P., Chi, L., Liu, J., Gong, C.: An efficient SIMD compression format for sparse matrix-vector multiplication. Concurr. Comput.: Pract. Exp. 30(23), e4800 (2018)","journal-title":"Concurr. Comput.: Pract. Exp."},{"key":"34_CR4","doi-asserted-by":"publisher","unstructured":"Harris, M.: Mapping computational concepts to GPUs. In: ACM SIGGRAPH 2005 Courses, SIGGRAPH 2005, p. 50-es. Association for Computing Machinery, New York (2005). https:\/\/doi.org\/10.1145\/1198555.1198768","DOI":"10.1145\/1198555.1198768"},{"key":"34_CR5","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Delving deep into rectifiers: surpassing human-level performance on ImageNet classification. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 1026\u20131034 (2015)","DOI":"10.1109\/ICCV.2015.123"},{"key":"34_CR6","unstructured":"Howard, A.G., et al.: MobileNets: efficient convolutional neural networks for mobile vision applications. CoRR (2017)"},{"key":"34_CR7","doi-asserted-by":"publisher","unstructured":"Huang, X., Wang, Q., Lu, S., Hao, R., Mei, S., Liu, J.: Evaluating FFT-based algorithms for strided convolutions on ARMv8 architectures. Perform. Eval. 102248 (2021). https:\/\/doi.org\/10.1016\/j.peva.2021.102248","DOI":"10.1016\/j.peva.2021.102248"},{"key":"34_CR8","doi-asserted-by":"publisher","unstructured":"Jia, Y., et al.: Caffe: convolutional architecture for fast feature embedding. In: Proceedings of the 22nd ACM International Conference on Multimedia, MM 2014, pp. 675\u2013678. Association for Computing Machinery, New York (2014). https:\/\/doi.org\/10.1145\/2647868.2654889","DOI":"10.1145\/2647868.2654889"},{"key":"34_CR9","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1016\/j.neucom.2016.11.046","volume":"230","author":"S Li","year":"2017","unstructured":"Li, S., Dou, Y., Niu, X., Lv, Q., Wang, Q.: A fast and memory saved GPU acceleration algorithm of convolutional neural networks for target detection. Neurocomputing 230, 48\u201359 (2017)","journal-title":"Neurocomputing"},{"key":"34_CR10","unstructured":"Marvell: Thunderx_CP family (2022). https:\/\/www.marvell.com\/server-processors\/thunderx-arm-processors\/thunderx-cp. Accessed 1 Jan 2022"},{"key":"34_CR11","doi-asserted-by":"crossref","unstructured":"Matsuoka, S.: Fugaku and A64FX: the first exascale supercomputer and its innovative arm CPU. In: 2021 Symposium on VLSI Circuits, pp. 1\u20133 (2021). 10.23919\/VLSICircuits52068.2021.9492415","DOI":"10.23919\/VLSICircuits52068.2021.9492415"},{"key":"34_CR12","doi-asserted-by":"publisher","unstructured":"Mittal, S., Rajput, P., Subramoney, S.: A survey of deep learning on CPUs: opportunities and co-optimizations. IEEE Trans. Neural Netw. Learn. Syst. 1\u201321 (2021). https:\/\/doi.org\/10.1109\/TNNLS.2021.3071762","DOI":"10.1109\/TNNLS.2021.3071762"},{"key":"34_CR13","unstructured":"OPEN AI LAB: Tengine (2021). https:\/\/github.com\/OAID\/Tengine. Accessed 3 Sept 2021"},{"key":"34_CR14","unstructured":"Paszke, A., Gross, S., Massa, F., et al.: PyTorch: an imperative style, high-performance deep learning library (2019)"},{"key":"34_CR15","unstructured":"Phytium: FT-1500A\/16 (2022). https:\/\/www.phytium.com.cn\/Product\/detail?language=1 &product_id=9. Accessed 1 Jan 2022"},{"key":"34_CR16","unstructured":"Rajovic, N., et al.: The Mont-Blanc prototype: an alternative approach for high-performance computing systems (2016)"},{"key":"34_CR17","doi-asserted-by":"publisher","unstructured":"Renganarayana, L., Bondhugula, U., Derisavi, S., Eichenberger, A.E., O\u2019Brien, K.: Compact multi-dimensional kernel extraction for register tiling. In: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis, SC 2009, Association for Computing Machinery, New York (2009). https:\/\/doi.org\/10.1145\/1654059.1654105","DOI":"10.1145\/1654059.1654105"},{"key":"34_CR18","doi-asserted-by":"publisher","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., Chen, L.C.: MobileNetv 2: inverted residuals and linear bottlenecks. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4510\u20134520 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00474","DOI":"10.1109\/CVPR.2018.00474"},{"key":"34_CR19","doi-asserted-by":"crossref","unstructured":"Singh, R.K., Gorantla, R.: DMENet: diabetic macular edema diagnosis using hierarchical ensemble of CNNs. PLOS One 15(2), e0220677 (2020)","DOI":"10.1371\/journal.pone.0220677"},{"key":"34_CR20","doi-asserted-by":"publisher","unstructured":"Tan, M., et al.: MnasNet: platform-aware neural architecture search for mobile. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2815\u20132823 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00293","DOI":"10.1109\/CVPR.2019.00293"},{"key":"34_CR21","unstructured":"Tencent: FeatherCNN (2021). https:\/\/github.com\/Tencent\/FeatherCNN. Accessed 3 Sept 2021"},{"key":"34_CR22","unstructured":"Tencent: nCNN (2021). https:\/\/github.com\/Tencent\/ncnn. Accessed 3 Sept 2021"},{"key":"34_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"248","DOI":"10.1007\/978-3-030-57675-2_16","volume-title":"Euro-Par 2020: Parallel Processing","author":"Q Wang","year":"2020","unstructured":"Wang, Q., Li, D., Huang, X., Shen, S., Mei, S., Liu, J.: Optimizing FFT-based convolution on ARMv8 multi-core CPUs. In: Malawski, M., Rzadca, K. (eds.) Euro-Par 2020. LNCS, vol. 12247, pp. 248\u2013262. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-57675-2_16"},{"issue":"6","key":"34_CR24","doi-asserted-by":"publisher","first-page":"1140","DOI":"10.7544\/issn1000-1239.2020.20200107","volume":"57","author":"Q Wang","year":"2020","unstructured":"Wang, Q., Li, D., Mei, S., Lai, Z., Dou, Y.: Optimizing winograd-based fast convolution algorithm on phytium multi-core CPUs (in Chinese). J. Comput. Res. Dev. 57(6), 1140\u20131151 (2020). https:\/\/doi.org\/10.7544\/issn1000-1239.2020.20200107","journal-title":"J. Comput. Res. Dev."},{"key":"34_CR25","doi-asserted-by":"publisher","unstructured":"Wang, Q., Songzhu, M., Liu, J., Gong, C.: Parallel convolution algorithm using implicit matrix multiplication on multi-core CPUs. In: 2019 International Joint Conference on Neural Networks (IJCNN), pp. 1\u20137 (2019). https:\/\/doi.org\/10.1109\/IJCNN.2019.8852012","DOI":"10.1109\/IJCNN.2019.8852012"},{"key":"34_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1007\/978-3-030-18645-6_6","volume-title":"Supercomputing Frontiers","author":"X You","year":"2019","unstructured":"You, X., Yang, H., Luan, Z., Liu, Y., Qian, D.: Performance evaluation and analysis of linear algebra kernels in the prototype Tianhe-3 cluster. In: Abramson, D., de Supinski, B.R. (eds.) SCFA 2019. LNCS, vol. 11416, pp. 86\u2013105. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-18645-6_6"},{"key":"34_CR27","unstructured":"Zhang, J., Franchetti, F., Low, T.M.: High performance zero-memory overhead direct convolutions. In: International Conference on Machine Learning, pp. 5771\u20135780 (2018)"},{"key":"34_CR28","doi-asserted-by":"crossref","unstructured":"Zhang, P., Lo, E., Lu, B.: High performance depthwise and pointwise convolutions on mobile devices. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 6795\u20136802. AAAI Press (2020)","DOI":"10.1609\/aaai.v34i04.6159"}],"container-title":["Lecture Notes in Computer Science","Parallel and Distributed Computing, Applications and Technologies"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-29927-8_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,7]],"date-time":"2023-04-07T12:09:27Z","timestamp":1680869367000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-29927-8_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031299261","9783031299278"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-29927-8_34","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"8 April 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PDCAT","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Parallel and Distributed Computing: Applications and Technologies","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sendai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 December 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 December 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pdcat2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.hpc.is.tohoku.ac.jp\/pdcat2022\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Open","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"95","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"24","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"16","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"25% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}