{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T13:38:41Z","timestamp":1761917921102,"version":"3.37.3"},"reference-count":35,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2021,3,25]],"date-time":"2021-03-25T00:00:00Z","timestamp":1616630400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,3,25]],"date-time":"2021-03-25T00:00:00Z","timestamp":1616630400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/100014718","name":"Innovative Research Group Project of the National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61521092"],"award-info":[{"award-number":["61521092"]}],"id":[{"id":"10.13039\/100014718","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012165","name":"Key Technologies Research and Development Program","doi-asserted-by":"publisher","award":["2017YFB1003103"],"award-info":[{"award-number":["2017YFB1003103"]}],"id":[{"id":"10.13039\/501100012165","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1007\/s10766-021-00701-6","type":"journal-article","created":{"date-parts":[[2021,3,25]],"date-time":"2021-03-25T22:40:18Z","timestamp":1616712018000},"page":"628-645","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Compiler-assisted Operator Template Library for DNN Accelerators"],"prefix":"10.1007","volume":"49","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2924-5189","authenticated-orcid":false,"given":"Jiansong","family":"Li","sequence":"first","affiliation":[]},{"given":"Wei","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Xiao","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Guangli","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xueying","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Lei","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xiaobing","family":"Feng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,3,25]]},"reference":[{"key":"701_CR1","unstructured":"Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man\u00e9, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi\u00e9gas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems (2015). http:\/\/tensorflow.org\/. Software available from tensorflow.org"},{"key":"701_CR2","unstructured":"AnandTech: Cambricon, Makers of Huawei\u2019s Kirin NPU IP. https:\/\/www.anandtech.com\/show\/12815\/cambricon-makers-of-huaweis-kirin-npu-ip-build-a-big-ai-chip-and-pcie-card (2018)"},{"key":"701_CR3","doi-asserted-by":"publisher","unstructured":"Chen, T., Du, Z., Sun, N., Wang, J., Wu, C., Chen, Y., Temam, O.: Diannao: A small-footprint high-throughput accelerator for ubiquitous machine-learning. In: Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS \u201914, pp. 269\u2013284. ACM, New York, NY, USA (2014). https:\/\/doi.org\/10.1145\/2541940.2541967","DOI":"10.1145\/2541940.2541967"},{"key":"701_CR4","volume-title":"CUDA Programming: A Developer\u2019s Guide to Parallel Computing with GPUs","author":"S Cook","year":"2012","unstructured":"Cook, S.: CUDA Programming: A Developer\u2019s Guide to Parallel Computing with GPUs, 1st edn. Morgan Kaufmann Publishers Inc., San Francisco, CA, USA (2012)","edition":"1"},{"issue":"1","key":"701_CR5","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1109\/TIT.1967.1053964","volume":"13","author":"T Cover","year":"2006","unstructured":"Cover, T., Hart, P.: Nearest Neighbor Pattern Classification. IEEE Trans. Inf. Theor. 13(1), 21\u201327 (2006)","journal-title":"IEEE Trans. Inf. Theor."},{"key":"701_CR6","unstructured":"Culberson, J.C.: Iterated Greedy Graph Coloring and the Difficulty Landscape. Tech. rep. (1992)"},{"key":"701_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., jia Li, L., Li, K., Fei-fei, L.: Imagenet: A large-scale hierarchical image database. In: In CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"701_CR8","unstructured":"DMLC teams: mshadow. https:\/\/github.com\/dmlc\/mshadow (2018)"},{"key":"701_CR9","unstructured":"Guennebaud, G., Jacob, B., et\u00a0al.: Eigen v3. http:\/\/eigen.tuxfamily.org (2010)"},{"key":"701_CR10","unstructured":"He, K., et\u00a0al.: Deep residual learning for image recognition. CoRR abs\/1512.03385 (2015)"},{"issue":"4","key":"701_CR11","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1109\/5254.708428","volume":"13","author":"MA Hearst","year":"1998","unstructured":"Hearst, M.A.: Support Vector Machines. IEEE Intelligent Systems 13(4), 18\u201328 (1998)","journal-title":"IEEE Intelligent Systems"},{"issue":"5","key":"701_CR12","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1016\/0893-6080(89)90020-8","volume":"2","author":"K Hornik","year":"1989","unstructured":"Hornik, K., Stinchcombe, M., White, H.: Multilayer feedforward networks are universal approximators. Neural Netw. 2(5), 359\u2013366 (1989)","journal-title":"Neural Netw."},{"key":"701_CR13","unstructured":"Howard, A.G., et\u00a0al.: MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications. CoRR abs\/1704.04861 (2017)"},{"key":"701_CR14","doi-asserted-by":"publisher","unstructured":"Huang, G., Liu, Z., Van Der Maaten, L., Weinberger, K.Q.: Densely Connected Convolutional Networks. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2261\u20132269 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.243","DOI":"10.1109\/CVPR.2017.243"},{"key":"701_CR15","unstructured":"Iandola, F.N., et\u00a0al.: SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and $$<$$1MB model size. CoRR abs\/1602.07360 (2016)"},{"key":"701_CR16","doi-asserted-by":"publisher","unstructured":"J.\u00a0Hrdtlein C.\u00a0Pflaum, A.L.C.H.W.: Advanced expression templates programming. In: Computing and Visualization in Science. Springer (2010). https:\/\/doi.org\/10.1007\/s00791-009-0128-2","DOI":"10.1007\/s00791-009-0128-2"},{"key":"701_CR17","doi-asserted-by":"publisher","unstructured":"Jianwen Zhu: Static memory allocation by pointer analysis and coloring. In: Proceedings Design, Automation and Test in Europe. Conference and Exhibition 2001, pp. 785\u2013790 (2001). https:\/\/doi.org\/10.1109\/DATE.2001.915121","DOI":"10.1109\/DATE.2001.915121"},{"key":"701_CR18","doi-asserted-by":"publisher","unstructured":"Jouppi, N.P., Young, C., Patil, N., Patterson, D., et\u00a0al.: In-datacenter performance analysis of a tensor processing unit. ISCA\u201917, p. 1\u201312. Association for Computing Machinery, New York, NY, USA (2017). https:\/\/doi.org\/10.1145\/3079856.3080246","DOI":"10.1145\/3079856.3080246"},{"key":"701_CR19","unstructured":"Krizhevsky, A., et\u00a0al.: ImageNet Classification with Deep Convolutional Neural Networks. NIPS\u201912, pp. 1097\u20131105. Curran Associates Inc., USA (2012)"},{"key":"701_CR20","doi-asserted-by":"publisher","unstructured":"Li, L., Feng, H., Xue, J.: Compiler-directed scratchpad memory management via graph coloring. ACM Trans. Archit. Code Optim. 6(3) (2009). https:\/\/doi.org\/10.1145\/1582710.1582711","DOI":"10.1145\/1582710.1582711"},{"key":"701_CR21","doi-asserted-by":"publisher","unstructured":"Lian Li, Lin Gao, Jingling Xue: Memory coloring: a compiler approach for scratchpad memory management. In: 14th International Conference on Parallel Architectures and Compilation Techniques (PACT\u201905), pp. 329\u2013338 (2005). https:\/\/doi.org\/10.1109\/PACT.2005.27","DOI":"10.1109\/PACT.2005.27"},{"key":"701_CR22","doi-asserted-by":"publisher","unstructured":"Liao, H., Tu, J., Xia, J., Zhou, X.: Davinci: A scalable architecture for neural network computing. In: 2019 IEEE Hot Chips 31 Symposium (HCS), pp. 1\u201344. IEEE Computer Society, Los Alamitos, CA, USA (2019). https:\/\/doi.org\/10.1109\/HOTCHIPS.2019.8875654","DOI":"10.1109\/HOTCHIPS.2019.8875654"},{"key":"701_CR23","doi-asserted-by":"publisher","unstructured":"Liu, S., Du, Z., Tao, J., Han, D., Luo, T., Xie, Y., Chen, Y., Chen, T.: Cambricon: An instruction set architecture for neural networks. In: Proceedings of the 43rd International Symposium on Computer Architecture, ISCA \u201916, p. 393\u2013405. IEEE Press (2016). https:\/\/doi.org\/10.1109\/ISCA.2016.42","DOI":"10.1109\/ISCA.2016.42"},{"key":"701_CR24","doi-asserted-by":"publisher","unstructured":"Moazeni, M., Bui, A., Sarrafzadeh, M.: A memory optimization technique for software-managed scratchpad memory in gpus. In: 2009 IEEE 7th Symposium on Application Specific Processors, pp. 43\u201349 (2009). https:\/\/doi.org\/10.1109\/SASP.2009.5226334","DOI":"10.1109\/SASP.2009.5226334"},{"key":"701_CR25","volume-title":"Advanced Compiler Design and Implementation","author":"SS Muchnick","year":"1998","unstructured":"Muchnick, S.S.: Advanced Compiler Design and Implementation. Morgan Kaufmann Publishers Inc., San Francisco, CA, USA (1998)"},{"key":"701_CR26","volume-title":"OpenCL Programming Guide","author":"A Munshi","year":"2011","unstructured":"Munshi, A., Gaster, B., Mattson, T.G., Fung, J., Ginsburg, D.: OpenCL Programming Guide, 1st edn. Addison-Wesley Professional, Boston (2011)","edition":"1"},{"key":"701_CR27","unstructured":"NVIDIA teams: Cutlass. https:\/\/github.com\/NVIDIA\/cutlass (2017)"},{"key":"701_CR28","doi-asserted-by":"crossref","unstructured":"P.\u00a0Briggs, K.D.C., Torczon, L.: Improvements to graph coloring register allocation. ACM Trans. Program. Lang. Syst. 16(3), 428\u2013455 (1994)","DOI":"10.1145\/177492.177575"},{"key":"701_CR29","doi-asserted-by":"crossref","unstructured":"Progsch, J., Ineichen, Y., Adelmann, A.: A new vectorization technique for expression templates in C++. CoRR abs\/1109.1264 (2011). arXiv:1264","DOI":"10.33697\/ajur.2012.003"},{"key":"701_CR30","unstructured":"Simonyan, K., Zisserman, A.: Very Deep Convolutional Networks for Large-Scale Image Recognition (2014). arXiv:1409.1556"},{"key":"701_CR31","doi-asserted-by":"publisher","unstructured":"Springer, M., Sun, Y., Masuhara, H.: Inner Array Inlining for Structure of Arrays Layout. In: Proceedings of the 5th ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming, ARRAY 2018, p. 50\u201358. Association for Computing Machinery, New York, NY, USA (2018). https:\/\/doi.org\/10.1145\/3219753.3219760","DOI":"10.1145\/3219753.3219760"},{"key":"701_CR32","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et\u00a0al.: Going deeper with convolutions. In: Computer Vision and Pattern Recognition (CVPR) (2015). arXiv:1409.4842","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"701_CR33","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et\u00a0al.: Rethinking the inception architecture for computer vision. CoRR abs\/1512.00567 (2015)","DOI":"10.1109\/CVPR.2016.308"},{"issue":"4","key":"701_CR34","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1145\/1498765.1498785","volume":"52","author":"S Williams","year":"2009","unstructured":"Williams, S., Waterman, A., Patterson, D.: Roofline: An Insightful Visual Performance Model for Multicore Architectures. Commun. ACM 52(4), 65\u201376 (2009). https:\/\/doi.org\/10.1145\/1498765.1498785","journal-title":"Commun. ACM"},{"key":"701_CR35","doi-asserted-by":"publisher","unstructured":"Wu, J., Belevich, A., Bendersky, E., Heffernan, M., Leary, C., Pienaar, J., Roune, B., Springer, R., Weng, X., Hundt, R.: Gpucc: An Open-Source GPGPU Compiler. In: Proceedings of the 2016 International Symposium on Code Generation and Optimization, CGO \u201916, p. 105\u2013116. Association for Computing Machinery, New York, NY, USA (2016). https:\/\/doi.org\/10.1145\/2854038.2854041","DOI":"10.1145\/2854038.2854041"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-021-00701-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10766-021-00701-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-021-00701-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,26]],"date-time":"2024-08-26T21:19:46Z","timestamp":1724707186000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10766-021-00701-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,3,25]]},"references-count":35,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2021,10]]}},"alternative-id":["701"],"URL":"https:\/\/doi.org\/10.1007\/s10766-021-00701-6","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"type":"print","value":"0885-7458"},{"type":"electronic","value":"1573-7640"}],"subject":[],"published":{"date-parts":[[2021,3,25]]},"assertion":[{"value":"3 November 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 March 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 March 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}