{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T08:56:09Z","timestamp":1778662569695,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T00:00:00Z","timestamp":1676937600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2232120, 2034169, 2042084, 1937403, 1955909, 2018016, 2009007"],"award-info":[{"award-number":["2232120, 2034169, 2042084, 1937403, 1955909, 2018016, 2009007"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,25]]},"DOI":"10.1145\/3572848.3577478","type":"proceedings-article","created":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T16:02:30Z","timestamp":1676995350000},"page":"260-273","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["TDC"],"prefix":"10.1145","author":[{"given":"Lizhi","family":"Xiang","sequence":"first","affiliation":[{"name":"University of Utah, Salt Lake City, UT"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Miao","family":"Yin","sequence":"additional","affiliation":[{"name":"Rutgers University, New Brunswick, NJ"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengming","family":"Zhang","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aravind","family":"Sukumaran-Rajam","sequence":"additional","affiliation":[{"name":"Meta and University of Utah"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"P.","family":"Sadayappan","sequence":"additional","affiliation":[{"name":"University of Utah, Salt Lake City, UT"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Yuan","sequence":"additional","affiliation":[{"name":"Rutgers University, New Brunswick, NJ"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dingwen","family":"Tao","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,2,21]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2951913.2976746"},{"key":"e_1_3_2_1_2_1","volume-title":"Distributed optimization and statistical learning via the alternating direction method of multipliers","author":"Boyd Stephen","year":"2011","unstructured":"Stephen Boyd, Neal Parikh, and Eric Chu. Distributed optimization and statistical learning via the alternating direction method of multipliers. Now Publishers Inc, 2011."},{"key":"e_1_3_2_1_3_1","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. Tvm: An automated end-to-end optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, pages 578--594, 2018."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1063\/5.0013065"},{"key":"e_1_3_2_1_5_1","volume-title":"cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759","author":"Chetlur Sharan","year":"2014","unstructured":"Sharan Chetlur, Cliff Woolley, Philippe Vandermersch, Jonathan Cohen, John Tran, Bryan Catanzaro, and Evan Shelhamer. cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759, 2014."},{"key":"e_1_3_2_1_6_1","volume-title":"Convolution kernels for natural language. Advances in neural information processing systems, 14","author":"Collins Michael","year":"2001","unstructured":"Michael Collins and Nigel Duffy. Convolution kernels for natural language. Advances in neural information processing systems, 14, 2001."},{"key":"e_1_3_2_1_7_1","unstructured":"cuDNN v2: Higher Performance for Deep Learning on GPUs. https:\/\/developer.nvidia.com\/blog\/cudnn-v2-higher-performance-deep-learning-gpus\/."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218499"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476157"},{"key":"e_1_3_2_1_11_1","volume-title":"Ultimate tensorization: compressing convolutional and fc layers alike. arXiv preprint arXiv:1611.03214","author":"Garipov Timur","year":"2016","unstructured":"Timur Garipov, Dmitry Podoprikhin, Alexander Novikov, and Dmitry Vetrov. Ultimate tensorization: compressing convolutional and fc layers alike. arXiv preprint arXiv:1611.03214, 2016."},{"key":"e_1_3_2_1_12_1","unstructured":"Groq. The Challenge of Batch Size 1: Groq Adds Responsiveness to Inference Performance. https:\/\/groq.com\/wp-content\/uploads\/2020\/04\/GROQP002_groq_whitepaper_V1-DB-1.pdf."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00306"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401063"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00447"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307681.3326608"},{"key":"e_1_3_2_1_19_1","volume-title":"International Conference on Learning Representations","author":"Kim Yong-Deok","year":"2016","unstructured":"Yong-Deok Kim, Eunhyeok Park, Sungjoo Yoo, Taelim Choi, Lu Yang, and Dongjun Shin. Compression of deep convolutional neural networks for fast and low power mobile applications. In International Conference on Learning Representations, 2016."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00054"},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Learning Representations","author":"Lebedev Vadim","year":"2015","unstructured":"Vadim Lebedev, Yaroslav Ganin, Maksim Rakhuba, Ivan Oseledets, and Victor Lempitsky. Speeding-up convolutional neural networks using fine-tuned cp-decomposition. In International Conference on Learning Representations, 2015."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00637"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW52791.2021.00115"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00160"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5954"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00068"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_35"},{"key":"e_1_3_2_1_28_1","first-page":"442","article-title":"Tensorizing neural networks","volume":"28","author":"Novikov Alexander","year":"2015","unstructured":"Alexander Novikov, Dmitrii Podoprikhin, Anton Osokin, and Dmitry P Vetrov. Tensorizing neural networks. Advances in Neural Information Processing Systems, 28:442--450, 2015.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","unstructured":"Nvidia. NVIDIA Deep Learning cuDNN Documentation. https:\/\/docs.nvidia.com\/deeplearning\/cudnn\/api\/index.html."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1137\/090752286"},{"key":"e_1_3_2_1_31_1","volume-title":"Hai Li, Yiran Chen, and Pradeep Dubey. Faster cnns with direct sparse convolutions and guided pruning. arXiv preprint arXiv:1608.01409","author":"Park Jongsoo","year":"2016","unstructured":"Jongsoo Park, Sheng Li, Wei Wen, Ping Tak Peter Tang, Hai Li, Yiran Chen, and Pradeep Dubey. Faster cnns with direct sparse convolutions and guided pruning. arXiv preprint arXiv:1608.01409, 2016."},{"key":"e_1_3_2_1_32_1","volume-title":"et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58526-6_31"},{"key":"e_1_3_2_1_34_1","first-page":"3743","volume-title":"Interspeech","author":"Povey Daniel","year":"2018","unstructured":"Daniel Povey, Gaofeng Cheng, Yiming Wang, Ke Li, Hainan Xu, Mahsa Yarmohammadi, and Sanjeev Khudanpur. Semi-orthogonal low-rank matrix factorization for deep neural networks. In Interspeech, pages 3743--3747, 2018."},{"key":"e_1_3_2_1_35_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556, 2014."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_37_1","first-page":"10936","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Tang Yehui","year":"2020","unstructured":"Yehui Tang, Yunhe Wang, Yixing Xu, Dacheng Tao, Chunjing XU, Chao Xu, and Chang Xu. Scop: Scientific control for reliable neural network pruning. In Advances in Neural Information Processing Systems, volume 33, pages 10936--10947, 2020."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF02289464"},{"key":"e_1_3_2_1_39_1","first-page":"9329","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Wang Wenqi","year":"2018","unstructured":"Wenqi Wang, Yifan Sun, Brian Eriksson, Wenlin Wang, and Vaneet Aggarwal. Wide compression: Tensor ring nets. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 9329--9338, 2018."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/136"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01191"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01053"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447818.3459988"},{"key":"e_1_3_2_1_44_1","volume-title":"Tensor ring decomposition. arXiv preprint arXiv:1606.05535","author":"Zhao Qibin","year":"2016","unstructured":"Qibin Zhao, Guoxu Zhou, Shengli Xie, Liqing Zhang, and Andrzej Cichocki. Tensor ring decomposition. arXiv preprint arXiv:1606.05535, 2016."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00907"}],"event":{"name":"PPoPP '23: The 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Montreal QC Canada","acronym":"PPoPP '23","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3572848.3577478","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3572848.3577478","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3572848.3577478","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:08:09Z","timestamp":1750183689000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3572848.3577478"}},"subtitle":["Towards Extremely Efficient CNNs on GPUs via Hardware-Aware Tucker Decomposition"],"short-title":[],"issued":{"date-parts":[[2023,2,21]]},"references-count":45,"alternative-id":["10.1145\/3572848.3577478","10.1145\/3572848"],"URL":"https:\/\/doi.org\/10.1145\/3572848.3577478","relation":{},"subject":[],"published":{"date-parts":[[2023,2,21]]},"assertion":[{"value":"2023-02-21","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}