{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T08:29:22Z","timestamp":1777105762977,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,3,9]],"date-time":"2020-03-09T00:00:00Z","timestamp":1583712000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Plan","award":["No. 2017YFC0803700"],"award-info":[{"award-number":["No. 2017YFC0803700"]}]},{"name":"National Natural Science Foundation of China","award":["No. 61772218"],"award-info":[{"award-number":["No. 61772218"]}]},{"name":"National Natural Science Foundation of China","award":["No. 61832006"],"award-info":[{"award-number":["No. 61832006"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,3,9]]},"DOI":"10.1145\/3373376.3378505","type":"proceedings-article","created":{"date-parts":[[2020,3,13]],"date-time":"2020-03-13T22:37:01Z","timestamp":1584139021000},"page":"891-905","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":126,"title":["Capuchin"],"prefix":"10.1145","author":[{"given":"Xuan","family":"Peng","sequence":"first","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"given":"Xuanhua","family":"Shi","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"given":"Hulin","family":"Dai","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"given":"Hai","family":"Jin","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"given":"Weiliang","family":"Ma","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"given":"Qian","family":"Xiong","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"given":"Fan","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"given":"Xuehai","family":"Qian","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2020,3,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Gradient-checkpointing. https:\/\/github.com\/cybertronai\/gradient-checkpointing.  Gradient-checkpointing. https:\/\/github.com\/cybertronai\/gradient-checkpointing."},{"key":"e_1_3_2_1_2_1","first-page":"265","volume-title":"Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","volume":"16","author":"Abadi M.","year":"2016","unstructured":"Abadi , M. , Barham , P. , Chen , J. , Chen , Z. , Davis , A. , Dean , J. , Devin , M. , Ghemawat , S. , Irving , G. , Isard , M. , Kudlur , M. , Levenberg , J. , Monga , R. , Moore , S. , Murray , D. G. , Steiner , B. , Tucker , P. , Vasudevan , V. , Warden , P. , Wicke , M. , Yu , Y. , and Zheng , X . TensorFlow: A System for Large-Scale Machine Learning . In Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) ( 2016 ), vol. 16 , USENIX Association , pp. 265 -- 283 . Abadi, M., Barham, P., Chen, J., Chen, Z., Davis, A., Dean, J., Devin, M., Ghemawat, S., Irving, G., Isard, M., Kudlur, M., Levenberg, J., Monga, R., Moore, S., Murray, D. G., Steiner, B., Tucker, P., Vasudevan, V., Warden, P., Wicke, M., Yu, Y., and Zheng, X. TensorFlow: A System for Large-Scale Machine Learning. In Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) (2016), vol. 16, USENIX Association, pp. 265--283."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the 2nd Conference on Systems and Machine Learning (SysML'19)","author":"Agrawal A.","year":"2019","unstructured":"Agrawal , A. , Modi , A. N. , Passos , A. , Lavoie , A. , Agarwal , A. , Shankar , A. , Ganichev , I. , Levenberg , J. , Hong , M. , Monga , R. , and Cai , S . Tensorflow eager: A multi-stage, python-embedded dsl for machine learning . In Proceedings of the 2nd Conference on Systems and Machine Learning (SysML'19) ( 2019 ). Agrawal, A., Modi, A. N., Passos, A., Lavoie, A., Agarwal, A., Shankar, A., Ganichev, I., Levenberg, J., Hong, M., Monga, R., and Cai, S. Tensorflow eager: A multi-stage, python-embedded dsl for machine learning. In Proceedings of the 2nd Conference on Systems and Machine Learning (SysML'19) (2019)."},{"key":"e_1_3_2_1_4_1","volume-title":"Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274","author":"Chen T.","year":"2015","unstructured":"Chen , T. , Li , M. , Li , Y. , Lin , M. , Wang , N. , Wang , M. , Xiao , T. , Xu , B. , Zhang , C. , and Zhang , Z . Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274 ( 2015 ). Chen, T., Li, M., Li, Y., Lin, M., Wang, N., Wang, M., Xiao, T., Xu, B., Zhang, C., and Zhang, Z. Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274 (2015)."},{"key":"e_1_3_2_1_5_1","volume-title":"Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174","author":"Chen T.","year":"2016","unstructured":"Chen , T. , Xu , B. , Zhang , C. , and Guestrin , C . Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174 ( 2016 ). Chen, T., Xu, B., Zhang, C., and Guestrin, C. Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174 (2016)."},{"key":"e_1_3_2_1_6_1","volume-title":"cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759","author":"Chetlur S.","year":"2014","unstructured":"Chetlur , S. , Woolley , C. , Vandermersch , P. , Cohen , J. , Tran , J. , Catanzaro , B. , and Shelhamer , E . cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 ( 2014 ). Chetlur, S., Woolley, C., Vandermersch, P., Cohen, J., Tran, J., Catanzaro, B., and Shelhamer, E. cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 (2014)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901323"},{"key":"e_1_3_2_1_8_1","first-page":"248","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","author":"Deng J.","year":"2009","unstructured":"Deng , J. , Dong , W. , Socher , R. , Li , L.-J. , Li , K. , and Li , F . -F. Imagenet: A large-scale hierarchical image database . In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition ( 2009 ), pp. 248 -- 255 . Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., and Li, F.-F. Imagenet: A large-scale hierarchical image database. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (2009), pp. 248--255."},{"key":"e_1_3_2_1_9_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805","author":"Devlin J.","year":"2018","unstructured":"Devlin , J. , Chang , M.-W. , Lee , K. , and Toutanova , K . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805 ( 2018 ). Devlin, J., Chang, M.-W., Lee, K., and Toutanova, K. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_10_1","volume-title":"Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal P.","year":"2017","unstructured":"Goyal , P. , Doll\u00e1r , P. , Girshick , R. , Noordhuis , P. , Wesolowski , L. , Kyrola , A. , Tulloch , A. , Jia , Y. , and He , K . Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 ( 2017 ). Goyal, P., Doll\u00e1r, P., Girshick, R., Noordhuis, P., Wesolowski, L., Kyrola, A., Tulloch, A., Jia, Y., and He, K. Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00070"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 2nd Conference on Systems and Machine Learning (SysML'19)","author":"Jia Z.","year":"2019","unstructured":"Jia , Z. , Zaharia , M. , and Aiken , A . Beyond data and model parallelism for deep neural networks . In Proceedings of the 2nd Conference on Systems and Machine Learning (SysML'19) ( 2019 ). Jia, Z., Zaharia, M., and Aiken, A. Beyond data and model parallelism for deep neural networks. In Proceedings of the 2nd Conference on Systems and Machine Learning (SysML'19) (2019)."},{"key":"e_1_3_2_1_16_1","volume-title":"Layer-centric memory reuse and data migration for extreme-scale deep learning on many-core architectures. ACM Transactions on Architecture and Code Optimization (TACO) 15, 3","author":"Jin H.","year":"2018","unstructured":"Jin , H. , Liu , B. , Jiang , W. , Ma , Y. , Shi , X. , He , B. , and Zhao , S . Layer-centric memory reuse and data migration for extreme-scale deep learning on many-core architectures. ACM Transactions on Architecture and Code Optimization (TACO) 15, 3 ( 2018 ), 37. Jin, H., Liu, B., Jiang, W., Ma, Y., Shi, X., He, B., and Zhao, S. Layer-centric memory reuse and data migration for extreme-scale deep learning on many-core architectures. ACM Transactions on Architecture and Code Optimization (TACO) 15, 3 (2018), 37."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_18_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma D. P.","year":"2014","unstructured":"Kingma , D. P. , and Ba , J . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 ( 2014 ). Kingma, D. P., and Ba, J. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/E14-3011"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304044"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of ML Systems Workshop in NIPS","author":"Meng C.","year":"2017","unstructured":"Meng , C. , Sun , M. , Yang , J. , Qiu , M. , and Gu , Y . Training deeper models by gpu memory optimization on tensorflow . In Proceedings of ML Systems Workshop in NIPS ( 2017 ). Meng, C., Sun, M., Yang, J., Qiu, M., and Gu, Y. Training deeper models by gpu memory optimization on tensorflow. In Proceedings of ML Systems Workshop in NIPS (2017)."},{"key":"e_1_3_2_1_23_1","first-page":"8024","volume-title":"Advances in Neural Information Processing Systems","author":"Paszke A.","year":"2019","unstructured":"Paszke , A. , Gross , S. , Massa , F. , Lerer , A. , Bradbury , J. , Chanan , G. , Killeen , T. , Lin , Z. , Gimelshein , N. , Antiga , L. , Desmaison , A. , Kopf , A. , Yang , E. , DeVito , Z. , Raison , M. , Tejani , A. , Chilamkurthy , S. , Steiner , B. , Fang , L. , Bai , J. , and Chintala , S . Pytorch: An imperative style, high-performance deep learning library . In Advances in Neural Information Processing Systems ( 2019 ), pp. 8024 -- 8035 . Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., Desmaison, A., Kopf, A., Yang, E., DeVito, Z., Raison, M., Tejani, A., Chilamkurthy, S., Steiner, B., Fang, L., Bai, J., and Chintala, S. Pytorch: An imperative style, high-performance deep learning library. In Advances in Neural Information Processing Systems (2019), pp. 8024--8035."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00017"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2945397"},{"key":"e_1_3_2_1_27_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan K.","year":"2014","unstructured":"Simonyan , K. , and Zisserman , A . Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 ( 2014 ). Simonyan, K., and Zisserman, A. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00036"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00027"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"e_1_3_2_1_32_1","volume-title":"Unifying data, model and hybrid parallelism in deep learning via tensor tiling. arXiv preprint arXiv:1805.04170","author":"Wang M.","year":"2018","unstructured":"Wang , M. , Huang , C.-c. , and Li , J . Unifying data, model and hybrid parallelism in deep learning via tensor tiling. arXiv preprint arXiv:1805.04170 ( 2018 ). Wang, M., Huang, C.-c., and Li, J. Unifying data, model and hybrid parallelism in deep learning via tensor tiling. arXiv preprint arXiv:1805.04170 (2018)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"e_1_3_2_1_34_1","first-page":"595","volume-title":"Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao W.","year":"2018","unstructured":"Xiao , W. , Bhardwaj , R. , Ramjee , R. , Sivathanu , M. , Kwatra , N. , Han , Z. , Patel , P. , Peng , X. , Zhao , H. , Zhang , Q. , Yang , F. , and Zhou , L . Gandiva: Introspective cluster scheduling for deep learning . In Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18) ( 2018 ), pp. 595 -- 610 . Xiao, W., Bhardwaj, R., Ramjee, R., Sivathanu, M., Kwatra, N., Han, Z., Patel, P., Peng, X., Zhao, H., Zhang, Q., Yang, F., and Zhou, L. Gandiva: Introspective cluster scheduling for deep learning. In Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18) (2018), pp. 595--610."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267809.3275445"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICNN.1993.298720"},{"key":"e_1_3_2_1_37_1","volume-title":"Efficient memory management for gpu-based deep learning systems. arXiv preprint arXiv:1903.06631","author":"Zhang J.","year":"2019","unstructured":"Zhang , J. , Yeung , S. H. , Shu , Y. , He , B. , and Wang , W . Efficient memory management for gpu-based deep learning systems. arXiv preprint arXiv:1903.06631 ( 2019 ). Zhang, J., Yeung, S. H., Shu, Y., He, B., and Wang, W. Efficient memory management for gpu-based deep learning systems. arXiv preprint arXiv:1903.06631 (2019)."}],"event":{"name":"ASPLOS '20: Architectural Support for Programming Languages and Operating Systems","location":"Lausanne Switzerland","acronym":"ASPLOS '20","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3373376.3378505","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3373376.3378505","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:38:16Z","timestamp":1750199896000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3373376.3378505"}},"subtitle":["Tensor-based GPU Memory Management for Deep Learning"],"short-title":[],"issued":{"date-parts":[[2020,3,9]]},"references-count":37,"alternative-id":["10.1145\/3373376.3378505","10.1145\/3373376"],"URL":"https:\/\/doi.org\/10.1145\/3373376.3378505","relation":{},"subject":[],"published":{"date-parts":[[2020,3,9]]},"assertion":[{"value":"2020-03-13","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}