{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T05:33:25Z","timestamp":1757309605112,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,9,14]],"date-time":"2023-09-14T00:00:00Z","timestamp":1694649600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,9,14]]},"DOI":"10.1145\/3604915.3608778","type":"proceedings-article","created":{"date-parts":[[2023,9,14]],"date-time":"2023-09-14T22:40:23Z","timestamp":1694731223000},"page":"430-442","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["InTune: Reinforcement Learning-based Data Pipeline Optimization for Deep Recommendation Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0214-4812","authenticated-orcid":false,"given":"Kabir","family":"Nagrecha","sequence":"first","affiliation":[{"name":"Computer Science &amp; Engineering, University of California, San Diego, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7793-0860","authenticated-orcid":false,"given":"Lingyi","family":"Liu","sequence":"additional","affiliation":[{"name":"Machine Learning Platform, Netflix, Inc., USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7783-9035","authenticated-orcid":false,"given":"Pablo","family":"Delgado","sequence":"additional","affiliation":[{"name":"Machine Learning Platform, Netflix, Inc., USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0570-6796","authenticated-orcid":false,"given":"Prasanna","family":"Padmanabhan","sequence":"additional","affiliation":[{"name":"Machine Learning Platform, Netflix, Inc., USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,9,14]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00072"},{"key":"e_1_3_2_1_2_1","volume-title":"Heterogeneous Acceleration Pipeline for Recommendation System Training. arXiv preprint arXiv:2204.05436","author":"Adnan Muhammad","year":"2022","unstructured":"Muhammad Adnan, Yassaman\u00a0Ebrahimzadeh Maboud, Divya Mahajan, and Prashant\u00a0J Nair. 2022. Heterogeneous Acceleration Pipeline for Recommendation System Training. arXiv preprint arXiv:2204.05436 (2022)."},{"key":"e_1_3_2_1_3_1","volume-title":"Generalization in reinforcement learning: Safely approximating the value function. Advances in neural information processing systems 7","author":"Boyan Justin","year":"1994","unstructured":"Justin Boyan and Andrew Moore. 1994. Generalization in reinforcement learning: Safely approximating the value function. Advances in neural information processing systems 7 (1994)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2988450.2988454"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/40.46766"},{"volume-title":"Deep Learning","author":"Goodfellow Ian","key":"e_1_3_2_1_6_1","unstructured":"Ian Goodfellow, Yoshua Bengio, and Aaron Courville. 2016. Deep Learning. MIT Press. http:\/\/www.deeplearningbook.org."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","unstructured":"Ameer Haj-Ali Nesreen\u00a0K. Ahmed Ted Willke Joseph Gonzalez Krste Asanovic and Ion Stoica. 2019. A View on Deep Reinforcement Learning in System Optimization. https:\/\/doi.org\/10.48550\/ARXIV.1908.01275","DOI":"10.48550\/ARXIV.1908.01275"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","unstructured":"Aaron Harlap Deepak Narayanan Amar Phanishayee Vivek Seshadri Nikhil Devanur Greg Ganger and Phil Gibbons. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. https:\/\/doi.org\/10.48550\/ARXIV.1806.03377","DOI":"10.48550\/ARXIV.1806.03377"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCOM.2017.1700246"},{"key":"e_1_3_2_1_11_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V Le, Yonghui Wu, 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","unstructured":"Yanping Huang Youlong Cheng Ankur Bapna Orhan Firat Mia\u00a0Xu Chen Dehao Chen HyoukJoong Lee Jiquan Ngiam Quoc\u00a0V. Le Yonghui Wu and Zhifeng Chen. 2018. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. https:\/\/doi.org\/10.48550\/ARXIV.1811.06965","DOI":"10.48550\/ARXIV.1811.06965"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3517848"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","unstructured":"Zhihao Jia Matei Zaharia and Alex Aiken. 2018. Beyond Data and Model Parallelism for Deep Neural Networks. https:\/\/doi.org\/10.48550\/ARXIV.1807.05358","DOI":"10.48550\/ARXIV.1807.05358"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","unstructured":"Anssi Kanervisto Christian Scheller and Ville Hautam\u00e4ki. 2020. Action Space Shaping in Deep Reinforcement Learning. https:\/\/doi.org\/10.48550\/ARXIV.2004.00980","DOI":"10.48550\/ARXIV.2004.00980"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","unstructured":"Sanjay Krishnan Zongheng Yang Ken Goldberg Joseph Hellerstein and Ion Stoica. 2018. Learning to Optimize Join Queries With Deep Reinforcement Learning. https:\/\/doi.org\/10.48550\/ARXIV.1808.03196","DOI":"10.48550\/ARXIV.1808.03196"},{"key":"e_1_3_2_1_17_1","first-page":"33","article-title":"Plumber: Diagnosing and removing performance bottlenecks in machine learning data pipelines","volume":"4","author":"Kuchnik Michael","year":"2022","unstructured":"Michael Kuchnik, Ana Klimovic, Jiri Simsa, Virginia Smith, and George Amvrosiadis. 2022. Plumber: Diagnosing and removing performance bottlenecks in machine learning data pipelines. Proceedings of Machine Learning and Systems 4 (2022), 33\u201351.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"11th Annual Conference on Innovative Data Systems Research (CIDR \u201821)","author":"Kumar Arun","year":"2021","unstructured":"Arun Kumar, Supun Nakandala, Yuhao Zhang, Side Li, Advitya Gemawat, and Kabir Nagrecha. 2021. Cerebro: A layered data platform for scalable deep learning. In 11th Annual Conference on Innovative Data Systems Research (CIDR \u201821)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","unstructured":"Dmitry Lepikhin HyoukJoong Lee Yuanzhong Xu Dehao Chen Orhan Firat Yanping Huang Maxim Krikun Noam Shazeer and Zhifeng Chen. 2020. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. https:\/\/doi.org\/10.48550\/ARXIV.2006.16668","DOI":"10.48550\/ARXIV.2006.16668"},{"key":"e_1_3_2_1_20_1","volume-title":"Massively parallel hyperparameter tuning. arXiv preprint arXiv:1810.05934 5","author":"Li Liam","year":"2018","unstructured":"Liam Li, Kevin Jamieson, Afshin Rostamizadeh, Ekaterina Gonina, Moritz Hardt, Benjamin Recht, and Ameet Talwalkar. 2018. Massively parallel hyperparameter tuning. arXiv preprint arXiv:1810.05934 5 (2018)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","unstructured":"Zhuohan Li Siyuan Zhuang Shiyuan Guo Danyang Zhuo Hao Zhang Dawn Song and Ion Stoica. 2021. TeraPipe: Token-Level Pipeline Parallelism for Training Large-Scale Language Models. https:\/\/doi.org\/10.48550\/ARXIV.2102.07988","DOI":"10.48550\/ARXIV.2102.07988"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.14778\/3342263.3342644"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3211954.3211957"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","unstructured":"Jayashree Mohan Amar Phanishayee Ashish Raniwala and Vijay Chidambaram. 2020. Analyzing and Mitigating Data Stalls in DNN Training. https:\/\/doi.org\/10.48550\/ARXIV.2007.06775","DOI":"10.48550\/ARXIV.2007.06775"},{"key":"e_1_3_2_1_25_1","volume-title":"Ray: A Distributed Framework for Emerging AI Applications. arxiv:1712.05889\u00a0[cs.DC]","author":"Moritz Philipp","year":"2018","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael\u00a0I. Jordan, and Ion Stoica. 2018. Ray: A Distributed Framework for Emerging AI Applications. arxiv:1712.05889\u00a0[cs.DC]"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","unstructured":"Dheevatsa Mudigere 2021. Software-Hardware Co-design for Fast and Scalable Training of Deep Learning Recommendation Models. https:\/\/doi.org\/10.48550\/ARXIV.2104.05158","DOI":"10.48550\/ARXIV.2104.05158"},{"key":"e_1_3_2_1_27_1","volume-title":"tf. data: A machine learning data processing framework. arXiv preprint arXiv:2101.12127","author":"Murray G","year":"2021","unstructured":"Derek\u00a0G Murray, Jiri Simsa, Ana Klimovic, and Ihor Indyk. 2021. tf. data: A machine learning data processing framework. arXiv preprint arXiv:2101.12127 (2021)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3450571"},{"key":"e_1_3_2_1_29_1","unstructured":"Kabir Nagrecha. 2023. Systems for Parallel and Distributed Large-Model Deep Learning Training."},{"key":"e_1_3_2_1_30_1","volume-title":"Hydra: A System for Large Multi-Model Deep Learning. arxiv:2110.08633\u00a0[cs.DC]","author":"Nagrecha Kabir","year":"2022","unstructured":"Kabir Nagrecha and Arun Kumar. 2022. Hydra: A System for Large Multi-Model Deep Learning. arxiv:2110.08633\u00a0[cs.DC]"},{"key":"e_1_3_2_1_31_1","volume-title":"Saturn: An Optimized Data System for Multi-Large-Model Deep Learning Workloads (Information System Architectures).","author":"Nagrecha Kabir","year":"2023","unstructured":"Kabir Nagrecha and Arun Kumar. 2023. Saturn: An Optimized Data System for Multi-Large-Model Deep Learning Workloads (Information System Architectures). (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3517846"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397461"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488793"},{"key":"e_1_3_2_1_35_1","unstructured":"Kunle Olukotun. 2022. Systems for ML and ML for Systems: A Virtuous Cycle. (2022). https:\/\/mlsys.org\/virtual\/2022\/invited-talk\/2065 MLSys."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"Jennifer Ortiz Magdalena Balazinska Johannes Gehrke and S.\u00a0Sathiya Keerthi. 2018. Learning State Representations for Query Optimization with Deep Reinforcement Learning. https:\/\/doi.org\/10.48550\/ARXIV.1803.08604","DOI":"10.48550\/ARXIV.1803.08604"},{"key":"e_1_3_2_1_37_1","unstructured":"Guilherme Penedo Quentin Malartic Daniel Hesslow Ruxandra Cojocaru Alessandro Cappelli Hamza Alobeidli Baptiste Pannier Ebtesam Almazrouei and Julien Launay. 2023. The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data and Web Data Only. arxiv:2306.01116\u00a0[cs.CL]"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3052895"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3052895"},{"key":"e_1_3_2_1_41_1","volume-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning.. In OSDI, Vol.\u00a021. 1\u201318.","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao, Sang\u00a0Keun Choe, Suhas\u00a0Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory\u00a0R Ganger, and Eric\u00a0P Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning.. In OSDI, Vol.\u00a021. 1\u201318."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","unstructured":"Jie Ren Samyam Rajbhandari Reza\u00a0Yazdani Aminabadi Olatunji Ruwase Shuangyan Yang Minjia Zhang Dong Li and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. https:\/\/doi.org\/10.48550\/ARXIV.2101.06840","DOI":"10.48550\/ARXIV.2101.06840"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507777"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. https:\/\/doi.org\/10.48550\/ARXIV.1909.08053","DOI":"10.48550\/ARXIV.1909.08053"},{"key":"e_1_3_2_1_45_1","unstructured":"Gerald Tesauro Rajarshi Das and Nicholas\u00a0K. Jong. 2005. Online Performance Management Using Hybrid Reinforcement Learning."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the Fourth Connectionist Models Summer School, Vol.\u00a0255","author":"Thrun Sebastian","year":"1993","unstructured":"Sebastian Thrun and Anton Schwartz. 1993. Issues in using function approximation for reinforcement learning. In Proceedings of the Fourth Connectionist Models Summer School, Vol.\u00a0255. Hillsdale, NJ, 263."},{"key":"e_1_3_2_1_47_1","first-page":"664","article-title":"sensai: Convnets decomposition via class parallelism for fast inference on live data","volume":"3","author":"Wang Guanhua","year":"2021","unstructured":"Guanhua Wang, Zhuang Liu, Brandon Hsieh, Siyuan Zhuang, Joseph Gonzalez, Trevor Darrell, and Ion Stoica. 2021. sensai: Convnets decomposition via class parallelism for fast inference on live data. Proceedings of Machine Learning and Systems 3 (2021), 664\u2013679.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00144"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3547405"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3546765"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2111.00364"},{"key":"e_1_3_2_1_52_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, 2018. Gandiva: Introspective cluster scheduling for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 595\u2013610."},{"key":"e_1_3_2_1_53_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. 2020. { AntMan} : Dynamic Scaling on { GPU} Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 533\u2013548."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533044"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","unstructured":"Victor Zhong Caiming Xiong and Richard Socher. 2017. Seq2SQL: Generating Structured Queries from Natural Language using Reinforcement Learning. https:\/\/doi.org\/10.48550\/ARXIV.1709.00103","DOI":"10.48550\/ARXIV.1709.00103"}],"event":{"name":"RecSys '23: Seventeenth ACM Conference on Recommender Systems","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGAI ACM Special Interest Group on Artificial Intelligence","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGIR ACM Special Interest Group on Information Retrieval","SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGecom Special Interest Group on Economics and Computation"],"location":"Singapore Singapore","acronym":"RecSys '23"},"container-title":["Proceedings of the 17th ACM Conference on Recommender Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3604915.3608778","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3604915.3608778","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:06Z","timestamp":1750178766000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3604915.3608778"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,14]]},"references-count":55,"alternative-id":["10.1145\/3604915.3608778","10.1145\/3604915"],"URL":"https:\/\/doi.org\/10.1145\/3604915.3608778","relation":{},"subject":[],"published":{"date-parts":[[2023,9,14]]},"assertion":[{"value":"2023-09-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}