{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T00:06:46Z","timestamp":1755907606683,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,2,2]],"date-time":"2024-02-02T00:00:00Z","timestamp":1706832000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,2,2]]},"DOI":"10.1145\/3651671.3651737","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T18:55:50Z","timestamp":1717786550000},"page":"530-535","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Distilling Multi-Step Reasoning Capabilities into Smaller Language Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1458-1208","authenticated-orcid":false,"given":"Yauwai","family":"Yim","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3811-5968","authenticated-orcid":false,"given":"Zirui","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01065"},{"key":"e_1_3_2_1_2_1","volume-title":"Scott\u00a0Wen tau Yih, and Yejin Choi","author":"Bhagavatula Chandra","year":"2020","unstructured":"Chandra Bhagavatula, Ronan\u00a0Le Bras, Chaitanya Malaviya, Keisuke Sakaguchi, Ari Holtzman, Hannah Rashkin, Doug Downey, Scott\u00a0Wen tau Yih, and Yejin Choi. 2020. Abductive Commonsense Reasoning. arxiv:1908.05739\u00a0[cs.CL]"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Prajjwal Bhargava and Vincent Ng. 2022. DiscoSense: Commonsense Reasoning with Discourse Connectives. arxiv:2210.12478\u00a0[cs.CL]","DOI":"10.18653\/v1\/2022.emnlp-main.703"},{"key":"e_1_3_2_1_4_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown B.","year":"2020","unstructured":"Tom\u00a0B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel\u00a0M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual, Hugo Larochelle, Marc\u2019Aurelio Ranzato, Raia Hadsell, Maria-Florina Balcan, and Hsuan-Tien Lin (Eds.)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2304.14827"},{"key":"e_1_3_2_1_6_1","volume-title":"Scaling Instruction-Finetuned Language Models. 
CoRR abs\/2210.11416","author":"Chung Hyung\u00a0Won","year":"2022","unstructured":"Hyung\u00a0Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang\u00a0Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent\u00a0Y. Zhao, Yanping Huang, Andrew\u00a0M. Dai, Hongkun Yu, Slav Petrov, Ed\u00a0H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc\u00a0V. Le, and Jason Wei. 2022. Scaling Instruction-Finetuned Language Models. CoRR abs\/2210.11416 (2022). arXiv:2210.11416"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019","volume":"1","author":"Geva Mor","year":"2019","unstructured":"Mor Geva, Eric Malmi, Idan Szpektor, and Jonathan Berant. 2019. DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and Short Papers), Jill Burstein, Christy Doran, and Thamar Solorio (Eds.). Association for Computational Linguistics, 3443\u20133455."},{"key":"e_1_3_2_1_8_1","volume-title":"Distilling the Knowledge in a Neural Network. CoRR abs\/1503.02531","author":"Hinton E.","year":"2015","unstructured":"Geoffrey\u00a0E. Hinton, Oriol Vinyals, and Jeffrey Dean. 2015. Distilling the Knowledge in a Neural Network. CoRR abs\/1503.02531 (2015). arXiv:1503.02531"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2305.12870"},{"key":"e_1_3_2_1_10_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR abs\/1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR abs\/1907.11692 (2019). arXiv:1907.11692"},{"key":"e_1_3_2_1_11_1","volume-title":"Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations, ICLR 2019","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations, ICLR 2019, New Orleans, LA, USA, May 6-9, 2019. OpenReview.net."},{"key":"e_1_3_2_1_12_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll\u00a0L. Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray John Schulman Jacob Hilton Fraser Kelton Luke Miller Maddie Simens Amanda Askell Peter Welinder Paul\u00a0F. Christiano Jan Leike and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. In NeurIPS."},{"key":"e_1_3_2_1_13_1","volume-title":"Instruction Tuning with GPT-4. CoRR abs\/2304.03277","author":"Peng Baolin","year":"2023","unstructured":"Baolin Peng, Chunyuan Li, Pengcheng He, Michel Galley, and Jianfeng Gao. 2023. Instruction Tuning with GPT-4. CoRR abs\/2304.03277 (2023). 
arXiv:2304.03277"},{"key":"e_1_3_2_1_14_1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter\u00a0J. Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. J. Mach. Learn. Res. 21 (2020), 140:1\u2013140:67.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.191"},{"key":"e_1_3_2_1_16_1","unstructured":"Keisuke Sakaguchi Ronan\u00a0Le Bras Chandra Bhagavatula and Yejin Choi. 2019. WinoGrande: An Adversarial Winograd Schema Challenge at Scale. arxiv:1907.10641\u00a0[cs.CL]"},{"key":"e_1_3_2_1_17_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. CoRR abs\/1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. CoRR abs\/1910.01108 (2019). arXiv:1910.01108"},{"key":"e_1_3_2_1_18_1","volume-title":"Multitask Prompted Training Enables Zero-Shot Task Generalization. In The Tenth International Conference on Learning Representations, ICLR 2022","author":"Sanh Victor","year":"2022","unstructured":"Victor Sanh, Albert Webson, Colin Raffel, Stephen\u00a0H. Bach, Lintang Sutawika, Zaid Alyafeai, Antoine Chaffin, Arnaud Stiegler, Arun Raja, Manan Dey, M\u00a0Saiful Bari, Canwen Xu, Urmish Thakker, Shanya\u00a0Sharma Sharma, Eliza Szczechla, Taewoon Kim, Gunjan Chhablani, Nihal\u00a0V. Nayak, Debajyoti Datta, Jonathan Chang, Mike\u00a0Tian-Jian Jiang, Han Wang, Matteo Manica, Sheng Shen, Zheng\u00a0Xin Yong, Harshit Pandey, Rachel Bawden, Thomas Wang, Trishala Neeraj, Jos Rozen, Abheesht Sharma, Andrea Santilli, Thibault F\u00e9vry, Jason\u00a0Alan Fries, Ryan Teehan, Teven\u00a0Le Scao, Stella Biderman, Leo Gao, Thomas Wolf, and Alexander\u00a0M. Rush. 2022. Multitask Prompted Training Enables Zero-Shot Task Generalization. In The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022. OpenReview.net."},{"key":"e_1_3_2_1_19_1","volume-title":"2022. BLOOM: A 176B-Parameter Open-Access Multilingual Language Model. CoRR abs\/2211.05100","author":"Scao Teven\u00a0Le","year":"2022","unstructured":"Teven\u00a0Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ilic, Daniel Hesslow, Roman Castagn\u00e9, Alexandra\u00a0Sasha Luccioni, Fran\u00e7ois Yvon, Matthias Gall\u00e9, Jonathan Tow, Alexander\u00a0M. Rush, Stella Biderman, Albert Webson, Pawan\u00a0Sasanka Ammanamanchi, Thomas Wang, Beno\u00eet Sagot, Niklas Muennighoff, Albert\u00a0Villanova del Moral, Olatunji Ruwase, Rachel Bawden, Stas Bekman, Angelina McMillan-Major, Iz Beltagy, Huu Nguyen, Lucile Saulnier, Samson Tan, Pedro\u00a0Ortiz Suarez, Victor Sanh, Hugo Lauren\u00e7on, Yacine Jernite, Julien Launay, Margaret Mitchell, Colin Raffel, Aaron Gokaslan, Adi Simhi, Aitor Soroa, Alham\u00a0Fikri Aji, Amit Alfassy, Anna Rogers, Ariel\u00a0Kreisberg Nitzav, Canwen Xu, Chenghao Mou, Chris Emezue, Christopher Klamm, Colin Leong, Daniel van Strien, David\u00a0Ifeoluwa Adelani, and et al.2022. BLOOM: A 176B-Parameter Open-Access Multilingual Language Model. CoRR abs\/2211.05100 (2022). 
arXiv:2211.05100"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.441"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1351"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1355"},{"key":"e_1_3_2_1_23_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems","author":"Wang Wenhui","year":"2020","unstructured":"Wenhui Wang, Furu Wei, Li Dong, Hangbo Bao, Nan Yang, and Ming Zhou. 2020. MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual, Hugo Larochelle, Marc\u2019Aurelio Ranzato, Raia Hadsell, Maria-Florina Balcan, and Hsuan-Tien Lin (Eds.)."},{"key":"e_1_3_2_1_24_1","volume-title":"The Tenth International Conference on Learning Representations, ICLR 2022","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Maarten Bosma, Vincent\u00a0Y. Zhao, Kelvin Guu, Adams\u00a0Wei Yu, Brian Lester, Nan Du, Andrew\u00a0M. Dai, and Quoc\u00a0V. Le. 2022. Finetuned Language Models are Zero-Shot Learners. In The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022. OpenReview.net."},{"key":"e_1_3_2_1_25_1","volume-title":"Emergent Abilities of Large Language Models. Trans. Mach. Learn. Res. 2022","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, Ed\u00a0H. Chi, Tatsunori Hashimoto, Oriol Vinyals, Percy Liang, Jeff Dean, and William Fedus. 2022. Emergent Abilities of Large Language Models. Trans. Mach. Learn. Res. 2022 (2022)."},{"key":"e_1_3_2_1_26_1","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Brian Ichter Fei Xia Ed\u00a0H. Chi Quoc\u00a0V. Le and Denny Zhou. 2022. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In NeurIPS."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.341"},{"key":"e_1_3_2_1_28_1","volume-title":"HuggingFace\u2019s Transformers: State-of-the-art Natural Language Processing. CoRR abs\/1910.03771","author":"Wolf Thomas","year":"2019","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, and Jamie Brew. 2019. HuggingFace\u2019s Transformers: State-of-the-art Natural Language Processing. CoRR abs\/1910.03771 (2019). arXiv:1910.03771"},{"key":"e_1_3_2_1_29_1","volume-title":"XLNet: Generalized Autoregressive Pretraining for Language Understanding. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime\u00a0G. Carbonell, Ruslan Salakhutdinov, and Quoc\u00a0V. Le. 2019. XLNet: Generalized Autoregressive Pretraining for Language Understanding. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada, Hanna\u00a0M. Wallach, Hugo Larochelle, Alina Beygelzimer, Florence d\u2019Alch\u00e9-Buc, Emily\u00a0B. 
Fox, and Roman Garnett (Eds.). 5754\u20135764."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Rowan Zellers Ari Holtzman Yonatan Bisk Ali Farhadi and Yejin Choi. 2019. HellaSwag: Can a Machine Really Finish Your Sentence?arxiv:1905.07830\u00a0[cs.CL]","DOI":"10.18653\/v1\/P19-1472"},{"key":"e_1_3_2_1_31_1","volume-title":"Do Not Blindly Imitate the Teacher: Using Perturbed Loss for Knowledge Distillation. CoRR abs\/2305.05010","author":"Zhang Rongzhi","year":"2023","unstructured":"Rongzhi Zhang, Jiaming Shen, Tianqi Liu, Jialu Liu, Michael Bendersky, Marc Najork, and Chao Zhang. 2023. Do Not Blindly Imitate the Teacher: Using Perturbed Loss for Knowledge Distillation. CoRR abs\/2305.05010 (2023). arXiv:2305.05010"},{"key":"e_1_3_2_1_32_1","volume-title":"OPT: Open Pre-trained Transformer Language Models. CoRR abs\/2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona\u00a0T. Diab, Xian Li, Xi\u00a0Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit\u00a0Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. CoRR abs\/2205.01068 (2022). arXiv:2205.01068"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.622"}],"event":{"name":"ICMLC 2024: 2024 16th International Conference on Machine Learning and Computing","acronym":"ICMLC 2024","location":"Shenzhen China"},"container-title":["Proceedings of the 2024 16th International Conference on Machine Learning and Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3651671.3651737","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3651671.3651737","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T11:18:24Z","timestamp":1755861504000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3651671.3651737"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,2]]},"references-count":33,"alternative-id":["10.1145\/3651671.3651737","10.1145\/3651671"],"URL":"https:\/\/doi.org\/10.1145\/3651671.3651737","relation":{},"subject":[],"published":{"date-parts":[[2024,2,2]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}