{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T05:48:18Z","timestamp":1777873698901,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737245","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:07:39Z","timestamp":1754255259000},"page":"4935-4944","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Matryoshka Model Learning for Improved Elastic Student Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4308-2862","authenticated-orcid":false,"given":"Chetan","family":"Verma","sequence":"first","affiliation":[{"name":"Google, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5934-6079","authenticated-orcid":false,"given":"Aditya Srinivas","family":"Timmaraju","sequence":"additional","affiliation":[{"name":"Google DeepMind, Bengaluru, KA, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3520-9627","authenticated-orcid":false,"given":"Cho-Jui","family":"Hsieh","sequence":"additional","affiliation":[{"name":"Google, Los Angeles, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6694-0255","authenticated-orcid":false,"given":"Suyash","family":"Damle","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4566-4890","authenticated-orcid":false,"given":"Ngot","family":"Bui","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9522-0366","authenticated-orcid":false,"given":"Yang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3496-8530","authenticated-orcid":false,"given":"Wen","family":"Chen","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6023-0226","authenticated-orcid":false,"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0833-3438","authenticated-orcid":false,"given":"Prateek","family":"Jain","sequence":"additional","affiliation":[{"name":"Google DeepMind, Bengaluru, KA, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2759-1416","authenticated-orcid":false,"given":"Inderjit","family":"Dhillon","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Rohan Anil Sandra Gadanho Da Huang Nijith Jacob Zhuoshu Li Dong Lin Todd Phillips Cristina Pop Kevin Regan Gil I Shamir et al. 2022. On the factory floor: ML engineering for industrial-scale ads recommendation models. arXiv preprint arXiv:2209.05310(2022)."},{"key":"e_1_3_2_2_2_1","unstructured":"Rohan Anil Vineet Gupta Tomer Koren Kevin Regan and Yoram Singer. 2020. Scalable second order optimization for deep learning. arXiv preprint arXiv:2002.09018(2020)."},{"key":"e_1_3_2_2_3_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Anil Rohan","year":"2019","unstructured":"Rohan Anil, Vineet Gupta, Tomer Koren, and Yoram Singer. 2019. Memory efficient adaptive optimization. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_2_4_1","unstructured":"Rohan Anil Gabriel Pereyra Alexandre Passos Robert Ormandi George E Dahl and Geoffrey E Hinton. 2018. Large scale distributed neural network training through online distillation. arXiv preprint arXiv:1804.03235(2018)."},{"key":"e_1_3_2_2_5_1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Nick Ryder Melanie Subbiah Benjamin Mann","year":"2020","unstructured":"Benjamin Mann Nick Ryder Melanie Subbiah Jared D. Kaplan Prafulla Dhariwal Arvind Neelakantan et al. Brown, Tom. 2020. Language models are few-shot learners. Advances in Neural Information Processing Systems, Vol. 33, 1877-1901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_6_1","volume-title":"Once-for-all: Train one network and specialize it for efficient deployment. arXiv preprint arXiv:1908.09791(2019).","author":"Cai Han","year":"2019","unstructured":"Han Cai, Chuang Gan, Tianzhe Wang, Zhekai Zhang, and Song Han. 2019. Once-for-all: Train one network and specialize it for efficient deployment. arXiv preprint arXiv:1908.09791(2019)."},{"key":"e_1_3_2_2_7_1","volume-title":"Flextron: Many-in-One Flexible Large Language Model. arXiv preprint arXiv:2406.10260(2024).","author":"Cai Ruisi","year":"2024","unstructured":"Ruisi Cai, Saurav Muralidharan, Greg Heinrich, Hongxu Yin, Zhangyang Wang, Jan Kautz, and Pavlo Molchanov. 2024. Flextron: Many-in-One Flexible Large Language Model. arXiv preprint arXiv:2406.10260(2024)."},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"al Clark Christopher","year":"2019","unstructured":"Christopher et al Clark. 2019. BoolQ: Exploring the Surprising Difficulty of Natural Yes\/No Questions. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)."},{"key":"e_1_3_2_2_9_1","unstructured":"Peter Clark Isaac Cowhey Oren Etzioni Tushar Khot Ashish Sabharwal Carissa Schoenick and Oyvind Tafjord. 2018. Think you have solved question answering? try arc the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457(2018)."},{"key":"e_1_3_2_2_10_1","volume-title":"International Conference on Machine Learning. PMLR, 7480-7512","author":"Dehghani Mostafa","year":"2023","unstructured":"Mostafa Dehghani, Josip Djolonga, Basil Mustafa, Piotr Padlewski, Jonathan Heek, Justin Gilmer, Andreas Peter Steiner, Mathilde Caron, Robert Geirhos, Ibrahim Alabdulmohsin, et al., 2023. Scaling vision transformers to 22 billion parameters. In International Conference on Machine Learning. PMLR, 7480-7512."},{"key":"e_1_3_2_2_11_1","volume-title":"MatFormer: Nested Transformer for Elastic Inference. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Devvrit Fnu","year":"2024","unstructured":"Fnu Devvrit, Sneha Kudugunta, Aditya Kusupati, Tim Dettmers, Kaifeng Chen, Inderjit S Dhillon, Yulia Tsvetkov, Hannaneh Hajishirzi, Sham M. Kakade, Ali Farhadi, and Prateek Jain. 2024. MatFormer: Nested Transformer for Elastic Inference. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_12_1","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi John","year":"2011","unstructured":"John Duchi, Elad Hazan, and Yoram Singer. 2011a. Adaptive subgradient methods for online learning and stochastic optimization. Journal of machine learning research, Vol. 12, 7 (2011).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_2_13_1","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi John","year":"2011","unstructured":"John Duchi, Elad Hazan, and Yoram Singer. 2011b. Adaptive subgradient methods for online learning and stochastic optimization. Journal of machine learning research, Vol. 12, 7 (2011).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_2_14_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Duvvuri Sai Surya","unstructured":"Sai Surya Duvvuri, Fnu Devvrit, Rohan Anil, Cho-Jui Hsieh, and Inderjit S Dhillon. [n.d.]. CASPR: Combining Axes Preconditioners through Kronecker Approximation for Deep Learning. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_15_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, Vol. 23, 120 (2022), 1-39.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_16_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman et al. 2024. The Llama 3 herd of models. arXiv preprint arXiv:2407.21783."},{"key":"e_1_3_2_2_17_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Gupta Vineet","year":"2018","unstructured":"Vineet Gupta, Tomer Koren, and Yoram Singer. 2018a. Shampoo: Preconditioned stochastic tensor optimization. In International Conference on Machine Learning. PMLR, 1842-1850."},{"key":"e_1_3_2_2_18_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Gupta Vineet","year":"2018","unstructured":"Vineet Gupta, Tomer Koren, and Yoram Singer. 2018b. Shampoo: Preconditioned stochastic tensor optimization. In International Conference on Machine Learning. PMLR, 1842-1850."},{"key":"e_1_3_2_2_19_1","volume-title":"Distilling the knowledge in a neural network. CoRR","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. CoRR, Vol. abs\/1503.02531 (2015)."},{"key":"e_1_3_2_2_20_1","volume-title":"International conference on machine learning. PMLR, 9099-9117","author":"Hua Weizhe","year":"2022","unstructured":"Weizhe Hua, Zihang Dai, Hanxiao Liu, and Quoc Le. 2022. Transformer quality in linear time. In International conference on machine learning. PMLR, 9099-9117."},{"key":"e_1_3_2_2_21_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014).","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma. 2014a. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014)."},{"key":"e_1_3_2_2_22_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014).","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma. 2014b. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014)."},{"key":"e_1_3_2_2_23_1","first-page":"2117","article-title":"Knowledge Distillation from Offline to Streaming RNN Transducer for End-to-End Speech Recognition","author":"Kurata Gakuto","year":"2020","unstructured":"Gakuto Kurata and George Saon. 2020. Knowledge Distillation from Offline to Streaming RNN Transducer for End-to-End Speech Recognition. In Interspeech. 2117-2121.","journal-title":"Interspeech."},{"key":"e_1_3_2_2_24_1","unstructured":"George Kurian Somayeh Sardashti Ryan Sims Felix Berger Gary Holt Yang Li Jeremiah Willcock Kaiyuan Wang Herve Quiroz Abdulrahman Salem and Julian Grady. 2025. Scalable Machine Learning Training Infrastructure for Online Ads Recommendation and Auction Scoring Modeling at Google. arxiv:2501.10546 [cs.DC] https:\/\/arxiv.org\/abs\/2501.10546"},{"key":"e_1_3_2_2_25_1","first-page":"284","volume-title":"Perth","author":"Lan Xu","year":"2019","unstructured":"Xu Lan, Xiatian Zhu, and Shaogang Gong. 2019. Self-referenced deep learning. In Computer Vision-ACCV 2018: 14th Asian Conference on Computer Vision, Perth, Australia, December 2-6, 2018, Revised Selected Papers, Part II 14. Springer, 284-300."},{"key":"e_1_3_2_2_26_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_2_27_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694-9705."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3615017"},{"key":"e_1_3_2_2_29_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research","volume":"20867","author":"Liang Chen","year":"2023","unstructured":"Chen Liang, Simiao Zuo, Qingru Zhang, Pengcheng He, Weizhu Chen, and Tuo Zhao. 2023. Less is More: Task-aware Layer-wise Distillation for Language Model Compression. In Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett(Eds.). PMLR, 20852-20867."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Yuchen Liu Hao Xiong Zhongjun He Jiajun Zhang Hua Wu Haifeng Wang and Chengqing Zong. 2019. End-to-end speech translation with knowledge distillation. arXiv preprint arXiv:1904.08075(2019).","DOI":"10.21437\/Interspeech.2019-2582"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-852"},{"key":"e_1_3_2_2_32_1","volume-title":"International conference on machine learning. PMLR, 2408-2417","author":"Martens James","year":"2015","unstructured":"James Martens and Roger Grosse. 2015. Optimizing neural networks with Kronecker-factored approximate curvature. In International conference on machine learning. PMLR, 2408-2417."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401098"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1144"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02033"},{"key":"e_1_3_2_2_36_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_2_37_1","first-page":"1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research, Vol. 21, 140 (2020), 1-67. http:\/\/jmlr.org\/papers\/v21\/20-074.html","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_38_1","volume-title":"Antoine Chassang, Carlo Gatta, and Yoshua Bengio.","author":"Romero Adriana","year":"2014","unstructured":"Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. 2014. Fitnets: Hints for thin deep nets. arXiv preprint arXiv:1412.6550(2014)."},{"key":"e_1_3_2_2_39_1","unstructured":"Hao-Jun Michael Shi Tsung-Hsien Lee Shintaro Iwasaki Jose Gallego-Posada Zhijing Li Kaushik Rangadurai Dheevatsa Mudigere and Michael Rabbat. 2023. A distributed data-parallel pytorch implementation of the distributed shampoo optimizer for training neural networks at-scale. arXiv preprint arXiv:2309.06497(2023)."},{"key":"e_1_3_2_2_40_1","volume-title":"QUILL: Query Intent with Large Language Models using Retrieval Augmentation and Multi-stage Distillation. arxiv:2210.15718 [cs.CL] https:\/\/arxiv.org\/abs\/2210.15718","author":"Srinivasan Krishna","year":"2022","unstructured":"Krishna Srinivasan, Karthik Raman, Anupam Samanta, Lingrui Liao, Luca Bertelli, and Mike Bendersky. 2022. QUILL: Query Intent with Large Language Models using Retrieval Augmentation and Multi-stage Distillation. arxiv:2210.15718 [cs.CL] https:\/\/arxiv.org\/abs\/2210.15718"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2016.7900006"},{"key":"e_1_3_2_2_42_1","volume-title":"International conference on machine learning. PMLR, 10347-10357","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2021. Training data-efficient image transformers & distillation through attention. In International conference on machine learning. PMLR, 10347-10357."},{"key":"e_1_3_2_2_43_1","unstructured":"Karen Ullrich Edward Meeds and Max Welling. 2017. Soft weight-sharing for neural network compression. arXiv preprint arXiv:1702.04008(2017)."},{"key":"e_1_3_2_2_44_1","unstructured":"Mojtaba Valipour Mehdi Rezagholizadeh Hossein Rajabzadeh Marzieh Tahaei Boxing Chen and Ali Ghodsi. 2023. Sortednet a place for every network and every network in its place: Towards a generalized solution for training many-in-one neural networks. arXiv preprint arXiv:2309.00255(2023)."},{"key":"e_1_3_2_2_45_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett(Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_2_46_1","volume-title":"SOAP: Improving and stabilizing shampoo using Adam. arXiv preprint arXiv:2409.11321(2024).","author":"Vyas Nikhil","year":"2024","unstructured":"Nikhil Vyas, Depen Morwani, Rosie Zhao, Itai Shapira, David Brandfonbrener, Lucas Janson, and Sham Kakade. 2024. SOAP: Improving and stabilizing shampoo using Adam. arXiv preprint arXiv:2409.11321(2024)."},{"key":"e_1_3_2_2_47_1","first-page":"13461","volume-title":"Deploy Anywhere: Matryoshka Representation Learning for Multimodal Recommendation. In Findings of the Association for Computational Linguistics: EMNLP","author":"Wang Yueqi","year":"2024","unstructured":"Yueqi et al. Wang. 2024. Train Once, Deploy Anywhere: Matryoshka Representation Learning for Multimodal Recommendation. In Findings of the Association for Computational Linguistics: EMNLP 2024. Association for Computational Linguistics, 13461-13472."},{"key":"e_1_3_2_2_48_1","unstructured":"Chuhan Wu Fangzhao Wu and Yongfeng Huang. 2021. One teacher is enough? pre-trained language model distillation from multiple teachers. arXiv preprint arXiv:2106.01023(2021)."},{"key":"e_1_3_2_2_49_1","first-page":"53201","article-title":"Does graph distillation see like vision dataset counterpart","volume":"36","author":"Yang Beining","year":"2023","unstructured":"Beining Yang, Kai Wang, Qingyun Sun, Cheng Ji, Xingcheng Fu, Hao Tang, Yang You, and Jianxin Li. 2023. Does graph distillation see like vision dataset counterpart? Advances in Neural Information Processing Systems, Vol. 36 (2023), 53201-53226.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_50_1","unstructured":"Jiahui Yu Wei Han Anmol Gulati Chung-Cheng Chiu Bo Li Tara N Sainath Yonghui Wu and Ruoming Pang. 2020. Dual-mode ASR: Unify and improve streaming ASR with full-context modeling. arXiv preprint arXiv:2010.06030(2020)."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401156"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00381"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00454"},{"key":"e_1_3_2_2_55_1","first-page":"2299","volume-title":"AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models. In Findings of the Association for Computational Linguistics: NAACL","author":"al Zhong Wanjun","year":"2024","unstructured":"Wanjun et al Zhong. 2024. AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models. In Findings of the Association for Computational Linguistics: NAACL 2024. Association for Computational Linguistics, 2299-2314. endthebibl"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737245","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T18:12:16Z","timestamp":1777572736000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737245"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":55,"alternative-id":["10.1145\/3711896.3737245","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737245","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}