{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,27]],"date-time":"2025-07-27T07:46:58Z","timestamp":1753602418707,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,8,4]],"date-time":"2023-08-04T00:00:00Z","timestamp":1691107200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62222215"],"award-info":[{"award-number":["62222215"]}]},{"name":"Beijing Natural Science Foundation","award":["4222027"],"award-info":[{"award-number":["4222027"]}]},{"name":"Beijing Outstanding Young Scientist Program","award":["BJJWZYJH012019100020098"],"award-info":[{"award-number":["BJJWZYJH012019100020098"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,8,6]]},"DOI":"10.1145\/3580305.3599850","type":"proceedings-article","created":{"date-parts":[[2023,8,4]],"date-time":"2023-08-04T18:10:58Z","timestamp":1691172658000},"page":"5660-5672","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["JiuZhang 2.0: A Unified Chinese Pre-trained Language Model for Multi-task Mathematical Problem Solving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8333-6196","authenticated-orcid":false,"given":"Xin","family":"Zhao","sequence":"first","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0650-9521","authenticated-orcid":false,"given":"Kun","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5607-4346","authenticated-orcid":false,"given":"Beichen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6954-8208","authenticated-orcid":false,"given":"Zheng","family":"Gong","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4875-5465","authenticated-orcid":false,"given":"Zhipeng","family":"Chen","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7299-4815","authenticated-orcid":false,"given":"Yuanhang","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9777-9676","authenticated-orcid":false,"given":"Ji-Rong","family":"Wen","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5295-3857","authenticated-orcid":false,"given":"Jing","family":"Sha","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, State Key Laboratory of Cognitive Intelligence, Hefei, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9202-7678","authenticated-orcid":false,"given":"Shijin","family":"Wang","sequence":"additional","affiliation":[{"name":"iFLYTEK AI Research (Central China), iFLYTEK Research, State Key Laboratory of Cognitive Intelligence, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0328-423X","authenticated-orcid":false,"given":"Cong","family":"Liu","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1038-0336","authenticated-orcid":false,"given":"Guoping","family":"Hu","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2023,8,4]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Muppet: Massive Multi-task Representations with Pre-Finetuning. ArXiv","author":"Aghajanyan Armen","year":"2021","unstructured":"Armen Aghajanyan , Anchit Gupta , Akshat Shrivastava , Xilun Chen , Luke Zettlemoyer , and Sonal Gupta . 2021 . Muppet: Massive Multi-task Representations with Pre-Finetuning. ArXiv , Vol. abs\/ 2101 .11038 (2021). Armen Aghajanyan, Anchit Gupta, Akshat Shrivastava, Xilun Chen, Luke Zettlemoyer, and Sonal Gupta. 2021. Muppet: Massive Multi-task Representations with Pre-Finetuning. ArXiv, Vol. abs\/2101.11038 (2021)."},{"key":"e_1_3_2_2_2_1","volume-title":"ICLR","author":"Bahdanau Dzmitry","year":"2015","unstructured":"Dzmitry Bahdanau , Kyunghyun Cho , and Yoshua Bengio . 2015 . Neural Machine Translation by Jointly Learning to Align and Translate . In ICLR 2015. Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2015. Neural Machine Translation by Jointly Learning to Align and Translate. In ICLR 2015."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_3_1","DOI":"10.1007\/978-3-031-23599-3_31"},{"unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell etal 2020. Language models are few-shot learners. NeurIPS (2020). Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. NeurIPS (2020).","key":"e_1_3_2_2_4_1"},{"key":"e_1_3_2_2_5_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.","author":"Chen Mark","year":"2021","unstructured":"Mark Chen , Jerry Tworek , Heewoo Jun , Qiming Yuan , Henrique Ponde de Oliveira Pinto , Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. 2021 . Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021). Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)."},{"key":"e_1_3_2_2_6_1","volume-title":"Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. arXiv preprint arXiv:2211.12588","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen , Xueguang Ma , Xinyi Wang , and William W Cohen . 2022. Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. arXiv preprint arXiv:2211.12588 ( 2022 ). Wenhu Chen, Xueguang Ma, Xinyi Wang, and William W Cohen. 2022. 
Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. arXiv preprint arXiv:2211.12588 (2022)."},{"doi-asserted-by":"crossref","unstructured":"Ting-Rui Chiang and Yun-Nung Chen. 2019. Semantically-Aligned Equation Generation for Solving and Reasoning Math Word Problems. In NAACL. Ting-Rui Chiang and Yun-Nung Chen. 2019. Semantically-Aligned Equation Generation for Solving and Reasoning Math Word Problems. In NAACL.","key":"e_1_3_2_2_7_1","DOI":"10.18653\/v1\/N19-1272"},{"unstructured":"Karl Cobbe Vineet Kosaraju Mohammad Bavarian Mark Chen Heewoo Jun Lukasz Kaiser Matthias Plappert Jerry Tworek Jacob Hilton Reiichiro Nakano etal 2021. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021). Karl Cobbe Vineet Kosaraju Mohammad Bavarian Mark Chen Heewoo Jun Lukasz Kaiser Matthias Plappert Jerry Tworek Jacob Hilton Reiichiro Nakano et al. 2021. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021).","key":"e_1_3_2_2_8_1"},{"key":"e_1_3_2_2_9_1","volume-title":"Pre-training with whole word masking for chinese bert. TASLP","author":"Cui Yiming","year":"2021","unstructured":"Yiming Cui , Wanxiang Che , Ting Liu , Bing Qin , and Ziqing Yang . 2021. Pre-training with whole word masking for chinese bert. TASLP ( 2021 ). Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, and Ziqing Yang. 2021. Pre-training with whole word masking for chinese bert. TASLP (2021)."},{"key":"e_1_3_2_2_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2018 . Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018). Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_11_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2019 . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL. Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL."},{"key":"e_1_3_2_2_12_1","volume-title":"Unified Language Model Pre-training for Natural Language Understanding and Generation. ArXiv","author":"Dong Li","year":"2019","unstructured":"Li Dong , Nan Yang , Wenhui Wang , Furu Wei , Xiaodong Liu , Yu Wang , Jianfeng Gao , M. Zhou , and Hsiao-Wuen Hon . 2019. Unified Language Model Pre-training for Natural Language Understanding and Generation. ArXiv , Vol. abs\/ 1905 .03197 ( 2019 ). Li Dong, Nan Yang, Wenhui Wang, Furu Wei, Xiaodong Liu, Yu Wang, Jianfeng Gao, M. Zhou, and Hsiao-Wuen Hon. 2019. Unified Language Model Pre-training for Natural Language Understanding and Generation. ArXiv, Vol. abs\/1905.03197 (2019)."},{"unstructured":"Iddo Drori Sunny Tran Roman Wang Newman Cheng Kevin Liu Leonard Tang Elizabeth Ke Nikhil Singh Taylor L Patti Jayson Lynch etal 2021. 
{"unstructured":"Iddo Drori, Sunny Tran, Roman Wang, Newman Cheng, Kevin Liu, Leonard Tang, Elizabeth Ke, Nikhil Singh, Taylor L Patti, Jayson Lynch, et al. 2021. A neural network solves and generates mathematics problems by program synthesis: Calculus, differential equations, linear algebra, and more. arXiv preprint arXiv:2112.15594 (2021).","key":"e_1_3_2_2_13_1"},{"key":"e_1_3_2_2_14_1","volume-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. arXiv preprint arXiv:2101.03961","author":"Fedus William","year":"2021","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2021. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. arXiv preprint arXiv:2101.03961 (2021)."},{"key":"e_1_3_2_2_15_1","volume-title":"Complexity-Based Prompting for Multi-Step Reasoning. ArXiv","author":"Fu Yao","year":"2022","unstructured":"Yao Fu, Hao-Chun Peng, Ashish Sabharwal, Peter Clark, and Tushar Khot. 2022. Complexity-Based Prompting for Multi-Step Reasoning. ArXiv, Vol. abs\/2210.00720 (2022)."},{"key":"e_1_3_2_2_16_1","volume-title":"PAL: Program-aided Language Models. arXiv preprint arXiv:2211.10435","author":"Gao Luyu","year":"2022","unstructured":"Luyu Gao, Aman Madaan, Shuyan Zhou, Uri Alon, Pengfei Liu, Yiming Yang, Jamie Callan, and Graham Neubig. 2022. PAL: Program-aided Language Models. arXiv preprint arXiv:2211.10435 (2022)."},{"key":"e_1_3_2_2_17_1","volume-title":"SimCSE: Simple Contrastive Learning of Sentence Embeddings. ArXiv","author":"Gao Tianyu","year":"2021","unstructured":"Tianyu Gao, Xingcheng Yao, and Danqi Chen. 2021. SimCSE: Simple Contrastive Learning of Sentence Embeddings. ArXiv, Vol. abs\/2104.08821 (2021)."},{"doi-asserted-by":"crossref","unstructured":"Zheng Gong, Kun Zhou, Xin Zhao, Jing Sha, Shijin Wang, and Ji-Rong Wen. 2022. Continual Pre-training of Language Models for Math Problem Understanding with Syntax-Aware Memory Network. In ACL. 5923--5933.","key":"e_1_3_2_2_18_1","DOI":"10.18653\/v1\/2022.acl-long.408"},{"key":"e_1_3_2_2_19_1","volume-title":"Sparsely Activated Mixture-of-Experts are Robust Multi-Task Learners. ArXiv","author":"Gupta Shashank","year":"2022","unstructured":"Shashank Gupta, Subhabrata Mukherjee, Krishan Subudhi, Eduardo Gonzalez, Damien Jose, Ahmed Hassan Awadallah, and Jianfeng Gao. 2022. Sparsely Activated Mixture-of-Experts are Robust Multi-Task Learners. ArXiv, Vol. abs\/2204.07689 (2022)."},{"key":"e_1_3_2_2_20_1","volume-title":"Don't Stop Pretraining: Adapt Language Models to Domains and Tasks. In ACL.","author":"Gururangan Suchin","year":"2020","unstructured":"Suchin Gururangan, Ana Marasovi\u0107, Swabha Swayamdipta, Kyle Lo, Iz Beltagy, Doug Downey, and Noah A. Smith. 2020. Don't Stop Pretraining: Adapt Language Models to Domains and Tasks. In ACL."},{"key":"e_1_3_2_2_21_1","volume-title":"Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:2103.03874","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. 2021a. Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:2103.03874 (2021)."},{"key":"e_1_3_2_2_22_1","volume-title":"Measuring Mathematical Problem Solving With the MATH Dataset. ArXiv","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Xiaodong Song, and Jacob Steinhardt. 2021b. Measuring Mathematical Problem Solving With the MATH Dataset. ArXiv, Vol. abs\/2103.03874 (2021)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_23_1","DOI":"10.1162\/neco.1991.3.1.79"},{"doi-asserted-by":"crossref","unstructured":"Yoon Kim. 2014. Convolutional Neural Networks for Sentence Classification. In EMNLP.","key":"e_1_3_2_2_24_1","DOI":"10.3115\/v1\/D14-1181"},{"key":"e_1_3_2_2_25_1","volume-title":"Large Language Models are Zero-Shot Reasoners. ArXiv","author":"Kojima Takeshi","year":"2022","unstructured":"Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. 2022. Large Language Models are Zero-Shot Reasoners. ArXiv, Vol. abs\/2205.11916 (2022)."},
{"key":"e_1_3_2_2_26_1","volume-title":"Beyond Distillation: Task-level Mixture-of-Experts for Efficient Inference. In Conference on Empirical Methods in Natural Language Processing.","author":"Kudugunta Sneha","year":"2021","unstructured":"Sneha Kudugunta, Yanping Huang, Ankur Bapna, Maxim Krikun, Dmitry Lepikhin, Minh-Thang Luong, and Orhan Firat. 2021. Beyond Distillation: Task-level Mixture-of-Experts for Efficient Inference. In Conference on Empirical Methods in Natural Language Processing."},{"doi-asserted-by":"crossref","unstructured":"Siwei Lai, Liheng Xu, Kang Liu, and Jun Zhao. 2015. Recurrent convolutional neural networks for text classification. In AAAI.","key":"e_1_3_2_2_27_1","DOI":"10.1609\/aaai.v29i1.9513"},{"key":"e_1_3_2_2_28_1","volume-title":"MWPToolkit: An Open-Source Framework for Deep Learning-Based Math Word Problem Solvers. arXiv preprint arXiv:2109.00799","author":"Lan Yihuai","year":"2021","unstructured":"Yihuai Lan, Lei Wang, Qiyuan Zhang, Yunshi Lan, Bing Tian Dai, Yan Wang, Dongxiang Zhang, and Ee-Peng Lim. 2021. MWPToolkit: An Open-Source Framework for Deep Learning-Based Math Word Problem Solvers. arXiv preprint arXiv:2109.00799 (2021)."},{"key":"e_1_3_2_2_29_1","volume-title":"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL.","author":"Lewis Mike","year":"2020","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. 2020. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL."},{"key":"e_1_3_2_2_30_1","volume-title":"Solving Quantitative Reasoning Problems with Language Models. ArXiv","author":"Lewkowycz Aitor","year":"2022","unstructured":"Aitor Lewkowycz, Anders Andreassen, David Dohan, Ethan Dyer, Henryk Michalewski, Vinay Venkatesh Ramasesh, Ambrose Slone, Cem Anil, Imanol Schlag, Theo Gutman-Solo, Yuhuai Wu, Behnam Neyshabur, Guy Gur-Ari, and Vedant Misra. 2022. Solving Quantitative Reasoning Problems with Language Models. ArXiv, Vol. abs\/2206.14858 (2022)."},{"key":"e_1_3_2_2_31_1","volume-title":"Modeling intra-relation in math word problems with different functional multi-head attentions. In ACL.","author":"Li Jierui","year":"2019","unstructured":"Jierui Li, Lei Wang, Jipeng Zhang, Yan Wang, Bing Tian Dai, and Dongxiang Zhang. 2019. Modeling intra-relation in math word problems with different functional multi-head attentions. In ACL."},{"key":"e_1_3_2_2_32_1","volume-title":"On the advance of making language models better reasoners. arXiv preprint arXiv:2206.02336","author":"Li Yifei","year":"2022","unstructured":"Yifei Li, Zeqi Lin, Shizhuo Zhang, Qiang Fu, Bei Chen, Jian-Guang Lou, and Weizhu Chen. 2022. On the advance of making language models better reasoners. arXiv preprint arXiv:2206.02336 (2022)."},{"key":"e_1_3_2_2_33_1","volume-title":"Workshop on Knowledge Extraction and Integration for Deep Learning Architectures; Deep Learning Inside Out.","author":"Liu Jiachang","year":"2021","unstructured":"Jiachang Liu, Dinghan Shen, Yizhe Zhang, Bill Dolan, Lawrence Carin, and Weizhu Chen. 2021. What Makes Good In-Context Examples for GPT-3? In Workshop on Knowledge Extraction and Integration for Deep Learning Architectures; Deep Learning Inside Out."},{"key":"e_1_3_2_2_34_1","volume-title":"Multi-Task Deep Neural Networks for Natural Language Understanding. In Annual Meeting of the Association for Computational Linguistics.","author":"Liu Xiaodong","year":"2019","unstructured":"Xiaodong Liu, Pengcheng He, Weizhu Chen, and Jianfeng Gao. 2019. Multi-Task Deep Neural Networks for Natural Language Understanding. In Annual Meeting of the Association for Computational Linguistics."},{"unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In ICLR.","key":"e_1_3_2_2_35_1"},{"key":"e_1_3_2_2_36_1","volume-title":"A Survey of Deep Learning for Mathematical Reasoning. ArXiv","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Liang Qiu, Wenhao Yu, Sean Welleck, and Kai-Wei Chang. 2022. A Survey of Deep Learning for Mathematical Reasoning. ArXiv, Vol. abs\/2212.10535 (2022)."},{"key":"e_1_3_2_2_37_1","volume-title":"A Survey in Mathematical Language Processing. ArXiv","author":"Meadows Jordan","year":"2022","unstructured":"Jordan Meadows and Andr\u00e9 Freitas. 2022. A Survey in Mathematical Language Processing. ArXiv, Vol. abs\/2205.15231 (2022)."},
{"key":"e_1_3_2_2_38_1","volume-title":"Lila: A Unified Benchmark for Mathematical Reasoning. ArXiv","author":"Mishra Swaroop","year":"2022","unstructured":"Swaroop Mishra, Matthew Finlayson, Pan Lu, Leonard Tang, Sean Welleck, Chitta Baral, Tanmay Rajpurohit, Oyvind Tafjord, Ashish Sabharwal, Peter Clark, and A. Kalyan. 2022. Lila: A Unified Benchmark for Mathematical Reasoning. ArXiv, Vol. abs\/2210.17517 (2022)."},{"key":"e_1_3_2_2_39_1","volume-title":"MathBERT: A Pre-Trained Model for Mathematical Formula Understanding. arXiv preprint arXiv:2105.00377","author":"Peng Shuai","year":"2021","unstructured":"Shuai Peng, Ke Yuan, Liangcai Gao, and Zhi Tang. 2021. MathBERT: A Pre-Trained Model for Mathematical Formula Understanding. arXiv preprint arXiv:2105.00377 (2021)."},{"key":"e_1_3_2_2_40_1","volume-title":"Generative language modeling for automated theorem proving. arXiv preprint arXiv:2009.03393","author":"Polu Stanislas","year":"2020","unstructured":"Stanislas Polu and Ilya Sutskever. 2020. Generative language modeling for automated theorem proving. arXiv preprint arXiv:2009.03393 (2020)."},{"key":"e_1_3_2_2_41_1","volume-title":"Combining Modular Skills in Multitask Learning. ArXiv","author":"Ponti E.","year":"2022","unstructured":"E. Ponti, Alessandro Sordoni, and Siva Reddy. 2022. Combining Modular Skills in Multitask Learning. ArXiv, Vol. abs\/2202.13914 (2022)."},{"unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et al. 2019. Language models are unsupervised multitask learners. OpenAI blog, Vol. 1, 8 (2019), 9.","key":"e_1_3_2_2_42_1"},{"key":"e_1_3_2_2_43_1","volume-title":"CPT: A Pre-Trained Unbalanced Transformer for Both Chinese Language Understanding and Generation. arXiv preprint arXiv:2109.05729","author":"Shao Yunfan","year":"2021","unstructured":"Yunfan Shao, Zhichao Geng, Yitao Liu, Junqi Dai, Fei Yang, Li Zhe, Hujun Bao, and Xipeng Qiu. 2021. CPT: A Pre-Trained Unbalanced Transformer for Both Chinese Language Understanding and Generation. arXiv preprint arXiv:2109.05729 (2021)."},{"key":"e_1_3_2_2_44_1","volume-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. ArXiv","author":"Shazeer Noam M.","year":"2017","unstructured":"Noam M. Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc V. Le, Geoffrey E. Hinton, and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. ArXiv, Vol. abs\/1701.06538 (2017)."},{"doi-asserted-by":"crossref","unstructured":"Shuming Shi, Yuehui Wang, Chin-Yew Lin, Xiaojiang Liu, and Yong Rui. 2015. Automatically solving number word problems by semantic parsing and reasoning. In EMNLP. 1132--1142.","key":"e_1_3_2_2_45_1","DOI":"10.18653\/v1\/D15-1135"},{"key":"e_1_3_2_2_46_1","volume-title":"Why are NLP Models Fumbling at Elementary Math? A Survey of Deep Learning based Word Problem Solvers. ArXiv","author":"Sundaram Sowmya S.","year":"2022","unstructured":"Sowmya S. Sundaram, Sairam Gurajada, Marco Fisichella, Deepak P, and Savitha Sam Abraham. 2022. Why are NLP Models Fumbling at Elementary Math? A Survey of Deep Learning based Word Problem Solvers. ArXiv, Vol. abs\/2205.15683 (2022)."},{"unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998--6008.","key":"e_1_3_2_2_47_1"},{"key":"e_1_3_2_2_48_1","volume-title":"Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:2203.11171","author":"Wang Xuezhi","year":"2022","unstructured":"Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, and Denny Zhou. 2022. Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:2203.11171 (2022)."},{"key":"e_1_3_2_2_49_1","volume-title":"Chain of Thought Prompting Elicits Reasoning in Large Language Models. ArXiv","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Huai hsin Chi, Quoc Le, and Denny Zhou. 2022. Chain of Thought Prompting Elicits Reasoning in Large Language Models. ArXiv, Vol. abs\/2201.11903 (2022)."},{"unstructured":"Qinyuan Ye, Juan Zha, and Xiang Ren. 2022. Eliciting and Understanding Cross-Task Skills with Task-Level Mixture-of-Experts.","key":"e_1_3_2_2_50_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_51_1","DOI":"10.1007\/s10032-011-0174-4"},{"key":"e_1_3_2_2_52_1","volume-title":"Automatic Chain of Thought Prompting in Large Language Models. ArXiv","author":"Zhang Zhuosheng","year":"2022","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, and Alexander J. Smola. 2022. Automatic Chain of Thought Prompting in Large Language Models. ArXiv, Vol. abs\/2210.03493 (2022)."},
{"key":"e_1_3_2_2_53_1","volume-title":"Mengzi: Towards Lightweight yet Ingenious Pre-trained Models for Chinese. arXiv preprint arXiv:2110.06696","author":"Zhang Zhuosheng","year":"2021","unstructured":"Zhuosheng Zhang, Hanqing Zhang, Keming Chen, Yuhang Guo, Jingyun Hua, Yulong Wang, and Ming Zhou. 2021. Mengzi: Towards Lightweight yet Ingenious Pre-trained Models for Chinese. arXiv preprint arXiv:2110.06696 (2021)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_54_1","DOI":"10.1145\/3534678.3539131"},{"key":"e_1_3_2_2_55_1","volume-title":"A Survey of Large Language Models. arXiv preprint arXiv:2303.18223","author":"Zhao Wayne Xin","year":"2023","unstructured":"Wayne Xin Zhao, Kun Zhou, Junyi Li, Tianyi Tang, Xiaolei Wang, Yupeng Hou, Yingqian Min, Beichen Zhang, Junjie Zhang, Zican Dong, Yifan Du, Chen Yang, Yushuo Chen, Zhipeng Chen, Jinhao Jiang, Ruiyang Ren, Yifan Li, Xinyu Tang, Zikang Liu, Peiyu Liu, Jian-Yun Nie, and Ji-Rong Wen. 2023. A Survey of Large Language Models. arXiv preprint arXiv:2303.18223 (2023). http:\/\/arxiv.org\/abs\/2303.18223"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_56_1","DOI":"10.1145\/3404835.3462794"},{"key":"e_1_3_2_2_57_1","volume-title":"Least-to-most prompting enables complex reasoning in large language models. arXiv preprint arXiv:2205.10625","author":"Zhou Denny","year":"2022","unstructured":"Denny Zhou, Nathanael Sch\u00e4rli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Olivier Bousquet, Quoc Le, and Ed Chi. 2022. Least-to-most prompting enables complex reasoning in large language models. arXiv preprint arXiv:2205.10625 (2022)."},{"key":"e_1_3_2_2_58_1","volume-title":"Solving Math Word Problem via Cooperative Reasoning induced Language Models. ArXiv","author":"Zhu Xinyu","year":"2022","unstructured":"Xinyu Zhu, Junjie Wang, Lin Zhang, Yuxiang Zhang, Ruyi Gan, Jiaxing Zhang, and Yujiu Yang. 2022. Solving Math Word Problem via Cooperative Reasoning induced Language Models. ArXiv, Vol. abs\/2210.16257 (2022)."},{"unstructured":"Barret Zoph, Irwan Bello, Sameer Kumar, Nan Du, Yanping Huang, Jeff Dean, Noam M. Shazeer, and William Fedus. 2022. ST-MoE: Designing Stable and Transferable Sparse Expert Models.","key":"e_1_3_2_2_59_1"}],"event":{"sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"acronym":"KDD '23","name":"KDD '23: The 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Long Beach CA USA"},"container-title":["Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3580305.3599850","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3580305.3599850","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:23Z","timestamp":1750182563000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3580305.3599850"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,4]]},"references-count":59,"alternative-id":["10.1145\/3580305.3599850","10.1145\/3580305"],"URL":"https:\/\/doi.org\/10.1145\/3580305.3599850","relation":{},"subject":[],"published":{"date-parts":[[2023,8,4]]},"assertion":[{"value":"2023-08-04","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}