{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T21:07:43Z","timestamp":1774127263114,"version":"3.50.1"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1007\/s10994-024-06681-1","type":"journal-article","created":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T13:10:19Z","timestamp":1737119419000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["ArithmeticGPT: empowering small-size large language models with advanced arithmetic skills"],"prefix":"10.1007","volume":"114","author":[{"given":"Zitao","family":"Liu","sequence":"first","affiliation":[]},{"given":"Ying","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Zhibo","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Jiahao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Tianqiao","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Mi","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Weiqi","family":"Luo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,17]]},"reference":[{"key":"6681_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., 
Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al. (2023). GPT-4 technical report. arXiv preprint. arXiv:2303.08774"},{"key":"6681_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.tine.2017.05.001","volume":"7","author":"S Ashkenazi","year":"2017","unstructured":"Ashkenazi, S., & Danan, Y. (2017). The role of mathematical anxiety and working memory on the performance of different types of arithmetic tasks. Trends in Neuroscience and Education, 7, 1\u201310.","journal-title":"Trends in Neuroscience and Education"},{"key":"6681_CR3","unstructured":"Azerbayev, Z., Schoelkopf, H., Paster, K., Dos\u00a0Santos, M., McAleer, S., Jiang, A., Deng, J., Biderman, S., & Welleck, S. (2023). Llemma: An open language model for mathematics. In The 3rd Workshop on Mathematical Reasoning and AI at NeurIPS\u201923,"},{"key":"6681_CR4","unstructured":"Bai, J., Bai, S., Chu, Y., Cui, Z., Dang, K., Deng, X., Fan, Y., Ge, W., Han, Y., Huang, F. et\u00a0al. (2023). Qwen technical report. arXiv preprint arXiv:2309.16609"},{"key":"6681_CR5","doi-asserted-by":"crossref","unstructured":"Bengio, Y., Louradour, J., Collobert, R., & Weston, J. (2009). Curriculum learning. In Proceedings of the 26th Annual International Conference on Machine Learning (pp. 41\u201348).","DOI":"10.1145\/1553374.1553380"},{"key":"6681_CR6","unstructured":"Cai, T., Wang, X., Ma, T., Chen, X., & Zhou, D. (2023). Large language models as tool makers. In The Twelfth International Conference on Learning Representations."},{"key":"6681_CR7","unstructured":"Chen, W., Ma, X., Wang, X., & Cohen, W. W. (2022). Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. Transactions on Machine Learning Research."},{"key":"6681_CR8","unstructured":"Cobbe, K., Kosaraju, V., Bavarian, M., Chen, M., Jun, H., Kaiser, L., Plappert, M., Tworek, J., Hilton, J., Nakano, R., et\u00a0al. (2021). 
Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168"},{"key":"6681_CR9","unstructured":"Creswell, A., Shanahan, M., & Higgins, I. (2022). Selection-Inference: Exploiting large language models for interpretable logical reasoning. In The Eleventh International Conference on Learning Representations"},{"issue":"11","key":"6681_CR10","first-page":"688","volume":"2","author":"D Demszky","year":"2023","unstructured":"Demszky, D., Yang, D., Yeager, D. S., Bryan, C. J., Clapper, M., Chandhok, S., Eichstaedt, J. C., Hecht, C., Jamieson, J., Johnson, M., et al. (2023). Using large language models in psychology. Nature Reviews Psychology, 2(11), 688\u2013701.","journal-title":"Nature Reviews Psychology"},{"key":"6681_CR11","doi-asserted-by":"crossref","unstructured":"Du, Z., Qian, Y., Liu, X., Ding, M., Qiu, J., Yang, Z., & Tang, J. (2022). Glm: General language model pretraining with autoregressive blank infilling. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 320\u2013335).","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"6681_CR12","unstructured":"Gao, L., Madaan, A., Zhou, S., Alon, U., Liu, P., Yang, Y., Callan, J., & Neubig, G. (2023). PAL: Program-aided language models. In International Conference on Machine Learning (pp. 10764\u201310799). PMLR."},{"key":"6681_CR13","doi-asserted-by":"crossref","unstructured":"Garcez, A., & Lamb, L.\u00a0C. (2023). Neurosymbolic AI: The 3rd wave. Artificial Intelligence Review (pp. 1\u201320).","DOI":"10.1007\/s10462-023-10448-w"},{"key":"6681_CR14","doi-asserted-by":"crossref","unstructured":"Ghosal, D., Majumder, N., Mehrish, A., & Poria, S. (2023). Text-to-audio generation using instruction-tuned llm and latent diffusion model. arXiv preprint arXiv:2304.13731","DOI":"10.1145\/3581783.3612348"},{"key":"6681_CR15","unstructured":"Gou, Z., Shao, Z., Gong, Y., Yang, Y., Huang, M., Duan, N., & Chen, W. 
ToRA: A tool-integrated reasoning agent for mathematical problem solving. In The Twelfth International Conference on Learning Representations."},{"key":"6681_CR16","unstructured":"Hendy, A., Abdelrehim, M., Sharaf, A., Raunak, V., Gabr, M., Matsushita, H., ... & Awadalla, H. H. (2023). How good are GPT models at machine translation? a comprehensive evaluation. arXiv preprint arXiv:2302.09210."},{"key":"6681_CR17","unstructured":"InternLM Team. (2023). InternLM: A multilingual language model with progressively enhanced capabilities."},{"key":"6681_CR18","unstructured":"Khot, T., Trivedi, H., Finlayson, M., Fu, Y., Richardson, K., Clark, P., & Sabharwal, A. (2022). Decomposed prompting: A modular approach for solving complex tasks. In The 11th International Conference on Learning Representations."},{"issue":"2","key":"6681_CR19","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pdig.0000198","volume":"2","author":"TH Kung","year":"2023","unstructured":"Kung, T. H., Cheatham, M., Medenilla, A., Sillos, C., De Leon, L., Elepa\u00f1o, C., Madriaga, M., Aggabao, R., Diaz-Candido, G., Maningo, J., et al. (2023). Performance of chatgpt on usmle: Potential for ai-assisted medical education using large language models. PLoS Digital Health, 2(2), e0000198.","journal-title":"PLoS Digital Health"},{"key":"6681_CR20","doi-asserted-by":"crossref","unstructured":"Lee, S., & Kim, G. (2023, July). Recursion of thought: A divide-and-conquer approach to multi-context reasoning with language models. In The 61st Annual Meeting Of The Association For Computational Linguistics.","DOI":"10.18653\/v1\/2023.findings-acl.40"},{"key":"6681_CR21","first-page":"3843","volume":"35","author":"A Lewkowycz","year":"2022","unstructured":"Lewkowycz, A., Andreassen, A., Dohan, D., Dyer, E., Michalewski, H., Ramasesh, V., Slone, A., Anil, C., Schlag, I., Gutman-Solo, T., et al. (2022). Solving quantitative reasoning problems with language models. 
Advances in Neural Information Processing Systems, 35, 3843\u20133857.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"6681_CR22","doi-asserted-by":"crossref","unstructured":"Li, Y., Lin, Z., Zhang, S., Fu, Q., Chen, B., Lou, J.-G., & Chen, W. (2023). Making language models better reasoners with step-aware verifier. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 5315\u20135333).","DOI":"10.18653\/v1\/2023.acl-long.291"},{"key":"6681_CR23","unstructured":"Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., ... & Cobbe, K. (2023). Let\u2019s verify step by step. In The Twelfth International Conference on Learning Representations."},{"key":"6681_CR24","doi-asserted-by":"crossref","unstructured":"Lin, X.\u00a0V., Mihaylov, T., Artetxe, M., Wang, T., Chen, S., Simig, D., Ott, M., Goyal, N., Bhosale, S., Du, J., et\u00a0al. (2021). Few-shot learning with multilingual language models. arXiv preprint arXiv:2112.10668.","DOI":"10.18653\/v1\/2022.emnlp-main.616"},{"key":"6681_CR25","unstructured":"Liu, T. & Low, B. K. H. (2023). GOAT: Fine-tuned llama outperforms GPT-4 on arithmetic tasks. arXiv preprint arXiv:2305.14201."},{"key":"6681_CR26","unstructured":"Liu, W., Hu, H., Zhou, J., Ding, Y., Li, J., Zeng, J., He, M., Chen, Q., Jiang, B., Zhou, A., et al. (2023). Mathematical Language Models: A survey. arXiv preprint arXiv:2312.07622."},{"key":"6681_CR27","unstructured":"Luo, H., Sun, Q., Xu, C., Zhao, P., Lou, J., Tao, C., Geng, X., Lin, Q., Chen, S., & Zhang, D. (2023) WizardMath: Empowering mathematical reasoning for large language models via reinforced evol-instruct. arXiv preprint arXiv:2308.09583"},{"key":"6681_CR28","unstructured":"Madaan, A., Tandon, N., Gupta, P., Hallinan, S., Gao, L., Wiegreffe, S., ... & Clark, P. Self-refine: Iterative refinement with self-feedback. 
In Thirty-seventh Conference on Neural Information Processing Systems."},{"issue":"4","key":"6681_CR29","doi-asserted-by":"publisher","first-page":"1093","DOI":"10.1016\/j.asej.2014.04.011","volume":"5","author":"W Medhat","year":"2014","unstructured":"Medhat, W., Hassan, A., & Korashy, H. (2014). Sentiment analysis algorithms and applications: A survey. Ain Shams Engineering Journal, 5(4), 1093\u20131113.","journal-title":"Ain Shams Engineering Journal"},{"key":"6681_CR30","unstructured":"Nogueira, R., Jiang, Z., & Lin, J. (2021). Investigating the limitations of transformers with simple arithmetic tasks. arXiv preprint arXiv:2102.13019."},{"key":"6681_CR31","unstructured":"Schick, T., Dwivedi-Yu, J., Dess\u00ed, R., Raileanu, R., Lomeli, M., Hambro, E., ... & Scialom, T. (2023, December). Toolformer: language models can teach themselves to use tools. In Proceedings of the 37th International Conference on Neural Information Processing Systems (pp. 68539\u201368551)."},{"key":"6681_CR32","doi-asserted-by":"crossref","unstructured":"Shah, N. H., Entwistle, D., & Pfeffer, M.\u00a0A. (2023). Creation and adoption of large language models in medicine. JAMA: Journal of the American Medical Association, 330(9).","DOI":"10.1001\/jama.2023.14217"},{"key":"6681_CR33","unstructured":"Taori, R., Gulrajani, I., Zhang, T., Dubois, Y., Li, X., Guestrin, C., Liang, P. & Hashimoto, T. B. (2023). Stanford alpaca: An instruction-following llama model."},{"issue":"8","key":"6681_CR34","doi-asserted-by":"publisher","first-page":"1930","DOI":"10.1038\/s41591-023-02448-8","volume":"29","author":"AJ Thirunavukarasu","year":"2023","unstructured":"Thirunavukarasu, A. J., Ting, D. S. J., Elangovan, K., Gutierrez, L., Tan, T. F., & Ting, D. S. W. (2023). Large language models in medicine. 
Nature Medicine, 29(8), 1930\u20131940.","journal-title":"Nature Medicine"},{"key":"6681_CR35","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S. et\u00a0al. (2023). Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288."},{"key":"6681_CR36","unstructured":"Wang, K., Ren, H., Zhou, A., Lu, Z., Luo, S., Shi, W., ... & Li, H. MathCoder: Seamless code integration in LLMs for enhanced mathematical reasoning. In The Twelfth International Conference on Learning Representations."},{"key":"6681_CR37","unstructured":"Wang, S.\u00a0I., & Manning, C.\u00a0D. (2012). Baselines and bigrams: Simple, good sentiment and topic classification. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers) (pp. 90\u201394)."},{"issue":"9","key":"6681_CR38","first-page":"4555","volume":"44","author":"X Wang","year":"2021","unstructured":"Wang, X., Chen, Y., & Zhu, W. (2021). A survey on curriculum learning. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44(9), 4555\u20134576.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"6681_CR39","unstructured":"Wang, Z., Xia, R., & Liu, P. (2023b). Generative ai for math: Part i\u2013mathpile: A billion-token-scale pretraining corpus for math. arXiv preprint arXiv:2312.17120."},{"key":"6681_CR40","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E. H., Le, Q. V., Zhou, D., et al. (2022). Chain-of-Thought prompting elicits reasoning in large language models. 
Advances in Neural Information Processing Systems, 35, 24824\u201324837.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"6681_CR41","first-page":"2550","volume":"2023","author":"Y Weng","year":"2023","unstructured":"Weng, Y., Zhu, M., Xia, F., Li, B., He, S., Liu, S., Sun, B., Liu, K., & Zhao, J. (2023). Large language models are better reasoners with self-verification. In Findings of the Association for Computational Linguistics: EMNLP, 2023, 2550\u20132575.","journal-title":"In Findings of the Association for Computational Linguistics: EMNLP"},{"key":"6681_CR42","unstructured":"Wu, S., Irsoy, O., Lu, S., Dabravolski, V., Dredze, M., Gehrmann, S., ... & Mann, G. (2023). BloombergGPT: A large language model for finance (No. 2303.17564)."},{"key":"6681_CR43","unstructured":"Yang, A., Xiao, B., Wang, B., Zhang, B., Bian, C., Yin, C., Lv, C., Pan, D., Wang, D., Yan, D., et\u00a0al. (2023). Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305."},{"key":"6681_CR44","unstructured":"Yu, L., Jiang, W., Shi, H., Jincheng, Y. U., Liu, Z., Zhang, Y., ... & Liu, W (2023). MetaMath: Bootstrap your own mathematical questions for large language models. In The Twelfth International Conference on Learning Representations."},{"key":"6681_CR45","unstructured":"Yue, X., Qu, X., Zhang, G., Fu, Y., Huang, W., Sun, H., ... & Chen, W. (2023). MAmmoTH: Building Math Generalist Models through Hybrid Instruction Tuning. In The Twelfth International Conference on Learning Representations."},{"key":"6681_CR46","unstructured":"Zeng, A., Liu, X., Du, Z., Wang, Z., Lai, H., Ding, M., Yang, Z., Xu, Y., Zheng, W., Xia, X., et\u00a0al. (2022). GLM-130B: An open bilingual pre-trained model. In The 11th International Conference on Learning Representations."},{"key":"6681_CR52","doi-asserted-by":"crossref","unstructured":"Zhan, B., Guo, T., Li, X., Hou, M., Liang, Q., Gao, B., ... & Liu, Z. (2024, July). 
Knowledge tracing as language processing: A large-scale autoregressive paradigm. In International Conference on Artificial Intelligence in Education (pp. 177\u2013191).","DOI":"10.1007\/978-3-031-64302-6_13"},{"issue":"1","key":"6681_CR47","doi-asserted-by":"publisher","first-page":"22","DOI":"10.3390\/brainsci13010022","volume":"13","author":"Y Zhang","year":"2022","unstructured":"Zhang, Y., Tolmie, A., & Gordon, R. (2022). The relationship between working memory and arithmetic in primary school children: A meta-analysis. Brain Sciences, 13(1), 22.","journal-title":"Brain Sciences"},{"key":"6681_CR48","doi-asserted-by":"crossref","unstructured":"Zhao, J. X., Xie, Y., Kawaguchi, K., He, J., & Xie, M. Q. (2023). Automatic model selection with large language\nmodels for reasoning. In The 2023 Conference on Empirical Methods in Natural Language Processing.","DOI":"10.18653\/v1\/2023.findings-emnlp.55"},{"key":"6681_CR51","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Li, X., Huang, Y., Liang, Q., Guo, T., Hou, M., ... & Luo, W. (2024, July). Automatic lesson plan generation via large language models with self-critique prompting. In International Conference on Artificial Intelligence in Education (pp. 16\u2013178). Springer Nature Switzerland.","DOI":"10.1007\/978-3-031-64315-6_13"},{"key":"6681_CR49","unstructured":"Zhou, A., Wang, K., Lu, Z., Shi, W., Luo, S., Qin, Z., ... & Li, H. (2024, May). Solving challenging math word problems using GPT-4 code interpreter with code-based self-verification. In 12th International Conference on Learning Representations (ICLR 2024)."},{"key":"6681_CR50","unstructured":"Zhou, D., Sch\u00e4rli, N.,\u00a0Hou, L., Wei, J., Scales, N., Wang, X., Schuurmans, D., Cui, C., Bousquet, O., Le, Q.\u00a0V., et\u00a0al. (2022). Least-to-most prompting enables complex reasoning in large language models. 
In The 11th International Conference on Learning Representations."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-024-06681-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-024-06681-1","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-024-06681-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T01:02:59Z","timestamp":1768611779000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-024-06681-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1]]},"references-count":52,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,1]]}},"alternative-id":["6681"],"URL":"https:\/\/doi.org\/10.1007\/s10994-024-06681-1","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1]]},"assertion":[{"value":"30 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 August 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 December 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 January 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no 
competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}],"article-number":"24"}}