{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T19:28:47Z","timestamp":1773170927314,"version":"3.50.1"},"reference-count":86,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&#x0026;D Program of China","award":["2022YFA1008200"],"award-info":[{"award-number":["2022YFA1008200"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92270001"],"award-info":[{"award-number":["92270001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12571567"],"award-info":[{"award-number":["12571567"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12371511"],"award-info":[{"award-number":["12371511"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12422119"],"award-info":[{"award-number":["12422119"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003399","name":"Science and Technology Commission of Shanghai Municipality","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003399","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai","doi-asserted-by":"publisher","award":["025ZR1402280"],"award-info":[{"award-number":["025ZR1402280"]}],"id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]},{"name":"HPC of School of Mathematical Sciences and the Student Innovation Center"},{"name":"Center for High Performance Computing at Shanghai Jiao Tong University"},{"name":"Key Laboratory of Marine Intelligent Equipment and System, Ministry of Education, China"},{"name":"SJTU Kunpeng &#x0026; Ascend Center of Excellence"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1109\/tpami.2025.3646483","type":"journal-article","created":{"date-parts":[[2025,12,19]],"date-time":"2025-12-19T18:58:30Z","timestamp":1766170710000},"page":"4336-4349","source":"Crossref","is-referenced-by-count":0,"title":["Complexity Control Facilitates Reasoning-Based Compositional Generalization in Transformers"],"prefix":"10.1109","volume":"48","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-4202-8556","authenticated-orcid":false,"given":"Zhongwang","family":"Zhang","sequence":"first","affiliation":[{"name":"Institute of Natural Sciences, School of Mathematical Sciences, MOE-LSC, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pengxiao","family":"Lin","sequence":"additional","affiliation":[{"name":"Institute of Natural Sciences, School of Mathematical Sciences, MOE-LSC, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiwei","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Natural Sciences, School of Mathematical Sciences, MOE-LSC, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yaoyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Natural Sciences, School of Mathematical Sciences, MOE-LSC, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0627-3520","authenticated-orcid":false,"given":"Zhi-Qin","family":"John Xu","sequence":"additional","affiliation":[{"name":"Institute of Natural Sciences, School of Mathematical Sciences, MOE-LSC, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref2","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.4335905"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s12599-023-00795-x"},{"key":"ref6","article-title":"Sparks of artificial general intelligence: Early experiments with GPT-4","author":"Bubeck","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/1187.001.0001"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1002\/aaai.12065"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/0010-0277(88)90031-5"},{"key":"ref10","first-page":"9177","article-title":"Measuring compositional generalization: A comprehensive method on realistic data","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Keysers","year":"2020"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.397"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.11674"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.378"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.731"},{"key":"ref15","first-page":"95238","article-title":"Grokked transformers are implicit reasoners: A mechanistic journey to the edge of generalization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang","year":"2024"},{"key":"ref16","first-page":"14093","article-title":"Initialization is critical to whether transformers fit composite functions by reasoning or memorizing","volume-title":"Proc. 38th Annu. Conf. Neural Inf. Process. Syst.","author":"Zhang","year":"2024"},{"key":"ref17","article-title":"How does GPT obtain its ability? Tracing emergent abilities of language models to their sources","author":"Fu","year":"2022","journal-title":"Yao Fu\u2019s Notion"},{"key":"ref18","article-title":"Emergent abilities of large language models","author":"Wei","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref19","article-title":"Beyond the imitation game: Quantifying and extrapolating the capabilities of language models","author":"Srivastava","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref20","article-title":"The neural data router: Adaptive control flow in transformers improves systematic generalization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Csord\u00e1s","year":"2022"},{"key":"ref21","article-title":"Faith and fate: Limits of transformers on compositionality","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Dziri","year":"2024"},{"key":"ref22","article-title":"Learning compositionally through attentive guidance","author":"Hupkes","year":"2018"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1848"},{"key":"ref24","article-title":"Compositional abilities emerge multiplicatively: Exploring diffusion models on a synthetic task","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36, pp. 50173--50195","author":"Okawa","year":"2024"},{"key":"ref25","article-title":"Do vision-language pretrained models learn composable primitive concepts?","author":"Yun","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-emnlp.893"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.662"},{"key":"ref28","article-title":"How capable can a transformer become? A study on synthetic, interpretable tasks","volume-title":"Proc. Workshop Symmetry Geometry Neural Representations","author":"Ramesh","year":"2023"},{"key":"ref29","first-page":"27244","article-title":"Transformers learn shortcuts to automata","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Liu","year":"2023"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1800"},{"key":"ref31","first-page":"26038","article-title":"Selection-Inference: Exploiting large language models for interpretable logical reasoning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Creswell","year":"2023"},{"key":"ref32","article-title":"Faithful reasoning using large language models","author":"Creswell","year":"2022"},{"key":"ref33","first-page":"118701","article-title":"Improving generalization and convergence by enhancing implicit regularization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang","year":"2024"},{"key":"ref34","first-page":"25781","article-title":"Understanding the expressive power and mechanisms of transformer for sequence modeling","volume-title":"Proc. Adv. Neural Inf. Process","author":"Wang","year":"2024"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.609"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.591"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.4208\/jml.250723"},{"key":"ref38","article-title":"A very preliminary analysis of DALL-E 2","author":"Marcus","year":"2022"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.ssaho.2023.100648"},{"key":"ref40","article-title":"Testing relational understanding in text-guided image generation","author":"Conwell","year":"2022"},{"key":"ref41","article-title":"Benchmarking spatial relationships in text-to-image generation","author":"Gokhale","year":"2022"},{"key":"ref42","first-page":"8489","article-title":"Reduce, reuse, recycle: Compositional generation with energy-based diffusion models and MCMC","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Du","year":"2023"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_26"},{"key":"ref44","first-page":"35506","article-title":"Training-free structured diffusion guidance for compositional text-to-image synthesis","volume-title":"Proc. 11th Int. Conf. Learn. Representations","volume":"2023","author":"Feng"},{"key":"ref45","first-page":"15121","article-title":"How diffusion models learn to factorize and compose","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liang","year":"2024"},{"key":"ref46","article-title":"Dynamics of concept learning and compositional generalization","volume-title":"Proc. Workshop Math. Modern Mach. Learn.","author":"Yang","year":"2024"},{"key":"ref47","first-page":"84698","article-title":"Emergence of hidden capabilities: Exploring learning dynamics in concept space","volume-title":"Proc. 38th Annu. Conf. Neural Inf. Process. Syst.","author":"Park","year":"2024"},{"key":"ref48","first-page":"8141","article-title":"On exact computation with an infinitely wide neural net","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Arora","year":"2019"},{"key":"ref49","first-page":"3036","article-title":"On the global convergence of gradient descent for over-parameterized models using optimal transport","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Chizat","year":"2018"},{"key":"ref50","first-page":"144","article-title":"A type of generalization error induced by initialization in deep neural networks","volume-title":"Proc. 1st Math. Sci. Mach. Learn. Conf.","author":"Zhang","year":"2020"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/s11425-019-1628-5"},{"key":"ref52","first-page":"8571","article-title":"Neural tangent kernel: Convergence and generalization in neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Jacot","year":"2018"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1806579115"},{"key":"ref54","first-page":"7146","article-title":"Parameters as interacting particles: Long time convergence and asymptotic error scaling of neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Rotskoff","year":"2018"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.spa.2019.06.003"},{"key":"ref56","article-title":"Gradient dynamics of shallow univariate ReLU networks","volume":"32","author":"Williams","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"71","key":"ref57","first-page":"1","article-title":"Phase diagram for two-layer ReLU neural networks at infinite-width limit","volume":"22","author":"Luo","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref58","first-page":"26021","article-title":"Empirical phase diagram for three-layer neural networks with infinite width","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Zhou","year":"2022"},{"key":"ref59","article-title":"Linear stability hypothesis and rank stratification for nonlinear models","author":"Zhang","year":"2022"},{"key":"ref60","first-page":"369","article-title":"Loss spike in training neural networks","author":"Zhang","year":"2026","journal-title":"J. Comput. Math."},{"key":"ref61","first-page":"31095","article-title":"Stochastic modified equations and dynamics of dropout algorithm","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhang","year":"2024"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3357172"},{"key":"ref63","article-title":"Grokking: Generalization beyond overfitting on small algorithmic datasets","author":"Power","year":"2022"},{"key":"ref64","article-title":"How do transformers fill in the blanks? A case study on matrix completion","volume-title":"Proc. ICML Workshop Mechanistic Interpretability","author":"Gopalani","year":"2024"},{"key":"ref65","first-page":"10604","article-title":"Omnigrok: Grokking beyond algorithmic data","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Liu","year":"2023"},{"key":"ref66","first-page":"4475","article-title":"Improving transformer optimization through better initialization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Huang","year":"2020"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.463"},{"key":"ref68","first-page":"34456","article-title":"Mimetic initialization of self-attention layers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Trockman","year":"2023"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3386927"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1083"},{"key":"ref71","first-page":"16410","article-title":"Gradinit: Learning to initialize neural networks for stable and efficient training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Zhu","year":"2021"},{"issue":"8","key":"ref72","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref73","first-page":"2184","article-title":"Towards understanding the condensation of neural networks at initial training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhou","year":"2022"},{"key":"ref74","first-page":"14848","article-title":"Embedding principle of loss landscape of deep neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Zhang","year":"2021"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.4208\/jml.220108"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/1255443.1255449"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1561\/2200000048"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1017\/9781108231596"},{"key":"ref79","first-page":"2873","article-title":"Generalization without systematicity: On the compositional skills of sequence-to-sequence recurrent networks","volume-title":"Proc. Int. Conf. Mach. L.","author":"Lake","year":"2018"},{"key":"ref80","first-page":"19438","article-title":"Case-based or rule-based: How do transformers do the math?","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Hu","year":"2024"},{"key":"ref81","article-title":"SlimPajama: A 627B token cleaned and deduplicated version of RedPajama","author":"Soboleva","year":"2023"},{"key":"ref82","first-page":"35413","article-title":"Language models are greedy reasoners: A systematic formal analysis of chain-of-thought","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Saparov","year":"2023"},{"key":"ref83","first-page":"233","article-title":"A closer look at memorization in deep networks","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","volume":"70","author":"Arpit","year":"2017"},{"key":"ref84","first-page":"5301","article-title":"On the spectral bias of deep neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Rahaman","year":"2019"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.4208\/cicp.oa-2020-0085"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-36708-4_22"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11424231\/11304601.pdf?arnumber=11304601","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T01:34:20Z","timestamp":1773106460000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11304601\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":86,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3646483","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4]]}}}