{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T19:56:13Z","timestamp":1775073373404,"version":"3.50.1"},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,7,11]],"date-time":"2025-07-11T00:00:00Z","timestamp":1752192000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,11]],"date-time":"2025-07-11T00:00:00Z","timestamp":1752192000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-025-04182-z","type":"journal-article","created":{"date-parts":[[2025,7,11]],"date-time":"2025-07-11T06:53:38Z","timestamp":1752216818000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Grokking in Neural Networks: A Review"],"prefix":"10.1007","volume":"6","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-7216-1049","authenticated-orcid":false,"given":"Tathagat","family":"Agrawal","sequence":"first","affiliation":[]},{"given":"Manoj","family":"Kumar","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,11]]},"reference":[{"key":"4182_CR1","unstructured":"Abramov R, Steinbauer F, Kasneci G. Grokking in the wild: data augmentation for real-world multi-hop reasoning with transformers 2025. arXiv preprint arXiv:2504.20752."},{"key":"4182_CR2","first-page":"21750","volume":"35","author":"B Barak","year":"2022","unstructured":"Barak B, Edelman B, Goel S, Kakade S, Malach E, Zhang C. Hidden progress in deep learning: Sgd learns parities near the computational limit. Adv Neural Inf Process Syst. 2022;35:21750\u201364.","journal-title":"Adv Neural Inf Process Syst"},{"issue":"32","key":"4182_CR3","doi-asserted-by":"publisher","first-page":"15849","DOI":"10.1073\/pnas.1903070116","volume":"116","author":"M Belkin","year":"2019","unstructured":"Belkin M, Hsu D, Ma S, Mandal S. Reconciling modern machine-learning practice and the classical bias-variance trade-off. Proc Natl Acad Sci. 2019;116(32):15849\u201354.","journal-title":"Proc Natl Acad Sci"},{"key":"4182_CR4","unstructured":"Davies X, Langosco L, Krueger D. Unifying grokking and double descent. In: NeurIPS ML Safety Workshop 2022."},{"key":"4182_CR5","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li K, Fei-Fei L. Imagenet: a large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition. IEEE 2009. pp. 248\u201355.","DOI":"10.1109\/CVPR.2009.5206848"},{"issue":"6","key":"4182_CR6","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1109\/MSP.2012.2211477","volume":"29","author":"L Deng","year":"2012","unstructured":"Deng L. The Mnist database of handwritten digit images for machine learning research. IEEE Signal Process Mag. 2012;29(6):141\u20132.","journal-title":"IEEE Signal Process Mag"},{"key":"4182_CR7","unstructured":"Fan S, Pascanu R, Jaggi M. Deep grokking: Would deep neural networks generalize better? 2024. arXiv preprint arXiv:2405.19454 ."},{"key":"4182_CR8","doi-asserted-by":"publisher","first-page":"3574","DOI":"10.1609\/aaai.v33i01.33013574","volume":"33","author":"S Fort","year":"2019","unstructured":"Fort S, Scherlis A. The goldilocks zone: towards better understanding of neural network loss landscapes. Proc AAAI Conf Artif Intell. 2019;33:3574\u201381.","journal-title":"Proceedings of the aaai conference on artificial intelligence"},{"key":"4182_CR9","unstructured":"Gromov A. Grokking modular arithmetic 2023. arXiv preprint arXiv:2301.02679"},{"key":"4182_CR10","unstructured":"Heckel R, Yilmaz FF. Early stopping in deep networks: double descent and how to eliminate it. arXiv preprint arXiv:2007.10099 (2020)"},{"key":"4182_CR11","unstructured":"Huang Y, Hu S, Han X, Liu Z, Sun M. Unified view of grokking, double descent and emergent abilities: a perspective from circuits competition 2024. arXiv preprint arXiv:2402.15175"},{"key":"4182_CR12","unstructured":"Humayun AI, Balestriero R, Baraniuk R. Deep networks always grok and here is why. In: High-dimensional learning dynamics 2024: the emergence of structure and reasoning 2024."},{"key":"4182_CR13","unstructured":"Krizhevsky A, Hinton G. Learning multiple layers of features from tiny images 2009."},{"key":"4182_CR14","unstructured":"Kumar T, Bordelon B, Gershman SJ, Pehlevan C. Grokking as the transition from lazy to rich training dynamics. In: The twelfth international conference on learning representations 2024."},{"key":"4182_CR15","unstructured":"Lee J, Kang BG, Kim K, Lee KM. Grokfast: Accelerated grokking by amplifying slow gradients 2024. arXiv preprint arXiv:2405.20233"},{"key":"4182_CR16","first-page":"34651","volume":"35","author":"Z Liu","year":"2022","unstructured":"Liu Z, Kitouni O, Nolte NS, Michaud E, Tegmark M, Williams M. Towards understanding grokking: an effective theory of representation learning. Adv Neural Inf Process Syst. 2022;35:34651\u201363.","journal-title":"Adv Neural Inf Process Syst"},{"key":"4182_CR17","unstructured":"Liu Z, Michaud EJ, Tegmark M. Omnigrok: grokking beyond algorithmic data. In: The eleventh international conference on learning representations 2023."},{"key":"4182_CR18","unstructured":"Lyu K, Jin J, Li Z, Du SS, Lee JD, Hu W. Dichotomy of early and late phase implicit biases can provably induce grokking. In: The twelfth international conference on learning representations 2024."},{"key":"4182_CR19","unstructured":"Maas A, Daly RE, Pham PT, Huang D, Ng AY, Potts C. Learning word vectors for sentiment analysis. In: Proceedings of the 49th annual meeting of the association for computational linguistics: human language technologies. 2011. pp. 142\u201350."},{"key":"4182_CR20","unstructured":"Merrill W, Tsilivis N, Shukla A. A tale of two circuits: Grokking as competition of sparse and dense subnetworks 2023. https:\/\/arxiv.org\/abs\/2303.11873"},{"key":"4182_CR21","unstructured":"Mohamadi MA, Li Z, Wu L, Sutherland D. Grokking modular arithmetic can be explained by margin maximization. In: NeurIPS 2023 Workshop on mathematics of modern machine learning 2023."},{"issue":"12","key":"4182_CR22","doi-asserted-by":"publisher","DOI":"10.1088\/1742-5468\/ac3a74","volume":"2021","author":"P Nakkiran","year":"2021","unstructured":"Nakkiran P, Kaplun G, Bansal Y, Yang T, Barak B, Sutskever I. Deep double descent: where bigger models and more data hurt. J Stat Mech Theory Exp. 2021;2021(12): 124003.","journal-title":"J Stat Mech: Theory Exp"},{"key":"4182_CR23","unstructured":"Nanda N, Chan L, Lieberum T, Smith J, Steinhardt J. Progress measures for grokking via mechanistic interpretability. In: The eleventh international conference on learning representations 2023."},{"key":"4182_CR24","unstructured":"Pezeshki M, Mitra A, Bengio Y, Lajoie G. Multi-scale feature learning dynamics: Insights for double descent. In: International conference on machine learning. PMLR 2022. pp. 17669\u201390."},{"key":"4182_CR25","unstructured":"Power A, Burda Y, Edwards H, Babuschkin I, Misra V. Grokking: Generalization beyond overfitting on small algorithmic datasets 2022. arXiv preprint arXiv:2201.02177"},{"issue":"1","key":"4182_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/sdata.2014.22","volume":"1","author":"R Ramakrishnan","year":"2014","unstructured":"Ramakrishnan R, Dral PO, Rupp M, Von Lilienfeld OA. Quantum chemistry structures and properties of 134 kilo molecules. Sci Data. 2014;1(1):1\u20137.","journal-title":"Scientific data"},{"key":"4182_CR27","unstructured":"Rubin N, Seroussi I, Ringel Z. Grokking as a first order phase transition in two layer networks. In: The twelfth international conference on learning representations 2024."},{"key":"4182_CR28","unstructured":"Stephenson C, Lee T. When and how epochwise double descent happens 2021. arXiv preprint arXiv:2108.12006"},{"key":"4182_CR29","unstructured":"Thilak V, Littwin E, Zhai S, Saremi O, Paiss R, Susskind JM. The slingshot effect: a late-stage optimization anomaly in adam-family of optimization methods. In: Transactions on machine learning research (TMLR) 2024."},{"key":"4182_CR30","unstructured":"Varma V, Shah R, Kenton Z, Kram\u00e1r J, Kumar R. Explaining grokking through circuit efficiency 2023. arXiv preprint arXiv:2309.02390"},{"key":"4182_CR31","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I. Attention is all you need. Adv Neural Inf Process Syst. 2017;30."},{"key":"4182_CR32","unstructured":"Wang B, Yue X, Su Y, Sun H. Grokked transformers are implicit reasoners: a mechanistic journey to the edge of generalization 2024. arXiv preprint arXiv:2405.15071"},{"key":"4182_CR33","unstructured":"Zhu X, Fu Y, Zhou B, Lin Z. Critical data size of language models from a grokking perspective 2024. arXiv preprint arXiv:2401.10463"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-04182-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-025-04182-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-04182-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,7]],"date-time":"2025-09-07T05:05:54Z","timestamp":1757221554000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-025-04182-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,11]]},"references-count":33,"journal-issue":{"issue":"6","published-online":{"date-parts":[[2025,8]]}},"alternative-id":["4182"],"URL":"https:\/\/doi.org\/10.1007\/s42979-025-04182-z","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,11]]},"assertion":[{"value":"26 November 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"This work is based solely on the analysis and synthesis of previously published research and publicly available information. No human participants, animals, or stakeholders were involved. The authors have ensured that all sources are properly cited to acknowledge the original contributors and avoid plagiarism.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed Consent"}},{"value":"Not applicable.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Research Involving Human and\/or Animals"}}],"article-number":"627"}}