{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T21:29:14Z","timestamp":1770067754727,"version":"3.49.0"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T00:00:00Z","timestamp":1769990400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T00:00:00Z","timestamp":1769990400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"publisher","award":["RGPIN- 2024-05086"],"award-info":[{"award-number":["RGPIN- 2024-05086"]}],"id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"publisher","award":["RGPIN-2020-06535"],"award-info":[{"award-number":["RGPIN-2020-06535"]}],"id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"publisher","award":["RGPIN-2025-06911"],"award-info":[{"award-number":["RGPIN-2025-06911"]}],"id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-025-04704-9","type":"journal-article","created":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T10:05:03Z","timestamp":1770026703000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Zeroth-Order Kronecker Optimization for Pretraining Language Models"],"prefix":"10.1007","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0694-8216","authenticated-orcid":false,"given":"Nathan","family":"Allaire","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3148-5090","authenticated-orcid":false,"given":"S\u00e9bastien","family":"Le Digabel","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8017-7687","authenticated-orcid":false,"given":"Dominique","family":"Orban","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6673-4224","authenticated-orcid":false,"given":"Vahid","family":"Partovi Nia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,2]]},"reference":[{"key":"4704_CR1","unstructured":"Kaplan J, McCandlish S, Henighan T, Brown TB, Chess B, Child R, Gray S, Radford A, Wu J, Amodei D. Scaling laws for neural language models 2020. arxiv.org\/abs\/2001.08361"},{"key":"4704_CR2","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I. Attention is all you need. CoRR 2017. arxiv.org\/abs\/1706.03762"},{"key":"4704_CR3","unstructured":"Malladi S, Gao T, Nichani E, Damian A, Lee JD, Chen D, Arora S. Fine-tuning language models with just forward passes 2024. arxiv.org\/abs\/2305.17333"},{"key":"4704_CR4","unstructured":"Hu EJ, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W. LoRA: low-rank adaptation of large language models 2021. arxiv.org\/abs\/2106.09685"},{"issue":"4\u20135","key":"4704_CR5","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1016\/0925-2312(93)90006-O","volume":"5","author":"Amari Si","year":"1993","unstructured":"Si Amari. Backpropagation and stochastic gradient descent method. Neurocomputing. 1993;5(4\u20135):185\u201396. https:\/\/doi.org\/10.1016\/0925-2312(93)90006-O.","journal-title":"Neurocomputing"},{"key":"4704_CR6","unstructured":"Kingma DP, Ba J. Adam: a method for stochastic optimization 2017. arxiv.org\/abs\/1412.6980"},{"issue":"3","key":"4704_CR7","doi-asserted-by":"publisher","first-page":"332","DOI":"10.1109\/9.119632","volume":"37","author":"JC Spall","year":"1992","unstructured":"Spall JC. Multivariate stochastic approximation using a simultaneous perturbation gradient approximation. IEEE Trans Autom Control. 1992;37(3):332\u201341. https:\/\/doi.org\/10.1109\/9.119632.","journal-title":"IEEE Trans Autom Control"},{"issue":"4","key":"4704_CR8","doi-asserted-by":"publisher","first-page":"2341","DOI":"10.1137\/120880811","volume":"23","author":"S Ghadimi","year":"2013","unstructured":"Ghadimi S, Lan G. Stochastic first- and zeroth-order methods for nonconvex stochastic programming. SIAM J Optim. 2013;23(4):2341\u201368. https:\/\/doi.org\/10.1137\/120880811.","journal-title":"SIAM J Optim"},{"key":"4704_CR9","unstructured":"Duchi JC, Jordan MI, Wainwright MJ, Wibisono A. Optimal rates for zero-order convex optimization: the power of two function evaluations 2014. arxiv.org\/abs\/1312.2139"},{"issue":"2","key":"4704_CR10","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/s10208-015-9296-2","volume":"17","author":"Y Nesterov","year":"2017","unstructured":"Nesterov Y, Spokoiny V. Random gradient-free minimization of convex functions. Found Comput Math. 2017;17(2):527\u201366. https:\/\/doi.org\/10.1007\/s10208-015-9296-2.","journal-title":"Found Comput Math"},{"key":"4704_CR11","unstructured":"Zhang Y, Li P, Hong J, Li J, Zhang Y, Zheng W, Chen PY, Lee JD, Yin W, Hong M, Wang Z, Liu S, Chen T. Revisiting zeroth-order optimization for memory-efficient LLM fine-tuning: a benchmark 2024. arxiv.org\/abs\/2402.11592"},{"key":"4704_CR12","unstructured":"Chen Y, Zhang Y, Cao L, Yuan K, Wen Z. Enhancing zeroth-order fine-tuning for language models with low-rank structures 2024. arxiv.org\/abs\/2410.07698"},{"key":"4704_CR13","doi-asserted-by":"publisher","unstructured":"Allaire N, Ghazvini\u00a0Nejad M, Le\u00a0Digabel S, Partovi\u00a0Nia V. Zeroth order optimization for pretraining language models. In: Proceedings of the 14th international conference on pattern recognition applications and methods - Volume\u00a01: ICPRAM, pp. 113\u2013121. SciTePress, Porto, Portugal 2025. https:\/\/doi.org\/10.5220\/0013261100003905","DOI":"10.5220\/0013261100003905"},{"key":"4704_CR14","doi-asserted-by":"publisher","unstructured":"Audet C, Hare W. Derivative-free and blackbox optimization. Springer Series in Operations Research and Financial Engineering. Springer, Cham, Switzerland 2017. https:\/\/doi.org\/10.1007\/978-3-319-68913-5","DOI":"10.1007\/978-3-319-68913-5"},{"key":"4704_CR15","doi-asserted-by":"publisher","unstructured":"Blum JR. Multidimensional stochastic approximation methods. Ann Math Stat, 1954:737\u2013744. https:\/\/doi.org\/10.1214\/aoms\/1177728659","DOI":"10.1214\/aoms\/1177728659"},{"key":"4704_CR16","unstructured":"Ilyas A, Engstrom L, Athalye A, Lin J. Black-box adversarial attacks with limited queries and information 2018. arxiv.org\/abs\/1804.08598"},{"key":"4704_CR17","doi-asserted-by":"crossref","unstructured":"Zhao P, Liu S, Chen PY, Hoang N, Xu K, Kailkhura B, Lin X. On the design of black-box adversarial examples by leveraging gradient-free optimization and operator splitting method 2019. arxiv.org\/abs\/1907.11684","DOI":"10.1109\/ICCV.2019.00021"},{"key":"4704_CR18","unstructured":"Tu CC, Ting P, Chen PY, Liu S, Zhang H, Yi J, Hsieh CJ, Cheng SM. AutoZOOM: autoencoder-based zeroth order optimization method for attacking black-box neural networks 2020. arxiv.org\/abs\/1805.11770"},{"key":"4704_CR19","unstructured":"Dhurandhar A, Pedapati T, Balakrishnan A, Chen PY, Shanmugam K, Puri R. Model agnostic contrastive explanations for structured data 2019. arxiv.org\/abs\/1906.00117"},{"key":"4704_CR20","unstructured":"Tribes C, Benarroch-Lelong S, Lu P, Kobyzev I. Hyperparameter optimization for large language model instruction-tuning. Technical Report G-2023-62, Les cahiers du GERAD 2023. 10.48550\/arXiv.2312.00949"},{"issue":"3","key":"4704_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450975","volume":"47","author":"D Lakhmiri","year":"2021","unstructured":"Lakhmiri D, Le Digabel S, Tribes C. HyperNOMAD: hyperparameter optimization of deep neural networks using mesh adaptive direct search. ACM Trans Math Softw. 2021;47(3):1\u201327. https:\/\/doi.org\/10.1145\/3450975.","journal-title":"ACM Trans Math Softw"},{"key":"4704_CR22","unstructured":"Zhang L, Li B, Thekumparampil KK, Oh S, Muehlebach M, He N. Zeroth-order optimization finds flat minima 2025. arxiv.org\/abs\/2506.05454"},{"key":"4704_CR23","doi-asserted-by":"publisher","unstructured":"Gururangan S, Marasovi\u0107 A, Swayamdipta S, Lo K, Beltagy I, Downey D, Smith NA. Don\u2019t stop pretraining: Adapt language models to domains and tasks. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 8342\u20138360. Association for Computational Linguistics, Online, hosted in Seattle, WA 2020. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.740","DOI":"10.18653\/v1\/2020.acl-main.740"},{"key":"4704_CR24","unstructured":"Loshchilov I, Hutter F. Decoupled weight decay regularization 2019. arxiv.org\/abs\/1711.05101"},{"key":"4704_CR25","unstructured":"Li C, Farkhoor H, Liu R, Yosinski J. Measuring the intrinsic dimension of objective landscapes 2018. arxiv.org\/abs\/1804.08838"},{"key":"4704_CR26","unstructured":"Larsen BW, Fort S, Becker N, Ganguli S. How many degrees of freedom do we need to train deep networks: a loss landscape perspective 2022. arxiv.org\/abs\/2107.05802"},{"key":"4704_CR27","doi-asserted-by":"publisher","unstructured":"Gao P, Trautmann E, Yu B, Santhanam G, Ryu S, Shenoy K, Ganguli S. A theory of multineuronal dimensionality, dynamics and measurement. bioRxiv 2017. https:\/\/doi.org\/10.1101\/214262","DOI":"10.1101\/214262"},{"key":"#cr-split#-4704_CR28.1","unstructured":"Herfindahl OC. Concentration in the steel industry. PhD thesis, Columbia University (1950). PhD thesis"},{"key":"#cr-split#-4704_CR28.2","unstructured":"accessible via Internet Archive (published online 2021). https:\/\/archive.org\/details\/herfindahl-concentration-in-the-steel-industry-1950-publish Accessed 2025-09-08"},{"key":"4704_CR29","unstructured":"Graham A. Kronecker products and matrix calculus with applications. Mineola, NY: Courier Dover Publications; 2018."},{"key":"4704_CR30","unstructured":"Hameed MGA, Mosleh A, Tahaei MS, Partovi Nia V. SeKron: a decomposition method supporting many factorization structures 2022. arxiv.org\/abs\/2210.06299"},{"key":"4704_CR31","unstructured":"Radford A, Wu J, Child R, Luan D, Amodei D, Sutskever I. Language models are unsupervised multitask learners. Technical report, OpenAI 2019. https:\/\/cdn.openai.com\/better-language-models\/language_models_are_unsupervised_multitask_learners.pdf"},{"key":"4704_CR32","unstructured":"Biderman S, Gao L, Lehman J, Hallahan E, Zue C, Gray S, McDonell K, Phang J, Weinbach S, Pavlick E. OpenWebText Corpus. 2019. https:\/\/skylion007.github.io\/OpenWebTextCorpus\/"},{"key":"4704_CR33","unstructured":"Karpathy A. NanoGPT. https:\/\/github.com\/karpathy\/nanoGPT. GitHub repository 2023"},{"key":"4704_CR34","unstructured":"Zhang S, Roller S, Goyal N, Artetxe M, Chen M, Chen S, Dewan C, Diab M, Li X, Lin XV, Mihaylov T, Ott M, Shleifer S, Shuster K, Simig D, Koura PS, Sridhar A, Wang T, Zettlemoyer L. OPT: Open Pre-trained Transformer Language Models 2022. arxiv.org\/abs\/2205.01068"},{"key":"4704_CR35","unstructured":"Wang A, Singh A, Michael J, Hill F, Levy O, Bowman SR. GLUE: a multi-task benchmark and analysis platform for natural language understanding 2019. arxiv.org\/abs\/1804.07461"},{"issue":"1","key":"4704_CR36","doi-asserted-by":"publisher","first-page":"188","DOI":"10.1137\/040603371","volume":"17","author":"C Audet","year":"2006","unstructured":"Audet C, Dennis JE. Mesh adaptive direct search algorithms for constrained optimization. SIAM J Optim. 2006;17(1):188\u2013217. https:\/\/doi.org\/10.1137\/040603371.","journal-title":"SIAM J Optim"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-04704-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-025-04704-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-04704-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T10:05:19Z","timestamp":1770026719000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-025-04704-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,2]]},"references-count":37,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2026,2]]}},"alternative-id":["4704"],"URL":"https:\/\/doi.org\/10.1007\/s42979-025-04704-9","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,2]]},"assertion":[{"value":"10 September 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"On behalf of all authors, the corresponding author states that there is no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}],"article-number":"162"}}