{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T08:28:47Z","timestamp":1773736127684,"version":"3.50.1"},"reference-count":49,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National R&D Program of China","award":["2022ZD0115301"],"award-info":[{"award-number":["2022ZD0115301"]}]},{"name":"Major Key Project of PCL","award":["PCL2023AS7-1"],"award-info":[{"award-number":["PCL2023AS7-1"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61976144"],"award-info":[{"award-number":["61976144"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61806128"],"award-info":[{"award-number":["61806128"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen International Research Cooperation","award":["GJHZ20220913142611021"],"award-info":[{"award-number":["GJHZ20220913142611021"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,1]]},"DOI":"10.1109\/tpami.2023.3319005","type":"journal-article","created":{"date-parts":[[2023,9,25]],"date-time":"2023-09-25T18:19:16Z","timestamp":1695665956000},"page":"465-478","source":"Crossref","is-referenced-by-count":32,"title":["Re-Thinking the Effectiveness of Batch Normalization and Beyond"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9715-473X","authenticated-orcid":false,"given":"Hanyang","family":"Peng","sequence":"first","affiliation":[{"name":"Peng Cheng Laboratory, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9865-2212","authenticated-orcid":false,"given":"Yue","family":"Yu","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5213-5877","authenticated-orcid":false,"given":"Shiqi","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Research Institute of Trustworthy Autonomous Systems, Southern University of Science and Technology, Shenzhen, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","volume-title":"Proc. Int. Conf. Int. Conf. Mach. Learn.","author":"Ioffe"},{"key":"ref2","article-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"ref3","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vaswani"},{"key":"ref4","article-title":"Instance normalization: The missing ingredient for fast stylization","author":"Ulyanov","year":"2016"},{"key":"ref5","article-title":"Spectral normalization for generative adversarial networks","author":"Miyato","year":"2018"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2018.03.005"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"ref9","first-page":"1945","article-title":"Batch renormalization: Towards reducing minibatch dependence in batch-normalized models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ioffe"},{"key":"ref10","article-title":"Towards stabilizing batch statistics in backward propagation of batch normalization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yan"},{"key":"ref11","article-title":"Machine Learning has become a alchemy","author":"Ali Rahimi","year":"2017","journal-title":"Proc. NIPS Test Time Award Speech"},{"key":"ref12","first-page":"2488","article-title":"How does batch normalization help optimization? (no, it is not about internal covariate shift)","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Santurkar"},{"key":"ref13","first-page":"2232","article-title":"An investigation into neural net optimization via Hessian eigenvalue density","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ghorbani"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_23"},{"key":"ref15","article-title":"Recurrent batch normalization","author":"Cooijmans","year":"2016"},{"key":"ref16","first-page":"16304","article-title":"Stochastic normalization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kou"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00647"},{"key":"ref18","first-page":"901","article-title":"Weight normalization: A simple reparameterization to accelerate training of deep neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Salimans"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11717"},{"key":"ref20","first-page":"8433","article-title":"Online normalization for training neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Chiley"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00500"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5917"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00856"},{"key":"ref24","first-page":"16990","article-title":"Proxy-normalizing activations to match batch normalization while removing batch dependence","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Labatie"},{"key":"ref25","article-title":"Understanding batch normalization","author":"Bjorck","year":"2018"},{"key":"ref26","first-page":"806","article-title":"Exponential convergence rates for batch normalization: The power of length-direction decoupling in non-convex optimization","volume-title":"Proc. 22nd Int. Conf. Artif. Intell. Statist.","author":"Kohler"},{"key":"ref27","article-title":"Theoretical analysis of auto rate-tuning by batch normalization","author":"Arora","year":"2018"},{"key":"ref28","first-page":"18387","article-title":"Batch normalization provably avoids ranks collapse for randomly initialised deep networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Daneshmand"},{"key":"ref29","first-page":"4778","article-title":"Beyond batchnorm: Towards a unified understanding of normalization in deep learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lubana"},{"key":"ref30","article-title":"ADADELTA: An adaptive learning rate method","author":"Zeiler","year":"2012"},{"key":"ref31","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma"},{"key":"ref32","article-title":"On the convergence of Adam and beyond","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Reddi"},{"key":"ref33","article-title":"Variance reduction for faster non-convex optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Allen-Zhu"},{"key":"ref34","first-page":"687","article-title":"Spider: Near-optimal non-convex optimization via stochastic path-integrated differential estimator","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Fang"},{"key":"ref35","first-page":"2403","article-title":"SpiderBoost and momentum: Faster variance reduction algorithms","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref36","first-page":"560","article-title":"SIGNSGD: Compressed optimisation for non-convex problems","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Bernstein"},{"key":"ref37","first-page":"2021","article-title":"FedPAQ: A communication-efficient federated learning method with periodic averaging and quantization","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Reisizadeh"},{"key":"ref38","article-title":"On the convergence of a class of Adam-type algorithms for non-convex optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen"},{"key":"ref39","first-page":"15210","article-title":"Momentum-based variance reduction in non-convex SGD","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Cutkosky"},{"key":"ref40","first-page":"315","article-title":"Accelerating stochastic gradient descent using predictive variance reduction","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Johnson"},{"key":"ref41","article-title":"All you need is a good init","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Mishkin"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00089"},{"key":"ref43","first-page":"8580","article-title":"Neural tangent kernel: Convergence and generalization in neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jacot"},{"key":"ref44","first-page":"1675","article-title":"Gradient descent finds global minima of deep neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Du"},{"key":"ref45","first-page":"2937","article-title":"On lazy training in differentiable programming","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Chizat"},{"key":"ref46","article-title":"A mean field theory of batch normalization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yang"},{"key":"ref47","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Karen Simonyan","year":"2016"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref49","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tan"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10345401\/10262355.pdf?arnumber=10262355","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,19]],"date-time":"2023-12-19T22:12:09Z","timestamp":1703023929000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10262355\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1]]},"references-count":49,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2023.3319005","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,1]]}}}