{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:20Z","timestamp":1750309520500,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":23,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,17]],"date-time":"2024-10-17T00:00:00Z","timestamp":1729123200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,17]]},"DOI":"10.1145\/3704137.3704148","type":"proceedings-article","created":{"date-parts":[[2025,3,3]],"date-time":"2025-03-03T09:32:11Z","timestamp":1740994331000},"page":"73-79","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Sub-Batch Update Mechanism for Component-Wise Natural Gradient Descent"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4211-049X","authenticated-orcid":false,"given":"Sang Van","family":"Tran","sequence":"first","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6383-7105","authenticated-orcid":false,"given":"Toshiyuki","family":"Nakata","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6359-2221","authenticated-orcid":false,"given":"Rie Shigetomi","family":"Yamaguchi","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6229-5561","authenticated-orcid":false,"given":"Irvan","family":"Mhd","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7901-4787","authenticated-orcid":false,"given":"Yoshihide","family":"Yoshimoto","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,3,3]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Shun-ichi Amari. 1993. Backpropagation and stochastic gradient descent method. Neurocomputing 5 4-5 (1993) 185\u2013196.","DOI":"10.1016\/0925-2312(93)90006-O"},{"key":"e_1_3_3_1_3_2","volume-title":"AISTATS","author":"Coates Adam","year":"2011","unstructured":"Adam Coates, Andrew Ng, and Honglak Lee. 2011. An Analysis of Single Layer Networks in Unsupervised Feature Learning. In AISTATS."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1007\/s10710-017-9314-z"},{"key":"e_1_3_3_1_6_2","unstructured":"Priya Goyal Piotr Doll\u00e1r Ross\u00a0B. Girshick Pieter Noordhuis Lukasz Wesolowski Aapo Kyrola Andrew Tulloch Yangqing Jia and Kaiming He. 2017. Accurate Large Minibatch SGD: Training ImageNet in 1 Hour. CoRR abs\/1706.02677 (2017). arXiv:https:\/\/arXiv.org\/abs\/1706.02677http:\/\/arxiv.org\/abs\/1706.02677"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"e_1_3_3_1_8_2","series-title":"(NIPS\u201917)","first-page":"1729","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"Hoffer Elad","year":"2017","unstructured":"Elad Hoffer, Itay Hubara, and Daniel Soudry. 2017. Train Longer, Generalize Better: Closing the Generalization Gap in Large Batch Training of Neural Networks. In Proceedings of the 31st International Conference on Neural Information Processing Systems (Long Beach, California, USA) (NIPS\u201917). Curran Associates Inc., Red Hook, NY, USA, 1729\u20131739."},{"key":"e_1_3_3_1_9_2","series-title":"(NIPS\u201920)","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Karakida Ryo","year":"2020","unstructured":"Ryo Karakida and Kazuki Osawa. 2020. Understanding Approximate Fisher Information for Fast Convergence of Natural Gradient Descent in Wide Neural Networks. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS\u201920). Curran Associates Inc., Red Hook, NY, USA, Article 914, 11\u00a0pages."},{"key":"e_1_3_3_1_10_2","unstructured":"Kenji Kawaguchi. 2016. Deep learning without poor local minima. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_3_1_11_2","unstructured":"Nitish\u00a0Shirish Keskar Dheevatsa Mudigere Jorge Nocedal Mikhail Smelyanskiy and Ping Tak\u00a0Peter Tang. 2016. On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima. CoRR abs\/1609.04836 (2016). arXiv:https:\/\/arXiv.org\/abs\/1609.04836http:\/\/arxiv.org\/abs\/1609.04836"},{"key":"e_1_3_3_1_12_2","unstructured":"Alex Krizhevsky. 2014. One weird trick for parallelizing convolutional neural networks. CoRR abs\/1404.5997 (2014). arXiv:https:\/\/arXiv.org\/abs\/1404.5997http:\/\/arxiv.org\/abs\/1404.5997"},{"key":"e_1_3_3_1_13_2","unstructured":"Alex Krizhevsky Vinod Nair and Geoffrey Hinton. 2010. CIFAR-10 (Canadian Institute for Advanced Research). (2010). http:\/\/www.cs.toronto.edu\/\u00a0kriz\/cifar.html"},{"key":"e_1_3_3_1_14_2","volume-title":"Advances in Neural Information Processing Systems","author":"Kunstner Frederik","year":"2019","unstructured":"Frederik Kunstner, Philipp Hennig, and Lukas Balles. 2019. Limitations of the empirical Fisher approximation for natural gradient descent. In Advances in Neural Information Processing Systems, H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.), Vol.\u00a032. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/46a558d97954d0692411c861cf78ef79-Paper.pdf"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/2623330.2623612"},{"key":"e_1_3_3_1_16_2","series-title":"Proceedings of Machine Learning Research","first-page":"2408","volume-title":"Proceedings of the 32nd International Conference on Machine Learning","volume":"37","author":"Martens James","year":"2015","unstructured":"James Martens and Roger Grosse. 2015. Optimizing Neural Networks with Kronecker-factored Approximate Curvature. In Proceedings of the 32nd International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a037), Francis Bach and David Blei (Eds.). PMLR, Lille, France, 2408\u20132417. https:\/\/proceedings.mlr.press\/v37\/martens15.html"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2355061"},{"key":"e_1_3_3_1_19_2","unstructured":"Samuel\u00a0L. Smith Pieter-Jan Kindermans and Quoc\u00a0V. Le. 2017. Don\u2019t Decay the Learning Rate Increase the Batch Size. CoRR abs\/1711.00489 (2017). arXiv:https:\/\/arXiv.org\/abs\/1711.00489http:\/\/arxiv.org\/abs\/1711.00489"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","unstructured":"Sang Tran Toshiyuki Nakata Rie Yamaguchi Irvan Mhd and Yoshihide Yoshimoto. 2024. Enhanced Component-Wise Natural Gradient Descent Training Method for Deep Neural Networks. 10.21203\/rs.3.rs-3808006\/v1","DOI":"10.21203\/rs.3.rs-3808006\/v1"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CANDAR57322.2022.00016"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CSCI58124.2022.00036"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","unstructured":"D. Wilson and Tony Martinez. 2004. The general inefficiency of batch training for gradient descent learning. Neural networks : the official journal of the International Neural Network Society 16 (01 2004) 1429\u201351. 10.1016\/S0893-6080(03)00138-2","DOI":"10.1016\/S0893-6080(03)00138-2"},{"key":"e_1_3_3_1_24_2","unstructured":"Yang You Igor Gitman and Boris Ginsburg. 2017. Scaling SGD Batch Size to 32K for ImageNet Training. CoRR abs\/1708.03888 (2017). arXiv:https:\/\/arXiv.org\/abs\/1708.03888http:\/\/arxiv.org\/abs\/1708.03888"}],"event":{"name":"ICAAI 2024: 2024 The 8th International Conference on Advances in Artificial Intelligence","acronym":"ICAAI 2024","location":"London United Kingdom"},"container-title":["Proceedings of the 2024 8th International Conference on Advances in Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3704137.3704148","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3704137.3704148","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:06Z","timestamp":1750295886000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3704137.3704148"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,17]]},"references-count":23,"alternative-id":["10.1145\/3704137.3704148","10.1145\/3704137"],"URL":"https:\/\/doi.org\/10.1145\/3704137.3704148","relation":{},"subject":[],"published":{"date-parts":[[2024,10,17]]},"assertion":[{"value":"2025-03-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}