{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,19]],"date-time":"2025-12-19T15:45:08Z","timestamp":1766159108856,"version":"3.37.3"},"reference-count":66,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2020AAA0105601"],"award-info":[{"award-number":["2020AAA0105601"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61976174","62276208"],"award-info":[{"award-number":["61976174","62276208"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1109\/tnnls.2023.3242969","type":"journal-article","created":{"date-parts":[[2023,2,17]],"date-time":"2023-02-17T20:54:30Z","timestamp":1676667270000},"page":"10576-10590","source":"Crossref","is-referenced-by-count":1,"title":["Understanding Short-Range Memory Effects in Deep Neural Networks"],"prefix":"10.1109","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7091-898X","authenticated-orcid":false,"given":"Chengli","family":"Tan","sequence":"first","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8395-1180","authenticated-orcid":false,"given":"Jiangshe","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1462-7248","authenticated-orcid":false,"given":"Junmin","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1061\/taceat.0006518"},{"volume-title":"Long-Term Storage: An Experimental Study","year":"1965","author":"Black","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1038\/srep06577"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/1-84628-048-6_11"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/90.803379"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/MIC.2004.46"},{"key":"ref7","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi","year":"2011","journal-title":"J. Mach. Learn. Res."},{"key":"ref8","first-page":"1","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Kingma"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2955777"},{"key":"ref10","first-page":"1","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Keskar"},{"key":"ref11","first-page":"1","article-title":"Understanding deep learning requires rethinking generalization","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Zhang"},{"key":"ref12","first-page":"1731","article-title":"Train longer, generalize better: Closing the generalization gap in large batch training of neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hoffer"},{"key":"ref13","article-title":"Three factors influencing minima in SGD","author":"Jastrz\u0119bski","year":"2017","journal-title":"arXiv:1711.04623"},{"key":"ref14","first-page":"7654","article-title":"The anisotropic noise in stochastic gradient descent: Its behavior of escaping from sharp minima and regularization effects","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhu"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1137\/16M1080173"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2017.2672978"},{"key":"ref17","first-page":"1","article-title":"Don\u2019t decay the learning rate, increase the batch size","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Smith"},{"key":"ref18","first-page":"4949","article-title":"Hessian-based analysis of large batch training and robustness to adversaries","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yao"},{"key":"ref19","first-page":"1143","article-title":"Control batch size and learning rate to generalize well: Theoretical and empirical evidence","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"He"},{"key":"ref20","first-page":"1","article-title":"On the noisy gradient descent that generalizes as SGD","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wu"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2957003"},{"issue":"1","key":"ref22","first-page":"4873","article-title":"Stochastic gradient descent as approximate Bayesian inference","volume":"18","author":"Mandt","year":"2017","journal-title":"J. Mach. Learn. Res."},{"key":"ref23","first-page":"2101","article-title":"Stochastic modified equations and adaptive stochastic gradient algorithms","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ITA.2018.8503224"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.4310\/AMSA.2019.v4.n1.a1"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/168304.168306"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.1.1"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1088\/1742-5468\/ab39d9"},{"key":"ref29","first-page":"5827","article-title":"A tail-index analysis of stochastic gradient noise in deep neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Simsekli"},{"key":"ref30","first-page":"273","article-title":"First exit time analysis of stochastic gradient descent under heavy-tailed gradient noise","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Nguyen"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1093\/oso\/9780198507659.001.0001"},{"key":"ref32","article-title":"Opening the black box of deep neural networks via information","author":"Shwartz-Ziv","year":"2017","journal-title":"arXiv:1703.00810"},{"key":"ref33","first-page":"1019","article-title":"Sharp minima can generalize for deep nets","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Dinh"},{"key":"ref34","first-page":"1","article-title":"A diffusion theory for deep learning dynamics: Stochastic gradient descent exponentially favors flat minima","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Xie"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045167"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.123"},{"key":"ref37","first-page":"249","article-title":"Understanding the difficulty of training deep feedforward neural networks","volume-title":"Proc. Int. Conf. Artif. Intell. Stat.","author":"Glorot"},{"key":"ref38","article-title":"Non-Gaussianity of stochastic gradient noise","author":"Panigrahi","year":"2019","journal-title":"arXiv:1910.09626"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref40","first-page":"1","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Dosovitskiy"},{"issue":"1","key":"ref41","first-page":"6","article-title":"Curves in Hilbert space which are invariant with respect to a one-parameter group of motions","volume":"26","author":"Kolmogorov","year":"1940","journal-title":"Dokl. Akad. Nauk SSSR"},{"issue":"1","key":"ref42","first-page":"141","article-title":"Correlation theory of processes with random stationary nth increments","volume":"79","author":"Yaglom","year":"1955","journal-title":"Matematicheskii Sbornik"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1137\/1010093"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/90.282603"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1515\/9781400830213"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2017.2674692"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1126\/science.156.3775.636"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1214\/009117904000000892"},{"issue":"1","key":"ref49","first-page":"55","article-title":"Differential equations driven by fractional Brownian motion","volume":"53","author":"Rascanu","year":"2002","journal-title":"Collect. Math."},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1080\/17442500802024892"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/BF01010843"},{"key":"ref52","first-page":"6389","article-title":"Visualizing the loss landscape of neural nets","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1214\/EJP.v8-125"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1016\/c2013-0-05932-4"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.5488\/CMP.14.23002"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevE.81.041119"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1063\/1.4707349"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1080\/00207729308949547"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2022.3210045"},{"key":"ref60","first-page":"1","article-title":"Fantastic generalization measures and where to find them","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Jiang"},{"key":"ref61","first-page":"1","article-title":"In search of robust measures of generalization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dziugaite"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1209\/0295-5075\/86\/30001"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1515\/9781400825103"},{"key":"ref64","first-page":"203","article-title":"Hurst exponent and financial market predictability","volume-title":"Proc. IASTED Int. Conf.","author":"Qian"},{"key":"ref65","first-page":"1","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Keskar"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5962385\/10623582\/10047985.pdf?arnumber=10047985","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,6]],"date-time":"2024-08-06T05:19:12Z","timestamp":1722921552000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10047985\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8]]},"references-count":66,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2023.3242969","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"type":"print","value":"2162-237X"},{"type":"electronic","value":"2162-2388"}],"subject":[],"published":{"date-parts":[[2024,8]]}}}