{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T21:33:07Z","timestamp":1757626387472,"version":"3.44.0"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030865221"},{"type":"electronic","value":"9783030865238"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-86523-8_38","type":"book-chapter","created":{"date-parts":[[2021,9,10]],"date-time":"2021-09-10T06:05:16Z","timestamp":1631253916000},"page":"628-643","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["MaxVA: Fast Adaptation of Step Sizes by Maximizing Observed Variance of Gradients"],"prefix":"10.1007","author":[{"given":"Chen","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Yu","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Zhe","family":"Gan","sequence":"additional","affiliation":[]},{"given":"Furong","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Jingjing","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Tom","family":"Goldstein","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,11]]},"reference":[{"key":"38_CR1","unstructured":"Balles, L., Hennig, P.: Dissecting adam: the sign, magnitude and variance of stochastic gradients. In: ICML, pp. 404\u2013413 (2018)"},{"key":"38_CR2","unstructured":"Bernstein, J., Wang, Y.X., Azizzadenesheli, K., Anandkumar, A.: signsgd: compressed optimisation for non-convex problems. In: ICML, pp. 560\u2013569 (2018)"},{"key":"38_CR3","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)"},{"key":"38_CR4","unstructured":"Cettolo, M., Niehues, J., St\u00fcker, S., Bentivogli, L., Federico, M.: Report on the 11th iwslt evaluation campaign, iwslt 2014. In: IWSLT, vol. 57 (2014)"},{"key":"38_CR5","unstructured":"Chen, J., Zhou, D., Tang, Y., Yang, Z., Gu, Q.: Closing the generalization gap of adaptive gradient methods in training deep neural networks. arXiv:1806.06763 (2018)"},{"key":"38_CR6","unstructured":"Chen, X., Liu, S., Sun, R., Hong, M.: On the convergence of a class of adam-type algorithms for non-convex optimization. In: ICLR (2019)"},{"key":"38_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: NAACL, pp. 4171\u20134186 (2019)"},{"key":"38_CR8","unstructured":"Duchi, J., Hazan, E., Singer, Y.: Adaptive subgradient methods for online learning and stochastic optimization. JMLR (2011)"},{"issue":"4","key":"38_CR9","doi-asserted-by":"publisher","first-page":"2341","DOI":"10.1137\/120880811","volume":"23","author":"S Ghadimi","year":"2013","unstructured":"Ghadimi, S., Lan, G.: Stochastic first-and zeroth-order methods for nonconvex stochastic programming. SIAM J. Optim. 23(4), 2341\u20132368 (2013)","journal-title":"SIAM J. Optim."},{"key":"38_CR10","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"38_CR11","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)"},{"key":"38_CR12","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: Bengio, Y., LeCun, Y. (eds.) ICLR (2015)"},{"key":"38_CR13","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: Albert: a lite bert for self-supervised learning of language representations. In: ICLR (2020)"},{"key":"38_CR14","unstructured":"Li, H., Xu, Z., Taylor, G., Studer, C., Goldstein, T.: Visualizing the loss landscape of neural nets. In: Advances in Neural Information Processing Systems, pp. 6389\u20136399 (2018)"},{"key":"38_CR15","unstructured":"Liu, L., et al.: On the variance of the adaptive learning rate and beyond. In: ICLR (2020)"},{"key":"38_CR16","doi-asserted-by":"crossref","unstructured":"Liu, L., Liu, X., Gao, J., Chen, W., Han, J.: Understanding the difficulty of training transformers. arXiv:2004.08249 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.463"},{"key":"38_CR17","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized bert pretraining approach. arXiv:1907.11692 (2019)"},{"key":"38_CR18","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2018)"},{"key":"38_CR19","unstructured":"Luo, L., Xiong, Y., Liu, Y., Sun, X.: Adaptive gradient methods with dynamic bound of learning rate. In: ICLR (2019)"},{"key":"38_CR20","unstructured":"Ma, J., Yarats, D.: On the adequacy of untuned warmup for adaptive optimization. arXiv:1910.04209 (2019)"},{"key":"38_CR21","doi-asserted-by":"crossref","unstructured":"Ott, M., Edunov, S., Grangier, D., Auli, M.: Scaling neural machine translation. In: WMT, pp. 1\u20139 (2018)","DOI":"10.18653\/v1\/W18-6301"},{"key":"38_CR22","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. arXiv:1910.10683 (2019)"},{"key":"38_CR23","unstructured":"Reddi, S.J., Kale, S., Kumar, S.: On the convergence of adam and beyond. In: ICLR (2018)"},{"key":"38_CR24","unstructured":"Schaul, T., Zhang, S., LeCun, Y.: No more pesky learning rates. In: ICML, pp. 343\u2013351 (2013)"},{"key":"38_CR25","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. JMLR (2014)"},{"key":"38_CR26","unstructured":"Tieleman, T., Hinton, G.: Lecture 6.5\u2013RmsProp: divide the gradient by a running average of its recent magnitude. COURSERA (2012)"},{"key":"38_CR27","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"38_CR28","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., Bowman, S.R.: Glue: a multi-task benchmark and analysis platform for natural language understanding. In: EMNLP (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"38_CR29","unstructured":"Wilson, A.C., Roelofs, R., Stern, M., Srebro, N., Recht, B.: The marginal value of adaptive gradient methods in machine learning. In: Neurips, pp. 4148\u20134158 (2017)"},{"key":"38_CR30","unstructured":"Wu, Y., Ren, M., Liao, R., Grosse, R.: Understanding short-horizon bias in stochastic meta-optimization. arXiv:1803.02021 (2018)"},{"key":"38_CR31","unstructured":"You, Y., Gitman, I., Ginsburg, B.: Scaling SGD batch size to 32k for imagenet training. CoRR abs\/1708.03888 (2017)"},{"key":"38_CR32","unstructured":"You, Y., et al.: Large batch optimization for deep learning: training bert in 76 minutes. In: ICLR (2020)"},{"key":"38_CR33","unstructured":"Zaheer, M., Reddi, S., Sachan, D., Kale, S., Kumar, S.: Adaptive methods for nonconvex optimization. In: NeurIPS, pp. 9793\u20139803 (2018)"},{"key":"38_CR34","unstructured":"Zeiler, M.D.: ADADELTA: an adaptive learning rate method. CoRR (2012)"},{"key":"38_CR35","unstructured":"Zhang, G., et al.: Which algorithmic choices matter at which batch sizes? insights from a noisy quadratic model. In: NeurIPS, pp. 8194\u20138205 (2019)"},{"key":"38_CR36","unstructured":"Zhang, J., et al.: Why are adaptive methods good for attention models? In: NeurIPS 33 (2020)"},{"key":"38_CR37","unstructured":"Zhang, M.R., Lucas, J., Ba, J., Hinton, G.E.: Lookahead optimizer: k steps forward, 1 step back. In: NeurIPS (2019)"},{"key":"38_CR38","unstructured":"Zhu, C., Cheng, Y., Gan, Z., Sun, S., Goldstein, T., Liu, J.: Freelb: enhanced adversarial training for natural language understanding. In: ICLR (2020)"},{"key":"38_CR39","unstructured":"Zhuang, J., et al.: Adabelief optimizer: adapting stepsizes by the belief in observed gradients. In: NeurIPS (2020)"},{"key":"38_CR40","unstructured":"Ziyin, L., Wang, Z.T., Ueda, M.: Laprop: a better way to combine momentum with adaptive gradient. arXiv:2002.04839 (2020)"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases. Research Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-86523-8_38","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T22:04:54Z","timestamp":1757455494000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-86523-8_38"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030865221","9783030865238"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-86523-8_38","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"11 September 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bilbao","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Spain","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2021.ecmlpkdd.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"869","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"210","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"24% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3-4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3-9","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held online due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}