{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T08:00:26Z","timestamp":1769587226768,"version":"3.49.0"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031200496","type":"print"},{"value":"9783031200502","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20050-2_23","type":"book-chapter","created":{"date-parts":[[2022,10,27]],"date-time":"2022-10-27T22:09:58Z","timestamp":1666908598000},"page":"389-405","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Scalable Learning to\u00a0Optimize: A Learned Optimizer Can Train Big Models"],"prefix":"10.1007","author":[{"given":"Xuxi","family":"Chen","sequence":"first","affiliation":[]},{"given":"Tianlong","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Weizhu","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Ahmed","family":"Awadallah","sequence":"additional","affiliation":[]},{"given":"Zhangyang","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,28]]},"reference":[{"key":"23_CR1","unstructured":"Andrychowicz, M., et al.: Learning to learn by gradient descent by gradient descent. In: Advances in Neural Information Processing Systems (NeurIPS) (2016)"},{"key":"23_CR2","unstructured":"Bello, I., Zoph, B., Vasudevan, V., Le, Q.V.: Neural optimizer search with reinforcement learning. In: Precup, D., Teh, Y.W. (eds.) Proceedings of the 34th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 70, pp. 459\u2013468. PMLR, 06\u201311 Aug 2017. https:\/\/proceedings.mlr.press\/v70\/bello17a.html"},{"key":"23_CR3","unstructured":"Cao, Y., Chen, T., Wang, Z., Shen, Y.: Learning to optimize in swarms. In: Advances in Neural Information Processing Systems (NeurIPS), pp. 15018\u201315028 (2019)"},{"key":"23_CR4","unstructured":"Chen, T., Chen, X., Chen, W., Heaton, H., Liu, J., Wang, Z., Yin, W.: Learning to optimize: a primer and a benchmark. arXiv preprint arXiv:2103.12828 (2021)"},{"key":"23_CR5","unstructured":"Chen, T., Zhang, W., Zhou, J., Chang, S., Liu, S., Amini, L., Wang, Z.: Training stronger baselines for learning to optimize. arXiv preprint arXiv:2010.09089 (2020)"},{"key":"23_CR6","unstructured":"Chen, W., Yu, Z., Wang, Z., Anandkumar, A.: Automated synthetic-to-real generalization. In: International Conference on Machine Learning (ICML). pp. 1746\u20131756 (2020)"},{"key":"23_CR7","unstructured":"Chen, X., et al.: Self-PU: self boosted and calibrated positive-unlabeled training. In: International Conference on Machine Learning (ICML), pp. 1510\u20131519 (2020)"},{"key":"23_CR8","unstructured":"Chen, Y., et al.: Learning to learn without gradient descent by gradient descent. In: International Conference on Machine Learning (ICML), pp. 748\u2013756 (2017)"},{"key":"23_CR9","unstructured":"Cranmer, M.: PYSR: Fast & parallelized symbolic regression in python\/Julia (2020)"},{"key":"23_CR10","doi-asserted-by":"publisher","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"23_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\times $$16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"issue":"7","key":"23_CR12","first-page":"1","volume":"12","author":"J Duchi","year":"2011","unstructured":"Duchi, J., Hazan, E., Singer, Y.: Adaptive subgradient methods for online learning and stochastic optimization. J. Mach. Learn. Res. 12(7), 1\u201339 (2011)","journal-title":"J. Mach. Learn. Res."},{"key":"23_CR13","doi-asserted-by":"publisher","unstructured":"Du\u0161ek, O., Howcroft, D.M., Rieser, V.: Semantic noise matters for neural natural language generation. In: Proceedings of the 12th International Conference on Natural Language Generation, pp. 421\u2013426. Association for Computational Linguistics, Tokyo, Japan, October\u2013November 2019. https:\/\/doi.org\/10.18653\/v1\/W19-8652, https:\/\/www.aclweb.org\/anthology\/W19-8652","DOI":"10.18653\/v1\/W19-8652"},{"key":"23_CR14","unstructured":"Gressmann, F., Eaton-Rosen, Z., Luschi, C.: Improving neural network training in low dimensional random bases. arXiv preprint arXiv:2011.04720 (2020)"},{"key":"23_CR15","unstructured":"Gur-Ari, G., Roberts, D.A., Dyer, E.: Gradient descent happens in a tiny subspace. arXiv preprint arXiv:1812.04754 (2018)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Hecht-Nielsen, R.: Theory of the backpropagation neural network. In: Neural Networks For Perception, pp. 65\u201393. Elsevier (1992)","DOI":"10.1016\/B978-0-12-741252-8.50010-8"},{"issue":"8","key":"23_CR18","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"23_CR19","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Chen, W.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Jaderberg, M., Vedaldi, A., Zisserman, A.: Speeding up convolutional neural networks with low rank expansions. In: Proceedings of the British Machine Vision Conference. BMVA Press (2014)","DOI":"10.5244\/C.28.88"},{"key":"23_CR21","unstructured":"Jiang, H., Chen, Z., Shi, Y., Dai, B., Zhao, T.: Learning to defense by learning to attack. arXiv preprint arXiv:1811.01213 (2018)"},{"key":"23_CR22","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"23_CR23","unstructured":"Krizhevsky, A., et al.: Learning multiple layers of features from tiny images (2009)"},{"key":"23_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"500","DOI":"10.1007\/978-3-030-58545-7_29","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C Li","year":"2020","unstructured":"Li, C., Chen, T., You, H., Wang, Z., Lin, Y.: HALO: hardware-aware learning to optimize. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12354, pp. 500\u2013518. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58545-7_29"},{"key":"23_CR25","unstructured":"Li, C., Farkhoor, H., Liu, R., Yosinski, J.: Measuring the intrinsic dimension of objective landscapes. arXiv preprint arXiv:1804.08838 (2018)"},{"key":"23_CR26","first-page":"1","volume":"31","author":"H Li","year":"2018","unstructured":"Li, H., Xu, Z., Taylor, G., Studer, C., Goldstein, T.: Visualizing the loss landscape of neural nets. Adv. Neural. Inf. Process. Syst. 31, 1\u201311 (2018)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR27","unstructured":"Li, K., Malik, J.: Learning to optimize. arXiv preprint arXiv:1606.01885 (2016)"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Li, T., Tan, L., Tao, Q., Liu, Y., Huang, X.: Low dimensional landscape hypothesis is true: DNNs can be trained in tiny subspaces (2021)","DOI":"10.1109\/TPAMI.2022.3178101"},{"key":"23_CR29","unstructured":"Lv, K., Jiang, S., Li, J.: Learning gradient descent: better generalization and longer horizons. In: International Conference on Machine Learning (ICML), pp. 2247\u20132255 (2017)"},{"key":"23_CR30","unstructured":"Metz, L., Maheswaranathan, N., Nixon, J., Freeman, D., Sohl-Dickstein, J.: Understanding and correcting pathologies in the training of learned optimizers. In: International Conference on Machine Learning, pp. 4556\u20134565. PMLR (2019)"},{"key":"23_CR31","unstructured":"Oymak, S., Fabian, Z., Li, M., Soltanolkotabi, M.: Generalization guarantees for neural networks via harnessing the low-rank structure of the Jacobean. arXiv preprint arXiv:1906.05392 (2019)"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Povey, D., et al.: Semi-orthogonal low-rank matrix factorization for deep neural networks. In: Interspeech, pp. 3743\u20133747 (2018)","DOI":"10.21437\/Interspeech.2018-1417"},{"issue":"1","key":"23_CR33","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1016\/S0893-6080(98)00116-6","volume":"12","author":"N Qian","year":"1999","unstructured":"Qian, N.: On the momentum term in gradient descent learning algorithms. Neural Netw. 12(1), 145\u2013151 (1999)","journal-title":"Neural Netw."},{"key":"23_CR34","first-page":"9","volume":"8","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners. OpenAI blog. 8, 9 (2019)","journal-title":"OpenAI blog."},{"key":"23_CR35","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"HE Robbins","year":"2007","unstructured":"Robbins, H.E.: A stochastic approximation method. Ann. Math. Stat. 22, 400\u2013407 (2007)","journal-title":"Ann. Math. Stat."},{"key":"23_CR36","doi-asserted-by":"crossref","unstructured":"Sainath, T.N., Kingsbury, B., Sindhwani, V., Arisoy, E., Ramabhadran, B.: Low-rank matrix factorization for deep neural network training with high-dimensional output targets. In: 2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 6655\u20136659. IEEE (2013)","DOI":"10.1109\/ICASSP.2013.6638949"},{"key":"23_CR37","unstructured":"Shen, J., Chen, X., Heaton, H., Chen, T., Liu, J., Yin, W., Wang, Z.: Learning a minimax optimizer: a pilot study. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"23_CR38","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"23_CR39","unstructured":"Sohl-Dickstein, J., Poole, B., Ganguli, S.: Fast large-scale optimization by unifying stochastic gradient and quasi-newton methods. In: International Conference on Machine Learning, pp. 604\u2013612. PMLR (2014)"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"23_CR41","unstructured":"Tieleman, T., Hinton, G.: Lecture 6.5\u2013RMSProp: divide the gradient by a running average of its recent magnitude. COURSERA: Neural Networks for Machine Learning (2012)"},{"key":"23_CR42","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357. PMLR (2021)"},{"key":"23_CR43","unstructured":"Tuddenham, M., Pr\u00fcgel-Bennett, A., Hare, J.: Quasi-newton\u2019s method in the class gradient defined high-curvature subspace. arXiv preprint arXiv:2012.01938 (2020)"},{"key":"23_CR44","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"23_CR45","unstructured":"Vicol, P., Metz, L., Sohl-Dickstein, J.: Unbiased gradient estimation in unrolled computation graphs with persistent evolution strategies. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 139, pp. 10553\u201310563. PMLR, 18\u201324 July 2021. https:\/\/proceedings.mlr.press\/v139\/vicol21a.html"},{"key":"23_CR46","doi-asserted-by":"crossref","unstructured":"Wang, Z., Wohlwend, J., Lei, T.: Structured pruning of large language models. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 6151\u20136162 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.496"},{"issue":"10","key":"23_CR47","doi-asserted-by":"publisher","first-page":"1550","DOI":"10.1109\/5.58337","volume":"78","author":"PJ Werbos","year":"1990","unstructured":"Werbos, P.J.: Backpropagation through time: what it does and how to do it. Proc. IEEE 78(10), 1550\u20131560 (1990)","journal-title":"Proc. IEEE"},{"key":"23_CR48","unstructured":"Wichrowska, O., et al.: Learned optimizers that scale and generalize. In: International Conference on Machine Learning (ICML) (2017)"},{"key":"23_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1007\/978-3-030-58598-3_6","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Xiong","year":"2020","unstructured":"Xiong, Y., Hsieh, C.-J.: Improved adversarial training via learned optimizer. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12353, pp. 85\u2013100. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58598-3_6"},{"key":"23_CR50","unstructured":"Yin, M., Tucker, G., Zhou, M., Levine, S., Finn, C.: Meta-learning without memorization. arXiv preprint arXiv:1912.03820 (2019)"},{"key":"23_CR51","doi-asserted-by":"crossref","unstructured":"You, Y., Chen, T., Wang, Z., Shen, Y.: L2-GCN: layer-wise and learned efficient training of graph convolutional networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2127\u20132135 (2020)","DOI":"10.1109\/CVPR42600.2020.00220"},{"key":"23_CR52","doi-asserted-by":"crossref","unstructured":"Yu, X., Liu, T., Wang, X., Tao, D.: On compressing deep models by low rank and sparse decomposition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7370\u20137379 (2017)","DOI":"10.1109\/CVPR.2017.15"},{"key":"23_CR53","first-page":"2707","volume":"34","author":"S Zhang","year":"2021","unstructured":"Zhang, S., Wang, M., Liu, S., Chen, P.Y., Xiong, J.: Why lottery ticket wins? A theoretical perspective of sample complexity on sparse neural networks. Adv. Neural. Inf. Process. Syst. 34, 2707\u20132720 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR54","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Chuangsuwanich, E., Glass, J.: Extracting deep neural network bottleneck features using low-rank matrix factorization. In: 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 185\u2013189. IEEE (2014)","DOI":"10.1109\/ICASSP.2014.6853583"},{"key":"23_CR55","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Li, J., Gong, Y.: Low-rank plus diagonal adaptation for deep neural networks. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5005\u20135009. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472630"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20050-2_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,27]],"date-time":"2022-10-27T22:26:36Z","timestamp":1666909596000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20050-2_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200496","9783031200502"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20050-2_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"28 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}