{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T18:48:28Z","timestamp":1743014908616,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":51,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819757787"},{"type":"electronic","value":"9789819757794"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-5779-4_2","type":"book-chapter","created":{"date-parts":[[2025,1,10]],"date-time":"2025-01-10T07:15:24Z","timestamp":1736493324000},"page":"19-34","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Accelerating Training of\u00a0Large Neural Models by\u00a0Gradient-Based Growth Learning"],"prefix":"10.1007","author":[{"given":"Haowei","family":"Jiang","sequence":"first","affiliation":[]},{"given":"Jianxing","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Libin","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Huaijie","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Yin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,11]]},"reference":[{"key":"2_CR1","unstructured":"Ba, L.J., Kiros, J.R., Hinton, G.E.: Layer normalization. CoRR abs\/1607.06450 (2016)"},{"key":"2_CR2","doi-asserted-by":"crossref","unstructured":"Bengio, Y., Lamblin, P., Popovici, D., Larochelle, H.: Greedy layer-wise training of deep networks. In: NeurIPS, pp. 153\u2013160 (2006)","DOI":"10.7551\/mitpress\/7503.003.0024"},{"key":"2_CR3","doi-asserted-by":"crossref","unstructured":"Beyer, L., et al.: FlexiViT: one model for all patch sizes. In: CVPR, pp. 14496\u201314506 (2023)","DOI":"10.1109\/CVPR52729.2023.01393"},{"key":"2_CR4","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: NeurIPS, pp. 1877\u20131901 (2020)"},{"key":"2_CR5","doi-asserted-by":"crossref","unstructured":"Chen, C., et al.: bert2BERT: towards reusable pretrained language models. In: ACL, pp. 2134\u20132148 (2022)","DOI":"10.18653\/v1\/2022.acl-long.151"},{"key":"2_CR6","unstructured":"Chen, M., et al.: Towards understanding hierarchical learning: benefits of neural representations. In: NeurIPS, pp. 22134\u201322145 (2020)"},{"key":"2_CR7","unstructured":"Chen, T., Goodfellow, I.J., Shlens, J.: Net2Net: accelerating learning via knowledge transfer. In: ICLR, pp. 1\u201312 (2016)"},{"key":"2_CR8","unstructured":"Chowdhery, A., Narang, S., Devlin, J., et\u00a0al.: PaLM: scaling language modeling with pathways. CoRR abs\/2204.02311 (2022)"},{"key":"2_CR9","doi-asserted-by":"publisher","first-page":"1487","DOI":"10.1109\/TC.2019.2914438","volume":"68","author":"X Dai","year":"2019","unstructured":"Dai, X., Yin, H., Jha, N.K.: NeST: a neural network synthesis tool based on a grow-and-prune paradigm. IEEE Trans. Comput. 68, 1487\u20131497 (2019)","journal-title":"IEEE Trans. Comput."},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL, pp. 4171\u20134186 (2019)"},{"key":"2_CR12","unstructured":"Dong, C., Liu, L., Li, Z., Shang, J.: Towards adaptive residual network training: a neural-ODE perspective. In: ICML, pp. 2616\u20132626 (2020)"},{"key":"2_CR13","unstructured":"Driess, D., et al.: PaLM-E: an embodied multimodal language model. In: ICML, pp. 8469\u20138488 (2023)"},{"key":"2_CR14","unstructured":"Evci, U., Gale, T., Menick, J., Castro, P.S., Elsen, E.: Rigging the lottery: making all tickets winners. In: ICML, pp. 2943\u20132952 (2020)"},{"key":"2_CR15","unstructured":"Evci, U., van Merrienboer, B., Unterthiner, T., Pedregosa, F., Vladymyrov, M.: GradMax: growing neural networks using gradient information. In: ICLR, pp. 1\u201317 (2022)"},{"key":"2_CR16","first-page":"1","volume":"23","author":"W Fedus","year":"2022","unstructured":"Fedus, W., Zoph, B., Shazeer, N.: Switch transformers: scaling to trillion parameter models with simple and efficient sparsity. J. Mach. Learn. Res. 23, 1\u201339 (2022)","journal-title":"J. Mach. Learn. Res."},{"key":"2_CR17","unstructured":"Gong, L., He, D., Li, Z., Qin, T., Wang, L., Liu, T.: Efficient training of BERT by progressively stacking. In: ICML, pp. 2337\u20132346 (2019)"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: CVPR, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"2_CR20","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: ICML, pp. 448\u2013456 (2015)"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"Joshi, M., Choi, E., Weld, D.S., Zettlemoyer, L.: TriviaQA: a large scale distantly supervised challenge dataset for reading comprehension. In: ACL, pp. 1601\u20131611 (2017)","DOI":"10.18653\/v1\/P17-1147"},{"key":"2_CR22","unstructured":"Kaplan, J., et al.: Scaling laws for neural language models. CoRR abs\/2001.08361 (2020)"},{"key":"2_CR23","unstructured":"Karras, T., Aila, T., Laine, S., Lehtinen, J.: Progressive growing of GANs for improved quality, stability, and variation. In: ICLR, pp. 1\u201326 (2018)"},{"key":"2_CR24","unstructured":"Kim, H., Papamakarios, G., Mnih, A.: The Lipschitz constant of self-attention. In: ICML, pp. 5562\u20135571 (2021)"},{"key":"2_CR25","unstructured":"Krizhevsky, A., Hinton, G.: Learning multiple layers of features from tiny images (2009)"},{"key":"2_CR26","doi-asserted-by":"crossref","unstructured":"Li, C., Zhuang, B., Wang, G., Liang, X., Chang, X., Yang, Y.: Automated progressive learning for efficient training of Vision Transformers. In: CVPR, pp. 12476\u201312486 (2022)","DOI":"10.1109\/CVPR52688.2022.01216"},{"key":"2_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C., Feichtenhofer, C., Darrell, T., Xie, S.: A ConvNet for the 2020s. In: CVPR, pp. 11966\u201311976 (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"2_CR28","unstructured":"Loshchilov, I., Hutter, F.: SGDR: stochastic gradient descent with warm restarts. In: ICLR, pp. 1\u201316 (2017)"},{"key":"2_CR29","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR, pp. 1\u201319 (2019)"},{"key":"2_CR30","doi-asserted-by":"crossref","unstructured":"Ma, J., Yarats, D.: On the adequacy of untuned warmup for adaptive optimization. In: AAAI, pp. 8828\u20138836 (2021)","DOI":"10.1609\/aaai.v35i10.17069"},{"key":"2_CR31","unstructured":"Micikevicius, P., et al.: Mixed precision training. In: ICLR, pp. 1\u201312 (2018)"},{"key":"2_CR32","unstructured":"Radford, A., et\u00a0al.: Language models are unsupervised multitask learners. OpenAI blog (1), 1\u201324 (2019)"},{"key":"2_CR33","unstructured":"Rao, Y., Zhao, W., Tang, Y., Zhou, J., Lim, S., Lu, J.: HorNet: efficient high-order spatial interactions with recursive gated convolutions. In: NeurIPS, pp. 10353\u201310366 (2022)"},{"key":"2_CR34","doi-asserted-by":"crossref","unstructured":"Real, E., Aggarwal, A., Huang, Y., Le, Q.V.: Regularized evolution for image classifier architecture search. In: AAAI, pp. 4780\u20134789 (2019)","DOI":"10.1609\/aaai.v33i01.33014780"},{"key":"2_CR35","unstructured":"Rusu, A.A., et al.: Progressive neural networks. CoRR abs\/1606.04671 (2016)"},{"key":"2_CR36","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1007\/s41019-021-00167-z","volume":"6","author":"R Sarki","year":"2021","unstructured":"Sarki, R., Ahmed, K., Wang, H., Zhang, Y., Ma, J., Wang, K.: Image preprocessing in classification and identification of diabetic eye diseases. J. Data Sci. Eng. 6, 455\u2013471 (2021)","journal-title":"J. Data Sci. Eng."},{"key":"2_CR37","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. CoRR abs\/1707.06347 (2017)"},{"key":"2_CR38","unstructured":"Shen, S., Walsh, P., Keutzer, K., Dodge, J., Peters, M.E., Beltagy, I.: Staged training for Transformer language models. In: ICML, pp. 19893\u201319908 (2022)"},{"key":"2_CR39","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image Transformers & distillation through attention. In: ICML, pp. 10347\u201310357 (2021)"},{"key":"2_CR40","unstructured":"Wang, P., et al.: Learning to grow pretrained models for efficient Transformer training. In: ICLR, pp. 1\u201318 (2023)"},{"key":"2_CR41","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: InternImage: exploring large-scale vision foundation models with deformable convolutions. In: CVPR, pp. 14408\u201314419 (2023)","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"2_CR42","unstructured":"Wei, T., Wang, C., Rui, Y., Chen, C.W.: Network morphism. In: ICML, pp. 564\u2013572 (2016)"},{"key":"2_CR43","doi-asserted-by":"crossref","unstructured":"Wen, W., Yan, F., Chen, Y., Li, H.: AutoGrow: automatic layer growing in deep convolutional networks. In: KDD, pp. 833\u2013841 (2020)","DOI":"10.1145\/3394486.3403126"},{"key":"2_CR44","doi-asserted-by":"publisher","first-page":"253","DOI":"10.1007\/s41019-022-00188-2","volume":"7","author":"H Wu","year":"2022","unstructured":"Wu, H., Song, C., Ge, Y., Ge, T.: Link prediction on complex networks: an experimental survey. J. Data Sci. Eng. 7, 253\u2013278 (2022)","journal-title":"J. Data Sci. Eng."},{"key":"2_CR45","unstructured":"Wu, L., Liu, B., Stone, P., Liu, Q.: Firefly neural architecture descent: a general approach for growing neural networks. In: NeurIPS, pp. 22373\u201322383 (2020)"},{"key":"2_CR46","first-page":"725","volume":"35","author":"J Yu","year":"2023","unstructured":"Yu, J., Su, Q., Quan, X., Yin, J.: Multi-hop reasoning question generation and its application. IEEE Trans. Knowl. Data Eng. 35, 725\u2013740 (2023)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"2_CR47","doi-asserted-by":"crossref","unstructured":"Yu, J., Wang, S., et al.: Generating deep questions with commonsense reasoning ability from the text by disentangled adversarial inference. In: ACL, pp. 470\u2013486 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.30"},{"key":"2_CR48","doi-asserted-by":"crossref","unstructured":"Yu, J., Zha, Z., Yin, J.: Inferential machine comprehension: answering questions by recursively deducing the evidence chain from text. In: ACL, pp. 2241\u20132251 (2019)","DOI":"10.18653\/v1\/P19-1217"},{"key":"2_CR49","unstructured":"Yuan, X., Savarese, P.H.P., Maire, M.: Growing efficient deep networks by structured continuous sparsification. In: ICLR, pp. 1\u201318 (2021)"},{"key":"2_CR50","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1016\/j.comcom.2020.01.003","volume":"151","author":"Q Zhang","year":"2020","unstructured":"Zhang, Q., Yu, X.: GrowingNet: an end-to-end growing network for semi-supervised learning. Comput. Commun. 151, 208\u2013215 (2020)","journal-title":"Comput. Commun."},{"key":"2_CR51","doi-asserted-by":"crossref","unstructured":"Zoph, B., Vasudevan, V., Shlens, J., Le, Q.V.: Learning transferable architectures for scalable image recognition. In: CVPR, pp. 8697\u20138710 (2018)","DOI":"10.1109\/CVPR.2018.00907"}],"container-title":["Lecture Notes in Computer Science","Database Systems for Advanced Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-5779-4_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,10]],"date-time":"2025-01-10T08:03:46Z","timestamp":1736496226000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-5779-4_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819757787","9789819757794"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-5779-4_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"11 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DASFAA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Database Systems for Advanced Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Gifu","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dasfaa2024a","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dasfaa2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}