{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T08:56:22Z","timestamp":1743065782257,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":22,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819722525"},{"type":"electronic","value":"9789819722532"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-2253-2_13","type":"book-chapter","created":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T10:02:11Z","timestamp":1713952931000},"page":"158-170","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Layer-Wise Sparse Training of\u00a0Transformer via\u00a0Convolutional Flood Filling"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0175-0753","authenticated-orcid":false,"given":"Bokyeong","family":"Yoon","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4540-2117","authenticated-orcid":false,"given":"Yoonsang","family":"Han","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4992-6181","authenticated-orcid":false,"given":"Gordon Euhyun","family":"Moon","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,4,25]]},"reference":[{"key":"13_CR1","doi-asserted-by":"crossref","unstructured":"Ainslie, J., Ontanon, S., et\u00a0al.: Etc: encoding long and structured inputs in transformers. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 268\u2013284 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.19"},{"key":"13_CR2","unstructured":"Beltagy, I., Peters, M.E., Cohan, A.: Longformer: the long-document transformer. arXiv preprint arXiv:2004.05150 (2020)"},{"key":"13_CR3","unstructured":"Child, R., Gray, S., Radford, A., Sutskever, I.: Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509 (2019)"},{"key":"13_CR4","doi-asserted-by":"crossref","unstructured":"Condevaux, C., Harispe, S.: Lsg attention: extrapolation of pretrained transformers to long sequences. In: Proceedings of the Pacific-Asia Conference on Knowledge Discovery and Data Mining, pp. 443\u2013454 (2023)","DOI":"10.1007\/978-3-031-33374-3_35"},{"key":"13_CR5","unstructured":"Devlin, J., Chang, M.W., et\u00a0al.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (2019)"},{"key":"13_CR6","unstructured":"Dosovitskiy, A., Beyer, L., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Goldman, R.: Graphics gems, p.\u00a0304 (1990)","DOI":"10.1016\/B978-0-08-050753-8.50064-4"},{"key":"13_CR8","unstructured":"iNaturalist 2018 competition dataset. (2018)"},{"key":"13_CR9","unstructured":"Kitaev, N., Kaiser, \u0141., Levskaya, A.: Reformer: the efficient transformer. In: Proceedings of the International Conference on Learning Representations (2020)"},{"key":"13_CR10","unstructured":"Krizhevsky, A., Hinton, G., et\u00a0al.: Learning multiple layers of features from tiny images (2009)"},{"key":"13_CR11","unstructured":"Liu, Y., Ott, M., et\u00a0al.: Roberta: a robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Nangia, N., Bowman, S.R.: Listops: a diagnostic dataset for latent tree learning. arXiv preprint arXiv:1804.06028 (2018)","DOI":"10.18653\/v1\/N18-4013"},{"key":"13_CR13","unstructured":"Nvidia: the api reference guide for cusparse, the cuda sparse matrix library. Technical report (2023). https:\/\/docs.nvidia.com\/cuda\/cusparse\/index.html"},{"key":"13_CR14","doi-asserted-by":"crossref","unstructured":"Qiu, J., Ma, H., et\u00a0al.: Blockwise self-attention for long document understanding. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 2555\u20132565 (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.232"},{"key":"13_CR15","doi-asserted-by":"publisher","first-page":"919","DOI":"10.1007\/s10579-012-9211-2","volume":"47","author":"DR Radev","year":"2013","unstructured":"Radev, D.R., Muthukrishnan, P., et al.: The ACL anthology network corpus. Lang. Res. Eval. 47, 919\u2013944 (2013)","journal-title":"Lang. Res. Eval."},{"key":"13_CR16","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1162\/tacl_a_00353","volume":"9","author":"A Roy","year":"2021","unstructured":"Roy, A., Saffar, M., et al.: Efficient content-based sparse attention with routing transformers. Trans. Assoc. Comput. Linguist. 9, 53\u201368 (2021)","journal-title":"Trans. Assoc. Comput. Linguist."},{"issue":"6","key":"13_CR17","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3530811","volume":"55","author":"Y Tay","year":"2022","unstructured":"Tay, Y., Dehghani, M., et al.: Efficient transformers: a survey. ACM Comput. Surv. 55(6), 1\u201328 (2022)","journal-title":"ACM Comput. Surv."},{"key":"13_CR18","unstructured":"Vaswani, A., Shazeer, N., et\u00a0al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"13_CR19","unstructured":"Wang, S., Li, B.Z., et\u00a0al.: Linformer: self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)"},{"key":"13_CR20","first-page":"17283","volume":"33","author":"M Zaheer","year":"2020","unstructured":"Zaheer, M., Guruganesh, G., et al.: Big bird: transformers for longer sequences. Adv. Neural. Inf. Process. Syst. 33, 17283\u201317297 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"13_CR21","unstructured":"Zhang, H., Gong, Y., et\u00a0al.: Poolingformer: long document modeling with pooling attention. In: International Conference on Machine Learning, pp. 12437\u201312446. PMLR (2021)"},{"key":"13_CR22","unstructured":"Zhang, X., Zhao, J., LeCun, Y.: Character-level convolutional networks for text classification. Adv. Neural Inf. Process. Syst. 28 (2015)"}],"container-title":["Lecture Notes in Computer Science","Advances in Knowledge Discovery and Data Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-2253-2_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T23:13:00Z","timestamp":1714000380000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-2253-2_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819722525","9789819722532"],"references-count":22,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-2253-2_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"25 April 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PAKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific-Asia Conference on Knowledge Discovery and Data Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taipei","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taiwan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 May 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 May 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pakdd2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/pakdd2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}