{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T04:10:33Z","timestamp":1778213433305,"version":"3.51.4"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032236036","type":"print"},{"value":"9783032236043","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-23604-3_19","type":"book-chapter","created":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T03:22:58Z","timestamp":1778210578000},"page":"296-312","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Optimizing Transformers: Metaheuristics for\u00a0Attention Head Pruning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0956-2062","authenticated-orcid":false,"given":"Pedro","family":"Ferreira","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4754-2168","authenticated-orcid":false,"given":"Carlos Henggeler","family":"Antunes","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2387-5961","authenticated-orcid":false,"given":"Jorge","family":"Batista","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1087-7941","authenticated-orcid":false,"given":"Telmo","family":"Pinto","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,5,9]]},"reference":[{"issue":"3","key":"19_CR1","doi-asserted-by":"publisher","first-page":"269","DOI":"10.1109\/TEVC.2007.900837","volume":"12","author":"S Bandyopadhyay","year":"2008","unstructured":"Bandyopadhyay, S., Saha, S., Maulik, U., Deb, K.: A simulated annealing-based multiobjective optimization algorithm: AMOSA. IEEE Trans. Evol. Comput. 12(3), 269\u2013283 (2008). https:\/\/doi.org\/10.1109\/TEVC.2007.900837","journal-title":"IEEE Trans. Evol. Comput."},{"issue":"12","key":"19_CR2","doi-asserted-by":"publisher","first-page":"10558","DOI":"10.1109\/TPAMI.2024.3447085","volume":"46","author":"H Cheng","year":"2024","unstructured":"Cheng, H., Zhang, M., Shi, J.Q.: A survey on deep neural network pruning: taxonomy, comparison, analysis, and recommendations. IEEE Trans. Pattern Anal. Mach. Intell. 46(12), 10558\u201310578 (2024). https:\/\/doi.org\/10.1109\/TPAMI.2024.3447085","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"2","key":"19_CR3","doi-asserted-by":"publisher","first-page":"182","DOI":"10.1109\/4235.996017","volume":"6","author":"K Deb","year":"2002","unstructured":"Deb, K., Pratap, A., Agarwal, S., Meyarivan, T.: A fast and elitist multiobjective genetic algorithm: NSGA-II. IEEE Trans. Evol. Comput. 6(2), 182\u2013197 (2002). https:\/\/doi.org\/10.1109\/4235.996017","journal-title":"IEEE Trans. Evol. Comput."},{"key":"19_CR4","doi-asserted-by":"publisher","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, vol. 1 (long and short papers), pp. 4171\u20134186 (2019)","DOI":"10.18653\/v1\/N19-1423"},{"key":"19_CR6","unstructured":"Han, S., Mao, H., Dally, W.J.: Deep compression: compressing deep neural network with pruning, trained quantization and Huffman coding. In: ICLR (2016)"},{"key":"19_CR7","doi-asserted-by":"publisher","unstructured":"Han, S., Pool, J., Tran, J., Dally, W.: Learning both weights and connections for efficient neural network. In: Advances in Neural Information Processing Systems, vol. 28 (2015). https:\/\/doi.org\/10.48550\/arXiv.1506.02626","DOI":"10.48550\/arXiv.1506.02626"},{"key":"19_CR8","unstructured":"Hanson, S., Pratt, L.: Comparing biases for minimal network construction with back-propagation. In: Advances in Neural Information Processing Systems, vol. 1 (1988). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/1988\/file\/1c9ac0159c94d8d0cbedc973445af2da-Paper.pdf"},{"key":"19_CR9","doi-asserted-by":"publisher","unstructured":"Hatefi, S.M.V., Dreyer, M., Achtibat, R., Wiegand, T., Samek, W., Lapuschkin, S.: Pruning by explaining revisited: optimizing attribution methods to prune CNNs and transformers. In: ECCV 2024 Workshops, pp. 152\u2013169. Springer Nature Switzerland, Cham (2025). https:\/\/doi.org\/10.1007\/978-3-031-92648-8_10","DOI":"10.1007\/978-3-031-92648-8_10"},{"key":"19_CR10","doi-asserted-by":"publisher","unstructured":"He, Y., Zhang, X., Sun, J.: Channel pruning for accelerating very deep neural networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1389\u20131397 (2017). https:\/\/doi.org\/10.1109\/iccv.2017.155","DOI":"10.1109\/iccv.2017.155"},{"key":"19_CR11","doi-asserted-by":"publisher","unstructured":"Jin, X., Han, J.: K-Means Clustering, pp. 563\u2013564. Springer US, Boston, MA (2010). https:\/\/doi.org\/10.1007\/978-0-387-30164-8_425","DOI":"10.1007\/978-0-387-30164-8_425"},{"key":"19_CR12","doi-asserted-by":"publisher","unstructured":"Jo, J.y., Myaeng, S.H.: Roles and utilization of attention heads in transformer-based neural language models. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 3404\u20133417 (2020). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.311","DOI":"10.18653\/v1\/2020.acl-main.311"},{"key":"19_CR13","unstructured":"Kuo, C.L., Kuruoglu, E.E., Chan, W.K.V.: Network pruning optimization by simulated annealing algorithm (2022). https:\/\/openreview.net\/forum?id=2jYxq9_TkpG"},{"key":"19_CR14","unstructured":"Le\u00a0Cun, Y., Denker, J., Solla, S.: Optimal brain damage, advances in neural information processing systems. Denver 1989, Ed. D. Touretzsky, Morgan Kaufmann 598, 605 (1990). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/1989\/file\/6c9882bbac1c7093bd25041881277658-Paper.pdf"},{"key":"19_CR15","doi-asserted-by":"publisher","unstructured":"Li, H., Kadav, A., Durdanovic, I., Samet, H., Graf, H.P.: Pruning filters for efficient convnets. arXiv preprint (2016). https:\/\/doi.org\/10.48550\/arXiv.1608.08710","DOI":"10.48550\/arXiv.1608.08710"},{"key":"19_CR16","doi-asserted-by":"publisher","unstructured":"Li, Q., Zhang, B., Chu, X.: Eapruning: evolutionary pruning for vision transformers and CNNs. arXiv preprint (2022). https:\/\/doi.org\/10.48550\/arXiv.2210.00181","DOI":"10.48550\/arXiv.2210.00181"},{"key":"19_CR17","doi-asserted-by":"publisher","unstructured":"Luo, J.H., Wu, J., Lin, W.: ThiNet: a filter level pruning method for deep neural network compression. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2017). https:\/\/doi.org\/10.1109\/iccv.2017.541","DOI":"10.1109\/iccv.2017.541"},{"key":"19_CR18","doi-asserted-by":"publisher","unstructured":"Ma, W., Zhang, K., Lou, R., Wang, L., Vosoughi, S.: Contributions of transformer attention heads in multi-and cross-lingual tasks. In: Proceedings of the 59th Annual Meeting of the ACL and the 11th International Joint Conference on NLP, (vol. 1: Long Papers), pp. 1956\u20131966 (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.152","DOI":"10.18653\/v1\/2021.acl-long.152"},{"key":"19_CR19","doi-asserted-by":"publisher","unstructured":"Parnami, A., Singh, R., Joshi, T.: Pruning attention heads of transformer models using a* search: a novel approach to compress big NLP architectures. arXiv preprint (2021). https:\/\/doi.org\/10.48550\/arXiv.2110.15225","DOI":"10.48550\/arXiv.2110.15225"},{"key":"19_CR20","unstructured":"Prasetyo, Y., Yudistira, N., Widodo, A.W.: Sparse then prune: Toward efficient vision transformers. Available at SSRN 4529273"},{"key":"19_CR21","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020). http:\/\/jmlr.org\/papers\/v21\/20-074.html"},{"key":"19_CR22","doi-asserted-by":"publisher","unstructured":"Shao, H., Liu, B., Qian, Y.: One-shot sensitivity-aware mixed sparsity pruning for large language models. In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 11296\u201311300 (2024). https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10445737","DOI":"10.1109\/ICASSP48485.2024.10445737"},{"key":"19_CR23","doi-asserted-by":"publisher","unstructured":"Tang, Y., Wang, Y., Guo, J., Tu, Z., Han, K., Hu, H., Tao, D.: A survey on transformer compression (2024). https:\/\/doi.org\/10.48550\/arXiv.2402.05964","DOI":"10.48550\/arXiv.2402.05964"},{"key":"19_CR24","doi-asserted-by":"publisher","unstructured":"Touvron, H., Cord, M., J\u00e9gou, H.: DeiT iii: revenge of the ViT. In: European Conference on Computer Vision, pp. 516\u2013533. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20053-3_30","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"19_CR25","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"19_CR26","unstructured":"Wang, Z., et\u00a0al.: Patch diffusion: faster and more data-efficient training of diffusion models. In: Advances in Neural Information Processing Systems, vol. 36 (2024). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/e4667dd0a5a54b74019b72b677ed8ec1-Paper-Conference.pdf"},{"key":"19_CR27","doi-asserted-by":"publisher","unstructured":"Yang, C., An, Z., Li, C., Diao, B., Xu, Y.: Multi-objective pruning for CNNs using genetic algorithm. In: International Conference on Artificial Neural Networks, pp. 299\u2013305. Springer (2019). https:\/\/doi.org\/10.1007\/978-3-030-30484-3_25","DOI":"10.1007\/978-3-030-30484-3_25"},{"key":"19_CR28","doi-asserted-by":"publisher","unstructured":"Yang, H., Yin, H., Shen, M., Molchanov, P., Li, H., Kautz, J.: Global vision transformer pruning with hessian-aware saliency. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18547\u201318557 (2023). https:\/\/doi.org\/10.1109\/cvpr52729.2023.01779","DOI":"10.1109\/cvpr52729.2023.01779"},{"key":"19_CR29","doi-asserted-by":"publisher","unstructured":"Yenduri, G., et al.: Generative pre-trained transformer: a comprehensive review on enabling technologies, potential applications, emerging challenges, and future directions (2023). https:\/\/doi.org\/10.48550\/arXiv.2305.10435","DOI":"10.48550\/arXiv.2305.10435"},{"key":"19_CR30","unstructured":"You, Z., Yan, K., Ye, J., Ma, M., Wang, P.: Gate decorator: global filter pruning method for accelerating deep convolutional neural networks. In: Advances in Neural Information Processing Systems, vol. 32 (2019). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/b51a15f382ac914391a58850ab343b00-Paper.pdf"},{"key":"19_CR31","doi-asserted-by":"publisher","unstructured":"Zhao, C., Ni, B., Zhang, J., Zhao, Q., Zhang, W., Tian, Q.: Variational convolutional neural network pruning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2780\u20132789 (2019). https:\/\/doi.org\/10.1109\/cvpr.2019.00289","DOI":"10.1109\/cvpr.2019.00289"},{"key":"19_CR32","doi-asserted-by":"publisher","unstructured":"Zhu, M., Gupta, S.: To prune, or not to prune: exploring the efficacy of pruning for model compression (2017). https:\/\/doi.org\/10.48550\/arXiv.1710.01878","DOI":"10.48550\/arXiv.1710.01878"}],"container-title":["Lecture Notes in Computer Science","Applications of Evolutionary Computation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-23604-3_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T03:23:17Z","timestamp":1778210597000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-23604-3_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032236036","9783032236043"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-23604-3_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"9 May 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"EvoApplications","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on the Applications of Evolutionary Computation (Part of EvoStar)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Toulouse","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"France","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 April 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 April 2026","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"evoapplications2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.evostar.org\/2026\/evoapps\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}