{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T03:29:36Z","timestamp":1769743776260,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556953","type":"print"},{"value":"9789819556960","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5696-0_7","type":"book-chapter","created":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T14:03:45Z","timestamp":1769695425000},"page":"92-105","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Scale-aware Multi-head Attention with\u00a0Explainability for\u00a0Image Captioning"],"prefix":"10.1007","author":[{"given":"Yuanzhen","family":"Guo","sequence":"first","affiliation":[]},{"given":"Xiaodan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Aozhe","family":"Jia","sequence":"additional","affiliation":[]},{"given":"Boyue","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,30]]},"reference":[{"key":"7_CR1","unstructured":"Achtibat, R., Hatefi, S.M.V., Dreyer, M., Jain, A., Wiegand, T., Lapuschkin, S., Samek, W.: Attnlrp: attention-aware layer-wise relevance propagation for transformers. arXiv preprint arXiv:2402.05602 (2024)"},{"key":"7_CR2","doi-asserted-by":"publisher","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"7_CR4","unstructured":"Banerjee, S., Lavie, A.: METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: ACL, pp. 65\u201372 (2005)"},{"key":"7_CR5","doi-asserted-by":"crossref","unstructured":"Barkan, O., et al.: Grad-sam: explaining transformers via gradient self-attention maps. In: Proceedings of the 30th ACM International Conference on Information & Knowledge Management, pp. 2882\u20132887 (2021)","DOI":"10.1145\/3459637.3482126"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 397\u2013406 (2021)","DOI":"10.1109\/ICCV48922.2021.00045"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Transformer interpretability beyond attention visualization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 782\u2013791 (2021)","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"7_CR8","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Clark, K., Khandelwal, U., Levy, O., Manning, C.D.: What does BERT look at? an analysis of bert\u2019s attention. In: BlackboxNLP@ACL, pp. 276\u2013286 (2019)","DOI":"10.18653\/v1\/W19-4828"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: CVPR, pp. 10575\u201310584 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Zhu, X., Yao, P., Lu, S., Lu, H.: Normalized and geometry-aware self-attention network for image captioning. In: CVPR, pp. 10324\u201310333 (2020)","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"7_CR12","unstructured":"Herdade, S., Kappeler, A., Boakye, K., Soares, J.: Image captioning: transforming objects into words. In: NeurIPS, pp. 11135\u201311145 (2019)"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.: Attention on attention for image captioning. In: ICCV, pp. 4633\u20134642 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"7_CR14","doi-asserted-by":"crossref","unstructured":"Ji, J., et al.: Improving image captioning by leveraging intra- and inter-layer global representation in transformer network. In: AAAI, pp. 1655\u20131663 (2021)","DOI":"10.1609\/aaai.v35i2.16258"},{"key":"7_CR15","doi-asserted-by":"publisher","first-page":"7615","DOI":"10.1109\/TIP.2020.3004729","volume":"29","author":"J Ji","year":"2020","unstructured":"Ji, J., Xu, C., Zhang, X., Wang, B., Song, X.: Spatio-temporal memory attention for image captioning. IEEE Trans. Image Process. 29, 7615\u20137628 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Jiang, H., Misra, I., Rohrbach, M., Learned-Miller, E.G., Chen, X.: In defense of grid features for visual question answering. In: CVPR, pp. 10264\u201310273 (2020)","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"7_CR18","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Li, Y., Pan, Y., Yao, T., Mei, T.: Comprehending and ordering semantics for image captioning. In: CVPR, pp. 17969\u201317978 (2022)","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"7_CR20","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"7_CR21","doi-asserted-by":"crossref","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Neural baby talk. In: CVPR, pp. 7219\u20137228 (2018)","DOI":"10.1109\/CVPR.2018.00754"},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Luo, Y., Ji, J., Sun, X., Cao, L., Wu, Y., Huang, F., Lin, C., Ji, R.: Dual-level collaborative transformer for image captioning. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"7_CR23","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T.: X-linear attention networks for image captioning. In: CVPR, pp. 10968\u201310977 (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"7_CR24","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.: BLEU: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"7_CR25","first-page":"5052","volume":"35","author":"Y Qiang","year":"2022","unstructured":"Qiang, Y., Pan, D., Li, C., Li, X., Jang, R., Zhu, D.: Attcat: explaining transformers via attentive class activation tokens. Adv. Neural. Inf. Process. Syst. 35, 5052\u20135064 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR26","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"7_CR27","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: Cider: consensus-based image description evaluation. In: CVPR, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Wu, M., et al.: DifNet: boosting visual information flow for image captioning. In: CVPR, pp. 17999\u201318008 (2022)","DOI":"10.1109\/CVPR52688.2022.01749"},{"key":"7_CR29","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML. vol.\u00a037, pp. 2048\u20132057 (2015)"},{"key":"7_CR30","unstructured":"Yang, Y., Teo, C., Daum\u00e9\u00a0III, H., Aloimonos, Y.: Corpus-guided sentence generation of natural images. In: EMNLP, pp. 444\u2013454 (Jul 2011)"},{"issue":"12","key":"7_CR31","doi-asserted-by":"publisher","first-page":"4467","DOI":"10.1109\/TCSVT.2019.2947482","volume":"30","author":"J Yu","year":"2019","unstructured":"Yu, J., Li, J., Yu, Z., Huang, Q.: Multimodal transformer with multi-view visual representation for image captioning. IEEE Trans. Circuits Syst. Video Technol. 30(12), 4467\u20134480 (2019)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"7_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, X., et al.: RSTNET: captioning with adaptive attention on visual and non-visual words. In: CVPR, pp. 15465\u201315474 (2021)","DOI":"10.1109\/CVPR46437.2021.01521"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5696-0_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T14:03:55Z","timestamp":1769695435000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5696-0_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556953","9789819556960"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5696-0_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"30 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}