{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T15:26:09Z","timestamp":1743002769680,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031783944"},{"type":"electronic","value":"9783031783951"}],"license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78395-1_9","type":"book-chapter","created":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T09:35:46Z","timestamp":1733132146000},"page":"129-143","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Interpretable Visual Semantic Alignment via Spectral Attribution"],"prefix":"10.1007","author":[{"given":"Shivanvitha","family":"Ambati","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5571-839X","authenticated-orcid":false,"given":"Vineet","family":"Padmanabhan","sequence":"additional","affiliation":[]},{"given":"Wilson Naik","family":"Bhukya","sequence":"additional","affiliation":[]},{"given":"Rajendra","family":"Prasad Lal","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"9_CR1","doi-asserted-by":"publisher","unstructured":"Abnar, S., Zuidema, W.: Quantifying attention flow in transformers. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J. (eds.) Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. pp. 4190\u20134197. Association for Computational Linguistics, Online (Jul 2020). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.385, https:\/\/aclanthology.org\/2020.acl-main.385","DOI":"10.18653\/v1\/2020.acl-main.385"},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: Visual question answering. In: Proceedings of the IEEE international conference on computer vision. pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Bach, S., Binder, A., Montavon, G., Klauschen, F., M\u00fcller, K.R., Samek, W.: On pixel-wise explanations for non-linear classifier decisions by layer-wise relevance propagation. PLoS ONE 10 (2015), https:\/\/api.semanticscholar.org\/CorpusID:9327892","DOI":"10.1371\/journal.pone.0130140"},{"key":"9_CR4","doi-asserted-by":"publisher","unstructured":"Binder, A., Montavon, G., Lapuschkin, S., M\u00fcller, K.-R., Samek, W.: Layer-Wise Relevance Propagation for Neural Networks with Local Renormalization Layers. In: Villa, A.E.P., Masulli, P., Pons Rivero, A.J. (eds.) ICANN 2016. LNCS, vol. 9887, pp. 63\u201371. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-44781-0_8","DOI":"10.1007\/978-3-319-44781-0_8"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). pp. 397\u2013406 (October 2021)","DOI":"10.1109\/ICCV48922.2021.00045"},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Transformer interpretability beyond attention visualization. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 782\u2013791 (2021)","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"9_CR7","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: Burstein, J., Doran, C., Solorio, T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota (Jun 2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"9_CR8","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An image is worth 16x16 words: Transformers for image recognition at scale. ICLR (2021)"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Dou, Z.Y., Xu, Y., Gan, Z., Wang, J., Wang, S., Wang, L., Zhu, C., Zhang, P., Yuan, L., Peng, N., Liu, Z., Zeng, M.: An empirical study of training end-to-end vision-and-language transformers. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2022), https:\/\/arxiv.org\/abs\/2111.02387","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 580\u2013587 (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"9_CR11","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR12","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: Vision-and-language transformer without convolution or region supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 5583\u20135594. PMLR (18\u201324 Jul 2021), http:\/\/proceedings.mlr.press\/v139\/kim21k.html"},{"issue":"1","key":"9_CR13","doi-asserted-by":"publisher","first-page":"1096","DOI":"10.1038\/s41467-019-08987-4","volume":"10","author":"S Lapuschkin","year":"2019","unstructured":"Lapuschkin, S., W\u00e4ldchen, S., Binder, A., Montavon, G., Samek, W., M\u00fcller, K.R.: Unmasking clever hans predictors and assessing what machines really learn. Nat. Commun. 10(1), 1096 (2019)","journal-title":"Nat. Commun."},{"key":"9_CR14","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022)"},{"key":"9_CR15","unstructured":"Li, J., Selvaraju, R.R., Gotmare, A.D., Joty, S., Xiong, C., Hoi, S.: Align before fuse: Vision and language representation learning with momentum distillation. In: NeurIPS (2021)"},{"key":"9_CR16","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"9_CR17","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., Stoyanov, V.: Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"9_CR18","doi-asserted-by":"publisher","unstructured":"Lyu, Y., Liang, P.P., Deng, Z., Salakhutdinov, R., Morency, L.P.: Dime: Fine-grained interpretations of multimodal models via disentangled local explanations. In: Proceedings of the 2022 AAAI\/ACM Conference on AI, Ethics, and Society. p. 455\u2013467. AIES \u201922, Association for Computing Machinery, New York, NY, USA (2022). https:\/\/doi.org\/10.1145\/3514094.3534148, https:\/\/doi.org\/10.1145\/3514094.3534148","DOI":"10.1145\/3514094.3534148"},{"key":"9_CR19","doi-asserted-by":"crossref","unstructured":"Melas-Kyriazi, L., Rupprecht, C., Laina, I., Vedaldi, A.: Deep spectral methods: A surprisingly strong baseline for unsupervised semantic segmentation and localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 8364\u20138375 (2022)","DOI":"10.1109\/CVPR52688.2022.00818"},{"key":"9_CR20","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"9_CR21","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I.: Improving language understanding by generative pre-training. OpenAI (2018), https:\/\/www.cs.ubc.ca\/~amuham01\/LING530\/papers\/radford2018improving.pdf"},{"key":"9_CR22","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)"},{"key":"9_CR23","doi-asserted-by":"publisher","unstructured":"Saeed, W., Omlin, C.: Explainable ai (xai): A systematic meta-survey of current challenges and future opportunities. Know.-Based Syst. 263(C) (mar 2023). https:\/\/doi.org\/10.1016\/j.knosys.2023.110273, https:\/\/doi.org\/10.1016\/j.knosys.2023.110273","DOI":"10.1016\/j.knosys.2023.110273"},{"key":"9_CR24","doi-asserted-by":"publisher","unstructured":"Samek, W., Montavon, G., Vedaldi, A., Hansen, L.K., M\u00fcller, K.-R. (eds.): Explainable AI: Interpreting, Explaining and Visualizing Deep Learning. LNCS (LNAI), vol. 11700. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-28954-6","DOI":"10.1007\/978-3-030-28954-6"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: Visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE international conference on computer vision. pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"9_CR26","doi-asserted-by":"publisher","unstructured":"Shao, H., Mesbahi, M.: On the fiedler vector of graphs and its application in consensus networks. Proceedings of the American Control Conference 2015, 1734\u20131739 (07 2015). https:\/\/doi.org\/10.1109\/ACC.2015.7170983","DOI":"10.1109\/ACC.2015.7170983"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Shi, B., Darrell, T., Wang, X.: Top-down visual attention from analysis by synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2102\u20132112 (2023)","DOI":"10.1109\/CVPR52729.2023.00209"},{"key":"9_CR28","doi-asserted-by":"publisher","unstructured":"Spielman, D.A.: Spectral graph theory and its applications. In: Proceedings of the 48th Annual IEEE Symposium on Foundations of Computer Science. p. 29\u201338. FOCS \u201907, IEEE Computer Society, USA (2007). https:\/\/doi.org\/10.1109\/FOCS.2007.66","DOI":"10.1109\/FOCS.2007.66"},{"key":"9_CR29","doi-asserted-by":"publisher","unstructured":"Tan, H., Bansal, M.: LXMERT: Learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). pp. 5100\u20135111. Association for Computational Linguistics, Hong Kong, China (Nov 2019). https:\/\/doi.org\/10.18653\/v1\/D19-1514, https:\/\/aclanthology.org\/D19-1514","DOI":"10.18653\/v1\/D19-1514"},{"key":"9_CR30","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L.u., Polosukhin, I.: Attention is all you need. In: Guyon, I., Luxburg, U.V., Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., Garnett, R. (eds.) Advances in Neural Information Processing Systems. vol.\u00a030. Curran Associates, Inc. (2017), https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78395-1_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,14]],"date-time":"2025-03-14T17:40:30Z","timestamp":1741974030000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78395-1_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"ISBN":["9783031783944","9783031783951"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78395-1_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"3 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}