{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T22:58:46Z","timestamp":1781045926941,"version":"3.54.1"},"publisher-location":"Singapore","reference-count":38,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819572502","type":"print"},{"value":"9789819572519","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-7251-9_21","type":"book-chapter","created":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T22:27:27Z","timestamp":1781044047000},"page":"309-323","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MAGIC-Enhanced Keyword Prompting for\u00a0Zero-Shot Audio Captioning with\u00a0CLIP Models"],"prefix":"10.1007","author":[{"given":"Vijay","family":"Govindarajan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pratik","family":"Patel","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sahil","family":"Tripathi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Md Azizul","family":"Hoque","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gautam Siddharth","family":"Kashyap","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,5,1]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Bhosale, S., Nag, S., Kanojia, D., Deng, J., Zhu, X.: Diffsed: sound event detection with denoising diffusion. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 792\u2013800 (2024)","DOI":"10.1609\/aaai.v38i2.27837"},{"key":"21_CR2","doi-asserted-by":"crossref","unstructured":"Chen, C., Hou, N., Hu, Y., Zou, H., Qi, X., Chng, E.S.: Interactive audio-text representation for automated audio captioning with contrastive learning. arXiv preprint arXiv:2203.15526 (2022)","DOI":"10.21437\/Interspeech.2022-10510"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Chen, K., Du, X., Zhu, B., Ma, Z., Berg-Kirkpatrick, T., Dubnov, S.: Hts-at: A hierarchical token-semantic audio transformer for sound classification and detection. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 646\u2013650. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"21_CR4","unstructured":"Chen, K., et al.: Audio captioning based on transformer and pre-trained CNN. In: DCASE, pp. 21\u201325 (2020)"},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Z., Shao, Y.F., Ma, Y., Wei, M., Zhang, L., Zhang, W.Q.: Improving acoustic scene classification in low-resource conditions. In: ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE (2025)","DOI":"10.1109\/ICASSP49660.2025.10888928"},{"key":"21_CR6","doi-asserted-by":"crossref","unstructured":"Drossos, K., Adavanne, S., Virtanen, T.: Automated audio captioning with recurrent neural networks. In: 2017 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), pp. 374\u2013378. IEEE (2017)","DOI":"10.1109\/WASPAA.2017.8170058"},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Drossos, K., Lipping, S., Virtanen, T.: Clotho: an audio captioning dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 736\u2013740. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Elizalde, B., Deshmukh, S., Al\u00a0Ismail, M., Wang, H.: Clap learning audio concepts from natural language supervision. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"21_CR9","doi-asserted-by":"publisher","first-page":"4983","DOI":"10.1109\/ACCESS.2023.3235733","volume":"11","author":"A\u00d6 Eren","year":"2023","unstructured":"Eren, A.\u00d6., Sert, M.: Automated audio captioning with topic modeling. IEEE Access 11, 4983\u20134991 (2023)","journal-title":"IEEE Access"},{"key":"21_CR10","doi-asserted-by":"crossref","unstructured":"Fischer, J., Orescanin, M., Eckstrand, E.: Vi-pann: harnessing transfer learning and uncertainty-aware variational inference for improved generalization in audio pattern recognition. IEEE Access (2024)","DOI":"10.1109\/ACCESS.2024.3372423"},{"key":"21_CR11","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: 2017 IEEE international Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 776\u2013780. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"21_CR12","unstructured":"Gontier, F., Serizel, R., Cerisara, C.: Automated audio captioning by fine-tuning bart with audioset tags. In: DCASE 2021-6th Workshop on Detection and Classification of Acoustic Scenes and Events (2021)"},{"key":"21_CR13","doi-asserted-by":"crossref","unstructured":"Guerreiro, N.M., Rei, R., Stigt, D., Coheur, L., Colombo, P., Martins, A.F.: xcomet: transparent machine translation evaluation through fine-grained error detection. Trans. Assoc. Comput. Linguist. 12, 979\u2013995 (2024)","DOI":"10.1162\/tacl_a_00683"},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Guzhov, A., Raue, F., Hees, J., Dengel, A.: Audioclip: extending clip to image, text and audio. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 976\u2013980. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"21_CR15","unstructured":"Kim, C.D., Kim, B., Lee, H., Kim, G.: Audiocaps: Generating captions for audios in the wild. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Vol. 1 (Long and Short Papers), pp. 119\u2013132 (2019)"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Koizumi, Y., Masumura, R., Nishida, K., Yasuda, M., Saito, S.: A transformer-based audio captioning model with keyword estimation. arXiv preprint arXiv:2007.00222 (2020)","DOI":"10.21437\/Interspeech.2020-2087"},{"key":"21_CR17","unstructured":"Koizumi, Y., Ohishi, Y., Niizumi, D., Takeuchi, D., Yasuda, M.: Audio captioning using pre-trained large-scale language model guided by audio-based similar caption retrieval. arXiv preprint arXiv:2012.07331 (2020)"},{"key":"21_CR18","doi-asserted-by":"publisher","first-page":"2880","DOI":"10.1109\/TASLP.2020.3030497","volume":"28","author":"Q Kong","year":"2020","unstructured":"Kong, Q., Cao, Y., Iqbal, T., Wang, Y., Wang, W., Plumbley, M.D.: Panns: Large-scale pretrained audio neural networks for audio pattern recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 28, 2880\u20132894 (2020)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"4","key":"21_CR19","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3636513","volume":"16","author":"K Kuhn","year":"2024","unstructured":"Kuhn, K., Kersken, V., Reuter, B., Egger, N., Zimmermann, G.: Measuring the accuracy of automatic speech recognition solutions. ACM Trans. Access. Comput. 16(4), 1\u201323 (2024)","journal-title":"ACM Trans. Access. Comput."},{"key":"21_CR20","doi-asserted-by":"crossref","unstructured":"Lewis, M., et al.: Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)","DOI":"10.18653\/v1\/2020.acl-main.703"},{"issue":"1","key":"21_CR21","doi-asserted-by":"publisher","first-page":"26","DOI":"10.1186\/s13636-022-00259-2","volume":"2022","author":"X Mei","year":"2022","unstructured":"Mei, X., Liu, X., Plumbley, M.D., Wang, W.: Automated audio captioning: an overview of recent progress and new challenges. EURASIP J. Audio Speech Music Process. 2022(1), 26 (2022)","journal-title":"EURASIP J. Audio Speech Music Process."},{"key":"21_CR22","doi-asserted-by":"crossref","unstructured":"Mei, X., et al.: Wavcaps: A Chatgpt-Assisted Weakly-Labelled Audio captioning dataset for audio-language multimodal research. IEEE\/ACM Trans. Audio Speech Lang. Process. (2024)","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Patel, P., Pampaniya, S., Ghosh, A., Raj, R., Karuppaih, D., Kandasamy, S.: Enhancing accessibility through machine learning: a review on visual and hearing impairment technologies. IEEE Access (2025)","DOI":"10.1109\/ACCESS.2025.3539081"},{"key":"21_CR24","unstructured":"Perez-Castanos, S., Naranjo-Alcazar, J., Zuccarello, P., Cobos, M.: Listen carefully and tell: an audio captioning system based on residual learning and gammatone audio representation. arXiv preprint arXiv:2006.15406 (2020)"},{"key":"21_CR25","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PmLR (2021)"},{"key":"21_CR26","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"issue":"8","key":"21_CR27","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"issue":"6","key":"21_CR28","doi-asserted-by":"publisher","first-page":"1264","DOI":"10.1016\/j.jvoice.2022.06.011","volume":"38","author":"Z Ren","year":"2024","unstructured":"Ren, Z., Chang, Y., Bartl-Pokorny, K.D., Pokorny, F.B., Schuller, B.W.: The acoustic dissection of cough: diving into machine listening-based covid-19 analysis and detection. J. Voice 38(6), 1264\u20131277 (2024)","journal-title":"J. Voice"},{"key":"21_CR29","doi-asserted-by":"publisher","first-page":"825","DOI":"10.1109\/LSP.2024.3352514","volume":"31","author":"S Singh","year":"2024","unstructured":"Singh, S., Steinmetz, C.J., Benetos, E., Phan, H., Stowell, D.: Atgnn: Audio tagging graph neural network. IEEE Signal Process. Lett. 31, 825\u2013829 (2024)","journal-title":"IEEE Signal Process. Lett."},{"issue":"10","key":"21_CR30","doi-asserted-by":"publisher","first-page":"1733","DOI":"10.1109\/TMM.2015.2428998","volume":"17","author":"D Stowell","year":"2015","unstructured":"Stowell, D., Giannoulis, D., Benetos, E., Lagrange, M., Plumbley, M.D.: Detection and classification of acoustic scenes and events. IEEE Trans. Multimedia 17(10), 1733\u20131746 (2015)","journal-title":"IEEE Trans. Multimedia"},{"key":"21_CR31","unstructured":"Su, Y., et al.: Language models can see: Plugging visual controls in text generation. arXiv preprint arXiv:2205.02655 (2022)"},{"key":"21_CR32","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Shalev, Y., Schwartz, I., Wolf, L.: Zerocap: Zero-shot image-to-text generation for visual-semantic arithmetic. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17,918\u201317,928 (2022)","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"21_CR33","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. systems 30 (2017)"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Wu, Y., Chen, K., Zhang, T., Hui, Y., Berg-Kirkpatrick, T., Dubnov, S.: Large-scale contrastive language-audio pretraining with feature fusion and keyword-to-caption augmentation. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"21_CR35","unstructured":"Xu, X., Xie, Z., Wu, M., Yu, K.: The sjtu system for dcase2022 challenge task 6: Audio captioning with audio-text retrieval pre-training. Technical Report, DCASE2022 Challenge (2022)"},{"key":"21_CR36","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1109\/TASLP.2023.3321968","volume":"32","author":"X Xu","year":"2023","unstructured":"Xu, X., Xie, Z., Wu, M., Yu, K.: Beyond the status quo: a contemporary survey of advances and challenges in audio captioning. IEEE\/ACM Trans. Audio Speech Lang. Process. 32, 95\u2013112 (2023)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"21_CR37","unstructured":"Zeng, A., et\u00a0al.: Socratic models: Composing zero-shot multimodal reasoning with language. arXiv preprint arXiv:2204.00598 (2022)"},{"key":"21_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, H., Yu, P.S., Zhang, J.: A systematic survey of text summarization: From statistical methods to large language models. arXiv preprint arXiv:2406.11289 (2024)","DOI":"10.1145\/3731445"}],"container-title":["Lecture Notes in Computer Science","Web Information Systems Engineering \u2013 WISE 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-7251-9_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T22:27:34Z","timestamp":1781044054000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-7251-9_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819572502","9789819572519"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-7251-9_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"1 May 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"WISE","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Web Information Systems Engineering","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Marrakech","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Morocco","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 December 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 December 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"wise2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/wise2025.ficloud.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}