{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T18:08:39Z","timestamp":1776362919326,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276250"],"award-info":[{"award-number":["62276250"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Major Project of the National Social Science Foundation of China","award":["21ZD292"],"award-info":[{"award-number":["21ZD292"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681145","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"7356-7365","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Advancing Multi-grained Alignment for Contrastive Language-Audio Pre-training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3397-8432","authenticated-orcid":false,"given":"Yiming","family":"Li","sequence":"first","affiliation":[{"name":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6037-2299","authenticated-orcid":false,"given":"Zhifang","family":"Guo","sequence":"additional","affiliation":[{"name":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4226-3250","authenticated-orcid":false,"given":"Xiangdong","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4524-495X","authenticated-orcid":false,"given":"Hong","family":"Liu","sequence":"additional","affiliation":[{"name":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"e_1_3_2_1_4_1","volume-title":"Beats: Audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058","author":"Chen Sanyuan","year":"2022","unstructured":"Sanyuan Chen, Yu Wu, Chengyi Wang, Shujie Liu, Daniel Tompkins, Zhuo Chen, and Furu Wei. 2022. Beats: Audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"High fidelity neural audio compression. arXiv preprint arXiv:2210.13438","author":"D\u00e9fossez Alexandre","year":"2022","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2022. High fidelity neural audio compression. arXiv preprint arXiv:2210.13438 (2022)."},{"key":"e_1_3_2_1_6_1","volume-title":"Audio retrieval with wavtext5k and clap training. arXiv preprint arXiv:2209.14275","author":"Deshmukh Soham","year":"2022","unstructured":"Soham Deshmukh, Benjamin Elizalde, and Huaming Wang. 2022. Audio retrieval with wavtext5k and clap training. arXiv preprint arXiv:2209.14275 (2022)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747556"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_12_1","volume-title":"S Ramaneswaran, S Sakshi, Oriol Nieto, Ramani Duraiswami, and Dinesh Manocha.","author":"Ghosh Sreyan","year":"2023","unstructured":"Sreyan Ghosh, Ashish Seth, Sonal Kumar, Utkarsh Tyagi, Chandra Kiran Evuru, S Ramaneswaran, S Sakshi, Oriol Nieto, Ramani Duraiswami, and Dinesh Manocha. 2023. CompA: Addressing the Gap in Compositional Reasoning in Audio-Language Models. arXiv preprint arXiv:2310.08753 (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21315"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414579"},{"key":"e_1_3_2_1_16_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Huang Po-Yao","year":"2024","unstructured":"Po-Yao Huang, Vasu Sharma, Hu Xu, Chaitanya Ryali, Yanghao Li, Shang-Wen Li, Gargi Ghosh, Jitendra Malik, Christoph Feichtenhofer, et al. 2024. Mavil: Masked audio-video learners. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_17_1","first-page":"28708","article-title":"Masked autoencoders that listen","volume":"35","author":"Huang Po-Yao","year":"2022","unstructured":"Po-Yao Huang, Hu Xu, Juncheng Li, Alexei Baevski, Michael Auli, Wojciech Galuba, Florian Metze, and Christoph Feichtenhofer. 2022. Masked autoencoders that listen. Advances in Neural Information Processing Systems, Vol. 35 (2022), 28708--28720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. Audiocaps: Generating captions for audios in the wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 119--132."},{"key":"e_1_3_2_1_20_1","volume-title":"Audio retrieval with natural language queries: A benchmark study","author":"Koepke A Sophia","year":"2022","unstructured":"A Sophia Koepke, Andreea-Maria Oncescu, Joao Henriques, Zeynep Akata, and Samuel Albanie. 2022. Audio retrieval with natural language queries: A benchmark study. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964310"},{"key":"e_1_3_2_1_23_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3352248"},{"key":"e_1_3_2_1_25_1","volume-title":"Audio-Free Prompt Tuning for Language-Audio Models. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 491--495","author":"Li Yiming","year":"2024","unstructured":"Yiming Li, Xiangdong Wang, and Hong Liu. 2024. Audio-Free Prompt Tuning for Language-Audio Models. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 491--495."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Yiming Li Xiangdong Wang Hong Liu Rui Tao Long Yan and Kazushige Ouchi. 2024. Semi-Supervised Sound Event Detection with Local and Global Consistency Regularization. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 271--275.","DOI":"10.1109\/ICASSP48485.2024.10446386"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.2989575"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D Plumbley. 2023. AudioLDM: Text-to-Audio Generation with Latent Diffusion Models. Proceedings of the International Conference on Machine Learning (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"International conference on machine learning. PMLR, 1614--1623","author":"Martins Andre","year":"2016","unstructured":"Andre Martins and Ramon Astudillo. 2016. From softmax to sparsemax: A sparse model of attention and multi-label classification. In International conference on machine learning. PMLR, 1614--1623."},{"key":"e_1_3_2_1_30_1","volume-title":"Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395","author":"Mei Xinhao","year":"2023","unstructured":"Xinhao Mei, Chutong Meng, Haohe Liu, Qiuqiang Kong, Tom Ko, Chengqi Zhao, Mark D Plumbley, Yuexian Zou, and Wenwu Wang. 2023. Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01860"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_35_1","volume-title":"Do vision transformers see like convolutional neural networks? Advances in neural information processing systems","author":"Raghu Maithra","year":"2021","unstructured":"Maithra Raghu, Thomas Unterthiner, Simon Kornblith, Chiyuan Zhang, and Alexey Dosovitskiy. 2021. Do vision transformers see like convolutional neural networks? Advances in neural information processing systems, Vol. 34 (2021), 12116--12128."},{"key":"e_1_3_2_1_36_1","volume-title":"Aaron Van den Oord, and Oriol Vinyals","author":"Razavi Ali","year":"2019","unstructured":"Ali Razavi, Aaron Van den Oord, and Oriol Vinyals. 2019. Generating diverse high-fidelity images with vq-vae-2. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_37_1","volume-title":"Contrastive learning with hard negative samples. arXiv preprint arXiv:2010.04592","author":"Robinson Joshua","year":"2020","unstructured":"Joshua Robinson, Ching-Yao Chuang, Suvrit Sra, and Stefanie Jegelka. 2020. Contrastive learning with hard negative samples. arXiv preprint arXiv:2010.04592 (2020)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2017.2657381"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2017.8170052"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054478"},{"key":"e_1_3_2_1_42_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems Vol. 30 (2017)."},{"key":"e_1_3_2_1_43_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_44_1","unstructured":"Chengyi Wang Sanyuan Chen Yu Wu Ziqiang Zhang Long Zhou Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et al. 2023. Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00293"},{"key":"e_1_3_2_1_46_1","volume-title":"Audio-Text Models Do Not Yet Leverage Natural Language. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Wu Ho-Hsiang","year":"2023","unstructured":"Ho-Hsiang Wu, Oriol Nieto, Juan Pablo Bello, and Justin Salomon. 2023. Audio-Text Models Do Not Yet Leverage Natural Language. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_49_1","volume-title":"Audiodec: An Open-Source Streaming High-Fidelity Neural Audio Codec. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Wu Yi-Chiao","year":"2023","unstructured":"Yi-Chiao Wu, Israel D Gebru, Dejan Markovi\u0107, and Alexander Richard. 2023. Audiodec: An Open-Source Streaming High-Fidelity Neural Audio Codec. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747336"},{"key":"e_1_3_2_1_51_1","volume-title":"On Negative Sampling for Contrastive Audio-Text Retrieval. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Xie Huang","year":"2023","unstructured":"Huang Xie, Okko R\u00e4s\u00e4nen, and Tuomas Virtanen. 2023. On Negative Sampling for Contrastive Audio-Text Retrieval. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096972"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414834"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW59220.2023.10192960"},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia. 2756--2764","author":"Xu Xuenan","unstructured":"Xuenan Xu, Zhiling Zhang, Zelin Zhou, Pingyue Zhang, Zeyu Xie, Mengyue Wu, and Kenny Q. Zhu. 2023. BLAT: Bootstrapping Language-Audio Pre-training based on AudioSet Tag-guided Synthetic Data. In Proceedings of the 31st ACM International Conference on Multimedia. 2756--2764."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_2"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389692"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"e_1_3_2_1_59_1","volume-title":"Cacophony: An Improved Contrastive Audio-Text Model. arXiv preprint arXiv:2402.06986","author":"Zhu Ge","year":"2024","unstructured":"Ge Zhu and Zhiyao Duan. 2024. Cacophony: An Improved Contrastive Audio-Text Model. arXiv preprint arXiv:2402.06986 (2024)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681145","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681145","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:53Z","timestamp":1750294673000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681145"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":59,"alternative-id":["10.1145\/3664647.3681145","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681145","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}