{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:58:15Z","timestamp":1772906295928,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Key Research and Development Program of Jiangsu Province, China","award":["BE2022059-2"],"award-info":[{"award-number":["BE2022059-2"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92048205"],"award-info":[{"award-number":["92048205"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Alibaba Innovative Research"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613820","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"2756-2764","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["BLAT: Bootstrapping Language-Audio Pre-training based on AudioSet Tag-guided Synthetic Data"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8718-1278","authenticated-orcid":false,"given":"Xuenan","family":"Xu","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8081-704X","authenticated-orcid":false,"given":"Zhiling","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0624-6266","authenticated-orcid":false,"given":"Zelin","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5884-632X","authenticated-orcid":false,"given":"Pingyue","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9546-3301","authenticated-orcid":false,"given":"Zeyu","family":"Xie","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5599-8707","authenticated-orcid":false,"given":"Mengyue","family":"Wu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3782-3230","authenticated-orcid":false,"given":"Kenny Q.","family":"Zhu","sequence":"additional","affiliation":[{"name":"University of Texas at Arlington, Arlington, TX, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"24206","article-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text","volume":"34","author":"Akbari Hassan","year":"2021","unstructured":"Hassan Akbari, Liangzhe Yuan, Rui Qian, Wei-Hong Chuang, Shih-Fu Chang, Yin Cui, and Boqing Gong. 2021. Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text. In Proc. NIPS, Vol. 34. 24206--24221.","journal-title":"Proc. NIPS"},{"key":"e_1_3_2_2_2_1","volume-title":"Proc. AISTATS. PMLR, 2530--2538","author":"Al-Tahan Haider","year":"2021","unstructured":"Haider Al-Tahan and Yalda Mohsenzadeh. 2021. Clar: Contrastive learning of auditory representations. In Proc. AISTATS. PMLR, 2530--2538."},{"key":"e_1_3_2_2_3_1","first-page":"25","article-title":"Self-supervised multimodal versatile networks","volume":"33","author":"Alayrac Jean-Baptiste","year":"2020","unstructured":"Jean-Baptiste Alayrac, Adria Recasens, Rosalia Schneider, Relja Arandjelovi\u0107, Jason Ramapuram, Jeffrey De Fauw, Lucas Smaira, Sander Dieleman, and Andrew Zisserman. 2020. Self-supervised multimodal versatile networks. In Proc. NIPS, Vol. 33. 25--37.","journal-title":"Proc. NIPS"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_5_1","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. In Proc. NIPS, Vol. 33. 12449--12460.","journal-title":"Proc. NIPS"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2015.7280624"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_2_8_1","volume-title":"Proc. DCASE. 21--25","author":"Chen Kun","year":"2020","unstructured":"Kun Chen, Yusong Wu, Ziyue Wang, Xuan Zhang, Fudong Nian, Shengchen Li, and Xi Shao. 2020b. Audio Captioning Based on Transformer and Pre-Trained CNN.. In Proc. DCASE. 21--25."},{"key":"e_1_3_2_2_9_1","first-page":"1505","article-title":"Wavlm: Large-scale self-supervised pre-training for full stack speech processing","volume":"16","author":"Chen Sanyuan","year":"2022","unstructured":"Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, et al. 2022. Wavlm: Large-scale self-supervised pre-training for full stack speech processing. IEEE JSTSP, Vol. 16, 6 (2022), 1505--1518.","journal-title":"IEEE JSTSP"},{"key":"e_1_3_2_2_10_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020a. Uniter: Universal image-text representation learning. In Proc. ECCV. Springer, 104--120."},{"key":"e_1_3_2_2_11_1","unstructured":"Van den Oord et al. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 Vol. 2 3 (2018) 4."},{"key":"e_1_3_2_2_12_1","volume-title":"Proc. NAACL. 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristin Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proc. NAACL. 4171--4186."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2017.8170058"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_2_2_16_1","volume-title":"Proc. ISM. 41--48","author":"\u00fcl \u00d6zkaya Eren Aycs","year":"2020","unstructured":"Aycs eg\u00fcl \u00d6zkaya Eren and Mustafa Sert. 2020. Audio Captioning Based on Combined Audio and Semantic Embeddings. In Proc. ISM. 41--48."},{"key":"e_1_3_2_2_17_1","first-page":"829","article-title":"Fsd50k: an open dataset of human-labeled sound events","volume":"30","author":"Fonseca Eduardo","year":"2022","unstructured":"Eduardo Fonseca, Xavier Favory, Jordi Pons, Frederic Font, and Xavier Serra. 2022. Fsd50k: an open dataset of human-labeled sound events. IEEE\/ACM TASLP, Vol. 30 (2022), 829--852.","journal-title":"IEEE\/ACM TASLP"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2502245"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"e_1_3_2_2_21_1","first-page":"3292","article-title":"PSLA: Improving Audio Tagging With Pretraining, Sampling, Labeling, and Aggregation","volume":"29","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James Glass. 2021b. PSLA: Improving Audio Tagging With Pretraining, Sampling, Labeling, and Aggregation. IEEE\/ACM TASLP, Vol. 29 (2021), 3292--3306.","journal-title":"IEEE\/ACM TASLP"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_2_23_1","first-page":"3451","article-title":"Hubert: Self-supervised speech representation learning by masked prediction of hidden units","volume":"29","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM TASLP, Vol. 29 (2021), 3451--3460.","journal-title":"IEEE\/ACM TASLP"},{"key":"e_1_3_2_2_24_1","volume-title":"Proc. ICML. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In Proc. ICML. PMLR, 4904--4916."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413376"},{"key":"e_1_3_2_2_26_1","volume-title":"Proc. NAACL. 119--132","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. AudioCaps: Generating Captions for Audios in The Wild. In Proc. NAACL. 119--132."},{"key":"e_1_3_2_2_27_1","volume-title":"Audio Captioning using Pre-Trained Large-Scale Language Model Guided by Audio-based Similar Caption Retrieval. arXiv preprint arXiv:2012.07331","author":"Koizumi Yuma","year":"2020","unstructured":"Yuma Koizumi, Yasunori Ohishi, Daisuke Niizumi, Daiki Takeuchi, and Masahiro Yasuda. 2020. Audio Captioning using Pre-Trained Large-Scale Language Model Guided by Audio-based Similar Caption Retrieval. arXiv preprint arXiv:2012.07331 (2020)."},{"key":"e_1_3_2_2_28_1","first-page":"2880","article-title":"Panns: Large-scale pretrained audio neural networks for audio pattern recognition","volume":"28","author":"Kong Qiuqiang","year":"2020","unstructured":"Qiuqiang Kong, Yin Cao, Turab Iqbal, Yuxuan Wang, Wenwu Wang, and Mark D Plumbley. 2020. Panns: Large-scale pretrained audio neural networks for audio pattern recognition. IEEE\/ACM TASLP, Vol. 28 (2020), 2880--2894.","journal-title":"IEEE\/ACM TASLP"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-227"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_2_31_1","volume-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks. In Proc","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Proc. ECCV. Springer, 121--137."},{"key":"e_1_3_2_2_32_1","volume-title":"Proc","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Proc. ECCV. Springer, 740--755."},{"key":"e_1_3_2_2_33_1","volume-title":"Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983","author":"Loshchilov Ilya","year":"2016","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)."},{"key":"e_1_3_2_2_34_1","volume-title":"Proc. NIPS. 13--23","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Proc. NIPS. 13--23."},{"key":"e_1_3_2_2_35_1","volume-title":"Proc. DCASE. 90--94","author":"Martin Irene","year":"2021","unstructured":"Irene Martin and Annamaria Mesaros. 2021. Diversity and Bias in Audio Captioning Datasets. In Proc. DCASE. 90--94."},{"key":"e_1_3_2_2_36_1","volume-title":"Proc. DCASE. 9--13","author":"Mesaros Annamaria","year":"2018","unstructured":"Annamaria Mesaros, Toni Heittola, and Tuomas Virtanen. 2018. A multi-device dataset for urban acoustic scene classification. In Proc. DCASE. 9--13."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9534474"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-2227"},{"key":"e_1_3_2_2_40_1","volume-title":"Proc. ICML. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proc. ICML. 8748--8763."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413528"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_2_43_1","volume-title":"Proc. ICLR. 1--16","author":"Su Weijie","year":"2019","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2019. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In Proc. ICLR. 1--16."},{"key":"e_1_3_2_2_44_1","volume-title":"Well-read students learn better: On the importance of pre-training compact models. arXiv preprint arXiv:1908.08962","author":"Turc Iulia","year":"2019","unstructured":"Iulia Turc, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Well-read students learn better: On the importance of pre-training compact models. arXiv preprint arXiv:1908.08962 (2019)."},{"key":"e_1_3_2_2_45_1","volume-title":"Proc. ICML. PMLR, 23318--23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In Proc. ICML. PMLR, 23318--23340."},{"key":"e_1_3_2_2_46_1","volume-title":"Proc. ICLR. 1--17","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021. SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. In Proc. ICLR. 1--17."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413982"},{"key":"e_1_3_2_2_49_1","volume-title":"The SJTU System for DCASE2022 Challenge Task 6: Audio Captioning with Audio-Text Retrieval Pre-training. Technical Report. DCASE2022 Challenge.","author":"Xu Xuenan","year":"2022","unstructured":"Xuenan Xu, Zeyu Xie, Mengyue Wu, and Kai Yu. 2022. The SJTU System for DCASE2022 Challenge Task 6: Audio Captioning with Audio-Text Retrieval Pre-training. Technical Report. DCASE2022 Challenge."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3459637.3482097"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.333"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746427"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613820","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613820","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:46Z","timestamp":1755820846000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613820"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":53,"alternative-id":["10.1145\/3581783.3613820","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613820","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}