{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:41:26Z","timestamp":1755823286157,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612592","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"2785-2794","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SEAR: Semantically-grounded Audio Representations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0904-0573","authenticated-orcid":false,"given":"Rajat","family":"Hebbar","sequence":"first","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5281-1695","authenticated-orcid":false,"given":"Digbalay","family":"Bose","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1052-6204","authenticated-orcid":false,"given":"Shrikanth","family":"Narayanan","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"e_1_3_2_1_2_1","volume-title":"Abdel rahman Mohamed, and Michael Auli","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Henry Zhou, Abdel rahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. ArXiv, Vol. abs\/2006.11477 (2020)."},{"key":"e_1_3_2_1_3_1","volume-title":"Condensed Movies: Story Based Retrieval with Contextual Embeddings. ArXiv","author":"Bain Max","year":"2020","unstructured":"Max Bain, Arsha Nagrani, A. Brown, and Andrew Zisserman. 2020. Condensed Movies: Story Based Retrieval with Contextual Embeddings. ArXiv, Vol. abs\/2005.04208 (2020)."},{"key":"e_1_3_2_1_4_1","volume-title":"Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021), 1708--1718."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2396531"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00212"},{"key":"e_1_3_2_1_7_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal et al. 2020. Language Models are Few-Shot Learners. ArXiv Vol. abs\/2005.14165 (2020)."},{"key":"e_1_3_2_1_8_1","volume-title":"Andrew Gallagher, Liat Kaver, Radhika Marvin, Caroline Pantofaru, Nathan Reale, Loretta Guarino Reid, Kevin Wilson, et al.","author":"Chaudhuri Sourish","year":"2018","unstructured":"Sourish Chaudhuri, Joseph Roth, Daniel PW Ellis, Andrew Gallagher, Liat Kaver, Radhika Marvin, Caroline Pantofaru, Nathan Reale, Loretta Guarino Reid, Kevin Wilson, et al. 2018. Ava-speech: A densely labeled dataset of speech activity in movies. arXiv preprint arXiv:1808.00606 (2018)."},{"key":"e_1_3_2_1_9_1","volume-title":"Movies2Scenes: Learning Scene Representations Using Movie Similarities. arXiv preprint arXiv:2202.10650","author":"Chen Shixing","year":"2022","unstructured":"Shixing Chen, Xiang Hao, Xiaohan Nie, and Raffay Hamid. 2022a. Movies2Scenes: Learning Scene Representations Using Movie Similarities. arXiv preprint arXiv:2202.10650 (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"BEATs: Audio Pre-Training with Acoustic Tokenizers. arXiv preprint arXiv:2212.09058","author":"Chen Sanyuan","year":"2022","unstructured":"Sanyuan Chen, Yu Wu, Chengyi Wang, Shujie Liu, Daniel Tompkins, Zhuo Chen, and Furu Wei. 2022b. BEATs: Audio Pre-Training with Acoustic Tokenizers. arXiv preprint arXiv:2212.09058 (2022)."},{"key":"e_1_3_2_1_11_1","unstructured":"Vesna Dakic. 2009. Sound design for film and television."},{"key":"e_1_3_2_1_12_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_2_1_14_1","volume-title":"Mahmoud Al Ismail, and Huaming Wang","author":"Elizalde Benjamin","year":"2022","unstructured":"Benjamin Elizalde, Soham Deshmukh, Mahmoud Al Ismail, and Huaming Wang. 2022. CLAP: Learning Audio Concepts From Natural Language Supervision. ArXiv, Vol. abs\/2206.04769 (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_17_1","unstructured":"Aude Giraudel Matthieu Carr\u00e9 Val\u00e9rie Mapelli Juliette Kahn Olivier Galibert and Ludovic Quintard. 2012. The REPERE Corpus: a multimodal corpus for person recognition.. In LREC. 1102--1107."},{"key":"e_1_3_2_1_18_1","volume-title":"Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James Glass. 2021. Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)."},{"key":"e_1_3_2_1_19_1","volume-title":"Glass","author":"Gong Yuan","year":"2022","unstructured":"Yuan Gong, Cheng-I Lai, Yu-An Chung, and James R. Glass. 2022. SSAST: Self-Supervised Audio Spectrogram Transformer. In AAAI."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_1_22_1","unstructured":"Yoonchang Han Jeongsoo Park and Kyogu Lee. 2017. Convolutional Neural Networks with Binaural Representations and Background Subtraction for Acoustic Scene Classification.. In DCASE. 46--50."},{"key":"e_1_3_2_1_23_1","volume-title":"A dataset for Audio-Visual Sound Event Detection in Movies. arXiv preprint arXiv:2302.07315","author":"Hebbar Rajat","year":"2023","unstructured":"Rajat Hebbar, Digbalay Bose, Krishna Somandepalli, Veena Vijai, and Shrikanth Narayanan. 2023. A dataset for Audio-Visual Sound Event Detection in Movies. arXiv preprint arXiv:2302.07315 (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682532"},{"key":"e_1_3_2_1_25_1","first-page":"286","article-title":"Improving Gender Identification in Movie Audio Using Cross-Domain Data","volume":"282","author":"Hebbar Rajat","year":"2018","unstructured":"Rajat Hebbar, Krishna Somandepalli, and Shrikanth S Narayanan. 2018. Improving Gender Identification in Movie Audio Using Cross-Domain Data.. In Interspeech, Vol. 282. 286.","journal-title":"Interspeech"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_27_1","volume-title":"Movienet: A holistic dataset for movie understanding. In Computer Vision--ECCV 2020: 16th European Conference","author":"Huang Qingqiu","year":"2020","unstructured":"Qingqiu Huang, Yu Xiong, Anyi Rao, Jiaze Wang, and Dahua Lin. 2020. Movienet: A holistic dataset for movie understanding. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part IV 16. Springer, 709--727."},{"key":"e_1_3_2_1_28_1","volume-title":"On Negative Sampling for Audio-Visual Contrastive Learning from Movies. ArXiv","author":"Kalayeh Mahdi M.","year":"2022","unstructured":"Mahdi M. Kalayeh, Shervin Ardeshir, Lingyi Liu, Nagendra Kamath, and Ashok Chandrashekar. 2022. On Negative Sampling for Audio-Visual Contrastive Learning from Movies. ArXiv, Vol. abs\/2205.00073 (2022)."},{"key":"e_1_3_2_1_29_1","unstructured":"Chris Dongjoo Kim Byeongchang Kim Hyunmin Lee and Gunhee Kim. 2019. AudioCaps: Generating Captions for Audios in The Wild. In NAACL."},{"key":"e_1_3_2_1_30_1","volume-title":"Efficient training of audio transformers with patchout. arXiv preprint arXiv:2110.05069","author":"Koutini Khaled","year":"2021","unstructured":"Khaled Koutini, Jan Schl\u00fcter, Hamid Eghbal-zadeh, and Gerhard Widmer. 2021. Efficient training of audio transformers with patchout. arXiv preprint arXiv:2110.05069 (2021)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"e_1_3_2_1_32_1","volume-title":"Hoi","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C. H. Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML."},{"key":"e_1_3_2_1_33_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_34_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In ICLR."},{"key":"e_1_3_2_1_35_1","volume-title":"Contrastive Audio-Language Learning for Music. ArXiv","author":"Manco Ilaria","year":"2022","unstructured":"Ilaria Manco, Emmanouil Benetos, Elio Quinton, and Gy\u00f6rgy Fazekas. 2022. Contrastive Audio-Language Learning for Music. ArXiv, Vol. abs\/2208.12208 (2022)."},{"volume-title":"DCASE 2017 Challenge Setup: Tasks, Datasets and Baseline System. In Proceedings of the Detection and Classification of Acoustic Scenes and Events 2017 Workshop (DCASE2017)","author":"Mesaros A.","key":"e_1_3_2_1_36_1","unstructured":"A. Mesaros, T. Heittola, A. Diment, B. Elizalde, A. Shah, E. Vincent, B. Raj, and T. Virtanen. 2017. DCASE 2017 Challenge Setup: Tasks, Datasets and Baseline System. In Proceedings of the Detection and Classification of Acoustic Scenes and Events 2017 Workshop (DCASE2017). 85--92."},{"key":"e_1_3_2_1_37_1","volume-title":"Jan C Van Gemert, and Cees GM Snoek","author":"Mettes Pascal","year":"2016","unstructured":"Pascal Mettes, Jan C Van Gemert, and Cees GM Snoek. 2016. Spot on: Action localization from pointly-supervised proposals. In Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer, 437--453."},{"key":"e_1_3_2_1_38_1","volume-title":"End-to-End Learning of Visual Representations From Uncurated Instructional Videos. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Miech Antoine","year":"2020","unstructured":"Antoine Miech, Jean-Baptiste Alayrac, Lucas Smaira, Ivan Laptev, Josef Sivic, and Andrew Zisserman. 2020. End-to-End Learning of Visual Representations From Uncurated Instructional Videos. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 9876--9886."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2901464"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"e_1_3_2_1_41_1","volume-title":"Tel Aviv","author":"Nagrani Arsha","year":"2022","unstructured":"Arsha Nagrani, Paul Hongsuck Seo, Bryan Seybold, Anja Hauth, Santiago Manen, Chen Sun, and Cordelia Schmid. 2022. Learning audio-video modalities from image captions. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XIV. Springer, 407--426."},{"key":"e_1_3_2_1_42_1","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume":"34","author":"Nagrani Arsha","year":"2021","unstructured":"Arsha Nagrani, Shan Yang, Anurag Arnab, Aren Jansen, Cordelia Schmid, and Chen Sun. 2021. Attention bottlenecks for multimodal fusion. Advances in Neural Information Processing Systems, Vol. 34 (2021), 14200--14213.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9534474"},{"key":"e_1_3_2_1_44_1","volume-title":"Tel Aviv","author":"Pardo Alejandro","year":"2022","unstructured":"Alejandro Pardo, Fabian Caba Heilbron, Juan Le\u00f3n Alc\u00e1zar, Ali Thabet, and Bernard Ghanem. 2022. Moviecuts: A new dataset and benchmark for cut type recognition. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part VII. Springer, 668--685."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_2_1_46_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_47_1","volume-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents. ArXiv","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. ArXiv, Vol. abs\/2204.06125 (2022)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_2"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01016"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413528"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2020.3047978"},{"key":"e_1_3_2_1_53_1","volume-title":"Multi-face: Self-supervised multiview adaptation for robust face clustering in videos. arXiv preprint arXiv:2008.11289","author":"Somandepalli Krishna","year":"2020","unstructured":"Krishna Somandepalli, Rajat Hebbar, and Shrikanth Narayanan. 2020. Multi-face: Self-supervised multiview adaptation for robust face clustering in videos. arXiv preprint arXiv:2008.11289 (2020)."},{"key":"e_1_3_2_1_54_1","volume-title":"Contrastive learning of musical representations. arXiv preprint arXiv:2103.09410","author":"Spijkervet Janne","year":"2021","unstructured":"Janne Spijkervet and John Ashley Burgoyne. 2021. Contrastive learning of musical representations. arXiv preprint arXiv:2103.09410 (2021)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247986"},{"key":"e_1_3_2_1_57_1","volume-title":"Representation Learning with Contrastive Predictive Coding. ArXiv","author":"van den Oord A\u00e4ron","year":"2018","unstructured":"A\u00e4ron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. ArXiv, Vol. abs\/1807.03748 (2018)."},{"key":"e_1_3_2_1_58_1","volume-title":"Towards Long-Form Video Understanding. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","author":"Wu Chaoxia","year":"2021","unstructured":"Chaoxia Wu and Philipp Kr\"ahenb\u00fchl. 2021. Towards Long-Form Video Understanding. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 1884--1894."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_2_1_60_1","volume-title":"Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation. arXiv preprint arXiv:2211.06687","author":"Wu Yusong","year":"2022","unstructured":"Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, and Shlomo Dubnov. 2022a. Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation. arXiv preprint arXiv:2211.06687 (2022)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612592","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612592","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:50Z","timestamp":1755820850000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612592"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":61,"alternative-id":["10.1145\/3581783.3612592","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612592","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}