{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:40:05Z","timestamp":1755974405480,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,29]],"date-time":"2024-05-29T00:00:00Z","timestamp":1716940800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,29]]},"DOI":"10.1145\/3686540.3686547","type":"proceedings-article","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T12:23:04Z","timestamp":1730982184000},"page":"48-56","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Exploring Event Misalignment Bias and Segment Focus Bias for Weakly-Supervised Audio-Visual Video Parsing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-4862-7910","authenticated-orcid":false,"given":"Mingchi","family":"Li","sequence":"first","affiliation":[{"name":"Shanghai Institute of Microsystem and Information Technology, University of Chinese Academy of Sciences, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9091-3231","authenticated-orcid":false,"given":"Songrui","family":"Han","sequence":"additional","affiliation":[{"name":"Shanghai Institute of Microsystem and Information Technology, University of Chinese Academy of Sciences, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7602-7136","authenticated-orcid":false,"given":"Xiaobing","family":"Yuan","sequence":"additional","affiliation":[{"name":"Shanghai Institute of Microsystem and Information Technology, University of Chinese Academy of Sciences, China"}]}],"member":"320","published-online":{"date-parts":[[2024,9,19]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings, Part XVIII 16","author":"Afouras Triantafyllos","year":"2020","unstructured":"Triantafyllos Afouras, Andrew Owens, Joon\u00a0Son Chung, and Andrew Zisserman. 2020. Self-supervised learning of audio-visual objects from video. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVIII 16. Springer, 208\u2013224."},{"key":"e_1_3_2_1_2_1","first-page":"9758","article-title":"Self-supervised learning by cross-modal audio-video clustering","volume":"33","author":"Alwassel Humam","year":"2020","unstructured":"Humam Alwassel, Dhruv Mahajan, Bruno Korbar, Lorenzo Torresani, Bernard Ghanem, and Du Tran. 2020. Self-supervised learning by cross-modal audio-video clustering. Advances in Neural Information Processing Systems 33 (2020), 9758\u20139770.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"e_1_3_2_1_5_1","volume-title":"Soundnet: Learning sound representations from unlabeled video. Advances in neural information processing systems 29","author":"Aytar Yusuf","year":"2016","unstructured":"Yusuf Aytar, Carl Vondrick, and Antonio Torralba. 2016. Soundnet: Learning sound representations from unlabeled video. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_1_6_1","volume-title":"Seeing sounds: visual and auditory interactions in the brain. Current opinion in neurobiology 16, 4","author":"Bulkin A","year":"2006","unstructured":"David\u00a0A Bulkin and Jennifer\u00a0M Groh. 2006. Seeing sounds: visual and auditory interactions in the brain. Current opinion in neurobiology 16, 4 (2006), 415\u2013419."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_25"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_9_1","volume-title":"Looking to listen at the cocktail party: A speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:1804.03619","author":"Ephrat Ariel","year":"2018","unstructured":"Ariel Ephrat, Inbar Mosseri, Oran Lang, Tali Dekel, Kevin Wilson, Avinatan Hassidim, William\u00a0T Freeman, and Michael Rubinstein. 2018. Looking to listen at the cocktail party: A speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:1804.03619 (2018)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01049"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00715"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00398"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1162\/jocn.2009.21134"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"volume-title":"CNN architectures for large-scale audio classification. In 2017 ieee international conference on acoustics, speech and signal processing (icassp)","author":"Hershey Shawn","key":"e_1_3_2_1_16_1","unstructured":"Shawn Hershey, Sourish Chaudhuri, Daniel\u00a0PW Ellis, Jort\u00a0F Gemmeke, Aren Jansen, R\u00a0Channing Moore, Manoj Plakal, Devin Platt, Rif\u00a0A Saurous, Bryan Seybold, 2017. CNN architectures for large-scale audio classification. In 2017 ieee international conference on acoustics, speech and signal processing (icassp). IEEE, 131\u2013135."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.450"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00947"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1167\/19.11.1"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548309"},{"key":"e_1_3_2_1_21_1","volume-title":"The kinetics human action video dataset. arXiv preprint arXiv:1705.06950","author":"Kay Will","year":"2017","unstructured":"Will Kay, Joao Carreira, Karen Simonyan, Brian Zhang, Chloe Hillier, Sudheendra Vijayanarasimhan, Fabio Viola, Tim Green, Trevor Back, Paul Natsev, 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_1_22_1","volume-title":"Cooperative learning of audio and video models from self-supervised synchronization. Advances in Neural Information Processing Systems 31","author":"Korbar Bruno","year":"2018","unstructured":"Bruno Korbar, Du Tran, and Lorenzo Torresani. 2018. Cooperative learning of audio and video models from self-supervised synchronization. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683226"},{"key":"e_1_3_2_1_24_1","first-page":"11449","article-title":"Exploring cross-video and cross-modality signals for weakly-supervised audio-visual video parsing","volume":"34","author":"Lin Yan-Bo","year":"2021","unstructured":"Yan-Bo Lin, Hung-Yu Tseng, Hsin-Ying Lee, Yen-Yu Lin, and Ming-Hsuan Yang. 2021. Exploring cross-video and cross-modality signals for weakly-supervised audio-visual video parsing. Advances in Neural Information Processing Systems 34 (2021), 11449\u201311461.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_25_1","volume-title":"Active contrastive learning of audio-visual video representations. arXiv preprint arXiv:2009.09805","author":"Ma Shuang","year":"2020","unstructured":"Shuang Ma, Zhaoyang Zeng, Daniel McDuff, and Yale Song. 2020. Active contrastive learning of audio-visual video representations. arXiv preprint arXiv:2009.09805 (2020)."},{"key":"e_1_3_2_1_26_1","first-page":"4733","article-title":"Learning representations from audio-visual spatial alignment","volume":"33","author":"Morgado Pedro","year":"2020","unstructured":"Pedro Morgado, Yi Li, and Nuno Nvasconcelos. 2020. Learning representations from audio-visual spatial alignment. Advances in Neural Information Processing Systems 33 (2020), 4733\u20134744.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"e_1_3_2_1_29_1","volume-title":"An attempt towards interpretable audio-visual video captioning. arXiv preprint arXiv:1812.02872","author":"Tian Yapeng","year":"2018","unstructured":"Yapeng Tian, Chenxiao Guan, Justin Goodman, Marc Moore, and Chenliang Xu. 2018. An attempt towards interpretable audio-visual video captioning. arXiv preprint arXiv:1812.02872 (2018)."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings, Part III 16","author":"Tian Yapeng","year":"2020","unstructured":"Yapeng Tian, Dingzeyu Li, and Chenliang Xu. 2020. Unified multisensory perception: Weakly-supervised audio-visual video parsing. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part III 16. Springer, 436\u2013454."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_1_33_1","volume-title":"listen, and describe: Globally and locally aligned cross-modal attentions for video captioning. arXiv preprint arXiv:1804.05448","author":"Wang Xin","year":"2018","unstructured":"Xin Wang, Yuan-Fang Wang, and William\u00a0Yang Wang. 2018. Watch, listen, and describe: Globally and locally aligned cross-modal attentions for video captioning. arXiv preprint arXiv:1804.05448 (2018)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00138"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00639"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5361"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547869"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00182"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00833"}],"event":{"name":"BDSIC 2024: 2024 6th International Conference on Big-data Service and Intelligent Computation","acronym":"BDSIC 2024","location":"Hong Kong Hong Kong"},"container-title":["Proceedings of the 2024 6th International Conference on Big-data Service and Intelligent Computation"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686540.3686547","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3686540.3686547","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:21:38Z","timestamp":1755973298000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686540.3686547"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,29]]},"references-count":40,"alternative-id":["10.1145\/3686540.3686547","10.1145\/3686540"],"URL":"https:\/\/doi.org\/10.1145\/3686540.3686547","relation":{},"subject":[],"published":{"date-parts":[[2024,5,29]]},"assertion":[{"value":"2024-09-19","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}