{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:35:33Z","timestamp":1776886533065,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681607","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"2428-2437","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["MMAL: Multi-Modal Analytic Learning for Exemplar-Free Audio-Visual Class Incremental Tasks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3527-6034","authenticated-orcid":false,"given":"Xianghu","family":"Yue","sequence":"first","affiliation":[{"name":"Department of Electrical and Computer and Engineering, National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2394-3518","authenticated-orcid":false,"given":"Xueyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Laboratory for Big Data and Decision, National University of Defense Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8557-1077","authenticated-orcid":false,"given":"Yiming","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer and Engineering, National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3375-2458","authenticated-orcid":false,"given":"Chengwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Electronic, Electrical and Communication Engineering, University of the Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8413-7220","authenticated-orcid":false,"given":"Mingrui","family":"Lao","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Information Systems Engineering, National University of Defense Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4612-5445","authenticated-orcid":false,"given":"Huiping","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Shien-Ming Wu School of Intelligent Engineering, South China University of Technology, Guangzhou, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9511-6713","authenticated-orcid":false,"given":"Xinyuan","family":"Qian","sequence":"additional","affiliation":[{"name":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[{"name":"Shenzhen Research Institute of Big Data, School of Data Science, The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Andrew Senior, Oriol Vinyals, and Andrew Zisserman.","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018. Deep audio-visual speech recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 12 (2018), 8717--8727."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision, 824--833","author":"Ahn Hongjoon","year":"2020","unstructured":"Hongjoon Ahn, Jihwan Kwak, Su Fang Lim, Hyeonsu Bang, Hyojun Kim, and Taesup Moon. 2020. SS-IL: Separated Softmax for Incremental Learning. Proceedings of the IEEE\/CVF International Conference on Computer Vision, 824--833."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems","volume":"29","author":"Aytar Yusuf","year":"2016","unstructured":"Yusuf Aytar, Carl Vondrick, and Antonio Torralba. 2016. Soundnet: Learning sound representations from unlabeled video. In Proceedings of Advances in Neural Information Processing Systems, Vol. 29."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00812"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350873"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00229"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_15"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2993148.2993172"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_33"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_28"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_6"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00406"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01049"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00398"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0925-2312(03)00385-0"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00092"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00947"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01023"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS).","author":"Huang Po-Yao","year":"2022","unstructured":"Po-Yao Huang, Hu Xu, Juncheng Li, Alexei Baevski, Michael Auli, Wojciech Galuba, Florian Metze, and Christoph Feichtenhofer. 2022. Masked Autoencoders that Listen. In Proceedings of Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_24_1","unstructured":"Heechul Jung Jeongwoo Ju Minju Jung and Junmo Kim. 2016. Less-forgetting learning in deep neural networks. arXiv preprint arXiv:1607.00122."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01560"},{"key":"e_1_3_2_1_26_1","volume-title":"Adam: A Method for Stochastic Optimization. In International Conference on Learning Representations (ICLR).","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba. 2022. Adam: A Method for Stochastic Optimization. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1611835114"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00123"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems","volume":"31","author":"Korbar Bruno","year":"2018","unstructured":"Bruno Korbar, Du Tran, and Lorenzo Torresani. 2018. Cooperative learning of audio and video models from self-supervised synchronization. In Proceedings of Advances in Neural Information Processing Systems, Vol. 31."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia. 4747--4758","author":"Lao Mingrui","unstructured":"Mingrui Lao, Nan Pu, Yu Liu, Zhun Zhong, Erwin M. Bakker, Nicu Sebe, and Michael S. Lew. 2023. Multi-Domain Lifelong Visual Question Answering via Self-Critical Distillation. In Proceedings of the 31st ACM International Conference on Multimedia. 4747--4758."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the British Machine Vision Conference. 255","author":"Li Sizhe","year":"2021","unstructured":"Sizhe Li, Yapeng Tian, and Chenliang Xu. 2021. Space-time memory network for sounding object localization in videos. In Proceedings of the British Machine Vision Conference. 255."},{"key":"e_1_3_2_1_32_1","volume-title":"Learning without forgetting","author":"Li Zhizhong","year":"2017","unstructured":"Zhizhong Li and Derek Hoiem. 2017. Learning without forgetting. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 12 (2017), 2935--2947."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02196"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00257"},{"key":"e_1_3_2_1_35_1","first-page":"3478","article-title":"Rmm: Reinforced memory management for class-incremental learning","volume":"34","author":"Liu Yaoyao","year":"2021","unstructured":"Yaoyao Liu, Bernt Schiele, and Qianru Sun. 2021. Rmm: Reinforced memory management for class-incremental learning. In Proceedings of Advances in Neural Information Processing Systems, Vol. 34. 3478--3490.","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01226"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-022-00550-z"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_13"},{"key":"e_1_3_2_1_39_1","unstructured":"Shentong Mo and Pedro Morgado. 2022. A unified audio-visual learning framework for localization separation and recognition. In arXiv preprint arXiv:2305.19458."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00716"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01018"},{"key":"e_1_3_2_1_42_1","first-page":"4733","article-title":"Learning representations from audio-visual spatial alignment","volume":"33","author":"Morgado Pedro","year":"2020","unstructured":"Pedro Morgado, Yi Li, and Nuno Nvasconcelos. 2020. Learning representations from audio-visual spatial alignment. In Proceedings of Advances in Neural Information Processing Systems, Vol. 33. 4733--4744.","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3175130"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS).","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Proceedings of Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 3900--3909","author":"Petit G.","unstructured":"G. Petit, A. Popescu, H. Schindler, D. Picard, and B. Delezoide. 2023. FeTrIL: Feature Translation for Exemplar-Free Class-Incremental Learning. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 3900--3909."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00717"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3226330"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3165466"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.587"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2952095"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2975922"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475587"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Tian Yapeng","year":"2019","unstructured":"Yapeng Tian, Jing Shi, Bochen Li, Zhiyao Duan, and Chenliang Xu. 2019. Audio-visual event localization in the wild. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS).","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training. In Proceedings of Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_57_1","unstructured":"Yake Wei Di Hu Yapeng Tian and Xuelong Li. 2022. Learning in audio-visual context: A review analysis and new perspective. arXiv preprint arXiv:2208.09579."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16403"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00046"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01936"},{"key":"e_1_3_2_1_61_1","volume-title":"Dae Hoe Kim, and Yong Man Ro","author":"Yeo Jeong Hun","year":"2024","unstructured":"Jeong Hun Yeo, Minsu Kim, Jeongsoo Choi, Dae Hoe Kim, and Yong Man Ro. 2024. Akvsr: Audio knowledge empowered visual speech recognition by compressing audio knowledge of a pretrained model. IEEE Transactions on Multimedia (2024)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093365"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612460"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3072041"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00833"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00581"},{"key":"e_1_3_2_1_68_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9286--9295","author":"Zhu K.","unstructured":"K. Zhu, W. Zhai, Y. Cao, J. Luo, and Z. Zha. 2022. Self-Sustaining Representation Expansion for Non-Exemplar Class-Incremental Learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9286--9295."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i15.29670"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2021.3064241"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00748"},{"key":"e_1_3_2_1_72_1","first-page":"11602","article-title":"ACIL: Analytic class-incremental learning with absolute memorization and privacy protection","volume":"35","author":"Zhuang Huiping","year":"2022","unstructured":"Huiping Zhuang, Zhenyu Weng, Hongxin Wei, Renchunzi Xie, Kar-Ann Toh, and Zhiping Lin. 2022. ACIL: Analytic class-incremental learning with absolute memorization and privacy protection. In Proceedings of Advances in Neural Information Processing Systems, Vol. 35. 11602--11614.","journal-title":"Proceedings of Advances in Neural Information Processing Systems"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681607","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681607","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681607"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":72,"alternative-id":["10.1145\/3664647.3681607","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681607","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}