{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T09:59:09Z","timestamp":1777888749298,"version":"3.51.4"},"reference-count":79,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004853","name":"CUHK","doi-asserted-by":"publisher","award":["4055269"],"award-info":[{"award-number":["4055269"]}],"id":[{"id":"10.13039\/501100004853","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00044","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"393-404","source":"Crossref","is-referenced-by-count":0,"title":["InfoBridge: Balanced Multimodal Integration through Conditional Dependency Modeling"],"prefix":"10.1109","author":[{"given":"Chenxin","family":"Li","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifan","family":"Liu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Panwang","family":"Pan","sequence":"additional","affiliation":[{"name":"ByteDance Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hengyu","family":"Liu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinyu","family":"Liu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wuyang","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Wang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weihao","family":"Yu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiyang","family":"Lin","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yixuan","family":"Yuan","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Deep variational information bottleneck","volume-title":"International Conference on Learning Representations","author":"Alemi","year":"2016"},{"key":"ref2","article-title":"Self-supervised learning by cross-modal audio-video clustering","author":"Alwassel","year":"2020","journal-title":"Advances in Neural Information Processing Systems, 33"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref4","article-title":"Soundnet: Learning sound representations from unlabeled video","volume":"29","author":"Aytar","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-021-02166-7"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2014.2336244"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1158\/2159-8290.CD-12-0095"},{"key":"ref10","first-page":"789","article-title":"Reconboost: Boosting can achieve modality reconcilement","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Chen","year":"2024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2020.3021387"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2023.3295489"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532011"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2022.12.014"},{"issue":"5","key":"ref15","first-page":"1287","article-title":"Effective unimodal learning approaches for multimodal systems","volume":"22","author":"Doe","year":"2020","journal-title":"IEEE Transactions on Multimedia"},{"key":"ref16","article-title":"Improving multimodal learning with uni-modal teachers","author":"Du","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01918"},{"key":"ref18","article-title":"Learning robust representations via multi-view information bottleneck","volume-title":"8 th International Conference on Learning Representations. OpenReview. net","author":"Federici","year":"2020"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01455"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/IVCNZ51579.2020.9290622"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01047"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02005"},{"key":"ref23","article-title":"Learning deep representations by mutual information estimation and maximization","author":"Hjelm","year":"2018","journal-title":"arXiv preprint arXiv"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.389"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00947"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683898"},{"key":"ref27","article-title":"Multimodal learning and reasoning for visual question answering","author":"Ilievski","year":"2017","journal-title":"Advances in Neural Information Processing Systems. Curran Associates, Inc."},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/s12652-019-01239-9"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2823900"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00559"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i4.25643"},{"key":"ref32","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"Korbar","year":"2018","journal-title":"arXiv preprint arXiv"},{"key":"ref33","first-page":"1513","article-title":"A variational information bottleneck approach to multi-omics data integration","volume-title":"International Conference on Artificial Intelligence and Statistics","author":"Lee","year":"2021"},{"key":"ref34","article-title":"Hierarchical optimal transport for multimodal distribution alignment","author":"Lee","year":"2019","journal-title":"Advances in neural information processing systems, 32"},{"key":"ref35","first-page":"567","article-title":"Diagnosing and re-learning for balanced multimodal learning","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV)","author":"Lee","year":"2024"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73235-5_10"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.797"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2832198"},{"key":"ref39","article-title":"Quantifying & modeling feature interactions: An information decomposition framework","author":"Pu Liang","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32565"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32564"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02084"},{"key":"ref43","article-title":"Jarvisart: Liberating human artistic creativity via an intelligent photo retouching agent","author":"Lin","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref44","article-title":"Ir3d-bench: Evaluating visionlanguage model scene understanding as agentic inverse rendering","author":"Liu","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.7717\/peerj-cs.1400"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2501"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3171679"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178347"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01033"},{"key":"ref50","article-title":"Multimodal deep learning","author":"Ngiam","year":"2011","journal-title":"ICML"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw56347.2022.00504"},{"key":"ref52","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv preprint arXiv"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00269"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3390\/s21144927"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"ref56","article-title":"Audio-visual automatic speech recognition: An overview","volume":"22","author":"Potamianos","year":"2004","journal-title":"Issues in visual and audio-visual speech processing"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58542-6_13"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2691321"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICALT.2018.00057"},{"key":"ref60","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1007\/978-3-030-58598-3_33","article-title":"On modulating the gradient for metalearning","volume-title":"Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK","author":"Simon","year":"2020"},{"key":"ref61","first-page":"1023","article-title":"Mmpareto: Boosting multimodal learning with innocent unimodal assistance","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Smith","year":"2024"},{"key":"ref62","article-title":"Multimodal learning with deep boltzmann machines","author":"Srivastava","year":"2012","journal-title":"Advances in neural information processing systems, 25"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.775"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-018-0548-3"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.5114\/wo.2014.47136"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2016.2529665"},{"key":"ref68","first-page":"1024","article-title":"Joint training for multimodal data fusion","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Wang","year":"2020"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612001"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72684-2_5"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680916"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01936"},{"key":"ref74","article-title":"Audiovisual slowfast networks for video recognition","author":"Xiao","journal-title":"arXiv preprint arXiv"},{"key":"ref75","article-title":"Multimodal optimal transportbased co-attention transformer with global structure consistency for survival prediction","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Xu","year":"2023"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00256"},{"key":"ref77","volume-title":"Towards holistic multimodal interaction: An information-theoretic perspective","author":"Yang"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87193-2_56"},{"key":"ref79","first-page":"222","article-title":"Identify consistent imaging genomic biomarkers for characterizing the survivalassociated interactions between tumor-infiltrating lymphocytes and tumors","volume-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention","author":"Zuo","year":"2022"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445606.pdf?arnumber=11445606","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T04:56:53Z","timestamp":1777611413000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445606\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":79,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00044","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}