{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T07:11:09Z","timestamp":1778051469106,"version":"3.51.4"},"reference-count":76,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,6]]},"DOI":"10.1109\/wacv61042.2026.00461","type":"proceedings-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T19:59:32Z","timestamp":1778011172000},"page":"4744-4754","source":"Crossref","is-referenced-by-count":0,"title":["Countering Multi-modal Representation Collapse through Rank-targeted Fusion"],"prefix":"10.1109","author":[{"given":"Seulgi","family":"Kim","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology,Atlanta,GA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kiran","family":"Kokilepersaud","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology,Atlanta,GA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mohit","family":"Prabhushankar","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology,Atlanta,GA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ghassan","family":"AlRegib","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology,Atlanta,GA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00560"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-71278-5_12"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1281"},{"key":"ref4","article-title":"Multimodal deep learning","author":"Akkus","year":"2023"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref6","article-title":"Vicreg: Variance-invariance-covariance regularization for self-supervised learning","author":"Bardes","year":"2021"},{"key":"ref7","article-title":"3d human motion anticipation and classification","author":"Barsoum","year":"2020"},{"key":"ref8","article-title":"On the efficacy of text-based input modalities for action anticipation","author":"Beedu","year":"2024"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_4"},{"key":"ref10","article-title":"Transitional uncertainty with layered intermediate predictions","author":"Benkert","year":"2024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2022.07.005"},{"key":"ref12","article-title":"A closer look at multimodal representation collapse","author":"Chaudhuri","year":"2025"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01531-2"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00369"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01302"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2021.3134105"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00635"},{"key":"ref20","first-page":"10929","article-title":"Rankme: Assessing the downstream performance of pretrained self-supervised representations by their rank","volume-title":"International conference on machine learning","author":"Garrido"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00306"},{"key":"ref23","article-title":"Actfusion: a unified diffusion model for action segmentation and anticipation","author":"Gong","year":"2024"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.10.016"},{"key":"ref26","first-page":"213","volume-title":"Fusenet: Incorporating depth into semantic segmentation via fusion-based cnn architecture","author":"Hazirbas"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3657582"},{"key":"ref29","article-title":"Pixel-bert: Aligning image pixels with text by deep multi-modal transformers","author":"Huang","year":"2020"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00237"},{"key":"ref31","article-title":"Understanding dimensional collapse in contrastive self-supervised learning","author":"Jing","year":"2021"},{"key":"ref32","article-title":"Hierarchical and multimodal data for daily activity understanding","author":"Kaviani","year":"2025"},{"key":"ref33","author":"Kaviani","year":"2025","journal-title":"Exploring human daily life through a hierarchical multimodal lens"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01016"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00370"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP55913.2025.11084357"},{"key":"ref37","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","volume-title":"International conference on machine learning","author":"Kim"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/BigData59044.2023.10386242"},{"key":"ref39","article-title":"Hex: Hierarchical emergence exploitation in self-supervised algorithms","author":"Kokilepersaud","year":"2024"},{"key":"ref40","article-title":"Adadim: Dimensionality adaptation for ssl representational dynamics","author":"Kokilepersaud","year":"2025"},{"key":"ref41","first-page":"792","article-title":"Learning spatio-temporal structure from rgb-d videos for human activity detection and anticipation","volume-title":"Proceedings of the 30th International Conference on Machine Learning","author":"Koppula"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.105"},{"key":"ref43","article-title":"Human action anticipation: A survey","author":"Lai","year":"2024"},{"key":"ref44","article-title":"Feature collapse","author":"Laurent","year":"2023"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.113"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00930"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.26599\/air.2023.9150019"},{"key":"ref49","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017"},{"key":"ref50","article-title":"Sgdr: Stochastic gradient descent with warm restarts","author":"Loshchilov","year":"2016"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-68276-1"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2023.3331010"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0669"},{"key":"ref54","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref55","first-page":"547","article-title":"On measures of entropy and information","volume-title":"Proceedings of the fourth Berkeley symposium on mathematical statistics and probability, volume 1: contributions to the theory of statistics","author":"R\u00e9nyi"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.5220\/0007379000002108"},{"key":"ref57","first-page":"606","article-title":"The effective rank: A measure of effective dimensionality","volume-title":"2007 15th European signal processing conference","author":"Roy"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_10"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"ref60","article-title":"Lidar: Sensing linear probing performance in joint embedding ssl architectures","author":"Thilak","year":"2023"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01188-y"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1515\/9781400889921"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_40"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58595-2_3"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2012.6239233"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1083"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP62443.2025.11204249"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.49"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3713070"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00254"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73001-6_26"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00631"},{"key":"ref74","article-title":"Matrix information theory for self-supervised learning","author":"Zhang","year":"2023"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3391692"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00601"}],"event":{"name":"2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2026,3,6]]},"end":{"date-parts":[[2026,3,10]]}},"container-title":["2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11491838\/11491925\/11492646.pdf?arnumber=11492646","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:12:52Z","timestamp":1778047972000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11492646\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":76,"URL":"https:\/\/doi.org\/10.1109\/wacv61042.2026.00461","relation":{},"subject":[],"published":{"date-parts":[[2026,3,6]]}}}