{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T19:59:19Z","timestamp":1779911959633,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":80,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714853","type":"proceedings-article","created":{"date-parts":[[2025,5,5]],"date-time":"2025-05-05T16:42:02Z","timestamp":1746463322000},"page":"3084-3095","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["InfoMAE: Pair-Efficient Cross-Modal Alignment for Multimodal Time-Series Sensing Signals"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4297-5865","authenticated-orcid":false,"given":"Tomoyoshi","family":"Kimura","sequence":"first","affiliation":[{"name":"University of Illinois, Urbana Champaign, Champaign, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3882-5166","authenticated-orcid":false,"given":"Xinlin","family":"Li","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7721-4854","authenticated-orcid":false,"given":"Osama","family":"Hanna","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8954-9109","authenticated-orcid":false,"given":"Yatong","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8178-8316","authenticated-orcid":false,"given":"Yizhuo","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Illinois, Urbana Champaign, Champaign, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2520-4941","authenticated-orcid":false,"given":"Denizhan","family":"Kara","sequence":"additional","affiliation":[{"name":"University of Illinois, Urbana Champaign, Champaign, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3085-1434","authenticated-orcid":false,"given":"Tianshi","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Illinois, Urbana Champaign, Champaign, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9285-9872","authenticated-orcid":false,"given":"Jinyang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Illinois, Urbana Champaign, Champaign, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0710-0963","authenticated-orcid":false,"given":"Xiaomin","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7643-7239","authenticated-orcid":false,"given":"Shengzhong","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3782-9192","authenticated-orcid":false,"given":"Mani","family":"Srivastava","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7313-9861","authenticated-orcid":false,"given":"Suhas","family":"Diggavi","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3883-7220","authenticated-orcid":false,"given":"Tarek","family":"Abdelzaher","sequence":"additional","affiliation":[{"name":"University of Illinois, Urbana Champaign, Champaign, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"9758","article-title":"Selfsupervised learning by cross-modal audio-video clustering","volume":"33","author":"Alwassel H.","year":"2020","unstructured":"H. Alwassel, D. Mahajan, B. Korbar, L. Torresani, B. Ghanem, and D. Tran. Selfsupervised learning by cross-modal audio-video clustering. Advances in Neural Information Processing Systems, 33:9758--9770, 2020.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01394"},{"issue":"3","key":"e_1_3_2_1_3_1","first-page":"1","article-title":"Eeg-based emotion recognition","volume":"56","author":"Bos D. O.","year":"2006","unstructured":"D. O. Bos et al. Eeg-based emotion recognition. The Influence of Visual and Auditory Stimuli, 56(3):1--17, 2006.","journal-title":"The Influence of Visual and Auditory Stimuli"},{"key":"e_1_3_2_1_4_1","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown T.","year":"2020","unstructured":"T. Brown, B. Mann, N. Ryder, M. Subbiah, J. D. Kaplan, P. Dhariwal, A. Neelakantan, P. Shyam, G. Sastry, A. Askell, S. Agarwal, A. Herbert-Voss, G. Krueger, T. Henighan, R. Child, A. Ramesh, D. Ziegler, J.Wu, C. Winter, C. Hesse, M. Chen, E. Sigler, M. Litwin, S. Gray, B. Chess, J. Clark, C. Berner, S. McCandlish, A. Radford, I. Sutskever, and D. Amodei. Language models are few-shot learners. In H. Larochelle, M. Ranzato, R. Hadsell, M. Balcan, and H. Lin, editors, Advances in Neural Information Processing Systems, volume 33, pages 1877--1901. Curran Associates, Inc., 2020."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Chen T.","year":"2020","unstructured":"T. Chen, S. Kornblith, M. Norouzi, and G. Hinton. A simple framework for contrastive learning of visual representations. In International Conference on Machine Learning (ICML), 2020."},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Chen T.","year":"2020","unstructured":"T. Chen, S. Kornblith, M. Norouzi, and G. Hinton. A simple framework for contrastive learning of visual representations. In International Conference on Machine Learning (ICML), 2020."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01617"},{"key":"e_1_3_2_1_10_1","volume-title":"John Wiley & Sons","author":"Cover T. M.","year":"1999","unstructured":"T. M. Cover. Elements of information theory. John Wiley & Sons, 1999."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550316"},{"key":"e_1_3_2_1_12_1","first-page":"13800","article-title":"Actionsense: A multimodal dataset and recording framework for human activities using wearable sensors in a kitchen environment","volume":"35","author":"DelPreto J.","year":"2022","unstructured":"J. DelPreto, C. Liu, Y. Luo, M. Foshey, Y. Li, A. Torralba, W. Matusik, and D. Rus. Actionsense: A multimodal dataset and recording framework for human activities using wearable sensors in a kitchen environment. Advances in Neural Information Processing Systems, 35:13800--13813, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin J.","year":"2018","unstructured":"J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/324"},{"key":"e_1_3_2_1_15_1","volume-title":"Selfsupervised contrastive representation learning for semi-supervised time-series classification. arXiv preprint arXiv:2208.06616","author":"Eldele E.","year":"2022","unstructured":"E. Eldele, M. Ragab, Z. Chen, M. Wu, C. K. Kwoh, X. Li, and C. Guan. Selfsupervised contrastive representation learning for semi-supervised time-series classification. arXiv preprint arXiv:2208.06616, 2022."},{"key":"e_1_3_2_1_16_1","first-page":"01","article-title":"Common information is far less than mutual information","volume":"2","author":"Gacs P.","year":"1973","unstructured":"P. Gacs and J. K\u00f6rner. Common information is far less than mutual information. Problems of Control and Information Theory, 2, 01 1973.","journal-title":"Problems of Control and Information Theory"},{"key":"e_1_3_2_1_17_1","volume-title":"The Eleventh International Conference on Learning Representations","author":"Gong Y.","year":"2022","unstructured":"Y. Gong, A. Rouditchenko, A. H. Liu, D. Harwath, L. Karlinsky, H. Kuehne, and J. R. Glass. Contrastive audio-visual masked autoencoder. In The Eleventh International Conference on Learning Representations, 2022."},{"key":"e_1_3_2_1_18_1","volume-title":"Bootstrap your own latent-a new approach to self-supervised learning. Advances in neural information processing systems, 33:21271--21284","author":"Grill J.-B.","year":"2020","unstructured":"J.-B. Grill, F. Strub, F. Altch\u00e9, C. Tallec, P. Richemond, E. Buchatskaya, C. Doersch, B. Avila Pires, Z. Guo, M. Gheshlaghi Azar, et al. Bootstrap your own latent-a new approach to self-supervised learning. Advances in neural information processing systems, 33:21271--21284, 2020."},{"key":"e_1_3_2_1_19_1","first-page":"36","article-title":"Siamese masked autoencoders","author":"Gupta A.","year":"2024","unstructured":"A. Gupta, J. Wu, J. Deng, and F.-F. Li. Siamese masked autoencoders. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT54713.2023.10206887"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT57864.2024.10619484"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT50566.2022.9834790"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_24_1","volume-title":"Disentangling by partitioning: A representation learning framework for multimodal sensory data. arXiv preprint arXiv:1805.11264","author":"Hsu W.-N.","year":"2018","unstructured":"W.-N. Hsu and J. Glass. Disentangling by partitioning: A representation learning framework for multimodal sensory data. arXiv preprint arXiv:1805.11264, 2018."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"e_1_3_2_1_26_1","first-page":"28708","article-title":"Masked autoencoders that listen","volume":"35","author":"Huang P.-Y.","year":"2022","unstructured":"P.-Y. Huang, H. Xu, J. Li, A. Baevski, M. Auli, W. Galuba, F. Metze, and C. Feichtenhofer. Masked autoencoders that listen. Advances in Neural Information Processing Systems, 35:28708--28720, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","first-page":"29406","article-title":"Learning with noisy correspondence for cross-modal matching","volume":"34","author":"Huang Z.","year":"2021","unstructured":"Z. Huang, G. Niu, X. Liu, W. Ding, X. Xiao, H. Wu, and X. Peng. Learning with noisy correspondence for cross-modal matching. Advances in Neural Information Processing Systems, 34:29406--29419, 2021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_28_1","first-page":"7661","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Jiang Q.","year":"2023","unstructured":"Q. Jiang, C. Chen, H. Zhao, L. Chen, Q. Ping, S. D. Tran, Y. Xu, B. Zeng, and T. Chilimbi. Understanding and constructing latent modality structures in multimodal representation learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 7661--7671, 2023."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3666025.3699325"},{"key":"e_1_3_2_1_30_1","volume-title":"The World Wide Web Conference","author":"Kara D.","year":"2024","unstructured":"D. Kara, T. Kimura, L. Shengzhong, L. Jinyang, L. Dongxin, W. Tianshi, W. Ruijie, C. Yizhuo, H. Yigong, and A. Tarek. Freqmae: Frequency-aware masked autoencoder for multi-modal iot sensing. In The World Wide Web Conference, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Kim H.","year":"2018","unstructured":"H. Kim and A. Mnih. Disentangling by factorising. In International Conference on Machine Learning (ICML), 2018."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/MASS62177.2024.00014"},{"key":"e_1_3_2_1_33_1","first-page":"36","article-title":"Gacs-korner common information variational autoencoder","author":"Kleinman M.","year":"2024","unstructured":"M. Kleinman, A. Achille, S. Soatto, and J. Kao. Gacs-korner common information variational autoencoder. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00765"},{"key":"e_1_3_2_1_35_1","first-page":"31","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"Korbar B.","year":"2018","unstructured":"B. Korbar, D. Tran, and L. Torresani. Cooperative learning of audio and video models from self-supervised synchronization. Advances in Neural Information Processing Systems, 31, 2018.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCCN61486.2024.10637580"},{"key":"e_1_3_2_1_37_1","first-page":"36","article-title":"Factorized contrastive learning: Going beyond multi-view redundancy","author":"Liang P. P.","year":"2023","unstructured":"P. P. Liang, Z. Deng, M. Q. Ma, J. Y. Zou, L.-P. Morency, and R. Salakhutdinov. Factorized contrastive learning: Going beyond multi-view redundancy. Advances in Neural Information Processing Systems, 36, 2023.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_38_1","unstructured":"P. P. Liang Y. Lyu X. Fan Z. Wu Y. Cheng J. Wu L. Chen P. Wu M. A. Lee Y. Zhu et al. Multibench: Multiscale benchmarks for multimodal representation learning."},{"key":"e_1_3_2_1_39_1","first-page":"17612","article-title":"Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning","volume":"35","author":"Liang V. W.","year":"2022","unstructured":"V. W. Liang, Y. Zhang, Y. Kwon, S. Yeung, and J. Y. Zou. Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning. Advances in Neural Information Processing Systems, 35:17612--17625, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","first-page":"36","article-title":"Focal: Contrastive learning for multimodal time-series sensing signals in factorized orthogonal latent space","author":"Liu S.","year":"2023","unstructured":"S. Liu, T. Kimura, D. Liu, R. Wang, J. Li, S. Diggavi, M. Srivastava, and T. Abdelzaher. Focal: Contrastive learning for multimodal time-series sensing signals in factorized orthogonal latent space. Advances in Neural Information Processing Systems, 36, 2023.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_42_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Loshchilov I.","year":"2017","unstructured":"I. Loshchilov and F. Hutter. SGDR: Stochastic gradient descent with warm restarts. In International Conference on Learning Representations (ICLR), 2017."},{"key":"e_1_3_2_1_43_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Loshchilov I.","year":"2019","unstructured":"I. Loshchilov and F. Hutter. Decoupled weight decay regularization. In International Conference on Learning Representations (ICLR), 2019."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01764"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16330"},{"key":"e_1_3_2_1_46_1","article-title":"Multimodal information bottleneck: Learning minimal sufficient unimodal and multimodal representations","author":"Mai S.","year":"2022","unstructured":"S. Mai, Y. Zeng, and H. Hu. Multimodal information bottleneck: Learning minimal sufficient unimodal and multimodal representations. IEEE Transactions on Multimedia, 2022.","journal-title":"IEEE Transactions on Multimedia"},{"key":"e_1_3_2_1_47_1","first-page":"36","article-title":"4m: Massively multimodal masked modeling","author":"Mizrahi D.","year":"2024","unstructured":"D. Mizrahi, R. Bachmann, O. Kar, T. Yeo, M. Gao, A. Dehghan, and A. Zamir. 4m: Massively multimodal masked modeling. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","first-page":"4733","article-title":"Learning representations from audiovisual spatial alignment","volume":"33","author":"Morgado P.","year":"2020","unstructured":"P. Morgado, Y. Li, and N. Nvasconcelos. Learning representations from audiovisual spatial alignment. Advances in Neural Information Processing Systems, 33:4733--4744, 2020.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2010.2068870"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3495243.3560519"},{"issue":"1","key":"e_1_3_2_1_52_1","first-page":"2023","article-title":"Multimodal sensor fusion in the latent representation space","volume":"13","author":"Piechocki R. J.","year":"2005","unstructured":"R. J. Piechocki, X. Wang, and M. J. Bocus. Multimodal sensor fusion in the latent representation space. Scientific Reports, 13(1):2005, 2023.","journal-title":"Scientific Reports"},{"key":"e_1_3_2_1_53_1","volume-title":"Comir: Contrastive multimodal image representation for registration. Advances in neural information processing systems, 33:18433--18444","author":"Pielawski N.","year":"2020","unstructured":"N. Pielawski, E.Wetzer, J. \u00d6fverstedt, J. Lu, C. W\u00e4hlby, J. Lindblad, and N. Sladoje. Comir: Contrastive multimodal image representation for registration. Advances in neural information processing systems, 33:18433--18444, 2020."},{"key":"e_1_3_2_1_54_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Poklukar P.","year":"2022","unstructured":"P. Poklukar, M. Vasco, H. Yin, F. S. Melo, A. Paiva, and D. Kragic. Geometric multimodal contrastive representation learning. In International Conference on Machine Learning (ICML), 2022."},{"key":"e_1_3_2_1_55_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford A.","year":"2021","unstructured":"A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, et al. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML), 2021."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISWC.2012.13"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3242985"},{"key":"e_1_3_2_1_58_1","volume-title":"International Conference on Learning Representations","author":"Shen S.","year":"2021","unstructured":"S. Shen, L. H. Li, H. Tan, M. Bansal, A. Rohrbach, K.-W. Chang, Z. Yao, and K. Keutzer. Howmuch can clip benefit vision-and-language tasks? In International Conference on Learning Representations, 2021."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10463-011-0343-8"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2021.3128187"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/PERCOM.2016.7456521"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"e_1_3_2_1_63_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Tonekaboni S.","year":"2021","unstructured":"S. Tonekaboni, D. Eytan, and A. Goldenberg. Unsupervised representation learning for time series with temporal neighborhood coding. In International Conference on Learning Representations (ICLR), 2021."},{"key":"e_1_3_2_1_64_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, 35:10078--10093","author":"Tong Z.","year":"2022","unstructured":"Z. Tong, Y. Song, J. Wang, and L. Wang. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, 35:10078--10093, 2022."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.528"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00131"},{"key":"e_1_3_2_1_67_1","volume-title":"International Conference on Learning Representations","author":"Tsai Y.-H. H.","year":"2018","unstructured":"Y.-H. H. Tsai, P. P. Liang, A. Zadeh, L.-P. Morency, and R. Salakhutdinov. Learning factorized multimodal representations. In International Conference on Learning Representations, 2018."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683133"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01524"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25373"},{"key":"e_1_3_2_1_71_1","first-page":"22099","article-title":"Connecting multi-modal contrastive representations","volume":"36","author":"Wang Z.","year":"2023","unstructured":"Z. Wang, Y. Zhao, H. Huang, J. Liu, A. Yin, L. Tang, L. Li, Y. Wang, Z. Zhang, and Z. Zhao. Connecting multi-modal contrastive representations. Advances in Neural Information Processing Systems, 36:22099--22114, 2023.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1975.1055346"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"e_1_3_2_1_74_1","volume-title":"International Conference on Learning Representations","author":"Yang Y.","year":"2023","unstructured":"Y. Yang, X. Liu, J. Wu, S. Borac, D. Katabi, M.-Z. Poh, and D. McDuff. Simper: Simple self-supervised learning of periodic targets. In International Conference on Learning Representations, 2023."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1561\/0100000122"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i8.20881"},{"key":"e_1_3_2_1_77_1","first-page":"27127","article-title":"How mask matters: Towards theoretical understandings of masked autoencoders","volume":"35","author":"Zhang Q.","year":"2022","unstructured":"Q. Zhang, Y. Wang, and Y. Wang. How mask matters: Towards theoretical understandings of masked autoencoders. Advances in Neural Information Processing Systems, 35:27127--27139, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_78_1","volume-title":"Neural Information Processing Systems (NeurIPS)","author":"Zhang X.","year":"2022","unstructured":"X. Zhang, Z. Zhao, T. Tsiligkaridis, and M. Zitnik. Self-supervised contrastive pre-training for time series via time-frequency consistency. In Neural Information Processing Systems (NeurIPS), 2022."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2020.12.009"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00148"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714853","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714853","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:42Z","timestamp":1750295922000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714853"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":80,"alternative-id":["10.1145\/3696410.3714853","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714853","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}