{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:11:19Z","timestamp":1765357879925,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754773","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"6500-6509","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["GaussianCross: Cross-modal Self-supervised 3D Representation Learning via Gaussian Splatting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0304-3056","authenticated-orcid":false,"given":"Lei","family":"Yao","sequence":"first","affiliation":[{"name":"Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8659-4724","authenticated-orcid":false,"given":"Yi","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8242-1581","authenticated-orcid":false,"given":"Yi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4530-2606","authenticated-orcid":false,"given":"Moyun","family":"Liu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4932-0593","authenticated-orcid":false,"given":"Lap-Pui","family":"Chau","sequence":"additional","affiliation":[{"name":"Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.170"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01840"},{"key":"e_1_3_2_1_3_1","volume-title":"European Conference on Computer Vision (ECCV). Springer, 370-386","author":"Chen Yuedong","year":"2024","unstructured":"Yuedong Chen, Haofei Xu, Chuanxia Zheng, Bohan Zhuang, Marc Pollefeys, Andreas Geiger, Tat-Jen Cham, and Jianfei Cai. 2024. Mvsplat: Efficient 3d gaussian splatting from sparse multi-view images. In European Conference on Computer Vision (ECCV). Springer, 370-386."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00319"},{"key":"e_1_3_2_1_5_1","volume-title":"Pointcept: A Codebase for Point Cloud Perception Research. https:\/\/github.com\/Pointcept\/Pointcept.","author":"Contributors Pointcept","year":"2023","unstructured":"Pointcept Contributors. 2023. Pointcept: A Codebase for Point Cloud Perception Research. https:\/\/github.com\/Pointcept\/Pointcept."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_1_7_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). Association for Computational Linguistics, 4171-4186."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681343"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000022288.19776.77"},{"key":"e_1_3_2_1_10_1","volume-title":"Submanifold sparse convolutional networks. arXiv preprint arXiv:1706.01307","author":"Graham Benjamin","year":"2017","unstructured":"Benjamin Graham and Laurens Van der Maaten. 2017. Submanifold sparse convolutional networks. arXiv preprint arXiv:1706.01307 (2017)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02094"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01533"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01474"},{"key":"e_1_3_2_1_16_1","volume-title":"ARKit LabelMaker: A New Scale for Indoor 3D Scene Understanding. arXiv preprint arXiv:2410.13924","author":"Ji Guangda","year":"2024","unstructured":"Guangda Ji, Silvan Weder, Francis Engelmann, Marc Pollefeys, and Hermann Blum. 2024. ARKit LabelMaker: A New Scale for Indoor 3D Scene Understanding. arXiv preprint arXiv:2410.13924 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00831"},{"key":"e_1_3_2_1_20_1","volume-title":"Spot-Compose: A Framework for Open-Vocabulary Object Retrieval and Drawer Manipulation in Point Clouds. Internationl Conference on Robotics and Automation Workshops (ICRAW)","author":"Lemke Oliver","year":"2024","unstructured":"Oliver Lemke, Zuria Bauer, Ren\u00e9 Zurbr\u00fcgg, Marc Pollefeys, Francis Engelmann, and Hermann Blum. 2024. Spot-Compose: A Framework for Open-Vocabulary Object Retrieval and Drawer Manipulation in Point Clouds. Internationl Conference on Robotics and Automation Workshops (ICRAW) (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Point Cloud Unsupervised Pre-training via 3D Gaussian Splatting. arXiv preprint arXiv:2411.18667","author":"Liu Hao","year":"2024","unstructured":"Hao Liu, Minglin Chen, Yanni Ma, Haihong Xiao, and Ying He. 2024. Point Cloud Unsupervised Pre-training via 3D Gaussian Splatting. arXiv preprint arXiv:2411.18667 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_1_23_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20086-1_35"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 28223-28243","author":"Qi Zekun","year":"2023","unstructured":"Zekun Qi, Runpei Dong, Guofan Fan, Zheng Ge, Xiangyu Zhang, Kaisheng Ma, and Li Yi. 2023. Contrast with reconstruct: Contrastive 3d representation learning guided by generative pretraining. In International Conference on Machine Learning. PMLR, 28223-28243."},{"key":"e_1_3_2_1_26_1","volume-title":"Pointnext: Revisiting pointnet with improved training and scaling strategies. Advances in Neural Information Processing Systems (NeurIPS)","author":"Qian Guocheng","year":"2022","unstructured":"Guocheng Qian, Yuchen Li, Houwen Peng, Jinjie Mai, Hasan Hammoud, Mohamed Elhoseiny, and Bernard Ghanem. 2022. Pointnext: Revisiting pointnet with improved training and scaling strategies. Advances in Neural Information Processing Systems (NeurIPS) (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"International conference on machine learning. 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. 8748-8763."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_8"},{"volume-title":"ACM siggraph 2006 papers.","author":"Snavely Noah","key":"e_1_3_2_1_29_1","unstructured":"Noah Snavely, Steven M Seitz, and Richard Szeliski. 2006. Photo tourism: exploring photo collections in 3D. In ACM siggraph 2006 papers. Vol. 25. 835-846."},{"key":"e_1_3_2_1_30_1","first-page":"5998","article-title":"Attention is all you need","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Processing Systems (NeurIPS). 5998-6008.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00470"},{"key":"e_1_3_2_1_32_1","volume-title":"European Conference on Computer Vision. Springer, 193-209","author":"Wang Jiaxu","year":"2024","unstructured":"Jiaxu Wang, Ziyi Zhang, Junhao He, and Renjing Xu. 2024c. PFGS: High Fidelity Point Cloud Rendering via Feature Splatting. In European Conference on Computer Vision. Springer, 193-209."},{"key":"e_1_3_2_1_33_1","volume-title":"Neus: Learning neural implicit surfaces by","author":"Wang Peng","year":"2021","unstructured":"Peng Wang, Lingjie Liu, Yuan Liu, Christian Theobalt, Taku Komura, and Wenping Wang. 2021. Neus: Learning neural implicit surfaces by volume rendering for multi-view reconstruction. arXiv preprint arXiv:2106.10689 (2021)."},{"key":"e_1_3_2_1_34_1","volume-title":"FreeSplat: Generalizable 3D Gaussian Splatting Towards Free-View Synthesis of Indoor Scenes. Advances in Neural Information Processing Systems (NeurIPS)","author":"Wang Yunsong","year":"2024","unstructured":"Yunsong Wang, Tianxin Huang, Hanlin Chen, and Gim Hee Lee. 2024a. FreeSplat: Generalizable 3D Gaussian Splatting Towards Free-View Synthesis of Indoor Scenes. Advances in Neural Information Processing Systems (NeurIPS) (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02067"},{"key":"e_1_3_2_1_36_1","volume-title":"Point transformer v2: Grouped vector attention and partition-based pooling. Advances in Neural Information Processing Systems (NeurIPS)","author":"Wu Xiaoyang","year":"2022","unstructured":"Xiaoyang Wu, Yixing Lao, Li Jiang, Xihui Liu, and Hengshuang Zhao. 2022. Point transformer v2: Grouped vector attention and partition-based pooling. Advances in Neural Information Processing Systems (NeurIPS) (2022)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01849"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00908"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_34"},{"key":"e_1_3_2_1_40_1","volume-title":"SGIFormer: Semantic-guided and geometric-enhanced interleaving transformer for 3D instance segmentation","author":"Yao Lei","year":"2024","unstructured":"Lei Yao, Yi Wang, Moyun Liu, and Lap-Pui Chau. 2024. SGIFormer: Semantic-guided and geometric-enhanced interleaving transformer for 3D instance segmentation. IEEE Transactions on Circuits and Systems for Video Technology (2024)."},{"key":"e_1_3_2_1_41_1","first-page":"162","volume-title":"European Conference on Computer Vision (ECCV)","volume":"15087","author":"Ye Mingqiao","year":"2024","unstructured":"Mingqiao Ye, Martin Danelljan, Fisher Yu, and Lei Ke. 2024. Gaussian Grouping: Segment and Edit Anything in 3D Scenes. In European Conference on Computer Vision (ECCV), Vol. 15087. Springer, 162-179."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28501"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01871"},{"key":"e_1_3_2_1_44_1","first-page":"57","volume-title":"European Conference on Computer Vision (ECCV)","volume":"15060","author":"Yue Yuanwen","year":"2024","unstructured":"Yuanwen Yue, Anurag Das, Francis Engelmann, Siyu Tang, and Jan Eric Lenssen. 2024. Improving 2d feature representations by 3d-aware fine-tuning. In European Conference on Computer Vision (ECCV), Vol. 15060. Springer, 57-74."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01595"},{"key":"e_1_3_2_1_46_1","volume-title":"A survey of embodied learning for object-centric robotic manipulation. Machine Intelligence Research","author":"Zheng Ying","year":"2025","unstructured":"Ying Zheng, Lei Yao, Yuejiao Su, Yi Zhang, Yi Wang, Sicheng Zhao, Yiyi Zhang, and Lap-Pui Chau. 2025. A survey of embodied learning for object-centric robotic manipulation. Machine Intelligence Research (2025), 1-39."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02048"},{"key":"e_1_3_2_1_48_1","unstructured":"Haoyi Zhu Honghui Yang Xiaoyang Wu Di Huang Sha Zhang Xianglong He Tong He Hengshuang Zhao Chunhua Shen Yu Qiao et al. 2023. Ponderv2: Pave the way for 3d foundataion model with a universal pre-training paradigm. arXiv preprint arXiv:2310.08586 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3754549"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754773","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:58:14Z","timestamp":1765342694000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754773"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":49,"alternative-id":["10.1145\/3746027.3754773","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754773","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}