{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T22:28:57Z","timestamp":1772490537990,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Collaborative Innovation Major Project of Zhengzhou","award":["20XTZX06013"],"award-info":[{"award-number":["20XTZX06013"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302458"],"award-info":[{"award-number":["62302458"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680845","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"3362-3371","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["ImageBind3D: Image as Binding Step for Controllable 3D Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7378-1313","authenticated-orcid":false,"given":"Zhenqiang","family":"Li","sequence":"first","affiliation":[{"name":"Zhengzhou University, Zhengzhou, Henan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4974-6116","authenticated-orcid":false,"given":"Jie","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1170-4340","authenticated-orcid":false,"given":"Yangjie","family":"Cao","sequence":"additional","affiliation":[{"name":"Zhengzhou University, Zhengzhou, Henan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5704-6855","authenticated-orcid":false,"given":"Jiayi","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhengzhou University, Zhengzhou, Henan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8548-7644","authenticated-orcid":false,"given":"Runfeng","family":"Lv","sequence":"additional","affiliation":[{"name":"Zhengzhou University, Zhengzhou, Henan, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"International conference on machine learning. PMLR, 40--49","author":"Achlioptas Panos","year":"2018","unstructured":"Panos Achlioptas, Olga Diamanti, Ioannis Mitliagkas, and Leonidas Guibas. 2018. Learning representations and generative models for 3d point clouds. In International conference on machine learning. PMLR, 40--49."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02106"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"e_1_3_2_1_4_1","volume-title":"Orazio Gallo, Leonidas Guibas, Jonathan Tremblay, Sameh Khamis, Tero Karras, and Gordon Wetzstein.","author":"Chan Eric R.","year":"2021","unstructured":"Eric R. Chan, Connor Z. Lin, Matthew A. Chan, Koki Nagano, Boxiao Pan, Shalini De Mello, Orazio Gallo, Leonidas Guibas, Jonathan Tremblay, Sameh Khamis, Tero Karras, and Gordon Wetzstein. 2021. Efficient Geometry-aware 3D Generative Adversarial Networks. In arXiv."},{"key":"e_1_3_2_1_5_1","volume-title":"Shapenet: An information-rich 3d model repository. arXiv preprint arXiv:1512.03012","author":"Chang Angel X","year":"2015","unstructured":"Angel X Chang, Thomas Funkhouser, Leonidas Guibas, Pat Hanrahan, Qixing Huang, Zimo Li, Silvio Savarese, Manolis Savva, Shuran Song, Hao Su, et al. 2015. Shapenet: An information-rich 3d model repository. arXiv preprint arXiv:1512.03012 (2015)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3470848","article-title":"Sofgan: A portrait image generator with dynamic styling","volume":"41","author":"Chen Anpei","year":"2022","unstructured":"Anpei Chen, Ruiyang Liu, Ling Xie, Zhang Chen, Hao Su, and Jingyi Yu. 2022. Sofgan: A portrait image generator with dynamic styling. ACM Transactions on Graphics (TOG), Vol. 41, 1 (2022), 1--26.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612489"},{"key":"e_1_3_2_1_8_1","volume-title":"Text-to-3d using gaussian splatting. arXiv preprint arXiv:2309.16585","author":"Chen Zilong","year":"2023","unstructured":"Zilong Chen, Feng Wang, and Huaping Liu. 2023. Text-to-3d using gaussian splatting. arXiv preprint arXiv:2309.16585 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00609"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00431"},{"key":"e_1_3_2_1_11_1","volume-title":"Hyperdiffusion: Generating implicit neural fields with weight-space diffusion. arXiv preprint arXiv:2303.17015","author":"Erkocc Ziya","year":"2023","unstructured":"Ziya Erkocc, Fangchang Ma, Qi Shan, Matthias Nie\u00dfner, and Angela Dai. 2023. Hyperdiffusion: Generating implicit neural fields with weight-space diffusion. arXiv preprint arXiv:2303.17015 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and 200 FPS. arXiv preprint arXiv:2311.17245","author":"Fan Zhiwen","year":"2023","unstructured":"Zhiwen Fan, Kevin Wang, Kairun Wen, Zehao Zhu, Dejia Xu, and Zhangyang Wang. 2023. LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and 200 FPS. arXiv preprint arXiv:2311.17245 (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00542"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2017.00053"},{"key":"e_1_3_2_1_15_1","unstructured":"Jun Gao Tianchang Shen Zian Wang Wenzheng Chen Kangxue Yin Daiqing Li Or Litany Zan Gojcic and Sanja Fidler. 2022. GET3D: A Generative Model of High Quality 3D Textured Shapes Learned from Images. In Advances In Neural Information Processing Systems."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01008"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00467"},{"key":"e_1_3_2_1_19_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_20_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00594"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00270"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"e_1_3_2_1_24_1","volume-title":"Shap-e: Generating conditional 3d implicit functions. arXiv preprint arXiv:2305.02463","author":"Jun Heewoo","year":"2023","unstructured":"Heewoo Jun and Alex Nichol. 2023. Shap-e: Generating conditional 3d implicit functions. arXiv preprint arXiv:2305.02463 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01767"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_1_28_1","volume-title":"Text2video-zero: Text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439","author":"Khachatryan Levon","year":"2023","unstructured":"Levon Khachatryan, Andranik Movsisyan, Vahram Tadevosyan, Roberto Henschel, Zhangyang Wang, Shant Navasardyan, and Humphrey Shi. 2023. Text2video-zero: Text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Jong Hwan Ko, and Eunbyung Park","author":"Lee Joo Chan","year":"2023","unstructured":"Joo Chan Lee, Daniel Rho, Xiangyu Sun, Jong Hwan Ko, and Eunbyung Park. 2023. Compact 3D Gaussian Representation for Radiance Field. arXiv preprint arXiv:2311.13681 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings, Part XIV 16","author":"Li Xueting","year":"2020","unstructured":"Xueting Li, Sifei Liu, Kihwan Kim, Shalini De Mello, Varun Jampani, Ming-Hsuan Yang, and Jan Kautz. 2020. Self-supervised single-view 3d reconstruction via semantic consistency. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XIV 16. Springer, 677--693."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"e_1_3_2_1_32_1","volume-title":"Iss: Image as stetting stone for text-guided 3d shape generation. arXiv preprint arXiv:2209.04145","author":"Liu Zhengzhe","year":"2022","unstructured":"Zhengzhe Liu, Peng Dai, Ruihui Li, Xiaojuan Qi, and Chi-Wing Fu. 2022. Iss: Image as stetting stone for text-guided 3d shape generation. arXiv preprint arXiv:2209.04145 (2022)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01737"},{"key":"e_1_3_2_1_34_1","volume-title":"Inverse graphics gan: Learning to generate 3d shapes from unstructured 2d data. arXiv preprint arXiv:2002.12674","author":"Lunz Sebastian","year":"2020","unstructured":"Sebastian Lunz, Yingzhen Li, Andrew Fitzgibbon, and Nate Kushman. 2020. Inverse graphics gan: Learning to generate 3d shapes from unstructured 2d data. arXiv preprint arXiv:2002.12674 (2020)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00459"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul P. Srinivasan Matthew Tancik Jonathan T. Barron Ravi Ramamoorthi and Ren Ng. 2020. NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis. In ECCV.","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_2_1_37_1","volume-title":"Structurenet: Hierarchical graph networks for 3d shape generation. arXiv preprint arXiv:1908.00575","author":"Mo Kaichun","year":"2019","unstructured":"Kaichun Mo, Paul Guerrero, Li Yi, Hao Su, Peter Wonka, Niloy Mitra, and Leonidas J Guibas. 2019. Structurenet: Hierarchical graph networks for 3d shape generation. arXiv preprint arXiv:1908.00575 (2019)."},{"key":"e_1_3_2_1_38_1","volume-title":"T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453","author":"Mou Chong","year":"2023","unstructured":"Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, and Xiaohu Qie. 2023. T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530127"},{"key":"e_1_3_2_1_40_1","volume-title":"Point-e: A system for generating 3d point clouds from complex prompts. arXiv preprint arXiv:2212.08751","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Heewoo Jun, Prafulla Dhariwal, Pamela Mishkin, and Mark Chen. 2022. Point-e: A system for generating 3d point clouds from complex prompts. arXiv preprint arXiv:2212.08751 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Compressed 3D Gaussian Splatting for Accelerated Novel View Synthesis. arXiv preprint arXiv:2401.02436","author":"Niedermayr Simon","year":"2023","unstructured":"Simon Niedermayr, Josef Stumpfegger, and R\u00fcdiger Westermann. 2023. Compressed 3D Gaussian Splatting for Accelerated Novel View Synthesis. arXiv preprint arXiv:2401.02436 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01129"},{"key":"e_1_3_2_1_43_1","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1).","author":"Park Dong Huk","year":"2021","unstructured":"Dong Huk Park, Samaneh Azadi, Xihui Liu, Trevor Darrell, and Anna Rohrbach. 2021. Benchmark for compositional text-to-image synthesis. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)."},{"key":"e_1_3_2_1_44_1","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"DreamFusion: Text-to-3D using 2D Diffusion. arXiv","author":"Poole Ben","year":"2022","unstructured":"Ben Poole, Ajay Jain, Jonathan T. Barron, and Ben Mildenhall. 2022. DreamFusion: Text-to-3D using 2D Diffusion. arXiv (2022)."},{"key":"e_1_3_2_1_46_1","volume-title":"Hierarchical spatio-temporal decoupling for text-to-video generation. arXiv preprint arXiv:2312.04483","author":"Qing Zhiwu","year":"2023","unstructured":"Zhiwu Qing, Shiwei Zhang, Jiayu Wang, Xiang Wang, Yujie Wei, Yingya Zhang, Changxin Gao, and Nong Sang. 2023. Hierarchical spatio-temporal decoupling for text-to-video generation. arXiv preprint arXiv:2312.04483 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_1_48_1","volume-title":"Pivotal tuning for latent-based editing of real images. ACM Transactions on graphics (TOG)","author":"Roich Daniel","year":"2022","unstructured":"Daniel Roich, Ron Mokady, Amit H Bermano, and Daniel Cohen-Or. 2022. Pivotal tuning for latent-based editing of real images. ACM Transactions on graphics (TOG), Vol. 42, 1 (2022), 1--13."},{"key":"e_1_3_2_1_49_1","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models. arxiv: 2112.10752 [cs.CV]"},{"key":"e_1_3_2_1_50_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36479--36494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_51_1","volume-title":"Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation. arXiv preprint arXiv:2303.07937","author":"Seo Junyoung","year":"2023","unstructured":"Junyoung Seo, Wooseok Jang, Min-Seop Kwak, Jaehoon Ko, Hyeonsu Kim, Junho Kim, Jin-Hwa Kim, Jiyoung Lee, and Seungryong Kim. 2023. Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation. arXiv preprint arXiv:2303.07937 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al. 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)."},{"key":"e_1_3_2_1_53_1","volume-title":"Conference on Robot Learning. PMLR, 87--96","author":"Smith Edward J","year":"2017","unstructured":"Edward J Smith and David Meger. 2017. Improved adversarial systems for 3d object generation and reconstruction. In Conference on Robot Learning. PMLR, 87--96."},{"key":"e_1_3_2_1_54_1","volume-title":"International conference on machine learning. PMLR, 2256--2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International conference on machine learning. PMLR, 2256--2265."},{"key":"e_1_3_2_1_55_1","volume-title":"Generative modeling by estimating gradients of the data distribution. Advances in neural information processing systems","author":"Song Yang","year":"2019","unstructured":"Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_56_1","volume-title":"Dreamgaussian: Generative gaussian splatting for efficient 3d content creation. arXiv preprint arXiv:2309.16653","author":"Tang Jiaxiang","year":"2023","unstructured":"Jiaxiang Tang, Jiawei Ren, Hang Zhou, Ziwei Liu, and Gang Zeng. 2023. Dreamgaussian: Generative gaussian splatting for efficient 3d content creation. arXiv preprint arXiv:2309.16653 (2023)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01214"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01612"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00464"},{"key":"e_1_3_2_1_60_1","volume-title":"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721","author":"Ye Hu","year":"2023","unstructured":"Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721 (2023)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_62_1","volume-title":"Computer Graphics Forum","author":"Zheng Xinyang","unstructured":"Xinyang Zheng, Yang Liu, Pengshuai Wang, and Xin Tong. 2022. SDF-StyleGAN: Implicit SDF-Based StyleGAN for 3D Shape Generation. In Computer Graphics Forum, Vol. 41. Wiley Online Library, 52--63."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00577"},{"key":"e_1_3_2_1_64_1","volume-title":"Lafite2: Few-shot text-to-image generation. arXiv preprint arXiv:2210.14124","author":"Zhou Yufan","year":"2022","unstructured":"Yufan Zhou, Chunyuan Li, Changyou Chen, Jianfeng Gao, and Jinhui Xu. 2022. Lafite2: Few-shot text-to-image generation. arXiv preprint arXiv:2210.14124 (2022)."},{"key":"e_1_3_2_1_65_1","volume-title":"Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852","author":"Zhu Bin","year":"2023","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, HongFa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, et al. 2023. Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680845","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680845","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:08Z","timestamp":1750295888000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680845"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":65,"alternative-id":["10.1145\/3664647.3680845","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680845","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}