{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:25Z","timestamp":1755825025215,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","funder":[{"name":"Natural Science Foundation of China under Grants","award":["62472178,62376271,U21A20515, U22B2034, 62365014"],"award-info":[{"award-number":["62472178,62376271,U21A20515, U22B2034, 62365014"]}]},{"name":"Natural Science Foundation of Chongqing, China","award":["No.CSTB2022NSCQ-MSX0552"],"award-info":[{"award-number":["No.CSTB2022NSCQ-MSX0552"]}]},{"DOI":"10.13039\/501100006374","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the Open Projects Program of StateKey Laboratory of Multimodal Artificial Intelligence Systems","award":["No.MAIS2024111"],"award-info":[{"award-number":["No.MAIS2024111"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733257","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"487-496","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["3D Scene Graph Generation with Cross-Modal Alignment and Adversarial Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5449-1087","authenticated-orcid":false,"given":"Yujun","family":"Hu","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2292-1286","authenticated-orcid":false,"given":"Xiaoyu","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, East China University of Science and Technology, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8604-8495","authenticated-orcid":false,"given":"Changbo","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Data Science and Engineering, East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3221-4981","authenticated-orcid":false,"given":"Weiliang","family":"Meng","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8365-0970","authenticated-orcid":false,"given":"Gaoqi","family":"He","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, East China Normal University, Shanghai, China and Chongqing Key Laboratory of Precision Optics, Chongqing Institute of East China Normal University, Chongqing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw63382.2024.00328"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-031--26889--2_13"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i1.27801"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.lwt.2021.112931"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022--1369--5"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02632"},{"key":"e_1_3_2_1_7_1","volume-title":"Reasoning3D--Grounding and Reasoning in 3D: Fine-Grained Zero-Shot Open-Vocabulary 3D Reasoning Part Segmentation via Large Vision-Language Models. arXiv preprint arXiv:2405.19326","author":"Chen Tianrun","year":"2024","unstructured":"Tianrun Chen, Chunan Yu, Jing Li, Jianqi Zhang, Lanyun Zhu, Deyi Ji, Yong Zhang, Ying Zang, Zejian Li, and Lingyun Sun. 2024b. Reasoning3D--Grounding and Reasoning in 3D: Fine-Grained Zero-Shot Open-Vocabulary 3D Reasoning Part Segmentation via Large Vision-Language Models. arXiv preprint arXiv:2405.19326 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00632"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01882"},{"key":"e_1_3_2_1_11_1","first-page":"131676","article-title":"Phylogen: language model-enhanced phylogenetic inference via graph structure generation","volume":"37","author":"Duan ChenRui","year":"2024","unstructured":"ChenRui Duan, Zelin Zang, Siyuan Li, Yongjie Xu, and Stan Z Li. 2024. Phylogen: language model-enhanced phylogenetic inference via graph structure generation. Advances in Neural Information Processing Systems, Vol. 37 (2024), 131676--131703.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","volume-title":"Mitigate the Gap: Improving Cross-Modal Alignment in CLIP. In The Thirteenth International Conference on Learning Representations.","author":"Eslami Sajad","year":"2024","unstructured":"Sajad Eslami and Gerard de Melo. 2024. Mitigate the Gap: Improving Cross-Modal Alignment in CLIP. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/wacv51458.2022.00235"},{"key":"e_1_3_2_1_14_1","volume-title":"Image scene graph generation (sgg) benchmark. arXiv preprint arXiv:2107.12604","author":"Han Xiaotian","year":"2021","unstructured":"Xiaotian Han, Jianwei Yang, Houdong Hu, Lei Zhang, Jianfeng Gao, and Pengchuan Zhang. 2021. Image scene graph generation (sgg) benchmark. arXiv preprint arXiv:2107.12604 (2021)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19971"},{"key":"e_1_3_2_1_16_1","first-page":"96483","article-title":"Classification done right for vision-language pre-training","volume":"37","author":"Huang Zilong","year":"2024","unstructured":"Zilong Huang, Qinghao Ye, Bingyi Kang, Jiashi Feng, and Haoqi Fan. 2024. Classification done right for vision-language pre-training. Advances in Neural Information Processing Systems, Vol. 37 (2024), 96483--96504.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i11.29182"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01790"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-023-00876--4"},{"key":"e_1_3_2_1_20_1","volume-title":"Fine-tuning CLIP Text Encoders with Two-step Paraphrasing. arXiv preprint arXiv:2402.15120","author":"Kim Hyunjae","year":"2024","unstructured":"Hyunjae Kim, Seunghyun Yoon, Trung Bui, Handong Zhao, Quan Tran, Franck Dernoncourt, and Jaewoo Kang. 2024a. Fine-tuning CLIP Text Encoders with Two-step Paraphrasing. arXiv preprint arXiv:2402.15120 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2401.09786"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01345"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610029"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.140"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01830"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01096"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01887"},{"key":"e_1_3_2_1_28_1","first-page":"140903","article-title":"Multi-modal situated reasoning in 3d scenes","volume":"37","author":"Linghu Xiongkun","year":"2024","unstructured":"Xiongkun Linghu, Jiangyong Huang, Xuesong Niu, Xiaojian Shawn Ma, Baoxiong Jia, and Siyuan Huang. 2024. Multi-modal situated reasoning in 3d scenes. Advances in Neural Information Processing Systems, Vol. 37 (2024), 140903--140936.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01007"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/tvcg.2022.3219451"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/iccvw60793.2023.00013"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/access.2024.3424230"},{"key":"e_1_3_2_1_33_1","volume-title":"European Conference on Computer Vision. Springer, 127--150","author":"Miao Yang","year":"2024","unstructured":"Yang Miao, Francis Engelmann, Olga Vysotska, Federico Tombari, Marc Pollefeys, and D\u00e1niel B\u00e9la Bar\u00e1th. 2024. SceneGraphLoc: Cross-Modal Coarse Visual Localization on 3D Scene Graphs. In European Conference on Computer Vision. Springer, 127--150."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10096516"},{"key":"e_1_3_2_1_35_1","volume-title":"The Thirty Sixth Annual Conference on Learning Theory. PMLR","author":"Parulekar Advait","year":"2023","unstructured":"Advait Parulekar, Liam Collins, Karthikeyan Shanmugam, Aryan Mokhtari, and Sanjay Shakkottai. 2023. Infonce loss provably learns cluster-preserving representations. In The Thirty Sixth Annual Conference on Learning Theory. PMLR, 1914--1961."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/tmi.2024.3444279"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i6.16636"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i8.20800"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102601"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s40823-020-00055--8"},{"key":"e_1_3_2_1_41_1","first-page":"46830","article-title":"Image captioners are scalable vision learners too","volume":"36","author":"Tschannen Michael","year":"2023","unstructured":"Michael Tschannen, Manoj Kumar, Andreas Steiner, Xiaohua Zhai, Neil Houlsby, and Lucas Beyer. 2023. Image captioners are scalable vision learners too. Advances in Neural Information Processing Systems, Vol. 36 (2023), 46830--46855.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00775"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00402"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6904"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02065"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3331583"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25435"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2023.103570"},{"key":"e_1_3_2_1_51_1","volume-title":"Multi-modal Semantic Understanding with Contrastive Cross-modal Feature Alignment. arXiv preprint arXiv:2403.06355","author":"Zhang Ming","year":"2024","unstructured":"Ming Zhang, Ke Chang, and Yunfang Wu. 2024a. Multi-modal Semantic Understanding with Contrastive Cross-modal Feature Alignment. arXiv preprint arXiv:2403.06355 (2024)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.5555\/3540261.3541684"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00285"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679719"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","unstructured":"Xiaoshui Zhu Rongkai Zhang Boyuan He et al. 2022. PointCLIP V2: Adapting CLIP for Powerful 3D Open-World Learning. arXiv preprint arXiv:2211.11682 Vol. 3 4 (2022). https:\/\/doi.org\/10.1109\/ICCV51070.2023.00249","DOI":"10.1109\/ICCV51070.2023.00249"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561548"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733257","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:05:00Z","timestamp":1755749100000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733257"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":56,"alternative-id":["10.1145\/3731715.3733257","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733257","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}