{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:55:24Z","timestamp":1781538924627,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62176024"],"award-info":[{"award-number":["62176024"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810718","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"337-346","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MMRet3D: A Multi-Modal Matching Framework for 3D Object Retrieval from Multi-View Images"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0335-2469","authenticated-orcid":false,"given":"Zeyu","family":"Li","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3204-6527","authenticated-orcid":false,"given":"Lei","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00272"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58542-6_36"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01289"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01289"},{"key":"e_1_3_3_1_6_2","unstructured":"Gwangbin Bae Ignas Budvytis and Roberto Cipolla. 2022. Irondepth: Iterative refinement of single-view depth using surface normal and its uncertainty. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.03676 (2022)."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00911"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.543"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33783-3_44"},{"key":"e_1_3_3_1_10_2","unstructured":"Angel\u00a0X Chang Thomas Funkhouser Leonidas Guibas Pat Hanrahan Qixing Huang Zimo Li Silvio Savarese Manolis Savva Shuran Song Hao Su et\u00a0al. 2015. Shapenet: An information-rich 3D model repository. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1512.03012 (2015)."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Rui Chen Yongwei Chen Ningxin Jiao and Kui Jia. 2023. Fantasia3d: Disentangling geometry and appearance for high-quality text-to-3d content creation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.13873 (2023).","DOI":"10.1109\/ICCV51070.2023.02033"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Zoey Chen Aaron Walsman Marius Memmel Kaichun Mo Alex Fang Karthikeya Vemuri Alan Wu Dieter Fox and Abhishek Gupta. 2024. Urdformer: A pipeline for constructing articulated simulation environments from real-world images. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.11656 (2024).","DOI":"10.15607\/RSS.2024.XX.124"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_3_1_14_2","unstructured":"Tianyuan Dai Josiah Wong Yunfan Jiang Chen Wang Cem Gokmen Ruohan Zhang Jiajun Wu and Li Fei-Fei. 2024. Automated creation of digital cousins for robust policy learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.07408 (2024)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_3_1_16_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et\u00a0al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01061"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Muzhi Han Zeyu Zhang Ziyuan Jiao Xu Xie Yixin Zhu Song-Chun Zhu and Hangxin Liu. 2022. Scene reconstruction with functional objects for robot autonomy. International Journal of Computer Vision 130 12 (2022) 2940\u20132961.","DOI":"10.1007\/s11263-022-01670-0"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3726302.3729925"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02469"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Yining Hong Haoyu Zhen Peihao Chen Shuhong Zheng Yilun Du Zhenfang Chen and Chuang Gan. 2023. 3d-llm: Injecting the 3D world into large language models. Advances in Neural Information Processing Systems 36 (2023) 20482\u201320494.","DOI":"10.52202\/075280-0900"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Mu Hu Wei Yin Chi Zhang Zhipeng Cai Xiaoxiao Long Hao Chen Kaixuan Wang Gang Yu Chunhua Shen and Shaojie Shen. 2024. Metric3d v2: A versatile monocular geometric foundation model for zero-shot metric depth and surface normal estimation. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024).","DOI":"10.1109\/TPAMI.2024.3444912"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00873"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_12"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02202"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Asako Kanezaki Yasuyuki Matsushita and Yoshifumi Nishida. 2019. Rotationnet for joint object categorization and unsupervised pose estimation from multi-view images. IEEE transactions on pattern analysis and machine intelligence 43 1 (2019) 269\u2013283.","DOI":"10.1109\/TPAMI.2019.2922640"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Misong Kim SeoungJae Jeong Hyoseok Hwang and HyeongYeop Kang. 2025. 3D Indoor Scene Generation via Diffusion with User Interaction Awareness. Journal of the Korea Computer Graphics Society 31 3 (2025) 119\u2013126.","DOI":"10.15701\/kcgs.2025.31.3.119"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00497"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Changhao Li Junfu Guo Ruizhen Hu and Ligang Liu. 2023. Online Scene CAD Recomposition via Autonomous Scanning. ACM Transactions on Graphics (TOG) 42 6 (2023) 1\u201316.","DOI":"10.1145\/3618339"},{"key":"e_1_3_3_1_32_2","unstructured":"Benlin Liu Yuhao Dong Yiqin Wang Yongming Rao Yansong Tang Wei-Chiu Ma and Ranjay Krishna. 2024. Coarse correspondence elicit 3D spacetime understanding in multimodal language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.00754 (2024)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00960"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Minghua Liu Ruoxi Shi Kaiming Kuang Yinhao Zhu Xuanlin Li Shizhong Han Hong Cai Fatih Porikli and Hao Su. 2023. Openshape: Scaling up 3D shape representation towards open-world understanding. Advances in neural information processing systems 36 (2023) 44860\u201344879.","DOI":"10.52202\/075280-1944"},{"key":"e_1_3_3_1_35_2","unstructured":"Yuan Liu Cheng Lin Zijiao Zeng Xiaoxiao Long Lingjie Liu Taku Komura and Wenping Wang. 2023. Syncdreamer: Generating multiview-consistent images from a single-view image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.03453 (2023)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555392"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00013"},{"key":"e_1_3_3_1_38_2","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Theo Moutakanni Huy\u00a0V. Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby Russell Howes Po-Yao Huang Hu Xu Vasu Sharma Shang-Wen Li Wojciech Galuba Mike Rabbat Mido Assran Nicolas Ballas Gabriel Synnaeve Ishan Misra Herve Jegou Julien Mairal Patrick Labatut Armand Joulin and Piotr Bojanowski. 2023. DINOv2: Learning Robust Visual Features without Supervision."},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00025"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Tessa Pulli Jean-Baptiste Weibel Peter H\u00f6nig Matthias Hirschmanner Markus Vincze and Andreas Holzinger. 2026. OSCAR: Open-Set CAD Retrieval from a Language Prompt and a Single Image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2601.07333 (2026).","DOI":"10.2139\/ssrn.5877052"},{"key":"e_1_3_3_1_41_2","unstructured":"Charles\u00a0Ruizhongtai Qi Li Yi Hao Su and Leonidas\u00a0J Guibas. 2017. Pointnet++: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00037"},{"key":"e_1_3_3_1_43_2","unstructured":"Tianhe Ren Yihao Chen Qing Jiang Zhaoyang Zeng Yuda Xiong Wenlong Liu Zhengyu Ma Junyi Shen Yuan Gao Xiaoke Jiang Xingyu Chen Zhuheng Song Yuhong Zhang Hongjie Huang Han Gao Shilong Liu Hao Zhang Feng Li Kent Yu and Lei Zhang. 2024. DINO-X: A Unified Vision Model for Open-World Object Detection and Understanding. arxiv:https:\/\/arXiv.org\/abs\/2411.14347\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2411.14347"},{"key":"e_1_3_3_1_44_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1409.1556 (2014)."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.114"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Yiding Sun Haozhe Cheng Chaoyi Lu Zhengqiao Li Minghong Wu Huimin Lu and Jihua Zhu. 2025. HyperPoint: Multimodal 3D foundation model in hyperbolic space. Pattern Recognition (2025) 112800.","DOI":"10.1016\/j.patcog.2025.112800"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Zhenggang Tang Yuchen Fan Dilin Wang Hongyu Xu Rakesh Ranjan Alexander Schwing and Zhicheng Yan. 2024. MV-DUSt3R+: Single-Stage Scene Reconstruction from Sparse Views In 2 Seconds. (2024).","DOI":"10.1109\/CVPR52734.2025.00498"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00402"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00077"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Yue Wang Yongbin Sun Ziwei Liu Sanjay\u00a0E Sarma Michael\u00a0M Bronstein and Justin\u00a0M Solomon. 2019. Dynamic graph cnn for learning on point clouds. ACM Transactions on Graphics (tog) 38 5 (2019) 1\u201312.","DOI":"10.1145\/3326362"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618165"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","unstructured":"Chongjie Ye Lingteng Qiu Xiaodong Gu Qi Zuo Yushuang Wu Zilong Dong Liefeng Bo Yuliang Xiu and Xiaoguang Han. 2024. Stablenormal: Reducing diffusion variance for stable and sharp normal. ACM Transactions on Graphics (TOG) 43 6 (2024) 1\u201318.","DOI":"10.1145\/3687971"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"crossref","unstructured":"Duo Zheng Shijia Huang and Liwei Wang. 2024. Video-3D LLM: Learning Position-Aware Video Representation for 3D Scene Understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.00493 (2024).","DOI":"10.1109\/CVPR52734.2025.00841"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_35"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:19:07Z","timestamp":1781536747000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810718"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":54,"alternative-id":["10.1145\/3805622.3810718","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810718","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}