{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:48Z","timestamp":1781538888413,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810583","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1583-1587","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["UniDGF: A Unified Detection-to-Generation Framework for Hierarchical Object Visual Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-8245-2607","authenticated-orcid":false,"given":"Xinyu","family":"Nan","sequence":"first","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2804-5872","authenticated-orcid":false,"given":"Lingtao","family":"Mao","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3844-8359","authenticated-orcid":false,"given":"Huangyu","family":"Dai","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8043-5842","authenticated-orcid":false,"given":"Zexin","family":"Zheng","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5934-2652","authenticated-orcid":false,"given":"Xinyu","family":"Sun","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2373-1439","authenticated-orcid":false,"given":"Zihan","family":"Liang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4495-8686","authenticated-orcid":false,"given":"Ben","family":"Chen","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6287-3673","authenticated-orcid":false,"given":"Chenyi","family":"Lei","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et\u00a0al. 2025. Qwen2. 5-vl technical report."},{"key":"e_1_3_3_1_3_2","series-title":"(ICML\u201923)","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Biderman Stella","year":"2023","unstructured":"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O\u2019Brien, Eric Hallahan, Mohammad\u00a0Aflah Khan, Shivanshu Purohit, USVSN\u00a0Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, and Oskar Van Der\u00a0Wal. 2023. Pythia: a suite for analyzing large language models across training and scaling. In Proceedings of the 40th International Conference on Machine Learning(ICML\u201923). JMLR.org, Honolulu, Hawaii, USA, Article 102, 34\u00a0pages."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00644"},{"key":"e_1_3_3_1_5_2","unstructured":"Ben Chen Xian Guo Siyuan Wang Zihan Liang Yue Lv Yufei Ma Xinlong Xiao Bowen Xue Xuxin Zhang Ying Yang Huangyu Dai Xing Xu Tong Zhao Mingcan Peng Xiaoyang Zheng Chao Wang Qihang Zhao Zhixin Zhai Yang Zhao Bochao Liu Jingshan Lv Xiao Liang Yuqing Ding Jing Chen Chenyi Lei Wenwu Ou Han Li and Kun Gai. 2025. OneSearch: A Preliminary Exploration of the Unified End-to-End Generative Framework for E-commerce Search. arxiv:https:\/\/arXiv.org\/abs\/2509.03236\u00a0[cs.IR] https:\/\/arxiv.org\/abs\/2509.03236"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Keyan Chen Xiaolong Jiang Yao Hu Xu Tang Yan Gao Jianqi Chen and Weidi Xie. 2023. OvarNet: Towards Open-vocabulary Object Attribute Recognition.","DOI":"10.1109\/CVPR52729.2023.02252"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Tianheng Cheng Lin Song Yixiao Ge Wenyu Liu Xinggang Wang and Ying Shan. 2024. YOLO-World: Real-Time Open-Vocabulary Object Detection.","DOI":"10.1109\/CVPR52733.2024.01599"},{"key":"e_1_3_3_1_8_2","unstructured":"Kaiming He Georgia Gkioxari Piotr Doll\u00e1r and Ross Girshick. 2017. Mask r-cnn. 2961\u20132969\u00a0pages."},{"key":"e_1_3_3_1_9_2","unstructured":"Rahima Khanam and Muhammad Hussain. 2024. Yolov11: An overview of the key architectural enhancements."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","unstructured":"Joongwon Lee Wonho Zhung Jisu Seo and Woo\u00a0Youn Kim. 2026. BInD: Bond and Interaction-generating Diffusion Model for Multi-objective Structure-based Drug Design. arxiv:https:\/\/arXiv.org\/abs\/2405.16861\u00a0[q-bio.BM] 10.1002\/advs.202502702","DOI":"10.1002\/advs.202502702"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Mike Lewis Yinhan Liu Naman Goyal Marjan Ghazvininejad Abdelrahman Mohamed Omer Levy Ves Stoyanov and Luke Zettlemoyer. 2019. BART: Denoising sequence-to-sequence pre-training for natural language generation translation and comprehension.","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C\u00a0Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. 740\u2013755\u00a0pages.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Qing Jiang Chunyuan Li Jianwei Yang Hang Su et\u00a0al. 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. 38\u201355\u00a0pages.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Genevieve Patterson and James Hays. 2016. Coco attributes: Attributes for people animals and objects. 85\u2013100\u00a0pages.","DOI":"10.1007\/978-3-319-46466-4_6"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Khoi Pham Kushal Kafle Zhe Lin Zhihong Ding Scott Cohen Quan Tran and Abhinav Shrivastava. 2021. Learning to predict visual attributes in the wild. 13018\u201313028\u00a0pages.","DOI":"10.1109\/CVPR46437.2021.01282"},{"key":"e_1_3_3_1_16_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark et\u00a0al. 2021. Learning transferable visual models from natural language supervision. 8748\u20138763\u00a0pages."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Shuai Shao Zeming Li Tianyuan Zhang Chao Peng Gang Yu Xiangyu Zhang Jing Li and Jian Sun. 2019. Objects365: A large-scale high-quality dataset for object detection. 8430\u20138439\u00a0pages.","DOI":"10.1109\/ICCV.2019.00852"},{"key":"e_1_3_3_1_18_2","unstructured":"Oriane Sim\u00e9oni Huy\u00a0V. Vo Maximilian Seitzer Federico Baldassarre Maxime Oquab Cijo Jose Vasil Khalidov Marc Szafraniec Seungeun Yi Micha\u00ebl Ramamonjisoa Francisco Massa Daniel Haziza Luca Wehrstedt Jianyuan Wang Timoth\u00e9e Darcet Th\u00e9o Moutakanni Leonel Sentana Claire Roberts Andrea Vedaldi Jamie Tolan John Brandt Camille Couprie Julien Mairal Herv\u00e9 J\u00e9gou Patrick Labatut and Piotr Bojanowski. 2025. DINOv3. arxiv:https:\/\/arXiv.org\/abs\/2508.10104\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2508.10104"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Yuan Sun and Mingbo Zhao. 2024. Clothes Image Retrieval via Learnable FashionCLIP. 290\u2013301\u00a0pages.","DOI":"10.1007\/978-981-97-7001-4_21"},{"key":"e_1_3_3_1_20_2","unstructured":"Yi Tay Vinh\u00a0Q. Tran Mostafa Dehghani Jianmo Ni Dara Bahri Harsh Mehta Zhen Qin Kai Hui Zhe Zhao and Jai Gupta. 2022. Transformer Memory as a Differentiable Search Index."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Lewei Yao Jianhua Han Xiaodan Liang Dan Xu Wei Zhang Zhenguo Li and Hang Xu. 2023. Detclipv2: Scalable open-vocabulary object detection pre-training via word-region alignment. 23497\u201323506\u00a0pages.","DOI":"10.1109\/CVPR52729.2023.02250"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Lewei Yao Renjie Pi Jianhua Han Xiaodan Liang Hang Xu Wei Zhang Zhenguo Li and Dan Xu. 2024. Detclipv3: Towards versatile generative open-vocabulary object detection. 27391\u201327401\u00a0pages.","DOI":"10.1109\/CVPR52733.2024.02586"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Yu Yun Sen Wang Mingzhen Hou and Quanxue Gao. 2022. Attributes learning network for generalized zero-shot learning. Neural Networks 150 (2022) 112\u2013118.","DOI":"10.1016\/j.neunet.2022.02.018"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Daoze Zhang Zhanheng Nie Jianyu Liu Chenghan Fu Wanxian Guan Yuan Gao Jun Song Pengjie Wang Jian Xu and Bo Zheng. 2025. MOON: Generative MLLM-based Multimodal Representation Learning for E-commerce Product Understanding.","DOI":"10.1145\/3773966.3777958"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Qiming Zhang Jing Zhang Yufei Xu and Dacheng Tao. 2024. Vision transformer with quadrangle attention. IEEE Transactions on Pattern Analysis and Machine Intelligence 46 5 (2024) 3608\u20133624.","DOI":"10.1109\/TPAMI.2023.3347693"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:04:53Z","timestamp":1781535893000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810583"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":24,"alternative-id":["10.1145\/3805622.3810583","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810583","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}