{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:58:00Z","timestamp":1781539080882,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":101,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62576195"],"award-info":[{"award-number":["62576195"]}]},{"name":"National Natural Science Foundation of China","award":["62276155"],"award-info":[{"award-number":["62276155"]}]},{"name":"Key R&D Program of Shandong Province (Major scientific and technological innovation projects)","award":["2025CXGC020101"],"award-info":[{"award-number":["2025CXGC020101"]}]},{"name":"China National University Student Innovation \\\\& Entrepreneurship Development Program","award":["2025283"],"award-info":[{"award-number":["2025283"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810601","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"288-297","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["IMAGINE: Adaptive Schema-Imagery Enhanced Composition for Composed Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-9435-4127","authenticated-orcid":false,"given":"Jiale","family":"Huang","sequence":"first","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5136-159X","authenticated-orcid":false,"given":"Zixu","family":"Li","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0365-8553","authenticated-orcid":false,"given":"Zhiwei","family":"Chen","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7724-5662","authenticated-orcid":false,"given":"Zhiheng","family":"Fu","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6617-0924","authenticated-orcid":false,"given":"Chunxiao","family":"Wang","sequence":"additional","affiliation":[{"name":"Qilu University of Technology (Shandong Academy of Sciences), Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5653-8286","authenticated-orcid":false,"given":"Yupeng","family":"Hu","sequence":"additional","affiliation":[{"name":"Shandong Unversity, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Yuan Sun Zhenwen Ren Peng Hu Dezhong Peng and Xu Wang. 2023. Hierarchical consensus hashing for cross-modal retrieval. IEEE TMM 26 (2023) 824\u2013836.","DOI":"10.1109\/TMM.2023.3272169"},{"key":"e_1_3_3_1_3_2","unstructured":"Mengzhu Xu Hanzhi Liu Ningkang Peng Qianyu Chen and Canran Xiao. 2025. Affordance-First Decomposition for Continual Learning in Video-Language Understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2512.00694 (2025)."},{"key":"e_1_3_3_1_4_2","volume-title":"ICML","author":"Pu Ruitao","year":"2025","unstructured":"Ruitao Pu, Yang Qin, Xiaomin Song, Dezhong Peng, Zhenwen Ren, and Yuan Sun. 2025. She: Streaming-media hashing retrieval. In ICML."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Yupeng Hu Meng Liu Xiaobin Su Zan Gao and Liqiang Nie. 2021. Video moment localization via deep cross-modal hashing. IEEE TIP 30 (2021) 4667\u20134677.","DOI":"10.1109\/TIP.2021.3073867"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Yuan Sun Yang Qin Dezhong Peng Zhenwen Ren Chao Yang and Peng Hu. 2024. Dual self-paced hashing for image retrieval. IEEE TMM 26 (2024) 9619\u20139629.","DOI":"10.1109\/TMM.2024.3395969"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i19.34199"},{"key":"e_1_3_3_1_8_2","unstructured":"Yujia Wang Yuyan Li Jiuming Liu Fang-Lue Zhang Xinhu Zheng Neil Dodgson et\u00a0al. 2026. RL-ScanIQA: Reinforcement-Learned Scanpaths for Blind 360 {\\ deg} Image Quality Assessment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2603.14297 (2026)."},{"key":"e_1_3_3_1_9_2","first-page":"4569","volume-title":"IJCAI","author":"Liu Kaiming","year":"2024","unstructured":"Kaiming Liu, Yunhong Gong, Yu Cao, Zhenwen Ren, Dezhong Peng, and Yuan Sun. 2024. Dual Semantic Fusion Hashing for Multi-Label Cross-Modal Retrieval.. In IJCAI. 4569\u20134577."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Yupeng Hu Liqiang Nie Meng Liu Kun Wang Yinglong Wang and Xian-Sheng Hua. 2021. Coarse-to-fine semantic alignment for cross-modal moment localization. IEEE TIP 30 (2021) 5933\u20135943.","DOI":"10.1109\/TIP.2021.3090521"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3755366"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01828"},{"key":"e_1_3_3_1_14_2","unstructured":"Zhenlong Yuan Jing Tang Jinguo Luo Rui Chen Chengxuan Qian Lei Sun Xiangxiang Chu Yujun Cai Dapeng Zhang and Shuo Li. 2025. AutoDrive-R2: Incentivizing Reasoning and Self-Reflection Capacity for VLA Model in Autonomous Driving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.01944 (2025)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.739"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00682"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Qianyun Yang Zhiwei Chen Yupeng Hu Zixu Li Zhiheng Fu and Liqiang Nie. 2026. STABLE: Efficient Hybrid Nearest Neighbor Search via Magnitude-Uniformity and Cardinality-Robustness. IEEE TKDE (2026).","DOI":"10.1109\/TKDE.2026.3676465"},{"key":"e_1_3_3_1_18_2","volume-title":"ICLR","author":"Xiao Canran","year":"2026","unstructured":"Canran Xiao, Tianxiang Xu, Siyuan Ma, Yiyang Jiang, Haoyu Gao, and Yuhan Wu. 2026. Reversible primitive\u2013composition alignment for continual vision\u2013language learning. In ICLR."},{"key":"e_1_3_3_1_19_2","first-page":"8950","volume-title":"ACL Findings","author":"Zhang Yunyao","year":"2025","unstructured":"Yunyao Zhang, Zikai Song, Hang Zhou, Wenfeng Ren, Yi-Ping\u00a0Phoebe Chen, Junqing Yu, and Wei Yang. 2025. GA \u2212 S3: Comprehensive Social Network Simulation with Group Agents. In ACL Findings. 8950\u20138970."},{"key":"e_1_3_3_1_20_2","unstructured":"Keyang Zhong Junlin Xie Hefeng Wu Haofeng Li and Guanbin Li. 2026. Collaborative Multi-Agent Scripts Generation for Enhancing Imperfect-Information Reasoning in Murder Mystery Games. arxiv:https:\/\/arXiv.org\/abs\/2604.11741\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2604.11741"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","unstructured":"Qianyun Yang Peizhuo Lv Yingjiu Li Shengzhi Zhang Yuxuan Chen Zhiwei Chen Zixu Li and Yupeng Hu. 2026. ERASE: Bypassing Collaborative Detection of AI Counterfeit Via Comprehensive Artifacts Elimination. IEEE TDSC (March 2026) 1\u201318. 10.1109\/TDSC.2026.3677794","DOI":"10.1109\/TDSC.2026.3677794"},{"key":"e_1_3_3_1_22_2","unstructured":"Mingzhu Xu Chenglong Yu Zexuan Li Haoyu Tang Yupeng Hu and Liqiang Nie. 2025. Hdnet: A hybrid domain network with multi-scale high-frequency information enhancement for infrared small target detection. IEEE TGRS (2025)."},{"key":"e_1_3_3_1_23_2","unstructured":"Jinhe Bi Yifan Wang Danqi Yan Aniri Wenke Huang Zengjie Jin Xiaowen Ma Artur Hecker Mang Ye Xun Xiao Hinrich Schuetze Volker Tresp and Yunpu Ma. 2025. PRISM: Self-Pruning Intrinsic Selection Method for Training-Free Multimodal Data Selection. arxiv:https:\/\/arXiv.org\/abs\/2502.12119https:\/\/arxiv.org\/abs\/2502.12119"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i38.40455"},{"key":"e_1_3_3_1_25_2","unstructured":"Zhenlong Yuan Xiangyan Qu Chengxuan Qian Rui Chen Jing Tang Lei Sun Xiangxiang Chu Dapeng Zhang Yiwei Wang Yujun Cai et\u00a0al. 2025. Video-STAR: Reinforcing Open-Vocabulary Action Recognition with Tools. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.08480 (2025)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681104"},{"key":"e_1_3_3_1_27_2","volume-title":"CVPR","author":"Fu Zhiheng","year":"2026","unstructured":"Zhiheng Fu, Yupeng Hu, Qianyun Yang, Shiqi Zhang, Zhiwei Chen, and Zixu Li. 2026. Air-Know: Arbiter-Calibrated Knowledge-Internalizing Robust Network for Composed Image Retrieval. In CVPR."},{"key":"e_1_3_3_1_28_2","unstructured":"Yichen Wu Xu Liu Chenxuan Zhao and Xinyu Wu. 2025. Prompt-Guided Dual Latent Steering for Inversion Problems. arxiv:https:\/\/arXiv.org\/abs\/2509.18619\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2509.18619"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Albert Gordo Jon Almazan Jerome Revaud and Diane Larlus. 2017. End-to-end learning of deep visual representations for image retrieval. IJCV 124 2 (2017) 237\u2013254.","DOI":"10.1007\/s11263-017-1016-8"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00859"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00262"},{"key":"e_1_3_3_1_32_2","volume-title":"CVPR","author":"Li Zixu","year":"2026","unstructured":"Zixu Li, Yupeng Hu, Zhiwei Chen, Mingyu Zhang, Zhiheng Fu, and Liqiang Nie. 2026. ConeSep: Cone-based Robust Noise-Unlearning Compositional Network for Composed Image Retrieval. In CVPR."},{"key":"e_1_3_3_1_33_2","unstructured":"Xinglang Zhang Yunyao Zhang ZeLiang Chen Junqing Yu Wei Yang and Zikai Song. 2026. Logical Phase Transitions: Understanding Collapse in LLM Logical Reasoning. arxiv:https:\/\/arXiv.org\/abs\/2601.02902\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2601.02902"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611864"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Yupeng Hu Kun Wang Meng Liu Haoyu Tang and Liqiang Nie. 2023. Semantic collaborative learning for cross-modal moment localization. ACM TOIS 42 2 (2023) 1\u201326.","DOI":"10.1145\/3620669"},{"key":"e_1_3_3_1_36_2","unstructured":"Zhenlong Yuan Xiangyan Qu Jing Tang Rui Chen Lei Sun Ruidong Chen Hongwei Yu Chengxuan Qian Xiangxiang Chu Shuo Li et\u00a0al. 2026. What if Agents Could Imagine? Reinforcing Open-Vocabulary HOI Comprehension through Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2602.11499 (2026)."},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28334"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02540"},{"key":"e_1_3_3_1_39_2","unstructured":"Yupeng Hu Zixu Li Zhiwei Chen Qinlei Huang Zhiheng Fu Mingzhu Xu and Liqiang Nie. 2026. REFINE: Composed Video Retrieval via Shared and Differential Semantics Enhancement. ACM ToMM (2026)."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Lucas Ventura Antoine Yang Cordelia Schmid and G\u00fcl Varol. 2024. CoVR-2: Automatic Data Construction for Composed Video Retrieval. IEEE TPAMI (2024).","DOI":"10.1109\/TPAMI.2024.3463799"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i28.39507"},{"key":"e_1_3_3_1_42_2","volume-title":"ICLR","author":"Yue WU","year":"2025","unstructured":"WU Yue, Zhaobo Qi, Yiling Wu, Junshu Sun, Yaowei Wang, and Shuhui Wang. 2025. Learning Fine-Grained Representations through Textual Token Disentanglement in Composed Video Retrieval. In ICLR."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3755445"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Haokun Wen Xuemeng Song Jianhua Yin Jianlong Wu Weili Guan and Liqiang Nie. 2024. Self-Training Boosted Multi-Factor Matching Network for Composed Image Retrieval. IEEE TPAMI 46 5 (2024) 3665\u20133678.","DOI":"10.1109\/TPAMI.2023.3346434"},{"key":"e_1_3_3_1_45_2","unstructured":"Shilin Lu Zhuming Lian Zihan Zhou Shaocong Zhang Chen Zhao and Adams Wai-Kin Kong. 2025. Does FLUX Already Know How to Perform Physically Plausible Image Composition?arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.21278 (2025)."},{"key":"e_1_3_3_1_46_2","unstructured":"Yangliu Hu Zikai Song Na Feng Yawei Luo Junqing Yu Yi-Ping\u00a0Phoebe Chen and Wei Yang. 2025. SF2T: Self-supervised Fragment Finetuning of Video-LLMs for Fine-Grained Understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.07745 (2025)."},{"key":"e_1_3_3_1_47_2","unstructured":"Zixu Li Zhiheng Fu Yupeng Hu Zhiwei Chen Haokun Wen and Liqiang Nie. 2025. FineCIR: Explicit Parsing of Fine-Grained Modification Semantics for Composed Image Retrieval. https:\/\/arxiv.org\/abs\/2503.21309 (2025)."},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413917"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475483"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Gangjian Zhang Shikui Wei Huaxin Pang Shuang Qiu and Yao Zhao. 2022. Composed Image Retrieval via Explicit Erasure and Replenishment With Semantic Alignment. IEEE TIP 31 (2022) 5976\u20135988.","DOI":"10.1109\/TIP.2022.3204213"},{"key":"e_1_3_3_1_51_2","unstructured":"Xinlei Yu Chengming Xu Guibin Zhang Zhangquan Chen Yudong Zhang Yongbo He Peng-Tao Jiang Jiangning Zhang Xiaobin Hu and Shuicheng Yan. 2025. Vismem: Latent vision memory unlocks potential of vision-language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.11007 (2025)."},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532047"},{"key":"e_1_3_3_1_53_2","unstructured":"Hongguang Zhu Yunchao Wei Yao Zhao Chunjie Zhang and Shujuan Huang. 2023. AMC: Adaptive Multi-Expert Collaborative Network for Text-Guided Image Retrieval. ACM ToMM (2023)."},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00615"},{"key":"e_1_3_3_1_55_2","unstructured":"Zihan Zhou Shilin Lu Shuli Leng Shaocong Zhang Zhuming Lian Xinlei Yu and Adams Wai-Kin Kong. 2025. DragFlow: Unleashing DiT Priors with Region Based Supervision for Drag Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.02253 (2025)."},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00218"},{"key":"e_1_3_3_1_57_2","unstructured":"Shiming Chen Dingjie Fu Salman Khan and Fahad\u00a0Shahbaz Khan. 2025. GenZSL: Generative Zero-Shot Learning Via Inductive Variational Autoencoder. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.11882 (2025)."},{"key":"e_1_3_3_1_58_2","volume-title":"ICLR","author":"Chou Yu-Ying","year":"2020","unstructured":"Yu-Ying Chou, Hsuan-Tien Lin, and Tyng-Luh Liu. 2020. Adaptive and generative zero-shot learning. In ICLR."},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"crossref","unstructured":"Wenbing Li Hang Zhou Junqing Yu Zikai Song and Wei Yang. 2024. Coupled mamba: Enhanced multimodal fusion with coupled state space model. NeurIPS 37 (2024) 59808\u201359832.","DOI":"10.52202\/079017-1910"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00588"},{"key":"e_1_3_3_1_61_2","unstructured":"Yujun Wang Jinhe Bi Yunpu Ma and Soeren Pirk. 2025. ASCD: Attention-Steerable Contrastive Decoding for Reducing Hallucination in MLLM. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.14766 (2025)."},{"key":"e_1_3_3_1_62_2","unstructured":"Mohamed Elhoseiny Kai Yi and Mohamed Elfeki. 2021. Cizsl++: Creativity inspired generative zero-shot learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2101.00173 (2021)."},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210096"},{"key":"e_1_3_3_1_64_2","unstructured":"Yuqi Li Siwei Meng Chuanguang Yang Weilun Feng Junming Liu Zhulin An Yikai Wang and Yingli Tian. 2026. A Comprehensive Survey of Interaction Techniques in 3D Scene Generation. Authorea Preprints (2026)."},{"key":"e_1_3_3_1_65_2","unstructured":"Xu Liu Yibo Lu Xinxian Wang and Xinyu Wu. 2025. Training-Free Multi-Style Fusion Through Reference-Based Adaptive Modulation. arxiv:https:\/\/arXiv.org\/abs\/2509.18602\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2509.18602"},{"key":"e_1_3_3_1_66_2","unstructured":"Yunyao Zhang Xinglang Zhang Junxi Sheng Wenbing Li Junqing Yu Yi-Ping\u00a0Phoebe Chen Wei Yang and Zikai Song. 2026. Semantic-Aware Logical Reasoning via a Semiotic Framework. arxiv:https:\/\/arXiv.org\/abs\/2509.24765\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2509.24765"},{"key":"e_1_3_3_1_67_2","volume-title":"CVPR","author":"He Changhao","year":"2026","unstructured":"Changhao He, Di Xue, Shuxian Li, Yanji Hao, Xi Peng, and Peng Hu. 2026. Bootstrapping Multi-view Learning for Test-time Noisy Correspondence. In CVPR."},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657740"},{"key":"e_1_3_3_1_69_2","doi-asserted-by":"crossref","unstructured":"Yue Yang Wenlin Yao Hongming Zhang Xiaoyang Wang Dong Yu and Jianshu Chen. 2022. Z-LaVI: Zero-shot language solver fueled by visual imagination. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.12261 (2022).","DOI":"10.18653\/v1\/2022.emnlp-main.78"},{"key":"e_1_3_3_1_70_2","doi-asserted-by":"crossref","unstructured":"Mingyu Zhang Zixu Li Zhiwei Chen Zhiheng Fu Xiaowei Zhu Jiajia Nie Yinwei Wei and Yupeng Hu. 2026. Hint: Composed image retrieval with dual-path compositional contextualized network. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2603.26341 (2026).","DOI":"10.1109\/ICASSP55912.2026.11461012"},{"key":"e_1_3_3_1_71_2","doi-asserted-by":"crossref","unstructured":"Hyuntae Park Yeachan Kim Jun-Hyung Park and SangKeun Lee. 2024. Zero-shot commonsense reasoning over machine imagination. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.09329 (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.669"},{"key":"e_1_3_3_1_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01590"},{"key":"e_1_3_3_1_73_2","unstructured":"Geonmo Gu Sanghyuk Chun Wonjae Kim HeeJae Jun Yoohoon Kang and Sangdoo Yun. 2023. Compodiff: Versatile composed image retrieval with latent diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.11916 (2023)."},{"key":"e_1_3_3_1_74_2","unstructured":"Yuqi Li Zijie Zhou Zhiyuan Peng Junhao Dong Haochen You Renye Yan Shiping Wen Yingli Tian and Tingwen Huang. 2025. A preference-driven methodology for efficient code generation. IEEE TAI (2025)."},{"key":"e_1_3_3_1_75_2","first-page":"6743","volume-title":"ACM WWW","author":"Xiao Canran","year":"2026","unstructured":"Canran Xiao and Liwei Hou. 2026. Prototype-Aligned Federated Soft-Prompts for Continual Web Personalization. In ACM WWW. 6743\u20136754."},{"key":"e_1_3_3_1_76_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i8.37608"},{"key":"e_1_3_3_1_77_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25327"},{"key":"e_1_3_3_1_78_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681331"},{"key":"e_1_3_3_1_79_2","unstructured":"Zehao Li Hongwei Yu Hao Jiang Qiang Sheng Yilong Xu Baolong Bi Yang Li Zhenlong Yuan Yujun Cai and Zhaoqi Wang. 2026. FactGuard: Agentic Video Misinformation Detection via Reinforcement Learning. arxiv:https:\/\/arXiv.org\/abs\/2602.22963\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2602.22963"},{"key":"e_1_3_3_1_80_2","unstructured":"Zhongyu Yang Wei Pang and Yingfang Yuan. 2026. XR: Cross-Modal Agents for Composed Image Retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2601.14245 (2026)."},{"key":"e_1_3_3_1_81_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02308"},{"key":"e_1_3_3_1_82_2","volume-title":"NeurIPS","author":"Liu Delong","year":"2025","unstructured":"Delong Liu, Haiwen Li, Zhaohui Hou, Zhicheng Zhao, Fei Su, and Yuan Dong. 2025. Automatic synthetic data and fine-grained adaptive feature alignment for composed person retrieval. In NeurIPS."},{"key":"e_1_3_3_1_83_2","unstructured":"Yanjia Huang Mingyang Wu Renjie Li and Zhengzhong Tu. 2025. VISTA: Generative Visual Imagination for Vision-and-Language Navigation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.07868 (2025)."},{"key":"e_1_3_3_1_84_2","unstructured":"Zikai Song Junqing Yu Yi-Ping\u00a0Phoebe Chen Wei Yang and Xinchao Wang. 2026. Hypergraph-State Collaborative Reasoning for Multi-Object Tracking. arxiv:https:\/\/arXiv.org\/abs\/2604.12665\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2604.12665"},{"key":"e_1_3_3_1_85_2","doi-asserted-by":"publisher","unstructured":"Yuqi Li Hansheng Zeng Fuyan Zhang Chuanguang Yang Yanli Li and Weiping Ding. 2025. Efficient Medical Image Segmentation via Reinforcement Learning-Driven K-Space Sampling. IEEE TETCI (2025). 10.1109\/TETCI.2025.3621221","DOI":"10.1109\/TETCI.2025.3621221"},{"key":"e_1_3_3_1_86_2","unstructured":"Jinhe Bi Danqi Yan Yifan Wang Wenke Huang Haokun Chen Guancheng Wan Mang Ye Xun Xiao Hinrich Schuetze Volker Tresp et\u00a0al. 2025. CoT-Kinetics: A Theoretical Modeling Assessing LRM Reasoning Process. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.13408 (2025)."},{"key":"e_1_3_3_1_87_2","unstructured":"Xinlei Yu Chengming Xu Zhangquan Chen Yudong Zhang Shilin Lu Cheng Yang Jiangning Zhang Shuicheng Yan and Xiaobin Hu. 2025. Visual Document Understanding and Reasoning: A Multi-Agent Collaboration Framework with Agent-Wise Adaptive Test-Time Scaling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.03404 (2025)."},{"key":"e_1_3_3_1_88_2","doi-asserted-by":"crossref","unstructured":"Qi He. 2025. A unified metric architecture for ai infrastructure: A cross-layer taxonomy integrating performance efficiency and cost. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.21772 (2025).","DOI":"10.2139\/ssrn.5808163"},{"key":"e_1_3_3_1_89_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i25.39181"},{"key":"e_1_3_3_1_90_2","unstructured":"Liliang Ye Yunyao Zhang Yafeng Wu Yi-Ping\u00a0Phoebe Chen Junqing Yu Wei Yang and Zikai Song. 2025. MVP: Winning Solution to SMP Challenge 2025 Video Track. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.00950 (2025)."},{"key":"e_1_3_3_1_91_2","first-page":"19730","volume-title":"ICML","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML. PMLR, 19730\u201319742."},{"key":"e_1_3_3_1_92_2","first-page":"8748","volume-title":"ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_93_2","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_1_94_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611817"},{"key":"e_1_3_3_1_95_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681493"},{"key":"e_1_3_3_1_96_2","unstructured":"Yang Bai Xinxing Xu Yong Liu Salman Khan Fahad Khan Wangmeng Zuo Rick Siow\u00a0Mong Goh and Chun-Mei Feng. 2023. Sentence-level Prompts Benefit Composed Image Retrieval. arxiv:https:\/\/arXiv.org\/abs\/2310.05473\u00a0[cs.CV]"},{"key":"e_1_3_3_1_97_2","doi-asserted-by":"crossref","unstructured":"Hongfei Ge Yuanchun Jiang Jianshan Sun Kun Yuan and Yezheng Liu. 2025. LLM-Enhanced Composed Image Retrieval: An Intent Uncertainty-Aware Linguistic-Visual Dual Channel Matching Model. ACM TOIS 43 2 (2025) 1\u201330.","DOI":"10.1145\/3699715"},{"key":"e_1_3_3_1_98_2","unstructured":"Jaehyun Kwak Ramahdani Muhammad\u00a0Izaaz Inhar Se-Young Yun and Sung-Ju Lee. 2025. QuRe: Query-Relevant Retrieval through Hard Negative Sampling in Composed Image Retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.12416 (2025)."},{"key":"e_1_3_3_1_99_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888153"},{"key":"e_1_3_3_1_100_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890642"},{"key":"e_1_3_3_1_101_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32541"},{"key":"e_1_3_3_1_102_2","unstructured":"Guozhi Qiu Zhiwei Chen Zixu Li Qinlei Huang Zhiheng Fu Xuemeng Song and Yupeng Hu. 2026. MELT: Improve Composed Image Retrieval via the Modification Frequentation-Rarity Balance Network. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2603.29291 (2026)."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:51:19Z","timestamp":1781538679000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810601"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":101,"alternative-id":["10.1145\/3805622.3810601","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810601","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}