{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:01:05Z","timestamp":1777654865438,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":95,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658032","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"978-987","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":14,"title":["Enhancing Interactive Image Retrieval With Query Rewriting Using Large Language Models and Vision Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0298-0905","authenticated-orcid":false,"given":"Hongyi","family":"Zhu","sequence":"first","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7943-2591","authenticated-orcid":false,"given":"Jia-Hong","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1904-8736","authenticated-orcid":false,"given":"Stevan","family":"Rudinac","sequence":"additional","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8312-0694","authenticated-orcid":false,"given":"Evangelos","family":"Kanoulas","sequence":"additional","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Learning Attribute Representations with Localization for Flexible Fashion Search. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Ak Kenan E.","year":"2018","unstructured":"Kenan E. Ak, Ashraf Ali Kassim, Joo-Hwee Lim, and Jo Yew Tham. 2018. Learning Attribute Representations with Localization for Flexible Fashion Search. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 7708--7717."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/243199.243274"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/582415.582416"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/322017.322021"},{"key":"e_1_3_2_1_5_1","volume-title":"The IIR evaluation model: a framework for evaluation of interactive information retrieval systems. Inf. Res. 8","author":"Borlund Pia","year":"2003","unstructured":"Pia Borlund. 2003. The IIR evaluation model: a framework for evaluation of interactive information retrieval systems. Inf. Res. 8 (2003)."},{"key":"e_1_3_2_1_6_1","unstructured":"Hyung Won Chung Le Hou S. Longpre Barret Zoph Yi Tay and William Fedus et al. 2022. Scaling Instruction-Finetuned Language Models. ArXiv abs\/2210.11416 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Albert Li, Pascale Fung, and Steven C. H. Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Albert Li, Pascale Fung, and Steven C. H. Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. ArXiv abs\/2305.06500 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Desai Karan","year":"2020","unstructured":"Karan Desai and Justin Johnson. 2020. VirTex: Learning Visual Representations from Textual Annotations. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 11157--11168."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.3166\/dn.17.1.61-84"},{"key":"e_1_3_2_1_10_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In North American","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In North American Chapter of the Association for Computational Linguistics."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10593-2_13"},{"key":"e_1_3_2_1_12_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov and Dirk Weissenborn et al. 2020. An Image isWorth 16x16Words: Transformers for Image Recognition at Scale. ArXiv abs\/2010.11929 (2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Recent Advance in Contentbased Image Retrieval: A Literature Survey. ArXiv abs\/1706.06064","author":"Zhou Wen","year":"2017","unstructured":"Wen gang Zhou, Houqiang Li, and Qi Tian. 2017. Recent Advance in Contentbased Image Retrieval: A Literature Survey. ArXiv abs\/1706.06064 (2017)."},{"key":"e_1_3_2_1_14_1","unstructured":"Xiaoxiao Guo Hui Wu Yu Cheng Steven J. Rennie and Rog\u00e9rio Schmidt Feris. 2018. Dialog-based Interactive Image Retrieval. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_15_1","volume-title":"The Fashion IQ Dataset: Retrieving Images by Combining Side Information and Relative Natural Language Feedback. ArXiv abs\/1905.12794","author":"Guo Xiaoxiao","year":"2019","unstructured":"Xiaoxiao Guo, Hui Wu, Yupeng Gao, Steven J. Rennie, and Rog\u00e9rio Schmidt Feris. 2019. The Fashion IQ Dataset: Retrieving Images by Combining Side Information and Relative Natural Language Feedback. ArXiv abs\/1905.12794 (2019)."},{"key":"e_1_3_2_1_16_1","volume-title":"Automatic Spatially-Aware Fashion Concept Discovery. 2017 IEEE International Conference on Computer Vision (ICCV)","author":"Han Xintong","year":"2017","unstructured":"Xintong Han, Zuxuan Wu, Phoenix X. Huang, Xiao Zhang, Menglong Zhu, Yuan Li, Yang Zhao, and Larry S. Davis. 2017. Automatic Spatially-Aware Fashion Concept Discovery. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 1472--1480."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587784"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00161"},{"key":"e_1_3_2_1_19_1","volume-title":"CIEM: Contrastive Instruction Evaluation Method for Better Instruction Tuning. ArXiv abs\/2309.02301","author":"Hu Hongyu","year":"2023","unstructured":"Hongyu Hu, Jiyuan Zhang, Minyi Zhao, and Zhenbang Sun. 2023. CIEM: Contrastive Instruction Evaluation Method for Better Instruction Tuning. ArXiv abs\/2309.02301 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2890899"},{"key":"e_1_3_2_1_21_1","volume-title":"Robustness Analysis of Visual Question Answering Models by Basic Questions","author":"Huang Jia-Hong","year":"2017","unstructured":"Jia-Hong Huang. 2017. Robustness Analysis of Visual Question Answering Models by Basic Questions. King Abdullah University of Science and Technology, Master Thesis (2017)."},{"key":"e_1_3_2_1_22_1","volume-title":"VQABQ: Visual Question Answering by Basic Questions. VQA ChallengeWorkshop, CVPR","author":"Huang Jia-Hong","year":"2017","unstructured":"Jia-Hong Huang, Modar Alfadly, and Bernard Ghanem. 2017. VQABQ: Visual Question Answering by Basic Questions. VQA ChallengeWorkshop, CVPR (2017)."},{"key":"e_1_3_2_1_23_1","volume-title":"VQA Challenge and Visual Dialog Workshop, CVPR","author":"Huang Jia-Hong","year":"2018","unstructured":"Jia-Hong Huang, Modar Alfadly, and Bernard Ghanem. 2018. Robustness Analysis of Visual QA Models by Basic Questions. VQA Challenge and Visual Dialog Workshop, CVPR (2018)."},{"key":"e_1_3_2_1_24_1","volume-title":"Assessing the robustness of visual question answering. arXiv preprint arXiv:1912.01452","author":"Huang Jia-Hong","year":"2019","unstructured":"Jia-Hong Huang, Modar Alfadly, Bernard Ghanem, and Marcel Worring. 2019. Assessing the robustness of visual question answering. arXiv preprint arXiv:1912.01452 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"Improving Visual Question Answering Models through Robustness Analysis and In-Context Learning with a Chain of Basic Questions. arXiv preprint arXiv:2304.03147","author":"Huang Jia-Hong","year":"2023","unstructured":"Jia-Hong Huang, Modar Alfadly, Bernard Ghanem, and MarcelWorring. 2023. Improving Visual Question Answering Models through Robustness Analysis and In-Context Learning with a Chain of Basic Questions. arXiv preprint arXiv:2304.03147 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2008.916364"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/345508.345538"},{"key":"e_1_3_2_1_28_1","volume-title":"Query Expansion by Prompting Large Language Models. ArXiv abs\/2305.03653","author":"Jagerman Rolf","year":"2023","unstructured":"Rolf Jagerman, Honglei Zhuang, Zhen Qin, Xuanhui Wang, and Michael Bendersky. 2023. Query Expansion by Prompting Large Language Models. ArXiv abs\/2305.03653 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Query Expansion by Prompting Large Language Models. arXiv preprint arXiv:2305.03653","author":"Jagerman Rolf","year":"2023","unstructured":"Rolf Jagerman, Honglei Zhuang, Zhen Qin, Xuanhui Wang, and Michael Bendersky. 2023. Query Expansion by Prompting Large Language Models. arXiv preprint arXiv:2305.03653 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2671188.2749399"},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Machine Learning.","author":"Joachims Thorsten","year":"1997","unstructured":"Thorsten Joachims. 1997. A Probabilistic Analysis of the Rocchio Algorithm with TFIDF for Text Categorization. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-45439-5_33"},{"key":"e_1_3_2_1_34_1","volume-title":"Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa.","author":"Kojima Takeshi","year":"2022","unstructured":"Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. 2022. Large Language Models are Zero-Shot Reasoners. ArXiv abs\/2205.11916 (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248026"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/1991996.1992047"},{"key":"e_1_3_2_1_37_1","volume-title":"Chatting makes perfect: Chat-based image retrieval. Advances in Neural Information Processing Systems 36","author":"Levy Matan","year":"2024","unstructured":"Matan Levy, Rami Ben-Ari, Nir Darshan, and Dani Lischinski. 2024. Chatting makes perfect: Chat-based image retrieval. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1108\/00220410810912451"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/243199.243277"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning.","author":"Li Junnan","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467101"},{"key":"e_1_3_2_1_42_1","volume-title":"Wayne Xin Zhao, and Ji rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji rong Wen. 2023. Evaluating Object Hallucination in Large Vision-Language Models. ArXiv abs\/2305.10355 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Supervision Exists Everywhere: A Data Efficient Contrastive Language-Image Pre-training Paradigm. ArXiv abs\/2110.05208","author":"Li Yangguang","year":"2021","unstructured":"Yangguang Li, Feng Liang, Lichen Zhao, Yufeng Cui, Wanli Ouyang, Jing Shao, Fengwei Yu, and Junjie Yan. 2021. Supervision Exists Everywhere: A Data Efficient Contrastive Language-Image Pre-training Paradigm. ArXiv abs\/2110.05208 (2021)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299135"},{"key":"e_1_3_2_1_45_1","volume-title":"Microsoft COCO: Common Objects in Context. In European Conference on Computer Vision.","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2942142"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01524"},{"key":"e_1_3_2_1_48_1","unstructured":"Jiasen Lu Dhruv Batra Devi Parikh and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_49_1","volume-title":"VisualSparta: an embarrassingly simple approach to large-scale text-to-image search with weighted bag-of-words. arXiv preprint arXiv:2101.00265","author":"Lu Xiaopeng","year":"2021","unstructured":"Xiaopeng Lu, Tiancheng Zhao, and Kyusong Lee. 2021. VisualSparta: an embarrassingly simple approach to large-scale text-to-image search with weighted bag-of-words. arXiv preprint arXiv:2101.00265 (2021)."},{"key":"e_1_3_2_1_50_1","volume-title":"Generative Relevance Feedback with Large Language Models. arXiv preprint arXiv:2304.13157","author":"Mackie Iain","year":"2023","unstructured":"Iain Mackie, Shubham Chatterjee, and Jeffrey Dalton. 2023. Generative Relevance Feedback with Large Language Models. arXiv preprint arXiv:2304.13157 (2023)."},{"key":"e_1_3_2_1_51_1","volume-title":"Large Language Models Know Your Contextual Search Intent: A Prompting Framework for Conversational Search. arXiv preprint arXiv:2303.06573","author":"Mao Kelong","year":"2023","unstructured":"Kelong Mao, Zhicheng Dou, Haonan Chen, Fengran Mo, and Hongjin Qian. 2023. Large Language Models Know Your Contextual Search Intent: A Prompting Framework for Conversational Search. arXiv preprint arXiv:2303.06573 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.256"},{"key":"e_1_3_2_1_53_1","volume-title":"Distributed representations of words and phrases and their compositionality. arXiv preprint arXiv:1310.4546","author":"Mikolov Tomas","year":"2013","unstructured":"Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. Distributed representations of words and phrases and their compositionality. arXiv preprint arXiv:1310.4546 (2013)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3390668"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.374"},{"key":"e_1_3_2_1_56_1","volume-title":"Extending Faceted Navigation for RDF Data. In International Workshop on the Semantic Web.","author":"Oren Eyal","year":"2006","unstructured":"Eyal Oren, Renaud Delbru, and Stefan Decker. 2006. Extending Faceted Navigation for RDF Data. In International Workshop on the Semantic Web."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126281"},{"key":"e_1_3_2_1_58_1","volume-title":"Deep Face Recognition. In British Machine Vision Conference.","author":"Parkhi Omkar M.","year":"2015","unstructured":"Omkar M. Parkhi, Andrea Vedaldi, and Andrew Zisserman. 2015. Deep Face Recognition. In British Machine Vision Conference."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0965-7"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2846566"},{"key":"e_1_3_2_1_61_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_62_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. 2019. Language models are unsupervised multitask learners. OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Francesco Ricci Lior Rokach and Bracha Shapira. 2011. Introduction to Recommender Systems Handbook. In Recommender Systems Handbook.","DOI":"10.1007\/978-0-387-85820-3"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1108\/eb026866"},{"key":"e_1_3_2_1_65_1","unstructured":"J. J. Rocchio. 1971. Relevance feedback in information retrieval."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13735-012-0018-0"},{"key":"e_1_3_2_1_67_1","volume-title":"MULTIMEDIA '99","author":"Rui Yong","unstructured":"Yong Rui and Thomas S. Huang. 1999. A novel relevance feedback technique in image retrieval. In MULTIMEDIA '99."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.1997.638621"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539813.3545138"},{"key":"e_1_3_2_1_70_1","volume-title":"Learning Visual Representations with Caption Annotations. In European Conference on Computer Vision.","author":"Sariyildiz Mert Bulent","year":"2020","unstructured":"Mert Bulent Sariyildiz, Julien Perez, and Diane Larlus. 2020. Learning Visual Representations with Caption Annotations. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_72_1","volume-title":"LAION-400M: Open Dataset of CLIP-Filtered 400 Million Image-Text Pairs. ArXiv abs\/2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. LAION-400M: Open Dataset of CLIP-Filtered 400 Million Image-Text Pairs. ArXiv abs\/2111.02114 (2021)."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/1076034.1076045"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.895972"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.77"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3591106.3592234"},{"key":"e_1_3_2_1_77_1","unstructured":"Hugo Touvron Louis Martin Kevin R. Stone Peter Albert and Amjad Almahairi et al. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. ArXiv abs\/2307.09288 (2023)."},{"key":"e_1_3_2_1_78_1","volume-title":"2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Vo Nam S.","year":"2018","unstructured":"Nam S. Vo, Lu Jiang, Chen Sun, Kevin P. Murphy, Li-Jia Li, Li Fei-Fei, and James Hays. 2018. Composing Text and Image for Image Retrieval - an Empirical Odyssey. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2018), 6432--6441."},{"key":"e_1_3_2_1_79_1","volume-title":"Prototype-Enhanced Hypergraph Learning for Heterogeneous Information Networks. In International Conference on Multimedia Modeling. Springer, 462--476","author":"Shen Jiayi","year":"2024","unstructured":"ShuaiWang, Jiayi Shen, Athanasios Efthymiou, Stevan Rudinac, Monika Kackovic, Nachoem Wijnberg, and MarcelWorring. 2024. Prototype-Enhanced Hypergraph Learning for Heterogeneous Information Networks. In International Conference on Multimedia Modeling. Springer, 462--476."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/1390334.1390374"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.320"},{"key":"e_1_3_2_1_82_1","volume-title":"Element-aware Summarization with Large Language Models: Expert-aligned Evaluation and Chainof-Thought Method. In Annual Meeting of the Association for Computational Linguistics.","author":"Wang Yiming","year":"2023","unstructured":"Yiming Wang, Zhuosheng Zhang, and Rui Wang. 2023. Element-aware Summarization with Large Language Models: Expert-aligned Evaluation and Chainof-Thought Method. In Annual Meeting of the Association for Computational Linguistics."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3449893"},{"key":"e_1_3_2_1_84_1","volume-title":"F. Xia, Quoc Le, and Denny Zhou.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Huai hsin Chi, F. Xia, Quoc Le, and Denny Zhou. 2022. Chain of Thought Prompting Elicits Reasoning in Large Language Models. ArXiv abs\/2201.11903 (2022)."},{"key":"e_1_3_2_1_85_1","volume-title":"Annual International ACM SIGIR Conference on Research and Development in Information Retrieval.","author":"Xu Jinxi","unstructured":"Jinxi Xu and W. Bruce Croft. 1996. Query expansion using local and global document analysis. In Annual International ACM SIGIR Conference on Research and Development in Information Retrieval."},{"key":"e_1_3_2_1_86_1","volume-title":"MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Xu Jun","year":"2016","unstructured":"Jun Xu, Tao Mei, Ting Yao, and Yong Rui. 2016. MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016), 5288--5296."},{"key":"e_1_3_2_1_87_1","volume-title":"Enhancing conversational search: Large language model-aided informative query rewriting. arXiv preprint arXiv:2310.09716","author":"Ye Fanghua","year":"2023","unstructured":"Fanghua Ye, Meng Fang, Shenghui Li, and Emine Yilmaz. 2023. Enhancing conversational search: Large language model-aided informative query rewriting. arXiv preprint arXiv:2310.09716 (2023)."},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806279"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2755986"},{"key":"e_1_3_2_1_90_1","volume-title":"HallE-Switch: Rethinking and Controlling Object Existence Hallucinations in Large Vision Language Models for Detailed Caption. ArXiv abs\/2310.01779","author":"Zhai Bohan","year":"2023","unstructured":"Bohan Zhai, Shijia Yang, Xiangchen Zhao, Chenfeng Xu, Sheng Shen, Dongdi Zhao, Kurt Keutzer, Manling Li, Tan Yan, and Xiangjun Fan. 2023. HallE-Switch: Rethinking and Controlling Object Existence Hallucinations in Large Vision Language Models for Detailed Caption. ArXiv abs\/2310.01779 (2023)."},{"key":"e_1_3_2_1_91_1","volume-title":"International Conference on Information and Knowledge Management.","author":"Zhai ChengXiang","unstructured":"ChengXiang Zhai and John D. Lafferty. 2001. Model-based feedback in the language modeling approach to information retrieval. In International Conference on Information and Knowledge Management."},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.120"},{"key":"e_1_3_2_1_93_1","volume-title":"Proceedings of the 7th Machine Learning for Healthcare Conference (Proceedings of Machine Learning Research), Zachary Lipton, Rajesh Ranganath, Mark Sendak, Michael Sjoding, and Serena Yeung (Eds.)","volume":"182","author":"Zhang Yuhao","unstructured":"Yuhao Zhang, Hang Jiang, Yasuhide Miura, Christopher D. Manning, and Curtis P. Langlotz. 2022. Contrastive Learning of Medical Visual Representations from Paired Images and Text. In Proceedings of the 7th Machine Learning for Healthcare Conference (Proceedings of Machine Learning Research), Zachary Lipton, Rajesh Ranganath, Mark Sendak, Michael Sjoding, and Serena Yeung (Eds.), Vol. 182. PMLR, 2--25."},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123328"},{"key":"e_1_3_2_1_95_1","volume-title":"Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. ArXiv abs\/2306.05685","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Haotong Zhang, Joseph Gonzalez, and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. ArXiv abs\/2306.05685 (2023)."}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658032","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658032","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:50:59Z","timestamp":1755766259000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658032"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":95,"alternative-id":["10.1145\/3652583.3658032","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658032","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}