{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T14:46:42Z","timestamp":1773154002391,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1145\/3589334.3645365","type":"proceedings-article","created":{"date-parts":[[2024,5,8]],"date-time":"2024-05-08T07:08:13Z","timestamp":1715152093000},"page":"1374-1385","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Multimodal Query Suggestion with Multi-Agent Reinforcement Learning from Human Feedback"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7064-6267","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"first","affiliation":[{"name":"Huawei Singapore Research Center, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9930-6859","authenticated-orcid":false,"given":"Bingzheng","family":"Gan","sequence":"additional","affiliation":[{"name":"Huawei Singapore Research Center, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2717-4192","authenticated-orcid":false,"given":"Wei","family":"Shi","sequence":"additional","affiliation":[{"name":"Huawei Singapore Research Center, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,5,13]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. NIPS (2022) 23716--23736."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Ziv Bar-Yossef and Naama Kraus. 2011. Context-sensitive query auto-completion. In WWW. 107--116.","DOI":"10.1145\/1963405.1963424"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Jingwen Bian Zheng-Jun Zha Hanwang Zhang Qi Tian and Tat-Seng Chua. 2012. Visual query attributes suggestion. In MM. 869--872.","DOI":"10.1145\/2393347.2396334"},{"key":"e_1_3_2_2_4_1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. 2020. Language models are few-shot learners. NIPS, Vol. 33 (2020), 1877--1901.","journal-title":"NIPS"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2071389.2071390"},{"key":"e_1_3_2_2_6_1","volume-title":"Visualgpt: Data-efficient adaptation of pretrained language models for image captioning. In CVPR. 18030--18040.","author":"Chen Jun","year":"2022","unstructured":"Jun Chen, Han Guo, Kai Yi, Boyang Li, and Mohamed Elhoseiny. 2022. Visualgpt: Data-efficient adaptation of pretrained language models for image captioning. In CVPR. 18030--18040."},{"key":"e_1_3_2_2_7_1","volume-title":"ICML. PMLR","author":"Cho Jaemin","year":"2021","unstructured":"Jaemin Cho, Jie Lei, Hao Tan, and Mohit Bansal. 2021. Unifying vision-and-language tasks via text generation. In ICML. PMLR, 1931--1942."},{"key":"e_1_3_2_2_8_1","volume-title":"Towards coherent and cohesive long-form text generation. arXiv preprint","author":"Cho Woon Sang","year":"2018","unstructured":"Woon Sang Cho, Pengchuan Zhang, Yizhe Zhang, Xiujun Li, Michel Galley, Chris Brockett, Mengdi Wang, and Jianfeng Gao. 2018. Towards coherent and cohesive long-form text generation. arXiv preprint (2018)."},{"key":"e_1_3_2_2_9_1","volume-title":"Deep reinforcement learning from human preferences. NIPS","author":"Christiano Paul F","year":"2017","unstructured":"Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei. 2017. Deep reinforcement learning from human preferences. NIPS (2017)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2668120"},{"key":"e_1_3_2_2_11_1","volume-title":"Imagenet: A large-scale hierarchical image database. In CVPR. Ieee, 248--255.","author":"Deng Jia","year":"2009","unstructured":"Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database. In CVPR. Ieee, 248--255."},{"key":"e_1_3_2_2_12_1","volume-title":"Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al.","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi SM Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al. 2023. Palm-e: An embodied multimodal language model. arXiv preprint (2023)."},{"key":"e_1_3_2_2_13_1","volume-title":"Chatgpt outperforms crowd-workers for text-annotation tasks. arXiv preprint","author":"Gilardi Fabrizio","year":"2023","unstructured":"Fabrizio Gilardi, Meysam Alizadeh, and Ma\u00ebl Kubli. 2023. Chatgpt outperforms crowd-workers for text-annotation tasks. arXiv preprint (2023)."},{"key":"e_1_3_2_2_14_1","volume-title":"Ontological queries: Rewriting and optimization","author":"Gottlob Georg","unstructured":"Georg Gottlob, Giorgio Orsi, and Andreas Pieris. 2011. Ontological queries: Rewriting and optimization. In ICDE. IEEE, 2--13."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2638546"},{"key":"e_1_3_2_2_16_1","volume-title":"Boyang Li, Dacheng Tao, and Steven CH Hoi.","author":"Guo Jiaxian","year":"2022","unstructured":"Jiaxian Guo, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Boyang Li, Dacheng Tao, and Steven CH Hoi. 2022. From Images to Textual Prompts: Zero-shot VQA with Frozen Large Language Models. arXiv preprint (2022)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"crossref","unstructured":"Matthias Hagen Martin Potthast Marcel Gohsen Anja Rathgeber and Benno Stein. 2017. A large-scale query spelling correction corpus. In SIGIR. 1261--1264.","DOI":"10.1145\/3077136.3080749"},{"key":"e_1_3_2_2_18_1","volume-title":"Qiang Liu, et al.","author":"Huang Shaohan","year":"2023","unstructured":"Shaohan Huang, Li Dong, Wenhui Wang, Yaru Hao, Saksham Singhal, Shuming Ma, Tengchao Lv, Lei Cui, Owais Khan Mohammed, Qiang Liu, et al. 2023. Language is not all you need: Aligning perception with language models. arXiv preprint (2023)."},{"key":"e_1_3_2_2_19_1","volume-title":"Craig Ferguson, Agata Lapedriza, Noah Jones, Shixiang Gu, and Rosalind Picard.","author":"Jaques Natasha","year":"2019","unstructured":"Natasha Jaques, Asma Ghandeharioun, Judy Hanwen Shen, Craig Ferguson, Agata Lapedriza, Noah Jones, Shixiang Gu, and Rosalind Picard. 2019. Way off-policy batch deep reinforcement learning of implicit human preferences in dialog. arXiv preprint (2019)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Kalervo Jarvelin. 2000. IR evaluation methods for retrieving highly relevant documents. In SIGIR. 41--48.","DOI":"10.1145\/345508.345545"},{"key":"e_1_3_2_2_21_1","unstructured":"Chao Jia Yinfei Yang Ye Xia Yi-Ting Chen Zarana Parekh Hieu Pham Quoc Le Yun-Hsuan Sung Zhen Li and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML. PMLR 4904--4916."},{"key":"e_1_3_2_2_22_1","volume-title":"Can neural machine translation be improved with user feedback? arXiv preprint","author":"Kreutzer Julia","year":"2018","unstructured":"Julia Kreutzer, Shahram Khadivi, Evgeny Matusov, and Stefan Riezler. 2018. Can neural machine translation be improved with user feedback? arXiv preprint (2018)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Adenike M Lam-Adesina and Gareth JF Jones. 2001. Applying summarization techniques for term selection in relevance feedback. In SIGIR. 1--9.","DOI":"10.1145\/383952.383953"},{"key":"e_1_3_2_2_24_1","volume-title":"Improving a neural semantic parser by counterfactual learning from human bandit feedback. arXiv preprint","author":"Lawrence Carolin","year":"2018","unstructured":"Carolin Lawrence and Stefan Riezler. 2018. Improving a neural semantic parser by counterfactual learning from human bandit feedback. arXiv preprint (2018)."},{"key":"e_1_3_2_2_25_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint (2023)."},{"key":"e_1_3_2_2_26_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888--12900.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888--12900."},{"key":"e_1_3_2_2_27_1","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. NIPS, Vol. 34 (2021), 9694--9705.","journal-title":"NIPS"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-010-0540-0"},{"key":"e_1_3_2_2_29_1","volume-title":"Microsoft coco: Common objects in context","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In ECCV. Springer, 740--755."},{"key":"e_1_3_2_2_30_1","volume-title":"Visual instruction tuning. arXiv preprint","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. arXiv preprint (2023)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"crossref","unstructured":"Yiding Liu Weixue Lu Suqi Cheng Daiting Shi Shuaiqiang Wang Zhicong Cheng and Dawei Yin. 2021. Pre-trained language model for web-scale retrieval in baidu search. In SIGKDD. 3365--3375.","DOI":"10.1145\/3447548.3467149"},{"key":"e_1_3_2_2_32_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint (2023)."},{"key":"e_1_3_2_2_33_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. 2022. Training language models to follow instructions with human feedback. NIPS, Vol. 35 (2022), 27730--27744.","journal-title":"NIPS"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Bryan A Plummer Liwei Wang Chris M Cervantes Juan C Caicedo Julia Hockenmaier and Svetlana Lazebnik. 2015. Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In ICCV. 2641--2649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_2_35_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748--8763."},{"key":"e_1_3_2_2_36_1","volume-title":"Sentence-bert: Sentence embeddings using siamese bert-networks. EMNLP","author":"Reimers Nils","year":"2019","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-bert: Sentence embeddings using siamese bert-networks. EMNLP (2019)."},{"key":"e_1_3_2_2_37_1","volume-title":"Proximal policy optimization algorithms. arXiv preprint","author":"Schulman John","year":"2017","unstructured":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. 2017. Proximal policy optimization algorithms. arXiv preprint (2017)."},{"key":"e_1_3_2_2_38_1","volume-title":"Hugginggpt: Solving ai tasks with chatgpt and its friends in huggingface. arXiv preprint","author":"Shen Yongliang","year":"2023","unstructured":"Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, and Yueting Zhuang. 2023. Hugginggpt: Solving ai tasks with chatgpt and its friends in huggingface. arXiv preprint (2023)."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Milad Shokouhi. 2013. Learning to personalize query auto-completion. In SIGIR. 103--112.","DOI":"10.1145\/2484028.2484076"},{"key":"e_1_3_2_2_40_1","unstructured":"David Silver Guy Lever Nicolas Heess Thomas Degris Daan Wierstra and Martin Riedmiller. 2014. Deterministic policy gradient algorithms. In ICML. PMLR 387--395."},{"key":"e_1_3_2_2_41_1","volume-title":"Saksham Singhal, Subhojit Som, et al.","author":"Wang Wenhui","year":"2023","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, et al. 2023. Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks. In CVPR. 19175--19186."},{"key":"e_1_3_2_2_42_1","volume-title":"Zihang Dai, Yulia Tsvetkov, and Yuan Cao.","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021. Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint (2021)."},{"key":"e_1_3_2_2_43_1","volume-title":"Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning","author":"Williams Ronald J","year":"1992","unstructured":"Ronald J Williams. 1992. Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning, Vol. 8, 3 (1992), 229--256."},{"key":"e_1_3_2_2_44_1","volume-title":"Visual chatgpt: Talking, drawing and editing with visual foundation models. arXiv preprint","author":"Wu Chenfei","year":"2023","unstructured":"Chenfei Wu, Shengming Yin, Weizhen Qi, Xiaodong Wang, Zecheng Tang, and Nan Duan. 2023. Visual chatgpt: Talking, drawing and editing with visual foundation models. arXiv preprint (2023)."},{"key":"e_1_3_2_2_45_1","volume-title":"Recursively summarizing books with human feedback. arXiv preprint","author":"Wu Jeff","year":"2021","unstructured":"Jeff Wu, Long Ouyang, Daniel M Ziegler, Nisan Stiennon, Ryan Lowe, Jan Leike, and Paul Christiano. 2021. Recursively summarizing books with human feedback. arXiv preprint (2021)."},{"key":"e_1_3_2_2_46_1","volume-title":"Acm sigir forum","author":"Xu Jinxi","unstructured":"Jinxi Xu and W Bruce Croft. 2017. Quary expansion using local and global document analysis. In Acm sigir forum, Vol. 51. ACM New York, NY, USA, 168--175."},{"key":"e_1_3_2_2_47_1","volume-title":"Mm-react: Prompting chatgpt for multimodal reasoning and action. arXiv preprint","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Ehsan Azarnasab, Faisal Ahmed, Zicheng Liu, Ce Liu, Michael Zeng, and Lijuan Wang. 2023. Mm-react: Prompting chatgpt for multimodal reasoning and action. arXiv preprint (2023)."},{"key":"e_1_3_2_2_48_1","volume-title":"Towards coherent and engaging spoken dialog response generation using automatic conversation evaluators. arXiv preprint","author":"Yi Sanghyun","year":"2019","unstructured":"Sanghyun Yi, Rahul Goel, Chandra Khatri, Alessandra Cervone, Tagyoung Chung, Behnam Hedayatnia, Anu Venkatesh, Raefer Gabriel, and Dilek Hakkani-Tur. 2019. Towards coherent and engaging spoken dialog response generation using automatic conversation evaluators. arXiv preprint (2019)."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"crossref","unstructured":"Zhaoyang Zeng Jianlong Fu Hongyang Chao and Tao Mei. 2017. Searching personal photos on the phone with instant visual query suggestion and joint text-image hashing. In MM. 118--126.","DOI":"10.1145\/3123266.3123446"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"crossref","unstructured":"Zheng-Jun Zha Linjun Yang Tao Mei Meng Wang and Zengfu Wang. 2009. Visual query suggestion. In MM. 15--24.","DOI":"10.1145\/1631272.1631278"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/1823746.1823747"},{"key":"e_1_3_2_2_52_1","volume-title":"Lit: Zero-shot transfer with locked-image text tuning. In CVPR. 18123--18133.","author":"Zhai Xiaohua","year":"2022","unstructured":"Xiaohua Zhai, Xiao Wang, Basil Mustafa, Andreas Steiner, Daniel Keysers, Alexander Kolesnikov, and Lucas Beyer. 2022. Lit: Zero-shot transfer with locked-image text tuning. In CVPR. 18123--18133."},{"key":"e_1_3_2_2_53_1","volume-title":"Vinvl: Making visual representations matter in vision-language models. arXiv preprint","author":"Zhang Pengchuan","year":"2021","unstructured":"Pengchuan Zhang, Xiujun Li, Xiaowei Hu, Jianwei Yang, Lei Zhang, Lijuan Wang, Yejin Choi, and Jianfeng Gao. 2021. Vinvl: Making visual representations matter in vision-language models. arXiv preprint, Vol. 1, 6 (2021), 8."},{"key":"e_1_3_2_2_54_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint (2022)."}],"event":{"name":"WWW '24: The ACM Web Conference 2024","location":"Singapore Singapore","acronym":"WWW '24","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2024"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589334.3645365","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3589334.3645365","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:25:43Z","timestamp":1755822343000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589334.3645365"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":54,"alternative-id":["10.1145\/3589334.3645365","10.1145\/3589334"],"URL":"https:\/\/doi.org\/10.1145\/3589334.3645365","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]},"assertion":[{"value":"2024-05-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}