{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T01:36:02Z","timestamp":1769045762320,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["2018AAA0102000"],"award-info":[{"award-number":["2018AAA0102000"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611885","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"6390-6399","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Conversational Composed Retrieval with Iterative Sequence Refinement"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5094-8906","authenticated-orcid":false,"given":"Hao","family":"Wei","sequence":"first","affiliation":[{"name":"Inst. of Comput. Tech., CAS &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5931-0527","authenticated-orcid":false,"given":"Shuhui","family":"Wang","sequence":"additional","affiliation":[{"name":"Inst. of Comput. Tech., CAS, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6123-0043","authenticated-orcid":false,"given":"Zhe","family":"Xue","sequence":"additional","affiliation":[{"name":"BUPT, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9026-8128","authenticated-orcid":false,"given":"Shengbo","family":"Chen","sequence":"additional","affiliation":[{"name":"Henan University, Kaifeng, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7542-296X","authenticated-orcid":false,"given":"Qingming","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences &amp; Inst. of Comput. Tech., CAS, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Learning Attribute Representations With Localization for Flexible Fashion Search. In 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018","author":"Ak Kenan E.","year":"2018","unstructured":"Kenan E. Ak, Ashraf A. Kassim, Joo-Hwee Lim, and Jo Yew Tham. 2018. Learning Attribute Representations With Localization for Flexible Fashion Search. In 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018, Salt Lake City, UT, USA, June 18-22, 2018. Computer Vision Foundation \/ IEEE Computer Society, 7708--7717."},{"key":"e_1_3_2_2_2_1","volume-title":"Compositional Learning of Image-Text Query for Image Retrieval. In IEEE Winter Conference on Applications of Computer Vision, WACV 2021","author":"Anwaar Muhammad Umer","year":"2021","unstructured":"Muhammad Umer Anwaar, Egor Labintcev, and Martin Kleinsteuber. 2021. Compositional Learning of Image-Text Query for Image Retrieval. In IEEE Winter Conference on Applications of Computer Vision, WACV 2021, Waikoloa, HI, USA, January 3-8, 2021. IEEE, 1139--1148."},{"key":"e_1_3_2_2_3_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual."},{"key":"e_1_3_2_2_4_1","volume-title":"Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021","author":"Chen Lili","year":"2021","unstructured":"Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, and Igor Mordatch. 2021. Decision Transformer: Reinforcement Learning via Sequence Modeling. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual. 15084--15097."},{"key":"e_1_3_2_2_5_1","volume-title":"Microsoft COCO Captions: Data Collection and Evaluation Server. CoRR","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2015. Microsoft COCO Captions: Data Collection and Evaluation Server. CoRR, Vol. abs\/1504.00325 (2015)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00307"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.707"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939746"},{"key":"e_1_3_2_2_9_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and Short Papers). Association for Computational Linguistics, 4171--4186."},{"key":"e_1_3_2_2_10_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net."},{"key":"e_1_3_2_2_11_1","volume-title":"VSE: Improving Visual-Semantic Embeddings with Hard Negatives. In British Machine Vision Conference, BMVC. 12","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David J. Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2018. VSE: Improving Visual-Semantic Embeddings with Hard Negatives. In British Machine Vision Conference, BMVC. 12."},{"key":"e_1_3_2_2_12_1","volume-title":"Dialog-based Interactive Image Retrieval. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018","author":"Guo Xiaoxiao","year":"2018","unstructured":"Xiaoxiao Guo, Hui Wu, Yu Cheng, Steven Rennie, Gerald Tesauro, and Rog\u00e9rio Schmidt Feris. 2018. Dialog-based Interactive Image Retrieval. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3-8, 2018, Montr\u00e9al, Canada. 676--686."},{"key":"e_1_3_2_2_13_1","volume-title":"Learning Attribute-driven Disentangled Representations for Interactive Fashion Retrieval. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021","author":"Hou Yuxin","year":"2021","unstructured":"Yuxin Hou, Eleonora Vig, Michael Donoser, and Loris Bazzani. 2021. Learning Attribute-driven Disentangled Representations for Interactive Fashion Retrieval. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021. IEEE, 12127--12137."},{"key":"e_1_3_2_2_14_1","volume-title":"Attentive Memory Networks: Efficient Machine Reading for Conversational Search. CoRR","author":"Kenter Tom","year":"2017","unstructured":"Tom Kenter and Maarten de Rijke. 2017. Attentive Memory Networks: Efficient Machine Reading for Conversational Search. CoRR, Vol. abs\/1712.07229 (2017)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16271"},{"key":"e_1_3_2_2_16_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","volume":"5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021a. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139). PMLR, 5583--5594."},{"key":"e_1_3_2_2_17_1","volume-title":"Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings.","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings."},{"key":"e_1_3_2_2_18_1","volume-title":"Zero-shot Query Contextualization for Conversational Search. In SIGIR '22: The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval","author":"Krasakis Antonios Minas","year":"2022","unstructured":"Antonios Minas Krasakis, Andrew Yates, and Evangelos Kanoulas. 2022. Zero-shot Query Contextualization for Conversational Search. In SIGIR '22: The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, Madrid, Spain, July 11 - 15, 2022. ACM, 1880--1884."},{"key":"e_1_3_2_2_19_1","volume-title":"Computer Vision - ECCV - 15th European Conference (Lecture Notes in Computer Science","author":"Lee Kuang-Huei","unstructured":"Kuang-Huei Lee, Xi Chen, Gang Hua, Houdong Hu, and Xiaodong He. 2018. Stacked Cross Attention for Image-Text Matching. In Computer Vision - ECCV - 15th European Conference (Lecture Notes in Computer Science, Vol. 11208). 212--228."},{"key":"e_1_3_2_2_20_1","volume-title":"Visual Semantic Reasoning for Image-Text Matching. In International Conference on Computer Vision, ICCV. 4653--4661","author":"Li Kunpeng","year":"2019","unstructured":"Kunpeng Li, Yulun Zhang, Kai Li, Yuanyuan Li, and Yun Fu. 2019. Visual Semantic Reasoning for Image-Text Matching. In International Conference on Computer Vision, ICCV. 4653--4661."},{"key":"e_1_3_2_2_21_1","volume-title":"Piotr Doll\u00e1 r, and C. Lawrence Zitnick","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1 r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In Computer Vision - ECCV 2014 - 13th European Conference (Lecture Notes in Computer Science, Vol. 8693). 740--755."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_2_23_1","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada. 13--23."},{"key":"e_1_3_2_2_24_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139). PMLR, 8748--8763."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3020165.3020183"},{"key":"e_1_3_2_2_26_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24","volume":"8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-Shot Text-to-Image Generation. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139). PMLR, 8821--8831."},{"key":"e_1_3_2_2_27_1","volume-title":"Fashion-IQ 2020 Challenge 2nd Place Team's Solution. CoRR","author":"Shin Minchul","year":"2020","unstructured":"Minchul Shin, Yoonjae Cho, and Seongwuk Hong. 2020. Fashion-IQ 2020 Challenge 2nd Place Team's Solution. CoRR, Vol. abs\/2007.06404 (2020)."},{"key":"e_1_3_2_2_28_1","volume-title":"VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In 8th International Conference on Learning Representations, ICLR 2020","author":"Su Weijie","year":"2020","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2020. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26-30, 2020. OpenReview.net."},{"key":"e_1_3_2_2_29_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA. 5998--6008."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00660"},{"key":"e_1_3_2_2_31_1","volume-title":"Saksham Singhal, Subhojit Som, and Furu Wei.","author":"Wang Wenhui","year":"2022","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, and Furu Wei. 2022. Image as a Foreign Language: BEiT Pretraining for All Vision and Vision-Language Tasks. CoRR, Vol. abs\/2208.10442 (2022)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"e_1_3_2_2_33_1","volume-title":"Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. CoRR","author":"Wu Yonghui","year":"2016","unstructured":"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser, Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens, George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, and Jeffrey Dean. 2016. Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. CoRR, Vol. abs\/1609.08144 (2016)."},{"key":"e_1_3_2_2_34_1","volume-title":"Virtual Event \/ Singapore","author":"Yanagi Rintaro","year":"2020","unstructured":"Rintaro Yanagi, Ren Togo, Takahiro Ogawa, and Miki Haseyama. 2020. Interactive re-ranking for cross-modal retrieval based on object-wise question answering. In MMAsia 2020: ACM Multimedia Asia, Virtual Event \/ Singapore, 7-9 March, 2021. ACM, 37:1--37:7."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401150"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16431"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462881"},{"key":"e_1_3_2_2_39_1","volume-title":"User Respond. In Proceedings of the 27th ACM International Conference on Information and Knowledge Management, CIKM 2018","author":"Zhang Yongfeng","year":"2019","unstructured":"Yongfeng Zhang, Xu Chen, Qingyao Ai, Liu Yang, and W. Bruce Croft. 2018. Towards Conversational Search and Recommendation: System Ask, User Respond. In Proceedings of the 27th ACM International Conference on Information and Knowledge Management, CIKM 2018, Torino, Italy, October 22-26, 2019. ACM, 177--186."},{"key":"e_1_3_2_2_40_1","volume-title":"Memory-Augmented Attribute Manipulation Networks for Interactive Fashion Search. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017","author":"Zhao Bo","year":"2017","unstructured":"Bo Zhao, Jiashi Feng, Xiao Wu, and Shuicheng Yan. 2017. Memory-Augmented Attribute Manipulation Networks for Interactive Fashion Search. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017. IEEE Computer Society, 6156--6164"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611885","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611885","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:03Z","timestamp":1755820803000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611885"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":40,"alternative-id":["10.1145\/3581783.3611885","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611885","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}