{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,13]],"date-time":"2026-07-13T18:48:27Z","timestamp":1783968507821,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","funder":[{"name":"Research Grants Council of Hong Kong","award":["PolyU\/15209724, PolyU\/15207821, PolyU\/15207122, PolyU\/15213323"],"award-info":[{"award-number":["PolyU\/15209724, PolyU\/15207821, PolyU\/15207122, PolyU\/15213323"]}]},{"name":"PolyU internal grants (BDWP)"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730077","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:18:36Z","timestamp":1752455916000},"page":"813-822","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["Revolutionizing Text-to-Image Retrieval as Autoregressive Token-to-Voken Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6932-4228","authenticated-orcid":false,"given":"Yongqi","family":"Li","sequence":"first","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9857-6639","authenticated-orcid":false,"given":"Hongru","family":"Cai","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5199-1428","authenticated-orcid":false,"given":"Wenjie","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6555-3834","authenticated-orcid":false,"given":"Leigang","family":"Qu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1791-3159","authenticated-orcid":false,"given":"Yinwei","family":"Wei","sequence":"additional","affiliation":[{"name":"Shandong University, Shandong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7360-8864","authenticated-orcid":false,"given":"Wenjie","family":"Li","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Autoregressive search engines: Generating substrings as document identifiers. arXiv preprint arXiv:2204.10628","author":"Bevilacqua Michele","year":"2022","unstructured":"Michele Bevilacqua, Giuseppe Ottaviano, Patrick Lewis, Wen-tau Yih, Sebastian Riedel, and Fabio Petroni. 2022. Autoregressive search engines: Generating substrings as document identifiers. arXiv preprint arXiv:2204.10628 (2022)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"e_1_3_2_1_4_1","volume-title":"Autoregressive Entity Retrieval. In International Conference on Learning Representations.","author":"Cao Nicola De","year":"2020","unstructured":"Nicola De Cao, Gautier Izacard, Sebastian Riedel, and Fabio Petroni. 2020. Autoregressive Entity Retrieval. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612103"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_1_8_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_9_1","volume-title":"Devise: A deep visual-semantic embedding model. Advances in neural information processing systems 26","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc'Aurelio Ranzato, and Tomas Mikolov. 2013. Devise: A deep visual-semantic embedding model. Advances in neural information processing systems 26 (2013)."},{"key":"e_1_3_2_1_10_1","unstructured":"Yuying Ge Sijie Zhao Ziyun Zeng Yixiao Ge Chen Li Xintao Wang and Ying Shan. 2024. Making LLaMA SEE and Draw with SEED Tokenizer. In ICLR."},{"key":"e_1_3_2_1_11_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang,Weizhu Chen, et al. 2021. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_12_1","volume-title":"ICASSP'82","volume":"7","author":"Juang Biing-Hwang","year":"1982","unstructured":"Biing-Hwang Juang and A Gray. 1982. Multiple stage vector quantization for speech coding. In ICASSP'82. IEEE International Conference on Acoustics, Speech, and Signal Processing, Vol. 7. 597--600."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01123"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612101"},{"key":"e_1_3_2_1_16_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612472"},{"key":"e_1_3_2_1_19_1","volume-title":"A survey of generative search and recommendation in the era of large language models. arXiv preprint arXiv:2404.16924","author":"Li Yongqi","year":"2024","unstructured":"Yongqi Li, Xinyu Lin,WenjieWang, Fuli Feng, Liang Pang,Wenjie Li, Liqiang Nie, Xiangnan He, and Tat-Seng Chua. 2024. A survey of generative search and recommendation in the era of large language models. arXiv preprint arXiv:2404.16924 (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.639"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.366"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612411"},{"key":"e_1_3_2_1_24_1","volume-title":"Unified Text-to-Image Generation and Retrieval. arXiv preprint arXiv:2406.05814","author":"Qu Leigang","year":"2024","unstructured":"Leigang Qu, Haochuan Li, Tan Wang, Wenjie Wang, Yongqi Li, Liqiang Nie, and Tat-Seng Chua. 2024. Unified Text-to-Image Generation and Retrieval. arXiv preprint arXiv:2406.05814 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413961"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591712"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462829"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_29_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_30_1","volume-title":"Aaron Van den Oord, and Oriol Vinyals","author":"Razavi Ali","year":"2019","unstructured":"Ali Razavi, Aaron Van den Oord, and Oriol Vinyals. 2019. Generating diverse high-fidelity images with vq-vae-2. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_31_1","volume-title":"Jing Liu, Hua Wu, Ji-Rong Wen, and Haifeng Wang.","author":"Ren Ruiyang","year":"2023","unstructured":"Ruiyang Ren, Wayne Xin Zhao, Jing Liu, Hua Wu, Ji-Rong Wen, and Haifeng Wang. 2023. TOME: A Two-stage Approach for Model-based Retrieval. arXiv preprint arXiv:2305.11161 (2023)."},{"key":"e_1_3_2_1_32_1","unstructured":"Yi Tay Vinh Q Tran Mostafa Dehghani Jianmo Ni Dara Bahri Harsh Mehta Zhen Qin Kai Hui Zhe Zhao Jai Gupta et al. 2022. Transformer memory as a differentiable search index. arXiv preprint arXiv:2202.06991 (2022)."},{"key":"e_1_3_2_1_33_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_34_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093614"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 10941--10950","author":"Wei Xi","year":"2020","unstructured":"Xi Wei, Tianzhu Zhang, Yan Li, Yongdong Zhang, and Feng Wu. 2020. Multimodality cross attention network for image and sentence matching. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 10941--10950."},{"key":"e_1_3_2_1_38_1","volume-title":"Vision: A Survey. arXiv preprint arXiv:2411.05902","author":"Xiong Jing","year":"2024","unstructured":"Jing Xiong, Gongye Liu, Lun Huang, Chengyue Wu, Taiqiang Wu, Yao Mu, Yuan Yao, Hui Shen, Zhongwei Wan, Jinfa Huang, et al. 2024. Autoregressive Models in Vision: A Survey. arXiv preprint arXiv:2411.05902 (2024)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612207"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_41_1","volume-title":"Han Zhang, Ruoming Pang, James Qin, Alexander Ku, Yuanzhong Xu, Jason Baldridge, and Yonghui Wu.","author":"Yu Jiahui","year":"2021","unstructured":"Jiahui Yu, Xin Li, Jing Yu Koh, Han Zhang, Ruoming Pang, James Qin, Alexander Ku, Yuanzhong Xu, Jason Baldridge, and Yonghui Wu. 2021. Vector-quantized image modeling with improved vqgan. arXiv preprint arXiv:2110.04627 (2021)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Yidan Zhang Ting Zhang Dong Chen Yujing Wang Qi Chen Xing Xie Hao Sun Weiwei Deng Qi Zhang Fan Yang et al. 2023. IRGen: Generative Modeling for Image Retrieval. arXiv preprint arXiv:2303.10126 (2023).","DOI":"10.1007\/978-3-031-72633-0_2"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01064"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383184"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730077","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T18:34:07Z","timestamp":1755887647000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730077"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":44,"alternative-id":["10.1145\/3726302.3730077","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730077","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}