{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T15:54:30Z","timestamp":1776441270577,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"the Climbing Plan Project","award":["Grant No. E3Z0261"],"award-info":[{"award-number":["Grant No. E3Z0261"]}]},{"name":"the Central Guidance for Local Special Project","award":["Grant No. Z231100005923044"],"award-info":[{"award-number":["Grant No. Z231100005923044"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680673","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"3955-3963","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["T2VIndexer: A Generative Video Indexer for Efficient Text-Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-4037-819X","authenticated-orcid":false,"given":"Yili","family":"Li","sequence":"first","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3966-511X","authenticated-orcid":false,"given":"Jing","family":"Yu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6784-0221","authenticated-orcid":false,"given":"Keke","family":"Gai","sequence":"additional","affiliation":[{"name":"School of Cyberspace Science and Technology, Beijing Institute of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9483-8984","authenticated-orcid":false,"given":"Bang","family":"Liu","sequence":"additional","affiliation":[{"name":"Universit\u00e9 de Montr\u00e9al &amp; Mila, Montr\u00e9al, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3190-6521","authenticated-orcid":false,"given":"Gang","family":"Xiong","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Science, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3631-256X","authenticated-orcid":false,"given":"Qi","family":"Wu","sequence":"additional","affiliation":[{"name":"Australia Institute of Machine Learning, University of Adelaide, Adelaide, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","volume-title":"Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In 2021 IEEE International Conference on Computer Vision. 1708--1718","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In 2021 IEEE International Conference on Computer Vision. 1708--1718."},{"key":"e_1_3_2_1_3_1","volume-title":"Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems","author":"Bevilacqua Michele","year":"2022","unstructured":"Michele Bevilacqua, Giuseppe Ottaviano, Patrick S. H. Lewis, Scott Yih, Sebastian Riedel, and Fabio Petroni. 2022. Autoregressive Search Engines: Generating Substrings as Document Identifiers. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022."},{"key":"e_1_3_2_1_4_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_6_1","volume-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)."},{"key":"e_1_3_2_1_7_1","volume-title":"2021 9th International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 2021 9th International Conference on Learning Representations."},{"key":"e_1_3_2_1_8_1","volume-title":"CLIP2Video: Mastering Video-Text Retrieval via Image CLIP. arXiv preprint arXiv:2106.11097","author":"Fang Han","year":"2021","unstructured":"Han Fang, Pengfei Xiong, Luhui Xu, and Yu Chen. 2021. CLIP2Video: Mastering Video-Text Retrieval via Image CLIP. arXiv preprint arXiv:2106.11097 (2021)."},{"key":"e_1_3_2_1_9_1","volume-title":"Computer Vision - ECCV 2020 - 16th European Conference. 214--229.","author":"Gabeur Valentin","unstructured":"Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. 2020. Multi-modal Transformer for Video Retrieval. In Computer Vision - ECCV 2020 - 16th European Conference. 214--229."},{"key":"e_1_3_2_1_10_1","volume-title":"Dense-Captioning Events in Videos. In IEEE International Conference on Computer Vision","author":"Krishna Ranjay","year":"2017","unstructured":"Ranjay Krishna, Kenji Hata, Frederic Ren, Li Fei-Fei, and Juan Carlos Niebles. 2017. Dense-Captioning Events in Videos. In IEEE International Conference on Computer Vision 2017. 706--715."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_12_1","volume-title":"TGIF: A New Dataset and Benchmark on Animated GIF Description. In 2016 IEEE Conference on Computer Vision and Pattern Recognition. 4641--4650","author":"Li Yuncheng","year":"2016","unstructured":"Yuncheng Li, Yale Song, Liangliang Cao, Joel R. Tetreault, Larry Goldberg, Alejandro Jaimes, and Jiebo Luo. 2016. TGIF: A New Dataset and Benchmark on Animated GIF Description. In 2016 IEEE Conference on Computer Vision and Pattern Recognition. 4641--4650."},{"key":"e_1_3_2_1_13_1","volume-title":"30th British Machine Vision Conference","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use What You Have: Video retrieval using representations from collaborative experts. In 30th British Machine Vision Conference 2019. 279."},{"key":"e_1_3_2_1_14_1","volume-title":"Univl: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353","author":"Luo Huaishao","year":"2020","unstructured":"Huaishao Luo, Lei Ji, Botian Shi, Haoyang Huang, Nan Duan, Tianrui Li, Jason Li, Taroon Bharti, and Ming Zhou. 2020. Univl: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_16_1","volume-title":"Scikit-learn: Machine learning in Python. the Journal of machine Learning research","author":"Pedregosa Fabian","year":"2011","unstructured":"Fabian Pedregosa, Ga\u00ebl Varoquaux, Alexandre Gramfort, Vincent Michel, Bertrand Thirion, Olivier Grisel, Mathieu Blondel, Peter Prettenhofer, Ron Weiss, Vincent Dubourg, et al. 2011. Scikit-learn: Machine learning in Python. the Journal of machine Learning research, Vol. 12 (2011), 2825--2830."},{"key":"e_1_3_2_1_17_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.336"},{"key":"e_1_3_2_1_19_1","volume-title":"Generative Retrieval with Semantic Tree-Structured Item Identifiers via Contrastive Learning. arXiv preprint arXiv:2309.13375","author":"Si Zihua","year":"2023","unstructured":"Zihua Si, Zhongxiang Sun, Jiale Chen, Guozhang Chen, Xiaoxue Zang, Kai Zheng, Yang Song, Xiao Zhang, and Jun Xu. 2023. Generative Retrieval with Semantic Tree-Structured Item Identifiers via Contrastive Learning. arXiv preprint arXiv:2309.13375 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems","author":"Tay Yi","year":"2022","unstructured":"Yi Tay, Vinh Tran, Mostafa Dehghani, Jianmo Ni, Dara Bahri, Harsh Mehta, Zhen Qin, Kai Hui, Zhe Zhao, Jai Prakash Gupta, Tal Schuster, William W. Cohen, and Donald Metzler. 2022. Transformer Memory as a Differentiable Search Index. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022."},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems","author":"Wang Yujing","year":"2022","unstructured":"Yujing Wang, Yingyan Hou, Haonan Wang, Ziming Miao, Shibin Wu, Qi Chen, Yuqing Xia, Chengmin Chi, Guoshuai Zhao, Zheng Liu, Xing Xie, Hao Sun, Weiwei Deng, Qi Zhang, and Mao Yang. 2022. A Neural Corpus Indexer for Document Retrieval. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"e_1_3_2_1_23_1","volume-title":"Vlm: Task-agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996","author":"Xu Hu","year":"2021","unstructured":"Hu Xu, Gargi Ghosh, Po-Yao Huang, Prahal Arora, Masoumeh Aminzadeh, Christoph Feichtenhofer, Florian Metze, and Luke Zettlemoyer. 2021. Vlm: Task-agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996 (2021)."},{"key":"e_1_3_2_1_24_1","volume-title":"Image and Video. In International Conference on Machine Learning. 38728--38748","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Ming Yan, Yaya Shi, Jiabo Ye, Yuanhong Xu, Chenliang Li, Bin Bi, Qi Qian, Wei Wang, Guohai Xu, Ji Zhang, Songfang Huang, Fei Huang, and Jingren Zhou. 2023. mPLUG-2: A Modularized Multi-modal Foundation Model Across Text, Image and Video. In International Conference on Machine Learning. 38728--38748."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_26_1","volume-title":"2023 International Conference on Learning Representations.","author":"Xue Hongwei","year":"2023","unstructured":"Hongwei Xue, Yuchong Sun, Bei Liu, Jianlong Fu, Ruihua Song, Houqiang Li, and Jiebo Luo. 2023. Clip-vip: Adapting pre-trained image-text model to video-language alignment. In 2023 International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Bowen Zhang Hexiang Hu and Fei Sha. 2018. Cross-modal and hierarchical modeling of video and text. In 2018 european conference on computer vision. 374--390.","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"e_1_3_2_1_29_1","volume-title":"Irgen: Generative modeling for image retrieval. arXiv preprint arXiv:2303.10126","author":"Zhang Yidan","year":"2023","unstructured":"Yidan Zhang, Ting Zhang, Dong Chen, Yujing Wang, Qi Chen, Xing Xie, Hao Sun, Weiwei Deng, Qi Zhang, Fan Yang, et al. 2023. Irgen: Generative modeling for image retrieval. arXiv preprint arXiv:2303.10126 (2023)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680673","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680673","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:57Z","timestamp":1750295877000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680673"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":31,"alternative-id":["10.1145\/3664647.3680673","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680673","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}