{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T16:51:56Z","timestamp":1781542316448,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Singapore Ministry of Education Academic Research Fund (AcRF) Tier 1 grant","award":["25-SIS-SMU-004"],"award-info":[{"award-number":["25-SIS-SMU-004"]}]},{"name":"Singapore Ministry of Education Academic Research Fund (AcRF) Tier 2 grant","award":["T2EP20222-0047"],"award-info":[{"award-number":["T2EP20222-0047"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810720","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1759-1767","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Pruning-based Question-Answering for Interactive Video Search: A Simple Baseline"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3470-6868","authenticated-orcid":false,"given":"Yu-Tong","family":"Cheng","sequence":"first","affiliation":[{"name":"Singapore Management University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1289-3785","authenticated-orcid":false,"given":"Phuong Anh","family":"Nguyen","sequence":"additional","affiliation":[{"name":"Singapore Management University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4182-8261","authenticated-orcid":false,"given":"Chong-Wah","family":"Ngo","sequence":"additional","affiliation":[{"name":"Singapore Management University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00774"},{"key":"e_1_3_3_1_3_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang Humen Zhong Yuanzhi Zhu Mingkun Yang Zhaohai Li Jianqiang Wan Pengfei Wang Wei Ding Zheren Fu Yiheng Xu Jiabo Ye Xi Zhang Tianbao Xie Zesen Cheng Hang Zhang Zhibo Yang Haiyang Xu and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2502.13923\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2502.13923"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00185"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-95-6963-2_19"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.1998.719859"},{"key":"e_1_3_3_1_7_2","unstructured":"Aaron Grattafiori Abhimanyu Dubey et\u00a0al. 2024. The Llama 3 Herd of Models. arxiv:https:\/\/arXiv.org\/abs\/2407.21783\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-industry.41"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/1282280.1282369"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David\u00a0A. Shamma Michael\u00a0S. Bernstein and Li Fei-Fei. 2017. Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. Int. J. Comput. Vision 123 1 (May 2017) 32\u201373. 10.1007\/s11263-016-0981-7","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.46"},{"key":"e_1_3_3_1_12_2","first-page":"61437","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Levy Matan","year":"2023","unstructured":"Matan Levy, Rami Ben-Ari, Nir Darshan, and Dani Lischinski. 2023. Chatting Makes Perfect: Chat-based Image Retrieval. In Advances in Neural Information Processing Systems , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 61437\u201361449. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/c1b3d1e2cf53bb28cabd801bd58b3521-Paper-Conference.pdf"},{"key":"e_1_3_3_1_13_2","unstructured":"Mike Lewis Yinhan Liu Naman Goyal Marjan Ghazvininejad Abdelrahman Mohamed Omer Levy Veselin Stoyanov and Luke Zettlemoyer. 2019. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation Translation and Comprehension. CoRR abs\/1910.13461 (2019). arXiv:https:\/\/arXiv.org\/abs\/1910.13461http:\/\/arxiv.org\/abs\/1910.13461"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01018"},{"key":"e_1_3_3_1_15_2","unstructured":"Bin Lin Bin Zhu Yang Ye Munan Ning Peng Jin and Li Yuan. 2023. Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.10122 (2023)."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-28238-6_40"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733422"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548361"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00484"},{"key":"e_1_3_3_1_20_2","series-title":"Proceedings of Machine Learning Research","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748\u20138763. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_3_1_21_2","unstructured":"Luca Rossetto Klaus Schoeffmann Cathal Gurrin Jakub Loko\u010d and Werner Bailer. 2025. Results of the 2025 Video Browser Showdown. arxiv:https:\/\/arXiv.org\/abs\/2509.12000\u00a0[cs.MM] https:\/\/arxiv.org\/abs\/2509.12000"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05710-7_29"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"C.\u00a0E. Shannon. 1948. A Mathematical Theory of Communication. The Bell System Technical Journal 27 3 (1948) 379\u2013423.","DOI":"10.1002\/j.1538-7305.1948.tb01338.x"},{"key":"e_1_3_3_1_24_2","volume-title":"Drill-down: interactive retrieval of complex scenes using natural language queries","author":"Tan Fuwen","year":"2019","unstructured":"Fuwen Tan, Paola Cascante-Bonilla, Xiaoxiao Guo, Hui Wu, Song Feng, and Vicente Ordonez. 2019. Drill-down: interactive retrieval of complex scenes using natural language queries. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_3_1_25_2","volume-title":"TREC Video Retrieval Evaluation","author":"Ueki Kazuya","year":"2020","unstructured":"Kazuya Ueki, Ryou Mutou, Takayuki Hori, Yongbeom Kim, and Yuma Suzuki. 2020. Waseda_Meisei_SoftBank at TRECVID 2020: Ad-hoc Video Search. In TREC Video Retrieval Evaluation. https:\/\/api.semanticscholar.org\/CorpusID:212411865"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658052"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Rintaro Yanagi Ren Togo Takahiro Ogawa and Miki Haseyama. 2022. Interactive Re-ranking via Object Entropy-Guided Question Answering for Cross-Modal Image Retrieval. ACM Trans. Multimedia Comput. Commun. Appl. 18 3 Article 68 (2022).","DOI":"10.1145\/3485042"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.02054"},{"key":"e_1_3_3_1_29_2","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Hao Tian Yuchen Duan Weijie Su Jie Shao Zhangwei Gao Erfei Cui Xuehui Wang Yue Cao Yangzhou Liu Xingguang Wei Hongjie Zhang Haomin Wang Weiye Xu Hao Li Jiahao Wang Nianchen Deng Songze Li Yinan He Tan Jiang Jiapeng Luo Yi Wang Conghui He Botian Shi Xingcheng Zhang Wenqi Shao Junjun He Yingtong Xiong Wenwen Qu Peng Sun Penglong Jiao Han Lv Lijun Wu Kaipeng Zhang Huipeng Deng Jiaye Ge Kai Chen Limin Wang Min Dou Lewei Lu Xizhou Zhu Tong Lu Dahua Lin Yu Qiao Jifeng Dai and Wenhai Wang. 2025. InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arxiv:https:\/\/arXiv.org\/abs\/2504.10479\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2504.10479"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:51:58Z","timestamp":1781538718000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810720"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":28,"alternative-id":["10.1145\/3805622.3810720","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810720","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}