{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:43Z","timestamp":1781539063705,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810613","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"196-205","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AdRetr: Theme-Aware Advertisement Video Retrieval Beyond Keywords"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6143-6967","authenticated-orcid":false,"given":"Neha","family":"Choudhary","sequence":"first","affiliation":[{"name":"Dept. of CSIS, Birla Institute of Technology &amp; Science Pilani, Pilani, Rajasthan, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0101-7932","authenticated-orcid":false,"given":"Abhiraj","family":"Painuly","sequence":"additional","affiliation":[{"name":"Dept. of CSIS, Birla Institute of Technology &amp; Science Pilani, Pilani, Rajasthan, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7880-8219","authenticated-orcid":false,"given":"Yaman","family":"Kumar","sequence":"additional","affiliation":[{"name":"Media and Data Science Research Lab, Adobe Systems, Noida, Uttar Pradesh, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4146-7138","authenticated-orcid":false,"given":"Varun","family":"Khurana","sequence":"additional","affiliation":[{"name":"Media and Data Science Research Lab, Adobe Systems, Noida, Uttar Pradesh, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1556-9905","authenticated-orcid":false,"given":"Poonam","family":"Goyal","sequence":"additional","affiliation":[{"name":"Dept. of CSIS, Birla Institute of Technology &amp; Science Pilani, Pilani, Rajasthan, India"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Aanisha Bhattacharya Yaman\u00a0K Singla Balaji Krishnamurthy Rajiv\u00a0Ratn Shah and Changyou Chen. 2023. A video is worth 4096 tokens: Verbalize videos to understand them in zero shot. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.09758 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.608"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.5555\/3104322.3104338"},{"key":"e_1_3_3_1_4_2","unstructured":"Lisa Cartwright and Marita Sturken. 2001. Practices of looking. An Introduction to Visual Culture 2 (2001)."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00589"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Lin Chen Xilin Wei Jinsong Li Xiaoyi Dong Pan Zhang Yuhang Zang Zehui Chen Haodong Duan Zhenyu Tang Li Yuan et\u00a0al. 2024. Sharegpt4video: Improving video understanding and generation with better captions. Advances in Neural Information Processing Systems 37 (2024) 19472\u201319495.","DOI":"10.52202\/079017-0614"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Sihan Chen Handong Li Qunbo Wang Zijia Zhao Mingzhen Sun Xinxin Zhu and Jing Liu. 2023. Vast: A vision-audio-subtitle-text omni-modality foundation model and dataset. Advances in Neural Information Processing Systems 36 (2023) 72842\u201372866.","DOI":"10.52202\/075280-3185"},{"key":"e_1_3_3_1_8_2","unstructured":"Xing Cheng Hezheng Lin Xiangyu Wu Fan Yang and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.04290 (2021)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00567"},{"key":"e_1_3_3_1_10_2","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_3_1_11_2","unstructured":"Xiaoran Fan Tao Ji Changhao Jiang Shuo Li Senjie Jin Sirui Song Junke Wang Boyang Hong Lu Chen Guodong Zheng Ming Zhang Caishuang Huang Rui Zheng Zhiheng Xi Yuhao Zhou Shihan Dou Junjie Ye Hang Yan Tao Gui Qi Zhang Xipeng Qiu Xuanjing Huang Zuxuan Wu and Yu-Gang Jiang. 2024. MouSi: Poly-Visual-Expert Vision-Language Models. arxiv:https:\/\/arXiv.org\/abs\/2401.17221\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2401.17221"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.123"},{"key":"e_1_3_3_1_13_2","unstructured":"Zhiwei Jia Pradyumna Narayana Arjun\u00a0R Akula Garima Pruthi Hao Su Sugato Basu and Varun Jampani. 2023. KAFA: Rethinking image ad understanding with knowledge-augmented feature adaptation of vision-language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.18373 (2023)."},{"key":"e_1_3_3_1_14_2","unstructured":"Williamson Judith. 1978. Decoding advertisements: Ideology and meaning in advertising. Boston: Marion Boyars (1978)."},{"key":"e_1_3_3_1_15_2","first-page":"113","volume-title":"European Conference on Computer Vision","author":"Kar O\u011fuzhan\u00a0Fatih","year":"2024","unstructured":"O\u011fuzhan\u00a0Fatih Kar, Alessio Tonioni, Petra Poklukar, Achin Kulshrestha, Amir Zamir, and Federico Tombari. 2024. Brave: Broadening the visual encoding of vision-language models. In European Conference on Computer Vision. Springer, 113\u2013132."},{"key":"e_1_3_3_1_16_2","volume-title":"Forty-first International Conference on Machine Learning","author":"Karamcheti Siddharth","year":"2024","unstructured":"Siddharth Karamcheti, Suraj Nair, Ashwin Balakrishna, Percy Liang, Thomas Kollar, and Dorsa Sadigh. 2024. Prismatic vlms: Investigating the design space of visually-conditioned language models. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_3_1_17_2","unstructured":"Varun Khurana Yaman\u00a0Kumar Singla Jayakumar Subramanian Changyou Chen Rajiv\u00a0Ratn Shah zhiqiang xu and Balaji Krishnamurthy. 2025. Measuring And Improving Engagement of Text-to-Image Generation Models. https:\/\/openreview.net\/forum?id=TmCcNuo03f"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25076"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_3_1_20_2","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_1_21_2","unstructured":"Bin Lin Yang Ye Bin Zhu Jiaxi Cui Munan Ning Peng Jin and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.10122 (2023)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems 36 (2023) 34892\u201334916.","DOI":"10.52202\/075280-1516"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"e_1_3_3_1_25_2","unstructured":"Huaishao Luo Lei Ji Ming Zhong Yang Chen Wen Lei Nan Duan and Tianrui Li. 2021. Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.08860 (2021)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Iv\u00e1n Mart\u00edn-Fern\u00e1ndez Mihai\u00a0Gabriel Constantin Bogdan Ionescu Manuel Gil-Mart\u00edn and Fernando Fern\u00e1ndez-Mart\u00ednez. 2026. Principled Evaluation of Multi-Label Persuasion in Advertisements with Large Vision-Language Models. ACM Transactions on Multimedia Computing Communications and Applications (2026).","DOI":"10.1145\/3788874"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Mary\u00a0L McHugh. 2012. Interrater reliability: the kappa statistic. Biochemia medica 22 3 (2012) 276\u2013282.","DOI":"10.11613\/BM.2012.031"},{"key":"e_1_3_3_1_29_2","unstructured":"Antoine Miech Ivan Laptev and Josef Sivic. 2018. Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.02516 (2018)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3298689.3347022"},{"key":"e_1_3_3_1_31_2","first-page":"177","volume-title":"European Conference on Computer Vision","author":"Panagopoulou Artemis","year":"2024","unstructured":"Artemis Panagopoulou, Le Xue, Ning Yu, Junnan Li, Dongxu Li, Shafiq Joty, Ran Xu, Silvio Savarese, Caiming Xiong, and Juan\u00a0Carlos Niebles. 2024. X-instructblip: A framework for aligning image, 3d, audio, video to llms and its emergent cross-modal reasoning. In European Conference on Computer Vision. Springer, 177\u2013197."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Filip Radenovi\u0107 Giorgos Tolias and Ond\u0159ej Chum. 2018. Fine-tuning CNN image retrieval with no human annotation. IEEE transactions on pattern analysis and machine intelligence 41 7 (2018) 1655\u20131668.","DOI":"10.1109\/TPAMI.2018.2846566"},{"key":"e_1_3_3_1_33_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_34_2","unstructured":"Min Shi Fuxiao Liu Shihao Wang Shijia Liao Subhashree Radhakrishnan Yilin Zhao De-An Huang Hongxu Yin Karan Sapra Yaser Yacoob et\u00a0al. 2024. Eagle: Exploring the design space for multimodal llms with mixture of encoders. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.15998 (2024)."},{"key":"e_1_3_3_1_35_2","unstructured":"Somesh Singh Yaman\u00a0Kumar Singla Veeky Baths Rajiv\u00a0Ratn Shah Changyou Chen Balaji Krishnamurthy et\u00a0al. 2024. Llava finds free lunch: Teaching human behavior improves content understanding abilities of llms. CoRR (2024)."},{"key":"e_1_3_3_1_36_2","first-page":"396","volume-title":"European Conference on Computer Vision","author":"Wang Yi","year":"2024","unstructured":"Yi Wang, Kunchang Li, Xinhao Li, Jiashuo Yu, Yinan He, Guo Chen, Baoqi Pei, Rongkun Zheng, Zun Wang, Yansong Shi, et\u00a0al. 2024. Internvideo2: Scaling foundation models for multimodal video understanding. In European Conference on Computer Vision. Springer, 396\u2013416."},{"key":"e_1_3_3_1_37_2","unstructured":"Yi Wang Kunchang Li Yizhuo Li Yinan He Bingkun Huang Zhiyu Zhao Hongjie Zhang Jilan Xu Yi Liu Zun Wang et\u00a0al. 2022. Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.03191 (2022)."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Hu Xu Gargi Ghosh Po-Yao Huang Dmytro Okhonko Armen Aghajanyan Florian Metze Luke Zettlemoyer and Christoph Feichtenhofer. 2021. Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.14084 (2021).","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","unstructured":"Keren Ye Narges\u00a0Honarvar Nazari James Hahn Zaeem Hussain Mingda Zhang and Adriana Kovashka. 2021. Interpreting the Rhetoric of Visual Advertisements. IEEE Transactions on Pattern Analysis and Machine Intelligence 43 4 (2021) 1308\u20131323. 10.1109\/TPAMI.2019.2947440","DOI":"10.1109\/TPAMI.2019.2947440"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_3_1_42_2","unstructured":"Youngjae Yu Hyungjin Ko Jongwook Choi and Gunhee Kim. 2016. Video captioning and retrieval models with semantic attention. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1610.02947 6 7 (2016)."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Hang Zhang Xin Li and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.02858 (2023).","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380001"},{"key":"e_1_3_3_1_45_2","unstructured":"Xiaohan Zou Changqiao Wu Lele Cheng and Zhongyuan Wang. 2022. Tokenflow: Rethinking fine-grained cross-modal alignment in vision-language retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.13822 (2022)."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:43:53Z","timestamp":1781538233000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810613"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":44,"alternative-id":["10.1145\/3805622.3810613","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810613","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}