{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:13:37Z","timestamp":1750220017534,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592238","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"76-84","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["CMMT: Cross-Modal Meta-Transformer for Video-Text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6903-2962","authenticated-orcid":false,"given":"Yizhao","family":"Gao","sequence":"first","affiliation":[{"name":"Renmin University of China, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0280-7724","authenticated-orcid":false,"given":"Zhiwu","family":"Lu","sequence":"additional","affiliation":[{"name":"Renmin University of China, China"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Elad Amrani Rami Ben-Ari Daniel Rotman and Alex Bronstein. 2021. Noise Estimation Using Density Estimation for Self-Supervised Multimodal Learning. In AAAI. 6644\u20136652.","DOI":"10.1609\/aaai.v35i8.16822"},{"volume-title":"ICLR","author":"Bahdanau Dzmitry","key":"e_1_3_2_1_2_1","unstructured":"Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2015. Neural Machine Translation by Jointly Learning to Align and Translate. In ICLR. http:\/\/arxiv.org\/abs\/1409.0473"},{"key":"e_1_3_2_1_3_1","volume-title":"Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. arXiv preprint arXiv:2104.00650","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. arXiv preprint arXiv:2104.00650 (2021). https:\/\/arxiv.org\/abs\/2104.00650"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Hila Chefer Shir Gur and Lior Wolf. 2021. Transformer Interpretability Beyond Attention Visualization. In CVPR. 782\u2013791.","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"e_1_3_2_1_5_1","unstructured":"David Chen and William\u00a0B Dolan. 2011. Collecting highly parallel data for paraphrase evaluation. In ACL. 190\u2013200."},{"key":"e_1_3_2_1_6_1","volume-title":"Microsoft COCO Captions: Data Collection and Evaluation Server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C.\u00a0Lawrence Zitnick. 2015. Microsoft COCO Captions: Data Collection and Evaluation Server. arXiv preprint arXiv:1504.00325 (2015). http:\/\/arxiv.org\/abs\/1504.00325"},{"key":"e_1_3_2_1_7_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT. 4171\u20134186.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT. 4171\u20134186."},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_9_1","unstructured":"Fartash Faghri David\u00a0J. Fleet Jamie\u00a0Ryan Kiros and Sanja Fidler. 2018. VSE++: Improving Visual-Semantic Embeddings with Hard Negatives. In BMVC. 12."},{"key":"e_1_3_2_1_10_1","unstructured":"Chelsea Finn Pieter Abbeel and Sergey Levine. 2017. Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks. In ICML. 1126\u20131135."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Valentin Gabeur Chen Sun Karteek Alahari and Cordelia Schmid. 2020. Multi-modal transformer for video retrieval. In ECCV. 214\u2013229.","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_12_1","unstructured":"Yuying Ge Yixiao Ge Xihui Liu Dian Li Ying Shan Xiaohu Qie and Ping Luo. 2022. Bridging Video-Text Retrieval With Multiple Choice Questions. In CVPR. 16167\u201316176."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Bharath Hariharan and Ross\u00a0B. Girshick. 2017. Low-Shot Visual Recognition by Shrinking and Hallucinating Features. In ICCV. 3037\u20133046.","DOI":"10.1109\/ICCV.2017.328"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Feng He Qi Wang Zhifan Feng Wenbin Jiang Yajuan L\u00fc Yong Zhu and Xiao Tan. 2021. Improving Video Retrieval by Adaptive Margin. In SIGIR. ACM 1359\u20131368.","DOI":"10.1145\/3404835.3462927"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Lisa Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing moments in video with natural language. In ICCV. 5804\u20135813.","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Weike Jin Zhou Zhao Pengcheng Zhang Jieming Zhu Xiuqiang He and Yueting Zhuang. 2021. Hierarchical Cross-Modal Graph Consistency Learning for Video-Text Retrieval. In SIGIR. ACM 1114\u20131124.","DOI":"10.1145\/3404835.3462974"},{"key":"e_1_3_2_1_17_1","volume-title":"Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539","author":"Kiros Ryan","year":"2014","unstructured":"Ryan Kiros, Ruslan Salakhutdinov, and Richard\u00a0S. Zemel. 2014. Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014). http:\/\/arxiv.org\/abs\/1411.2539"},{"key":"e_1_3_2_1_18_1","volume-title":"Video Understanding as Machine Translation. arXiv preprint arXiv:2006.07203","author":"Korbar Bruno","year":"2020","unstructured":"Bruno Korbar, Fabio Petroni, Rohit Girdhar, and Lorenzo Torresani. 2020. Video Understanding as Machine Translation. arXiv preprint arXiv:2006.07203 (2020). https:\/\/arxiv.org\/abs\/2006.07203"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Kenji Hata Frederic Ren Li Fei-Fei and Juan Carlos\u00a0Niebles. 2017. Dense-captioning events in videos. In ICCV. 706\u2013715.","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_20_1","volume-title":"Visual genome: Connecting language and vision using crowdsourced dense image annotations. IJCV","author":"Krishna Ranjay","year":"2017","unstructured":"Ranjay Krishna, Yuke Zhu, Oliver Groth, Justin Johnson, Kenji Hata, Joshua Kravitz, Stephanie Chen, Yannis Kalantidis, Li-Jia Li, David\u00a0A Shamma, 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. IJCV (2017), 32\u201373."},{"key":"e_1_3_2_1_21_1","volume-title":"Less is More: ClipBERT for Video-and-Language Learning via Sparse Sampling. CVPR","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Linjie Li, Luowei Zhou, Zhe Gan, Tamara\u00a0L Berg, Mohit Bansal, and Jingjing Liu. 2021. Less is More: ClipBERT for Video-and-Language Learning via Sparse Sampling. CVPR (2021), 7331\u20137341."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Kai Li Yulun Zhang Kunpeng Li and Yun Fu. 2020. Adversarial Feature Hallucination Networks for Few-Shot Learning. In CVPR. 13467\u201313476.","DOI":"10.1109\/CVPR42600.2020.01348"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"e_1_3_2_1_24_1","volume-title":"Meta-SGD: Learning to Learn Quickly for Few Shot Learning. arXiv preprint arXiv:1707.09835","author":"Li Zhenguo","year":"2017","unstructured":"Zhenguo Li, Fengwei Zhou, Fei Chen, and Hang Li. 2017. Meta-SGD: Learning to Learn Quickly for Few Shot Learning. arXiv preprint arXiv:1707.09835 (2017). http:\/\/arxiv.org\/abs\/1707.09835"},{"key":"e_1_3_2_1_25_1","volume-title":"HiT: Hierarchical Transformer with Momentum Contrast for Video-Text Retrieval. arXiv preprint arXiv:2103.15049","author":"Liu Song","year":"2021","unstructured":"Song Liu, Haoqi Fan, Shengsheng Qian, Yiru Chen, Wenkui Ding, and Zhongyuan Wang. 2021. HiT: Hierarchical Transformer with Momentum Contrast for Video-Text Retrieval. arXiv preprint arXiv:2103.15049 (2021). https:\/\/arxiv.org\/abs\/2103.15049"},{"key":"e_1_3_2_1_26_1","unstructured":"Yang Liu Samuel Albanie Arsha Nagrani and Andrew Zisserman. 2019. Use What You Have: Video retrieval using representations from collaborative experts. In BMVC. 279."},{"key":"e_1_3_2_1_27_1","volume-title":"COTS: Collaborative Two-Stream Vision-Language Pre-Training Model for Cross-Modal Retrieval. In CVPR. 15692\u201315701.","author":"Lu Haoyu","year":"2022","unstructured":"Haoyu Lu, Nanyi Fei, Yuqi Huo, Yizhao Gao, Zhiwu Lu, and Ji-Rong Wen. 2022. COTS: Collaborative Two-Stream Vision-Language Pre-Training Model for Cross-Modal Retrieval. In CVPR. 15692\u201315701."},{"key":"e_1_3_2_1_28_1","volume-title":"UniVL: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353","author":"Luo Huaishao","year":"2020","unstructured":"Huaishao Luo, Lei Ji, Botian Shi, Haoyang Huang, Nan Duan, Tianrui Li, Xilin Chen, and Ming Zhou. 2020. UniVL: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020). https:\/\/arxiv.org\/abs\/2002.06353"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Antoine Miech Dimitri Zhukov Jean-Baptiste Alayrac Makarand Tapaswi Ivan Laptev and Josef Sivic. 2019. HowTo100M: Learning a text-video embedding by watching hundred million narrated video clips. In ICCV. 2630\u20132640.","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Niluthpol\u00a0Chowdhury Mithun Juncheng Li Florian Metze and Amit\u00a0K Roy-Chowdhury. 2018. Learning joint embedding with multimodal cues for cross-modal video-text retrieval. In ICMR. 19\u201327.","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_1_31_1","unstructured":"Tsendsuren Munkhdalai and Hong Yu. 2017. Meta Networks. In ICML. 2554\u20132563."},{"key":"e_1_3_2_1_32_1","volume-title":"On first-order meta-learning algorithms. arXiv preprint arXiv:1803.02999","author":"Nichol Alex","year":"2018","unstructured":"Alex Nichol, Joshua Achiam, and John Schulman. 2018. On first-order meta-learning algorithms. arXiv preprint arXiv:1803.02999 (2018). http:\/\/arxiv.org\/abs\/1803.02999"},{"key":"e_1_3_2_1_33_1","unstructured":"Vicente Ordonez Girish Kulkarni and Tamara Berg. 2011. Im2Text: Describing images using 1 million captioned photographs. In NeurIPS. 1143\u20131151."},{"key":"e_1_3_2_1_34_1","unstructured":"Mandela Patrick Po-Yao Huang Yuki\u00a0Markus Asano Florian Metze Alexander\u00a0G. Hauptmann Jo\u00e3o\u00a0F. Henriques and Andrea Vedaldi. 2021. Support-set bottlenecks for video-text representation learning. In ICLR. https:\/\/openreview.net\/forum?id=EqoXe2zmhrh"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Bryan\u00a0A. Plummer Liwei Wang Chris\u00a0M. Cervantes Juan\u00a0C. Caicedo Julia Hockenmaier and Svetlana Lazebnik. 2015. Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. In ICCV. 2641\u20132649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_36_1","unstructured":"Sachin Ravi and Hugo Larochelle. 2017. Optimization as a Model for Few-Shot Learning. In ICLR. https:\/\/openreview.net\/forum?id=rJY0-Kcll"},{"key":"e_1_3_2_1_37_1","volume-title":"AVLnet: Learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199","author":"Rouditchenko Andrew","year":"2020","unstructured":"Andrew Rouditchenko, Angie Boggust, David Harwath, Dhiraj Joshi, Samuel Thomas, Kartik Audhkhasi, Rogerio Feris, Brian Kingsbury, Michael Picheny, Antonio Torralba, 2020. AVLnet: Learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199 (2020). https:\/\/arxiv.org\/abs\/2006.09199"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Piyush Sharma Nan Ding Sebastian Goodman and Radu Soricut. 2018. Conceptual captions: A cleaned hypernymed image alt-text dataset for automatic image captioning. In ACL. 2556\u20132565.","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_39_1","unstructured":"Jake Snell Kevin Swersky and Richard\u00a0S. Zemel. 2017. Prototypical Networks for Few-shot Learning. In NeurIPS. 4080\u20134090."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Flood Sung Yongxin Yang Li Zhang Tao Xiang Philip H.\u00a0S. Torr and Timothy\u00a0M. Hospedales. 2018. Learning to Compare: Relation Network for Few-Shot Learning. In CVPR. 1199\u20131208.","DOI":"10.1109\/CVPR.2018.00131"},{"key":"e_1_3_2_1_41_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NeurIPS. 5998\u20136008."},{"key":"e_1_3_2_1_42_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NeurIPS. 5998\u20136008."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Subhashini Venugopalan Huijuan Xu Jeff Donahue Marcus Rohrbach Raymond\u00a0J. Mooney and Kate Saenko. 2015. Translating Videos to Natural Language Using Deep Recurrent Neural Networks. In NAACL-HLT. 1494\u20131504.","DOI":"10.3115\/v1\/N15-1173"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Xin Wang Jiawei Wu Junkun Chen Lei Li Yuan-Fang Wang and William\u00a0Yang Wang. 2019. VaTeX: A Large-Scale High-Quality Multilingual Dataset for Video-and-Language Research. In ICCV. 4580\u20134590.","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Xiaohan Wang Linchao Zhu and Yi Yang. 2021. T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval. In CVPR. 5079\u20135088.","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Jun Xu Tao Mei Ting Yao and Yong Rui. 2016. MSR-VTT: A large video description dataset for bridging video and language. In CVPR. 5288\u20135296.","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_47_1","unstructured":"Kelvin Xu Jimmy Ba Ryan Kiros Kyunghyun Cho Aaron\u00a0C. Courville Ruslan Salakhutdinov Richard\u00a0S. Zemel and Yoshua Bengio. 2015. Show Attend and Tell: Neural Image Caption Generation with Visual Attention. In ICML. 2048\u20132057."},{"key":"e_1_3_2_1_48_1","volume-title":"TACo: Token-aware Cascade Contrastive Learning for Video-Text Alignment. arXiv preprint arXiv:2108.09980","author":"Yang Jianwei","year":"2021","unstructured":"Jianwei Yang, Yonatan Bisk, and Jianfeng Gao. 2021. TACo: Token-aware Cascade Contrastive Learning for Video-Text Alignment. arXiv preprint arXiv:2108.09980 (2021). https:\/\/arxiv.org\/abs\/2108.09980"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Han-Jia Ye Hexiang Hu De-Chuan Zhan and Fei Sha. 2020. Few-Shot Learning via Embedding Adaptation with Set-to-Set Functions. In CVPR. 8805\u20138814.","DOI":"10.1109\/CVPR42600.2020.00883"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Youngjae Yu Jongseok Kim and Gunhee Kim. 2018. A joint sequence fusion model for video question answering and retrieval. In ECCV. 487\u2013503.","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Bowen Zhang Hexiang Hu and Fei Sha. 2018. Cross-modal and hierarchical modeling of video and text. In ECCV. 385\u2013401.","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Hongguang Zhang Jing Zhang and Piotr Koniusz. 2019. Few-shot learning via saliency-guided hallucination of samples. In CVPR. 2770\u20132779.","DOI":"10.1109\/CVPR.2019.00288"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Linchao Zhu and Yi Yang. 2020. ActBERT: Learning global-local video-text representations. In CVPR. 8743\u20138752.","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"ICMR '23: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Thessaloniki Greece","acronym":"ICMR '23"},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592238","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592238","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:51:22Z","timestamp":1750182682000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592238"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":53,"alternative-id":["10.1145\/3591106.3592238","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592238","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}