{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T15:13:17Z","timestamp":1776352397275,"version":"3.51.2"},"reference-count":115,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,2,23]],"date-time":"2023-02-23T00:00:00Z","timestamp":1677110400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,2,23]],"date-time":"2023-02-23T00:00:00Z","timestamp":1677110400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62102061"],"award-info":[{"award-number":["62102061"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62102061"],"award-info":[{"award-number":["62102061"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102061"],"award-info":[{"award-number":["62102061"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s13735-023-00267-8","type":"journal-article","created":{"date-parts":[[2023,2,23]],"date-time":"2023-02-23T14:04:47Z","timestamp":1677161087000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":26,"title":["Deep learning for video-text retrieval: a review"],"prefix":"10.1007","volume":"12","author":[{"given":"Cunjuan","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Qi","family":"Jia","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yanming","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,2,23]]},"reference":[{"key":"267_CR1","doi-asserted-by":"crossref","unstructured":"Ali A, Schwartz I, Hazan T, Wolf L (2022) Video and text matching with conditioned embeddings. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision. pp 1565\u20131574","DOI":"10.1109\/WACV51458.2022.00055"},{"key":"267_CR2","first-page":"6644","volume":"35","author":"E Amrani","year":"2021","unstructured":"Amrani E, Ben-Ari R, Rotman D, Bronstein A (2021) Noise estimation using density estimation for self-supervised multimodal learning. Proc AAAI Conf Artif Intell 35:6644\u20136652","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"267_CR3","doi-asserted-by":"crossref","unstructured":"Arandjelovic R, Gronat P, Torii A, Pajdla T, Sivic J (2016) Netvlad: Cnn architecture for weakly supervised place recognition. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5297\u20135307","DOI":"10.1109\/CVPR.2016.572"},{"key":"267_CR4","doi-asserted-by":"crossref","unstructured":"Arnab A, Dehghani M, Heigold G, Sun C, Lu\u010di\u0107 M, Schmid C(2021) Vivit: a video vision transformer. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 6836\u20136846","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"267_CR5","doi-asserted-by":"crossref","unstructured":"Bain M, Nagrani A, Varol G, Zisserman A(2021) Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 1728\u20131738","DOI":"10.1109\/ICCV48922.2021.00175"},{"issue":"2","key":"267_CR6","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1109\/72.279181","volume":"5","author":"Y Bengio","year":"1994","unstructured":"Bengio Y, Simard P, Frasconi P (1994) Learning long-term dependencies with gradient descent is difficult. IEEE Trans Neural Netw 5(2):157\u2013166","journal-title":"IEEE Trans Neural Netw"},{"key":"267_CR7","first-page":"4","volume":"2","author":"G Bertasius","year":"2021","unstructured":"Bertasius G, Wang H, Torresani L (2021) Is space-time attention all you need for video understanding. In ICML 2:4","journal-title":"In ICML"},{"key":"267_CR8","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron F, Escorcia V, Ghanem B, Carlos\u00a0Niebles J (2015) Activitynet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 961\u2013970","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"267_CR9","doi-asserted-by":"crossref","unstructured":"Cao Q, Shen L, Xie W, Parkhi OM, Zisserman A (2018) Vggface2: a dataset for recognising faces across pose and age. In: 2018 13th IEEE international conference on automatic face & gesture recognition (FG 2018). IEEE, pp 67\u201374","DOI":"10.1109\/FG.2018.00020"},{"key":"267_CR10","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"267_CR11","doi-asserted-by":"crossref","unstructured":"Chen S, Zhao Y, Jin Q, Wu Q (2020) Fine-grained video-text retrieval with hierarchical graph reasoning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 10638\u201310647","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"267_CR12","unstructured":"Chen Y (2015) Convolutional neural network for sentence classification. Master\u2019s thesis, University of Waterloo"},{"key":"267_CR13","unstructured":"Cheng X, Lin H, Wu X, Yang F, Shen D (2021) Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290"},{"key":"267_CR14","unstructured":"Chung J, Gulcehre C, Cho K, Bengio Y (2014) Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555"},{"key":"267_CR15","doi-asserted-by":"crossref","unstructured":"Croitoru I, Bogolin S-V, Leordeanu M, Jin H, Zisserman A, Albanie S, Liu Y (2021) Teachtext: crossmodal generalized distillation for text-video retrieval. In: Proceedings of the IEEE\/CVF international conference on computer vision. 
pp 11583\u201311593","DOI":"10.1109\/ICCV48922.2021.01138"},{"key":"267_CR16","doi-asserted-by":"crossref","unstructured":"Deng D, Liu H, Li X, Cai D (2018) Pixellink: detecting scene text via instance segmentation. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a032","DOI":"10.1609\/aaai.v32i1.12269"},{"key":"267_CR17","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"issue":"12","key":"267_CR18","doi-asserted-by":"publisher","first-page":"3377","DOI":"10.1109\/TMM.2018.2832602","volume":"20","author":"J Dong","year":"2018","unstructured":"Dong J, Li X, Snoek CGM (2018) Predicting visual features from text for image and video caption retrieval. IEEE Trans Multimedia 20(12):3377\u20133388","journal-title":"IEEE Trans Multimedia"},{"key":"267_CR19","doi-asserted-by":"crossref","unstructured":"Dong J, Li X, Xu C, Ji S, He Y, Yang G, Wang X (2019) Dual encoding for zero-example video retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 9346\u20139355","DOI":"10.1109\/CVPR.2019.00957"},{"key":"267_CR20","doi-asserted-by":"crossref","unstructured":"Dong J, Wang Y, Chen X, Qu X, Li X, He Y, Wang X (2022) Reading-strategy inspired visual representation learning for text-to-video retrieval. In: IEEE transactions on circuits and systems for video technology","DOI":"10.1109\/TCSVT.2022.3150959"},{"key":"267_CR21","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et\u00a0al. (2020) An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"267_CR22","doi-asserted-by":"crossref","unstructured":"Dzabraev M, Kalashnikov M, Komkov S, Petiushko A (2021) Mdmmt: multidomain multimodal transformer for video retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 3354\u20133363","DOI":"10.1109\/CVPRW53098.2021.00374"},{"key":"267_CR23","unstructured":"Faghri F, Fleet DJ, Kiros JR, Fidler S (2017) Vse++: improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612"},{"key":"267_CR24","first-page":"10754","volume":"34","author":"H Fan","year":"2020","unstructured":"Fan H, Yang Y (2020) Person tube retrieval via language description. Proc AAAI Conf Artif Intell 34:10754\u201310761","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"267_CR25","unstructured":"Fang H, Xiong P, Xu L, Chen Y (2021) Clip2video: mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097"},{"key":"267_CR26","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 1933\u20131941","DOI":"10.1109\/CVPR.2016.213"},{"key":"267_CR27","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 6202\u20136211","DOI":"10.1109\/ICCV.2019.00630"},{"key":"267_CR28","doi-asserted-by":"crossref","unstructured":"Gabeur V, Sun C, Alahari K, Schmid C (2020) Multi-modal transformer for video retrieval. 
In: Computer vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IV 16. Springer, pp 214\u2013219","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"267_CR29","unstructured":"Gao Z, Liu J, Chen S, Chang D, Zhang H, Yuan J (2021) Clip2tv: an empirical study on transformer-based methods for video-text retrieval. arXiv preprint arXiv:2111.05610"},{"key":"267_CR30","doi-asserted-by":"crossref","unstructured":"Ge Y, Ge Y, Liu X, Li D, Shan Y, Qie X, Luo P (2022) Bridgeformer: bridging video-text retrieval with multiple choice questions. arXiv preprint arXiv:2201.04850","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"267_CR31","doi-asserted-by":"crossref","unstructured":"Ge Y, Ge Y, Liu X, Wang AJ, Wu J, Shan Y, Qie X, Luo P (2022) Miles: visual bert pre-training with injected language semantics for video-text retrieval. arXiv preprint arXiv:2204.12408","DOI":"10.1007\/978-3-031-19833-5_40"},{"key":"267_CR32","first-page":"22605","volume":"33","author":"S Ging","year":"2020","unstructured":"Ging S, Zolfaghari M, Pirsiavash H, Brox T (2020) Coot: cooperative hierarchical transformer for video-text representation learning. Adv Neural Inf Process Syst 33:22605\u201322618","journal-title":"Adv Neural Inf Process Syst"},{"key":"267_CR33","doi-asserted-by":"crossref","unstructured":"Gorti SK, Vouitsis N, Ma J, Golestan K, Volkovs M, Garg A, Yu G (2022) X-pool: Cross-modal language-video attention for text-video retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 5006\u20135015","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"267_CR34","doi-asserted-by":"crossref","unstructured":"Gu Y, Ma C, Yang J (2016) Supervised recurrent hashing for large scale video retrieval. In: Proceedings of the 24th ACM international conference on multimedia. pp 272\u2013276","DOI":"10.1145\/2964284.2967225"},{"key":"267_CR35","doi-asserted-by":"crossref","unstructured":"Guo X, Guo X, Lu Y (2021) Ssan: Separable self-attention network for video representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 12618\u201312627","DOI":"10.1109\/CVPR46437.2021.01243"},{"key":"267_CR36","unstructured":"Han N, Chen J, Xiao G, Zeng Y, Shi C, Chen H (2021) Visual spatio-temporal relation-enhanced network for cross-modal text-video retrieval. arXiv preprint arXiv:2110.15609"},{"key":"267_CR37","doi-asserted-by":"crossref","unstructured":"Hara K, Kataoka H, Satoh Y (2018) Can spatiotemporal 3d cnns retrace the history of 2d cnns and imagenet? In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 6546\u20136555","DOI":"10.1109\/CVPR.2018.00685"},{"key":"267_CR38","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"267_CR39","doi-asserted-by":"crossref","unstructured":"He K, Fan H, Wu Y, Xie S, Girshick R (2020) Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 9729\u20139738","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"267_CR40","doi-asserted-by":"crossref","unstructured":"Hershey S, Chaudhuri S, Ellis DPW, Gemmeke JF, Jansen A, Moore RC, Plakal M, Platt D, Saurous RA, Seybold B, et\u00a0al. 
(2017) Cnn architectures for large-scale audio classification. In: 2017 IEEE international conference on acoustics, speech and signal processing (icassp). IEEE, pp 131\u2013135","DOI":"10.1109\/ICASSP.2017.7952132"},{"issue":"8","key":"267_CR41","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"267_CR42","doi-asserted-by":"crossref","unstructured":"Hu J, Shen L, Sun G (2018) Squeeze-and-excitation networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 7132\u20137141","DOI":"10.1109\/CVPR.2018.00745"},{"key":"267_CR43","doi-asserted-by":"crossref","unstructured":"Huang G, Liu Z, Van Der\u00a0Maaten L, Weinberger KQ (2017) Densely connected convolutional networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 4700\u20134708","DOI":"10.1109\/CVPR.2017.243"},{"key":"267_CR44","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"key":"267_CR45","unstructured":"Korbar B, Petroni F, Girdhar R, Torresani L (2020) Video understanding as machine translation. arXiv preprint arXiv:2006.07203"},{"key":"267_CR46","doi-asserted-by":"crossref","unstructured":"Krishna R, Hata K, Ren F, Fei-Fei L, Carlos\u00a0Niebles J (2017) Dense-captioning events in videos. In: Proceedings of the IEEE international conference on computer vision. pp 706\u2013715","DOI":"10.1109\/ICCV.2017.83"},{"key":"267_CR47","first-page":"84","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. Adv Neural Inf Process Syst 25:84\u201390","journal-title":"Adv Neural Inf Process Syst"},{"key":"267_CR48","unstructured":"Kunitsyn A, Kalashnikov M, Dzabraev M, Ivaniuta A (2022) Mdmmt-2: Multidomain multimodal transformer for video retrieval, one more step towards generalization. arXiv preprint arXiv:2203.07086"},{"key":"267_CR49","unstructured":"Lan Z, Chen M, Goodman S, Gimpel K, Sharma P, Soricut R (2019) Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942"},{"key":"267_CR50","doi-asserted-by":"crossref","unstructured":"Lei J, Li L, Zhou L, Gan Z, Berg TL, Bansal M, Liu J (2021) Less is more: Clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 7331\u20137341","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"267_CR51","doi-asserted-by":"crossref","unstructured":"Li L, Chen Y-C, Cheng Y, Gan Z, Yu L, Liu J (2020) Hero: Hierarchical encoder for video+ language omni-representation pre-training. arXiv preprint arXiv:2005.00200","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"267_CR52","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) Tsm: Temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF international conference on computer vision. 
pp 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"key":"267_CR53","doi-asserted-by":"crossref","unstructured":"Liu S, Fan H, Qian S, Chen Y, Ding W, Wang Z (2021) Hit: Hierarchical transformer with momentum contrast for video-text retrieval. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 11915\u201311925","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"267_CR54","doi-asserted-by":"crossref","unstructured":"Liu W, Anguelov D, Erhan D, Szegedy C, Reed S, Fu C-Y, Berg AC (2016) Ssd: single shot multibox detector. In: European conference on computer vision. Springer, pp 21\u201337","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"267_CR55","unstructured":"Liu Y, Albanie S, Nagrani A, Zisserman A (2019) Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487"},{"key":"267_CR56","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2019) Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692"},{"key":"267_CR57","doi-asserted-by":"crossref","unstructured":"Lu Y-J, Zhang H, de\u00a0Boer M, Ngo C-W (2016) Event detection with zero example: Select the right and suppress the wrong concepts. In: Proceedings of the 2016 ACM on international conference on multimedia retrieval. pp 127\u2013134","DOI":"10.1145\/2911996.2912015"},{"key":"267_CR58","unstructured":"Luo H, Ji L, Shi B, Huang H, Duan N, Li T, Li J, Bharti T, Zhou M (2020) Univl: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353"},{"key":"267_CR59","doi-asserted-by":"crossref","unstructured":"Luo H, Ji L, Zhong M, Chen Y, Lei W, Duan N, Li T (2021) Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"267_CR60","doi-asserted-by":"crossref","unstructured":"Ma Y, Xu G, Sun X, Yan M, Zhang J, Ji R (2022) X-clip: end-to-end multi-grained contrastive learning for video-text retrieval. arXiv preprint arXiv:2207.07285","DOI":"10.1145\/3503161.3547910"},{"key":"267_CR61","unstructured":"Miech A, Laptev I, Sivic J (2018) Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516"},{"key":"267_CR62","doi-asserted-by":"crossref","unstructured":"Miech A, Zhukov D, Alayrac J-B, Tapaswi M, Laptev I, Sivic J (2019) Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 2630\u20132640","DOI":"10.1109\/ICCV.2019.00272"},{"key":"267_CR63","unstructured":"Mikolov T, Chen K, Corrado G, Dean J (2013) Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781"},{"key":"267_CR64","unstructured":"Min S, Kong W, Tu R-C, Gong D, Cai C, Zhao W, Liu C, Zheng S, Wang H, Li Z, et\u00a0al. (2022) Hunyuan_tvr for text-video retrivial. arXiv preprint arXiv:2204.03382"},{"key":"267_CR65","doi-asserted-by":"crossref","unstructured":"Mithun N\u00a0C, Li J, Metze F, Roy-Chowdhury AK (2018) Learning joint embedding with multimodal cues for cross-modal video-text retrieval. In: Proceedings of the 2018 ACM on international conference on multimedia retrieval. 
pp 19\u201327","DOI":"10.1145\/3206025.3206064"},{"key":"267_CR66","doi-asserted-by":"crossref","unstructured":"Otani M, Nakashima Y, Rahtu E, Heikkil\u00e4 J, Yokoya N (2016) Learning joint representations of videos and sentences with web image search. In: European conference on computer vision. Springer, pp 651\u2013667","DOI":"10.1007\/978-3-319-46604-0_46"},{"key":"267_CR67","unstructured":"Patrick M, Huang P-Y, Asano Y, Metze F, Hauptmann A, Henriques J, Vedaldi A (2020) Support-set bottlenecks for video-text representation learning. arXiv preprint arXiv:2010.02824"},{"key":"267_CR68","doi-asserted-by":"crossref","unstructured":"Qiu Z, Yao T, Mei T (2017) Learning spatio-temporal representation with pseudo-3d residual networks. In: Proceedings of the IEEE international conference on computer vision. pp 5533\u20135541","DOI":"10.1109\/ICCV.2017.590"},{"key":"267_CR69","doi-asserted-by":"crossref","unstructured":"Qiu Z, Yao T, Ngo C-W, Tian X, Mei T (2019) Learning spatio-temporal representation with local and global diffusion. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 12056\u201312065","DOI":"10.1109\/CVPR.2019.01233"},{"key":"267_CR70","unstructured":"Radford A, Kim J\u00a0W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, et\u00a0al. (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning. PMLR, pp 8748\u20138763"},{"key":"267_CR71","first-page":"13937","volume":"34","author":"Y Rao","year":"2021","unstructured":"Rao Y, Zhao W, Liu B, Jiwen L, Zhou J, Hsieh C-J (2021) Dynamicvit: efficient vision transformers with dynamic token sparsification. Advances in neural information processing systems 34:13937\u201313949","journal-title":"Advances in neural information processing systems"},{"issue":"1","key":"267_CR72","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1007\/s11263-016-0987-1","volume":"123","author":"A Rohrbach","year":"2017","unstructured":"Rohrbach A, Torabi A, Rohrbach M, Tandon N, Pal C, Larochelle H, Courville A, Schiele B (2017) Movie description. Int J Comput Vis 123(1):94\u2013120","journal-title":"Int J Comput Vis"},{"key":"267_CR73","unstructured":"Sanh V, Debut L, Chaumond J, Wolf T (2019) Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108"},{"key":"267_CR74","doi-asserted-by":"crossref","unstructured":"Schlichtkrull M, Kipf TN, Bloem P, van\u00a0den Berg R, Titov I, Welling M (2018) Modeling relational data with graph convolutional networks. In: European semantic web conference. Springer, pp 593\u2013607","DOI":"10.1007\/978-3-319-93417-4_38"},{"key":"267_CR75","doi-asserted-by":"crossref","unstructured":"Schroff F, Kalenichenko D, Philbin J (2015) Facenet: A unified embedding for face recognition and clustering. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 815\u2013823","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"267_CR76","doi-asserted-by":"crossref","unstructured":"Shvetsova N, Chen B, Rouditchenko A, Thomas S, Kingsbury B, Feris R, Harwath D, Glass J, Kuehne H (2021) Everything at once\u2013multi-modal fusion transformer for video retrieval. arXiv preprint arXiv:2112.04446","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"267_CR77","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. 
arXiv preprint arXiv:1409.1556"},{"key":"267_CR78","doi-asserted-by":"crossref","unstructured":"Song Y, Soleymani M (2019) Polysemous visual-semantic embedding for cross-modal retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 1979\u20131988","DOI":"10.1109\/CVPR.2019.00208"},{"key":"267_CR79","unstructured":"Su W, Zhu X, Cao Y, Li B, Lu L, Wei F, Dai J(2019) Vl-bert: pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530"},{"key":"267_CR80","unstructured":"Sun C, Baradel F, Murphy K, Schmid C(2019) Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743"},{"key":"267_CR81","doi-asserted-by":"crossref","unstructured":"Sun C, Myers A, Vondrick C, Murphy K, Schmid C(2019) Videobert: A joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 7464\u20137473","DOI":"10.1109\/ICCV.2019.00756"},{"key":"267_CR82","doi-asserted-by":"crossref","unstructured":"Sun L, Jia K, Yeung D-Y, Shi BE (2015) Human action recognition using factorized spatio-temporal convolutional networks. In: Proceedings of the IEEE international conference on computer vision. pp 4597\u20134605","DOI":"10.1109\/ICCV.2015.522"},{"key":"267_CR83","doi-asserted-by":"crossref","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"267_CR84","doi-asserted-by":"crossref","unstructured":"Tai KS, Socher R, Manning CD (2015) Improved semantic representations from tree-structured long short-term memory networks. arXiv preprint arXiv:1503.00075","DOI":"10.3115\/v1\/P15-1150"},{"key":"267_CR85","doi-asserted-by":"crossref","unstructured":"Tan H, Bansal M (2019) Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","DOI":"10.18653\/v1\/D19-1514"},{"key":"267_CR86","unstructured":"Torabi A, Tandon N, Sigal L (2016) Learning language-visual embedding for movie understanding with natural-language. arXiv preprint arXiv:1609.08124"},{"key":"267_CR87","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision. pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"267_CR88","unstructured":"Tran D, Ray J, Shou Z, Chang S-F, Paluri M (2017) Convnet architecture search for spatiotemporal feature learning. arXiv preprint arXiv:1708.05038"},{"key":"267_CR89","doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Ray J, LeCun Y, Paluri M(2018) A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 6450\u20136459","DOI":"10.1109\/CVPR.2018.00675"},{"key":"267_CR90","unstructured":"Van\u00a0den Oord A, Li Y, Vinyals O (2018) Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748"},{"key":"267_CR91","first-page":"6000","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. 
Adv Neural Inf Process Syst 30:6000\u20136010","journal-title":"Adv Neural Inf Process Syst"},{"key":"267_CR92","doi-asserted-by":"crossref","unstructured":"Wang J, Chen B, Liao D, Zeng Z, Li G, Xia S-T, Xu J(2022) Hybrid contrastive quantization for efficient cross-view video retrieval. arXiv preprint arXiv:2202.03384","DOI":"10.1145\/3485447.3512022"},{"key":"267_CR93","doi-asserted-by":"crossref","unstructured":"Wang L, Li Y, Lazebnik S(2016) Learning deep structure-preserving image-text embeddings. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 5005\u20135013","DOI":"10.1109\/CVPR.2016.541"},{"key":"267_CR94","unstructured":"Wang Q, Zhang Y, Zheng Y, Pan P, Hua X-S. Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:2203.07111, 2022"},{"key":"267_CR95","doi-asserted-by":"crossref","unstructured":"Wang X, Zhu L, Yang Y (2021) T2vlad: global-local sequence alignment for text-video retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 5079\u20135088","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"267_CR96","doi-asserted-by":"crossref","unstructured":"Wang X, Wu J, Chen J, Li L, Wang Y-F, Wang WY (2019) Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 4581\u20134591","DOI":"10.1109\/ICCV.2019.00468"},{"key":"267_CR97","doi-asserted-by":"crossref","unstructured":"Wang Y, Dong J, Liang T, Zhang M, Cai R, Wang X (2022) Cross-lingual cross-modal retrieval with noise-robust learning. arXiv preprint arXiv:2208.12526","DOI":"10.1145\/3503161.3548003"},{"key":"267_CR98","doi-asserted-by":"crossref","unstructured":"Wray M, Larlus D, Csurka G, Damen D (2019) Fine-grained action retrieval through multiple parts-of-speech embeddings. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 450\u2013459","DOI":"10.1109\/ICCV.2019.00054"},{"issue":"4","key":"267_CR99","doi-asserted-by":"publisher","first-page":"1993","DOI":"10.1109\/TIP.2018.2882155","volume":"28","author":"G Wu","year":"2018","unstructured":"Wu G, Jungong H, Yuchen G, Li L, Guiguang D, Qiang N, Ling S (2018) Unsupervised deep video hashing via balanced code for large-scale video retrieval. IEEE Trans Image Process 28(4):1993\u20132007","journal-title":"IEEE Trans Image Process"},{"key":"267_CR100","doi-asserted-by":"crossref","unstructured":"Wu P, He X, Tang M, Lv Y, Liu J (2021) Hanet: Hierarchical alignment networks for video-text retrieval. In: Proceedings of the 29th ACM international conference on multimedia. pp 3518\u20133527","DOI":"10.1145\/3474085.3475515"},{"key":"267_CR101","doi-asserted-by":"crossref","unstructured":"Xie S, Sun C, Huang J, Tu Z, Murphy K (2018) Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. In: Proceedings of the European conference on computer vision (ECCV). pp 305\u2013321","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"267_CR102","doi-asserted-by":"crossref","unstructured":"Xu J, Mei T, Yao T, Rui Y (2016) Msr-vtt: A large video description dataset for bridging video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 
pp 5288\u20135296","DOI":"10.1109\/CVPR.2016.571"},{"key":"267_CR103","doi-asserted-by":"crossref","unstructured":"Yang X, Dong J, Cao Y, Wang X, Wang M, Chua T-S (2020) Tree-augmented cross-modal encoding for complex-query video retrieval. In: Proceedings of the 43rd international ACM SIGIR conference on research and development in information retrieval. pp 1339\u20131348","DOI":"10.1145\/3397271.3401151"},{"key":"267_CR104","unstructured":"Yao T, Li X (2018) Yh technologies at activitynet challenge 2018. arXiv preprint arXiv:1807.00686"},{"key":"267_CR105","unstructured":"Yu Y, Ko H, Choi J, Kim G (2016) Video captioning and retrieval models with semantic attention. arXiv preprint arXiv:1610.02947, 6(7)"},{"key":"267_CR106","doi-asserted-by":"crossref","unstructured":"Yu Y, Kim J, Kim G (2018) A joint sequence fusion model for video question answering and retrieval. In: Proceedings of the European conference on computer vision (ECCV). pp 471\u2013487","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"267_CR107","unstructured":"Zhai A, Wu H-Y (2018) Classification is a strong baseline for deep metric learning. arXiv preprint arXiv:1811.12649"},{"key":"267_CR108","doi-asserted-by":"crossref","unstructured":"Zhang B, Hu H, Sha F (2018) Cross-modal and hierarchical modeling of video and text. In: Proceedings of the European conference on computer vision (ECCV). pp 374\u2013390","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"267_CR109","doi-asserted-by":"crossref","unstructured":"Zhang Y, Li X, Liu C, Shuai B, Zhu Y, Brattoli B, Chen H, Marsic I, Tighe J (2021) Vidtr: Video transformer without convolutions. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 13577\u201313587","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"267_CR110","unstructured":"Zhang Y, Wallace B (2015) A sensitivity analysis of (and practitioners\u2019 guide to) convolutional neural networks for sentence classification. arXiv preprint arXiv:1510.03820"},{"key":"267_CR111","doi-asserted-by":"crossref","unstructured":"Zhao S, Zhu L, Wang X, Yang Y (2022) Centerclip: token clustering for efficient text-video retrieval. arXiv preprint arXiv:2205.00823","DOI":"10.1145\/3477495.3531950"},{"key":"267_CR112","doi-asserted-by":"crossref","unstructured":"Zhong Y, Arandjelovi\u0107 R, Zisserman A (2018) Ghostvlad for set-based face recognition. In: Asian conference on computer vision. Springer, pp 35\u201350","DOI":"10.1007\/978-3-030-20890-5_3"},{"issue":"6","key":"267_CR113","doi-asserted-by":"publisher","first-page":"1452","DOI":"10.1109\/TPAMI.2017.2723009","volume":"40","author":"B Zhou","year":"2017","unstructured":"Zhou B, Lapedriza A, Khosla A, Oliva A, Torralba A (2017) Places: a 10 million image database for scene recognition. IEEE Trans Pattern Anal Machine Intell 40(6):1452\u20131464","journal-title":"IEEE Trans Pattern Anal Machine Intell"},{"key":"267_CR114","doi-asserted-by":"crossref","unstructured":"Zhu L, Yang Y (2020) Actbert: Learning global-local video-text representations. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 8746\u20138755","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"267_CR115","doi-asserted-by":"crossref","unstructured":"Zhuo Y, Li Y, Hsiao J, Ho C, Li B (2022) Clip4hashing: Unsupervised deep hashing for cross-modal video-text retrieval. In: Proceedings of the 2022 international conference on multimedia retrieval. 
pp 158\u2013166","DOI":"10.1145\/3512527.3531381"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00267-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-023-00267-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00267-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,14]],"date-time":"2023-06-14T15:25:19Z","timestamp":1686756319000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-023-00267-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,23]]},"references-count":115,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["267"],"URL":"https:\/\/doi.org\/10.1007\/s13735-023-00267-8","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,2,23]]},"assertion":[{"value":"5 November 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 January 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 January 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 February 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article. The authors have no competing interests as defined by Springer, or other interests that might be perceived to influence the results and\/or discussion reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"The results\/data\/figures in this manuscript have not been published elsewhere, nor are they under consideration (from you or one of your Contributing Authors) by another publisher. We have read the Springer journal policies on author responsibilities and submit this manuscript in accordance with those policies. All of the material is owned by the authors and\/or no permissions are required.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}],"article-number":"3"}}
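
The record above is a standard Crossref REST API "work" message for DOI 10.1007/s13735-023-00267-8. As a minimal sketch of how such a record is retrieved and read (assuming network access to the public api.crossref.org endpoint and the third-party `requests` package; the `mailto` value is only a placeholder for Crossref's polite-pool convention, not part of the record), the snippet below fetches the same DOI and pulls out a few of the fields shown above:

```python
# Minimal sketch: fetch this Crossref "work" record and read back a few fields.
# Assumes api.crossref.org is reachable and `requests` is installed; the mailto
# parameter is a hypothetical placeholder contact address.
import requests

DOI = "10.1007/s13735-023-00267-8"
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.org"},  # placeholder, per Crossref polite-pool etiquette
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]  # payload mirrors the "message" object shown above

print(work["title"][0])                    # "Deep learning for video-text retrieval: a review"
print(work["container-title"][0])          # journal name
print(len(work.get("reference", [])))      # should match "reference-count": 115
for ref in work.get("reference", [])[:3]:  # first few cited works
    print(ref.get("DOI") or ref.get("unstructured", "")[:80])
```

Note that Crossref wraps scalar-looking fields such as `title` and `container-title` in lists, and that each entry of `reference` may carry either a resolved `DOI`, an `unstructured` citation string, or both, as in the record above.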