{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:15:44Z","timestamp":1773191744506,"version":"3.50.1"},"publisher-location":"Cham","reference-count":87,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031197802","type":"print"},{"value":"9783031197819","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19781-9_24","type":"book-chapter","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T12:12:59Z","timestamp":1666440779000},"page":"407-426","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":42,"title":["Learning Audio-Video Modalities from\u00a0Image Captions"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2190-9013","authenticated-orcid":false,"given":"Arsha","family":"Nagrani","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6747-1101","authenticated-orcid":false,"given":"Paul Hongsuck","family":"Seo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6081-8347","authenticated-orcid":false,"given":"Bryan","family":"Seybold","sequence":"additional","affiliation":[]},{"given":"Anja","family":"Hauth","sequence":"additional","affiliation":[]},{"given":"Santiago","family":"Manen","sequence":"additional","affiliation":[]},{"given":"Chen","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Cordelia","family":"Schmid","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,23]]},"reference":[{"key":"24_CR1","unstructured":"YouTube Data API. http:\/\/developers.google.com\/youtube\/v3\/docs\/captions"},{"key":"24_CR2","doi-asserted-by":"crossref","unstructured":"Abavisani, M., Joze, H.R.V., Patel, V.M.: Improving the performance of unimodal dynamic hand-gesture recognition with multimodal training. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00126"},{"key":"24_CR3","doi-asserted-by":"crossref","unstructured":"Aguilar, G., Rozgic, V., Wang, W., Wang, C.: Multimodal and multi-view models for emotion recognition. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1095"},{"key":"24_CR4","doi-asserted-by":"crossref","unstructured":"Albanie, S., Nagrani, A., Vedaldi, A., Zisserman, A.: Emotion recognition in speech using cross-modal transfer in the wild. In: Proceedings of the 26th ACM international conference on Multimedia, pp. 292\u2013301 (2018)","DOI":"10.1145\/3240508.3240578"},{"key":"24_CR5","doi-asserted-by":"crossref","unstructured":"Amrani, E., Ben-Ari, R., Rotman, D., Bronstein, A.: Noise estimation using density estimation for self-supervised multimodal learning. arXiv preprint arXiv:2003.03186 (2020)","DOI":"10.1609\/aaai.v35i8.16822"},{"key":"24_CR6","doi-asserted-by":"crossref","unstructured":"Anne Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"24_CR7","doi-asserted-by":"crossref","unstructured":"Antol, S. et al.: VQA: Visual question answering. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"24_CR8","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Brown, A., Zisserman, A.: Condensed movies: Story based retrieval with contextual embeddings. In: ACCV (2020)","DOI":"10.1007\/978-3-030-69541-5_28"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: A joint video and image encoder for end-to-end retrieval. ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"24_CR10","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: A clip-hitchhiker\u2019s guide to long video retrieval. arXiv preprint arXiv:2205.08508 (2022)"},{"key":"24_CR11","unstructured":"Banerjee, S., Lavie, A.: Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In: ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization (2005)"},{"key":"24_CR12","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: Pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"24_CR13","doi-asserted-by":"crossref","unstructured":"Chechik, G., Ie, E., Rehn, M., Bengio, S., Lyon, D.: Large-scale content-based audio retrieval from text queries. In: Proceedings of the 1st ACM International Conference on Multimedia Information Retrieval, pp. 105\u2013112 (2008)","DOI":"10.1145\/1460096.1460115"},{"key":"24_CR14","unstructured":"Chen, H., Li, J., Hu, X.: Delving deeper into the decoder for video captioning. In: ECAI (2020)"},{"key":"24_CR15","doi-asserted-by":"crossref","unstructured":"Chen, H., Lin, K., Maye, A., Li, J., Hu, X.: A semantics-assisted video captioning model trained with scheduled sampling. Front. Robot. AI 7 475767 (2020)","DOI":"10.3389\/frobt.2020.475767"},{"key":"24_CR16","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: Vggsound: A large-scale audio-visual dataset. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 721\u2013725. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"24_CR17","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, Y.G.: Motion guided spatial attention for video captioning. In: AAAI (2019)","DOI":"10.1609\/aaai.v33i01.33018191"},{"key":"24_CR18","unstructured":"Chen, X., Fang, H., Lin, T.Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"24_CR19","unstructured":"Cheng, X., Lin, H., Wu, X., Yang, F., Shen, D.: Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss (2021)"},{"key":"24_CR20","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"24_CR21","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT (2019)"},{"key":"24_CR22","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"24_CR23","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. In: ICLR (2021)"},{"key":"24_CR24","doi-asserted-by":"crossref","unstructured":"Drossos, K., Lipping, S., Virtanen, T.: Clotho: An audio captioning dataset. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 736\u2013740. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"24_CR25","doi-asserted-by":"crossref","unstructured":"Elizalde, B., Zarar, S., Raj, B.: Cross modal audio search and retrieval with joint embeddings based on text and audio. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4095\u20134099. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682632"},{"key":"24_CR26","unstructured":"Fang, H., Xiong, P., Xu, L., Chen, Y.: Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)"},{"key":"24_CR27","doi-asserted-by":"crossref","unstructured":"Font, F., Roma, G., Serra, X.: Freesound technical demo. In: Proceedings of the 21st ACM International Conference on Multimedia, pp. 411\u2013412 (2013)","DOI":"10.1145\/2502081.2502245"},{"key":"24_CR28","doi-asserted-by":"crossref","unstructured":"Gabeur, V., Nagrani, A., Sun, C., Alahari, K., Schmid, C.: Masking modalities for cross-modal video retrieval. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1766\u20131775 (2022)","DOI":"10.1109\/WACV51458.2022.00217"},{"key":"24_CR29","doi-asserted-by":"crossref","unstructured":"Gabeur, V., Sun, C., Alahari, K., Schmid, C.: Multi-modal transformer for video retrieval. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"24_CR30","doi-asserted-by":"crossref","unstructured":"Gao, R., Oh, T.H., Grauman, K., Torresani, L.: Listen to look: Action recognition by previewing audio. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10457\u201310467 (2020)","DOI":"10.1109\/CVPR42600.2020.01047"},{"key":"24_CR31","unstructured":"Gao, Z., Liu, J., Chen, S., Chang, D., Zhang, H., Yuan, J.: Clip2tv: An empirical study on transformer-based methods for video-text retrieval. arXiv preprint arXiv:2111.05610 (2021)"},{"key":"24_CR32","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: An ontology and human-labeled dataset for audio events. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 776\u2013780. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"24_CR33","doi-asserted-by":"crossref","unstructured":"Ghadiyaram, D., Tran, D., Mahajan, D.: Large-scale weakly-supervised pre-training for video action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12046\u201312055 (2019)","DOI":"10.1109\/CVPR.2019.01232"},{"key":"24_CR34","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Tran, D., Torresani, L., Ramanan, D.: Distinit: Learning video representations without a single labeled video. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 852\u2013861 (2019)","DOI":"10.1109\/ICCV.2019.00094"},{"key":"24_CR35","doi-asserted-by":"crossref","unstructured":"Gong, Y., Chung, Y.A., Glass, J.: Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)","DOI":"10.21437\/Interspeech.2021-698"},{"key":"24_CR36","doi-asserted-by":"crossref","unstructured":"Gupta, S., Hoffman, J., Malik, J.: Cross modal distillation for supervision transfer. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (June 2016)","DOI":"10.1109\/CVPR.2016.309"},{"key":"24_CR37","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"24_CR38","doi-asserted-by":"crossref","unstructured":"Hou, J., Wu, X., Zhao, W., Luo, J., Jia, Y.: Joint syntax representation learning and visual cue translation for video captioning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00901"},{"key":"24_CR39","unstructured":"Huang, G., Pang, B., Zhu, Z., Rivera, C., Soricut, R.: Multimodal pretraining for dense video captioning. In: AACL (2020)"},{"key":"24_CR40","unstructured":"Juan, D.C., et al.: Graph-rise: Graph-regularized image semantic embedding. arXiv preprint arXiv:1902.10814 (2019)"},{"key":"24_CR41","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"24_CR42","unstructured":"Kim, C.D., Kim, B., Lee, H., Kim, G.: Audiocaps: Generating captions for audios in the wild. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 119\u2013132 (2019)"},{"key":"24_CR43","doi-asserted-by":"crossref","unstructured":"Koepke, A., Oncescu, A.M., Henriques, J.F., Akata, Z., Albanie, S.: Audio retrieval with natural language queries: A benchmark study. arXiv preprint arXiv:2112.09418 (2021)","DOI":"10.21437\/Interspeech.2021-2227"},{"key":"24_CR44","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos Niebles, J.: Dense-captioning events in videos. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.83"},{"issue":"1","key":"24_CR45","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishan","year":"2017","unstructured":"Krishan, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"24_CR46","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: Clipbert for video-and-language learning via sparse sampling. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"24_CR47","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Bansal, M., Berg, T.L.: Tvqa: Localized, compositional video question answering. In: EMNLP (2018)","DOI":"10.18653\/v1\/D18-1167"},{"key":"24_CR48","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., Liu, J.: Hero: Hierarchical encoder for video+ language omni-representation pre-training. In: EMNLP (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"24_CR49","unstructured":"Li, T., Wang, L.: Learning spatiotemporal features via video and text pair discrimination. arXiv preprint arXiv:2001.05691 (2020)"},{"key":"24_CR50","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft COCO: Common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"24_CR51","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: Video retrieval using representations from collaborative experts. In: BMVC (2019)"},{"key":"24_CR52","unstructured":"Luo, H., et al.: UniVL: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)"},{"key":"24_CR53","unstructured":"Luo, H., et al.: UniVL: A unified video and language pre-training model for multimodal understanding and generation. arXiv e-prints (2020)"},{"key":"24_CR54","doi-asserted-by":"crossref","unstructured":"Luo, H., et al.: Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860 (2021)","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"24_CR55","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"24_CR56","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: Learning a Text-Video Embedding by Watching Hundred Million Narrated Video Clips. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"24_CR57","doi-asserted-by":"crossref","unstructured":"Monfort, M., Jin, S., Liu, A., Harwath, D., Feris, R., Glass, J., Oliva, A.: Spoken moments: Learning joint audio-visual representations from video descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14871\u201314881 (2021)","DOI":"10.1109\/CVPR46437.2021.01463"},{"key":"24_CR58","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Sun, C., Ross, D., Sukthankar, R., Schmid, C., Zisserman, A.: Speech2action: Cross-modal supervision for action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10317\u201310326 (2020)","DOI":"10.1109\/CVPR42600.2020.01033"},{"key":"24_CR59","unstructured":"Nagrani, A., Yang, S., Arnab, A., Jansen, A., Schmid, C., Sun, C.: Attention bottlenecks for multimodal fusion. In: NeurIPS (2021)"},{"key":"24_CR60","doi-asserted-by":"crossref","unstructured":"Oncescu, A.M., Koepke, A., Henriques, J.F., Akata, Z., Albanie, S.: Audio retrieval with natural language queries. arXiv preprint arXiv:2105.02192 (2021)","DOI":"10.21437\/Interspeech.2021-2227"},{"key":"24_CR61","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"24_CR62","unstructured":"Patrick, M., et al.: Support-set bottlenecks for video-text representation learning. arXiv preprint arXiv:2010.02824 (2020)"},{"key":"24_CR63","unstructured":"Patrick, M., et al.: Support-set bottlenecks for video-text representation learning. In: ICLR (2021)"},{"key":"24_CR64","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"24_CR65","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners. Technical Report (2019)"},{"issue":"1","key":"24_CR66","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1007\/s11263-016-0987-1","volume":"123","author":"A Rohrbach","year":"2017","unstructured":"Rohrbach, A., et al.: Movie description. Int. J. Comput. Vision 123(1), 94\u2013120 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"24_CR67","doi-asserted-by":"crossref","unstructured":"Rouditchenko, A., et al.: AVLnet: Learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199 (2020)","DOI":"10.21437\/Interspeech.2021-1312"},{"key":"24_CR68","unstructured":"Seo, P.H., Nagrani, A., Schmid, C.: Look before you speak: Visually contextualized utterances. In: CVPR (2021)"},{"key":"24_CR69","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"24_CR70","doi-asserted-by":"crossref","unstructured":"Shvetsova, N., et al.: Everything at once-multi-modal fusion transformer for video retrieval. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"24_CR71","doi-asserted-by":"crossref","unstructured":"Slaney, M.: Semantic-audio retrieval. In: 2002 IEEE International Conference on Acoustics, Speech, and Signal Processing. vol. 4, pp. IV-4108. IEEE (2002)","DOI":"10.1109\/ICASSP.2002.5745561"},{"key":"24_CR72","unstructured":"Stroud, J.C., Lu, Z., Sun, C., Deng, J., Sukthankar, R., Schmid, C., Ross, D.A.: Learning video representations from textual web supervision. arXiv preprint arXiv:2007.14937 (2020)"},{"key":"24_CR73","doi-asserted-by":"crossref","unstructured":"Sun, C., Shetty, S., Sukthankar, R., Nevatia, R.: Temporal localization of fine-grained actions in videos by domain transfer from web images. In: ACM Multimedia (2015)","DOI":"10.1145\/2733373.2806226"},{"key":"24_CR74","doi-asserted-by":"crossref","unstructured":"Sun, C., Shetty, S., Sukthankar, R., Nevatia, R.: Temporal localization of fine-grained actions in videos by domain transfer from web images. In: ACM Multimedia (2015)","DOI":"10.1145\/2733373.2806226"},{"key":"24_CR75","doi-asserted-by":"crossref","unstructured":"Tang, Z., Lei, J., Bansal, M.: Decembert: Learning from noisy instructional videos via dense captions and entropy minimization. In: NAACL (2021)","DOI":"10.18653\/v1\/2021.naacl-main.193"},{"key":"24_CR76","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"issue":"4","key":"24_CR77","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2016","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: lessons learned from the 2015 MSCOCO image captioning challenge. IEEE Trans. Pattern Anal. Mach. Intell. 39(4), 652\u2013663 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"24_CR78","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., Jiang, W., Wang, J., Liu, W.: Controllable video captioning with pos sequence guidance based on gated fusion network. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00273"},{"key":"24_CR79","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.541"},{"key":"24_CR80","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhu, L., Yang, Y.: T2vlad: Global-local sequence alignment for text-video retrieval (2021)","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"24_CR81","doi-asserted-by":"crossref","unstructured":"Xu, H.,et al.: Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"24_CR82","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: Msr-vtt: A large video description dataset for bridging video and language. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"24_CR83","unstructured":"Yao, L., et al.: Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)"},{"key":"24_CR84","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"24_CR85","unstructured":"Zhai, A., Wu, H.Y.: Classification is a strong baseline for deep metric learning. In: BMVC (2019)"},{"key":"24_CR86","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Shi, Y., Yuan, C., Li, B., Wang, P., Hu, W., Zha, Z.J.: Object relational graph with teacher-recommended learning for video captioning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"24_CR87","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.: Towards automatic learning of procedures from web instructional videos. In: AAAI (2018)","DOI":"10.1609\/aaai.v32i1.12342"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19781-9_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T10:18:21Z","timestamp":1728209901000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19781-9_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031197802","9783031197819"],"references-count":87,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19781-9_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"23 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}