{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T18:10:25Z","timestamp":1777486225414,"version":"3.51.4"},"reference-count":151,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2022,1,4]],"date-time":"2022-01-04T00:00:00Z","timestamp":1641254400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,4]],"date-time":"2022-01-04T00:00:00Z","timestamp":1641254400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2022,2]]},"DOI":"10.1007\/s11263-021-01547-8","type":"journal-article","created":{"date-parts":[[2022,1,4]],"date-time":"2022-01-04T12:02:41Z","timestamp":1641297761000},"page":"435-454","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":43,"title":["Perspectives and Prospects on Transformer Architecture for Cross-Modal Tasks with Language and Vision"],"prefix":"10.1007","volume":"130","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0969-9925","authenticated-orcid":false,"given":"Andrew","family":"Shin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Masato","family":"Ishii","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Takuya","family":"Narihira","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,1,4]]},"reference":[{"key":"1547_CR1","unstructured":"Abu-El-Haija, S., Kothari, N., Lee, J., Natsev, P., Toderici, G., Varadarajan, B., Vijayanarasimhan, S. (2016), Youtube-8m: A large-scale video classification benchmark. CoRR abs\/1609.08675, http:\/\/arxiv.org\/abs\/1609.08675, 1609.08675"},{"key":"1547_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal P, Carreira J, Malik J (2015) Learning to see by moving. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2015.13"},{"key":"1547_CR3","unstructured":"Akbari H, Yuan L, Qian R, Chuang W, Chang S, Cui Y, Gong B (2021) VATT: transformers for multimodal self-supervised learning from raw video, audio and text. CoRR abs\/2104.11178, https:\/\/arxiv.org\/abs\/2104.11178, 2104.11178"},{"key":"1547_CR4","doi-asserted-by":"publisher","unstructured":"Alberti C, Ling J, Collins M, Reitter D (2019) Fusion of detected objects in text for visual question answering. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), Association for Computational Linguistics, Hong Kong, China, pp 2131\u20132140, https:\/\/doi.org\/10.18653\/v1\/D19-1219, https:\/\/www.aclweb.org\/anthology\/D19-1219","DOI":"10.18653\/v1\/D19-1219"},{"key":"1547_CR5","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: Semantic propositional image caption evaluation. In: ECCV","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1547_CR6","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Zitnick CL, Parikh D (2015) Vqa: Visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2015.279"},{"key":"1547_CR7","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. 1607.06450"},{"key":"1547_CR8","unstructured":"Banerjee S, Lavie A (2005) METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, Association for Computational Linguistics, Ann Arbor, Michigan, pp 65\u201372, https:\/\/www.aclweb.org\/anthology\/W05-0909"},{"key":"1547_CR9","unstructured":"Barbu A, Bridge A, Burchill Z, Coroian D, Dickinson S, Fidler S, Michaux A, Mussman S, Narayanaswamy S, Salvi D, Schmidt L, Shangguan J, Siskind JM, Waggoner J, Wang S, Wei J, Yin Y, Zhang Z (2012) Video in sentences out. 1204.2742"},{"key":"1547_CR10","doi-asserted-by":"crossref","unstructured":"Ben-younes H, Cadene R, Cord M, Thome N (2017) Mutan: Multimodal tucker fusion for visual question answering. 1705.06676","DOI":"10.1109\/ICCV.2017.285"},{"key":"1547_CR11","unstructured":"Brown T, Mann B, Ryder N, Subbiah M, Kaplan JD, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language models are few-shot learners. In: Larochelle H, Ranzato M, Hadsell R, Balcan MF, Lin H (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a033, pp 1877\u20131901, https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"1547_CR12","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. 2005.12872","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1547_CR13","unstructured":"Chang WC, Yu FX, Chang YW, Yang Y, Kumar S (2020) Pre-training tasks for embedding-based large-scale retrieval. In: International Conference on Learning Representations, https:\/\/openreview.net\/forum?id=rkg-mA4FDr"},{"key":"1547_CR14","doi-asserted-by":"crossref","unstructured":"Chen H, Wang Y, Guo T, Xu C, Deng Y, Liu Z, Ma S, Xu C, Xu C, Gao W (2020a) Pre-trained image processing transformer. 2012.00364","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"1547_CR15","unstructured":"Chen M, Radford A, Child R, Wu J, Jun H, Luan D, Sutskever I (2020b) Generative pretraining from pixels. In: III HD, Singh A (eds) Proceedings of the 37th International Conference on Machine Learning, PMLR, Proceedings of Machine Learning Research, vol 119, pp 1691\u20131703, http:\/\/proceedings.mlr.press\/v119\/chen20s.html"},{"key":"1547_CR16","doi-asserted-by":"crossref","unstructured":"Chen YC, Li L, Yu L, Kholy AE, Ahmed F, Gan Z, Cheng Y, Liu J (2020c) Uniter: Universal image-text representation learning. In: ECCV","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"1547_CR17","unstructured":"Child R, Gray S, Radford A, Sutskever I (2019) Generating long sequences with sparse transformers. CoRR abs\/1904.10509, http:\/\/arxiv.org\/abs\/1904.10509, 1904.10509"},{"key":"1547_CR18","doi-asserted-by":"publisher","unstructured":"Cho K, van Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), Association for Computational Linguistics, Doha, Qatar, pp 1724\u20131734, https:\/\/doi.org\/10.3115\/v1\/D14-1179, https:\/\/www.aclweb.org\/anthology\/D14-1179","DOI":"10.3115\/v1\/D14-1179"},{"key":"1547_CR19","doi-asserted-by":"crossref","unstructured":"Dai B, Fidler S, Urtasun R, Lin D (2017) Towards diverse and natural image descriptions via a conditional gan. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2017.323"},{"key":"1547_CR20","doi-asserted-by":"publisher","unstructured":"Dai Z, Yang Z, Yang Y, Carbonell J, Le Q, Salakhutdinov R (2019) Transformer-XL: Attentive language models beyond a fixed-length context. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, Florence, Italy, pp 2978\u20132988, https:\/\/doi.org\/10.18653\/v1\/P19-1285, https:\/\/www.aclweb.org\/anthology\/P19-1285","DOI":"10.18653\/v1\/P19-1285"},{"key":"1547_CR21","doi-asserted-by":"crossref","unstructured":"Das P, Xu C, Doell RF, Corso JJ (2013) A thousand frames in just a few words: Lingual description of videos through latent topics and sparse object stitching. 2013 IEEE Conference on Computer Vision and Pattern Recognition pp 2634\u20132641","DOI":"10.1109\/CVPR.2013.340"},{"key":"1547_CR22","doi-asserted-by":"publisher","unstructured":"Devlin J, Chang MW, Lee K, Toutanova K (2019) BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Association for Computational Linguistics, Minneapolis, Minnesota, pp 4171\u20134186, https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/www.aclweb.org\/anthology\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"1547_CR23","doi-asserted-by":"crossref","unstructured":"Donahue J, Hendricks LA, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T (2014) Long-term recurrent convolutional networks for visual recognition and description. CoRR abs\/1411.4389, http:\/\/arxiv.org\/abs\/1411.4389, 1411.4389","DOI":"10.21236\/ADA623249"},{"key":"1547_CR24","doi-asserted-by":"crossref","unstructured":"Dong L, Xu S, Xu B (2018) Speech-transformer: A no-recurrence sequence-to-sequence model for speech recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 5884\u20135888, 10.1109\/ICASSP.2018.8462506","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"1547_CR25","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N (2020) An image is worth 16x16 words: Transformers for image recognition at scale. 2010.11929"},{"key":"1547_CR26","unstructured":"Dufter P, Schmitt M, Sch\u00fctze H (2021) Position information in transformers: An overview. CoRR abs\/2102.11090, https:\/\/arxiv.org\/abs\/2102.11090, 2102.11090"},{"key":"1547_CR27","unstructured":"Elliott D, Keller F (2013) Image description using visual dependency representations. In: Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, Seattle, Washington, USA, pp 1292\u20131302, https:\/\/www.aclweb.org\/anthology\/D13-1128"},{"issue":"2","key":"1547_CR28","first-page":"179","volume":"14","author":"JL Elman","year":"1990","unstructured":"Elman, J. L. (1990). Finding structure in time. COGNITIVE SCIENCE, 14(2), 179\u2013211.","journal-title":"Finding structure in time. COGNITIVE SCIENCE"},{"key":"1547_CR29","doi-asserted-by":"crossref","unstructured":"Farhadi A, Hejrati M, Sadeghi M, Young P, Rashtchian C, Hockenmaier J, Forsyth D (2010) Every picture tells a story: Generating sentences from images. In: Computer Vision, ECCV 2010 - 11th European Conference on Computer Vision, Proceedings, Springer-Verlag Berlin Heidelberg, no. PART 4 in Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics), pp 15\u201329, 10.1007\/978-3-642-15561-1_2, copyright: Copyright 2019 Elsevier B.V., All rights reserved.; 11th European Conference on Computer Vision, ECCV 2010 ; Conference date: 10-09-2010 Through 11-09-2010","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"1547_CR30","unstructured":"Fedus W, Zoph B, Shazeer N (2021) Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. 2101.03961"},{"key":"1547_CR31","doi-asserted-by":"publisher","unstructured":"Fukui A, Park DH, Yang D, Rohrbach A, Darrell T, Rohrbach M (2016) Multimodal compact bilinear pooling for visual question answering and visual grounding. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, Austin, Texas, pp 457\u2013468, https:\/\/doi.org\/10.18653\/v1\/D16-1044, https:\/\/www.aclweb.org\/anthology\/D16-1044","DOI":"10.18653\/v1\/D16-1044"},{"key":"1547_CR32","doi-asserted-by":"crossref","unstructured":"Gabeur V, Sun C, Alahari K, Schmid C (2020) Multi-modal Transformer for Video Retrieval. In: European Conference on Computer Vision (ECCV)","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"1547_CR33","doi-asserted-by":"publisher","unstructured":"Gella S, Lewis M, Rohrbach M (2018) A dataset for telling the stories of social media videos. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, Brussels, Belgium, pp 968\u2013974, https:\/\/doi.org\/10.18653\/v1\/D18-1117, https:\/\/www.aclweb.org\/anthology\/D18-1117","DOI":"10.18653\/v1\/D18-1117"},{"key":"1547_CR34","unstructured":"Gillick D, Presta A, Tomar GS (2018) End-to-end retrieval in continuous space. CoRR abs\/1811.08008, http:\/\/arxiv.org\/abs\/1811.08008, 1811.08008"},{"key":"1547_CR35","unstructured":"Ging S, Zolfaghari M, Pirsiavash H, Brox T (2020) Coot: Cooperative hierarchical transformer for video-text representation learning. 2011.00597"},{"key":"1547_CR36","doi-asserted-by":"crossref","unstructured":"Girshick R (2015) Fast r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2015.169"},{"key":"1547_CR37","unstructured":"Goodfellow I, Pouget-Abadie J, Mirza M, Xu B, Warde-Farley D, Ozair S, Courville A, Bengio Y (2014) Generative adversarial nets. In: Ghahramani Z, Welling M, Cortes C, Lawrence N, Weinberger KQ (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a027, pp 2672\u20132680, https:\/\/proceedings.neurips.cc\/paper\/2014\/file\/5ca3e9b122f61f8f06494c97b1afccf3-Paper.pdf"},{"key":"1547_CR38","doi-asserted-by":"crossref","unstructured":"Goyal Y, Khot T, Summers-Stay D, Batra D, Parikh D (2017) Making the V in VQA matter: Elevating the role of image understanding in Visual Question Answering. In: Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2017.670"},{"key":"1547_CR39","unstructured":"Guo J, Zhu C, Zhao Y, Wang H, Hu Y, He X, Cai D (2020) Lamp: Label augmented multimodal pretraining. 2012.04446"},{"key":"1547_CR40","unstructured":"Han K, Wang Y, Chen H, Chen X, Guo J, Liu Z, Tang Y, Xiao A, Xu C, Xu Y, Yang Z, Zhang Y, Tao D (2021) A survey on visual transformer. 2012.12556"},{"key":"1547_CR41","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 770\u2013778, 10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"1547_CR42","doi-asserted-by":"crossref","unstructured":"Heilbron FC, Escorcia V, Ghanem B, Niebles JC (2015) Activitynet: A large-scale video benchmark for human activity understanding. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 961\u2013970, 10.1109\/CVPR.2015.7298698","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"1547_CR43","unstructured":"Hendrycks D, Gimpel K (2016) Bridging nonlinearities and stochastic regularizers with gaussian error linear units. CoRR abs\/1606.08415, http:\/\/arxiv.org\/abs\/1606.08415, 1606.08415"},{"issue":"8","key":"1547_CR44","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"1547_CR45","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., & Hockenmaier, J. (2013). Framing image description as a ranking task: Data, models and evaluation metrics. J Artif Intell Res, 47, 853\u2013899.","journal-title":"J Artif Intell Res"},{"key":"1547_CR46","unstructured":"Hu R, Singh A (2021) Transformer is all you need: Multimodal multitask learning with a unified transformer. CoRR abs\/2102.10772, https:\/\/arxiv.org\/abs\/2102.10772, 2102.10772"},{"key":"1547_CR47","unstructured":"Huang G, Pang B, Zhu Z, Rivera C, Soricut R (2020a) Multimodal pretraining for dense video captioning. In: Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing, Association for Computational Linguistics, Suzhou, China, pp 470\u2013490, https:\/\/www.aclweb.org\/anthology\/2020.aacl-main.48"},{"key":"1547_CR48","unstructured":"Huang Z, Zeng Z, Liu B, Fu D, Fu J (2020b) Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. 2004.00849"},{"key":"1547_CR49","doi-asserted-by":"crossref","unstructured":"Hudson DA, Manning CD (2019) Gqa: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"1547_CR50","doi-asserted-by":"crossref","unstructured":"Jiao X, Yin Y, Shang L, Jiang X, Chen X, Li L, Wang F, Liu Q (2020) Tinybert: Distilling bert for natural language understanding. https:\/\/openreview.net\/forum?id=rJx0Q6EFPB","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"1547_CR51","doi-asserted-by":"crossref","unstructured":"Johnson J, Karpathy A, Fei-Fei L (2016) Densecap: Fully convolutional localization networks for dense captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2016.494"},{"key":"1547_CR52","unstructured":"Karpathy A, Li F (2014) Deep visual-semantic alignments for generating image descriptions. CoRR abs\/1412.2306, http:\/\/arxiv.org\/abs\/1412.2306, 1412.2306"},{"key":"1547_CR53","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: CVPR","DOI":"10.1109\/CVPR.2014.223"},{"key":"1547_CR54","doi-asserted-by":"crossref","unstructured":"Karras T, Laine S, Aila T (2019) A style-based generator architecture for generative adversarial networks. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 4396\u20134405, 10.1109\/CVPR.2019.00453","DOI":"10.1109\/CVPR.2019.00453"},{"key":"1547_CR55","doi-asserted-by":"crossref","unstructured":"Karras T, Laine S, Aittala M, Hellsten J, Lehtinen J, Aila T (2020) Analyzing and improving the image quality of stylegan. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"1547_CR56","doi-asserted-by":"publisher","unstructured":"Kazemzadeh S, Ordonez V, Matten M, Berg T (2014) ReferItGame: Referring to objects in photographs of natural scenes. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), Association for Computational Linguistics, Doha, Qatar, pp 787\u2013798, https:\/\/doi.org\/10.3115\/v1\/D14-1086, https:\/\/www.aclweb.org\/anthology\/D14-1086","DOI":"10.3115\/v1\/D14-1086"},{"key":"1547_CR57","unstructured":"Kervadec C, Antipov G, Baccouche M, Wolf C (2019) Weak supervision helps emergence of word-object alignment and improves vision-language tasks. 1912.03063"},{"key":"1547_CR58","doi-asserted-by":"crossref","unstructured":"Khan S, Naseer M, Hayat M, Zamir SW, Khan FS, Shah M (2021) Transformers in vision: A survey. 2101.01169","DOI":"10.1145\/3505244"},{"key":"1547_CR59","unstructured":"Kim JH, On KW, Lim W, Kim J, Ha JW, Zhang BT (2017) Hadamard product for low-rank bilinear pooling. 1610.04325"},{"key":"1547_CR60","unstructured":"Kim W, Son B, Kim I (2021) Vilt: Vision-and-language transformer without convolution or region supervision. 2102.03334"},{"key":"1547_CR61","unstructured":"Kingma DP, Welling M (2014) Auto-Encoding Variational Bayes. In: 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14-16, 2014, Conference Track Proceedings, http:\/\/arxiv.org\/abs\/1312.6114v10"},{"key":"1547_CR62","unstructured":"Kiros R, Zhu Y, Salakhutdinov R, Zemel RS, Torralba A, Urtasun R, Fidler S (2015) Skip-thought vectors. 1506.06726"},{"key":"1547_CR63","unstructured":"Kitaev N, Kaiser L, Levskaya A (2020) Reformer: The efficient transformer. In: International Conference on Learning Representations, https:\/\/openreview.net\/forum?id=rkgNKkHtvB"},{"key":"1547_CR64","unstructured":"Korbar B, Petroni F, Girdhar R, Torresani L (2020) Video understanding as machine translation. 2006.07203"},{"key":"1547_CR65","doi-asserted-by":"crossref","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalanditis Y, Li LJ, Shamma DA, Bernstein M, Fei-Fei L (2016) Visual genome: Connecting language and vision using crowdsourced dense image annotations","DOI":"10.1007\/s11263-016-0981-7"},{"key":"1547_CR66","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. In: Pereira F, Burges CJC, Bottou L, Weinberger KQ (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a025, pp 1097\u20131105, https:\/\/proceedings.neurips.cc\/paper\/2012\/file\/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf"},{"key":"1547_CR67","doi-asserted-by":"crossref","unstructured":"LeCun Y, Bottou L, Bengio Y, Haffner P (1998) Gradient-based learning applied to document recognition. In: Proceedings of the IEEE, vol\u00a086, pp 2278\u20132324, http:\/\/citeseerx.ist.psu.edu\/viewdoc\/summary?doi=10.1.1.42.7665","DOI":"10.1109\/5.726791"},{"key":"1547_CR68","doi-asserted-by":"crossref","unstructured":"Lei J, Li L, Zhou L, Gan Z, Berg TL, Bansal M, Liu J (2021) Less is more: Clipbert for video-and-language learning via sparse sampling. 2102.06183","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"1547_CR69","unstructured":"Li C, Yan M, Xu H, Luo F, Wang W, Bi B, Huang S (2021a) Semvlp: Vision-language pre-training by aligning semantics at multiple levels. https:\/\/openreview.net\/forum?id=Wg2PSpLZiH"},{"key":"1547_CR70","doi-asserted-by":"publisher","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., & Jiang, D. (2020a). Unicoder-vl: A universal encoder for vision and language by cross-modal pre-training. Proceedings of the AAAI Conference on Artificial Intelligence, 34(07), 11336\u201311344. https:\/\/doi.org\/10.1609\/aaai.v34i07.6795, https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/6795","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"1547_CR71","doi-asserted-by":"crossref","unstructured":"Li L, Chen YC, Cheng Y, Gan Z, Yu L, Liu J (2020b) Hero: Hierarchical encoder for video+language omni-representation pre-training. 2005.00200","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"1547_CR72","unstructured":"Li LH, Yatskar M, Yin D, Hsieh CJ, Chang KW (2019) Visualbert: A simple and performant baseline for vision and language. In: Arxiv"},{"key":"1547_CR73","doi-asserted-by":"crossref","unstructured":"Li X, Yin X, Li C, Zhang P, Hu X, Zhang L, Wang L, Hu H, Dong L, Wei F, Choi Y, Gao J (2020c) Oscar: Object-semantics aligned pre-training for vision-language tasks. 2004.06165","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"1547_CR74","unstructured":"Li X, Zhang Y, Liu C, Shuai B, Zhu Y, Brattoli B, Chen H, Marsic I, Tighe J (2021b) Vidtr: Video transformer without convolutions. CoRR abs\/2104.11746, https:\/\/arxiv.org\/abs\/2104.11746, 2104.11746"},{"key":"1547_CR75","unstructured":"Lin CY (2004) ROUGE: A package for automatic evaluation of summaries. In: Text Summarization Branches Out, Association for Computational Linguistics, Barcelona, Spain, pp 74\u201381, https:\/\/www.aclweb.org\/anthology\/W04-1013"},{"key":"1547_CR76","unstructured":"Lin J, Yang A, Zhang Y, Liu J, Zhou J, Yang H (2021) M6-v0: Vision-and-language interaction for multi-modal pretraining. 2003.13198"},{"key":"1547_CR77","unstructured":"Lin TY, Maire M, Belongie S, Bourdev L, Girshick R, Hays J, Perona P, Ramanan D, Zitnick CL, Doll\u00e1r P (2014) Microsoft coco: Common objects in context. http:\/\/arxiv.org\/abs\/1405.0312, cite arxiv:1405.0312Comment: 1) updated annotation pipeline description and figures; 2) added new section describing datasets splits; 3) updated author list"},{"key":"1547_CR78","doi-asserted-by":"publisher","unstructured":"Liu X, He P, Chen W, Gao J (2019a) Multi-task deep neural networks for natural language understanding. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, Florence, Italy, pp 4487\u20134496, https:\/\/doi.org\/10.18653\/v1\/P19-1441, https:\/\/www.aclweb.org\/anthology\/P19-1441","DOI":"10.18653\/v1\/P19-1441"},{"key":"1547_CR79","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2019b) Roberta: A robustly optimized bert pretraining approach. http:\/\/arxiv.org\/abs\/1907.11692, cite arxiv:1907.11692"},{"key":"1547_CR80","unstructured":"Lu J, Yang J, Batra D, Parikh D (2016) Hierarchical question-image co-attention for visual question answering. In: Lee D, Sugiyama M, Luxburg U, Guyon I, Garnett R (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a029, pp 289\u2013297, https:\/\/proceedings.neurips.cc\/paper\/2016\/file\/9dcb88e0137649590b755372b040afad-Paper.pdf"},{"key":"1547_CR81","unstructured":"Lu J, Batra D, Parikh D, Lee S (2019) Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Wallach H, Larochelle H, Beygelzimer A, d\u2019Alch\u00e9-Buc F, Fox E, Garnett R (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a032, pp 13\u201323, https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/c74d97b01eae257e44aa9d5bade97baf-Paper.pdf"},{"key":"1547_CR82","unstructured":"Luo F, Yang P, Li S, Ren X, Sun X (2020a) Capt: Contrastive pre-training for learning denoised sequence representations. 2010.06351"},{"key":"1547_CR83","unstructured":"Luo H, Ji L, Shi B, Huang H, Duan N, Li T, Li J, Bharti T, Zhou M (2020b) Univl: A unified video and language pre-training model for multimodal understanding and generation. 2002.06353"},{"key":"1547_CR84","doi-asserted-by":"crossref","unstructured":"Miech A, Zhukov D, Alayrac JB, Tapaswi M, Laptev I, Sivic J (2019) HowTo100M: Learning a Text-Video Embedding by Watching Hundred Million Narrated Video Clips. In: ICCV","DOI":"10.1109\/ICCV.2019.00272"},{"key":"1547_CR85","unstructured":"Mikolov T, Sutskever I, Chen K, Corrado GS, Dean J (2013) Distributed representations of words and phrases and their compositionality. In: Burges CJC, Bottou L, Welling M, Ghahramani Z, Weinberger KQ (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a026, pp 3111\u20133119, https:\/\/proceedings.neurips.cc\/paper\/2013\/file\/9aa42b31882ec039965f3c4923ce901b-Paper.pdf"},{"key":"1547_CR86","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1145\/219717.219748","volume":"38","author":"GA Miller","year":"1995","unstructured":"Miller, G. A. (1995). Wordnet: A lexical database for english. COMMUNICATIONS OF THE ACM, 38, 39\u201341.","journal-title":"COMMUNICATIONS OF THE ACM"},{"key":"1547_CR87","unstructured":"Ordonez V, Kulkarni G, Berg T (2011) Im2text: Describing images using 1 million captioned photographs. In: Shawe-Taylor J, Zemel R, Bartlett P, Pereira F, Weinberger KQ (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a024, pp 1143\u20131151, https:\/\/proceedings.neurips.cc\/paper\/2011\/file\/5dd9db5e033da9c6fb5ba83c7a7ebea9-Paper.pdf"},{"key":"1547_CR88","doi-asserted-by":"publisher","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, Philadelphia, Pennsylvania, USA, pp 311\u2013318, https:\/\/doi.org\/10.3115\/1073083.1073135, https:\/\/www.aclweb.org\/anthology\/P02-1040","DOI":"10.3115\/1073083.1073135"},{"key":"1547_CR89","unstructured":"Parmar N, Vaswani A, Uszkoreit J, \u0141ukasz Kaiser, Shazeer N, Ku A, Tran D (2018) Image transformer. 1802.05751"},{"key":"1547_CR90","unstructured":"Patashnik O, Wu Z, Shechtman E, Cohen-Or D, Lischinski D (2021) Styleclip: Text-driven manipulation of stylegan imagery. CoRR abs\/2103.17249, https:\/\/arxiv.org\/abs\/2103.17249, 2103.17249"},{"key":"1547_CR91","doi-asserted-by":"publisher","unstructured":"Pennington J, Socher R, Manning C (2014) GloVe: Global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), Association for Computational Linguistics, Doha, Qatar, pp 1532\u20131543, https:\/\/doi.org\/10.3115\/v1\/D14-1162, https:\/\/www.aclweb.org\/anthology\/D14-1162","DOI":"10.3115\/v1\/D14-1162"},{"key":"1547_CR92","doi-asserted-by":"publisher","unstructured":"Peters M, Neumann M, Iyyer M, Gardner M, Clark C, Lee K, Zettlemoyer L (2018) Deep contextualized word representations. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers), Association for Computational Linguistics, New Orleans, Louisiana, pp 2227\u20132237, https:\/\/doi.org\/10.18653\/v1\/N18-1202, https:\/\/www.aclweb.org\/anthology\/N18-1202","DOI":"10.18653\/v1\/N18-1202"},{"key":"1547_CR93","unstructured":"Qi D, Su L, Song J, Cui E, Bharti T, Sacheti A (2020) Imagebert: Cross-modal pre-training with large-scale weak-supervised image-text data. 2001.07966"},{"key":"1547_CR94","unstructured":"Radford A, Sutskever I (2018) Improving language understanding by generative pre-training. In: arxiv"},{"key":"1547_CR95","unstructured":"Radford A, Wu J, Child R, Luan D, Amodei D, Sutskever I (2019) Language Models are Unsupervised Multitask Learners https:\/\/openai.com\/blog\/better-language-models\/"},{"key":"1547_CR96","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I (2021) Learning transferable visual models from natural language supervision. 2103.00020"},{"key":"1547_CR97","unstructured":"Ramesh A, Pavlov M, Goh G, Gray S, Voss C, Radford A, Chen M, Sutskever I (2021) Zero-shot text-to-image generation. 2102.12092"},{"key":"1547_CR98","unstructured":"Reed S, Akata Z, Yan X, Logeswaran L, Schiele B, Lee H (2016) Generative adversarial text-to-image synthesis. In: Proceedings of The 33rd International Conference on Machine Learning"},{"key":"1547_CR99","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: Towards real-time object detection with region proposal networks. In: Cortes C, Lawrence N, Lee D, Sugiyama M, Garnett R (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a028, pp 91\u201399, https:\/\/proceedings.neurips.cc\/paper\/2015\/file\/14bfa6bb14875e45bba028a21ed38046-Paper.pdf"},{"key":"1547_CR100","unstructured":"Rezende DJ, Mohamed S, Wierstra D (2014) Stochastic backpropagation and approximate inference in deep generative models. In: Xing EP, Jebara T (eds) Proceedings of the 31st International Conference on Machine Learning, PMLR, Bejing, China, Proceedings of Machine Learning Research, vol\u00a032, pp 1278\u20131286, http:\/\/proceedings.mlr.press\/v32\/rezende14.html"},{"key":"1547_CR101","unstructured":"Sanh V, Debut L, Chaumond J, Wolf T (2020) Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. 1910.01108"},{"key":"1547_CR102","doi-asserted-by":"crossref","unstructured":"Shao S, Li Z, Zhang T, Peng C, Yu G, Zhang X, Li J, Sun J (2019) Objects365: A large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"1547_CR103","unstructured":"Sharir O, Peleg B, Shoham Y (2020) The cost of training nlp models: A concise overview. 2004.08900"},{"key":"1547_CR104","doi-asserted-by":"publisher","unstructured":"Sharma P, Ding N, Goodman S, Soricut R (2018) Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Association for Computational Linguistics, Melbourne, Australia, pp 2556\u20132565, https:\/\/doi.org\/10.18653\/v1\/P18-1238, https:\/\/www.aclweb.org\/anthology\/P18-1238","DOI":"10.18653\/v1\/P18-1238"},{"key":"1547_CR105","unstructured":"Simonyan K, Zisserman A (2015) Very deep convolutional networks for large-scale image recognition. 1409.1556"},{"key":"1547_CR106","unstructured":"Su W, Zhu X, Cao Y, Li B, Lu L, Wei F, Dai J (2020) Vl-bert: Pre-training of generic visual-linguistic representations. In: International Conference on Learning Representations, https:\/\/openreview.net\/forum?id=SygXPaEYvH"},{"key":"1547_CR107","doi-asserted-by":"publisher","unstructured":"Suhr A, Zhou S, Zhang A, Zhang I, Bai H, Artzi Y (2019) A corpus for reasoning about natural language grounded in photographs. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, Florence, Italy, pp 6418\u20136428, https:\/\/doi.org\/10.18653\/v1\/P19-1644, https:\/\/www.aclweb.org\/anthology\/P19-1644","DOI":"10.18653\/v1\/P19-1644"},{"key":"1547_CR108","doi-asserted-by":"crossref","unstructured":"Sun C, Shrivastava A, Singh S, Gupta A (2017) Revisiting unreasonable effectiveness of data in deep learning era. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2017.97"},{"key":"1547_CR109","doi-asserted-by":"crossref","unstructured":"Sun C, Myers A, Vondrick C, Murphy K, Schmid C (2019) Videobert: A joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"1547_CR110","unstructured":"Sun C, Baradel F, Murphy K, Schmid C (2020) Learning video representations using contrastive bidirectional transformer. https:\/\/openreview.net\/forum?id=rJgRMkrtDr"},{"key":"1547_CR111","doi-asserted-by":"publisher","unstructured":"Tan H, Bansal M (2019) LXMERT: Learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), Association for Computational Linguistics, Hong Kong, China, pp 5100\u20135111, https:\/\/doi.org\/10.18653\/v1\/D19-1514, https:\/\/www.aclweb.org\/anthology\/D19-1514","DOI":"10.18653\/v1\/D19-1514"},{"key":"1547_CR112","unstructured":"Tan M, Le Q (2019) EfficientNet: Rethinking model scaling for convolutional neural networks. In: Chaudhuri K, Salakhutdinov R (eds) Proceedings of the 36th International Conference on Machine Learning, PMLR, Proceedings of Machine Learning Research, vol\u00a097, pp 6105\u20136114, http:\/\/proceedings.mlr.press\/v97\/tan19a.html"},{"key":"1547_CR113","doi-asserted-by":"crossref","unstructured":"Tan M, Pang R, Le QV (2020) Efficientdet: Scalable and efficient object detection. 1911.09070","DOI":"10.1109\/CVPR42600.2020.01079"},{"key":"1547_CR114","doi-asserted-by":"crossref","unstructured":"Tapaswi M, Zhu Y, Stiefelhagen R, Torralba A, Urtasun R, Fidler S (2016) MovieQA: Understanding Stories in Movies through Question-Answering. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2016.501"},{"key":"1547_CR115","unstructured":"Touvron H, Cord M, Douze M, Massa F, Sablayrolles A, J\u00e9gou H (2020) Training data-efficient image transformers & distillation through attention. 2012.12877"},{"key":"1547_CR116","doi-asserted-by":"crossref","unstructured":"Ushiku Y, Harada T, Kuniyoshi Y (2012) Efficient image annotation for automatic sentence generation. In: ACM Multimedia","DOI":"10.1145\/2393347.2393424"},{"key":"1547_CR117","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser Lu, Polosukhin I (2017) Attention is all you need. In: Guyon I, Luxburg UV, Bengio S, Wallach H, Fergus R, Vishwanathan S, Garnett R (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a030, pp 5998\u20136008, https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"1547_CR118","doi-asserted-by":"crossref","unstructured":"Vedantam R, Zitnick CL, Parikh D (2015) Cider: Consensus-based image description evaluation. 1411.5726","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1547_CR119","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney RJ, Darrell T, Saenko K (2015a) Sequence to sequence - video to text. CoRR abs\/1505.00487, http:\/\/arxiv.org\/abs\/1505.00487, 1505.00487"},{"key":"1547_CR120","doi-asserted-by":"publisher","unstructured":"Venugopalan S, Xu H, Donahue J, Rohrbach M, Mooney R, Saenko K (2015b) Translating videos to natural language using deep recurrent neural networks. In: Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics, Denver, Colorado, pp 1494\u20131504, https:\/\/doi.org\/10.3115\/v1\/N15-1173, https:\/\/www.aclweb.org\/anthology\/N15-1173","DOI":"10.3115\/v1\/N15-1173"},{"key":"1547_CR121","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: A neural image caption generator. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 3156\u20133164, 10.1109\/CVPR.2015.7298935","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1547_CR122","doi-asserted-by":"crossref","unstructured":"Vondrick C, Shrivastava A, Fathi A, Guadarrama S, Murphy K (2018) Tracking emerges by colorizing videos. In: Proceedings of the European Conference on Computer Vision (ECCV)","DOI":"10.1007\/978-3-030-01261-8_24"},{"key":"1547_CR123","unstructured":"Wang B, Shang L, Lioma C, Jiang X, Yang H, Liu Q, Simonsen JG (2021a) On position embeddings in bert. In: International Conference on Learning Representations, https:\/\/openreview.net\/forum?id=onxoVA9FxMw"},{"key":"1547_CR124","doi-asserted-by":"crossref","unstructured":"Wang H, Zhu Y, Adam H, Yuille A, Chen LC (2020a) Max-deeplab: End-to-end panoptic segmentation with mask transformers. 2012.00759","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"1547_CR125","unstructured":"Wang J, Hu X, Zhang P, Li X, Wang L, Zhang L, Gao J, Liu Z (2020b) Minivlm: A smaller and faster vision-language model. 2012.06946"},{"key":"1547_CR126","unstructured":"Wang W, Xie E, Li X, Fan D, Song K, Liang D, Lu T, Luo P, Shao L (2021b) Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. CoRR abs\/2102.12122, https:\/\/arxiv.org\/abs\/2102.12122, 2102.12122"},{"key":"1547_CR127","unstructured":"Wang X, Gupta A (2015) Unsupervised learning of visual representations using videos. CoRR abs\/1505.00687, http:\/\/arxiv.org\/abs\/1505.00687, 1505.00687"},{"key":"1547_CR128","doi-asserted-by":"publisher","unstructured":"Wang, Y., Mohamed, A., Le, D., Liu, C., Xiao, A., Mahadeokar, J., Huang, H., Tjandra, A., Zhang, X., Zhang, F., et\u00a0al. (2020c). Transformer-based acoustic modeling for hybrid speech recognition. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) https:\/\/doi.org\/10.1109\/icassp40776.2020.9054345, http:\/\/dx.doi.org\/10.1109\/ICASSP40776.2020.9054345","DOI":"10.1109\/icassp40776.2020.9054345"},{"key":"1547_CR129","unstructured":"Wu, L., Fisch, A., Chopra, S., Adams, K., Bordes, A., & Weston, J. (2017). Starspace: Embed all the things! http:\/\/arxiv.org\/abs\/1709.03856, cite arxiv:1709.03856"},{"key":"1547_CR130","doi-asserted-by":"crossref","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., & Murphy, K. (2018). Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. 1712.04851","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"1547_CR131","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., & Rui, Y. (2016). Msr-vtt: A large video description dataset for bridging video and language. In: IEEE International Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2016.571"},{"key":"1547_CR132","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., & Bengio, Y. (2015). Show, attend and tell: Neural image caption generation with visual attention. In: Bach F, Blei D (eds) In: Proceedings of the 32nd International Conference on Machine Learning, PMLR, Lille, France, Proceedings of Machine Learning Research, vol\u00a037, pp. 2048\u20132057, http:\/\/proceedings.mlr.press\/v37\/xuc15.html"},{"key":"1547_CR133","unstructured":"Yang, J., Ren, Z., Xu, M., Chen, X., Crandall, D., Parikh, D., & Batra, D. (2019a). Embodied visual recognition. 1904.04404"},{"key":"1547_CR134","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., & Smola, A. J. (2015). Stacked attention networks for image question answering. CoRR abs\/1511.02274, http:\/\/arxiv.org\/abs\/1511.02274"},{"key":"1547_CR135","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R. R., & Le, Q. V. (2019b). Xlnet: Generalized autoregressive pretraining for language understanding. In: Wallach H, Larochelle H, Beygelzimer A, d\u2019Alch\u00e9-Buc F, Fox E, Garnett R (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol\u00a032, pp 5753\u20135763, https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/dc6a7e655d7e5840e66733e9ee67cc69-Paper.pdf"},{"key":"1547_CR136","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., & Luo, J. (2016). Image captioning with semantic attention. 1603.03925","DOI":"10.1109\/CVPR.2016.503"},{"key":"1547_CR137","doi-asserted-by":"publisher","unstructured":"Young, P., Lai, A., Hodosh, M., & Hockenmaier, J. (2014). From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics, 2, 67\u201378. https:\/\/doi.org\/10.1162\/tacl_a_00166, https:\/\/www.aclweb.org\/anthology\/Q14-1006","DOI":"10.1162\/tacl_a_00166"},{"key":"1547_CR138","unstructured":"Yu, F., Tang, J., Yin, W., Sun, Y., Tian, H., Wu, H., & Wang, H. (2020). Ernie-vil: Knowledge enhanced vision-language representations through scene graph. 2006.16934"},{"key":"1547_CR139","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision - ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A. C., & Berg, T. L. (2016). Modeling context in referring expressions. In B. Leibe, J. Matas, N. Sebe, & M. Welling (Eds.), Computer Vision - ECCV 2016 (pp. 69\u201385). Cham: Springer."},{"key":"1547_CR140","doi-asserted-by":"publisher","unstructured":"Zadeh, A., Zellers, R., Pincus, E., & Morency, L. P. (2016). Multimodal sentiment intensity analysis in videos: Facial gestures and verbal messages. IEEE Intelligent Systems, 31(6), 82\u201388. https:\/\/doi.org\/10.1109\/MIS.2016.94, http:\/\/ieeexplore.ieee.org\/abstract\/document\/7742221\/","DOI":"10.1109\/MIS.2016.94"},{"key":"1547_CR141","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., & Choi, Y. (2019). From recognition to cognition: Visual commonsense reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"1547_CR142","doi-asserted-by":"crossref","unstructured":"Zhang, B., Hu, H., & Sha, F. (2018a). Cross-modal and hierarchical modeling of video and text. 1810.07212","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"1547_CR143","doi-asserted-by":"crossref","unstructured":"Zhang, P., Goyal, Y., Summers-Stay, D., Batra, D., & Parikh, D. (2016). Yin and Yang: Balancing and answering binary visual questions. In: Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2016.542"},{"key":"1547_CR144","doi-asserted-by":"publisher","unstructured":"Zhang, S., Jiang, T., Wang, T., Kuang, K., Zhao, Z., Zhu, J., Yu, J., Yang, H., & Wu, F. (2020). Devlbert. In: Proceedings of the 28th ACM International Conference on Multimedia https:\/\/doi.org\/10.1145\/3394171.3413518, https:\/\/doi.org\/10.1145\/3394171.3413518","DOI":"10.1145\/3394171.3413518 10.1145\/3394171.3413518"},{"key":"1547_CR145","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Xie, Y., & Yang, L. (2018b). Photographic text-to-image synthesis with a hierarchically-nested adversarial network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2018.00649"},{"key":"1547_CR146","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Koch, P., & Corso, J. J. (2016). Watch what you just said: Image captioning with text-conditional attention. 1606.04621","DOI":"10.1145\/3126686.3126717"},{"key":"1547_CR147","unstructured":"Zhou, L., Xu. C., & Corso, J. J. (2017). Procnets: Learning to segment procedures in untrimmed and unconstrained videos. CoRR abs\/1703.09788, http:\/\/arxiv.org\/abs\/1703.09788, 1703.09788"},{"key":"1547_CR148","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J. J., Socher, R., & Xiong, C. (2018). End-to-end dense video captioning with masked transformer. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2018.00911"},{"key":"1547_CR149","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J. J., & Gao, J. (2019). Unified vision-language pre-training for image captioning and vqa. 1909.11059"},{"key":"1547_CR150","doi-asserted-by":"crossref","unstructured":"Zhu, L. & Yang, Y. (2020). Actbert: Learning global-local video-text representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"1547_CR151","doi-asserted-by":"crossref","unstructured":"Zhukov, D., Alayrac, J. B., Cinbis, R. G., Fouhey, D., Laptev, I., & Sivic, J. (2019). Cross-task weakly supervised learning from instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2019.00365"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-021-01547-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-021-01547-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-021-01547-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,15]],"date-time":"2024-09-15T17:44:12Z","timestamp":1726422252000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-021-01547-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,4]]},"references-count":151,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2022,2]]}},"alternative-id":["1547"],"URL":"https:\/\/doi.org\/10.1007\/s11263-021-01547-8","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,1,4]]},"assertion":[{"value":"6 March 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 November 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 January 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}