{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:24:25Z","timestamp":1774599865114,"version":"3.50.1"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585389","type":"print"},{"value":"9783030585396","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58539-6_34","type":"book-chapter","created":{"date-parts":[[2020,11,6]],"date-time":"2020-11-06T19:02:46Z","timestamp":1604689366000},"page":"565-580","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":45,"title":["Behind the Scene: Revealing the Secrets of Pre-trained Vision-and-Language Models"],"prefix":"10.1007","author":[{"given":"Jize","family":"Cao","sequence":"first","affiliation":[]},{"given":"Zhe","family":"Gan","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Licheng","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Yen-Chun","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Jingjing","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,11,7]]},"reference":[{"key":"34_CR1","doi-asserted-by":"crossref","unstructured":"van Aken, B., Winter, B., L\u00f6ser, A., Gers, F.A.: How does bert answer questions? A layer-wise analysis of transformer representations. In: CIKM (2019)","DOI":"10.1145\/3357384.3358028"},{"key":"34_CR2","doi-asserted-by":"crossref","unstructured":"Alberti, C., Ling, J., Collins, M., Reitter, D.: Fusion of detected objects in text for visual question answering. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1219"},{"key":"34_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"34_CR4","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"34_CR5","doi-asserted-by":"crossref","unstructured":"Bouraoui, Z., Camacho-Collados, J., Schockaert, S.: Inducing relational knowledge from BERT. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i05.6242"},{"key":"34_CR6","unstructured":"Chen, Y.C., et al.: Uniter: Learning universal image-text representations. arXiv preprint arXiv:1909.11740 (2019)"},{"key":"34_CR7","doi-asserted-by":"crossref","unstructured":"Clark, K., Khandelwal, U., Levy, O., Manning, C.D.: What does bert look at? an analysis of bert\u2019s attention. arXiv preprint arXiv:1906.04341 (2019)","DOI":"10.18653\/v1\/W19-4828"},{"key":"34_CR8","unstructured":"Conneau, A., Kiela, D.: Senteval: An evaluation toolkit for universal sentence representations. 
arXiv preprint arXiv:1803.05449 (2018)"},{"key":"34_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"34_CR10","unstructured":"Gan, Z., Chen, Y.C., Li, L., Zhu, C., Cheng, Y., Liu, J.: Large-scale adversarial training for vision-and-language representation learning. arXiv preprint arXiv:2006.06195 (2020)"},{"issue":"4","key":"34_CR11","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1007\/s11263-018-1116-0","volume":"127","author":"Y Goyal","year":"2018","unstructured":"Goyal, Y., Khot, T., Agrawal, A., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. Int. J. Comput. Vis. 127(4), 398\u2013414 (2018). https:\/\/doi.org\/10.1007\/s11263-018-1116-0","journal-title":"Int. J. Comput. Vis."},{"key":"34_CR12","unstructured":"Htut, P.M., Phang, J., Bordia, S., Bowman, S.R.: Do attention heads in bert track syntactic dependencies? arXiv preprint arXiv:1911.12246 (2019)"},{"key":"34_CR13","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for compositional question answering over real-world images. In: CVPR (2019)"},{"key":"34_CR14","doi-asserted-by":"crossref","unstructured":"Jawahar, G., Sagot, B., Seddah, D.: What does BERT learn about the structure of language? In: ACL (2019)","DOI":"10.18653\/v1\/P19-1356"},{"key":"34_CR15","doi-asserted-by":"crossref","unstructured":"Jiang, Z., Xu, F.F., Araki, J., Neubig, G.: How can we know what language models know? arXiv preprint arXiv:1911.12543 (2019)","DOI":"10.1162\/tacl_a_00324"},{"key":"34_CR16","doi-asserted-by":"crossref","unstructured":"Kovaleva, O., Romanov, A., Rogers, A., Rumshisky, A.: Revealing the dark secrets of BERT. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1445"},{"issue":"1","key":"34_CR17","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int. J. Comput. Vis."},{"key":"34_CR18","doi-asserted-by":"crossref","unstructured":"Lee, K.H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"34_CR19","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Jiang, D., Zhou, M.: Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"34_CR20","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., Liu, J.: Hero: Hierarchical encoder for video+ language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"34_CR21","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"34_CR22","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. 
In: NeurIPS (2019)"},{"key":"34_CR23","doi-asserted-by":"crossref","unstructured":"Lu, J., Goswami, V., Rohrbach, M., Parikh, D., Lee, S.: 12-in-1: multi-task vision and language representation learning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"34_CR24","unstructured":"Michel, P., Levy, O., Neubig, G.: Are sixteen heads really better than one? In: NeurIPS (2019)"},{"key":"34_CR25","doi-asserted-by":"crossref","unstructured":"Petroni, F., et al.: Language models as knowledge bases? In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1250"},{"issue":"1","key":"34_CR26","doi-asserted-by":"publisher","first-page":"74","DOI":"10.1007\/s11263-016-0965-7","volume":"123","author":"BA Plummer","year":"2016","unstructured":"Plummer, B.A., et al.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. Int. J. Comput. Vis. 123(1), 74\u201393 (2016). https:\/\/doi.org\/10.1007\/s11263-016-0965-7","journal-title":"Int. J. Comput. Vis."},{"key":"34_CR27","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. In: ICLR (2020)"},{"key":"34_CR28","doi-asserted-by":"crossref","unstructured":"Suhr, A., Zhou, S., Zhang, A., Zhang, I., Bai, H., Artzi, Y.: A corpus for reasoning about natural language grounded in photographs. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1644"},{"key":"34_CR29","unstructured":"Sun, C., Baradel, F., Murphy, K., Schmid, C.: Contrastive bidirectional transformer for temporal representation learning. arXiv preprint arXiv:1906.05743 (2019)"},{"key":"34_CR30","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"34_CR31","doi-asserted-by":"crossref","unstructured":"Talmor, A., Elazar, Y., Goldberg, Y., Berant, J.: olmpics-on what language model pre-training captures. arXiv preprint arXiv:1912.13283 (2019)","DOI":"10.1162\/tacl_a_00342"},{"key":"34_CR32","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"34_CR33","doi-asserted-by":"crossref","unstructured":"Tenney, I., Das, D., Pavlick, E.: BERT rediscovers the classical NLP pipeline. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1452"},{"key":"34_CR34","unstructured":"Tenney, I., et al.: What do you learn from context? Probing for sentence structure in contextualized word representations. In: ICLR (2019)"},{"key":"34_CR35","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"34_CR36","doi-asserted-by":"crossref","unstructured":"Voita, E., Talbot, D., Moiseev, F., Sennrich, R., Titov, I.: Analyzing multi-head self-attention: specialized heads do the heavy lifting, the rest can be pruned. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1580"},{"key":"34_CR37","doi-asserted-by":"crossref","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: ECCV (2016)","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"34_CR38","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: visual commonsense reasoning. 
In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"34_CR39","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J.J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"34_CR40","doi-asserted-by":"crossref","unstructured":"Zhou, X., Zhang, Y., Cui, L., Huang, D.: Evaluating commonsense in pre-trained language models. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i05.6523"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58539-6_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,6]],"date-time":"2024-11-06T00:19:12Z","timestamp":1730852352000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58539-6_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585389","9783030585396"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58539-6_34","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"7 November 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference 
organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}