{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T13:53:00Z","timestamp":1774965180487,"version":"3.50.1"},"publisher-location":"Cham","reference-count":64,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585570","type":"print"},{"value":"9783030585587","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58558-7_38","type":"book-chapter","created":{"date-parts":[[2020,10,28]],"date-time":"2020-10-28T09:03:08Z","timestamp":1603875788000},"page":"647-664","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":102,"title":["Connecting Vision and Language with Localized Narratives"],"prefix":"10.1007","author":[{"given":"Jordi","family":"Pont-Tuset","sequence":"first","affiliation":[]},{"given":"Jasper","family":"Uijlings","sequence":"additional","affiliation":[]},{"given":"Soravit","family":"Changpinyo","sequence":"additional","affiliation":[]},{"given":"Radu","family":"Soricut","sequence":"additional","affiliation":[]},{"given":"Vittorio","family":"Ferrari","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,10,29]]},"reference":[{"key":"38_CR1","unstructured":"Amodei, D., et al.: Deep speech 2: end-to-end speech recognition in English and Mandarin. In: ICML (2016)"},{"key":"38_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: ECCV (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"38_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"38_CR4","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"38_CR5","doi-asserted-by":"crossref","unstructured":"Benenson, R., Popov, S., Ferrari, V.: Large-scale interactive object segmentation with human annotators. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01197"},{"key":"38_CR6","doi-asserted-by":"crossref","unstructured":"Bigham, J.P., et al.: VizWiz: nearly real-time answers to visual questions. In: Proceedings of the 23nd Annual ACM Symposium on User Interface Software and Technology (2010)","DOI":"10.1145\/1866029.1866080"},{"key":"38_CR7","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Pang, B., Sharma, P., Soricut, R.: Decoupled box proposal and featurization with ultrafine-grained semantic labels improve image captioning and visual question answering. In: EMNLP-IJCNLP (2019)","DOI":"10.18653\/v1\/D19-1155"},{"key":"38_CR8","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. 
arXiv (2015)"},{"key":"38_CR9","doi-asserted-by":"crossref","unstructured":"Cirik, V., Morency, L.P., Berg-Kirkpatrick, T.: Visual referring expression recognition: what do systems actually learn? In: NAACL (2018)","DOI":"10.18653\/v1\/N18-2123"},{"key":"38_CR10","doi-asserted-by":"crossref","unstructured":"Cornia, M., Baraldi, L., Cucchiara, R.: Show, control and tell: a framework for generating controllable and grounded captions. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00850"},{"key":"38_CR11","unstructured":"Dai, D.: Towards cost-effective and performance-aware vision algorithms. Ph.D. thesis, ETH Zurich (2016)"},{"key":"38_CR12","unstructured":"Damen, D., et al.: The EPIC-KITCHENS dataset: collection, challenges and baselines. IEEE Trans. PAMI (2020)"},{"key":"38_CR13","doi-asserted-by":"crossref","unstructured":"Dogan, P., Sigal, L., Gross, M.: Neural sequential phrase grounding (seqground). In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00430"},{"key":"38_CR14","unstructured":"Google cloud speech-to-text API. https:\/\/cloud.google.com\/speech-to-text\/"},{"key":"38_CR15","doi-asserted-by":"crossref","unstructured":"Graves, A., Mohamed, A.R., Hinton, G.: Speech recognition with deep recurrent neural networks. In: ICASSP (2013)","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"38_CR16","doi-asserted-by":"crossref","unstructured":"Gygli, M., Ferrari, V.: Efficient object annotation via speaking and pointing. In: IJCV (2019)","DOI":"10.1007\/s11263-019-01255-4"},{"key":"38_CR17","doi-asserted-by":"crossref","unstructured":"Gygli, M., Ferrari, V.: Fast object class labelling via speech. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00551"},{"key":"38_CR18","doi-asserted-by":"crossref","unstructured":"Harwath, D., Recasens, A., Sur\u00eds, D., Chuang, G., Torralba, A., Glass, J.: Jointly discovering visual objects and spoken words from raw sensory input. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01231-1_40"},{"key":"38_CR19","unstructured":"Honnibal, M., Montani, I.: spaCy 2: natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing (2017). spacy.io"},{"key":"38_CR20","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"38_CR21","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: fully convolutional localization networks for dense captioning. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"38_CR22","unstructured":"Kahneman, D.: Attention and effort. Citeseer (1973)"},{"key":"38_CR23","unstructured":"Kalchbrenner, N., et al.: Efficient neural audio synthesis. In: ICML (2018)"},{"key":"38_CR24","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: Referitgame: referring to objects in photographs of natural scenes. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"38_CR25","doi-asserted-by":"crossref","unstructured":"Kim, D.J., Choi, J., Oh, T.H., Kweon, I.S.: Dense relational captioning: triple-stream networks for relationship-based captioning. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00643"},{"key":"38_CR26","doi-asserted-by":"crossref","unstructured":"Krause, J., Johnson, J., Krishna, R., Fei-Fei, L.: A hierarchical approach for generating descriptive image paragraphs. 
In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.356"},{"issue":"1","key":"38_CR27","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. IJCV 123(1), 32\u201373 (2017)","journal-title":"IJCV"},{"key":"38_CR28","unstructured":"Kruskal, J.B., Liberman, M.: The symmetric time-warping problem: from continuous to discrete. In: Time Warps, String Edits, and Macromolecules - The Theory and Practice of Sequence Comparison, chap. 4. CSLI Publications (1999)"},{"key":"38_CR29","unstructured":"Kuznetsova, A., et al.: The Open Images Dataset V4: Unified image classification, object detection, and visual relationship detection at scale. arXiv preprint arXiv:1811.00982 (2018)"},{"key":"38_CR30","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out (2004)"},{"key":"38_CR31","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"38_CR32","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, J., Sha, F., Yuille, A.: Attention correctness in neural image captioning. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.11197"},{"key":"38_CR33","doi-asserted-by":"crossref","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Neural baby talk. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00754"},{"key":"38_CR34","doi-asserted-by":"crossref","unstructured":"Malinowski, M., Rohrbach, M., Fritz, M.: Ask your neurons: a neural-based approach to answering questions about images. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.9"},{"key":"38_CR35","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"38_CR36","unstructured":"Mehri, S., et al.: Samplernn: an unconditional end-to-end neural audio generation model. In: ICLR (2017)"},{"key":"38_CR37","unstructured":"Oord, A.V.D., et al.: Wavenet: a generative model for raw audio. arXiv 1609.03499 (2016)"},{"key":"38_CR38","unstructured":"Oviatt, S.: Multimodal interfaces. In: The Human-Computer Interaction Handbook: Fundamentals, Evolving Technologies and Emerging Applications (2003)"},{"key":"38_CR39","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"issue":"1","key":"38_CR40","doi-asserted-by":"publisher","first-page":"74","DOI":"10.1007\/s11263-016-0965-7","volume":"123","author":"BA Plummer","year":"2017","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. IJCV 123(1), 74\u201393 (2017)","journal-title":"IJCV"},{"key":"38_CR41","doi-asserted-by":"crossref","unstructured":"Ravanelli, M., Parcollet, T., Bengio, Y.: The Pytorch-Kaldi speech recognition toolkit. In: ICASSP (2019)","DOI":"10.1109\/ICASSP.2019.8683713"},{"key":"38_CR42","unstructured":"Reed, S.E., Akata, Z., Mohan, S., Tenka, S., Schiele, B., Lee, H.: Learning what and where to draw. In: NeurIPS, pp. 
217\u2013225 (2016)"},{"key":"38_CR43","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NeurIPS (2015)"},{"key":"38_CR44","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Hendricks, L.A., Burns, K., Darrell, T., Saenko, K.: Object hallucination in image captioning. In: EMNLP (2018)","DOI":"10.18653\/v1\/D18-1437"},{"key":"38_CR45","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., et al.: Taking a HINT: leveraging explanations to make vision and language models more grounded. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00268"},{"key":"38_CR46","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"38_CR47","doi-asserted-by":"crossref","unstructured":"Tan, F., Feng, S., Ordonez, V.: Text2Scene: generating compositional scenes from textual descriptions. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00687"},{"key":"38_CR48","doi-asserted-by":"crossref","unstructured":"Vaidyanathan, P., Prud'hommeaux, E., Pelz, J.B., Alm, C.O.: SNAG: spoken narratives and gaze dataset. In: ACL (2018)","DOI":"10.18653\/v1\/P18-2022"},{"key":"38_CR49","doi-asserted-by":"crossref","unstructured":"Vasudevan, A.B., Dai, D., Van Gool, L.: Object referring in visual scene with spoken language. In: WACV (2018)","DOI":"10.1109\/WACV.2018.00206"},{"key":"38_CR50","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"38_CR51","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"4","key":"38_CR52","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2016","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: lessons learned from the 2015 MSCOCO image captioning challenge. IEEE Trans. PAMI 39(4), 652\u2013663 (2016)","journal-title":"IEEE Trans. PAMI"},{"key":"38_CR53","unstructured":"Website: Localized Narratives Data and Visualization (2020). https:\/\/google.github.io\/localized-narratives"},{"key":"38_CR54","doi-asserted-by":"crossref","unstructured":"Wu, S., Wieland, J., Farivar, O., Schiller, J.: Automatic alt-text: computer-generated image descriptions for blind users on a social network service. In: Conference on Computer Supported Cooperative Work and Social Computing (2017)","DOI":"10.1145\/2998181.2998364"},{"key":"38_CR55","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"},{"key":"38_CR56","unstructured":"Yan, S., Yang, H., Robertson, N.: ParaCNN: visual paragraph generation via adversarial twin contextual CNNs. arXiv (2020)"},{"key":"38_CR57","doi-asserted-by":"crossref","unstructured":"Yin, G., Liu, B., Sheng, L., Yu, N., Wang, X., Shao, J.: Semantics disentangling for text-to-image generation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00243"},{"key":"38_CR58","doi-asserted-by":"crossref","unstructured":"Yin, G., Sheng, L., Liu, B., Yu, N., Wang, X., Shao, J.: Context and attribute grounded dense captioning. 
In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00640"},{"key":"38_CR59","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. TACL 2, 67\u201378 (2014)","journal-title":"TACL"},{"key":"38_CR60","unstructured":"Yu, J., Li, J., Yu, Z., Huang, Q.: Multimodal transformer with multi-view visual representation for image captioning. arXiv 1905.07841 (2019)"},{"key":"38_CR61","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Wu, S., Reynolds, L., Azenkot, S.: The effect of computer-generated descriptions on photo-sharing experiences of people with visual impairments. ACM Hum.-Comput. Interact. 1 (2017)","DOI":"10.1145\/3134756"},{"issue":"3","key":"38_CR62","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Semantic understanding of scenes through the ADE20K dataset. IJCV 127(3), 302\u2013321 (2019)","journal-title":"IJCV"},{"key":"38_CR63","doi-asserted-by":"crossref","unstructured":"Zhou, L., Kalantidis, Y., Chen, X., Corso, J.J., Rohrbach, M.: Grounded video description. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00674"},{"key":"38_CR64","unstructured":"Ziegler, Z.M., Melas-Kyriazi, L., Gehrmann, S., Rush, A.M.: Encoder-agnostic adaptation for conditional language generation. arXiv (2019)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58558-7_38","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T08:58:25Z","timestamp":1730105905000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58558-7_38"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585570","9783030585587"],"references-count":64,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58558-7_38","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"29 October 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}