{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T09:41:17Z","timestamp":1762508477578},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,1,10]],"date-time":"2021-01-10T00:00:00Z","timestamp":1610236800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,10]],"date-time":"2021-01-10T00:00:00Z","timestamp":1610236800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,10]],"date-time":"2021-01-10T00:00:00Z","timestamp":1610236800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,1,10]]},"DOI":"10.1109\/icpr48806.2021.9413269","type":"proceedings-article","created":{"date-parts":[[2021,5,6]],"date-time":"2021-05-06T02:15:54Z","timestamp":1620267354000},"page":"1212-1219","source":"Crossref","is-referenced-by-count":8,"title":["A Novel Attention-based Aggregation Function to Combine Vision and Language"],"prefix":"10.1109","author":[{"given":"Matteo","family":"Stefanini","sequence":"first","affiliation":[]},{"given":"Marcella","family":"Cornia","sequence":"additional","affiliation":[]},{"given":"Lorenzo","family":"Baraldi","sequence":"additional","affiliation":[]},{"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref38","article-title":"Microsoft COCO: Common Objects in Context","author":"lin","year":"2014","journal-title":"ECCV"},{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1109\/CVPR.2017.243"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1109\/CVPR.2017.634"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1109\/CVPR.2016.90"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.1109\/CVPR.2015.7298594"},{"doi-asserted-by":"publisher","key":"ref37","DOI":"10.1109\/CVPR.2017.670"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref35","article-title":"Poly-encoders: Architectures and Pre-training Strategies for Fast and Accurate Multi-sentence Scoring","author":"humeau","year":"2020","journal-title":"ICLRE"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.1109\/CVPR.2017.106"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1109\/CVPR.2016.12"},{"doi-asserted-by":"publisher","key":"ref40","DOI":"10.1007\/s11263-016-0981-7"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref12","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref13","article-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","author":"devlin","year":"2019","journal-title":"NAACL-HLT"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.1109\/CVPR.2018.00750"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.24963\/ijcai.2019\/526"},{"doi-asserted-by":"publisher","key":"ref16","DOI":"10.1109\/ICCV.2019.01041"},{"key":"ref17","article-title":"Multi-modality latent interaction network for visual question answering","author":"gao","year":"2019","journal-title":"ICCV"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1109\/ICCV.2019.00475"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1109\/CVPR.2018.00419"},{"key":"ref28","article-title":"Imagenet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"NeurIPS"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/CVPR42600.2020.01059"},{"doi-asserted-by":"publisher","key":"ref27","DOI":"10.1109\/CVPR.2019.00644"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/ICRA40945.2020.9196653"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/ICPR.2018.8545064"},{"key":"ref29","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"ICLRE"},{"key":"ref5","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2014","journal-title":"NeurIPS workshop"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"ref7","article-title":"VSE++: Improving Visual-Semantic Embeddings with Hard Negatives","author":"faghri","year":"2018","journal-title":"BMVC"},{"key":"ref2","article-title":"Show, Reward and Tell: Automatic Generation of Narrative Paragraph From Photo Stream by Adversarial Training","author":"wang","year":"2018","journal-title":"AAAI"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.1016\/j.patrec.2019.11.018"},{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.1109\/CVPR.2018.00636"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1007\/978-3-030-30645-8_66"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1109\/ICCV.2017.93"},{"key":"ref21","article-title":"A multi-world approach to question answering about real-world scenes based on uncertain input","author":"malinowski","year":"2014","journal-title":"NeurIPS"},{"doi-asserted-by":"publisher","key":"ref42","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref24","article-title":"Bilinear Attention Networks","author":"kim","year":"2018","journal-title":"NeurIPS"},{"key":"ref41","article-title":"Adam: A Method for Stochastic Optimization","author":"kingma","year":"2015","journal-title":"ICLRE"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.1109\/CVPR.2016.10"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.1109\/CVPR.2019.00680"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.1109\/CVPR.2019.00209"}],"event":{"name":"2020 25th International Conference on Pattern Recognition (ICPR)","start":{"date-parts":[[2021,1,10]]},"location":"Milan, Italy","end":{"date-parts":[[2021,1,15]]}},"container-title":["2020 25th International Conference on Pattern Recognition (ICPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9411940\/9411911\/09413269.pdf?arnumber=9413269","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T15:40:49Z","timestamp":1652197249000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9413269\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,1,10]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/icpr48806.2021.9413269","relation":{},"subject":[],"published":{"date-parts":[[2021,1,10]]}}}