{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T18:06:47Z","timestamp":1762020407473,"version":"build-2065373602"},"reference-count":22,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9053397","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T20:21:13Z","timestamp":1586463673000},"page":"6304-6308","source":"Crossref","is-referenced-by-count":7,"title":["Looking Enhances Listening: Recovering Missing Speech Using Images"],"prefix":"10.1109","author":[{"given":"Tejas","family":"Srinivasan","sequence":"first","affiliation":[]},{"given":"Ramon","family":"Sanabria","sequence":"additional","affiliation":[]},{"given":"Florian","family":"Metze","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","article-title":"Analyzing utility of visual context multimodal speech recognition under noisy conditions","author":"srinivasan","year":"2019","journal-title":"Proc of The How2 Challenge New Tasks for Vision and Language Workshop"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1329"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1422"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404800"},{"key":"ref16","article-title":"Framing image description as a ranking task: Data, models and evaluation metrics (extended abstract)","author":"hodosh","year":"2015","journal-title":"Proc of the Twenty-Fourth International Joint Conference on Artificial Intelligence IJCAI"},{"key":"ref17","article-title":"How2: a large-scale dataset for multimodal language understanding","author":"sanabria","year":"2018","journal-title":"In Proc of the Workshop on Visually Grounded Interaction and Language (ViGIL)"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref19","article-title":"Places: A 10 million image database for scene recognition","author":"zhou","year":"2017","journal-title":"Transactions on Pattern Analysis and Machine Intelligence (PAMI)"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462439"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.121"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639551"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682750"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-412"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953112"},{"key":"ref2","article-title":"A shared task on multimodal machine translation and crosslingual image description (WMT)","author":"specia","year":"2016","journal-title":"Proc Conference on Machine Translation (WMT)"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846320"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref20","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"Proc IEEE Workshop Automatic Speech Recognition and Understanding (ASRU)"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2014.80"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1515\/pralin-2017-0035"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2020,5,4]]},"location":"Barcelona, Spain","end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09053397.pdf?arnumber=9053397","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,28]],"date-time":"2022-06-28T00:16:36Z","timestamp":1656375396000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9053397\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":22,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9053397","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}