{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T13:19:16Z","timestamp":1725715156979},"reference-count":36,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9053428","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T16:21:13Z","timestamp":1586449273000},"page":"4352-4356","source":"Crossref","is-referenced-by-count":7,"title":["Trilingual Semantic Embeddings of Visually Grounded Speech with Self-Attention Mechanisms"],"prefix":"10.1109","author":[{"given":"Yasunori","family":"Ohishi","sequence":"first","affiliation":[{"name":"NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan"}]},{"given":"Akisato","family":"Kimura","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan"}]},{"given":"Takahito","family":"Kawanishi","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan"}]},{"given":"Kunio","family":"Kashino","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan"}]},{"given":"David","family":"Harwath","sequence":"additional","affiliation":[{"name":"MIT Computer Science and Artificial Intelligence Laboratory,Cambridge,MA,USA"}]},{"given":"James","family":"Glass","sequence":"additional","affiliation":[{"name":"MIT Computer Science and Artificial Intelligence Laboratory,Cambridge,MA,USA"}]}],"member":"263","reference":[{"key":"ref33","article-title":"Self-attention generative adversarial networks","author":"zhang","year":"2019","journal-title":"Proc ICML"},{"key":"ref32","article-title":"Learning deep features for scene recognition using places database","author":"zhou","year":"2014","journal-title":"Proc NIPS"},{"key":"ref31","article-title":"Symbolic inductive bias for visually grounded learning of spoken language","author":"chrupała","year":"2019","journal-title":"Proc ACL"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683540"},{"key":"ref36","article-title":"Improving multilingual sentence embedding using bidirectional dual encoder with additive margin softmax","author":"yang","year":"2019","journal-title":"Proc IJCAI"},{"journal-title":"Efficient natural language response suggestion for smartreply","year":"2017","author":"henderson","key":"ref35"},{"key":"ref34","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","author":"karpathy","year":"2014","journal-title":"Proc NIPS"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1047"},{"key":"ref11","article-title":"Jointly discovering visual objects and spoken words from raw sensory input","author":"harwath","year":"2019","journal-title":"International Journal of Computer Vision"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462396"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1718"},{"key":"ref14","article-title":"Representations of language in a model of visually grounded speech signal","author":"chrupała","year":"2017","journal-title":"Proc ACL"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683069"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K19-1006"},{"article-title":"Microsoft COCO captions: Data collection and evaluation server","year":"2015","author":"chen","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-2066"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K18-3013"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461761"},{"key":"ref4","article-title":"Soundnet: Learning sound representations from unlabeled video","author":"aytar","year":"2016","journal-title":"Proc NIPS"},{"key":"ref27","article-title":"Image2speech: Automatically generating audio descriptions of images","author":"hasegawa-johnson","year":"2017","journal-title":"Proc ICNLSP"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2517567"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1227"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s10590-017-9197-z"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404800"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.909282"},{"key":"ref1","article-title":"Toward spoken term discovery at scale with zero resources","author":"jansen","year":"2010","journal-title":"Proc Interspeech"},{"key":"ref9","article-title":"Unsupervised learning of spoken language with visual context","author":"harwath","year":"2016","journal-title":"Proc NIPS"},{"key":"ref20","article-title":"Rich feature hierarchies for accurate object detection and semantic segmentation","author":"girshick","year":"2013","journal-title":"Proc CVPR"},{"key":"ref22","doi-asserted-by":"crossref","DOI":"10.1609\/aaai.v31i1.11231","article-title":"Inception-v4,inception-resnet and the impact of residual connections on learning","author":"szegedy","year":"2017","journal-title":"Proc AAAI"},{"key":"ref21","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"Proc ICLR"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683587"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268967"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683275"},{"key":"ref25","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2017-502","article-title":"Visually grounded learning of keyword prediction from untranscribed speech","author":"kamper","year":"2017","journal-title":"Proc INTERSPEECH"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2020,5,4]]},"location":"Barcelona, Spain","end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09053428.pdf?arnumber=9053428","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,29]],"date-time":"2023-09-29T15:30:49Z","timestamp":1696001449000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9053428\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9053428","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}