{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T04:28:23Z","timestamp":1730262503924,"version":"3.28.0"},"reference-count":32,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,10,31]],"date-time":"2021-10-31T00:00:00Z","timestamp":1635638400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,10,31]],"date-time":"2021-10-31T00:00:00Z","timestamp":1635638400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,10,31]]},"DOI":"10.1109\/ieeeconf53345.2021.9723367","type":"proceedings-article","created":{"date-parts":[[2022,3,4]],"date-time":"2022-03-04T20:26:46Z","timestamp":1646425606000},"page":"1419-1425","source":"Crossref","is-referenced-by-count":0,"title":["A Translation Framework for Visually Grounded Spoken Unit Discovery"],"prefix":"10.1109","author":[{"given":"Liming","family":"Wang","sequence":"first","affiliation":[{"name":"University of Illinois at Urbana-Champaign,Department of Electrical and Computer Engineering"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mark","family":"Hasegawa-Johnson","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign,Department of Electrical and Computer Engineering"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268953"},{"key":"ref31","article-title":"The DARPA TIMIT Acoustic-Phonetic Continuous Speech Corpus CDROM","author":"garofolo","year":"1993","journal-title":"Linguistic Data Consortium"},{"key":"ref30","first-page":"5206","article-title":"Librispeech: an ASR corpus based on public domain audio books","author":"vassil","year":"2015","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP)"},{"key":"ref10","article-title":"Learning hierarchical discrete linguistic units from visually-grounded speech","author":"harwath","year":"2020","journal-title":"International Conference on Learning Representations"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/GLU.2017-9"},{"key":"ref12","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"Advances in Neural Information Processing Systems 28 (NIPS 2015)"},{"key":"ref13","first-page":"263","article-title":"The mathematics of statistical machine translation: parameter estimation","volume":"19","author":"brown","year":"1993","journal-title":"Computational Linguistics"},{"key":"ref14","first-page":"1","article-title":"Maximum likelihood from incomplete data via the EM algorithm","volume":"39","author":"dempster","year":"1977","journal-title":"Journal of the Royal Statistical Society"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1148"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2398"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_40"},{"key":"ref18","article-title":"A hierarchical subspace model for language-attuned acoustic unit discovery","author":"yusuf","year":"2020","journal-title":"CoRR"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1664"},{"key":"ref28","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"European Conference on Computer Vision"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/S0020-0255(03)00167-1"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0965-7"},{"key":"ref3","article-title":"The role of sensorimotor function, associative memory and reinforcement learning in automatic acquisition of spoken language by an autonomous robot","author":"levinson","year":"2000","journal-title":"Joint NSF DARPA Workshop on Development and Learning"},{"key":"ref6","article-title":"Collecting image annotations using Amazon&#x2019;s mechanical turk","author":"rashtchian","year":"2010","journal-title":"Proceedings of the NAACL HLT 2010 Workshop on Creating Speech and Language Data with Amazon&#x2019;s Mechanical Turk"},{"key":"ref29","article-title":"Framing image description as a ranking task: data, models and evaluation metrics","author":"hodosh","year":"2010","journal-title":"Journal of Artificial Intelligence Research"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404800"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1047"},{"key":"ref7","article-title":"Unsupervised learning of spoken language with visual context","author":"harwath","year":"2016","journal-title":"Neural Information Processing Systems"},{"key":"ref2","first-page":"1","article-title":"A computational model of word learning from multimodal sensory input","author":"roy","year":"2000","journal-title":"Proceedings of the international conference of cognitive modeling"},{"key":"ref9","article-title":"Vision as an interlingua: Learning multilingual semantic embeddings of untranscribed speech","author":"harwath","year":"2018","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"journal-title":"Verbal Behavior","year":"1992","author":"skinner","key":"ref1"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K19-1006"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414418"},{"key":"ref23","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Neural Information Processing Systems (NIPS)"},{"key":"ref26","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"Proc International Conference on Learning Representations (ICLR)"},{"key":"ref25","article-title":"The Zero Resource Speech Benchmark 2021: Metrics and baselines for unsupervised spoken language modeling","author":"nguyen","year":"2020","journal-title":"Workshop on Self-Supervised Learning for Speech and Audio Processing NeurIPS"}],"event":{"name":"2021 55th Asilomar Conference on Signals, Systems, and Computers","start":{"date-parts":[[2021,10,31]]},"location":"Pacific Grove, CA, USA","end":{"date-parts":[[2021,11,3]]}},"container-title":["2021 55th Asilomar Conference on Signals, Systems, and Computers"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9723034\/9723086\/09723367.pdf?arnumber=9723367","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,15]],"date-time":"2022-06-15T20:16:07Z","timestamp":1655324167000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9723367\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,31]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/ieeeconf53345.2021.9723367","relation":{},"subject":[],"published":{"date-parts":[[2021,10,31]]}}}