{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:54:11Z","timestamp":1772906051579,"version":"3.50.1"},"reference-count":58,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61961130392"],"award-info":[{"award-number":["61961130392"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61632001"],"award-info":[{"award-number":["61632001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. 
Signal Process."],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1109\/jstsp.2020.2987417","type":"journal-article","created":{"date-parts":[[2020,4,14]],"date-time":"2020-04-14T21:23:12Z","timestamp":1586899392000},"page":"517-529","source":"Crossref","is-referenced-by-count":29,"title":["Direct Speech-to-Image Translation"],"prefix":"10.1109","volume":"14","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1447-4798","authenticated-orcid":false,"given":"Jiguo","family":"Li","sequence":"first","affiliation":[]},{"given":"Xinfeng","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7418-6245","authenticated-orcid":false,"given":"Chuanmin","family":"Jia","sequence":"additional","affiliation":[]},{"given":"Jizheng","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2118-4876","authenticated-orcid":false,"given":"Li","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yue","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2731-5403","authenticated-orcid":false,"given":"Siwei","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Wen","family":"Gao","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461870"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639201"},{"key":"ref30","article-title":"Distilling the knowledge in a neural network","author":"hinton","year":"0","journal-title":"Deep Learning and Representation Learning Workshop 
NIPS"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1109"},{"key":"ref36","article-title":"Listen and translate: A proof of concept for end-to-end speech-to-text translation","author":"b\u00e9rard","year":"0","journal-title":"CoRR"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref34","article-title":"Enhanced speech-to-speech translation system and methods for adding a new word","author":"waibel","year":"2015"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639635"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462396"},{"key":"ref29","first-page":"2654","article-title":"Do deep nets really need to be deep","author":"ba","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.2307\/1129079"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1113380109"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.267"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540112"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404800"},{"key":"ref26","first-page":"1053","article-title":"Word embeddings for speech recognition","author":"bengio","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref50","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","author":"ar\u0131k","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref51","article-title":"Deep voice 3: Scaling text-to-speech with convolutional sequence learning","author":"ping","year":"0","journal-title":"Proc Intl Conf on Learning 
Representations"},{"key":"ref58","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in english and mandarin","author":"amodei","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref57","article-title":"Deep speech: Scaling up end-to-end speech recognition","author":"hannun","year":"2014","journal-title":"CoRR"},{"key":"ref56","first-page":"6626","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","author":"heusel","year":"0","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/0047-259X(82)90077-X"},{"key":"ref54","first-page":"2234","article-title":"Improved techniques for training gans","author":"salimans","year":"0","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-99579-3_21"},{"key":"ref10","first-page":"1060","article-title":"Generative adversarial text to image synthesis","author":"reed","year":"0","journal-title":"Proc 33rd Int Conf Mach Learn"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3126686.3126723"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"ref12","first-page":"6886","article-title":"Cmcgan: A uniform framework for cross-modal visual-audio mutual generation","author":"hao","year":"0","journal-title":"Proc 32nd AAAI Conf Artif Intell"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00772"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682970"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1047"},{"key":"ref16","first-page":"1123","article-title":"Direct speech-to-speech translation with a sequence-to-sequence model","author":"jia","year":"0","journal-title":"Proc 
Annu Conf Int Speech Commun Assoc INTERSPEECH"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1951"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2759726"},{"key":"ref19","first-page":"6626","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","author":"heusel","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2856256"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.629"},{"key":"ref6","volume":"32","author":"tannen","year":"1982","journal-title":"Spoken and written language exploring orality and literacy"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.15"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.13"},{"key":"ref49","first-page":"2962","article-title":"Multi-speaker neural text-to-speech","author":"gibiansky","year":"0","journal-title":"Proc Adv Neural Inform Process Syst 30 Ann Conf Neural Inform Process Syst"},{"key":"ref9","first-page":"2672","article-title":"Generative adversarial nets","author":"goodfellow","year":"0","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.42"},{"key":"ref45","article-title":"The Caltech-UCSD Birds-200-2011 dataset","author":"wah","year":"2011"},{"key":"ref48","first-page":"1858","article-title":"Unsupervised learning of spoken language with visual context","author":"harwath","year":"0","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref47","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"lin","year":"0","journal-title":"Proc Eur Conf Comput 
Vision"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref41","article-title":"Gradient flow in recurrent nets: the difficulty of learning long-term dependencies","author":"hochreiter","year":"2001","journal-title":"A Field Guide to Dynamical Recurrent Networks"},{"key":"ref44","first-page":"487","article-title":"Learning deep features for scene recognition using places database","author":"zhou","year":"0","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref43","article-title":"Conditional generative adversarial nets","author":"mirza","year":"0","journal-title":"CoRR"}],"container-title":["IEEE Journal of Selected Topics in Signal Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/4200690\/9126272\/09067083.pdf?arnumber=9067083","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T17:08:26Z","timestamp":1651079306000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9067083\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3]]},"references-count":58,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2020.2987417","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,3]]}}}