{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:22:32Z","timestamp":1764588152487,"version":"build-2065373602"},"reference-count":57,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100013290","name":"National Key Research and Development Program of China Stem Cell and Translational Research","doi-asserted-by":"publisher","award":["2018AAA0102504"],"award-info":[{"award-number":["2018AAA0102504"]}],"id":[{"id":"10.13039\/501100013290","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/taslp.2021.3053391","type":"journal-article","created":{"date-parts":[[2021,1,22]],"date-time":"2021-01-22T23:14:21Z","timestamp":1611357261000},"page":"850-865","source":"Crossref","is-referenced-by-count":16,"title":["Generating Images From Spoken Descriptions"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1826-7419","authenticated-orcid":false,"given":"Xinsheng","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2045-1170","authenticated-orcid":false,"given":"Tingting","family":"Qiao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3081-8781","authenticated-orcid":false,"given":"Jihua","family":"Zhu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5771-2549","authenticated-orcid":false,"given":"Alan","family":"Hanjalic","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0693-8852","authenticated-orcid":false,"given":"Odette","family":"Scharenborg","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2872106"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K19-1006"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01245"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00766"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01060"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.629"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3067"},{"key":"ref36","first-page":"1858","article-title":"Unsupervised learning of spoken language with visual context","author":"harwath","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682553"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683868"},{"article-title":"Generative adversarial text to image synthesis","year":"2016","author":"reed","key":"ref28"},{"article-title":"Deep audio-visual learning: A survey","year":"2020","author":"zhu","key":"ref27"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00243"},{"article-title":"Conditional generative adversarial nets","year":"2014","author":"mirza","key":"ref2"},{"key":"ref1","first-page":"2672","article-title":"Generative adversarial nets","author":"goodfellow","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref20","first-page":"5265","article-title":"Face reconstruction from voice using generative adversarial networks","author":"wen","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00772"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682970"},{"article-title":"Speaker independent and multilingual\/mixlingual speech-driven talking head generation using phonetic posteriorgrams","year":"2020","author":"huang","key":"ref24"},{"article-title":"From inference to generation: End-to-end fully self-supervised generation of human face from speech","year":"2020","author":"choi","key":"ref23"},{"key":"ref26","first-page":"3586","article-title":"Dancing to music","author":"lee","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"article-title":"Everybody's talkin': Let me talk as you want","year":"2020","author":"song","key":"ref25"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682666"},{"key":"ref56","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"maaten","year":"2008","journal-title":"J Mach Learn Res"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1417"},{"key":"ref53","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"0","journal-title":"Proc IEEE Workshop on Automatic Speech Recognition and Understanding"},{"article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","year":"0","author":"heusel","key":"ref52"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2020.2987417"},{"journal-title":"Ethnologue Languages of the World","year":"0","author":"lewis","key":"ref11"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.2973896"},{"article-title":"The Caltech-UCSD birds-200-2011 Dataset","year":"2011","author":"wah","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"ref14","first-page":"487","article-title":"Learning deep features for scene recognition using places database","author":"zhou","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_40"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404800"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1759"},{"key":"ref19","first-page":"2234","article-title":"Improved techniques for training gans","author":"salimans","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2836316"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.310"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_26"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00507"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00160"},{"key":"ref7","first-page":"1947","article-title":"StackGAN: Realistic image synthesis with stacked generative adversarial networks","volume":"41","author":"zhang","year":"2018"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"article-title":"The LJ speech dataset","year":"2017","author":"ito","key":"ref47"},{"article-title":"Domain segmentation and adjustment for generalized zero-shot learning","year":"2020","author":"wang","key":"ref42"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.13"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00758"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01052"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9289074\/09333641.pdf?arnumber=9333641","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T14:54:01Z","timestamp":1652194441000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9333641\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":57,"URL":"https:\/\/doi.org\/10.1109\/taslp.2021.3053391","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"type":"print","value":"2329-9290"},{"type":"electronic","value":"2329-9304"}],"subject":[],"published":{"date-parts":[[2021]]}}}