{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T19:46:01Z","timestamp":1768074361939,"version":"3.49.0"},"reference-count":27,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,4]]},"DOI":"10.1109\/icassp49357.2023.10096517","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T13:28:30Z","timestamp":1683293310000},"page":"1-5","source":"Crossref","is-referenced-by-count":6,"title":["Visual Onoma-to-Wave: Environmental Sound Synthesis from Visual Onomatopoeias and Sound-Source Images"],"prefix":"10.1109","author":[{"given":"Hien","family":"Ohnaka","sequence":"first","affiliation":[{"name":"National Institute of Technology,Tokuyama College,Japan"}]},{"given":"Shinnosuke","family":"Takamichi","sequence":"additional","affiliation":[{"name":"The University of Tokyo,Japan"}]},{"given":"Keisuke","family":"Imoto","sequence":"additional","affiliation":[{"name":"Doshisha University,Japan"}]},{"given":"Yuki","family":"Okamoto","sequence":"additional","affiliation":[{"name":"Ritsumeikan University,Japan"}]},{"given":"Kazuki","family":"Fujii","sequence":"additional","affiliation":[{"name":"The University of Tokyo,Japan"}]},{"given":"Hiroshi","family":"Saruwatari","sequence":"additional","affiliation":[{"name":"The University of Tokyo,Japan"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.264"},{"key":"ref12","first-page":"55","article-title":"Vector-based representation and clustering of audio using onomatopoeia words","author":"sundaram","year":"2006","journal-title":"AAAI Fall Symposium Aurally Informed Performance"},{"key":"ref15","article-title":"vTTS: Visual-text to speech","author":"nakano","year":"2022","journal-title":"Proc SLT"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00374"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1121\/1.4861245"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356487"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683727"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1944745.1944755"},{"key":"ref17","first-page":"17 022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"kong","year":"2020","journal-title":"Proc NeurIPS"},{"key":"ref16","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","author":"ren","year":"2020","journal-title":"arXiv preprint arXiv 2006 04989"},{"key":"ref19","first-page":"47","article-title":"The word-final moraic obstruent in Japanese mimetics","volume":"11","author":"nasu","year":"2007","journal-title":"J Phonetic Soc Jpn"},{"key":"ref18","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021","journal-title":"Proc ICML"},{"key":"ref24","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"2017","journal-title":"arXiv preprint 1703 10135"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413880"},{"key":"ref26","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2020","journal-title":"arXiv preprint arXiv 2010 11419"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00102"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1250\/ast.20.225"},{"key":"ref22","year":"0","journal-title":"Hidden Markov Model Toolkit (HTK)"},{"key":"ref21","first-page":"125","article-title":"RWCP-SSD-Onomatopoeia: Onomatopoeic word dataset for environmental sound synthesis","author":"okamoto","year":"2020","journal-title":"Proc DCASE"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2019EDP7228"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1177\/1470357214541746"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_16"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-21607-8_11"},{"key":"ref4","article-title":"Diffsound: Discrete diffusion model for text-to-sound generation","author":"yang","year":"2022","journal-title":"arXiv preprint arXiv 2207 09983"},{"key":"ref3","first-page":"1","article-title":"Conditional sound generation using neural discrete time-frequency representation learning","author":"liu","year":"2021","journal-title":"Proc MLSP"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1561\/116.00000049"},{"key":"ref5","article-title":"AudioGen: Textually guided audio generation","author":"kreuk","year":"2022","journal-title":"arXiv preprint arXiv 2209 15352"}],"event":{"name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Rhodes Island, Greece","start":{"date-parts":[[2023,6,4]]},"end":{"date-parts":[[2023,6,10]]}},"container-title":["ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10094559\/10094560\/10096517.pdf?arnumber=10096517","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T14:09:46Z","timestamp":1700489386000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10096517\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,4]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/icassp49357.2023.10096517","relation":{},"subject":[],"published":{"date-parts":[[2023,6,4]]}}}