{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T15:35:37Z","timestamp":1780587337812,"version":"3.54.1"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T00:00:00Z","timestamp":1658102400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T00:00:00Z","timestamp":1658102400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,7,18]]},"DOI":"10.1109\/ijcnn55064.2022.9892863","type":"proceedings-article","created":{"date-parts":[[2022,9,30]],"date-time":"2022-09-30T19:56:04Z","timestamp":1664567764000},"page":"1-8","source":"Crossref","is-referenced-by-count":8,"title":["Audio-to-Image Cross-Modal Generation"],"prefix":"10.1109","author":[{"given":"Maciej","family":"Zelaszczyk","sequence":"first","affiliation":[{"name":"Science Warsaw University of Technology,Faculty of Mathematics and Information,Warsaw,Poland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jacek","family":"Mandziuk","sequence":"additional","affiliation":[{"name":"Science Warsaw University of Technology,Faculty of Mathematics and Information,Warsaw,Poland"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3009820"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_25"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00378"},{"key":"ref30","first-page":"15692","article-title":"Variational mixture-of-experts autoencoders for multi-modal deep generative 
models","author":"shi","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref36","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"ICLR"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3053391"},{"key":"ref34","article-title":"Sound-guided semantic image manipulation","author":"lee","year":"2021","journal-title":"NeurIPS Workshop CtrlGen Controllable Generative Modeling in Language and Vision"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.2478\/jaiscr-2022-0007"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1207\/s15516709cog0901_7"},{"key":"ref13","first-page":"318","author":"rumelhart","year":"1986","journal-title":"Learning Internal Representations by Error Propagation"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/BF00332918"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1002\/aic.690370209"},{"key":"ref16","first-page":"3","article-title":"Autoencoders, minimum description length and helmholtz free energy","author":"hinton","year":"1993","journal-title":"Proceedings of the 6th International Conference on Neural Information Processing"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390294"},{"key":"ref18","article-title":"Auto-encoding variational bayes","author":"kingma","year":"2014","journal-title":"Proceedings of the 2nd International Conference on Learning Representations (ICLR)"},{"key":"ref19","first-page":"6306","article-title":"Neural discrete representation learning","author":"van den oord","year":"2017","journal-title":"Advances in Neural Information Processing 
Systems"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12329"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-010-5198-3"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3126686.3126723"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.50"},{"key":"ref6","first-page":"4848","article-title":"Adaptive cross-modal few-shot learning","author":"xing","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref29","author":"hsu","year":"2018","journal-title":"Disentangling by partitioning A representation learning framework for multimodal sensory data"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref8","article-title":"Disjoint mapping network for cross-modal matching of voices and faces","author":"wen","year":"2019","journal-title":"International Conference on Learning Representations"},{"key":"ref7","first-page":"10791","article-title":"Cross-modal learning with adversarial samples","author":"li","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1992.4.6.863"},{"key":"ref9","first-page":"13","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"lu","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref1","first-page":"77","author":"hinton","year":"1986","journal-title":"Distributed Representations"},{"key":"ref20","first-page":"14837","article-title":"Generating diverse high-fidelity images with vq-vae-2","author":"razavi","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref22","article-title":"Unsupervised representation learning with deep convolutional generative adversarial networks","author":"radford","year":"2016","journal-title":"International 
Conference on Learning Representations (ICLR)"},{"key":"ref21","first-page":"2672","article-title":"Generative adversarial nets","author":"goodfellow","year":"2014","journal-title":"Advances in Neural Information Processing Systems 27"},{"key":"ref24","article-title":"Spectral normalization for generative adversarial networks","author":"miyato","year":"2018","journal-title":"International Conference on Learning Representations"},{"key":"ref23","article-title":"Wasserstein gan","author":"arjovsky","year":"2017","journal-title":"International Conference on Machine Learning"},{"key":"ref26","article-title":"Generative adversarial text-to-image synthesis","author":"reed","year":"2016","journal-title":"Proceedings of the 33rd International Conference on Machine Learning"},{"key":"ref25","author":"oord","year":"2016","journal-title":"WaveNet A Generative Model for Raw Audio"}],"event":{"name":"2022 International Joint Conference on Neural Networks (IJCNN)","location":"Padua, Italy","start":{"date-parts":[[2022,7,18]]},"end":{"date-parts":[[2022,7,23]]}},"container-title":["2022 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9891857\/9889787\/09892863.pdf?arnumber=9892863","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,14]],"date-time":"2022-10-14T20:52:59Z","timestamp":1665780779000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9892863\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7,18]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/ijcnn55064.2022.9892863","relation":{},"subject":[],"published":{"date-parts":[[2022,7,18]]}}}