{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T13:40:04Z","timestamp":1750858804760,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,18]]},"DOI":"10.1145\/3703323.3703325","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T12:03:28Z","timestamp":1750853008000},"page":"10-18","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Audio-Visual Speech Synthesis Leveraging Capsule-Enhanced Generative Adversarial Network"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2538-6768","authenticated-orcid":false,"given":"Subhayu","family":"Ghosh","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, National Institute of Technology Durgapur, Durgapur, West Bengal, IN"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0631-9912","authenticated-orcid":false,"given":"Nanda Dulal","family":"Jana","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, National Institute of Technology Durgapur, Durgapur, West Bengal, IN"}]}],"member":"320","published-online":{"date-parts":[[2025,6,25]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-48309-7_20"},{"key":"e_1_3_3_2_3_2","first-page":"11","volume-title":"AVSP 2013-12th International Conference on Auditory-Visual Speech Processing","author":"Barbulescu Adela","year":"2013","unstructured":"Adela Barbulescu, Thomas Hueber, Gerard Bailly, and Remi Ronfard. 2013. Audio-visual speaker conversion using prosody features. In AVSP 2013-12th International Conference on Auditory-Visual Speech Processing. 11\u201316."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Joon\u00a0Son Chung Amir Jamaludin and Andrew Zisserman. 2017. You said that? arXiv preprint arXiv:1705.02966 (2017).","DOI":"10.5244\/C.31.109"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"crossref","unstructured":"Antonia Creswell Tom White Vincent Dumoulin Kai Arulkumaran Biswa Sengupta and Anil\u00a0A Bharath. 2018. Generative adversarial networks: An overview. IEEE signal processing magazine 35 1 (2018) 53\u201365.","DOI":"10.1109\/MSP.2017.2765202"},{"key":"e_1_3_3_2_6_2","unstructured":"Kangle Deng Aayush Bansal and Deva Ramanan. 2020. Unsupervised audiovisual synthesis via exemplar autoencoders. arXiv preprint arXiv:2001.04463 (2020)."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960478"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383567"},{"key":"e_1_3_3_2_9_2","first-page":"303","volume-title":"International Conference on Advanced Network Technologies and Intelligent Computing","author":"Ghosh Subhayu","year":"2023","unstructured":"Subhayu Ghosh, Sandipan Dhar, and Nanda\u00a0Dulal Jana. 2023. A Comprehensive Analysis on Features and Performance Evaluation Metrics in Audio-Visual Voice Conversion. In International Conference on Advanced Network Technologies and Intelligent Computing. Springer, 303\u2013318."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN60899.2024.10651476"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Subhayu Ghosh Nanda\u00a0Dulal Jana Tapas Si Saurav Mallik and Mohd\u00a0Asif Shah. 2024. CCLCap-AE-AVSS: Cycle consistency loss based capsule autoencoders for audio\u2013visual speech synthesis. Journal of Intelligent Systems 33 1 (2024) 20230171.","DOI":"10.1515\/jisys-2023-0171"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Subhayu Ghosh Snehashis Sarkar Sovan Ghosh Frank Zalkow and Nanda\u00a0Dulal Jana. 2024. Audio-visual speech synthesis using vision transformer\u2013enhanced autoencoders with ensemble of loss functions. Applied Intelligence (2024) 1\u201318.","DOI":"10.1007\/s10489-024-05380-7"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Jie Gui Zhenan Sun Yonggang Wen Dacheng Tao and Jieping Ye. 2021. A review on generative adversarial networks: Algorithms theory and applications. IEEE transactions on knowledge and data engineering 35 4 (2021) 3313\u20133332.","DOI":"10.1109\/TKDE.2021.3130191"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Parth Gupta Rafael\u00a0E Banchs and Paolo Rosso. 2016. Squeezing bottlenecks: exploring the limits of autoencoder semantic representation capabilities. Neurocomputing 175 (2016) 1001\u20131008.","DOI":"10.1016\/j.neucom.2015.06.091"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Vishwa Gupta Patrick Kenny Pierre Ouellet Gilles Boulianne and Pierre Dumouchel. 2007. Combining gaussianized\/non-gaussianized features to improve speaker diarization of telephone conversations. IEEE Signal processing letters 14 12 (2007) 1040\u20131043.","DOI":"10.1109\/LSP.2007.905088"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Mahmood\u00a0Ul Haq Muhammad Athar\u00a0Javed Sethi and Atiq\u00a0Ur Rehman. 2023. Capsule Network with Its Limitation Modification and Applications\u2014A Survey. Machine Learning and Knowledge Extraction 5 3 (2023) 891\u2013921.","DOI":"10.3390\/make5030047"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Lei Huang Jie Qin Yi Zhou Fan Zhu Li Liu and Ling Shao. 2023. Normalization techniques in training dnns: Methodology analysis and application. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023).","DOI":"10.1109\/TPAMI.2023.3250241"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP.2018.8706604"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Takuhiro Kaneko Hirokazu Kameoka Kou Tanaka and Nobukatsu Hojo. 2019. Stargan-vc2: Rethinking conditional methods for stargan-based voice conversion. arXiv preprint arXiv:1907.12279 (2019).","DOI":"10.21437\/Interspeech.2019-2236"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.1997-654"},{"key":"e_1_3_3_2_23_2","first-page":"1428","volume-title":"Proceedings of the 27th ACM international conference on multimedia","author":"KR Prajwal","year":"2019","unstructured":"Prajwal KR, Rudrabha Mukhopadhyay, Jerin Philip, Abhishek Jha, Vinay Namboodiri, and CV Jawahar. 2019. Towards automatic face-to-face translation. In Proceedings of the 27th ACM international conference on multimedia. 1428\u20131436."},{"key":"e_1_3_3_2_24_2","unstructured":"Shaojie Li Jie Wu Xuefeng Xiao Fei Chao Xudong Mao and Rongrong Ji. 2021. Revisiting discriminator in GAN compression: A generator-discriminator cooperative compression scheme. Advances in Neural Information Processing Systems 34 (2021) 28560\u201328572."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Chen Liu Juntao Zhen and Wei Shan. 2023. Time series classification based on convolutional network with a Gated Linear Units kernel. Engineering Applications of Artificial Intelligence 123 (2023) 106296.","DOI":"10.1016\/j.engappai.2023.106296"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-19-0825-5_16"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Hao Meng Tianhao Yan Fei Yuan and Hongwei Wei. 2019. Speech emotion recognition from 3D log-mel spectrograms with deep learning network. IEEE access 7 (2019) 125868\u2013125881.","DOI":"10.1109\/ACCESS.2019.2938007"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Seyed\u00a0Hamidreza Mohammadi and Alexander Kain. 2017. An overview of voice conversion systems. Speech Communication 88 (2017) 65\u201382.","DOI":"10.1016\/j.specom.2017.01.008"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Arsha Nagrani Joon\u00a0Son Chung Weidi Xie and Andrew Zisserman. 2020. Voxceleb: Large-scale speaker verification in the wild. Computer Speech & Language Elsevier 60 (2020) 101027.","DOI":"10.1016\/j.csl.2019.101027"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683204"},{"key":"e_1_3_3_2_31_2","unstructured":"Marco Pasini. 2019. MelGAN-VC: Voice conversion and audio style transfer on arbitrarily long samples using spectrograms. arXiv preprint arXiv:1910.03713 (2019)."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/IJCB57857.2023.10449121"},{"key":"e_1_3_3_2_33_2","unstructured":"Sara Sabour Nicholas Frosst and Geoffrey\u00a0E Hinton. 2017. Dynamic routing between capsules. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855138"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688191"},{"key":"e_1_3_3_2_36_2","unstructured":"Joan Serr\u00e0 Santiago Pascual and Carlos Segura\u00a0Perales. 2019. Blow: a single-scale hyperconditioned flow for non-parallel raw-audio voice conversion. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Kalpana Seshadrinathan Rajiv Soundararajan Alan\u00a0Conrad Bovik and Lawrence\u00a0K Cormack. 2010. Study of subjective and objective quality assessment of video. IEEE transactions on Image Processing 19 6 (2010) 1427\u20131441.","DOI":"10.1109\/TIP.2010.2042111"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Pourya Shamsolmoali Masoumeh Zareapoor Swagatam Das Salvador Garcia Eric Granger and Jie Yang. 2022. GEN: Generative equivariant networks for diverse image-to-image translation. IEEE Transactions on Cybernetics 53 2 (2022) 874\u2013886.","DOI":"10.1109\/TCYB.2022.3166761"},{"key":"e_1_3_3_2_39_2","first-page":"31151","volume-title":"International Conference on Machine Learning (ICML)","author":"Shevchenko Aleksandr","year":"2023","unstructured":"Aleksandr Shevchenko, Kevin K\u00f6gler, Hamed Hassani, and Marco Mondelli. 2023. Fundamental limits of two-layer autoencoders, and achieving them with gradient methods. In International Conference on Machine Learning (ICML). PMLR, 31151\u201331209."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023162"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Berrak Sisman Junichi Yamagishi Simon King and Haizhou Li. 2020. An overview of voice conversion and its challenges: From statistical modeling to deep learning. IEEE\/ACM Transactions on Audio Speech and Language Processing 29 (2020) 132\u2013157.","DOI":"10.1109\/TASLP.2020.3038524"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Robert\u00a0C Streijl Stefan Winkler and David\u00a0S Hands. 2016. Mean opinion score (MOS) revisited: methods and applications limitations and alternatives. Multimedia Systems 22 2 (2016) 213\u2013227.","DOI":"10.1007\/s00530-014-0446-1"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Shinnosuke Takamichi Tomoki Toda Alan\u00a0W. Black Graham Neubig Sakriani Sakti and Satoshi Nakamura. 2016. Postfilters to Modify the Modulation Spectrum for Statistical Parametric Speech Synthesis. IEEE\/ACM Transactions on Audio Speech and Language Processing 24 4 (2016) 755\u2013767.","DOI":"10.1109\/TASLP.2016.2522655"},{"key":"e_1_3_3_2_44_2","unstructured":"Patrick\u00a0Lumban Tobing Yi-Chiao Wu Tomoki Hayashi Kazuhiro Kobayashi and Tomoki Toda. 2019. Non-parallel voice conversion with cyclic variational autoencoder. arXiv preprint arXiv:1907.10185 (2019)."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941046"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Tomasz Walczyna and Zbigniew Piotrowski. 2023. Overview of Voice Conversion Methods Based on Deep Learning. Applied Sciences 13 5 (2023) 3100.","DOI":"10.3390\/app13053100"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Kunfeng Wang Chao Gou Yanjie Duan Yilun Lin Xinhu Zheng and Fei-Yue Wang. 2017. Generative adversarial networks: introduction and outlook. IEEE\/CAA Journal of Automatica Sinica 4 4 (2017) 588\u2013598.","DOI":"10.1109\/JAS.2017.7510583"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Canqun Xiang Lu Zhang Yi Tang Wenbin Zou and Chen Xu. 2018. MS-CapsNet: A novel multi-scale capsule network. IEEE Signal Processing Letters 25 12 (2018) 1850\u20131854.","DOI":"10.1109\/LSP.2018.2873892"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Ziqi Zhang Zeyu Li Kun Wei Siduo Pan and Cheng Deng. 2022. A survey on multimodal-guided visual content synthesis. Neurocomputing 497 (2022) 110\u2013128.","DOI":"10.1016\/j.neucom.2022.04.126"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Tianming Zhao Yuanning Liu Guang Huo and Xiaodong Zhu. 2019. A deep learning iris recognition method based on capsule network architecture. IEEE Access 7 (2019) 49691\u201349701.","DOI":"10.1109\/ACCESS.2019.2911056"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","unstructured":"Hao Zhu Man-Di Luo Rui Wang Ai-Hua Zheng and Ran He. 2021. Deep audio-visual learning: A survey. International Journal of Automation and Computing 18 (2021) 351\u2013376.","DOI":"10.1007\/s11633-021-1293-0"}],"event":{"name":"CODS-COMAD 2024: 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)","location":"Jodhpur India","acronym":"CODS-COMAD Dec '24"},"container-title":["Proceedings of the 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3703323.3703325","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T13:01:32Z","timestamp":1750856492000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3703323.3703325"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,18]]},"references-count":50,"alternative-id":["10.1145\/3703323.3703325","10.1145\/3703323"],"URL":"https:\/\/doi.org\/10.1145\/3703323.3703325","relation":{},"subject":[],"published":{"date-parts":[[2024,12,18]]},"assertion":[{"value":"2025-06-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}