{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T14:49:03Z","timestamp":1762094943103,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611779","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"2458-2467","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Rethinking Voice-Face Correlation: A Geometry View"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3282-1159","authenticated-orcid":false,"given":"Xiang","family":"Li","sequence":"first","affiliation":[{"name":"CMU, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6330-7438","authenticated-orcid":false,"given":"Yandong","family":"Wen","sequence":"additional","affiliation":[{"name":"MPI-IS, Munich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6273-0138","authenticated-orcid":false,"given":"Muqiao","family":"Yang","sequence":"additional","affiliation":[{"name":"CMU, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3222-6579","authenticated-orcid":false,"given":"Jinglu","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3743-0162","authenticated-orcid":false,"given":"Rita","family":"Singh","sequence":"additional","affiliation":[{"name":"CMU, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0038-5513","authenticated-orcid":false,"given":"Bhiksha","family":"Raj","sequence":"additional","affiliation":[{"name":"CMU &amp; MBZUAI, Pittsburgh, PA, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2017.2680467"},{"key":"e_1_3_2_1_2_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449--12460."},{"key":"e_1_3_2_1_3_1","volume-title":"Hugo Van hamme, and David A. van Leeuwen","author":"Bahari Mohamad Hasan","year":"2012","unstructured":"Mohamad Hasan Bahari, Mitchell McLaren, Hugo Van hamme, and David A. van Leeuwen. 2012. Age Estimation from Telephone Speech using i-vectors. In Interspeech."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311556"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2003.1227983"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1068\/p120223"},{"key":"e_1_3_2_1_7_1","first-page":"223","article-title":"b","volume":"12","author":"Bull R. H. C.","year":"1983","unstructured":"R. H. C. Bull, Harriet Rathborn, and Brian R. Clifford. 1983 b. The Voice-Recognition Accuracy of Blind Listeners. Perception, Vol. 12 (1983), 223--226.","journal-title":"The Voice-Recognition Accuracy of Blind Listeners. Perception"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Lele Chen Zhiheng Li Ross K Maddox Zhiyao Duan and Chenliang Xu. 2018. Lip movements generation at a glance. In ECCV. 520--535.","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Daniel Cudeiro Timo Bolkart Cassidy Laidlaw Anurag Ranjan and Michael J Black. 2019. Capture learning and synthesis of 3D speaking styles. In CVPR. 10101--10111.","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1097\/00001665-200403000-00027"},{"key":"e_1_3_2_1_11_1","unstructured":"Donya Ghafourzadeh Cyrus Rahgoshay Sahel Fallahdoust Adeline Aubame Andre Beauchamp Tiberiu Popa and Eric Paquette. 2019. Part-based 3D face morphable model with anthropometric local control. (2019)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.3634122"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1037\/1528-3542.7.2.377"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Joanna Grzybowska and Stanislaw Kacprzak. 2016. Speaker Age Classification and Regression Using i-Vectors.. In INTERSPEECH. 1402--1406.","DOI":"10.21437\/Interspeech.2016-1118"},{"key":"e_1_3_2_1_16_1","unstructured":"Yudong Guo Keyu Chen Sen Liang Yongjin Liu Hujun Bao and Juyong Zhang. 2021. AD-NeRF: Audio Driven Neural Radiance Fields for Talking Head Synthesis. In ICCV."},{"volume-title":"Exploring Automatic COVID-19 Diagnosis via voice and symptoms from Crowdsourced Data","author":"Han Jing","key":"e_1_3_2_1_17_1","unstructured":"Jing Han, Chlo\u00eb Brown, Jagmohan Chauhan, Andreas Grammenos, Apinan Hasthanasombat, Dimitris Spathis, Tong Xia, Pietro Cicuta, and Cecilia Mascolo. 2021. Exploring Automatic COVID-19 Diagnosis via voice and symptoms from Crowdsourced Data. In ICASSP. IEEE."},{"key":"e_1_3_2_1_18_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, Vol. 33 (2020), 6840--6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"e_1_3_2_1_20_1","volume-title":"What uncertainties do we need in bayesian deep learning for computer vision? Advances in neural information processing systems","author":"Kendall Alex","year":"2017","unstructured":"Alex Kendall and Yarin Gal. 2017. What uncertainties do we need in bayesian deep learning for computer vision? Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_21_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Sheng Li Dabre Raj Xugang Lu Peng Shen Tatsuya Kawahara and Hisashi Kawai. 2019a. Improving Transformer-Based Speech Recognition Systems with Compressed Structure and Speech Attributes Augmentation.. In INTERSPEECH. 4400--4404.","DOI":"10.21437\/Interspeech.2019-2112"},{"key":"e_1_3_2_1_23_1","unstructured":"Sheng Li Dabre Raj Xugang Lu Peng Shen Tatsuya Kawahara and Hisashi Kawai. 2019b. Improving Transformer-Based Speech Recognition Systems with Compressed Structure and Speech Attributes Augmentation. In Interspeech."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuropsychologia.2018.03.039"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"John D Markel Augustine H Gray and Augustine H Gray. 1976. Linear prediction of speech: Communication and cybernetics. (1976).","DOI":"10.1007\/978-3-642-66286-7"},{"key":"e_1_3_2_1_26_1","volume-title":"Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784","author":"Mirza Mehdi","year":"2014","unstructured":"Mehdi Mirza and Simon Osindero. 2014. Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784 (2014)."},{"key":"e_1_3_2_1_27_1","volume-title":"Pietro Morerio, Arif Mahmood, Ignazio Gallo, Muhammad Haroon Yousaf, and Alessio Del Bue.","author":"Nawaz Shah","year":"2021","unstructured":"Shah Nawaz, Muhammad Saad Saeed, Pietro Morerio, Arif Mahmood, Ignazio Gallo, Muhammad Haroon Yousaf, and Alessio Del Bue. 2021. Cross-Modal Speaker Verification and Recognition: A Multilingual Perspective. In CVPRW."},{"key":"e_1_3_2_1_28_1","volume-title":"Disentangled Representation Learning for Cross-modal Biometric Matching. TMM","author":"Ning Hailong","year":"2021","unstructured":"Hailong Ning, Xiangtao Zheng, Xiaoqiang Lu, and Yuan Yuan. 2021. Disentangled Representation Learning for Cross-modal Biometric Matching. TMM (2021)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00772"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1044\/jshr.0902.273"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1044\/jshr.0902.273"},{"key":"e_1_3_2_1_32_1","volume-title":"The Hidden Dance of Phonemes and Visage: Unveiling the Enigmatic Link between Phonemes and Facial Features. arXiv preprint arXiv:2307.13953","author":"Qu Liao","year":"2023","unstructured":"Liao Qu, Xianwei Zou, Xiang Li, Yandong Wen, Rita Singh, and Bhiksha Raj. 2023. The Hidden Dance of Phonemes and Visage: Unveiling the Enigmatic Link between Phonemes and Facial Features. arXiv preprint arXiv:2307.13953 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.187"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639585"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639585"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_37_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention--MICCAI 2015: 18th International Conference","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention--MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18. Springer, 234--241."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Leda Sari Kritika Singh Jiatong Zhou Lorenzo Torresani Nayan Singhal and Yatharth Saraf. 2021. A Multi-View Approach to Audio-Visual Speaker Verification. In ICASSP.","DOI":"10.1109\/ICASSP39728.2021.9414260"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-021-91579-4"},{"volume-title":"The relationship of voice onset time and voice offset time to physical age","author":"Singh Rita","key":"e_1_3_2_1_40_1","unstructured":"Rita Singh, Joseph Keshet, Deniz Gencaga, and Bhiksha Raj. 2016a. The relationship of voice onset time and voice offset time to physical age. In ICASSP. IEEE, 5390--5394."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIPRO.2016.7522354"},{"key":"e_1_3_2_1_42_1","volume-title":"Rohan Kumar Das, and Haizhou Li","author":"Tao Ruijie","year":"2020","unstructured":"Ruijie Tao, Rohan Kumar Das, and Haizhou Li. 2020. Audio-visual speaker recognition with a cross-modal discriminative network. In INTERSPEECH."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1121\/10.0002487"},{"volume-title":"Learning utterance-level representations for speech emotion and age\/gender recognition using deep neural networks","author":"Wang Zhong-Qiu","key":"e_1_3_2_1_44_1","unstructured":"Zhong-Qiu Wang and Ivan Tashev. 2017. Learning utterance-level representations for speech emotion and age\/gender recognition using deep neural networks. In ICASSP. IEEE, 5150--5154."},{"key":"e_1_3_2_1_45_1","unstructured":"Peisong Wen Qianqian Xu Yangbangyan Jiang Zhiyong Yang Yuan He and Qingming Huang. 2021. Seeking the Shape of Sound: An Adaptive Framework for Learning Voice-Face Association. In CVPR. 16347--16356."},{"key":"e_1_3_2_1_46_1","volume-title":"NeurIPS","volume":"32","author":"Wen Yandong","year":"2019","unstructured":"Yandong Wen, Bhiksha Raj, and Rita Singh. 2019. Face Reconstruction from Voice using Generative Adversarial Networks. In NeurIPS, Vol. 32."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Olivia Wiles A Koepke and Andrew Zisserman. 2018. X2face: A network for controlling face generation using images audio and pose codes. In ECCV. 670--686.","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01020"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1080\/14015439.2016.1273384"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1159\/000354939"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096807"},{"volume-title":"Attention-augmented end-to-end multi-task learning for emotion prediction from speech","author":"Zhang Zixing","key":"e_1_3_2_1_52_1","unstructured":"Zixing Zhang, Bingwen Wu, and Bj\u00f6rn Schuller. 2019a. Attention-augmented end-to-end multi-task learning for emotion prediction from speech. In ICASSP. IEEE, 6705--6709."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682896"},{"key":"e_1_3_2_1_54_1","volume-title":"Adversarial-metric learning for audio-visual cross-modal matching. TMM","author":"Zheng Aihua","year":"2021","unstructured":"Aihua Zheng, Menglan Hu, Bo Jiang, Yan Huang, Yan Yan, and Bin Luo. 2021. Adversarial-metric learning for audio-visual cross-modal matching. TMM (2021)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"e_1_3_2_1_56_1","first-page":"391","article-title":"Facial anthropometric differences among gender, ethnicity, and age groups","volume":"54","author":"Zhuang Ziqing","year":"2010","unstructured":"Ziqing Zhuang, Douglas Landsittel, Stacey Benson, Raymond Roberge, and Ronald Shaffer. 2010. Facial anthropometric differences among gender, ethnicity, and age groups. Annals of occupational hygiene, Vol. 54, 4 (2010), 391--402.","journal-title":"Annals of occupational hygiene"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611779","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611779","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:31Z","timestamp":1755820891000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611779"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":56,"alternative-id":["10.1145\/3581783.3611779","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611779","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}