{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T20:51:45Z","timestamp":1758055905728,"version":"3.44.0"},"reference-count":90,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T00:00:00Z","timestamp":1751846400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T00:00:00Z","timestamp":1751846400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Institute of Information & communications Technology Planning & Evaluation, South Korea","award":["RS-2023-00229451","RS-2023-00229451"],"award-info":[{"award-number":["RS-2023-00229451","RS-2023-00229451"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00530-025-01872-9","type":"journal-article","created":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T09:01:03Z","timestamp":1751878863000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Face and voice cross-modal association with learning convex feature embedding"],"prefix":"10.1007","volume":"31","author":[{"given":"Taewan","family":"Kim","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiwoo","family":"Kang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,7,7]]},"reference":[{"issue":"4\u20136","key":"1872_CR1","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1080\/13506285.2017.1290729","volume":"25","author":"I B\u00fclthoff","year":"2017","unstructured":"B\u00fclthoff, I., Newell, F.N.: Crossmodal priming of unfamiliar faces supports early interactions between voices and faces in person perception. Vis. Cogn. 25(4\u20136), 611\u2013628 (2017)","journal-title":"Vis. Cogn."},{"issue":"6","key":"1872_CR2","doi-asserted-by":"publisher","first-page":"263","DOI":"10.1016\/j.tics.2013.04.004","volume":"17","author":"G Yovel","year":"2013","unstructured":"Yovel, G., Belin, P.: A unified coding strategy for processing faces and voices. Trends Cogn. Sci. 17(6), 263\u2013271 (2013)","journal-title":"Trends Cogn. Sci."},{"issue":"2","key":"1872_CR3","doi-asserted-by":"publisher","first-page":"1654","DOI":"10.1016\/j.neuroimage.2010.08.073","volume":"54","author":"F Joassin","year":"2011","unstructured":"Joassin, F., Maurage, P., Campanella, S.: The neural network sustaining the crossmodal processing of human gender from faces and voices: An fMRI study. Neuroimage 54(2), 1654\u20131661 (2011)","journal-title":"Neuroimage"},{"key":"1872_CR4","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027","volume":"60","author":"A Nagrani","year":"2020","unstructured":"Nagrani, A., Chung, J.S., Xie, W., Zisserman, A.: VoxCeleb: Large-scale speaker verification in the wild. Computer Speech & Language 60, 101027 (2020)","journal-title":"Computer Speech & Language"},{"key":"1872_CR5","doi-asserted-by":"crossref","unstructured":"Chen, G., Zhang, D., Liu, T., Du, X.: Self-lifting: A novel framework for unsupervised voice-face association learning. In: Proceedings of the International Conference on Multimedia Retrieval, pp. 527\u2013535. Association for Computing Machinery, New York, NY, USA (2022)","DOI":"10.1145\/3512527.3531364"},{"key":"1872_CR6","doi-asserted-by":"crossref","unstructured":"Wu, C.-Y., Hsu, C.-C., Neumann, U.: Cross-modal perceptionist: Can face geometry be gleaned from voices? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10452\u201310461. IEEE, New York, NY, USA (2022)","DOI":"10.1109\/CVPR52688.2022.01020"},{"key":"1872_CR7","doi-asserted-by":"crossref","unstructured":"Saeed, M.S., Khan, M.H., Nawaz, S., Yousaf, M.H., Del\u00a0Bue, A.: Fusion and orthogonal projection for improved face-voice association. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 7057\u20137061. IEEE, New York, NY, USA (2022)","DOI":"10.1109\/ICASSP43922.2022.9747704"},{"key":"1872_CR8","doi-asserted-by":"crossref","unstructured":"Wang, R., Liu, X., Cheung, Y.-m., Cheng, K., Wang, N., Fan, W.: Learning discriminative joint embeddings for efficient face and voice association. In: Proceedings of the ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1881\u20131884. Association for Computing Machinery, New York, NY, USA (2020)","DOI":"10.1145\/3397271.3401302"},{"key":"1872_CR9","doi-asserted-by":"crossref","unstructured":"Cheng, K., Liu, X., Cheung, Y.-m., Wang, R., Xu, X., Zhong, B.: Hearing like seeing: Improving voice-face interactions and associations via adversarial deep semantic matching network. In: Proceedings of the ACM International Conference on Multimedia, pp. 448\u2013455. Association for Computing Machinery, New York, NY, USA (2020)","DOI":"10.1145\/3394171.3413710"},{"key":"1872_CR10","doi-asserted-by":"crossref","unstructured":"Kim, C., Shin, H.V., Oh, T.-H., Kaspar, A., Elgharib, M., Matusik, W.: On learning associations of faces and voices. In: Proceedings of the Asian Conference on Computer Vision, pp. 276\u2013292. Springer, Berlin, Germany (2019)","DOI":"10.1007\/978-3-030-20873-8_18"},{"key":"1872_CR11","doi-asserted-by":"crossref","unstructured":"Nawaz, S., Janjua, M.K., Gallo, I., Mahmood, A., Calefati, A.: Deep latent space learning for cross-modal mapping of audio and visual signals. In: Proceedings of the Digital Image Computing: Techniques and Applications, pp. 1\u20137. IEEE, New York, NY, USA (2019)","DOI":"10.1109\/DICTA47822.2019.8945863"},{"key":"1872_CR12","doi-asserted-by":"publisher","unstructured":"Xiong, C., Zhang, D., Liu, T., Du, X.: Voice-face cross-modal matching and retrieval: A benchmark. arXiv (2019). https:\/\/doi.org\/10.48550\/arXiv.1911.09338 . arXiv:1911.09338","DOI":"10.48550\/arXiv.1911.09338"},{"key":"1872_CR13","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Learnable PINs: Cross-modal embeddings for person identity. In: Proceedings of the European Conference on Computer Vision, pp. 71\u201388. Springer, Berlin, Germany (2018)","DOI":"10.1007\/978-3-030-01261-8_5"},{"key":"1872_CR14","doi-asserted-by":"crossref","unstructured":"Horiguchi, S., Kanda, N., Nagamatsu, K.: Face-voice matching using cross-modal embeddings. In: Proceedings of the ACM International Conference on Multimedia, pp. 1011\u20131019. Association for Computing Machinery, New York, NY, USA (2018)","DOI":"10.1145\/3240508.3240601"},{"key":"1872_CR15","doi-asserted-by":"publisher","unstructured":"Wen, Y., Ismail, M.A., Liu, W., Raj, B., Singh, R.: Disjoint mapping network for cross-modal matching of voices and faces. arXiv (2018). https:\/\/doi.org\/10.48550\/arXiv.1807.04836 . arXiv:1807.04836","DOI":"10.48550\/arXiv.1807.04836"},{"key":"1872_CR16","volume-title":"Advances in Neural Information Processing Systems","author":"K Sohn","year":"2016","unstructured":"Sohn, K.: Improved deep metric learning with multi-class N-pair loss objective. In: Lee, D., Sugiyama, M., Luxburg, U., Guyon, I., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 29. Curran Associates Inc, New York, NY, USA (2016)"},{"key":"1872_CR17","doi-asserted-by":"crossref","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: FaceNet: A unified embedding for face recognition and clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 815\u2013823. IEEE, New York, NY, USA (2015)","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"1872_CR18","doi-asserted-by":"crossref","unstructured":"Hadsell, R., Chopra, S., LeCun, Y.: Dimensionality reduction by learning an invariant mapping. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, vol. 2, pp. 1735\u20131742. IEEE, New York, NY, USA (2006)","DOI":"10.1109\/CVPR.2006.100"},{"key":"1872_CR19","doi-asserted-by":"crossref","unstructured":"Wang, X., Han, X., Huang, W., Dong, D., Scott, M.R.: Multi-similarity loss with general pair weighting for deep metric learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5022\u20135030. IEEE, New York, NY, USA (2019)","DOI":"10.1109\/CVPR.2019.00516"},{"key":"1872_CR20","doi-asserted-by":"crossref","unstructured":"Morgado, P., Vasconcelos, N., Misra, I.: Audio-visual instance discrimination with cross-modal agreement. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12475\u201312486. IEEE, New York, NY, USA (2021)","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"1872_CR21","doi-asserted-by":"crossref","unstructured":"Zhu, B., Xu, K., Wang, C., Qin, Z., Sun, T., Wang, H., Peng, Y.: Unsupervised voice-face representation learning by cross-modal prototype contrast. In: Proceedings of the International Joint Conference on Artificial Intelligence, pp. 3787\u20133794. IJCAI Organization, California, USA (2022)","DOI":"10.24963\/ijcai.2022\/526"},{"key":"1872_CR22","doi-asserted-by":"crossref","unstructured":"Chen, G., Zhang, D., Liu, T., Du, X.: Local-global contrast for learning voice-face representations. In: In Proceeding of the IEEE International Conference on Image Processing, New York, NY, USA, pp. 51\u201355 (2023). IEEE","DOI":"10.1109\/ICIP49359.2023.10222130"},{"key":"1872_CR23","doi-asserted-by":"crossref","unstructured":"Wei, X., Wu, J., Chen, Y., Wang, Z., Li, B.: Multi-modality cross attention network for image and sentence matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10941\u201310950 (2020)","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"1872_CR24","doi-asserted-by":"crossref","unstructured":"Wei, X., Wu, J., Chen, Y., Wang, Z., Li, B.: Universal weighting metric learning for cross-modal matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1415\u20131424 (2020)","DOI":"10.1109\/CVPR42600.2020.01302"},{"key":"1872_CR25","doi-asserted-by":"crossref","unstructured":"Peng, D., Lei, Y., Li, W., Zhang, P., Guo, Y.: Sparse-to-dense feature matching: Intra and inter domain cross-modal learning in domain adaptation for 3D semantic segmentation. arXiv preprint arXiv:2107.14724 (2021)","DOI":"10.1109\/ICCV48922.2021.00702"},{"key":"1872_CR26","doi-asserted-by":"crossref","unstructured":"Kang, J., Kim, T., Park, Y.-h.: Convex feature embedding for face and voice association. In: Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 2342\u20132346 (2024)","DOI":"10.1145\/3626772.3657975"},{"key":"1872_CR27","doi-asserted-by":"crossref","unstructured":"Wen, P., Xu, Q., Jiang, Y., Yang, Z., He, Y., Huang, Q.: Seeking the shape of sound: An adaptive framework for learning voice-face association. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16347\u201316356. IEEE, New York, NY, USA (2021)","DOI":"10.1109\/CVPR46437.2021.01608"},{"issue":"4","key":"1872_CR28","doi-asserted-by":"publisher","first-page":"2073","DOI":"10.1007\/s00530-023-01097-8","volume":"29","author":"X Xu","year":"2023","unstructured":"Xu, X., Lv, G., Sun, Y., Hu, Y., Nian, F.: Hierarchical cross-modal contextual attention network for visual grounding. Multimedia Syst. 29(4), 2073\u20132083 (2023)","journal-title":"Multimedia Syst."},{"issue":"1","key":"1872_CR29","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1007\/s13735-023-00316-2","volume":"13","author":"Y Wei","year":"2024","unstructured":"Wei, Y., Zheng, L., Qiu, G., Cai, G.: Cross-modal retrieval based on shared proxies. Int. J. Multimed. Inf. Retr. 13(1), 5 (2024)","journal-title":"Int. J. Multimed. Inf. Retr."},{"issue":"2","key":"1872_CR30","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1007\/s00530-024-01317-9","volume":"30","author":"P Liu","year":"2024","unstructured":"Liu, P., Liu, X.: Complementary expert balanced learning for long-tail cross-modal retrieval. Multimedia Syst. 30(2), 113 (2024)","journal-title":"Multimedia Syst."},{"issue":"5","key":"1872_CR31","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s00530-024-01471-0","volume":"30","author":"Y Li","year":"2024","unstructured":"Li, Y., Tang, X., Lu, J., Huang, Y.: Dual graph-structured semantics multi-subspace learning for cross-modal retrieval. Multimedia Syst. 30(5), 1\u201314 (2024)","journal-title":"Multimedia Syst."},{"key":"1872_CR32","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xia, H., Liu, Y.: Cmc-mmr: multi-modal recommendation model with cross-modal correction. Journal of Intelligent Information Systems, 1\u201325 (2024)","DOI":"10.1007\/s10844-024-00848-x"},{"key":"1872_CR33","doi-asserted-by":"crossref","unstructured":"Lampert, C., Kroemer, O.: Weakly-paired maximum covariance analysis for multimodal dimensionality reduction and transfer learning. In: Proceedings of the European Conference on Computer Vision, pp. 566\u2013579. Springer, Berlin, Germany (2010)","DOI":"10.1007\/978-3-642-15552-9_41"},{"key":"1872_CR34","doi-asserted-by":"crossref","unstructured":"Kidron, E., Schechner, Y.Y., Elad, M.: Pixels that sound. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, vol. 1, pp. 88\u201395. IEEE, New York, NY, USA (2005)","DOI":"10.1109\/CVPR.2005.274"},{"key":"1872_CR35","doi-asserted-by":"crossref","unstructured":"Li, D., Dimitrova, N., Li, M., Sethi, I.K.: Multimedia content processing through cross-modal association. In: Proceedings of the ACM International Conference on Multimedia, pp. 604\u2013611. Association for Computing Machinery, New York, NY, USA (2003)","DOI":"10.1145\/957013.957143"},{"key":"1872_CR36","first-page":"1107","volume":"3","author":"K Barnard","year":"2003","unstructured":"Barnard, K., Duygulu, P., Forsyth, D., Freitas, N., Blei, D.M., Jordan, M.I.: Matching words and pictures. J. Mach. Learn. Res. 3, 1107\u20131135 (2003)","journal-title":"J. Mach. Learn. Res."},{"key":"1872_CR37","doi-asserted-by":"crossref","unstructured":"Duygulu, P., Barnard, K., Freitas, J.F., Forsyth, D.A.: Object recognition as machine translation: Learning a lexicon for a fixed image vocabulary. In: Proceedings of the European Conference on Computer Vision, pp. 97\u2013112. Springer, Berlin, Germany (2002)","DOI":"10.1007\/3-540-47979-1_7"},{"key":"1872_CR38","doi-asserted-by":"crossref","unstructured":"Gordo, A., Larlus, D.: Beyond instance-level image retrieval: Leveraging captions to learn a global visual representation for semantic retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6589\u20136598. IEEE, New York, NY, USA (2017)","DOI":"10.1109\/CVPR.2017.560"},{"key":"1872_CR39","doi-asserted-by":"publisher","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. arXiv (2014). https:\/\/doi.org\/10.48550\/arXiv.1411.2539 . arXiv:1411.2539","DOI":"10.48550\/arXiv.1411.2539"},{"key":"1872_CR40","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5005\u20135013. IEEE, New York, NY, USA (2016)","DOI":"10.1109\/CVPR.2016.541"},{"key":"1872_CR41","doi-asserted-by":"crossref","unstructured":"Chen, Z.-D., Yu, W.-J., Li, C.-X., Nie, L., Xu, X.-S.: Dual deep neural networks cross-modal hashing. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 274\u2013281. AAAI Press, Washington, DC, USA (2018)","DOI":"10.1609\/aaai.v32i1.11249"},{"key":"1872_CR42","doi-asserted-by":"crossref","unstructured":"Su, S., Zhong, Z., Zhang, C.: Deep joint-semantics reconstructing hashing for large-scale unsupervised cross-modal retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3027\u20133035. IEEE, New York, NY, USA (2019)","DOI":"10.1109\/ICCV.2019.00312"},{"key":"1872_CR43","doi-asserted-by":"publisher","first-page":"657","DOI":"10.1007\/s11280-018-0541-x","volume":"22","author":"X Xu","year":"2019","unstructured":"Xu, X., He, L., Lu, H., Gao, L., Ji, Y.: Deep adversarial metric learning for cross-modal retrieval. World Wide Web 22, 657\u2013672 (2019)","journal-title":"World Wide Web"},{"issue":"1","key":"1872_CR44","doi-asserted-by":"publisher","first-page":"174","DOI":"10.1109\/TMM.2019.2922128","volume":"22","author":"J Zhang","year":"2019","unstructured":"Zhang, J., Peng, Y.: Multi-pathway generative adversarial hashing for unsupervised cross-modal retrieval. IEEE Trans. Multimedia 22(1), 174\u2013187 (2019)","journal-title":"IEEE Trans. Multimedia"},{"issue":"5","key":"1872_CR45","doi-asserted-by":"publisher","first-page":"1276","DOI":"10.1109\/TMM.2018.2877127","volume":"21","author":"E Yu","year":"2018","unstructured":"Yu, E., Sun, J., Li, J., Chang, X., Han, X.-H., Hauptmann, A.G.: Adaptive semi-supervised feature selection for cross-modal retrieval. IEEE Trans. Multimedia 21(5), 1276\u20131288 (2018)","journal-title":"IEEE Trans. Multimedia"},{"issue":"5","key":"1872_CR46","doi-asserted-by":"publisher","first-page":"1261","DOI":"10.1109\/TMM.2018.2877122","volume":"21","author":"G Song","year":"2018","unstructured":"Song, G., Wang, D., Tan, X.: Deep memory network for cross-modal retrieval. IEEE Trans. Multimedia 21(5), 1261\u20131275 (2018)","journal-title":"IEEE Trans. Multimedia"},{"issue":"6","key":"1872_CR47","doi-asserted-by":"publisher","first-page":"2400","DOI":"10.1109\/TCYB.2019.2928180","volume":"50","author":"X Xu","year":"2019","unstructured":"Xu, X., Lu, H., Song, J., Yang, Y., Shen, H.T., Li, X.: Ternary adversarial networks with self-supervision for zero-shot cross-modal retrieval. IEEE Transactions on Cybernetics 50(6), 2400\u20132413 (2019)","journal-title":"IEEE Transactions on Cybernetics"},{"issue":"4","key":"1872_CR48","doi-asserted-by":"publisher","first-page":"1173","DOI":"10.1109\/TCSVT.2019.2900171","volume":"30","author":"J Chi","year":"2019","unstructured":"Chi, J., Peng, Y.: Zero-shot cross-media embedding learning with dual adversarial distribution network. IEEE Trans. Circuits Syst. Video Technol. 30(4), 1173\u20131187 (2019)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1872_CR49","doi-asserted-by":"crossref","unstructured":"Albanie, S., Nagrani, A., Vedaldi, A., Zisserman, A.: Emotion recognition in speech using cross-modal transfer in the wild. In: Proceedings of the International Conference on Multimedia Retrieval, pp. 292\u2013301. Association for Computing Machinery, New York, NY, USA (2018)","DOI":"10.1145\/3240508.3240578"},{"key":"1872_CR50","volume-title":"Advances in Neural Information Processing Systems","author":"Y Aytar","year":"2016","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: SoundNet: Learning sound representations from unlabeled video. In: Lee, D., Sugiyama, M., Luxburg, U., Guyon, I., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 29. Curran Associates Inc, New York, NY, USA (2016)"},{"key":"1872_CR51","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: Proceedings of the International Conference on Machine Learning, pp. 689\u2013696. Omnipress, Madison, WI, USA (2011)"},{"key":"1872_CR52","volume-title":"Advances in Neural Information Processing Systems","author":"N Srivastava","year":"2012","unstructured":"Srivastava, N., Salakhutdinov, R.R.: Multimodal learning with deep Boltzmann machines. In: Pereira, F., Burges, C.J., Bottou, L., Weinberger, K.Q. (eds.) Advances in Neural Information Processing Systems, vol. 25. Curran Associates Inc, New York, NY, USA (2012)"},{"key":"1872_CR53","unstructured":"Zhou, Y., Wang, J., Li, X., Zheng, Z., Wu, Q., Loy, C.C.: Video entailment via reaching a structure-aware cross-modal consensus. In: Proceedings of the 31st ACM International Conference on Multimedia (ACM MM), pp. 6786\u20136795 (2023)"},{"key":"1872_CR54","unstructured":"Zhang, Y., Li, Z., Wang, X., Wang, L., Yang, Y.: Learning probabilistic presence-absence evidence for weakly-supervised audio-visual event perception. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI) (2025)"},{"key":"1872_CR55","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1007\/s11633-021-1293-0","volume":"18","author":"H Zhu","year":"2021","unstructured":"Zhu, H., Luo, M.-D., Wang, R., Zheng, A.-H., He, R.: Deep audio-visual learning: A survey. Int. J. Autom. Comput. 18, 351\u2013376 (2021)","journal-title":"Int. J. Autom. Comput."},{"issue":"4","key":"1872_CR56","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2601097.2601119","volume":"33","author":"A Davis","year":"2014","unstructured":"Davis, A., Rubinstein, M., Wadhwa, N., Mysore, G.J., Durand, F., Freeman, W.T.: The visual microphone: Passive recovery of sound from video. ACM Transactions on Graphics 33(4), 1\u201310 (2014)","journal-title":"ACM Transactions on Graphics"},{"key":"1872_CR57","doi-asserted-by":"crossref","unstructured":"Wan, C.-H., Chuang, S.-P., Lee, H.-Y.: Towards audio to scene image synthesis using generative adversarial network. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 496\u2013500. IEEE, New York, NY, USA (2019)","DOI":"10.1109\/ICASSP.2019.8682383"},{"key":"1872_CR58","doi-asserted-by":"crossref","unstructured":"Chen, L., Srivastava, S., Duan, Z., Xu, C.: Deep cross-modal audio-visual generation. In: Proceedings of the on Thematic Workshops of ACM Multimedia, pp. 349\u2013357. Association for Computing Machinery, New York, NY, USA (2017)","DOI":"10.1145\/3126686.3126723"},{"key":"1872_CR59","doi-asserted-by":"crossref","unstructured":"Hao, W., Zhang, Z., Guan, H.: CMCGAN: A uniform framework for cross-modal visual-audio mutual generation. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 6886\u20136893. AAAI Press, Washington, DC, USA (2018)","DOI":"10.1609\/aaai.v32i1.12329"},{"key":"1872_CR60","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, Z., Fang, C., Bui, T., Berg, T.L.: Visual to sound: Generating natural sound for videos in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3550\u20133558. IEEE, New York, NY, USA (2018)","DOI":"10.1109\/CVPR.2018.00374"},{"key":"1872_CR61","doi-asserted-by":"crossref","unstructured":"Le\u00a0Cornu, T., Milner, B.: Reconstructing intelligible audio speech from visual speech features. In: Proceedings of the INTERSPEECH 2015, pp. 3355\u20133359. International Speech Communication Association, Baixas, France (2015)","DOI":"10.21437\/Interspeech.2015-139"},{"issue":"9","key":"1872_CR62","doi-asserted-by":"publisher","first-page":"1751","DOI":"10.1109\/TASLP.2017.2716178","volume":"25","author":"T Le Cornu","year":"2017","unstructured":"Le Cornu, T., Milner, B.: Generating intelligible audio speech from visual speech. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 25(9), 1751\u20131761 (2017)","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"1872_CR63","volume-title":"Advances in Neural Information Processing Systems","author":"J Hershey","year":"1999","unstructured":"Hershey, J., Movellan, J.: Audio vision: Using audio-visual synchrony to locate sounds. In: Solla, S., Leen, T., M\u00fcller, K. (eds.) Advances in Neural Information Processing Systems, vol. 12. MIT Press, Cambridge, MA, USA (1999)"},{"key":"1872_CR64","doi-asserted-by":"crossref","unstructured":"Gao, R., Feris, R., Grauman, K.: Learning to separate object sounds by watching unlabeled video. In: Proceedings of the European Conference on Computer Vision, pp. 35\u201353. Springer, Berlin, Germany (2018)","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"1872_CR65","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T.-H., Kim, J., Yang, M.-H., Kweon, I.S.: Learning to localize sound source in visual scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4358\u20134366. IEEE, New York, NY, USA (2018)","DOI":"10.1109\/CVPR.2018.00458"},{"key":"1872_CR66","doi-asserted-by":"crossref","unstructured":"Tian, Y., Shi, J., Li, B., Duan, Z., Xu, C.: Audio-visual event localization in unconstrained videos. In: Proceedings of the European Conference on Computer Vision, pp. 247\u2013263. Springer, Berlin, Germany (2018)","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"1872_CR67","doi-asserted-by":"crossref","unstructured":"Nawaz, S., Saeed, M.S., Morerio, P., Mahmood, A., Gallo, I., Yousaf, M.H., Del\u00a0Bue, A.: Cross-modal speaker verification and recognition: A multilingual perspective. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshop, pp. 1682\u20131691. IEEE, New York, NY, USA (2021)","DOI":"10.1109\/CVPRW53098.2021.00184"},{"key":"1872_CR68","doi-asserted-by":"crossref","unstructured":"Tao, R., Das, R.K., Li, H.: Audio-visual speaker recognition with a cross-modal discriminative network. In: Proceedings of the INTERSPEECH 2020, pp. 2242\u20132246. International Speech Communication Association, Baixas, France (2020)","DOI":"10.21437\/Interspeech.2020-1814"},{"key":"1872_CR69","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Seeing voices and hearing faces: Cross-modal biometric matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8427\u20138436. IEEE, New York, NY, USA (2018)","DOI":"10.1109\/CVPR.2018.00879"},{"key":"1872_CR70","doi-asserted-by":"publisher","first-page":"338","DOI":"10.1109\/TMM.2021.3050089","volume":"24","author":"A Zheng","year":"2021","unstructured":"Zheng, A., Hu, M., Jiang, B., Huang, Y., Yan, Y., Luo, B.: Adversarial-metric learning for audio-visual cross-modal matching. IEEE Trans. Multimedia 24, 338\u2013351 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"1872_CR71","doi-asserted-by":"crossref","unstructured":"Cao, Q., Shen, L., Xie, W., Parkhi, O.M., Zisserman, A.: VGGFace2: A dataset for recognising faces across pose and age. In: Proceedings of the IEEE International Conference on Automatic Face & Gesture Recognition, pp. 67\u201374. IEEE, New York, NY, USA (2018)","DOI":"10.1109\/FG.2018.00020"},{"key":"1872_CR72","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A.: Going deeper with convolutions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20139. IEEE, New York, NY, USA (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"1872_CR73","doi-asserted-by":"publisher","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A.: Deep face recognition. In: Proceedings of the British Machine Vision Conference, pp. 41\u201314112. British Machine Vision Association, Durham, UK (2015). https:\/\/doi.org\/10.5244\/C.29.41","DOI":"10.5244\/C.29.41"},{"key":"1872_CR74","doi-asserted-by":"publisher","unstructured":"Ravanelli, M., Parcollet, T., Plantinga, P., Rouhe, A., Cornell, S., Lugosch, L., Subakan, C., Dawalatabad, N., Heba, A., Zhong, J., Chou, J.-C., Yeh, S.-L., Fu, S.-W., Liao, C.-F., Rastorgueva, E., Grondin, F., Aris, W., Na, H., Gao, Y., De\u00a0Mori, R., Bengio, Y.: SpeechBrain: A general-purpose speech toolkit. arXiv (2021). https:\/\/doi.org\/10.48550\/arXiv.2106.04624 . arXiv:2106.04624","DOI":"10.48550\/arXiv.2106.04624"},{"key":"1872_CR75","doi-asserted-by":"crossref","unstructured":"Desplanques, B., Thienpondt, J., Demuynck, K.: ECAPA-TDNN: Emphasized channel attention, propagation and aggregation in TDNN based speaker verification. In: Proceedings of the INTERSPEECH 2020, pp. 3830\u20133834. International Speech Communication Association, Baixas, France (2020)","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"1872_CR76","doi-asserted-by":"crossref","unstructured":"Xie, W., Nagrani, A., Chung, J.S., Zisserman, A.: Utterance-level aggregation for speaker recognition in the wild. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 5791\u20135795. IEEE, New York, NY, USA (2019)","DOI":"10.1109\/ICASSP.2019.8683120"},{"key":"1872_CR77","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: Deep speaker recognition. In: Proceedings of the INTERSPEECH 2018, pp. 1086\u20131090. International Speech Communication Association, Baixas, France (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"1872_CR78","doi-asserted-by":"publisher","unstructured":"Musgrave, K., Belongie, S., Lim, S.-N.: PyTorch metric learning. arXiv (2020). https:\/\/doi.org\/10.48550\/arXiv.2008.09164 . arXiv:2008.09164","DOI":"10.48550\/arXiv.2008.09164"},{"key":"1872_CR79","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., Desmaison, A., Kopf, A., Yang, E., DeVito, Z., Raison, M., Tejani, A., Chilamkurthy, S., Steiner, B., Fang, L., Bai, J., Chintala, S.: PyTorch: An imperative style, high-performance deep learning library. In: Wallach, H., Larochelle, H., Beygelzimer, A., Alch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.) Proceedings of the Advances in Neural Information Processing Systems, vol. 32. Curran Associates, Inc., New York, NY, USA (2019)"},{"key":"1872_CR80","doi-asserted-by":"publisher","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. arXiv (2014). https:\/\/doi.org\/10.48550\/arXiv.1412.6980 . arXiv:1412.6980","DOI":"10.48550\/arXiv.1412.6980"},{"issue":"8","key":"1872_CR81","doi-asserted-by":"publisher","first-page":"861","DOI":"10.1016\/j.patrec.2005.10.010","volume":"27","author":"T Fawcett","year":"2006","unstructured":"Fawcett, T.: An introduction to ROC analysis. Pattern Recogn. Lett. 27(8), 861\u2013874 (2006)","journal-title":"Pattern Recogn. Lett."},{"key":"1872_CR82","doi-asserted-by":"crossref","unstructured":"Zheng, L., Shen, L., Tian, L., Wang, S., Wang, J., Tian, Q.: Scalable person re-identification: A benchmark. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 1116\u20131124 (2015)","DOI":"10.1109\/ICCV.2015.133"},{"key":"1872_CR83","doi-asserted-by":"crossref","unstructured":"Oh\u00a0Song, H., Xiang, Y., Jegelka, S., Savarese, S.: Deep metric learning via lifted structured feature embedding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4004\u20134012. IEEE, New York, NY, USA (2016)","DOI":"10.1109\/CVPR.2016.434"},{"key":"1872_CR84","doi-asserted-by":"crossref","unstructured":"Wen, Y., Zhang, K., Li, Z., Qiao, Y.: A discriminative feature learning approach for deep face recognition. In: Proceedings of the European Conference on Computer Vision, pp. 499\u2013515. Springer, Berlin, Germany (2016)","DOI":"10.1007\/978-3-319-46478-7_31"},{"issue":"11","key":"1872_CR85","first-page":"2579","volume":"9","author":"L Maaten","year":"2008","unstructured":"Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11), 2579\u20132605 (2008)","journal-title":"J. Mach. Learn. Res."},{"issue":"4","key":"1872_CR86","doi-asserted-by":"publisher","first-page":"983","DOI":"10.1090\/jams\/852","volume":"29","author":"C Fefferman","year":"2016","unstructured":"Fefferman, C., Mitter, S., Narayanan, H.: Testing the manifold hypothesis. J. Am. Math. Soc. 29(4), 983\u20131049 (2016)","journal-title":"J. Am. Math. Soc."},{"key":"1872_CR87","unstructured":"Wang, T., Isola, P.: Understanding contrastive representation learning through alignment and uniformity on the hypersphere. International Conference on Machine Learning (ICML), 9929\u20139939 (2020)"},{"key":"1872_CR88","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"issue":"4","key":"1872_CR89","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3197517.3201357","volume":"37","author":"A Ephrat","year":"2018","unstructured":"Ephrat, A., Mosseri, I., Lang, O., Dekel, T., Wilson, K., Hassidim, A., Freeman, W.T., Rubinstein, M.: Looking to listen at the cocktail party: A speaker-independent audio-visual model for speech separation. ACM Transactions on Graphics 37(4), 1\u201311 (2018)","journal-title":"ACM Transactions on Graphics"},{"key":"1872_CR90","doi-asserted-by":"crossref","unstructured":"Shor, J., Jansen, A., Han, W., Park, D., Zhang, Y.: Universal paralinguistic speech representations using self-supervised conformers. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 3169\u20133173 (2022). IEEE","DOI":"10.1109\/ICASSP43922.2022.9747197"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01872-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01872-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01872-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:03:54Z","timestamp":1757927034000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01872-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,7]]},"references-count":90,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["1872"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01872-9","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,7,7]]},"assertion":[{"value":"16 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}}],"article-number":"296"}}