{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:16:45Z","timestamp":1776881805916,"version":"3.51.2"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729850","type":"print"},{"value":"9783031729867","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T00:00:00Z","timestamp":1730505600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T00:00:00Z","timestamp":1730505600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72986-7_17","type":"book-chapter","created":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T05:08:25Z","timestamp":1730437705000},"page":"288-305","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Perceptual Evaluation of\u00a0Audio-Visual Synchrony Grounded in\u00a0Viewers\u2019 Opinion Scores"],"prefix":"10.1007","author":[{"given":"Lucas","family":"Goncalves","sequence":"first","affiliation":[]},{"given":"Prashant","family":"Mathur","sequence":"additional","affiliation":[]},{"given":"Chandrashekhar","family":"Lavania","sequence":"additional","affiliation":[]},{"given":"Metehan","family":"Cekic","sequence":"additional","affiliation":[]},{"given":"Marcello","family":"Federico","sequence":"additional","affiliation":[]},{"given":"Kyu J.","family":"Han","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,2]]},"reference":[{"key":"17_CR1","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: LRS3-TED: a large-scale dataset for visual speech recognition (2018)"},{"key":"17_CR2","doi-asserted-by":"publisher","unstructured":"Agarwal, M., et al: Findings of the IWSLT 2023 evaluation campaign. In: Proceedings of the 20th International Conference on Spoken Language Translation (IWSLT 2023), pp. 1\u201361. Association for Computational Linguistics, Toronto, Canada (in-person and online) (2023). https:\/\/doi.org\/10.18653\/v1\/2023.iwslt-1.1, https:\/\/aclanthology.org\/2023.iwslt-1.1","DOI":"10.18653\/v1\/2023.iwslt-1.1"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Look, listen and learn. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 609\u2013617 (2017). https:\/\/api.semanticscholar.org\/CorpusID:10769575","DOI":"10.1109\/ICCV.2017.73"},{"key":"17_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-030-01246-5_27","volume-title":"Computer Vision \u2013 ECCV 2018","author":"R Arandjelovi\u0107","year":"2018","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Objects that sound. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11205, pp. 451\u2013466. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_27"},{"key":"17_CR5","unstructured":"Binkowski, M., Sutherland, D.J., Arbel, M., Gretton, A.: Demystifying MMD GANs. In: 6th International Conference on Learning Representations, ICLR 2018, Vancouver, BC, Canada, 30April\u20133 May 2018, Conference Track Proceedings. OpenReview.net (2018). https:\/\/openreview.net\/forum?id=r1lUOzWCW"},{"key":"17_CR6","unstructured":"BT.1359, R.I.R.: BT.1359 - Relative timing of sound and vision for broadcasting (1998)"},{"key":"17_CR7","doi-asserted-by":"publisher","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4724\u20134733. IEEE Computer Society, Los Alamitos (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.502, https:\/\/doi.ieeecomputersociety.org\/10.1109\/CVPR.2017.502","DOI":"10.1109\/CVPR.2017.502"},{"key":"17_CR8","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Audio-visual synchronisation in the wild (2021)"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: VGGSound: a large-scale audio-visual dataset (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"17_CR10","doi-asserted-by":"publisher","unstructured":"Chinen, M., Lim, F.S.C., Skoglund, J., Gureev, N., O\u2019Gorman, F., Hines, A.: ViSQOL v3: an open source production ready objective speech and audio metric. In: 2020 Twelfth International Conference on Quality of Multimedia Experience (QoMEX), pp.\u00a01\u20136 (2020). https:\/\/doi.org\/10.1109\/QoMEX48832.2020.9123150","DOI":"10.1109\/QoMEX48832.2020.9123150"},{"key":"17_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Chen, C.-S., Lu, J., Ma, K.-K. (eds.) ACCV 2016. LNCS, vol. 10117, pp. 251\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"17_CR12","doi-asserted-by":"publisher","unstructured":"Clark, A.P., Howard, K.L., Woods, A.T., Penton-Voak, I.S., Neumann, C.: Why rate when you could compare? Using the \u201cEloChoice\u201d package to assess pairwise comparisons of perceived physical strength. PLOS ONE 13(1), e0190393 (Jan 2018). https:\/\/doi.org\/10.1371\/journal.pone.0190393, https:\/\/dx.plos.org\/10.1371\/journal.pone.0190393","DOI":"10.1371\/journal.pone.0190393"},{"key":"17_CR13","unstructured":"Committee, A.T.S.: ATSC implementation subcommittee finding: relative timing of sound and vision for broadcast operations. Technical report, Advanced Television Systems Committee, 1750 K Street, N.W., Suite 1200, Washington, D.C. 20006 (2003). Doc. IS-191"},{"key":"17_CR14","doi-asserted-by":"publisher","unstructured":"Dowson, D., Landau, B.: The fr\u00e9chet distance between multivariate normal distributions. J. Multivariate Anal. 12(3), 450\u2013455 (1982). https:\/\/doi.org\/10.1016\/0047-259X(82)90077-X, https:\/\/www.sciencedirect.com\/science\/article\/pii\/0047259X8290077X","DOI":"10.1016\/0047-259X(82)90077-X"},{"key":"17_CR15","doi-asserted-by":"publisher","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 776\u2013780 (2017). https:\/\/doi.org\/10.1109\/ICASSP.2017.7952261","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"17_CR16","doi-asserted-by":"publisher","unstructured":"Goncalves, L., Busso, C.: AuxFormer: robust approach to audiovisual emotion recognition. In: ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7357\u20137361 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747157","DOI":"10.1109\/ICASSP43922.2022.9747157"},{"key":"17_CR17","unstructured":"Hershey, J., Movellan, J.: Audio vision: using audio-visual synchrony to locate sounds. In: Solla, S., Leen, T., M\u00fcller, K. (eds.) Advances in Neural Information Processing Systems, vol.\u00a012. MIT Press (1999). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/1999\/file\/b618c3210e934362ac261db280128c22-Paper.pdf"},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Hershey, S., et al.: CNN architectures for large-scale audio classification (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning. In: EMNLP (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"17_CR20","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: Neural Information Processing Systems (2017)"},{"key":"17_CR21","unstructured":"Iashin, V., Xie, W., Rahtu, E., Zisserman, A.: Sparse in space and time: audio-visual synchronisation with trainable selectors. In: British Machine Vision Conference (BMVC) (2022)"},{"key":"17_CR22","unstructured":"ITU-T\u00a0RECOMMENDATION, P.: Subjective video quality assessment methods for multimedia applications (1999)"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Kilgour, K., Zuluaga, M., Roblek, D., Sharifi, M.: Fr\u00e9chet audio distance: a reference-free metric for evaluating music enhancement algorithms. In: Interspeech (2019). https:\/\/api.semanticscholar.org\/CorpusID:202725406","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"17_CR24","doi-asserted-by":"publisher","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 776\u2013780 (2017). https:\/\/doi.org\/10.1109\/ICASSP.2017.7952261","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"17_CR25","unstructured":"Krippendorff, K.: Content Analysis: An Introduction to Its Methodology, 2nd edn. Sage Publications (2004)"},{"key":"17_CR26","unstructured":"Luo, S., Yan, C., Hu, C., Zhao, H.: Diff-Foley: synchronized video-to-audio synthesis with latent diffusion models (2023)"},{"key":"17_CR27","doi-asserted-by":"publisher","unstructured":"Luo, Y., Mesgarani, N.: TaSNet: time-domain audio separation network for real-time, single-channel speech separation. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 696\u2013700 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8462116","DOI":"10.1109\/ICASSP.2018.8462116"},{"key":"17_CR28","doi-asserted-by":"publisher","unstructured":"Manocha, P., Finkelstein, A., Zhang, R., Bryan, N.J., Mysore, G.J., Jin, Z.: A differentiable perceptual audio metric learned from just noticeable differences. In: Interspeech 2020, pp. 2852\u20132856. ISCA (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-1191, https:\/\/www.isca-speech.org\/archive\/interspeech_2020\/manocha20_interspeech.html","DOI":"10.21437\/Interspeech.2020-1191"},{"key":"17_CR29","doi-asserted-by":"publisher","unstructured":"Manocha, P., et al.: DPLM: a deep perceptual spatial-audio localization metric. In: 2021 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), pp. 6\u201310 (2021). https:\/\/doi.org\/10.1109\/WASPAA52581.2021.9632781","DOI":"10.1109\/WASPAA52581.2021.9632781"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: VoxCeleb: a large-scale speaker identification dataset. In: INTERSPEECH (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"key":"17_CR31","doi-asserted-by":"publisher","unstructured":"Rix, A., Beerends, J., Hollier, M., Hekstra, A.: Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs. In: 2001 IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings (Cat. No.01CH37221), vol.\u00a02, pp. 749\u2013752 (2001). https:\/\/doi.org\/10.1109\/ICASSP.2001.941023","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"17_CR32","doi-asserted-by":"publisher","unstructured":"Roux, J.L., Wisdom, S., Erdogan, H., Hershey, J.R.: SDR \u2013 half-baked or well done? In: ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 626\u2013630 (2019). https:\/\/doi.org\/10.1109\/ICASSP.2019.8683855","DOI":"10.1109\/ICASSP.2019.8683855"},{"key":"17_CR33","doi-asserted-by":"publisher","unstructured":"Ruan, L., et al.: MM-diffusion: learning multi-modal diffusion models for joint audio and video generation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10219\u201310228. IEEE Computer Society, Los Alamitos (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00985, https:\/\/doi.ieeecomputersociety.org\/10.1109\/CVPR52729.2023.00985","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"17_CR34","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. In: Proceedings of the 30th International Conference on Neural Information Processing Systems, NIPS 2016, pp. 2234\u20132242. Curran Associates Inc., Red Hook, NY, USA (2016)"},{"issue":"7","key":"17_CR35","doi-asserted-by":"publisher","first-page":"2125","DOI":"10.1109\/TASL.2011.2114881","volume":"19","author":"CH Taal","year":"2011","unstructured":"Taal, C.H., Hendriks, R.C., Heusdens, R., Jensen, J.: An algorithm for intelligibility prediction of time-frequency weighted noisy speech. IEEE Trans. Audio Speech Lang. Process. 19(7), 2125\u20132136 (2011). https:\/\/doi.org\/10.1109\/TASL.2011.2114881","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Tsai, Y.H.H., Bai, S., Liang, P.P., Kolter, J.Z., Morency, L.P., Salakhutdinov, R.: Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Association for Computational Linguistics, Florence (2019)","DOI":"10.18653\/v1\/P19-1656"},{"key":"17_CR37","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: Towards accurate generative models of video: a new metric & challenges (2019)"},{"key":"17_CR38","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol.\u00a030. Curran Associates, Inc. (2017). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"issue":"4","key":"17_CR39","doi-asserted-by":"publisher","first-page":"1462","DOI":"10.1109\/TSA.2005.858005","volume":"14","author":"E Vincent","year":"2006","unstructured":"Vincent, E., Gribonval, R., Fevotte, C.: Performance measurement in blind audio source separation. IEEE Trans. Audio Speech Lang. Process. 14(4), 1462\u20131469 (2006). https:\/\/doi.org\/10.1109\/TSA.2005.858005","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"Wang, J., Fang, Z., Zhao, H.: AlignNet: a unifying approach to audio-visual alignment. In: WACV (2020)","DOI":"10.1109\/WACV45572.2020.9093345"},{"key":"17_CR41","doi-asserted-by":"publisher","unstructured":"Wang, Y., et al.: Rich features for perceptual quality assessment of UGC videos. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13430\u201313439 (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.01323","DOI":"10.1109\/CVPR46437.2021.01323"},{"key":"17_CR42","doi-asserted-by":"publisher","unstructured":"Wang, Z., Simoncelli, E., Bovik, A.: Multiscale structural similarity for image quality assessment. In: 2003 the Thrity-Seventh Asilomar Conference on Signals, Systems & Computers, vol.\u00a02, pp. 1398\u20131402 (2003). https:\/\/doi.org\/10.1109\/ACSSC.2003.1292216","DOI":"10.1109\/ACSSC.2003.1292216"},{"issue":"3","key":"17_CR43","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1109\/97.995823","volume":"9","author":"Z Wang","year":"2002","unstructured":"Wang, Z., Bovik, A.: A universal image quality index. IEEE Signal Process. Lett. 9(3), 81\u201384 (2002). https:\/\/doi.org\/10.1109\/97.995823","journal-title":"IEEE Signal Process. Lett."},{"key":"17_CR44","doi-asserted-by":"publisher","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00068","DOI":"10.1109\/CVPR.2018.00068"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72986-7_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T05:10:34Z","timestamp":1730437834000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72986-7_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,2]]},"ISBN":["9783031729850","9783031729867"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72986-7_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,2]]},"assertion":[{"value":"2 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}