{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,24]],"date-time":"2026-07-24T14:51:49Z","timestamp":1784904709782,"version":"3.55.0"},"publisher-location":"Cham","reference-count":58,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585228","type":"print"},{"value":"9783030585235","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58523-5_13","type":"book-chapter","created":{"date-parts":[[2020,12,3]],"date-time":"2020-12-03T20:13:16Z","timestamp":1607026396000},"page":"208-224","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":157,"title":["Self-supervised Learning of Audio-Visual Objects from Video"],"prefix":"10.1007","author":[{"given":"Triantafyllos","family":"Afouras","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Owens","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2020,12,4]]},"reference":[{"key":"13_CR1","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE PAMI (2019)"},{"key":"13_CR2","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: The conversation: deep audio-visual speech enhancement. In: INTERSPEECH (2018)","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"13_CR3","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: LRS3-TED: a large-scale dataset for visual speech recognition. In: arXiv preprint arXiv:1809.00496 (2018)"},{"key":"13_CR4","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: My lips are concealed: audio-visual speech enhancement through obstructions. In: INTERSPEECH (2019)","DOI":"10.21437\/Interspeech.2019-3114"},{"key":"13_CR5","doi-asserted-by":"crossref","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Look, listen and learn. In: Proceedings of ICCV (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"13_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-030-01246-5_27","volume-title":"Computer Vision \u2013 ECCV 2018","author":"R Arandjelovi\u0107","year":"2018","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Objects that sound. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11205, pp. 451\u2013466. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_27"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Barzelay, Z., Schechner, Y.Y.: Harmony in motion. In: 2007 IEEE Conference on Computer Vision and Pattern Recognition (2007)","DOI":"10.1109\/CVPR.2007.383344"},{"key":"13_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"285","DOI":"10.1007\/978-3-319-46454-1_18","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Chakravarty","year":"2016","unstructured":"Chakravarty, P., Tuytelaars, T.: Cross-modal supervision for learning active speaker detection in video. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 285\u2013301. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_18"},{"key":"13_CR9","doi-asserted-by":"crossref","unstructured":"Chatfield, K., Simonyan, K., Vedaldi, A., Zisserman, A.: Return of the devil in the details: Delving deep into convolutional nets. arXiv preprint arXiv:1405.3531 (2014)","DOI":"10.5244\/C.28.6"},{"key":"13_CR10","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. ICML (2020)"},{"key":"13_CR11","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Lee, B.J., Han, I.: Who said that?: Audio-visual speaker diarisation of real-world meetings. In: Interspeech (2019)","DOI":"10.21437\/Interspeech.2019-3116"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: deep speaker recognition. In: INTERSPEECH (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"13_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Chen, C.-S., Lu, J., Ma, K.-K. (eds.) ACCV 2016. LNCS, vol. 10117, pp. 251\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"13_CR14","unstructured":"Chung, J.S., Zisserman, A.: Signs in time: encoding human motion as a temporal image. In: Workshop on Brave New Ideas for Motion Representations, ECCV (2016)"},{"key":"13_CR15","doi-asserted-by":"crossref","unstructured":"Chung, S.W., Chung, J.S., Kang, H.G.: Perfect match: improved cross-modal embeddings for audio-visual synchronisation. In: Proceedings of ICASSP, pp. 3965\u20133969. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682524"},{"key":"13_CR16","doi-asserted-by":"crossref","unstructured":"Cutler, R., Davis, L.: Look who\u2019s talking: speaker detection using video and audio correlation. In: 2000 IEEE International Conference on Multimedia and Expo. ICME 2000. Proceedings. Latest Advances in the Fast Changing World of Multimedia (Cat. No. 00TH8532), vol. 3, pp. 1589\u20131592. IEEE (2000)","DOI":"10.1109\/ICME.2000.871073"},{"key":"13_CR17","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Yuxiang, Z., Yu, J., Kotsia, I., Zafeiriou, S.: Retinaface: Single-stage dense face localisation in the wild. In: arxiv (2019)","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"13_CR18","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: Proceedings of ICCV, pp. 1422\u20131430 (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"13_CR19","doi-asserted-by":"crossref","unstructured":"Dutta, A., Zisserman, A.: The VIA annotation software for images, audio and video. In: Proceedings of the 27th ACM International Conference on Multimedia. MM 2019. ACM, New York (2019)","DOI":"10.1145\/3343031.3350535"},{"issue":"4","key":"13_CR20","doi-asserted-by":"publisher","first-page":"112","DOI":"10.1145\/3197517.3201357","volume":"37","author":"A Ephrat","year":"2018","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. ACM Trans. Graph. (TOG) 37(4), 112 (2018)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"13_CR21","unstructured":"F\u00e9votte, C., Gribonval, R., Vincent, E.: BSS EVAL toolbox user guide. IRISA Technical Report 1706 (2005). http:\/\/www.irisa.fr\/metiss\/bsseval\/"},{"key":"13_CR22","unstructured":"Fisher III, J.W., Darrell, T., Freeman, W.T., Viola, P.A.: Learning joint statistical models for audio-visual fusion and segregation. In: NeurIPS (2000)"},{"key":"13_CR23","doi-asserted-by":"crossref","unstructured":"Gabbay, A., Ephrat, A., Halperin, T., Peleg, S.: Seeing through noise: visually driven speaker separation and enhancement. In: Proceedings of ICASSP, pp. 3051\u20133055. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462527"},{"key":"13_CR24","doi-asserted-by":"crossref","unstructured":"Gadde, R., Jampani, V., Gehler, P.V.: Semantic video CNNs through representation warping. In: Proceedings of ICCV, pp. 4463\u20134472 (2017)","DOI":"10.1109\/ICCV.2017.477"},{"key":"13_CR25","doi-asserted-by":"crossref","unstructured":"Gan, C., Zhao, H., Chen, P., Cox, D., Torralba, A.: Self-supervised moving vehicle tracking with stereo sound. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 7053\u20137062 (2019)","DOI":"10.1109\/ICCV.2019.00715"},{"key":"13_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1007\/978-3-030-01219-9_3","volume-title":"Computer Vision \u2013 ECCV 2018","author":"R Gao","year":"2018","unstructured":"Gao, R., Feris, R., Grauman, K.: Learning to separate object sounds by watching unlabeled video. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11207, pp. 36\u201354. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01219-9_3"},{"key":"13_CR27","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: 2.5D visual sound. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00041"},{"key":"13_CR28","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: Co-separating sounds of visual objects. arXiv preprint arXiv:1904.07750 (2019)","DOI":"10.1109\/ICCV.2019.00398"},{"key":"13_CR29","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Video representation learning by dense predictive coding. In: Workshop on Large Scale Holistic Video Understanding, ICCV (2019)","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"13_CR30","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Memory-augmented dense predictive coding for video representation learning. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58580-8_19"},{"key":"13_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"659","DOI":"10.1007\/978-3-030-01231-1_40","volume-title":"Computer Vision \u2013 ECCV 2018","author":"D Harwath","year":"2018","unstructured":"Harwath, D., Recasens, A., Sur\u00eds, D., Chuang, G., Torralba, A., Glass, J.: Jointly discovering visual objects and spoken words from raw sensory input. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 659\u2013677. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_40"},{"key":"13_CR32","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"13_CR33","unstructured":"H\u00e9naff, O.J., et al.: Data-efficient image recognition with contrastive predictive coding. In: ICML (2020)"},{"key":"13_CR34","unstructured":"Hershey, J., Movellan, J.: Audio-vision: locating sounds via audio-visual synchrony. In: NeurIPS, vol. 12 (1999)"},{"key":"13_CR35","doi-asserted-by":"crossref","unstructured":"Hu, D., Nie, F., Li, X.: Deep multimodal clustering for unsupervised audiovisual learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2019","DOI":"10.1109\/CVPR.2019.00947"},{"key":"13_CR36","unstructured":"Hu, D., Wang, Z., Xiong, H., Wang, D., Nie, F., Dou, D.: Curriculum audiovisual learning. arXiv preprint arXiv:2001.09414 (2020)"},{"issue":"2","key":"13_CR37","doi-asserted-by":"publisher","first-page":"378","DOI":"10.1109\/TMM.2012.2228476","volume":"15","author":"H Izadinia","year":"2012","unstructured":"Izadinia, H., Saleemi, I., Shah, M.: Multimodal analysis for identification and segmentation of moving-sounding objects. IEEE Trans. Multimedia 15(2), 378\u2013390 (2012)","journal-title":"IEEE Trans. Multimedia"},{"key":"13_CR38","unstructured":"Khosravan, N., Ardeshir, S., Puri, R.: On attention modules for audio-visual synchronization. arXiv preprint arXiv:1812.06071 (2018)"},{"key":"13_CR39","unstructured":"Kidron, E., Schechner, Y.Y., Elad, M.: Pixels that sound. In: Proceedings of CVPR (2005)"},{"key":"13_CR40","unstructured":"Korbar, B., Tran, D., Torresani, L.: Co-training of audio and video representations from self-supervised temporal synchronization. CoRR (2018)"},{"key":"13_CR41","doi-asserted-by":"crossref","unstructured":"Misra, I., van der Maaten, L.: Self-supervised learning of pretext-invariant representations. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00674"},{"key":"13_CR42","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Albanie, S., Zisserman, A.: Disentangled speech embeddings using cross-modal self-supervision. In: Proceedings of ICASSP, pp. 6829\u20136833. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9054057"},{"key":"13_CR43","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"13_CR44","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"639","DOI":"10.1007\/978-3-030-01231-1_39","volume-title":"Computer Vision \u2013 ECCV 2018","author":"A Owens","year":"2018","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 639\u2013658. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_39"},{"key":"13_CR45","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"13_CR46","doi-asserted-by":"crossref","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Learning sight from sound: ambient sound provides supervision for visual learning. Int. J. Comput. Vis. (2018)","DOI":"10.1007\/s11263-018-1083-5"},{"key":"13_CR47","doi-asserted-by":"crossref","unstructured":"Pfister, T., Charles, J., Zisserman, A.: Flowing convnets for human pose estimation in videos. In: Proceedings of ICCV (2015)","DOI":"10.1109\/ICCV.2015.222"},{"key":"13_CR48","doi-asserted-by":"crossref","unstructured":"Ramaswamy, J., Das, S.: See the sound, hear the pixels. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), March 2020","DOI":"10.1109\/WACV45572.2020.9093616"},{"key":"13_CR49","doi-asserted-by":"crossref","unstructured":"Rix, A.W., Beerends, J.G., Hollier, M.P., Hekstra, A.P.: Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs. In: Proceedings of ICASSP, vol. 2, pp. 749\u2013752. IEEE (2001)","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"13_CR50","doi-asserted-by":"crossref","unstructured":"Roth, J., et al.: AVA-ActiveSpeaker: An audio-visual dataset for active speaker detection. arXiv preprint arXiv:1901.01342 (2019)","DOI":"10.1109\/ICCVW.2019.00460"},{"key":"13_CR51","doi-asserted-by":"crossref","unstructured":"Rouditchenko, A., Zhao, H., Gan, C., McDermott, J., Torralba, A.: Self-supervised audio-visual co-segmentation. In: Proceedings of ICASSP, pp. 2357\u20132361. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682467"},{"key":"13_CR52","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T.H., Kim, J., Yang, M.H., Kweon, I.S.: Learning to localize sound source in visual scenes. In: Proceedings of CVPR (2018)","DOI":"10.1109\/CVPR.2018.00458"},{"key":"13_CR53","doi-asserted-by":"crossref","unstructured":"Shahid, M., Beyan, C., Murino, V.: Voice activity detection by upper body motion analysis and unsupervised domain adaptation. In: The IEEE International Conference on Computer Vision (ICCV) Workshops, October 2019","DOI":"10.1109\/ICCVW.2019.00159"},{"key":"13_CR54","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"252","DOI":"10.1007\/978-3-030-01216-8_16","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Tian","year":"2018","unstructured":"Tian, Y., Shi, J., Li, B., Duan, Z., Xu, C.: Audio-visual event localization in unconstrained videos. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11206, pp. 252\u2013268. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01216-8_16"},{"key":"13_CR55","doi-asserted-by":"crossref","unstructured":"Tian, Y., Krishnan, D., Isola, P.: Contrastive multiview coding. arXiv preprint arXiv:1906.05849 (2019)","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"13_CR56","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: Proceedings of ICCV, pp. 2794\u20132802 (2015)","DOI":"10.1109\/ICCV.2015.320"},{"key":"13_CR57","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Ma, W.C., Torralba, A.: The sound of motions. In: Proceedings of ICCV (2019)","DOI":"10.1109\/ICCV.2019.00182"},{"key":"13_CR58","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"587","DOI":"10.1007\/978-3-030-01246-5_35","volume-title":"Computer Vision \u2013 ECCV 2018","author":"H Zhao","year":"2018","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11205, pp. 587\u2013604. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_35"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58523-5_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:05:13Z","timestamp":1733184313000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58523-5_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585228","9783030585235"],"references-count":58,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58523-5_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"4 December 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic. From the ECCV Workshops 249 full papers, 18 short papers, and 21 further contributions were published out of a total of 467 submissions.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}