{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T23:02:06Z","timestamp":1768431726513,"version":"3.49.0"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030922726","type":"print"},{"value":"9783030922733","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-92273-3_13","type":"book-chapter","created":{"date-parts":[[2021,12,4]],"date-time":"2021-12-04T21:34:27Z","timestamp":1638653667000},"page":"150-161","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Video Face Recognition with Audio-Visual Aggregation Network"],"prefix":"10.1007","author":[{"given":"Qinbo","family":"Li","sequence":"first","affiliation":[]},{"given":"Qing","family":"Wan","sequence":"additional","affiliation":[]},{"given":"Sang-Heon","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Yoonsuck","family":"Choe","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,12,5]]},"reference":[{"issue":"3","key":"13_CR1","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1049\/ip-vis:20045082","volume":"152","author":"A Albiol","year":"2005","unstructured":"Albiol, A., Torres, L., Delp, E.J.: Fully automatic face recognition system using a combined audio-visual approach. IEE Proc.-Vis. Image Sign. Process. 152(3), 318\u2013326 (2005)","journal-title":"IEE Proc.-Vis. Image Sign. Process."},{"key":"13_CR2","unstructured":"Assael, Y.M., Shillingford, B., Whiteson, S., De Freitas, N.: Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599 (2016)"},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Bansal, A., Nanduri, A., Castillo, C.D., Ranjan, R., Chellappa, R.: Umdfaces: an annotated face dataset for training deep networks. In: 2017 IEEE International Joint Conference on Biometrics (IJCB), pp. 464\u2013473. IEEE (2017)","DOI":"10.1109\/BTAS.2017.8272731"},{"key":"13_CR4","doi-asserted-by":"crossref","unstructured":"Beveridge, J.R., et al.: The challenge of face recognition from digital point-and-shoot cameras. In: 2013 IEEE Sixth International Conference on Biometrics: Theory, Applications and Systems (BTAS), pp. 1\u20138. IEEE (2013)","DOI":"10.1109\/BTAS.2013.6712704"},{"key":"13_CR5","unstructured":"Choudhury, T., Clarkson, B., Jebara, T., Pentland, A.: Multimodal person recognition using unconstrained audio and video. In: Proceedings, International Conference on Audio-and Video-Based Person Authentication, pp. 176\u2013181. Citeseer (1999)"},{"key":"13_CR6","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: Arcface: additive angular margin loss for deep face recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4690\u20134699 (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:1804.03619 (2018)","DOI":"10.1145\/3197517.3201357"},{"key":"13_CR8","first-page":"513","volume":"17","author":"J Goldberger","year":"2004","unstructured":"Goldberger, J., Hinton, G.E., Roweis, S., Salakhutdinov, R.R.: Neighbourhood components analysis. Adv. Neural Inform. Process. Syst. 17, 513\u2013520 (2004)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"13_CR9","unstructured":"Gong, S., Shi, Y., Jain, A.K.: Recurrent embedding aggregation network for video face recognition. arXiv preprint arXiv:1904.12019 (2019)"},{"key":"13_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1007\/978-3-319-46487-9_6","volume-title":"Computer Vision \u2013 ECCV 2016","author":"Y Guo","year":"2016","unstructured":"Guo, Y., Zhang, L., Hu, Y., He, X., Gao, J.: MS-Celeb-1M: a dataset and benchmark for large-scale face recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 87\u2013102. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_6"},{"key":"13_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1007\/978-3-319-24261-3_7","volume-title":"Similarity-Based Pattern Recognition","author":"E Hoffer","year":"2015","unstructured":"Hoffer, E., Ailon, N.: Deep metric learning using triplet network. In: Feragen, A., Pelillo, M., Loog, M. (eds.) SIMBAD 2015. LNCS, vol. 9370, pp. 84\u201392. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24261-3_7"},{"key":"13_CR12","unstructured":"Hoover, K., Chaudhuri, S., Pantofaru, C., Slaney, M., Sturdy, I.: Putting a face to the voice: Fusing audio and visual signals across a video to determine speakers. arXiv preprint arXiv:1706.00079 (2017)"},{"key":"13_CR13","unstructured":"Huang, G.B., Mattar, M., Berg, T., Learned-Miller, E.: Labeled faces in the wild: A database forstudying face recognition in unconstrained environments (2008)"},{"key":"13_CR14","doi-asserted-by":"crossref","unstructured":"Kemelmacher-Shlizerman, I., Seitz, S.M., Miller, D., Brossard, E.: The megaface benchmark: 1 million faces for recognition at scale. In: Proceedings of the IEEE Conference On Computer Vision and Pattern Recognition, pp. 4873\u20134882 (2016)","DOI":"10.1109\/CVPR.2016.527"},{"key":"13_CR15","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"13_CR16","unstructured":"Li, C., et al.: Deep speaker: an end-to-end neural speaker embedding system. arXiv preprint arXiv:1705.02304 650 (2017)"},{"key":"13_CR17","doi-asserted-by":"crossref","unstructured":"Liu, W., Wen, Y., Yu, Z., Li, M., Raj, B., Song, L.: Sphereface: deep hypersphere embedding for face recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 212\u2013220 (2017)","DOI":"10.1109\/CVPR.2017.713"},{"key":"13_CR18","unstructured":"Liu, W., Wen, Y., Yu, Z., Yang, M.: Large-margin softmax loss for convolutional neural networks. In: ICML, vol. 2, p. 7 (2016)"},{"key":"13_CR19","unstructured":"Liu, Y., et al.: iqiyi-vid: A large dataset for multi-modal person identification. arXiv preprint arXiv:1811.07548 (2018)"},{"key":"13_CR20","doi-asserted-by":"crossref","unstructured":"Maze, B., et al.: Iarpa janus benchmark-c: Face dataset and protocol. In: 2018 International Conference on Biometrics (ICB), pp. 158\u2013165. IEEE (2018)","DOI":"10.1109\/ICB2018.2018.00033"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Moschoglou, S., Papaioannou, A., Sagonas, C., Deng, J., Kotsia, I., Zafeiriou, S.: Agedb: the first manually collected, in-the-wild age database. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 51\u201359 (2017)","DOI":"10.1109\/CVPRW.2017.250"},{"key":"13_CR22","unstructured":"Paszke, A., et al.: Pytorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems, pp. 8024\u20138035 (2019)"},{"key":"13_CR23","doi-asserted-by":"crossref","unstructured":"Rao, Y., Lin, J., Lu, J., Zhou, J.: Learning discriminative aggregation network for video-based face recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3781\u20133790 (2017)","DOI":"10.1109\/ICCV.2017.408"},{"key":"13_CR24","doi-asserted-by":"crossref","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: Facenet: a unified embedding for face recognition and clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 815\u2013823 (2015)","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"13_CR25","doi-asserted-by":"crossref","unstructured":"Sell, G., Duh, K., Snyder, D., Etter, D., Garcia-Romero, D.: Audio-visual person recognition in multimedia data from the iarpa janus program. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 3031\u20133035. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462122"},{"issue":"7","key":"13_CR26","doi-asserted-by":"publisher","first-page":"955","DOI":"10.1109\/TCSVT.2009.2022694","volume":"19","author":"X Tang","year":"2009","unstructured":"Tang, X., Li, Z.: Audio-guided video-based face recognition. IEEE Trans. Circ. Syst. Video Technol. 19(7), 955\u2013964 (2009)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"issue":"10","key":"13_CR27","doi-asserted-by":"publisher","first-page":"1713","DOI":"10.1109\/TPAMI.2008.75","volume":"30","author":"O Tuzel","year":"2008","unstructured":"Tuzel, O., Porikli, F., Meer, P.: Pedestrian detection via classification on riemannian manifolds. IEEE Trans. Pattern Anal. Mach. Intell. 30(10), 1713\u20131727 (2008)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"7","key":"13_CR28","doi-asserted-by":"publisher","first-page":"926","DOI":"10.1109\/LSP.2018.2822810","volume":"25","author":"F Wang","year":"2018","unstructured":"Wang, F., Cheng, J., Liu, W., Liu, H.: Additive margin softmax for face verification. IEEE Sign. Process. Lett. 25(7), 926\u2013930 (2018)","journal-title":"IEEE Sign. Process. Lett."},{"key":"13_CR29","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Cosface: large margin cosine loss for deep face recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5265\u20135274 (2018)","DOI":"10.1109\/CVPR.2018.00552"},{"key":"13_CR30","doi-asserted-by":"crossref","unstructured":"Wolf, L., Hassner, T., Maoz, I.: Face recognition in unconstrained videos with matched background similarity. In: CVPR 2011, pp. 529\u2013534. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995566"},{"key":"13_CR31","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Neural aggregation network for video face recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4362\u20134371 (2017)","DOI":"10.1109\/CVPR.2017.554"},{"key":"13_CR32","unstructured":"Yi, D., Lei, Z., Liao, S., Li, S.Z.: Learning face representation from scratch. arXiv preprint arXiv:1411.7923 (2014)"},{"issue":"10","key":"13_CR33","doi-asserted-by":"publisher","first-page":"1499","DOI":"10.1109\/LSP.2016.2603342","volume":"23","author":"K Zhang","year":"2016","unstructured":"Zhang, K., Zhang, Z., Li, Z., Qiao, Y.: Joint face detection and alignment using multitask cascaded convolutional networks. IEEE Sign. Process. Lett. 23(10), 1499\u20131503 (2016)","journal-title":"IEEE Sign. Process. Lett."},{"key":"13_CR34","doi-asserted-by":"crossref","unstructured":"Zhao, K., Xu, J., Cheng, M.M.: Regularface: deep face recognition via exclusive regularization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1136\u20131144 (2019)","DOI":"10.1109\/CVPR.2019.00123"},{"issue":"3","key":"13_CR35","doi-asserted-by":"publisher","first-page":"194","DOI":"10.1109\/TBIOM.2020.2973504","volume":"2","author":"J Zheng","year":"2020","unstructured":"Zheng, J., Ranjan, R., Chen, C.H., Chen, J.C., Castillo, C.D., Chellappa, R.: An automatic system for unconstrained video-based face recognition. IEEE Trans. Bio. Behav. Identity Sci. 2(3), 194\u2013209 (2020)","journal-title":"IEEE Trans. Bio. Behav. Identity Sci."}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-92273-3_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,12,4]],"date-time":"2021-12-04T21:36:55Z","timestamp":1638653815000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-92273-3_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030922726","9783030922733"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-92273-3_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"5 December 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sanur, Bali","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Indonesia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iconip2021.apnns.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1093","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"226","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"177","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"21% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.57","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"6","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Due to the COVID-19 pandemic the conference was held online.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}