{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T17:01:19Z","timestamp":1774630879215,"version":"3.50.1"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198328","type":"print"},{"value":"9783031198335","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19833-5_22","type":"book-chapter","created":{"date-parts":[[2022,11,4]],"date-time":"2022-11-04T00:40:30Z","timestamp":1667522430000},"page":"371-387","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":40,"title":["Learning Long-Term Spatial-Temporal Graphs for Active Speaker Detection"],"prefix":"10.1007","author":[{"given":"Kyle","family":"Min","sequence":"first","affiliation":[]},{"given":"Sourya","family":"Roy","sequence":"additional","affiliation":[]},{"given":"Subarna","family":"Tripathi","sequence":"additional","affiliation":[]},{"given":"Tanaya","family":"Guha","sequence":"additional","affiliation":[]},{"given":"Somdeb","family":"Majumdar","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,4]]},"reference":[{"key":"22_CR1","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: The conversation: deep audio-visual speech enhancement. arXiv preprint arXiv:1804.04121 (2018)","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"22_CR2","doi-asserted-by":"crossref","unstructured":"Alcazar, J.L., et al.: Active speakers in context. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 12465\u201312474 (2020)","DOI":"10.1109\/CVPR42600.2020.01248"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Arnab, A., Sun, C., Schmid, C.: Unified graph structured models for video understanding. arXiv preprint arXiv:2103.15662 (2021)","DOI":"10.1109\/ICCV48922.2021.00801"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Chakravarty, P., Tuytelaars, T.: Cross-modal supervision for learning active speaker detection in video. ArXiv abs\/1603.08907 (2016)","DOI":"10.1007\/978-3-319-46454-1_18"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Chatfield, K., Simonyan, K., Vedaldi, A., Zisserman, A.: Return of the devil in the details: Delving deep into convolutional nets. In: Proceedings of the British Machine Vision Conference. BMVA Press (2014)","DOI":"10.5244\/C.28.6"},{"key":"22_CR6","unstructured":"Chung, J.S.: Naver at activitynet challenge 2019 - task B active speaker detection (AVA). CoRR abs\/1906.10555 (2019). http:\/\/arxiv.org\/abs\/1906.10555"},{"key":"22_CR7","doi-asserted-by":"crossref","unstructured":"Cutler, R., Davis, L.: Look who\u2019s talking: speaker detection using video and audio correlation. In: 2000 IEEE International Conference on Multimedia and Expo. ICME2000. Proceedings. Latest Advances in the Fast Changing World of Multimedia (Cat. No. 00TH8532), vol. 3, pp. 1589\u20131592. IEEE (2000)","DOI":"10.1109\/ICME.2000.871073"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Everingham, M., Sivic, J., Zisserman, A.: Hello! my name is... buffy\u201d-automatic naming of characters in tv video. In: BMVC, vol. 2, p. 6 (2006)","DOI":"10.5244\/C.20.92"},{"key":"22_CR9","doi-asserted-by":"publisher","first-page":"545","DOI":"10.1016\/j.imavis.2008.04.018","volume":"27","author":"M Everingham","year":"2009","unstructured":"Everingham, M., Sivic, J., Zisserman, A.: Taking the bite out of automated naming of characters in tv video. Image Vis. Comput. 27, 545\u2013559 (2009)","journal-title":"Image Vis. Comput."},{"key":"22_CR10","unstructured":"Fey, M., Lenssen, J.E.: Fast graph representation learning with pytorch geometric. In: ICLR Workshop on Representation Learning on Graphs and Manifolds (2019). http:\/\/arxiv.org\/abs\/1903.02428"},{"key":"22_CR11","unstructured":"Geng, S., Gao, P., Hori, C., Le Roux, J., Cherian, A.: Spatio-temporal scene graphs for video dialog. arXiv e-prints pp. arXiv-2007 (2020)"},{"key":"22_CR12","unstructured":"Hamilton, W., Ying, Z., Leskovec, J.: Inductive representation learning on large graphs. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"22_CR14","unstructured":"Howard, A.G., et al.: Mobilenets: efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861 (2017)"},{"key":"22_CR15","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Kopuklu, O., Kose, N., Gunduz, A., Rigoll, G.: Resource efficient 3d convolutional neural networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00240"},{"key":"22_CR17","doi-asserted-by":"crossref","unstructured":"K\u00f6p\u00fckl\u00fc, O., Taseska, M., Rigoll, G.: How to design a three-stage architecture for audio-visual active speaker detection in the wild. In: Proceedings of the Internal Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00123"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Le\u00f3n-Alc\u00e1zar, J., Heilbron, F.C., Thabet, A., Ghanem, B.: MAAS: multi-modal assignation for active speaker detection. In: Internal Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00033"},{"key":"22_CR19","doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., Han, S.: TSM: temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7083\u20137093 (2019)","DOI":"10.1109\/ICCV.2019.00718"},{"key":"22_CR20","unstructured":"Loshchilov, I., Hutter, F.: Sgdr: Stochastic gradient descent with warm restarts. In: International Conference on Learning Representations (ICLR) (2017)"},{"key":"22_CR21","unstructured":"Mi, L., Ou, Y., Chen, Z.: Visual relationship forecasting in videos. arXiv preprint arXiv:2107.01181 (2021)"},{"key":"22_CR22","unstructured":"Min, K., Roy, S., Tripathi, S., Guha, T., Majumdar, S.: Intel labs at activitynet challenge 2022: spell for long-term active speaker detection (2022)"},{"key":"22_CR23","doi-asserted-by":"crossref","unstructured":"Nagarajan, T., Li, Y., Feichtenhofer, C., Grauman, K.: Ego-topo: environment affordances from egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 163\u2013172 (2020)","DOI":"10.1109\/CVPR42600.2020.00024"},{"key":"22_CR24","doi-asserted-by":"crossref","unstructured":"Patrick, M., et al.: Space-time crop & attend: improving cross-modal video representation learning. arXiv preprint arXiv:2103.10211 (2021)","DOI":"10.1109\/ICCV48922.2021.01039"},{"key":"22_CR25","doi-asserted-by":"crossref","unstructured":"Ravanelli, M., Bengio, Y.: Speaker recognition from raw waveform with sincnet. In: 2018 IEEE Spoken Language Technology Workshop (SLT), pp. 1021\u20131028. IEEE (2018)","DOI":"10.1109\/SLT.2018.8639585"},{"key":"22_CR26","doi-asserted-by":"crossref","unstructured":"Roth, J., et al.: Ava active speaker: an audio-visual dataset for active speaker detection. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4492\u20134496. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"22_CR27","unstructured":"Sharma, R., Somandepalli, K., Narayanan, S.: Crossmodal learning for audio-visual speech event localization. arXiv preprint arXiv:2003.04358 (2020)"},{"key":"22_CR28","unstructured":"Shirian, A., Tripathi, S., Guha, T.: Learnable graph inception network for emotion recognition. IEEE Trans. Multimedia 24 (2020)"},{"key":"22_CR29","doi-asserted-by":"crossref","unstructured":"Stafylakis, T., Tzimiropoulos, G.: Combining residual networks with lSTMS for lipreading. arXiv preprint arXiv:1703.04105 (2017)","DOI":"10.21437\/Interspeech.2017-85"},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Stefanov, K., Beskow, J., Salvi, G.: Vision-based active speaker detection in multiparty interaction. In: Grounding Language Understanding GLU2017 August 25, 2017, KTH Royal Institute of Technology, Stockholm, Sweden (2017)","DOI":"10.21437\/GLU.2017-10"},{"key":"22_CR31","doi-asserted-by":"crossref","unstructured":"Stefanov, K., Sugimoto, A., Beskow, J.: Look who\u2019s talking: visual identification of the active speaker in multi-party human-robot interaction. In: Proceedings of the 2nd Workshop on Advancements in Social Signal Processing for Multimodal Interaction, pp. 22\u201327 (2016)","DOI":"10.1145\/3005467.3005470"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Tan, R., Xu, H., Saenko, K., Plummer, B.A.: Logan: latent graph co-attention network for weakly-supervised video moment retrieval. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2083\u20132092 (2021)","DOI":"10.1109\/WACV48630.2021.00213"},{"key":"22_CR33","doi-asserted-by":"crossref","unstructured":"Tao, R., Pan, Z., Das, R.K., Qian, X., Shou, M.Z., Li, H.: Is someone speaking? exploring long-term temporal features for audio-visual active speaker detection. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 3927\u20133935 (2021)","DOI":"10.1145\/3474085.3475587"},{"key":"22_CR34","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"issue":"5","key":"22_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3326362","volume":"38","author":"Y Wang","year":"2019","unstructured":"Wang, Y., Sun, Y., Liu, Z., Sarma, S.E., Bronstein, M.M., Solomon, J.M.: Dynamic graph CNN for learning on point clouds. ACM Trans. Graph. 38(5), 1\u201312 (2019)","journal-title":"ACM Trans. Graph."},{"key":"22_CR36","unstructured":"Zhang, Y.H., Xiao, J., Yang, S., Shan, S.: Multi-task learning for audio-visual active speaker detection. The ActivityNet Large-Scale Activity Recognition Challenge, pp. 1\u20134 (2019)"},{"key":"22_CR37","doi-asserted-by":"publisher","unstructured":"Zhang, Y., et al.: UniCon: Unified Context Network for Robust Active Speaker Detection, pp. 3964\u20133972. Association for Computing Machinery, New York, NY, USA (2021). https:\/\/doi.org\/10.1145\/3474085.3475275","DOI":"10.1145\/3474085.3475275"},{"key":"22_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Tokmakov, P., Hebert, M., Schmid, C.: A structured model for action detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9975\u20139984 (2019)","DOI":"10.1109\/CVPR.2019.01021"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19833-5_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,7]],"date-time":"2024-10-07T06:24:05Z","timestamp":1728282245000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19833-5_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198328","9783031198335"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19833-5_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"4 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"From the workshops, 367 reviewed full papers have been selected for publication","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}