{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T07:50:01Z","timestamp":1774597801449,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":22,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819698141","type":"print"},{"value":"9789819698158","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-9815-8_2","type":"book-chapter","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T09:18:19Z","timestamp":1753262299000},"page":"15-27","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Vision-Guided Acoustic Localization with Decoupled Inference for Moving Speakers"],"prefix":"10.1007","author":[{"given":"Yidi","family":"Li","sequence":"first","affiliation":[]},{"given":"Kairan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Chenxu","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Chongwei","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Rongshan","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Mingliang","family":"Dou","sequence":"additional","affiliation":[]},{"given":"Bin","family":"Ren","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,24]]},"reference":[{"issue":"7","key":"2_CR1","doi-asserted-by":"publisher","first-page":"4631","DOI":"10.1007\/s11831-022-09747-2","volume":"29","author":"D Desai","year":"2022","unstructured":"Desai, D., Mehendale, N.: A review on sound source localization systems. Archives of Computational Methods in Engineering 29(7), 4631\u20134642 (2022)","journal-title":"Archives of Computational Methods in Engineering"},{"key":"2_CR2","first-page":"1","volume":"72","author":"Z Wang","year":"2023","unstructured":"Wang, Z., Zou, W., Su, H., Guo, Y., Li, D.: Multiple sound source localization exploiting robot motion and approaching control. IEEE Trans. Instrum. Meas. 72, 1\u201316 (2023)","journal-title":"IEEE Trans. Instrum. Meas."},{"issue":"1","key":"2_CR3","doi-asserted-by":"publisher","first-page":"326","DOI":"10.1109\/TVT.2021.3120201","volume":"71","author":"T Alexandri","year":"2021","unstructured":"Alexandri, T., Walter, M., Diamant, R.: A time difference of arrival based target motion analysis for localization of underwater vehicles. IEEE Trans. Veh. Technol. 71(1), 326\u2013338 (2021)","journal-title":"IEEE Trans. Veh. Technol."},{"issue":"1","key":"2_CR4","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1121\/10.0011809","volume":"152","author":"P-A Grumiaux","year":"2022","unstructured":"Grumiaux, P.-A., Kiti\u0107, S., Girin, L., Gu\u00e9rin, A.: A survey of sound source localization with deep learning methods. The Journal of the Acoustical Society of America 152(1), 107\u2013151 (2022)","journal-title":"The Journal of the Acoustical Society of America"},{"key":"2_CR5","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107906","volume":"115","author":"J Chen","year":"2021","unstructured":"Chen, J., et al.: Multimodal fusion for indoor sound source localization. Pattern Recogn. 115, 107906 (2021)","journal-title":"Pattern Recogn."},{"key":"2_CR6","doi-asserted-by":"crossref","unstructured":"Fan, R., et al.: AttaNet: Attention aggregation network for audio-visual emotion recognition. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 8030\u20138034 (2024)","DOI":"10.1109\/ICASSP48485.2024.10447640"},{"issue":"4","key":"2_CR7","doi-asserted-by":"publisher","first-page":"1578","DOI":"10.1049\/cit2.12189","volume":"8","author":"Y Li","year":"2023","unstructured":"Li, Y., Wang, G., Chen, Z., Tang, H., Liu, H.: On-device audio-visual multi-person wake word spotting. CAAI Trans. Intel. Technol. 8(4), 1578\u20131589 (2023)","journal-title":"CAAI Trans. Intel. Technol."},{"issue":"1","key":"2_CR8","doi-asserted-by":"publisher","first-page":"142","DOI":"10.1049\/cit2.12212","volume":"9","author":"Y Li","year":"2024","unstructured":"Li, Y., Ren, J., Wang, Y., Wang, G., Li, X., Liu, H.: Audio\u2013visual keyword transformer for unconstrained sentence-level keyword spotting. CAAI Transactions on Intelligence Technology 9(1), 142\u2013152 (2024)","journal-title":"CAAI Transactions on Intelligence Technology"},{"issue":"1","key":"2_CR9","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1186\/s13636-024-00377-z","volume":"2024","author":"E Grinstein","year":"2024","unstructured":"Grinstein, E., et al.: Steered response power for sound source localization: A tutorial review. EURASIP Journal on Audio, Speech, and Music Processing 2024(1), 59 (2024)","journal-title":"EURASIP Journal on Audio, Speech, and Music Processing"},{"key":"2_CR10","doi-asserted-by":"publisher","DOI":"10.1016\/j.ymssp.2024.111272","volume":"211","author":"G Zhang","year":"2024","unstructured":"Zhang, G., Geng, L., Xie, F., He, C.-D.: A dynamic convolution-transformer neural network for multiple sound source localization based on functional beamforming. Mech. Syst. Signal Process. 211, 111272 (2024)","journal-title":"Mech. Syst. Signal Process."},{"key":"2_CR11","first-page":"1","volume":"71","author":"SY Lee","year":"2022","unstructured":"Lee, S.Y., Chang, J., Lee, S.: Deep learning-enabled high-resolution and fast sound source localization in spherical microphone array system. IEEE Trans. Instrum. Meas. 71, 1\u201312 (2022)","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"2_CR12","doi-asserted-by":"crossref","unstructured":"Phokhinanan, W., Obin, N., Argentieri, S.: Binaural sound localization in noisy environments using frequency-based audio vision transformer (favit). In: INTERSPEECH, pp. 3704\u20133708. ISCA (2023)","DOI":"10.21437\/Interspeech.2023-2015"},{"issue":"1","key":"2_CR13","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1186\/s13636-023-00301-x","volume":"2023","author":"E Grinstein","year":"2023","unstructured":"Grinstein, E., Neo, V.W., Naylor, P.A.: Dual input neural networks for positional sound source localization. EURASIP Journal on Audio, Speech, and Music Processing 2023(1), 32 (2023)","journal-title":"EURASIP Journal on Audio, Speech, and Music Processing"},{"issue":"6","key":"2_CR14","doi-asserted-by":"publisher","first-page":"2003","DOI":"10.1007\/s11263-023-01950-3","volume":"132","author":"M Qiao","year":"2024","unstructured":"Qiao, M., et al.: Joint learning of audio\u2013visual saliency prediction and sound source localization on multi-face videos. Int. J. Comput. Vision 132(6), 2003\u20132025 (2024)","journal-title":"Int. J. Comput. Vision"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Multi-stage multimodal distillation for audio-visual speaker tracking. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135 (2025)","DOI":"10.1109\/ICASSP49660.2025.10888838"},{"key":"2_CR16","doi-asserted-by":"publisher","first-page":"1835","DOI":"10.1109\/TMM.2024.3521737","volume":"27","author":"Y Li","year":"2025","unstructured":"Li, Y., Liu, H., Yang, B.: STNet: Deep Audio-Visual Fusion Network for Robust Speaker Tracking. IEEE Transactions on Multimedia (TMM) 27, 1835\u20131847 (2025)","journal-title":"IEEE Transactions on Multimedia (TMM)"},{"key":"2_CR17","doi-asserted-by":"publisher","first-page":"550","DOI":"10.1109\/TASLP.2022.3226330","volume":"31","author":"X Qian","year":"2023","unstructured":"Qian, X., Wang, Z., Wang, J., Guan, G., Li, H.: Audio-visual cross-attention network for robotic speaker tracking. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 31, 550\u2013562 (2023)","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Murdock, C., Ananthabhotla, I., Lu, H., Ithapu, V. K.: Self-motion as supervision for egocentric audiovisual localization. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7835\u20137839. IEEE (2024)","DOI":"10.1109\/ICASSP48485.2024.10447683"},{"key":"2_CR19","first-page":"191","volume":"104","author":"A Act","year":"1996","unstructured":"Act, A.: Health insurance portability and accountability act of 1996. Public Law 104, 191 (1996)","journal-title":"Public Law"},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"Li, Y., Liu, H., Tang, H.: Multi-modal perception attention network with self-supervised learning for audio-visual speaker tracking. In: Proceedings of the AAAI Conference on Artificial Intelligence, 36(2), 1456\u20131463 (2022)","DOI":"10.1609\/aaai.v36i2.20035"},{"key":"2_CR21","unstructured":"Liu, Z., et al.: Kan: kolmogorov-arnold networks. arXiv preprint arXiv:2404.19756 (2024)"},{"key":"2_CR22","doi-asserted-by":"crossref","unstructured":"Vo, X.-T., Hoang, V.-D., Nguyen, D.-L., Jo, K.-H.: Pedestrian head detection and tracking via global vision transformer. In: International Workshop on Frontiers of Computer Vision, pp. 155\u2013167 (2022)","DOI":"10.1007\/978-3-031-06381-7_11"}],"container-title":["Lecture Notes in Computer Science","Advanced Intelligent Computing Technology and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-9815-8_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T06:53:39Z","timestamp":1774594419000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-9815-8_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819698141","9789819698158"],"references-count":22,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-9815-8_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"24 July 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Intelligent Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Ningbo","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 July 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 July 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icic2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ic-icc.cn\/icg\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}