{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T14:24:17Z","timestamp":1760711057287,"version":"3.40.3"},"publisher-location":"Cham","reference-count":89,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031198083"},{"type":"electronic","value":"9783031198090"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19809-0_28","type":"book-chapter","created":{"date-parts":[[2022,10,31]],"date-time":"2022-10-31T07:03:04Z","timestamp":1667199784000},"page":"489-508","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Sound Localization by\u00a0Self-supervised Time Delay Estimation"],"prefix":"10.1007","author":[{"given":"Ziyang","family":"Chen","sequence":"first","affiliation":[]},{"given":"David F.","family":"Fouhey","sequence":"additional","affiliation":[]},{"given":"Andrew","family":"Owens","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,1]]},"reference":[{"key":"28_CR1","unstructured":"Time delay estimation for speaker localization using cnn-based parametrized gcc-phat features"},{"key":"28_CR2","doi-asserted-by":"crossref","unstructured":"Adavanne, S., Politis, A., Virtanen, T.: Direction of arrival estimation for multiple sound sources using convolutional recurrent neural network. In: 2018 26th European Signal Processing Conference (EUSIPCO), pp. 1462\u20131466. IEEE (2018)","DOI":"10.23919\/EUSIPCO.2018.8553182"},{"key":"28_CR3","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: The conversation: deep audio-visual speech enhancement. arXiv preprint arXiv:1804.04121 (2018)","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"28_CR4","doi-asserted-by":"crossref","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Objects that sound. arXiv preprint arXiv:1712.06651 (2017)","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"28_CR5","doi-asserted-by":"crossref","unstructured":"Bian, Z., Jabri, A., Efros, A.A., Owens, A.: Learning pixel trajectories with multiscale contrastive random walks. arXiv (2022)","DOI":"10.1109\/CVPR52688.2022.00640"},{"issue":"5","key":"28_CR6","doi-asserted-by":"publisher","first-page":"3590","DOI":"10.1121\/1.5133944","volume":"146","author":"MJ Bianco","year":"2019","unstructured":"Bianco, M.J., Gerstoft, P., Traer, J., Ozanich, E., Roch, M.A., Gannot, S., Deledalle, C.A.: Machine learning in acoustics: Theory and applications. The Journal of the Acoustical Society of America 146(5), 3590\u20133628 (2019)","journal-title":"The Journal of the Acoustical Society of America"},{"issue":"2","key":"28_CR7","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1006\/csla.1996.0024","volume":"11","author":"MS Brandstein","year":"1997","unstructured":"Brandstein, M.S., Silverman, H.F.: A practical methodology for speech source localization with microphone arrays. Comput. Speech Lang. 11(2), 91\u2013126 (1997)","journal-title":"Comput. Speech Lang."},{"key":"28_CR8","unstructured":"Brungart, D.S.: Near-field auditory localization. Ph.D. thesis, Massachusetts Institute of Technology (1998)"},{"issue":"10","key":"28_CR9","doi-asserted-by":"publisher","first-page":"1497","DOI":"10.1109\/PROC.1973.9300","volume":"61","author":"GC Carter","year":"1973","unstructured":"Carter, G.C., Nuttall, A.H., Cable, P.G.: The smoothed coherence transform. Proc. IEEE 61(10), 1497\u20131498 (1973)","journal-title":"Proc. IEEE"},{"issue":"3","key":"28_CR10","doi-asserted-by":"publisher","first-page":"463","DOI":"10.1109\/TASSP.1981.1163560","volume":"29","author":"G Carter","year":"1981","unstructured":"Carter, G.: Time delay estimation for passive sonar signal processing. IEEE Trans. Acoust. Speech Signal Process. 29(3), 463\u2013470 (1981)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"28_CR11","doi-asserted-by":"crossref","unstructured":"Chakrabarty, S., Habets, E.A.: Broadband doa estimation using convolutional neural networks trained with noise signals. In: 2017 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), pp. 136\u2013140. IEEE (2017)","DOI":"10.1109\/WASPAA.2017.8170010"},{"key":"28_CR12","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. arXiv preprint arXiv:2002.05709 (2020)"},{"key":"28_CR13","doi-asserted-by":"crossref","unstructured":"Chen, Y., Liu, B., Zhang, Z., Kim, H.S.: An end-to-end deep learning framework for multiple audio source separation and localization. International Conference on Acoustics, Speech, and Signal Processing (ICASSP) (2022)","DOI":"10.1109\/ICASSP43922.2022.9746950"},{"key":"28_CR14","unstructured":"Christensen, J.H., Hornauer, S., Yu, S.: Batvision with gcc-phat features for better sound to vision predictions. arXiv preprint arXiv:2006.07995 (2020)"},{"key":"28_CR15","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: Voxceleb2: Deep speaker recognition. arXiv preprint arXiv:1806.05622 (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"28_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"Joon Son Chung","year":"2017","unstructured":"Chung, Joon Son, Zisserman, Andrew: Out of time: automated lip sync in the wild. In: Chen, Chu-Song., Lu, Jiwen, Ma, Kai-Kuang. (eds.) ACCV 2016. LNCS, vol. 10117, pp. 251\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"28_CR17","doi-asserted-by":"crossref","unstructured":"Chung, Y.A., Hsu, W.N., Tang, H., Glass, J.: An unsupervised autoregressive model for speech representation learning. arXiv preprint arXiv:1904.03240 (2019)","DOI":"10.21437\/Interspeech.2019-1473"},{"key":"28_CR18","doi-asserted-by":"crossref","unstructured":"Comanducci, L., Cobos, M., Antonacci, F., Sarti, A.: Time difference of arrival estimation from frequency-sliding generalized cross-correlations using convolutional neural networks. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). pp. 4945\u20134949. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053429"},{"key":"28_CR19","doi-asserted-by":"crossref","unstructured":"Dai, D., Vasudevan, A.B., Matas, J., Van Gool, L.: Binaural soundnet: predicting semantics, depth and motion with binaural sounds. arXiv preprint arXiv:2109.02763 (2021)","DOI":"10.1109\/TPAMI.2022.3155643"},{"key":"28_CR20","unstructured":"Defferrard, M., Benzi, K., Vandergheynst, P., Bresson, X.: Fma: a dataset for music analysis. arXiv preprint arXiv:1612.01840 (2016)"},{"key":"28_CR21","doi-asserted-by":"publisher","first-page":"300","DOI":"10.1109\/TASLP.2020.3040031","volume":"29","author":"D Diaz-Guerra","year":"2020","unstructured":"Diaz-Guerra, D., Miguel, A., Beltran, J.R.: Robust sound source tracking using srp-phat and 3d convolutional neural networks. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 300\u2013311 (2020)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"28_CR22","doi-asserted-by":"crossref","unstructured":"DiBiase, J.H.: A high-accuracy, low-latency technique for talker localization in reverberant environments using microphone arrays. Brown University (2000)","DOI":"10.1007\/978-3-662-04619-7_8"},{"issue":"9","key":"28_CR23","doi-asserted-by":"publisher","first-page":"1734","DOI":"10.1109\/TPAMI.2015.2496141","volume":"38","author":"A Dosovitskiy","year":"2015","unstructured":"Dosovitskiy, A., Fischer, P., Springenberg, J.T., Riedmiller, M., Brox, T.: Discriminative unsupervised feature learning with exemplar convolutional neural networks. IEEE Trans. Pattern Anal. Mach. Intell. 38(9), 1734\u20131747 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"28_CR24","unstructured":"Dosovitskiy, A., Springenberg, J.T., Riedmiller, M., Brox, T.: Discriminative unsupervised feature learning with convolutional neural networks. In: Neural Information Processing Systems (NIPS) (2014)"},{"key":"28_CR25","doi-asserted-by":"crossref","unstructured":"Eloff, R., et al.: Unsupervised acoustic unit discovery for speech synthesis using discrete latent-variable neural networks. arXiv preprint arXiv:1904.07556 (2019)","DOI":"10.21437\/Interspeech.2019-1518"},{"key":"28_CR26","doi-asserted-by":"crossref","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:1804.03619 (2018)","DOI":"10.1145\/3197517.3201357"},{"key":"28_CR27","doi-asserted-by":"crossref","unstructured":"Fischler, M.A., Bolles, R.C.: Random sample consensus: a paradigm for model fitting with applications to image analysis and automated cartography. Communications of the ACM (1981)","DOI":"10.1145\/358669.358692"},{"key":"28_CR28","unstructured":"Fisher III, J.W., Darrell, T., Freeman, W.T., Viola, P.A.: Learning joint statistical models for audio-visual fusion and segregation. In: Neural Information Processing Systems (NIPS) (2000)"},{"key":"28_CR29","doi-asserted-by":"crossref","unstructured":"Gabbay, A., Ephrat, A., Halperin, T., Peleg, S.: Seeing through noise: visually driven speaker separation and enhancement. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 3051\u20133055. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462527"},{"key":"28_CR30","doi-asserted-by":"crossref","unstructured":"Gan, C., Zhao, H., Chen, P., Cox, D., Torralba, A.: Self-supervised moving vehicle tracking with stereo sound. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 7053\u20137062 (2019)","DOI":"10.1109\/ICCV.2019.00715"},{"key":"28_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1007\/978-3-030-01219-9_3","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Ruohan Gao","year":"2018","unstructured":"Gao, Ruohan, Feris, Rogerio, Grauman, Kristen: Learning to separate object sounds by watching unlabeled video. In: Ferrari, Vittorio, Hebert, Martial, Sminchisescu, Cristian, Weiss, Yair (eds.) ECCV 2018. LNCS, vol. 11207, pp. 36\u201354. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01219-9_3"},{"key":"28_CR32","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: 2.5 d visual sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 324\u2013333 (2019)","DOI":"10.1109\/CVPR.2019.00041"},{"key":"28_CR33","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: Visualvoice: audio-visual speech separation with cross-modal consistency. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15490\u201315500. IEEE (2021)","DOI":"10.1109\/CVPR46437.2021.01524"},{"key":"28_CR34","unstructured":"Garg, R., Gao, R., Grauman, K.: Geometry-aware multi-task learning for binaural audio generation from video. arXiv preprint arXiv:2111.10882 (2021)"},{"key":"28_CR35","unstructured":"Garofolo, J.S.: Timit acoustic phonetic continuous speech corpus. Linguistic Data Consortium, 1993 (1993)"},{"key":"28_CR36","doi-asserted-by":"crossref","unstructured":"Gong, Y., Lai, C.I.J., Chung, Y.A., Glass, J.: Ssast: Self-supervised audio spectrogram transformer. arXiv preprint arXiv:2110.09784 (2021)","DOI":"10.21437\/Interspeech.2021-698"},{"key":"28_CR37","unstructured":"Gordon, D., Ehsani, K., Fox, D., Farhadi, A.: Watching the world go by: Representation learning from unlabeled videos (2020)"},{"key":"28_CR38","doi-asserted-by":"crossref","unstructured":"Hadji, I., Derpanis, K.G., Jepson, A.D.: Representation learning via global temporal alignment and cycle-consistency. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11068\u201311077 (2021)","DOI":"10.1109\/CVPR46437.2021.01092"},{"issue":"6","key":"28_CR39","doi-asserted-by":"publisher","first-page":"3436","DOI":"10.1121\/1.424670","volume":"105","author":"ML Hawley","year":"1999","unstructured":"Hawley, M.L., Litovsky, R.Y., Colburn, H.S.: Speech intelligibility and localization in a multi-source environment. J. Acoustical Soc. Am. 105(6), 3436\u20133448 (1999)","journal-title":"J. Acoustical Soc. Am."},{"key":"28_CR40","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. arXiv preprint arXiv:1911.05722 (2019)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"28_CR41","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"28_CR42","unstructured":"Hershey, J.R., Movellan, J.R.: Audio vision: using audio-visual synchrony to locate sounds. In: Neural Information Processing Systems (NIPS) (1999)"},{"key":"28_CR43","doi-asserted-by":"crossref","unstructured":"Hershey, S., Chaudhuri, S., Ellis, D.P.W., Gemmeke, J.F., Jansen, A., Moore, C., Plakal, M., Platt, D., Saurous, R.A., Seybold, B., Slaney, M., Weiss, R., Wilson, K.: Cnn architectures for large-scale audio classification. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2017), https:\/\/arxiv.org\/abs\/1609.09430","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"28_CR44","doi-asserted-by":"crossref","unstructured":"Houegnigan, L., Safari, P., Nadeu, C., van der Schhaar, M., Sol\u00e9, M., Andre, M.: Neural networks for high performance time delay estimation and acoustic source localization. In: Proceedings of the Second International Conference on Computer Science, Information Technology and Applications. pp. 137\u2013146 (2017)","DOI":"10.5121\/csit.2017.70114"},{"key":"28_CR45","doi-asserted-by":"crossref","unstructured":"Hu, X., Chen, Z., Owens, A.: Mix and localize: Localizing sound sources in mixtures. Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01023"},{"key":"28_CR46","unstructured":"Jabri, A., Owens, A., Efros, A.A.: Space-time correspondence as a contrastive random walk. arXiv (2020)"},{"key":"28_CR47","doi-asserted-by":"crossref","unstructured":"Jiang, D., Li, W., Cao, M., Zou, W., Li, X.: Speech simclr: Combining contrastive and reconstruction objective for self-supervised speech representation learning. arXiv preprint arXiv:2010.13991 (2020)","DOI":"10.21437\/Interspeech.2021-391"},{"key":"28_CR48","doi-asserted-by":"crossref","unstructured":"Jonschkowski, R., Stone, A., Barron, J.T., Gordon, A., Konolige, K., Angelova, A.: What matters in unsupervised optical flow. arXiv preprint arXiv:2006.04902 (2020)","DOI":"10.1007\/978-3-030-58536-5_33"},{"key":"28_CR49","unstructured":"Kidron, E., Schechner, Y.Y., Elad, M.: Pixels that sound. In: Computer Vision and Pattern Recognition (CVPR) (2005)"},{"key":"28_CR50","unstructured":"Kingma, D., Ba, J.: Adam: A method for stochastic optimization. International Conference on Learning Representation (2015)"},{"issue":"4","key":"28_CR51","doi-asserted-by":"publisher","first-page":"320","DOI":"10.1109\/TASSP.1976.1162830","volume":"24","author":"C Knapp","year":"1976","unstructured":"Knapp, C., Carter, G.: The generalized correlation method for estimation of time delay. IEEE Trans. Acoust. Speech Signal Process. 24(4), 320\u2013327 (1976)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"28_CR52","unstructured":"Korbar, B., Tran, D., Torresani, L.: Cooperative learning of audio and video models from self-supervised synchronization. In: Advances in Neural Information Processing Systems (2018)"},{"key":"28_CR53","doi-asserted-by":"publisher","first-page":"1164","DOI":"10.3389\/fnins.2019.01164","volume":"13","author":"DP Kumpik","year":"2019","unstructured":"Kumpik, D.P., Campbell, C., Schnupp, J.W., King, A.J.: Re-weighting of sound localization cues by audiovisual training. Front. Neurosci. 13, 1164 (2019)","journal-title":"Front. Neurosci."},{"key":"28_CR54","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"28_CR55","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"28_CR56","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Seeing voices and hearing faces: cross-modal biometric matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8427\u20138436 (2018)","DOI":"10.1109\/CVPR.2018.00879"},{"key":"28_CR57","unstructured":"Van den Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv e-prints pp. arXiv-1807 (2018)"},{"key":"28_CR58","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"639","DOI":"10.1007\/978-3-030-01231-1_39","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Andrew Owens","year":"2018","unstructured":"Owens, Andrew, Efros, Alexei A..: Audio-visual scene analysis with self-supervised multisensory features. In: Ferrari, Vittorio, Hebert, Martial, Sminchisescu, Cristian, Weiss, Yair (eds.) ECCV 2018. LNCS, vol. 11210, pp. 639\u2013658. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_39"},{"key":"28_CR59","doi-asserted-by":"crossref","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Learning sight from sound: Ambient sound provides supervision for visual learning. In: International Journal of Computer Vision (IJCV) (2018)","DOI":"10.1007\/s11263-018-1083-5"},{"key":"28_CR60","doi-asserted-by":"crossref","unstructured":"Pascual, S., Ravanelli, M., Serra, J., Bonafonte, A., Bengio, Y.: Learning problem-agnostic speech representations from multiple self-supervised tasks. arXiv preprint arXiv:1904.03416 (2019)","DOI":"10.21437\/Interspeech.2019-2605"},{"issue":"4","key":"28_CR61","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1109\/MSP.2005.1458287","volume":"22","author":"N Patwari","year":"2005","unstructured":"Patwari, N., Ash, J.N., Kyperountas, S., Hero, A.O., Moses, R.L., Correal, N.S.: Locating the nodes: cooperative localization in wireless sensor networks. IEEE Signal Process. Mag. 22(4), 54\u201369 (2005)","journal-title":"IEEE Signal Process. Mag."},{"key":"28_CR62","doi-asserted-by":"crossref","unstructured":"Pertil\u00e4, P., Parviainen, M.: Time difference of arrival estimation of speech signals using deep neural networks with integrated time-frequency masking. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). pp. 436\u2013440. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682574"},{"issue":"5761","key":"28_CR63","doi-asserted-by":"publisher","first-page":"666","DOI":"10.1126\/science.1122096","volume":"311","author":"R Rajan","year":"2006","unstructured":"Rajan, R., Clement, J.P., Bhalla, U.S.: Rats smell in stereo. Science 311(5761), 666\u2013670 (2006)","journal-title":"Science"},{"key":"28_CR64","doi-asserted-by":"crossref","unstructured":"Rayleigh, L.: Xii. on our perception of sound direction. The London, Edinburgh, and Dublin Philosophical Mag. J. Sci. 13(74), 214\u2013232 (1907)","DOI":"10.1080\/14786440709463595"},{"issue":"2","key":"28_CR65","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1109\/TETCI.2017.2775237","volume":"2","author":"D Salvati","year":"2018","unstructured":"Salvati, D., Drioli, C., Foresti, G.L.: Exploiting cnns for improving acoustic source localization in noisy and reverberant conditions. IEEE Trans. Emerg. Top. Comput. Intell. 2(2), 103\u2013116 (2018)","journal-title":"IEEE Trans. Emerg. Top. Comput. Intell."},{"key":"28_CR66","doi-asserted-by":"crossref","unstructured":"Scheibler, R., Bezzam, E., Dokmani\u0107, I.: Pyroomacoustics: A python package for audio room simulation and array processing algorithms. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 351\u2013355. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461310"},{"issue":"3","key":"28_CR67","doi-asserted-by":"publisher","first-page":"276","DOI":"10.1109\/TAP.1986.1143830","volume":"34","author":"R Schmidt","year":"1986","unstructured":"Schmidt, R.: Multiple emitter location and signal parameter estimation. IEEE Trans. Antennas Propag. 34(3), 276\u2013280 (1986)","journal-title":"IEEE Trans. Antennas Propag."},{"key":"28_CR68","doi-asserted-by":"crossref","unstructured":"Schneider, S., Baevski, A., Collobert, R., Auli, M.: wav2vec: Unsupervised pre-training for speech recognition. arXiv preprint arXiv:1904.05862 (2019)","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"28_CR69","doi-asserted-by":"crossref","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: Facenet: a unified embedding for face recognition and clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 815\u2013823 (2015)","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"28_CR70","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T.H., Kim, J., Yang, M.H., Kweon, I.S.: Learning to localize sound source in visual scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4358\u20134366 (2018)","DOI":"10.1109\/CVPR.2018.00458"},{"key":"28_CR71","doi-asserted-by":"crossref","unstructured":"Spearman, C.: The proof and measurement of association between two things. (1961)","DOI":"10.1037\/11491-005"},{"key":"28_CR72","doi-asserted-by":"crossref","unstructured":"Valverde, F.R., Hurtado, J.V., Valada, A.: There is more than meets the eye: Self-supervised multi-object detection and tracking with sound by distilling multimodal knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11612\u201311621 (2021)","DOI":"10.1109\/CVPR46437.2021.01144"},{"key":"28_CR73","doi-asserted-by":"crossref","unstructured":"Vecchiotti, P., Ma, N., Squartini, S., Brown, G.J.: End-to-end binaural sound localisation from the raw waveform. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 451\u2013455. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8683732"},{"key":"28_CR74","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-01261-8_24","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Carl Vondrick","year":"2018","unstructured":"Vondrick, Carl, Shrivastava, Abhinav, Fathi, Alireza, Guadarrama, Sergio, Murphy, Kevin: Tracking emerges by colorizing videos. In: Ferrari, Vittorio, Hebert, Martial, Sminchisescu, Cristian, Weiss, Yair (eds.) ECCV 2018. LNCS, vol. 11217, pp. 402\u2013419. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01261-8_24"},{"key":"28_CR75","unstructured":"Wang, D., Brown, G.J.: Computational auditory scene analysis: Principles, algorithms, and applications. Wiley-IEEE press (2006)"},{"key":"28_CR76","unstructured":"Wang, L., et al.: Towards learning universal audio representations. arXiv preprint arXiv:2111.12124 (2021)"},{"key":"28_CR77","unstructured":"Wang, L., van den Oord, A.: Multi-format contrastive learning of audio representations. arXiv preprint arXiv:2103.06508 (2021)"},{"key":"28_CR78","doi-asserted-by":"crossref","unstructured":"Wang, X., Jabri, A., Efros, A.A.: Learning correspondence from the cycle-consistency of time. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00267"},{"key":"28_CR79","unstructured":"Wang, Z., Zhao, H., Li, Y.L., Wang, S., Torr, P., Bertinetto, L.: Do different tracking tasks require different appearance models? NeruIPS (2021)"},{"issue":"4","key":"28_CR80","doi-asserted-by":"publisher","first-page":"715","DOI":"10.1162\/089976602317318938","volume":"14","author":"L Wiskott","year":"2002","unstructured":"Wiskott, L., Sejnowski, T.J.: Slow feature analysis: unsupervised learning of invariances. Neural Comput. 14(4), 715\u2013770 (2002)","journal-title":"Neural Comput."},{"key":"28_CR81","doi-asserted-by":"crossref","unstructured":"Wu, Z., Xiong, Y., Yu, S.X., Lin, D.: Unsupervised feature learning via non-parametric instance discrimination. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3733\u20133742 (2018)","DOI":"10.1109\/CVPR.2018.00393"},{"key":"28_CR82","doi-asserted-by":"crossref","unstructured":"Xu, X., Zhou, H., Liu, Z., Dai, B., Wang, X., Lin, D.: Visually informed binaural audio generation without binaural audios. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15485\u201315494 (2021)","DOI":"10.1109\/CVPR46437.2021.01523"},{"issue":"1","key":"28_CR83","doi-asserted-by":"publisher","first-page":"37","DOI":"10.20965\/jrm.2017.p0037","volume":"29","author":"N Yalta","year":"2017","unstructured":"Yalta, N., Nakadai, K., Ogata, T.: Sound source localization using deep learning models. J. Robot. Mechatron. 29(1), 37\u201348 (2017)","journal-title":"J. Robot. Mechatron."},{"key":"28_CR84","doi-asserted-by":"crossref","unstructured":"Yang, K., Russell, B., Salamon, J.: Telling left from right: learning spatial correspondence of sight and sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9932\u20139941 (2020)","DOI":"10.1109\/CVPR42600.2020.00995"},{"key":"28_CR85","doi-asserted-by":"crossref","unstructured":"Yang, M., Chuo, L.X., Suri, K., Liu, L., Zheng, H., Kim, H.S.: ilps: local positioning system with simultaneous localization and wireless communication. In: IEEE INFOCOM 2019-IEEE Conference on Computer Communications, pp. 379\u2013387. IEEE (2019)","DOI":"10.1109\/INFOCOM.2019.8737569"},{"key":"28_CR86","doi-asserted-by":"crossref","unstructured":"Yost, W.A., Dye, R.H., Sheft, S.: A simulated \u201ccocktail party\u201d with up to three sound sources. Perception Psychophys. 58(7), 1026\u20131036 (1996)","DOI":"10.3758\/BF03206830"},{"key":"28_CR87","unstructured":"Zhang, C., Flor\u00eancio, D., Zhang, Z.: Why does phat work well in lownoise, reverberative environments? In: 2008 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 2565\u20132568. IEEE (2008)"},{"key":"28_CR88","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"587","DOI":"10.1007\/978-3-030-01246-5_35","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Hang Zhao","year":"2018","unstructured":"Zhao, Hang, Gan, Chuang, Rouditchenko, Andrew, Vondrick, Carl, McDermott, Josh, Torralba, Antonio: The sound of pixels. In: Ferrari, Vittorio, Hebert, Martial, Sminchisescu, Cristian, Weiss, Yair (eds.) ECCV 2018. LNCS, vol. 11205, pp. 587\u2013604. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_35"},{"key":"28_CR89","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"474","DOI":"10.1007\/978-3-030-58548-8_28","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Xingyi Zhou","year":"2020","unstructured":"Zhou, Xingyi, Koltun, Vladlen, Kr\u00e4henb\u00fchl, Philipp: Tracking objects as points. In: Vedaldi, Andrea, Bischof, Horst, Brox, Thomas, Frahm, Jan-Michael. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 474\u2013490. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_28"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19809-0_28","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T22:17:55Z","timestamp":1728253075000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19809-0_28"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198083","9783031198090"],"references-count":89,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19809-0_28","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"1 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}