{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,26]],"date-time":"2025-06-26T05:03:27Z","timestamp":1750914207779},"reference-count":29,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,5,4]],"date-time":"2023-05-04T00:00:00Z","timestamp":1683158400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,5,4]],"date-time":"2023-05-04T00:00:00Z","timestamp":1683158400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s13735-023-00276-7","type":"journal-article","created":{"date-parts":[[2023,5,4]],"date-time":"2023-05-04T13:02:17Z","timestamp":1683205337000},"update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Maximizing mutual information inside intra- and inter-modality for audio-visual event retrieval"],"prefix":"10.1007","volume":"12","author":[{"given":"Ruochen","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nannan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenmin","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,5,4]]},"reference":[{"key":"276_CR1","unstructured":"Wang K, Yin Q, Wang W, Wu S, Wang L (2016) A comprehensive survey on cross-modal retrieval. arXiv preprint arXiv:1607.06215"},{"issue":"2","key":"276_CR2","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1007\/s13735-019-00190-x","volume":"9","author":"M Suresha","year":"2020","unstructured":"Suresha M, Kuppa S, Raghukumar D (2020) A study on deep learning spatiotemporal models and feature extraction techniques for video understanding. Int J Multimed Inf Retr 9(2):81\u2013101","journal-title":"Int J Multimed Inf Retr"},{"key":"276_CR3","doi-asserted-by":"crossref","unstructured":"Feng F, Wang X, Li R (2014) Cross-modal retrieval with correspondence autoencoder. In: Proceedings of the 22nd ACM international conference on multimedia, pp 7\u201316","DOI":"10.1145\/2647868.2654902"},{"key":"276_CR4","doi-asserted-by":"crossref","unstructured":"Wang H, Zhang Y, Yu X (2020) An overview of image caption generation methods. Comput Intell Neurosci 2020","DOI":"10.1155\/2020\/3062706"},{"key":"276_CR5","doi-asserted-by":"crossref","unstructured":"Wang L, Shang C, Qiu H, Zhao T, Qiu B, Li H (2020) Multi-stage tag guidance network in video caption. In: Proceedings of the 28th ACM international conference on multimedia, pp 4610\u20134614","DOI":"10.1145\/3394171.3416288"},{"issue":"1","key":"276_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s13735-021-00225-2","volume":"11","author":"S Heller","year":"2022","unstructured":"Heller S, Gsteiger V, Bailer W, Gurrin C, J\u00f3nsson B\u00de, Loko\u010d J, Leibetseder A, Mejzl\u00edk F, Pe\u0161ka L, Rossetto L et al (2022) Interactive video retrieval evaluation at a distance: comparing sixteen interactive video search systems in a remote setting at the 10th video browser showdown. Int J Multimed Inf Retr 11(1):1\u201318","journal-title":"Int J Multimed Inf Retr"},{"key":"276_CR7","doi-asserted-by":"crossref","unstructured":"Nagrani A, Albanie S, Zisserman A (2018) Seeing voices and hearing faces: cross-modal biometric matching. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8427\u20138436","DOI":"10.1109\/CVPR.2018.00879"},{"key":"276_CR8","doi-asserted-by":"crossref","unstructured":"Nagrani A, Albanie S, Zisserman A (2018) Learnable pins: cross-modal embeddings for person identity. In: Proceedings of the European conference on computer vision (ECCV), pp 71\u201388","DOI":"10.1007\/978-3-030-01261-8_5"},{"key":"276_CR9","doi-asserted-by":"crossref","unstructured":"Wen P, Xu Q, Jiang Y, Yang Z, He Y, Huang Q (2021) Seeking the shape of sound: an adaptive framework for learning voice-face association. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16347\u201316356","DOI":"10.1109\/CVPR46437.2021.01608"},{"key":"276_CR10","doi-asserted-by":"publisher","first-page":"1763","DOI":"10.1109\/TMM.2021.3071243","volume":"24","author":"H Ning","year":"2021","unstructured":"Ning H, Zheng X, Lu X, Yuan Y (2021) Disentangled representation learning for cross-modal biometric matching. IEEE Trans Multimed 24:1763\u20131774","journal-title":"IEEE Trans Multimed"},{"key":"276_CR11","doi-asserted-by":"crossref","unstructured":"Saeed MS, Khan MH, Nawaz S, Yousaf MH, Del\u00a0Bue A (2022) Fusion and orthogonal projection for improved face-voice association. In: ICASSP 2022-2022 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE, pp 7057\u20137061","DOI":"10.1109\/ICASSP43922.2022.9747704"},{"key":"276_CR12","doi-asserted-by":"crossref","unstructured":"Arandjelovic R, Zisserman A (2017) Look, listen and learn. In: Proceedings of the IEEE international conference on computer vision, pp 609\u2013617","DOI":"10.1109\/ICCV.2017.73"},{"key":"276_CR13","unstructured":"Hong S, Im W, Yang HS (2017) Content-based video-music retrieval using soft intra-modal structure constraint. arXiv preprint arXiv:1704.06761"},{"key":"276_CR14","doi-asserted-by":"crossref","unstructured":"Arandjelovic R, Zisserman A (2018) Objects that sound. In: Proceedings of the European conference on computer vision (ECCV), pp 435\u2013451","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"276_CR15","doi-asserted-by":"crossref","unstructured":"Zhu Y, Wu Y, Latapie H, Yang Y, Yan Y (2021) Learning audio-visual correlations from variational cross-modal generation. In: ICASSP 2021-2021 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE, pp 4300\u20134304","DOI":"10.1109\/ICASSP39728.2021.9414296"},{"key":"276_CR16","doi-asserted-by":"crossref","unstructured":"Chung JS, Huh J, Mun S, Lee M, Heo HS, Choe S, Ham C, Jung S, Lee B-J, Han I (2020) In defence of metric learning for speaker recognition. arXiv preprint arXiv:2003.11982","DOI":"10.21437\/Interspeech.2020-1064"},{"key":"276_CR17","doi-asserted-by":"crossref","unstructured":"Li J, Jing M, Zhu L, Ding Z, Lu K, YangY (2020) Learning modality-invariant latent representations for generalized zero-shot learning. In: Proceedings of the 28th ACM international conference on multimedia, pp 1348\u20131356","DOI":"10.1145\/3394171.3413503"},{"key":"276_CR18","doi-asserted-by":"crossref","unstructured":"Hershey S, Chaudhuri S, Ellis DP, Gemmeke JF, Jansen A, Moore RC, Plakal M, Platt D, Saurous RA, Seybold B et al (2017) CNN architectures for large-scale audio classification. In: 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE, pp 131\u2013135","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"276_CR19","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"5588","key":"276_CR20","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1038\/264746a0","volume":"264","author":"H McGurk","year":"1976","unstructured":"McGurk H, MacDonald J (1976) Hearing lips and seeing voices. Nature 264(5588):746\u2013748","journal-title":"Nature"},{"issue":"3","key":"276_CR21","doi-asserted-by":"publisher","first-page":"868","DOI":"10.3758\/s13414-015-1045-8","volume":"78","author":"HM Smith","year":"2016","unstructured":"Smith HM, Dunn AK, Baguley T, Stacey PC (2016) Matching novel face and voice identity using static and dynamic facial images. Atten Percept Psychophys 78(3):868\u2013879","journal-title":"Atten Percept Psychophys"},{"key":"276_CR22","doi-asserted-by":"crossref","unstructured":"Kim C, Shin HV, Oh T-H, Kaspar A, Elgharib M, Matusik W (2018) On learning associations of faces and voices. In: Asian conference on computer vision. Springer, pp 276\u2013292","DOI":"10.1007\/978-3-030-20873-8_18"},{"key":"276_CR23","doi-asserted-by":"publisher","first-page":"1538","DOI":"10.3389\/fpsyg.2018.01538","volume":"9","author":"K Aslaksen","year":"2018","unstructured":"Aslaksen K, Lor\u00e5s H (2018) The modality-specific learning style hypothesis: a mini-review. Front psychol 9:1538","journal-title":"Front psychol"},{"key":"276_CR24","doi-asserted-by":"crossref","unstructured":"Wang Y, Peng Y (2021) Mars: learning modality-agnostic representation for scalable cross-media retrieval. IEEE Transactions on Circuits and Systems for Video Technology","DOI":"10.1109\/TCSVT.2021.3136330"},{"key":"276_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107335","volume":"104","author":"F Wu","year":"2020","unstructured":"Wu F, Jing X-Y, Wu Z, Ji Y, Dong X, Luo X, Huang Q, Wang R (2020) Modality-specific and shared generative adversarial network for cross-modal retrieval. Pattern Recognit 104:107335","journal-title":"Pattern Recognit"},{"key":"276_CR26","unstructured":"Chen X, Kingma DP, Salimans T, Duan Y, Dhariwal P, Schulman J, Sutskever I, Abbeel P (2016) Variational lossy autoencoder. arXiv preprint arXiv:1611.02731"},{"key":"276_CR27","doi-asserted-by":"crossref","unstructured":"Tian Y, Shi J, Li B, Duan Z, Xu C (2018) Audio-visual event localization in unconstrained videos. In: Proceedings of the European conference on computer vision (ECCV), pp 247\u2013263","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"276_CR28","unstructured":"Takida Y, Liao W-H, Uesaka T, Takahashi S, Mitsufuji Y (2021) Preventing posterior collapse induced by oversmoothing in gaussian VAE. arXiv preprint arXiv:2102.08663"},{"key":"276_CR29","doi-asserted-by":"crossref","unstructured":"Bowman SR, Vilnis L, Vinyals O, Dai AM, Jozefowicz R, Bengio S (2015) Generating sentences from a continuous space. arXiv preprint arXiv:1511.06349","DOI":"10.18653\/v1\/K16-1002"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00276-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-023-00276-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00276-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,14]],"date-time":"2023-06-14T15:28:55Z","timestamp":1686756535000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-023-00276-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,4]]},"references-count":29,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["276"],"URL":"https:\/\/doi.org\/10.1007\/s13735-023-00276-7","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,5,4]]},"assertion":[{"value":"25 July 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 October 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 December 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 May 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"10"}}