{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:21:12Z","timestamp":1772907672499,"version":"3.50.1"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030687793","type":"print"},{"value":"9783030687809","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-68780-9_26","type":"book-chapter","created":{"date-parts":[[2021,2,24]],"date-time":"2021-02-24T17:04:13Z","timestamp":1614186253000},"page":"301-313","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Cross-modal Deep Learning Applications: Audio-Visual Retrieval"],"prefix":"10.1007","author":[{"given":"Cong","family":"Jin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tian","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shouxun","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yun","family":"Tie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Lv","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianguang","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wencai","family":"Yan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ming","family":"Yan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qian","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yicong","family":"Guan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenggougou","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,2,25]]},"reference":[{"issue":"3","key":"26_CR1","doi-asserted-by":"publisher","first-page":"317","DOI":"10.1080\/10350339909360442","volume":"9","author":"KL O\u2019Halloran","year":"1999","unstructured":"O\u2019Halloran, K.L.: Interdependence, interaction and metaphor in multi-semiotic texts. Soc. Semiotics 9(3), 317 (1999)","journal-title":"Soc. Semiotics"},{"key":"26_CR2","unstructured":"Morency, L.P., Baltrusaitis, T.: Tutorial on multimodal machine learning, Language Technologies Institute (2019). https:\/\/www.cs.cmu.edu\/morency\/MMMLTutorial-ACL2017.pdf"},{"key":"26_CR3","doi-asserted-by":"publisher","first-page":"104394","DOI":"10.1109\/ACCESS.2019.2931449","volume":"7","author":"M Yan","year":"2019","unstructured":"Yan, M., Chan, C.A., Li, W., Lei, L., Gygax, A.F., Chih-Lin, I.: Assessing the energy consumption of proactive mobile edge caching in wireless networks. IEEE Access 7, 104394\u2013104404 (2019)","journal-title":"IEEE Access"},{"key":"26_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"299","DOI":"10.1007\/978-3-319-14442-9_33","volume-title":"MultiMedia Modeling","author":"S Sasaki","year":"2015","unstructured":"Sasaki, S., Hirai, T., Ohya, H., Morishima, S.: Affective music recommendation system based on the mood of input video. In: He, X., Luo, S., Tao, D., Xu, C., Yang, J., Hasan, M.A. (eds.) MMM 2015. LNCS, vol. 8936, pp. 299\u2013302. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-14442-9_33"},{"key":"26_CR5","first-page":"210","volume":"41","author":"W Liwei","year":"2018","unstructured":"Liwei, W., Yin, L., Jing, H., et al.: Learning two branch neural networks for image-text matching tasks. IEEE Trans. Pattern Anal. Mach. Intell. 41, 210\u2013223 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"26_CR6","unstructured":"Lee, K.-H., Xi, C., Gang, H., et al.: Stacked Cross Attention for Image-Text Matching, arXiv preprint arXiv:1803.08024 (2018)"},{"issue":"3","key":"26_CR7","doi-asserted-by":"publisher","first-page":"1893","DOI":"10.1007\/s11063-020-10241-8","volume":"52","author":"C Jin","year":"2020","unstructured":"Jin, C., Tie, Y., Bai, Y., Lv, X., Liu, S.: A style-specific music composition neural network. Neural Process. Lett. 52(3), 1893\u20131912 (2020). https:\/\/doi.org\/10.1007\/s11063-020-10241-8","journal-title":"Neural Process. Lett."},{"key":"26_CR8","unstructured":"Andrej, K., Armand, J., Li, F.: Deep fragment embeddings for bidirectional image sentence mapping. In: NeurIPS, pp. 1889\u20131897 (2014)"},{"key":"26_CR9","doi-asserted-by":"crossref","unstructured":"Peng, Z., Li, Z., Zhang, J., Li, Y., Qi, G.J., Tang, J.: Few-shot image recognition with knowledge transfer. In: Proceedings of ICCV, pp. 441\u2013449 (2019)","DOI":"10.1109\/ICCV.2019.00053"},{"key":"26_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"26_CR11","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., et al.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"issue":"1","key":"26_CR12","doi-asserted-by":"publisher","first-page":"276","DOI":"10.1109\/TIP.2016.2624140","volume":"26","author":"Z Li","year":"2016","unstructured":"Li, Z., Tang, J.: Weakly supervised deep matrix factorization for social image understanding. IEEE Trans. Image Process. 26(1), 276\u2013288 (2016)","journal-title":"IEEE Trans. Image Process."},{"key":"26_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/978-3-319-04114-8_26","volume-title":"MultiMedia Modeling","author":"E Acar","year":"2014","unstructured":"Acar, E., Hopfgartner, F., Albayrak, S.: Understanding affective content of music videos through learned representations. In: Gurrin, C., Hopfgartner, F., Hurst, W., Johansen, H., Lee, H., O\u2019Connor, N. (eds.) MMM 2014. LNCS, vol. 8325, pp. 303\u2013314. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-04114-8_26"},{"key":"26_CR14","doi-asserted-by":"crossref","unstructured":"Xu, Y., Kong, Q., Huang, Q., Wang, W., Plumbley, M.D.: Attention and localization based on a deep convolutional recurrent model for weakly supervised audio tagging, arXiv preprint arXiv:1703.06052 (2017)","DOI":"10.21437\/Interspeech.2017-486"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., et al.: Going deeper with convolutions. In: Proceedings of CVPR, pp. 1\u20139 (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"issue":"10","key":"26_CR16","doi-asserted-by":"publisher","first-page":"2085","DOI":"10.1109\/TPAMI.2015.2400461","volume":"37","author":"Z Li","year":"2015","unstructured":"Li, Z., Liu, J., Tang, J., Lu, H.: Robust structured subspace learning for data representation. IEEE Trans. Pattern Anal. Mach. Intell. 37(10), 2085\u20132098 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"11","key":"26_CR17","doi-asserted-by":"publisher","first-page":"1989","DOI":"10.1109\/TMM.2015.2477035","volume":"17","author":"Z Li","year":"2015","unstructured":"Li, Z., Tang, J.: Weakly supervised deep metric learning for community-contributed image retrieval. IEEE Trans. Multimedia 17(11), 1989\u20131999 (2015)","journal-title":"IEEE Trans. Multimedia"},{"key":"26_CR18","doi-asserted-by":"crossref","unstructured":"Song, K., Nie, F., Han, J., Li, X.: Parameter free large margin nearest neighbor for distance metric learning. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.10861"},{"key":"26_CR19","doi-asserted-by":"crossref","unstructured":"Wang, X., Han, X., Huang, W., Dong, D., Scott, M.R.: Multi-similarity loss with general pair weighting for deep metric learning. In: Proceedings of CVPR, pp. 5022\u20135030 (2019)","DOI":"10.1109\/CVPR.2019.00516"},{"key":"26_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"272","DOI":"10.1007\/978-3-030-01231-1_17","volume-title":"Computer Vision \u2013 ECCV 2018","author":"W Ge","year":"2018","unstructured":"Ge, W., Huang, W., Dong, D., Scott, M.R.: Deep metric learning with hierarchical triplet loss. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 272\u2013288. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_17"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, Z., Fang, C., et al.: Visual to sound: generating natural sound for videos in the wild. In: Proceedings of the CVPR, pp. 3550\u20133558 (2018)","DOI":"10.1109\/CVPR.2018.00374"},{"issue":"4","key":"26_CR22","first-page":"925","volume":"42","author":"L Canyi","year":"2019","unstructured":"Canyi, L., Jiashi, F., Yudong, C., et al.: Tensor robust principal component analysis with a new tensor nuclear norm. IEEE Trans. Pattern Anal. Mach. Intell. 42(4), 925\u2013938 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"26_CR23","unstructured":"Wegelin, J.A., et al.: A survey of Partial Least Squares (PLS) methods, with emphasis on the two-block case. University of Washington, Technical report (2000)"},{"issue":"12","key":"26_CR24","doi-asserted-by":"publisher","first-page":"2639","DOI":"10.1162\/0899766042321814","volume":"16","author":"DR Hardoon","year":"2004","unstructured":"Hardoon, D.R., Szedmak, S., Shawe-Taylor, J.: Canonical correlation analysis: an overview with application to learning methods. Neural comput. 16(12), 2639\u20132664 (2004)","journal-title":"Neural comput."},{"key":"26_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"711","DOI":"10.1007\/978-3-030-11018-5_62","volume-title":"Computer Vision \u2013 ECCV 2018 Workshops","author":"D Sur\u00eds","year":"2019","unstructured":"Sur\u00eds, D., Duarte, A., Salvador, A., Torres, J., Gir\u00f3-i-Nieto, X.: Cross-modal embeddings for video and audio retrieval. In: Leal-Taix\u00e9, L., Roth, S. (eds.) ECCV 2018. LNCS, vol. 11132, pp. 711\u2013716. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-11018-5_62"},{"key":"26_CR26","unstructured":"Andrew, G., Arora, R., Bilmes, J., Livescu, K.: Deep canonical correlation analysis. In: Proceedings of ICML, pp. 1247\u20131255 (2013)"},{"issue":"1","key":"26_CR27","first-page":"2030","volume":"17","author":"Y Ganin","year":"2016","unstructured":"Ganin, Y., et al.: Domain-adversarial training of neural networks. J. Mach. Learn. Res. 17(1), 2030\u20132096 (2016)","journal-title":"J. Mach. Learn. Res."}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition. ICPR International Workshops and Challenges"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-68780-9_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,18]],"date-time":"2022-12-18T21:44:46Z","timestamp":1671399886000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-68780-9_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030687793","9783030687809"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-68780-9_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"25 February 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 January 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ICPR2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.icpr2020.it\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}