{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T00:43:58Z","timestamp":1774399438770,"version":"3.50.1"},"publisher-location":"Cham","reference-count":18,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030110178","type":"print"},{"value":"9783030110185","type":"electronic"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-11018-5_62","type":"book-chapter","created":{"date-parts":[[2019,1,24]],"date-time":"2019-01-24T05:50:50Z","timestamp":1548309050000},"page":"711-716","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":29,"title":["Cross-modal Embeddings for Video and Audio Retrieval"],"prefix":"10.1007","author":[{"given":"Didac","family":"Sur\u00eds","sequence":"first","affiliation":[]},{"given":"Amanda","family":"Duarte","sequence":"additional","affiliation":[]},{"given":"Amaia","family":"Salvador","sequence":"additional","affiliation":[]},{"given":"Jordi","family":"Torres","sequence":"additional","affiliation":[]},{"given":"Xavier","family":"Gir\u00f3-i-Nieto","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,1,23]]},"reference":[{"key":"62_CR1","unstructured":"Abu-El-Haija, S., et al.: YouTube-8M: a large-scale video classification Benchmark. CoRR abs\/1609.08675 (2016). http:\/\/arxiv.org\/abs\/1609.08675"},{"key":"62_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/978-3-319-04114-8_26","volume-title":"MultiMedia Modeling","author":"E Acar","year":"2014","unstructured":"Acar, E., Hopfgartner, F., Albayrak, S.: Understanding affective content of music videos through learned representations. In: Gurrin, C., Hopfgartner, F., Hurst, W., Johansen, H., Lee, H., O\u2019Connor, N. (eds.) MMM 2014. LNCS, vol. 8325, pp. 303\u2013314. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-04114-8_26"},{"key":"62_CR3","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: See, hear, and read: deep aligned representations. arXiv preprint arXiv:1706.00932 (2017)"},{"key":"62_CR4","unstructured":"Brochu, E., De Freitas, N., Bao, K.: The sound of an album cover: probabilistic multimedia and information retrieval. In: Artificial Intelligence and Statistics (AISTATS) (2003)"},{"key":"62_CR5","unstructured":"Chao, J., Wang, H., Zhou, W., Zhang, W., Yu, Y.: TuneSensor: a semantic-driven music recommendation service for digital photo albums. In: 10th International Semantic Web Conference (2011)"},{"key":"62_CR6","unstructured":"Frome, A., et al.: DeViSE: a deep visual-semantic embedding model. In: Neural Information Processing Systems (2013)"},{"issue":"3","key":"62_CR7","doi-asserted-by":"publisher","first-page":"347","DOI":"10.1109\/TCSVT.2007.890831","volume":"17","author":"O Gillet","year":"2007","unstructured":"Gillet, O., Essid, S., Richard, G.: On the correlation of automatic audio and visual segmentations of music videos. IEEE Trans. Circuits Syst. Video Technol. 17(3), 347\u2013355 (2007)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"62_CR8","unstructured":"Hong, S., Im, W., Yang, H.S.: Deep learning for content-based, cross-modal retrieval of videos and music. CoRR abs\/1704.06761 (2017)"},{"key":"62_CR9","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. CoRR abs\/1411.2539 (2014)"},{"key":"62_CR10","doi-asserted-by":"crossref","unstructured":"Li, D., Dimitrova, N., Li, M., Sethi, I.K.: Multimedia content processing through cross-modal association. In: Proceedings of the Eleventh ACM International Conference on Multimedia, pp. 604\u2013611. ACM (2003)","DOI":"10.1145\/957013.957143"},{"issue":"4","key":"62_CR11","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1109\/MMUL.2011.1","volume":"18","author":"J Libeks","year":"2011","unstructured":"Libeks, J., Turnbull, D.: You can judge an artist by an album cover: using images for music annotation. IEEE MultiMedia 18(4), 30\u201337 (2011)","journal-title":"IEEE MultiMedia"},{"key":"62_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"357","DOI":"10.1007\/978-3-642-21566-7_36","volume-title":"Advances in Self-Organizing Maps","author":"R Mayer","year":"2011","unstructured":"Mayer, R.: Analysing the similarity of album art with self-organising maps. In: Laaksonen, J., Honkela, T. (eds.) WSOM 2011. LNCS, vol. 6731, pp. 357\u2013366. Springer, Heidelberg (2011). https:\/\/doi.org\/10.1007\/978-3-642-21566-7_36"},{"key":"62_CR13","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: Proceedings of the 28th International Conference on Machine Learning, pp. 689\u2013696 (2011)"},{"key":"62_CR14","doi-asserted-by":"crossref","unstructured":"Salvador, A., et al.: Learning cross-modal embeddings for cooking recipes and food images. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.327"},{"key":"62_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1007\/978-3-319-16354-3_8","volume-title":"Advances in Information Retrieval","author":"A Schindler","year":"2015","unstructured":"Schindler, A., Rauber, A.: An audio-visual approach to music genre classification through affective color features. In: Hanbury, A., Kazai, G., Rauber, A., Fuhr, N. (eds.) ECIR 2015. LNCS, vol. 9022, pp. 61\u201367. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-16354-3_8"},{"key":"62_CR16","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. CoRR abs\/1511.06078 (2015). http:\/\/arxiv.org\/abs\/1511.06078"},{"issue":"7","key":"62_CR17","doi-asserted-by":"publisher","first-page":"1305","DOI":"10.1109\/TMM.2016.2557722","volume":"18","author":"X Wu","year":"2016","unstructured":"Wu, X., Qiao, Y., Wang, X., Tang, X.: Bridging music and image via cross-modal ranking analysis. IEEE Trans. Multimedia 18(7), 1305\u20131318 (2016)","journal-title":"IEEE Trans. Multimedia"},{"key":"62_CR18","doi-asserted-by":"crossref","unstructured":"Zhang, H., Zhuang, Y., Wu, F.: Cross-modal correlation learning for clustering on image-audio dataset. In: 15th ACM International Conference on Multimedia, pp. 273\u2013276. ACM (2007)","DOI":"10.1145\/1291233.1291290"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-11018-5_62","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,22]],"date-time":"2023-01-22T01:27:59Z","timestamp":1674350879000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-11018-5_62"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030110178","9783030110185"],"references-count":18,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-11018-5_62","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"23 January 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}