{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T11:52:37Z","timestamp":1751370757169,"version":"3.37.3"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2020,9,10]],"date-time":"2020-09-10T00:00:00Z","timestamp":1599696000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,9,10]],"date-time":"2020-09-10T00:00:00Z","timestamp":1599696000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100010418","name":"Institute for Information and communications Technology Promotion","doi-asserted-by":"publisher","award":["2016-0-00564"],"award-info":[{"award-number":["2016-0-00564"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003696","name":"Electronics and Telecommunications Research Institute","doi-asserted-by":"publisher","award":["18ZS1100"],"award-info":[{"award-number":["18ZS1100"]}],"id":[{"id":"10.13039\/501100003696","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,1]]},"DOI":"10.1007\/s11042-020-09636-5","type":"journal-article","created":{"date-parts":[[2020,9,10]],"date-time":"2020-09-10T18:03:52Z","timestamp":1599761032000},"page":"1793-1812","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Scene2Wav: a deep convolutional sequence-to-conditional SampleRNN for emotional scene musicalization"],"prefix":"10.1007","volume":"80","author":[{"given":"Gwenaelle Cunha","family":"Sergio","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0441-7087","authenticated-orcid":false,"given":"Minho","family":"Lee","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,9,10]]},"reference":[{"key":"9636_CR1","doi-asserted-by":"crossref","unstructured":"Bravo F (2012) The influence of music on the emotional interpretation of visual contexts. In: International symposium on computer music modeling and retrieval. Springer, pp 366\u2013377","DOI":"10.1007\/978-3-642-41248-6_20"},{"issue":"1","key":"9636_CR2","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1177\/1745691610393980","volume":"6","author":"M Buhrmester","year":"2011","unstructured":"Buhrmester M, Kwang T, Gosling SD (2011) Amazon\u2019s mechanical turk: a new source of inexpensive, yet high-quality, data? Perspectives on Psychological Science 6(1):3\u20135","journal-title":"Perspectives on Psychological Science"},{"key":"9636_CR3","doi-asserted-by":"crossref","unstructured":"\u00c7evikalp H, Dordinejad GG, Elmas M (2017) Feature extraction with convolutional neural networks for aerial image retrieval. In: Signal processing and communications applications conference (SIU), 2017 25th. IEEE, pp 1\u20134","DOI":"10.1109\/SIU.2017.7960212"},{"issue":"4","key":"9636_CR4","first-page":"63","volume":"20","author":"JD Chang","year":"2010","unstructured":"Chang JD, Yu SS, Chen HH, Tsai CS (2010) Hsv-based color texture image classification using wavelet transform and motif patterns. J Comput 20 (4):63\u201369","journal-title":"J Comput"},{"key":"9636_CR5","doi-asserted-by":"crossref","unstructured":"Cho K, van Merrienboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using rnn encoder\u2013decoder for statistical machine translation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP). http:\/\/www.aclweb.org\/anthology\/D14-1179. Association for Computational Linguistics, Doha, Qatar, pp 1724 \u2013 1734","DOI":"10.3115\/v1\/D14-1179"},{"key":"9636_CR6","doi-asserted-by":"crossref","unstructured":"Cunha Sergio G, Lee M (2020) Emotional video to audio transformation using deep recurrent neural networks and a neuro-fuzzy system. Math Probl Eng 2020","DOI":"10.1155\/2020\/8478527"},{"issue":"1","key":"9636_CR7","doi-asserted-by":"publisher","first-page":"143","DOI":"10.1109\/TMM.2004.840618","volume":"7","author":"A Hanjalic","year":"2005","unstructured":"Hanjalic A, Xu LQ (2005) Affective video content representation and modeling. IEEE Trans Multimed 7(1):143\u2013154","journal-title":"IEEE Trans Multimed"},{"key":"9636_CR8","unstructured":"Heinichen JD (1728) Der general-bass in der composition. Ripol Classic Publishing House"},{"key":"9636_CR9","unstructured":"Ishiguro MA (2010) The affective properties of keys in instrumental music from the late nineteenth and early twentieth centuries. Master\u2019s thesis University of Massachusett Amherst"},{"key":"9636_CR10","doi-asserted-by":"crossref","unstructured":"Jaimovich J, Coghlan N, Knapp RB (2012) Emotion in motion: a study of music and affective response. In: International symposium on computer music modeling and retrieval. Springer, pp 19\u201343","DOI":"10.1007\/978-3-642-41248-6_2"},{"issue":"1","key":"9636_CR11","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1109\/T-AFFC.2011.15","volume":"3","author":"S Koelstra","year":"2011","unstructured":"Koelstra S, Muhl C, Soleymani M, Lee JS, Yazdani A, Ebrahimi T, Pun T, Nijholt A, Patras I (2011) Deap: a database for emotion analysis; using physiological signals. IEEE Trans Affective Comput 3(1):18\u201331","journal-title":"IEEE Trans Affective Comput"},{"key":"9636_CR12","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. In: Advances in neural information processing systems, pp 1097\u20131105"},{"key":"9636_CR13","doi-asserted-by":"crossref","unstructured":"LeCun Y, Haffner P, Bottou L, Bengio Y (1999) Object recognition with gradient-based learning. In: Shape, contour and grouping in computer vision. Springer, pp 319\u2013345","DOI":"10.1007\/3-540-46805-6_19"},{"key":"9636_CR14","doi-asserted-by":"crossref","unstructured":"Manocha P, Finkelstein A, Jin Z, Bryan NJ, Zhang R, Mysore GJ (2020) A differentiable perceptual audio metric learned from just noticeable differences. arXiv:2001.04460","DOI":"10.21437\/Interspeech.2020-1191"},{"key":"9636_CR15","unstructured":"Mehri S, Kumar K, Gulrajani I, Kumar R, Jain S, Sotelo J, Courville A, Bengio Y (2016) Samplernn: an unconditional end-to-end neural audio generation model. arXiv:1612.07837"},{"issue":"4","key":"9636_CR16","doi-asserted-by":"publisher","first-page":"626","DOI":"10.3758\/BF03192732","volume":"37","author":"JA Mikels","year":"2005","unstructured":"Mikels JA, Fredrickson BL, Larkin GR, Lindberg CM, Maglio SJ, Reuter-Lorenz PA (2005) Emotional category data on images from the international affective picture system. Behav Res Methods 37(4):626\u2013630","journal-title":"Behav Res Methods"},{"issue":"2","key":"9636_CR17","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1111\/j.1469-7580.2009.01160.x","volume":"216","author":"GM Morriss-Kay","year":"2010","unstructured":"Morriss-Kay GM (2010) The evolution of human artistic creativity. J Anat 216(2):158\u2013176","journal-title":"J Anat"},{"key":"9636_CR18","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1016\/j.patcog.2017.05.025","volume":"71","author":"L Nanni","year":"2017","unstructured":"Nanni L, Ghidoni S, Brahnam S (2017) Handcrafted vs. non-handcrafted features for computer vision classification. Pattern Recogn 71:158\u2013172","journal-title":"Pattern Recogn"},{"key":"9636_CR19","unstructured":"Neubig G (2017) Neural machine translation and sequence-to-sequence models: a tutorial. arXiv:1703.01619"},{"key":"9636_CR20","unstructured":"Oatley K, Keltner D, Jenkins JM (2006) Understanding emotions. Blackwell Publishing"},{"key":"9636_CR21","unstructured":"Rouzic M (2008) Photosounder. http:\/\/photosounder.com\/"},{"key":"9636_CR22","unstructured":"Savage TM, Vogel KE (2013) An introduction to digital multimedia. Jones & Bartlett Publishers"},{"key":"9636_CR23","unstructured":"Schubart CFD (1806) Christ. Fried. Dan. Schubart\u2019s Ideen zu einer \u00c4sthetik der Tonkunst. Degen"},{"key":"9636_CR24","doi-asserted-by":"crossref","unstructured":"Sergio GC, Lee M (2016) Audio generation from scene considering its emotion aspect. In: International conference on neural information processing. Springer, Kyoto, pp 74\u201381","DOI":"10.1007\/978-3-319-46672-9_9"},{"key":"9636_CR25","doi-asserted-by":"crossref","unstructured":"Sergio GC, Moirangthem DS, Lee M (2018) Temporal hierarchies in sequence to sequence for sentence correction. In: 2018 international joint conference on neural networks (IJCNN). IEEE, pp 1\u20137","DOI":"10.1109\/IJCNN.2018.8489499"},{"issue":"4","key":"9636_CR26","doi-asserted-by":"publisher","first-page":"7666","DOI":"10.1016\/j.eswa.2008.09.042","volume":"36","author":"MK Shan","year":"2009","unstructured":"Shan MK, Kuo FF, Chiang MF, Lee SY (2009) Emotion-based music recommendation by affinity discovery from film music. Expert Systems with Applications 36(4):7666\u20137674","journal-title":"Expert Systems with Applications"},{"issue":"4","key":"9636_CR27","doi-asserted-by":"publisher","first-page":"7666","DOI":"10.1016\/j.eswa.2008.09.042","volume":"36","author":"MK Shan","year":"2009","unstructured":"Shan MK, Kuo FF, Chiang MF, Lee SY (2009) Emotion-based music recommendation by affinity discovery from film music. Expert Systems with Applications 36(4):7666\u20137674","journal-title":"Expert Systems with Applications"},{"key":"9636_CR28","unstructured":"Singh JF (2012) Paint2sound. http:\/\/flexibeatz.weebly.com\/paint2sound.html"},{"issue":"2","key":"9636_CR29","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1109\/T-AFFC.2011.37","volume":"3","author":"M Soleymani","year":"2012","unstructured":"Soleymani M, Pantic M, Pun T (2012) Multimodal emotion recognition in response to videos. IEEE Trans Affective Comput 3(2):211\u2013223","journal-title":"IEEE Trans Affective Comput"},{"key":"9636_CR30","unstructured":"Sotelo J, Mehri S, Kumar K, Santos JF, Kastner K, Courville A, Bengio Y (2017) Char2wav: end-to-end speech synthesis"},{"key":"9636_CR31","unstructured":"Steblin R (2005) A history of key characteristics in the eighteenth and early nineteenth centuries. University of Rochester Press"},{"key":"9636_CR32","unstructured":"Ullrich K, van der Wel E (2017) Music transcription with convolutional sequence-to-sequence models. In: Proceedings of the 18th international society for music information retrieval conference"},{"key":"9636_CR33","unstructured":"Van Den Oord A, Dieleman S, Zen H, Simonyan K, Vinyals O, Graves A, Kalchbrenner N, Senior AW, Kavukcuoglu K (2016) Wavenet: a generative model for raw audio. In: SSW, p 125"},{"key":"9636_CR34","doi-asserted-by":"crossref","unstructured":"van der Zwaag MD, Westerink JH, van den Broek EL (2009) Deploying music characteristics for an affective music player. In: 3rd international conference on affective computing and intelligent interaction and workshops, 2009. ACII 2009. IEEE, pp 1\u20137","DOI":"10.1109\/ACII.2009.5349387"},{"key":"9636_CR35","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney R, Darrell T, Saenko K (2015) Sequence to sequence-video to text. In: Proceedings of the IEEE international conference on computer vision, pp 4534\u20134542","DOI":"10.1109\/ICCV.2015.515"},{"issue":"6","key":"9636_CR36","doi-asserted-by":"publisher","first-page":"689","DOI":"10.1109\/TCSVT.2006.873781","volume":"16","author":"HL Wang","year":"2006","unstructured":"Wang HL, Cheong LF (2006) Affective understanding in film. IEEE Trans Circ Syst Video Technol 16(6):689\u2013704","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"9636_CR37","unstructured":"van der Wel E, Ullrich K (2017) Optical music recognition with convolutional sequence-to-sequence models. arXiv:1707.04877"},{"key":"9636_CR38","unstructured":"White D (2011) Sonicphoto. http:\/\/www.skytopia.com\/software\/sonicphoto\/"},{"key":"9636_CR39","doi-asserted-by":"crossref","unstructured":"Yang X, Fan Y (2018) Feature extraction using convolutional neural networks for multi-atlas based image segmentation. In: Medical imaging 2018: image processing. International Society for Optics and Photonics, vol 10574, p 1057439","DOI":"10.1117\/12.2293876"},{"key":"9636_CR40","doi-asserted-by":"crossref","unstructured":"Yanulevskaya V, van Gemert JC, Roth K, Herbold AK, Sebe N, Geusebroek JM (2008) Emotional valence categorization using holistic image features. In: 15th IEEE international conference on image processing, 2008. ICIP 2008. IEEE, pp 101\u2013104","DOI":"10.1109\/ICIP.2008.4711701"},{"key":"9636_CR41","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1016\/j.neucom.2011.12.034","volume":"86","author":"Q Zhang","year":"2012","unstructured":"Zhang Q, Jeong S, Lee M (2012) Autonomous emotion development using incremental modified adaptive neuro-fuzzy inference system. Neurocomputing 86:33\u201344","journal-title":"Neurocomputing"},{"issue":"1","key":"9636_CR42","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1016\/j.cogsys.2010.12.012","volume":"14","author":"Q Zhang","year":"2012","unstructured":"Zhang Q, Lee M (2012) Emotion development system by interacting with human eeg and natural scene understanding. Cogn Syst Res 14(1):37\u201349","journal-title":"Cogn Syst Res"},{"key":"9636_CR43","doi-asserted-by":"crossref","unstructured":"Zhao S, Yao H, Wang F, Jiang X, Zhang W (2014) Emotion based image musicalization. In: 2014 IEEE international conference on multimedia and expo workshops (ICMEW). IEEE, pp 1\u20136","DOI":"10.1109\/ICMEW.2014.6890565"},{"key":"9636_CR44","doi-asserted-by":"crossref","unstructured":"Zhiqiang W, Jun L (2017) A review of object detection based on convolutional neural network. In: Control conference (CCC), 2017 36th chinese. IEEE, pp 11104\u201311109","DOI":"10.23919\/ChiCC.2017.8029130"},{"key":"9636_CR45","doi-asserted-by":"crossref","unstructured":"Zhou C, Horgan M, Kumar V, Vasco C, Darcy D (2018) Voice conversion with conditional samplernn. arXiv:1808.08311","DOI":"10.21437\/Interspeech.2018-1121"},{"issue":"1","key":"9636_CR46","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1186\/s13640-017-0194-1","volume":"2017","author":"A Zlatintsi","year":"2017","unstructured":"Zlatintsi A, Koutras P, Evangelopoulos G, Malandrakis N, Efthymiou N, Pastra K, Potamianos A, Maragos P (2017) Cognimuse: a multimodal video database annotated with saliency, events, semantics and emotion with application to summarization. EURASIP Journal on Image and Video Processing 2017 (1):54","journal-title":"EURASIP Journal on Image and Video Processing"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09636-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-020-09636-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09636-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,9,10]],"date-time":"2021-09-10T00:29:27Z","timestamp":1631233767000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-020-09636-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,9,10]]},"references-count":46,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2021,1]]}},"alternative-id":["9636"],"URL":"https:\/\/doi.org\/10.1007\/s11042-020-09636-5","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2020,9,10]]},"assertion":[{"value":"30 September 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 July 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 August 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 September 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}