{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T18:18:06Z","timestamp":1769192286871,"version":"3.49.0"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100004608","name":"Natural Science Foundation of Jiangsu Province","doi-asserted-by":"crossref","award":["BK20171345"],"award-info":[{"award-number":["BK20171345"]}],"id":[{"id":"10.13039\/501100004608","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61003113"],"award-info":[{"award-number":["61003113"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61321491"],"award-info":[{"award-number":["61321491"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61672273"],"award-info":[{"award-number":["61672273"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1007\/s13735-019-00186-7","type":"journal-article","created":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T05:36:18Z","timestamp":1577856978000},"page":"3-16","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Hierarchical attentive deep neural networks for semantic music annotation through multiple music representations"],"prefix":"10.1007","volume":"9","author":[{"given":"Qianqian","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8426-9634","authenticated-orcid":false,"given":"Feng","family":"Su","sequence":"additional","affiliation":[]},{"given":"Yuyang","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,1,1]]},"reference":[{"issue":"2\u20133","key":"186_CR1","doi-asserted-by":"publisher","first-page":"473","DOI":"10.1007\/s10994-006-9019-7","volume":"65","author":"J Bergstra","year":"2006","unstructured":"Bergstra J, Casagrande N, Erhan D, Eck D, K\u00e9gl B (2006) Aggregate features and adaboost for music classification. Mach Learn 65(2\u20133):473\u2013484","journal-title":"Mach Learn"},{"issue":"2","key":"186_CR2","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1080\/09298210802479250","volume":"37","author":"T Bertin-Mahieux","year":"2008","unstructured":"Bertin-Mahieux T, Eck D, Maillet F, Lamere P (2008) Autotagger: a model for predicting social tags from acoustic features on large music databases. J New Music Res 37(2):115\u2013135","journal-title":"J New Music Res"},{"key":"186_CR3","unstructured":"Chang KK, Jang JSR, Iliopoulos CS (2010) Music genre classification via compressive sampling. In: Proceedings of the 11th conference of the international society for music information retrieval (ISMIR), pp 387\u2013392"},{"issue":"8","key":"186_CR4","doi-asserted-by":"publisher","first-page":"1547","DOI":"10.1109\/TASL.2009.2022435","volume":"17","author":"ZS Chen","year":"2009","unstructured":"Chen ZS, Jang JSR (2009) On the use of anti-word models for audio music annotation and retrieval. IEEE Trans Audio Speech Lang Process 17(8):1547\u20131556","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"186_CR5","unstructured":"Choi K, Fazekas G, Sandler M (2016) Automatic tagging using deep convolutional neural networks. In: Proceedings of the 17th conference of the international society for music information retrieval (ISMIR)"},{"key":"186_CR6","doi-asserted-by":"crossref","unstructured":"Choi K, Fazekas G, Sandler M, Cho K (2017) Convolutional recurrent neural networks for music classification. In: 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 2392\u20132396","DOI":"10.1109\/ICASSP.2017.7952585"},{"key":"186_CR7","unstructured":"Dauphin YN, Fan A, Auli M, Grangier D (2016) Language modeling with gated convolutional networks. Preprint arXiv:1612.08083"},{"key":"186_CR8","unstructured":"Dieleman S, Schrauwen B (2013) Multiscale approaches to music audio feature learning. In: Proceedings of the 14th conference of the international society for music information retrieval (ISMIR), pp 116\u2013121"},{"key":"186_CR9","doi-asserted-by":"crossref","unstructured":"Dieleman S, Schrauwen B (2014) End-to-end learning for music audio. In: 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 6964\u20136968","DOI":"10.1109\/ICASSP.2014.6854950"},{"issue":"10","key":"186_CR10","doi-asserted-by":"publisher","first-page":"2451","DOI":"10.1162\/089976600300015015","volume":"12","author":"FA Gers","year":"2000","unstructured":"Gers FA, Schmidhuber J, Cummins F (2000) Learning to forget: continual prediction with LSTM. Neural Comput 12(10):2451\u20132471","journal-title":"Neural Comput"},{"key":"186_CR11","unstructured":"Grosse R, Raina R, Kwong H, Ng AY (2012) Shift-invariance sparse coding for audio classification. Preprint arXiv:1206.5241"},{"key":"186_CR12","unstructured":"G\u00fc\u00e7l\u00fc U, Thielen J, Hanke M, van Gerven MAJ (2016) Brains on beats. In: 30th conference on neural information processing systems (NIPS 2016), pp 2101\u20132109"},{"key":"186_CR13","unstructured":"Hamel P, Lemieux S, Bengio Y, Eck D (2011) Temporal pooling and multiscale learning for automatic annotation and ranking of music audio. In: Proceedings of the 12th conference of the international society for music information retrieval (ISMIR), pp 729\u2013734"},{"key":"186_CR14","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: 2016 IEEE conference on computer vision and pattern recognition (CVPR), pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"186_CR15","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: accelerating deep network training by reducing internal covariate shift. In: Proceedings of the 32nd international conference on machine learning, pp 448\u2013456"},{"key":"186_CR16","doi-asserted-by":"crossref","unstructured":"Kim T, Lee J, Nam J (2018) Sample-level CNN architectures for music auto-tagging using raw waveforms. In: 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 366\u2013370","DOI":"10.1109\/ICASSP.2018.8462046"},{"key":"186_CR17","unstructured":"Kingma DP, Ba JL (2015) Adam: a method for stochastic optimization. In: Proceedings of the 3rd international conference for learning representations"},{"key":"186_CR18","unstructured":"Law E, West K, Mandel M, Bay M, Downie JS (2009) Evaluation of algorithms using games: the case of music tagging. In: Proceedings of the 10th conference of the international society for music information retrieval (ISMIR), pp 387\u2013392"},{"key":"186_CR19","doi-asserted-by":"publisher","first-page":"1208","DOI":"10.1109\/LSP.2017.2713830","volume":"24","author":"J Lee","year":"2017","unstructured":"Lee J, Nam J (2017) Multi-level and multi-scale feature aggregation using pretrained convolutional neural networks for music auto-tagging. IEEE Signal Process. Lett. 24:1208\u20131212","journal-title":"IEEE Signal Process. Lett."},{"key":"186_CR20","unstructured":"Lin Z, Feng M, dos Santos CN, Yu M, Xiang B, Zhou B, Bengio Y (2017) A structured self-attentive sentence embedding. In: International conference on learning representations"},{"key":"186_CR21","unstructured":"Liu JY, Yang YH (2016) Event localization in music auto-tagging. In: Proceedings of the 24th ACM international conference on multimedia, pp 1048\u20131057"},{"key":"186_CR22","doi-asserted-by":"crossref","unstructured":"McFee B, Raffel C, Liang D, Ellis DPW, McVicar M, Battenberg E, Nieto O (2015) librosa: audio and music signal analysis in python. In: Proceedings of the 14th python in science conference, pp 18\u201324","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"186_CR23","unstructured":"McKinney MF, Breebaart J (2003) Features for audio and music classification. In: Proceedings of the 4th conference of the international society for music information retrieval (ISMIR), pp 151\u2013158"},{"key":"186_CR24","volume-title":"An introduction to the psychology of hearing","author":"B Moore","year":"2012","unstructured":"Moore B (2012) An introduction to the psychology of hearing. Brill, Leiden"},{"key":"186_CR25","unstructured":"Nam J, Herrera J, Lee K (2015) A deep bag-of-features model for music auto-tagging. Preprint arXiv:1508.04999"},{"key":"186_CR26","unstructured":"Nam J, Herrera J, Slaney M, Smith J (2012) Learning sparse feature representations for music annotation and retrieval. In: Proceedings of the 13th conference of the international society for music information retrieval (ISMIR), pp 565\u2013571"},{"key":"186_CR27","doi-asserted-by":"crossref","unstructured":"Ness SR, Theocharis A, Martins LG (2009) Improving automatic music tag annotation using stacked generalization of probabilistic SVM outputs. In: Proceedings of the 17th ACM international conference on multimedia, pp 705\u2013708","DOI":"10.1145\/1631272.1631393"},{"key":"186_CR28","doi-asserted-by":"crossref","unstructured":"Schluter J, Osendorfer C (2011) Music similarity estimation with the mean\u2013covariance restricted Boltzmann machine. In: 2011 10th international conference on machine learning and applications, pp 118\u2013123","DOI":"10.1109\/ICMLA.2011.102"},{"issue":"11","key":"186_CR29","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster M, Paliwal KK (1997) Bidirectional recurrent neural networks. IEEE Trans Signal Process 45(11):2673\u20132681","journal-title":"IEEE Trans Signal Process"},{"key":"186_CR30","unstructured":"Sordo M (2012) Semantic annotation of music collections: a computational approach. PhD thesis, Universitat Pompeu Fabra"},{"issue":"1","key":"186_CR31","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton G, Krizhevsky A, Sutskever I, Salakhutdinov R (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15(1):1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"186_CR32","doi-asserted-by":"crossref","unstructured":"Tingle D, Kim YE, Turnbull D (2010) Exploring automatic music annotation with acoustically-objective tags. In: Proceedings of the international conference on multimedia information retrieval (MIR 2010), pp 55\u201362","DOI":"10.1145\/1743384.1743400"},{"issue":"2","key":"186_CR33","doi-asserted-by":"publisher","first-page":"467","DOI":"10.1109\/TASL.2007.913750","volume":"16","author":"D Turnbull","year":"2008","unstructured":"Turnbull D, Barrington L, Torres D, Lanckriet G (2008) Semantic annotation and retrieval of music and sound effects. IEEE Trans Audio Speech Lang Process 16(2):467\u2013476","journal-title":"IEEE Trans Audio Speech Lang Process"},{"issue":"5","key":"186_CR34","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1109\/TSA.2002.800560","volume":"10","author":"G Tzanetakis","year":"2002","unstructured":"Tzanetakis G, Cook P (2002) Musical genre classification of audio signals. IEEE Trans Speech Audio Process 10(5):293\u2013302","journal-title":"IEEE Trans Speech Audio Process"},{"key":"186_CR35","unstructured":"van\u00a0den Oord A, Dieleman S, Schrauwen B (2014) Transfer learning by supervised pre-training for audio-based music classification. In: Proceedings of the 15th conference of the international society for music information retrieval (ISMIR), pp 29\u201334"},{"key":"186_CR36","doi-asserted-by":"crossref","unstructured":"Xiong Y, Su F, Wang Q (2017) Automatic music mood classification by learning cross-media relevance between audio and lyrics. In: 2017 IEEE international conference on multimedia and expo (ICME), pp 961\u2013966","DOI":"10.1109\/ICME.2017.8019341"},{"key":"186_CR37","doi-asserted-by":"crossref","unstructured":"Yang Z, Han Y, Wang Z (2017) Catching the temporal regions-of-interest for video captioning. In: Proceedings of the 25th ACM international conference on multimedia, pp 146\u2013153","DOI":"10.1145\/3123266.3123327"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-019-00186-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s13735-019-00186-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-019-00186-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,12,31]],"date-time":"2020-12-31T00:40:29Z","timestamp":1609375229000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s13735-019-00186-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,1,1]]},"references-count":37,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2020,3]]}},"alternative-id":["186"],"URL":"https:\/\/doi.org\/10.1007\/s13735-019-00186-7","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,1,1]]},"assertion":[{"value":"5 September 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 November 2019","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 December 2019","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 January 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}