{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T05:21:35Z","timestamp":1755926495456,"version":"3.28.0"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,7]]},"DOI":"10.1109\/ijcnn48605.2020.9207052","type":"proceedings-article","created":{"date-parts":[[2020,9,30]],"date-time":"2020-09-30T00:40:33Z","timestamp":1601426433000},"page":"1-8","source":"Crossref","is-referenced-by-count":21,"title":["Automatic Lyrics Transcription using Dilated Convolutional Neural Networks with Self-Attention"],"prefix":"10.1109","author":[{"given":"Emir","family":"Demirel","sequence":"first","affiliation":[]},{"given":"Sven","family":"Ahlback","sequence":"additional","affiliation":[]},{"given":"Simon","family":"Dixon","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","article-title":"Transcribing lyrics from com-mercial song audio: The first step towards singing content processing","author":"tsai","year":"2018","journal-title":"Proc 43th Int Conf Acoust Speech Signal Process"},{"key":"ref11","article-title":"Semi-supervised lyrics and solo-singing alignment","author":"gupta","year":"2018","journal-title":"International Society for Music Information Retrieval Con-ference"},{"key":"ref12","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"2011 IEEE Workshop on Automatic Speech Recognition &amp; Understanding"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1006\/csla.2001.0184"},{"year":"2015","author":"peddinti","journal-title":"A time delay neural network architecture for efficient modeling of long temporal","key":"ref14"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1109\/ICASSP.2018.8462497"},{"key":"ref16","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref17","article-title":"BERT: Pre-training of deep bidirectional Transformers for language understanding","author":"devlin","year":"2019","journal-title":"North American Chapter Of The Association For Computational Linguistics"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.21437\/Interspeech.2018-1910"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1109\/ICASSP.2016.7472621"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1109\/ICASSP.2014.6853589"},{"key":"ref4","article-title":"Adaptation of a speech recognizer for singing voice","author":"mesaros","year":"2009","journal-title":"European Signal Processing Conference"},{"doi-asserted-by":"publisher","key":"ref27","DOI":"10.21437\/Interspeech.2017-129"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/ICASSP40776.2020.9054567"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.21437\/Interspeech.2019-2378"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1109\/ICASSP.2018.8461974"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.21437\/Interspeech.2018-1267"},{"year":"0","article-title":"Digital Archive Mobile Performances (DAMP)","key":"ref8"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/SpeD.2013.6682644"},{"key":"ref2","article-title":"Bootstrapping a system for phoneme recognition and keyword spotting in unaccompanied singing","author":"kruspe","year":"2016","journal-title":"International Society for Music Information Retrieval Conference"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.21437\/Interspeech.2016-1272"},{"year":"0","journal-title":"Smule Sing! 300x30x2 Dataset","key":"ref1"},{"key":"ref20","article-title":"SRILM &#x2014; An extensible language modeling toolkit","author":"stolcke","year":"2002","journal-title":"International Conference on Spoken Language Processing"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1006\/csla.1998.0043"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.21437\/Interspeech.2018-1413"},{"key":"ref24","article-title":"Audio augmentation for speech recognition","author":"ko","year":"2015","journal-title":"InterSpeech"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.21437\/Interspeech.2016-595"},{"key":"ref26","article-title":"Sequence-discriminative training of deep neural networks","volume":"2013","author":"vesel\u00fd","year":"2013","journal-title":"InterSpeech"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.1109\/ASRU.2013.6707705"}],"event":{"name":"2020 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2020,7,19]]},"location":"Glasgow, United Kingdom","end":{"date-parts":[[2020,7,24]]}},"container-title":["2020 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9200848\/9206590\/09207052.pdf?arnumber=9207052","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,28]],"date-time":"2022-06-28T21:58:34Z","timestamp":1656453514000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9207052\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,7]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/ijcnn48605.2020.9207052","relation":{},"subject":[],"published":{"date-parts":[[2020,7]]}}}