{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,28]],"date-time":"2025-05-28T05:14:37Z","timestamp":1748409277940,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":28,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819620609"},{"type":"electronic","value":"9789819620616"}],"license":[{"start":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:00:00Z","timestamp":1735603200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:00:00Z","timestamp":1735603200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2061-6_9","type":"book-chapter","created":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T05:46:25Z","timestamp":1735537585000},"page":"115-128","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Improving Singing Voice Transcription Generalization with\u00a0AI Generated Accompaniments"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5117-791X","authenticated-orcid":false,"given":"Miguel","family":"Perez","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4655-729X","authenticated-orcid":false,"given":"Holger","family":"Kirchhoff","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5525-5233","authenticated-orcid":false,"given":"Peter","family":"Grosche","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1395-2345","authenticated-orcid":false,"given":"Xavier","family":"Serra","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,31]]},"reference":[{"key":"9_CR1","doi-asserted-by":"crossref","unstructured":"Bittner, R.M., Bosch, J.J., Rubinstein, D., Meseguer-Brocal, G., Ewert, S.: A lightweight instrument-agnostic model for polyphonic note transcription and multipitch estimation. In: Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP). Singapore (2022)","DOI":"10.1109\/ICASSP43922.2022.9746549"},{"key":"9_CR2","unstructured":"Bittner, R.M., McFee, B., Salamon, J., Li, P., Bello, J.P.: Deep Salience representations for F0 estimation in polyphonic music. In: Proceedings of the 18th International Society for Music Information Retrieval Conference (2017)"},{"key":"9_CR3","doi-asserted-by":"publisher","unstructured":"Brown, J.C.: Calculation of a constant Q spectral transform. J. Acoust. Soc. Am. 89(1) (1991). https:\/\/doi.org\/10.1121\/1.400476","DOI":"10.1121\/1.400476"},{"key":"9_CR4","unstructured":"Copet, J., et al.: Simple and controllable music generation. In: 37th Conference on Neural Information Processing Systems (2023)"},{"key":"9_CR5","unstructured":"Doh, S., Choi, K., Lee, J., Nam, J.: LP-MusicCaps: LLM-based pseudo music captioning. In: Proceedings of the 24th International Society for Music Information Retrieval Conference, ISMIR 2023 (2023)"},{"key":"9_CR6","doi-asserted-by":"publisher","unstructured":"Donahue, C., et al.: SingSong: generating musical accompaniments from singing (2023). https:\/\/doi.org\/10.48550\/ARXIV.2301.12662","DOI":"10.48550\/ARXIV.2301.12662"},{"key":"9_CR7","doi-asserted-by":"publisher","first-page":"1118","DOI":"10.1109\/TASLP.2020.2982285","volume":"28","author":"B Gfeller","year":"2020","unstructured":"Gfeller, B., Frank, C., Roblek, D., Sharifi, M., Tagliasacchi, M., Velimirovi\u0107, M.: Spice: self-supervised pitch estimation. IEEE\/ACM Trans. Audio, Speech Lang. Process. 28, 1118\u20131128 (2020). https:\/\/doi.org\/10.1109\/TASLP.2020.2982285","journal-title":"IEEE\/ACM Trans. Audio, Speech Lang. Process."},{"key":"9_CR8","doi-asserted-by":"publisher","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd International Conference on Machine Learning, pp. 369\u2013376. ICML \u201906, Association for Computing Machinery, New York, NY, USA (2006). https:\/\/doi.org\/10.1145\/1143844.1143891","DOI":"10.1145\/1143844.1143891"},{"key":"9_CR9","doi-asserted-by":"publisher","unstructured":"Gu, X., Ou, L., Zeng, W., Zhang, J., Wong, N., Wang, Y.: Automatic lyric transcription and automatic music transcription from multimodal singing. ACM Trans. Multimedia Comput. Commun. Appl. 20(7) (2024). https:\/\/doi.org\/10.1145\/3651310","DOI":"10.1145\/3651310"},{"key":"9_CR10","doi-asserted-by":"publisher","unstructured":"Hennequin, R., Khlif, A., Voituret, F., Moussallam, M.: Spleeter: a fast and efficient music source separation tool with pre-trained models. J. Open Source Softw. 5(50), 2154 (2020). https:\/\/doi.org\/10.21105\/joss.02154","DOI":"10.21105\/joss.02154"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8) (1997)","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"9_CR12","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"9_CR13","doi-asserted-by":"publisher","unstructured":"Kum, S., Lee, J., Kim, K.L., Kim, T., Nam, J.: Pseudo-label transfer from frame-level to note-level in a teacher-student framework for singing transcription from polyphonic music. In: ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 796\u2013800 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747147","DOI":"10.1109\/ICASSP43922.2022.9747147"},{"key":"9_CR14","doi-asserted-by":"publisher","unstructured":"Lu, W.T., Wang, J.C., Hung, Y.N.: Multitrack music transcription with a time-frequency perceiver. In: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2023). https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10096688","DOI":"10.1109\/ICASSP49357.2023.10096688"},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Mei, X., et al.: WavCaps: A ChatGPT-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395 (2023)","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"9_CR16","doi-asserted-by":"publisher","DOI":"10.5334\/tismir.26","author":"R Mignot","year":"2019","unstructured":"Mignot, R., Peeters, G.: An analysis of the effect of data augmentation methods: experiments for a musical genre classification task. Trans. Int. Soc. Music Inf. Retrieval (2019). https:\/\/doi.org\/10.5334\/tismir.26","journal-title":"Trans. Int. Soc. Music Inf. Retrieval"},{"key":"9_CR17","unstructured":"Molina, E., Barbancho, A.M., Tard\u00f3n, L.J., Barbancho, I.: Evaluation framework for automatic singing transcription. In: Proceedings of the 15th International Society for Music Information Retrieval Conference, ISMIR 2014, Taipei, Taiwan, October 27\u201331, 2014, pp. 567\u2013572 (2014)"},{"key":"9_CR18","unstructured":"Perez, M., Kirchhoff, H., Serra, X.: TriAD: capturing harmonics with 3D convolutions. In: Proceedings of the 24th International Society for Music Information Retrieval Conference. Milan, Italy (2023)"},{"key":"9_CR19","doi-asserted-by":"publisher","unstructured":"Qiu, Y., Zhang, J., Shan, Y., Zhou, J.: Enhancing note-level singing transcription model with unlabeled and weakly labeled data. In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 341\u2013345 (2024). https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10445858","DOI":"10.1109\/ICASSP48485.2024.10445858"},{"key":"9_CR20","unstructured":"Riou, A., Lattner, S., Hadjeres, G., Peeters, G.: PESTO: pitch estimation with self-supervised transposition-equivariant objective. In: Proceedings of the 24th International Society for Music Information Retrieval Conference, ISMIR 2023 (2023)"},{"key":"9_CR21","unstructured":"Salamon, J.: Melody Extraction from Polyphonic Signals. Ph.D. thesis, Universitat Pompeu Fabra (2013)"},{"key":"9_CR22","unstructured":"Tal, O., Ziv, A., Gat, I., Kreuk, F., Adi, Y.: Joint audio and symbolic conditioning for temporally controlled text-to-music generation (2024)"},{"key":"9_CR23","doi-asserted-by":"publisher","unstructured":"Wang, J.Y., Jang, J.S.R.: On the preparation and validation of a large-scale dataset of singing transcription. In: 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 276\u2013280 (2021). https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9414601","DOI":"10.1109\/ICASSP39728.2021.9414601"},{"key":"9_CR24","doi-asserted-by":"publisher","first-page":"383","DOI":"10.1109\/TASLP.2022.3224297","volume":"31","author":"JY Wang","year":"2023","unstructured":"Wang, J.Y., Jang, J.S.R.: Training a singing transcription model using connectionist temporal classification loss and cross-entropy loss. IEEE\/ACM Trans. Audio, Speech Lang. Process. 31, 383\u2013396 (2023). https:\/\/doi.org\/10.1109\/TASLP.2022.3224297","journal-title":"IEEE\/ACM Trans. Audio, Speech Lang. Process."},{"key":"9_CR25","doi-asserted-by":"publisher","unstructured":"Wang, X., Liu, L., Shi, Q.: Harmonic structure-based neural network model for music pitch detection. In: 2020 19th IEEE International Conference on Machine Learning and Applications (ICMLA) (2020). https:\/\/doi.org\/10.1109\/ICMLA51294.2020.00023","DOI":"10.1109\/ICMLA51294.2020.00023"},{"key":"9_CR26","doi-asserted-by":"publisher","unstructured":"Wei, W., Li, P., Yu, Y., Li, W.: HarmoF0: logarithmic scale dilated convolution for pitch estimation. In: 2022 IEEE International Conference on Multimedia and Expo (ICME) (2022). https:\/\/doi.org\/10.1109\/ICME52920.2022.9858935","DOI":"10.1109\/ICME52920.2022.9858935"},{"key":"9_CR27","unstructured":"Wei, W., Li, P., Yu, Y., Li, W.: HPPNet: modeling the harmonic structure and pitch invariance in piano transcription. In: Proceedings of the 23th International Society for Music Information Retrieval Conference (2022)"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Yamamoto, R., Yoneyama, R., Toda, T.: NNSVS: a neural network-based singing voice synthesis toolkit. arXiv preprint arXiv:2210.15987 (2022)","DOI":"10.1109\/ICASSP49357.2023.10096239"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2061-6_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T06:03:53Z","timestamp":1735538633000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2061-6_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,31]]},"ISBN":["9789819620609","9789819620616"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2061-6_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,31]]},"assertion":[{"value":"31 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}