{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:23:02Z","timestamp":1760314982143,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032079589","type":"print"},{"value":"9783032079596","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07959-6_12","type":"book-chapter","created":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:09Z","timestamp":1760260929000},"page":"161-173","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["PinkVocalTransformer: Neural Acoustic-to-Articulatory Inversion Based on\u00a0the\u00a0Pink Trombone"],"prefix":"10.1007","author":[{"given":"Zhiyuan","family":"Xu","sequence":"first","affiliation":[]},{"given":"Joshua","family":"Reiss","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,13]]},"reference":[{"key":"12_CR1","unstructured":"Richmond, K.: Estimating articulatory parameters from the acoustic speech signal. Annexe Thesis Digitisation Project 2017 Block 11 (2002)"},{"key":"12_CR2","doi-asserted-by":"publisher","unstructured":"Tokuda, K., et al.: Speech synthesis based on hidden Markov models. Proc. IEEE 101(5), 1234\u20131252 (2013). https:\/\/doi.org\/10.1109\/JPROC.2013.2251852","DOI":"10.1109\/JPROC.2013.2251852"},{"key":"12_CR3","unstructured":"Van den Oord, A., et al.: WaveNet: a generative model for raw audio. In: Proceedings of the 9th ISCA Workshop on Speech Synthesis Workshop (SSW 9), p. 125 (2016)"},{"key":"12_CR4","doi-asserted-by":"publisher","unstructured":"Hunt, A.J., Black, A.W.: Unit selection in a concatenative speech synthesis system using a large speech database. In: Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Atlanta, GA, USA, vol. 1, pp. 373\u2013376 (1996). https:\/\/doi.org\/10.1109\/ICASSP.1996.541110.","DOI":"10.1109\/ICASSP.1996.541110."},{"key":"12_CR5","doi-asserted-by":"publisher","unstructured":"Dutoit, T., et al.: The MBROLA project: towards a set of high quality speech synthesizers free of use for non-commercial purposes. In: Proceedings of the International Conference on Spoken Language Processing (ICSLP), Philadelphia, PA, USA, vol. 3, pp. 1393\u20131396 (1996). https:\/\/doi.org\/10.1109\/ICSLP.1996.607874.","DOI":"10.1109\/ICSLP.1996.607874."},{"key":"12_CR6","unstructured":"Fant, G., Liljencrants, J., Lin, Q.: A four-parameter model of glottal flow. STL-QPSR, vol. 4, no. 1985, pp. 1\u201313 (1985)"},{"key":"12_CR7","unstructured":"Fant, G.: Acoustic Theory of Speech Production. The Hague. Mouton, The Netherlands (1960)"},{"key":"12_CR8","unstructured":"Thapen, N.: Pink Trombone. https:\/\/dood.al\/pinktrombone\/. Accessed 04 July 2025"},{"issue":"4","key":"12_CR9","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0060603","volume":"8","author":"P Birkholz","year":"2013","unstructured":"Birkholz, P.: Modeling consonant-vowel coarticulation for articulatory speech synthesis. PLoS ONE 8(4), e60603 (2013). https:\/\/doi.org\/10.1371\/journal.pone.0060603","journal-title":"PLoS ONE"},{"key":"12_CR10","unstructured":"C\u00e1mara, M., et al.: Optimization techniques for a physical model of human vocalisation. In: 26th International Conference on Digital Audio Effects (DAFx), Copenhagen, Denmark, 4\u20137 September 2023"},{"key":"12_CR11","unstructured":"C\u00e1mara, M., et al.: Decoding vocal articulations from acoustic latent representations. In: Proceedings of the AES Europe Convention, Madrid, Spain (2024)"},{"key":"12_CR12","doi-asserted-by":"publisher","first-page":"1754","DOI":"10.1109\/TSA.2005.858550","volume":"14","author":"S Mathur","year":"2006","unstructured":"Mathur, S., Story, B., Rodriguez, J.: Vocal-tract modeling: Fractional elongation of segment lengths in a waveguide model with half-sample delays. IEEE Trans. Audio Speech Lang. Process. 14, 1754\u20131762 (2006)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"12_CR13","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"W-N Hsu","year":"2021","unstructured":"Hsu, W.-N., et al.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021). https:\/\/doi.org\/10.1109\/TASLP.2021.3122291","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"2","key":"12_CR14","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1121\/1.1906583","volume":"22","author":"H Dudley","year":"1950","unstructured":"Dudley, H., Tarnoczy, T.H.: The speaking machine of Wolfgang von Kempelen. J. Acoust. Soc. Am. 22(2), 151\u2013166 (1950). https:\/\/doi.org\/10.1121\/1.1906583","journal-title":"J. Acoust. Soc. Am."},{"key":"12_CR15","unstructured":"Kelly, K.L., Lochbaum, C.C.: Speech synthesis. In: Proceedings of the Fourth ICA (1962)"},{"issue":"5","key":"12_CR16","doi-asserted-by":"publisher","first-page":"3231","DOI":"10.1121\/1.1869752","volume":"117","author":"BH Story","year":"2005","unstructured":"Story, B.H.: A parametric model of the vocal tract area function for vowel and consonant simulation. J. Acoust. Soc. Am. 117(5), 3231\u20133254 (2005)","journal-title":"J. Acoust. Soc. Am."},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Chenoukh, S., et al.: Voice mimic system using an articulatory codebook for estimation of vocal tract shape. In: Proceedings of the EuroSpeech-97, Rhodes, pp. 429\u2013432 (1997)","DOI":"10.21437\/Eurospeech.1997-179"},{"issue":"1","key":"12_CR18","doi-asserted-by":"publisher","first-page":"444","DOI":"10.1121\/1.1921448","volume":"118","author":"S Ouni","year":"2005","unstructured":"Ouni, S., Laprie, Y.: Modeling the articulatory space using a hypercube codebook for acoustic-to-articulatory inversion. J. Acoust. Soc. Am. 118(1), 444\u2013460 (2005). https:\/\/doi.org\/10.1121\/1.1921448","journal-title":"J. Acoust. Soc. Am."},{"issue":"1","key":"12_CR19","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1016\/S0167-6393(99)00031-X","volume":"30","author":"VN Sorokin","year":"2000","unstructured":"Sorokin, V.N., Leonov, A.S., Trushkin, A.V.: Estimation of stability and accuracy of inverse problem solution for the vocal tract. Speech Commun. 30(1), 55\u201374 (2000)","journal-title":"Speech Commun."},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Saha, P., et al.: Learning joint articulatory-acoustic representations with normalizing flows. In: Proceedings of the Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-2004"},{"key":"12_CR21","doi-asserted-by":"publisher","unstructured":"Pasad, A., Shi, B., Livescu, K.: Comparative layer-wise analysis of self-supervised speech models. In: IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2023, Rhodes Island, Greece, pp. 1\u20135 (2023). https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10096149","DOI":"10.1109\/ICASSP49357.2023.10096149"},{"key":"12_CR22","unstructured":"Lu, H.-L., Smith, J.O.: Glottal source modeling for singing voice synthesis. In: Proceedings of the ICMC (2000)"},{"key":"12_CR23","unstructured":"Fant, G.: The LF-model revisited: transformations and frequency domain analysis. STL-QPSR, vol. 2, no. 3 (1995)"},{"key":"12_CR24","unstructured":"S\u00fcdholt, D., et al.: Vocal tract estimation by gradient descent. In: 26th International Conference on Digital Audio Effects (DAFx), Copenhagen, Denmark, 4\u20137 September 2023"},{"key":"12_CR25","doi-asserted-by":"publisher","unstructured":"Kim, J.W., et al.: Crepe: a convolutional representation for pitch estimation. In: Proceedings of the ICASSP, pp. 161\u2013165 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8461329","DOI":"10.1109\/ICASSP.2018.8461329"},{"key":"12_CR26","unstructured":"Iman, R.L., Davenport, J.M., Zeigler, D.K.: Latin hypercube sampling (program user\u2019s guide) (1980)"},{"key":"12_CR27","unstructured":"Nath, S., Khadilkar, H., Bhattacharyya, P.: Transformers are expressive, but are they expressive enough for regression? arXiv preprint arXiv:2402.15478 (2024)"},{"key":"12_CR28","doi-asserted-by":"publisher","unstructured":"Freedman, D., Diaconis, P.: On the histogram as a density estimator: L2 theory. Zeitschrift f\u00fcr Wahrscheinlichkeitstheorie und Verwandte Gebiete 57(4), 453\u2013476 (1981). https:\/\/doi.org\/10.1007\/BF01025868","DOI":"10.1007\/BF01025868"},{"issue":"10","key":"12_CR29","doi-asserted-by":"publisher","first-page":"12113","DOI":"10.1109\/TPAMI.2023.3275158","volume":"45","author":"P Xu","year":"2023","unstructured":"Xu, P., Zhu, X., Clifton, D.A.: Multimodal learning with transformers: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 45(10), 12113\u201312132 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2023.3275158","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"12_CR30","unstructured":"Baevski, A., et al.: Wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Proceedings of the 34th International Conference on Neural Information Processing Systems (NeurIPS), Red Hook, NY, USA, Article no. 1044, pp. 12449\u201312460. Curran Associates Inc. (2020)"},{"key":"12_CR31","doi-asserted-by":"publisher","unstructured":"Chang, H.-J., Yang, S., Lee, H.-Y.: DistilHuBERT: speech representation learning by layer-wise distillation of hidden-unit BERT. In: Proceedings of the ICASSP, pp. 7087\u20137091 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747490","DOI":"10.1109\/ICASSP43922.2022.9747490"},{"key":"12_CR32","doi-asserted-by":"publisher","unstructured":"Kumar, P., Sukhadia, V.N., Umesh, S.: Investigation of robustness of HuBERT features from different layers to domain, accent and language variations. In: Proceedings of the ICASSP, pp. 6887\u20136891 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746250","DOI":"10.1109\/ICASSP43922.2022.9746250"},{"key":"12_CR33","doi-asserted-by":"publisher","unstructured":"Yoon, J.W., Woo, B.J., Kim, N.S.: HuBERT-EE: early exiting HuBERT for efficient speech recognition. In: Proceedings of the Interspeech, pp. 2400\u20132404 (2024). https:\/\/doi.org\/10.21437\/Interspeech.2024-80","DOI":"10.21437\/Interspeech.2024-80"},{"key":"12_CR34","doi-asserted-by":"crossref","unstructured":"Chinen, M., et al.: ViSQOL v3: an open source production ready objective speech and audio metric. In: 2020 QoMEX, pp. 1\u20136. IEEE (2020)","DOI":"10.1109\/QoMEX48832.2020.9123150"},{"key":"12_CR35","unstructured":"Xu, Z.: PinkVocalTransformer Project Page. https:\/\/zhiyuanxu27.github.io\/pinkVocalTransformer\/. Accessed 04 July 2025"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07959-6_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:16Z","timestamp":1760260936000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07959-6_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,13]]},"ISBN":["9783032079589","9783032079596"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07959-6_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,13]]},"assertion":[{"value":"13 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Szeged","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hungary","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom.inf.u-szeged.hu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}