{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T20:16:56Z","timestamp":1770149816677,"version":"3.49.0"},"reference-count":36,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,16]]},"DOI":"10.1109\/asru57964.2023.10389629","type":"proceedings-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:38:40Z","timestamp":1705689520000},"page":"1-7","source":"Crossref","is-referenced-by-count":4,"title":["HIGNN-TTS: Hierarchical Prosody Modeling With Graph Neural Networks for Expressive Long-Form TTS"],"prefix":"10.1109","author":[{"given":"Dake","family":"Guo","sequence":"first","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"}]},{"given":"Xinfa","family":"Zhu","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"}]},{"given":"Liumeng","family":"Xue","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen),School of Data Science,China"}]},{"given":"Tao","family":"Li","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"}]},{"given":"Yuanjun","family":"Lv","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"}]},{"given":"Yuepeng","family":"Jiang","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"}]},{"given":"Lei","family":"Xie","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref2","first-page":"3165","article-title":"Fastspeech: Fast, robust and controllable text to speech","volume-title":"Proc. NeurIPS","author":"Ren"},{"key":"ref3","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. ICML","author":"Kim"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362069"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-610"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3164181"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095776"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-18"},{"key":"ref9","first-page":"4700","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume-title":"Proce. ICML","author":"Skerry-Ryan"},{"key":"ref10","first-page":"5167","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. ICML","author":"Wang"},{"key":"ref11","first-page":"595","article-title":"Predicting expressive speaking style from text in endto-end speech synthesis","volume-title":"Proc. SLT","author":"Stanton"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2571"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1129"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683501"},{"key":"ref15","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. NAACL-HLT","author":"Devlin"},{"key":"ref16","first-page":"5754","article-title":"Xlnet: Generalized autoregressive pretraining for language understanding","volume-title":"Proc. NeurIPS","author":"Yang"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3177"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1418"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10061"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/620"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414102"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3278184"},{"key":"ref23","volume":"abs\/2012.03763","author":"Gallegos","year":"2020","journal-title":"Using previous acoustic context to improve text-to-speech synthesis"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095866"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096247"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3202126"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747438"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11171"},{"key":"ref29","article-title":"Locating boundaries for prosodic constituents in unrestricted mandarin texts","volume":"6","author":"Chu","year":"2001","journal-title":"Int. J. Comput. Linguistics Chin. Lang. Process"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/214"},{"key":"ref31","first-page":"59:1","article-title":"Domain-adversarial training of neural networks","volume":"17","author":"Ganin","year":"2016","journal-title":"J. Mach. Learn. Res"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref33","volume-title":"A practical chinese dependency parser based on a largescale dataset","author":"Zhang","year":"2020"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095105"},{"key":"ref35","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. ICLR","author":"Ren"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18637\/jss.v031.i07"}],"event":{"name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Taipei, Taiwan","start":{"date-parts":[[2023,12,16]]},"end":{"date-parts":[[2023,12,20]]}},"container-title":["2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10388490\/10389614\/10389629.pdf?arnumber=10389629","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,23]],"date-time":"2024-01-23T16:31:48Z","timestamp":1706027508000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10389629\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,16]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/asru57964.2023.10389629","relation":{},"subject":[],"published":{"date-parts":[[2023,12,16]]}}}