{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T07:11:38Z","timestamp":1764400298730,"version":"3.46.0"},"reference-count":20,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,22]]},"DOI":"10.1109\/apsipaasc65261.2025.11249327","type":"proceedings-article","created":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T18:40:26Z","timestamp":1764355226000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["Dialospeech: Dual-Speaker Dialogue Generation with LLM and Flow Matching"],"prefix":"10.1109","author":[{"given":"Hanke","family":"Xie","sequence":"first","affiliation":[{"name":"ASLP, Northwestern Polytechnical University,Xi&#x0027;an,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dake","family":"Guo","sequence":"additional","affiliation":[{"name":"ASLP, Northwestern Polytechnical University,Xi&#x0027;an,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengyou","family":"Wang","sequence":"additional","affiliation":[{"name":"ASLP, Northwestern Polytechnical University,Xi&#x0027;an,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yue","family":"Li","sequence":"additional","affiliation":[{"name":"ASLP, Northwestern Polytechnical University,Xi&#x0027;an,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenjie","family":"Tian","sequence":"additional","affiliation":[{"name":"ASLP, Northwestern Polytechnical University,Xi&#x0027;an,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinfa","family":"Zhu","sequence":"additional","affiliation":[{"name":"ASLP, Northwestern Polytechnical University,Xi&#x0027;an,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinsheng","family":"Wang","sequence":"additional","affiliation":[{"name":"ASLP, Northwestern Polytechnical University,Xi&#x0027;an,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiulin","family":"Li","sequence":"additional","affiliation":[{"name":"DataBaker (Qingdao) Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guanqiong","family":"Miao","sequence":"additional","affiliation":[{"name":"DataBaker (Qingdao) Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Liu","sequence":"additional","affiliation":[{"name":"DataBaker (Qingdao) Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lei","family":"Xie","sequence":"additional","affiliation":[{"name":"ASLP, Northwestern Polytechnical University,Xi&#x0027;an,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Neural codec language models are zero-shot text to speech synthesizers","volume":"abs\/2301.02111","author":"Wang","year":"2023","journal-title":"CoRR"},{"key":"ref2","article-title":"High fidelity neural audio compression","volume":"2023","author":"D\u00e9fossez","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"journal-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","year":"2020","author":"Baevski","key":"ref4"},{"key":"ref5","article-title":"Parrottts: Text-to-speech synthesis by exploiting self-supervised representations","volume":"abs\/2303.01261","author":"Kosgi","year":"2023","journal-title":"CoRR"},{"volume-title":"Hierspeech: Bridging the gap between text and speech by hierarchical variational inference using self-supervised representations for speech synthesis","year":"2022","author":"Lee","key":"ref6"},{"key":"ref7","article-title":"Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens","volume":"abs\/2407.05407","author":"Du","year":"2024","journal-title":"CoRR"},{"key":"ref8","article-title":"Matcha-tts: A fast TTS architecture with conditional flow matching","volume":"abs\/2309.03199","author":"Mehta","year":"2023","journal-title":"CoRR"},{"volume-title":"Covomix: Advancing zero-shot speech generation for human-like multitalker conversations","year":"2024","author":"Zhang","key":"ref9"},{"key":"ref10","article-title":"Mooncast: Highquality zero-shot podcast generation","volume":"abs\/2503.14345","author":"Ju","year":"2025","journal-title":"CoRR"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461404"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-9996"},{"journal-title":"Reverb: Open-source asr and diarization from rev","year":"2024","author":"Bhandari","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746108"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3357036"},{"key":"ref16","article-title":"Cosyvoice 2: Scalable streaming speech synthesis with large language models","volume":"abs\/2412.10117","author":"Du","year":"2024","journal-title":"CoRR"},{"volume-title":"Bigvgan: A universal neural vocoder with largescale training","year":"2023","author":"Lee","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-2650"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00545"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.313"}],"event":{"name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","start":{"date-parts":[[2025,10,22]]},"location":"Singapore, Singapore","end":{"date-parts":[[2025,10,24]]}},"container-title":["2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11248853\/11248968\/11249327.pdf?arnumber=11249327","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T07:09:06Z","timestamp":1764400146000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11249327\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,22]]},"references-count":20,"URL":"https:\/\/doi.org\/10.1109\/apsipaasc65261.2025.11249327","relation":{},"subject":[],"published":{"date-parts":[[2025,10,22]]}}}