{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T03:09:48Z","timestamp":1772593788760,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706598.3714263","type":"proceedings-article","created":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T03:30:09Z","timestamp":1745465409000},"page":"1-19","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["SpeakEasy: Enhancing Text-to-Speech Interactions for Expressive Content Creation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3182-1498","authenticated-orcid":false,"given":"Stephen","family":"Brade","sequence":"first","affiliation":[{"name":"Electrical Engineering &amp; Computer Science Department, Massachusetts Institute of Technology, Cambridge, Massachusetts, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2374-9270","authenticated-orcid":false,"given":"Sam","family":"Anderson","sequence":"additional","affiliation":[{"name":"Adobe Research, New York, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6881-2783","authenticated-orcid":false,"given":"Rithesh","family":"Kumar","sequence":"additional","affiliation":[{"name":"Adobe Research, Toronto, Ontario, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0161-5915","authenticated-orcid":false,"given":"Zeyu","family":"Jin","sequence":"additional","affiliation":[{"name":"Adobe Research, San Francisco, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5409-7287","authenticated-orcid":false,"given":"Anh","family":"Truong","sequence":"additional","affiliation":[{"name":"Adobe Research, New York, New York, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"Adobe. [n. d.]. Clean up audio for free with Adobe Podcast AI. https:\/\/podcast.adobe.com\/enhance. Accessed: 2024-09-10."},{"key":"e_1_3_3_3_3_2","volume-title":"Adobe Audition","author":"Inc. Adobe","year":"2024","unstructured":"Adobe Inc.2024. Adobe Audition. https:\/\/www.adobe.com\/products\/audition.html Version 2024."},{"key":"e_1_3_3_3_4_2","unstructured":"Open AI. [n. d.]. Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/. Accessed: 2024-12-04."},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"crossref","unstructured":"Max Bain Jaesung Huh Tengda Han and Andrew Zisserman. 2023. Whisperx: Time-accurate speech transcription of long-form audio. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.00747 (2023).","DOI":"10.21437\/Interspeech.2023-78"},{"key":"e_1_3_3_3_6_2","volume-title":"TorToiSe text-to-speech","author":"Betker James","year":"2022","unstructured":"James Betker. 2022. TorToiSe text-to-speech. https:\/\/github.com\/neonbjb\/tortoise-tts"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"crossref","unstructured":"Paola Bonifacci Elisa Colombini Michele Marzocchi Valentina Tobia and Lorenzo Desideri. 2022. Text-to-speech applications to reduce mind wandering in students with dyslexia. Journal of Computer Assisted Learning 38 2 (2022) 440\u2013454.","DOI":"10.1111\/jcal.12624"},{"key":"e_1_3_3_3_8_2","unstructured":"Zal\u00e1n Borsos Matt Sharifi Damien Vincent Eugene Kharitonov Neil Zeghidour and Marco Tagliasacchi. 2023. Soundstorm: Efficient parallel audio generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.09636 (2023)."},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606725"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376789"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447526.3472057"},{"key":"e_1_3_3_3_12_2","unstructured":"Sanyuan Chen Shujie Liu Long Zhou Yanqing Liu Xu Tan Jinyu Li Sheng Zhao Yao Qian and Furu Wei. 2024. VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.05370 (2024)."},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545676"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3650818"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517564"},{"key":"e_1_3_3_3_16_2","unstructured":"Thierry Dutoit. 1997. High-quality text-to-speech synthesis: An overview. Journal Of Electrical And Electronics Engineering Australia 17 1 (1997) 25\u201336."},{"key":"e_1_3_3_3_17_2","unstructured":"ElevenLabs. 2024. ElevenLabs Launches New Generative Voice AI Products and Announces $19M Series A Round Led by Nat Friedman Daniel Gross and Andreessen Horowitz. https:\/\/elevenlabs.io\/blog\/elevenlabs-launches-new-generative-voice-ai-products-and-announces-19m-series-a-round-led-by-nat-friedman-daniel-gross-and-andreessen-horowitz"},{"key":"e_1_3_3_3_18_2","unstructured":"ElevenLabs. 2024. ElevenLabs Official Website. https:\/\/elevenlabs.io"},{"key":"e_1_3_3_3_19_2","unstructured":"ElevenLabs. 2024. Speech-to-Speech Technology Documentation. https:\/\/elevenlabs.io\/docs\/speech-synthesis\/speech-to-speech"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173857"},{"key":"e_1_3_3_3_21_2","unstructured":"Zhifang Guo Yichong Leng Yihan Wu Sheng Zhao and Xu Tan. 2022. PromptTTS: Controllable Text-to-Speech with Text Descriptions. arxiv:https:\/\/arXiv.org\/abs\/2211.12171\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2211.12171"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642817"},{"key":"e_1_3_3_3_23_2","unstructured":"Discord Inc.2024. Discord Official Website. https:\/\/discord.com\/"},{"key":"e_1_3_3_3_24_2","unstructured":"Reddit Inc.2024. Reddit Official Website. https:\/\/reddit.com"},{"key":"e_1_3_3_3_25_2","unstructured":"Speechify Inc. 2024. Speechify Official Website. https:\/\/speechify.com"},{"key":"e_1_3_3_3_26_2","unstructured":"User\u00a0Interviews Inc.2024. User Interviews Official Website. https:\/\/www.userinterviews.com\/"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3405755.3406130"},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3405755.3406130"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445074"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642476"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581215"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411763.3451745"},{"key":"e_1_3_3_3_33_2","unstructured":"Yichong Leng Zhifang Guo Kai Shen Xu Tan Zeqian Ju Yanqing Liu Yufei Liu Dongchao Yang Leying Zhang Kaitao Song Lei He Xiang-Yang Li Sheng Zhao Tao Qin and Jiang Bian. 2023. PromptTTS 2: Describing and Generating Voices with Text Prompt. arxiv:https:\/\/arXiv.org\/abs\/2309.02285\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2309.02285"},{"key":"e_1_3_3_3_34_2","unstructured":"Guanghou Liu Yongmao Zhang Yi Lei Yunlin Chen Rui Wang Zhifei Li and Lei Xie. 2023. PromptStyle: Controllable Style Transfer for Text-to-Speech with Natural Language Descriptions. arxiv:https:\/\/arXiv.org\/abs\/2305.19522\u00a0[cs.SD] https:\/\/arxiv.org\/abs\/2305.19522"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"crossref","unstructured":"David Malah. 1979. Time-domain algorithms for harmonic bandwidth reduction and time scaling of speech signals. IEEE Transactions on Acoustics Speech and Signal Processing 27 2 (1979) 121\u2013133.","DOI":"10.1109\/TASSP.1979.1163210"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502093"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/2807442.2807464"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"crossref","unstructured":"James\u00a0A Russell. 1980. A circumplex model of affect. Journal of personality and social psychology 39 6 (1980) 1161.","DOI":"10.1037\/h0077714"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376338"},{"key":"e_1_3_3_3_40_2","unstructured":"Phoebe Sparrow. [n. d.]. What is Stanislavski Technique. https:\/\/www.city-academy.com\/news\/what-is-stanislavski-technique. Accessed: 2024-09-10."},{"key":"e_1_3_3_3_41_2","unstructured":"LLC. TED\u00a0Conferences. [n. d.]. TED Talks: Discover ideas worth spreading. https:\/\/www.ted.com\/talks. Accessed: 2024-09-10."},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","unstructured":"Andreas Triantafyllopoulos Bj\u00f6rn\u00a0W. Schuller G\u00f6k\u00e7e \u0130ymen Metin Sezgin Xiangheng He Zijiang Yang Panagiotis Tzirakis Shuo Liu Silvan Mertes Elisabeth Andr\u00e9 Ruibo Fu and Jianhua Tao. 2023. An Overview of Affective Speech Synthesis and Conversion in the Deep Learning Era. Proc. IEEE 111 10 (2023) 1355\u20131381. 10.1109\/JPROC.2023.3250266","DOI":"10.1109\/JPROC.2023.3250266"},{"key":"e_1_3_3_3_43_2","unstructured":"Aaron Van Den\u00a0Oord Sander Dieleman Heiga Zen Karen Simonyan Oriol Vinyals Alex Graves Nal Kalchbrenner Andrew Senior Koray Kavukcuoglu et\u00a0al. 2016. Wavenet: A generative model for raw audio. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.03499 12 (2016)."},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126661"},{"key":"e_1_3_3_3_45_2","unstructured":"Chengyi Wang Sanyuan Chen Yu Wu Ziqiang Zhang Long Zhou Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et\u00a0al. 2023. Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.02111 (2023)."},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376726"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"e_1_3_3_3_48_2","unstructured":"Cliff Weitzman. 2024. Speechify Founder Discusses Future of AI with the Everyday AI Podcast. https:\/\/speechify.com\/blog\/speechify-founder-discusses-future-of-ai"}],"event":{"name":"CHI 2025: CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3714263","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3714263","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T04:55:01Z","timestamp":1751604901000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3714263"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":47,"alternative-id":["10.1145\/3706598.3714263","10.1145\/3706598"],"URL":"https:\/\/doi.org\/10.1145\/3706598.3714263","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}