{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T12:43:50Z","timestamp":1776084230378,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T00:00:00Z","timestamp":1710720000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,3,18]]},"DOI":"10.1145\/3640543.3645165","type":"proceedings-article","created":{"date-parts":[[2024,4,5]],"date-time":"2024-04-05T18:23:12Z","timestamp":1712341392000},"page":"259-273","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Creating an African American-Sounding TTS: Guidelines, Technical Challenges, and Surprising Evaluations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6715-1290","authenticated-orcid":false,"given":"Claudio Santos","family":"Pinhanez","sequence":"first","affiliation":[{"name":"IBM Research, Brazil"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7650-193X","authenticated-orcid":false,"given":"Raul","family":"Fernandez","sequence":"additional","affiliation":[{"name":"IBM Research, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1738-332X","authenticated-orcid":false,"given":"Marcelo Carpinette","family":"Grave","sequence":"additional","affiliation":[{"name":"IBM Research, Brazil"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9764-6874","authenticated-orcid":false,"given":"Julio","family":"Nogima","sequence":"additional","affiliation":[{"name":"IBM Research, Brazil"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1327-5160","authenticated-orcid":false,"given":"Ron","family":"Hoory","sequence":"additional","affiliation":[{"name":"IBM Research, Israel"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,4,5]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"The American robot: A cultural history","author":"Abnet A","unstructured":"Dustin\u00a0A Abnet. 2020. The American robot: A cultural history. University of Chicago Press, Chicago, USA."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPA.2018.8659465"},{"key":"e_1_3_2_2_3_1","volume-title":"11th International Conference on Voice Physiology and Biomechanics, Vol.\u00a012","author":"Arjmandi M","year":"2018","unstructured":"M Arjmandi, Laura\u00a0C Dilley, and Suzanne\u00a0E Wagner. 2018. Investigation of acoustic dimension use in dialect production: machine learning of sonorant sounds for modeling acoustic cues of African American dialect. In 11th International Conference on Voice Physiology and Biomechanics, Vol.\u00a012. ICVPB, East Lansing, MI, USA, 13."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.4989084"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3171221.3171260"},{"key":"e_1_3_2_2_6_1","volume-title":"Proc. of CSCW\u201919","author":"Cambre Julia","year":"2019","unstructured":"Julia Cambre and Chinmay Kulkarni. 2019. One voice fits all? Social implications and research challenges of designing voices for smart devices. In Proc. of CSCW\u201919. ACM, Austin, TX, USA, 1\u201319."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13347-020-00415-6"},{"key":"e_1_3_2_2_8_1","volume-title":"Proc. of the 18th International Congress of Phonetic Sciences. International Phonetic Association","author":"Chasaide Ailbhe\u00a0N\u00ed","year":"2015","unstructured":"Ailbhe\u00a0N\u00ed Chasaide, Neasa\u00a0N\u00ed Chiar\u00e1in, Harald Berthelsen, Christoph Wendler, and Andrew Murphy. 2015. Speech technology as documentation for endangered language preservation: The case of Irish. In Proc. of the 18th International Congress of Phonetic Sciences. International Phonetic Association, Glascow, UK, 1\u20135."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3148148"},{"key":"e_1_3_2_2_10_1","volume-title":"Critical Race Theory: An Introduction","author":"Delgado Richard","unstructured":"Richard Delgado and Jean Stefancic. 2011. Critical Race Theory: An Introduction. NYU Press, New York, NY, USA."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1002\/wcs.1550"},{"key":"e_1_3_2_2_12_1","volume-title":"The White Racial Frame: Centuries of Racial Framing and Counter-framing","author":"Feagin R","unstructured":"Joe\u00a0R Feagin. 2020. The White Racial Frame: Centuries of Racial Framing and Counter-framing. Routledge, New York, NY, USA."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"Anjalie Field Su\u00a0Lin Blodgett Zeerak Waseem and Yulia Tsvetkov. 2021. A survey of race racism and anti-racism in NLP. arXiv:2106.11410","DOI":"10.18653\/v1\/2021.acl-long.149"},{"key":"e_1_3_2_2_14_1","volume-title":"African American English \u2013 A Linguistic Introduction","author":"Green Lisa","unstructured":"Lisa Green. 2002. African American English \u2013 A Linguistic Introduction. Cambridge University Press, Cambridge."},{"key":"e_1_3_2_2_15_1","volume-title":"Conversational End-to-End TTS for Voice Agents. In IEEE Spoken Language Technology Workshop (SLT 21)","author":"Zhang Shaofei","year":"2021","unstructured":"Haohan. Guo, Shaofei Zhang, Frank\u00a0K. Soong, Lei He, and Lei Xie. 2021. Conversational End-to-End TTS for Voice Agents. In IEEE Spoken Language Technology Workshop (SLT 21). IEEE, Shenzhen, China, 403\u2013409."},{"key":"e_1_3_2_2_16_1","volume-title":"Handbook of research methods in consumer psychology, Frank\u00a0R Kardes, Paul\u00a0M Herr","author":"Hauser David","unstructured":"David Hauser, Gabriele Paolacci, and Jesse\u00a0J Chandler. 2019. Common concerns with MTurk as a participant pool: Evidence and solutions. In Handbook of research methods in consumer psychology, Frank\u00a0R Kardes, Paul\u00a0M Herr, and Norbert Schwarz (Eds.). Routledge, New York, NY, USA."},{"key":"e_1_3_2_2_17_1","volume-title":"Proc. 12th Language Resources and Evaluation Conference (LREC","author":"He Fei","year":"2020","unstructured":"Fei He, Shan Hui\u00a0Cathy Chu, Oddur Kjartansson, Clara\u00a0E. Rivera, Anna Katanova, Alexander Gutkin, Isin Demirsahin, Cibu\u00a0C Johny, Martin Jansche, Supheakmungkol Sarin, and Knot Pipatsrisawat. 2020. Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems. In Proc. 12th Language Resources and Evaluation Conference (LREC 2020). European Language Resources Association (ELRA), Marseille, France, 6494\u20116503."},{"key":"e_1_3_2_2_18_1","volume-title":"Proc. of the The Web Conference 2018. International World Wide Web Conferences Steering Committee (IW3C2)","author":"Jim\u00e9nez Rafael\u00a0Zequeira","year":"2018","unstructured":"Rafael\u00a0Zequeira Jim\u00e9nez, Laura\u00a0Fern\u00e1ndez Fern\u00e1ndez\u00a0Gallardo, and Sebastian M\u00f6ller. 2018. Outliers Detection vs. Control Questions to Ensure Reliable Results in Crowdsourcing. A Speech Quality Assessment Case Study. In Proc. of the The Web Conference 2018. International World Wide Web Conferences Steering Committee (IW3C2), San Francisco, CA, USA, 1127\u20131130."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2018.8463298"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/1357054.1357127"},{"key":"e_1_3_2_2_21_1","volume-title":"Talker\u2019s voice and gender stereotype in human auditory sentence processing-evidence from event-related brain potentials. Neuroscience letters 339, 3","author":"Lattner Sonja","year":"2003","unstructured":"Sonja Lattner and Angela\u00a0D Friederici. 2003. Talker\u2019s voice and gender stereotype in human auditory sentence processing-evidence from event-related brain potentials. Neuroscience letters 339, 3 (2003), 191\u2013194."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3377325.3377488"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1002\/meet.14504901134"},{"key":"e_1_3_2_2_24_1","volume-title":"Agent Culture: Human-Agent Interaction in a Multicultural World","author":"Maldonado Heidy","unstructured":"Heidy Maldonado and Barbara Hayes-Roth. 2004. Toward cross-cultural believability in character design. In Agent Culture: Human-Agent Interaction in a Multicultural World. CRC Press, Boca Raton, FL, USA, 143\u2013175."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.7771\/1481-4374.2560"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4446"},{"key":"e_1_3_2_2_27_1","volume-title":"Standard English","author":"McWhorter John","unstructured":"John McWhorter. 1998. Word on the Street \u2013 Debunking the Myth of a \u201cPure\u201d Standard English. Basic Books, New York, NY, USA."},{"key":"e_1_3_2_2_28_1","volume-title":"Talking Back. Talking Black: Truths About America\u2019s Lingua Franca","author":"McWhorter John","unstructured":"John McWhorter. 2017. Talking Back. Talking Black: Truths About America\u2019s Lingua Franca. Bellevue Literary Press, New York, NY, USA."},{"key":"e_1_3_2_2_29_1","volume-title":"The acoustic correlates of perceived masculinity, perceived femininity, and perceived sexual orientation. Language and speech 50, 1","author":"Munson Benjamin","year":"2007","unstructured":"Benjamin Munson. 2007. The acoustic correlates of perceived masculinity, perceived femininity, and perceived sexual orientation. Language and speech 50, 1 (2007), 125\u2013142."},{"key":"e_1_3_2_2_30_1","volume-title":"Wired for speech: How voice activates and advances the human-computer relationship","author":"Nass Clifford\u00a0Ivar","unstructured":"Clifford\u00a0Ivar Nass and Scott Brave. 2005. Wired for speech: How voice activates and advances the human-computer relationship. MIT Press, Cambridge, MA, USA."},{"key":"e_1_3_2_2_31_1","volume-title":"Proc. of the 2018 International Conference on Human-robot Interaction (HRI 18)","author":"Phillips Elizabeth","year":"2018","unstructured":"Elizabeth Phillips, Xuan Zhao, Daniel Ullman, and Bertram\u00a0F Malle. 2018. What is human-like? decomposing robots\u2019 human-like appearance using the anthropomorphic robot (abot) database. In Proc. of the 2018 International Conference on Human-robot Interaction (HRI 18). ACM\/IEEE, Chicago, IL, USA, 105\u2013113."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2009.09.004"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1177\/0261927X99018001002"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2011.04.007"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1032"},{"key":"e_1_3_2_2_36_1","unstructured":"J. Shen Y. Jia M. Chrzanowski Y. Zhang I. Elias H. Zen and Y. Wu. 2020. Non-Attentive Tacotron: Robust and Controllable Neural TTS Synthesis Including Unsupervised Duration Modeling. arXiv:2010.04301"},{"key":"e_1_3_2_2_37_1","volume-title":"Proc. ICASSP 18","author":"Shen J.","unstructured":"J. Shen, R. R.\u00a0Pang, R.J. Weiss, M. Schuster, N. Jaitly, Z. Yang, Z. Chen, Y. Zhang, Y. Wang, R. Skerry-Ryan, R.A. Saurous, Y. Agiomyrgiannakis, and Y. Wu. 2018. Natural TTS Synthesis by Conditioning Wavenet on MEL Spectrogram Predictions. In Proc. ICASSP 18. IEEE, Calgary, Canada, 4779\u20134783."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/MRA.2019.2927372"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1177\/0162243919862862"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ROMAN.2018.8525610"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/s12369-011-0100-4"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1178"},{"key":"e_1_3_2_2_43_1","volume-title":"LPCNET: Improving Neural Speech Synthesis through Linear Prediction. In Proc. of ICASSP 19","author":"Valin M.","year":"2019","unstructured":"J.\u00a0M. Valin and J. Skoglund. 2019. LPCNET: Improving Neural Speech Synthesis through Linear Prediction. In Proc. of ICASSP 19. IEEE, Brighton, England, 5891\u20135895."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1044\/jshr.3704.738"}],"event":{"name":"IUI '24: 29th International Conference on Intelligent User Interfaces","location":"Greenville SC USA","acronym":"IUI '24","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 29th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640543.3645165","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3640543.3645165","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:59:01Z","timestamp":1764550741000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640543.3645165"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,18]]},"references-count":44,"alternative-id":["10.1145\/3640543.3645165","10.1145\/3640543"],"URL":"https:\/\/doi.org\/10.1145\/3640543.3645165","relation":{},"subject":[],"published":{"date-parts":[[2024,3,18]]},"assertion":[{"value":"2024-04-05","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}