{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:04:01Z","timestamp":1750309441017,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":132,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T00:00:00Z","timestamp":1742774400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Ministry of Education, Singapore","award":["T2EP20121-0040"],"award-info":[{"award-number":["T2EP20121-0040"]}]},{"name":"Infocomm Media Development Authority under its Trust Tech Funding Initiative","award":["DTC-RGC-09"],"award-info":[{"award-number":["DTC-RGC-09"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,24]]},"DOI":"10.1145\/3708359.3712105","type":"proceedings-article","created":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T12:50:34Z","timestamp":1742388634000},"page":"1203-1231","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Robust Relatable Explanations of Machine Learning with Disentangled Cue-specific Saliency"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6457-0713","authenticated-orcid":false,"given":"Harshavardhan Sunil","family":"Abichandani","sequence":"first","affiliation":[{"name":"Department of Computer Science, National University of Singapore, Singapore, Singapore,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1119-0957","authenticated-orcid":false,"given":"Wencan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, National University of Singapore, Singapore, Singapore,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0543-2414","authenticated-orcid":false,"given":"Brian Y","family":"Lim","sequence":"additional","affiliation":[{"name":"Department of Computer Science, National University of Singapore, Singapore, Singapore,"}]}],"member":"320","published-online":{"date-parts":[[2025,3,24]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"crossref","unstructured":"Abdelaziz\u00a0A Abdelhamid El-Sayed\u00a0M El-Kenawy Bandar Alotaibi Ghada\u00a0M Amer Mahmoud\u00a0Y Abdelkader Abdelhameed Ibrahim and Marwa\u00a0Metwally Eid. 2022. Robust speech emotion recognition using CNN+ LSTM based on stochastic fractal search optimization algorithm. Ieee Access 10 (2022) 49265\u201349284.","DOI":"10.1109\/ACCESS.2022.3172954"},{"key":"e_1_3_3_3_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174156"},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376615"},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1990.115971"},{"key":"e_1_3_3_3_6_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"crossref","unstructured":"Purvi Agrawal and Sriram Ganapathy. 2020. Interpretable representation learning for speech and audio signals based on relevance weighting. IEEE\/ACM Transactions on Audio Speech and Language Processing 28 (2020) 2823\u20132836.","DOI":"10.1109\/TASLP.2020.3030489"},{"key":"e_1_3_3_3_8_2","unstructured":"Marco Ancona Enea Ceolini Cengiz \u00d6ztireli and Markus Gross. 2017. Towards better understanding of gradient-based attribution methods for deep neural networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.06104 (2017)."},{"key":"e_1_3_3_3_9_2","first-page":"1121","volume-title":"International Conference on Machine Learning","author":"Asi Hilal","year":"2023","unstructured":"Hilal Asi, Jonathan Ullman, and Lydia Zakynthinou. 2023. From robustness to privacy and back. In International Conference on Machine Learning. PMLR, 1121\u20131146."},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445717"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.354"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"crossref","unstructured":"Yoshua Bengio Aaron Courville and Pascal Vincent. 2013. Representation learning: A review and new perspectives. IEEE transactions on pattern analysis and machine intelligence 35 8 (2013) 1798\u20131828.","DOI":"10.1109\/TPAMI.2013.50"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"crossref","unstructured":"Dmitri Bitouk Ragini Verma and Ani Nenkova. 2010. Class-level spectral features for emotion recognition. Speech communication 52 7-8 (2010) 613\u2013625.","DOI":"10.1016\/j.specom.2010.02.010"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642689"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"crossref","unstructured":"Antonis Botinis Bjorn Granstr\u00f6m and Bernd M\u00f6bius. 2001. Developments and paradigms in intonation research. Speech communication 33 4 (2001) 263\u2013296.","DOI":"10.1016\/S0167-6393(00)00060-1"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2009-106"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01129"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"crossref","unstructured":"Zana Bu\u00e7inca Maja\u00a0Barbara Malaya and Krzysztof\u00a0Z Gajos. 2021. To trust or to think: cognitive forcing functions can reduce overreliance on AI in AI-assisted decision-making. Proceedings of the ACM on Human-computer Interaction 5 CSCW1 (2021) 1\u201321.","DOI":"10.1145\/3449287"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3301275.3302289"},{"key":"e_1_3_3_3_20_2","unstructured":"Ricky\u00a0TQ Chen Xuechen Li Roger\u00a0B Grosse and David\u00a0K Duvenaud. 2018. Isolating sources of disentanglement in variational autoencoders. Advances in neural information processing systems 31 (2018)."},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"crossref","unstructured":"Valerie Chen Q\u00a0Vera Liao Jennifer Wortman\u00a0Vaughan and Gagan Bansal. 2023. Understanding the role of human intuition on reliance in human-AI decision-making with explanations. Proceedings of the ACM on Human-computer Interaction 7 CSCW2 (2023) 1\u201332.","DOI":"10.1145\/3610219"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"crossref","unstructured":"Po-Sheng Chiu Jia-Wei Chang Ming-Che Lee Ching-Hui Chen and Da-Sheng Lee. 2020. Enabling intelligent environment by the design of emotionally aware virtual assistant: A case of smart campus. IEEE Access 8 (2020) 62032\u201362041.","DOI":"10.1109\/ACCESS.2020.2984383"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"crossref","unstructured":"Alain De\u00a0Cheveign\u00e9 and Hideki Kawahara. 2002. YIN a fundamental frequency estimator for speech and music. The Journal of the Acoustical Society of America 111 4 (2002) 1917\u20131930.","DOI":"10.1121\/1.1458024"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"crossref","unstructured":"Bruno Defraene Naim Mansour Steven De\u00a0Hertogh Toon Van\u00a0Waterschoot Moritz Diehl and Marc Moonen. 2013. Declipping of audio signals using perceptual compressed sensing. IEEE Transactions on Audio Speech and Language Processing 21 12 (2013) 2627\u20132637.","DOI":"10.1109\/TASL.2013.2281570"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"crossref","unstructured":"John Dubnowski Ronald Schafer and Lawrence Rabiner. 1976. Real-time digital hardware pitch detector. IEEE Transactions on Acoustics Speech and Signal Processing 24 1 (1976) 2\u20138.","DOI":"10.1109\/TASSP.1976.1162765"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/1536414.1536466"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3301275.3302316"},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.371"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013681"},{"key":"e_1_3_3_3_30_2","unstructured":"Justin Gilmer Ryan\u00a0P Adams Ian Goodfellow David Andersen and George\u00a0E Dahl. 2018. Motivating the rules of the game for adversarial example research. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1807.06732 (2018)."},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"crossref","unstructured":"Daniel\u00a0G Goldstein and David Rothschild. 2014. Lay understanding of probability distributions. Judgment & Decision Making 9 1 (2014).","DOI":"10.1017\/S1930297500004940"},{"key":"e_1_3_3_3_32_2","first-page":"2376","volume-title":"International Conference on Machine Learning","author":"Goyal Yash","year":"2019","unstructured":"Yash Goyal, Ziyan Wu, Jan Ernst, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Counterfactual visual explanations. In International Conference on Machine Learning. PMLR, 2376\u20132384."},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"crossref","unstructured":"Beate Grawemeyer Manolis Mavrikis Wayne Holmes Sergio Guti\u00e9rrez-Santos Michael Wiedmann and Nikol Rummel. 2017. Affective learning: Improving engagement and enhancing learning with affect-aware feedback. User Modeling and User-Adapted Interaction 27 (2017) 119\u2013158.","DOI":"10.1007\/s11257-017-9188-z"},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00502"},{"key":"e_1_3_3_3_35_2","unstructured":"Dan Hendrycks Norman Mu Ekin\u00a0D Cubuk Barret Zoph Justin Gilmer and Balaji Lakshminarayanan. 2019. Augmix: A simple data processing method to improve robustness and uncertainty. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1912.02781 (2019)."},{"key":"e_1_3_3_3_36_2","unstructured":"Irina Higgins Loic Matthey Arka Pal Christopher\u00a0P Burgess Xavier Glorot Matthew\u00a0M Botvinick Shakir Mohamed and Alexander Lerchner. 2017. beta-vae: Learning basic visual concepts with a constrained variational framework. ICLR (Poster) 3 (2017)."},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3564246.3585115"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654984"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"crossref","unstructured":"Jeff Hwang Moto Hira Caroline Chen Xiaohui Zhang Zhaoheng Ni Guangzhi Sun Pingchuan Ma Ruizhe Huang Vineel Pratap Yuekai Zhang Anurag Kumar Chin-Yun Yu Chuang Zhu Chunxi Liu Jacob Kahn Mirco Ravanelli Peng Sun Shinji Watanabe Yangyang Shi Yumeng Tao Robin Scheibler Samuele Cornell Sean Kim and Stavros Petridis. 2023. TorchAudio 2.1: Advancing speech recognition self-supervised learning and audio processing components for PyTorch. arxiv:https:\/\/arXiv.org\/abs\/2310.17864\u00a0[eess.AS]","DOI":"10.1109\/ASRU57964.2023.10389648"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"crossref","unstructured":"Jinglu Jiang Surinder Kahai and Ming Yang. 2022. Who needs explanation and when? Juggling explainable AI and user epistemic uncertainty. International Journal of Human-Computer Studies 165 (2022) 102839.","DOI":"10.1016\/j.ijhcs.2022.102839"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"crossref","unstructured":"Peng-Tao Jiang Chang-Bin Zhang Qibin Hou Ming-Ming Cheng and Yunchao Wei. 2021. Layercam: Exploring hierarchical class activation maps for localization. IEEE Transactions on Image Processing 30 (2021) 5875\u20135888.","DOI":"10.1109\/TIP.2021.3089943"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"crossref","unstructured":"Patrik\u00a0N Juslin and Petri Laukka. 2001. Impact of intended emotion intensity on cue utilization and decoding accuracy in vocal expression of emotion. Emotion 1 4 (2001) 381.","DOI":"10.1037\/\/1528-3542.1.4.381"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"crossref","unstructured":"Athanasios Kallipolitis Michael Galliakis Andreas Menychtas and Ilias Maglogiannis. 2020. Affective analysis of patients in homecare video-assisted telemedicine using computational intelligence. Neural Computing and Applications 32 23 (2020) 17125\u201317136.","DOI":"10.1007\/s00521-020-05203-z"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-06417-3_55"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376219"},{"key":"e_1_3_3_3_47_2","first-page":"2668","volume-title":"International conference on machine learning","author":"Kim Been","year":"2018","unstructured":"Been Kim, Martin Wattenberg, Justin Gilmer, Carrie Cai, James Wexler, Fernanda Viegas, et\u00a0al. 2018. Interpretability beyond feature attribution: Quantitative testing with concept activation vectors (TCAV). In International conference on machine learning. PMLR, 2668\u20132677."},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"crossref","unstructured":"Doh-Suk Kim Soo-Young Lee and Rhee\u00a0Man Kil. 1999. Auditory processing of speech signals for robust speech recognition in real-world noisy environments. IEEE Transactions on speech and audio processing 7 1 (1999) 55\u201369.","DOI":"10.1109\/89.736331"},{"key":"e_1_3_3_3_49_2","unstructured":"Diederik\u00a0P Kingma. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300641"},{"key":"e_1_3_3_3_51_2","first-page":"5338","volume-title":"International Conference on Machine Learning","author":"Koh Pang\u00a0Wei","year":"2020","unstructured":"Pang\u00a0Wei Koh, Thao Nguyen, Yew\u00a0Siang Tang, Stephen Mussmann, Emma Pierson, Been Kim, and Percy Liang. 2020. Concept bottleneck models. In International Conference on Machine Learning. PMLR, 5338\u20135348."},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"crossref","unstructured":"Jacques Koreman. 2006. Perceived speech rate: The effects of articulation rate and speaking style in spontaneous speech. The Journal of the Acoustical Society of America 119 1 (2006) 582\u2013596.","DOI":"10.1121\/1.2133436"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"crossref","unstructured":"Felix Kreuk Joseph Keshet and Yossi Adi. 2020. Self-supervised contrastive learning for unsupervised phoneme segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2007.13465 (2020).","DOI":"10.21437\/Interspeech.2020-2398"},{"key":"e_1_3_3_3_54_2","volume-title":"NeurIPS Workshop on Interpretability and Robustness in Audio, Speech, and Language (IRASL)","author":"Krug Andreas","year":"2018","unstructured":"Andreas Krug, Ren\u00e9 Knaebel, and Sebastian Stober. 2018. Neuron activation profiles for interpreting convolutional speech recognition models. In NeurIPS Workshop on Interpretability and Robustness in Audio, Speech, and Language (IRASL)."},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5421"},{"key":"e_1_3_3_3_56_2","unstructured":"Samuli Laine Tero Karras Jaakko Lehtinen and Timo Aila. 2019. High-quality self-supervised deep image denoising. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_3_3_57_2","first-page":"5628","volume-title":"International conference on machine learning","author":"Lakkaraju Himabindu","year":"2020","unstructured":"Himabindu Lakkaraju, Nino Arsov, and Osbert Bastani. 2020. Robust and stable black box explanations. In International conference on machine learning. PMLR, 5628\u20135638."},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3375627.3375833"},{"key":"e_1_3_3_3_59_2","unstructured":"Balaji Lakshminarayanan Alexander Pritzel and Charles Blundell. 2017. Simple and scalable predictive uncertainty estimation using deep ensembles. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_3_60_2","volume-title":"Impact of noise in automatic speech recognition for low-resourced languages","author":"Lakshminarayanan Vigneshwar","year":"2022","unstructured":"Vigneshwar Lakshminarayanan. 2022. Impact of noise in automatic speech recognition for low-resourced languages. Rochester Institute of Technology."},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","unstructured":"Charles\u00a0R. Larson Jean Sun and Timothy\u00a0C. Hain. 2007. Effects of simultaneous perturbations of voice pitch and loudness feedback on voice F and amplitude control. The Journal of the Acoustical Society of America 121 5 (May 2007) 2862\u20132872. 10.1121\/1.2715657","DOI":"10.1121\/1.2715657"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054675"},{"key":"e_1_3_3_3_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00960"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"crossref","unstructured":"Song Li Mustafa\u00a0Ozkan Yerebakan Yue Luo Ben Amaba William Swope and Boyi Hu. 2022. The effect of different occupational background noises on voice recognition accuracy. Journal of Computing and Information Science in Engineering 22 5 (2022) 050905.","DOI":"10.1115\/1.4053521"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/2030112.2030168"},{"key":"e_1_3_3_3_66_2","doi-asserted-by":"crossref","unstructured":"Zachary\u00a0C Lipton. 2018. The Mythos of Model Interpretability: In machine learning the concept of interpretability is both important and slippery. Queue 16 3 (2018) 31\u201357.","DOI":"10.1145\/3236386.3241340"},{"key":"e_1_3_3_3_67_2","doi-asserted-by":"crossref","unstructured":"Chi-Min Liu Han-Wen Hsu and Wen-Chieh Lee. 2008. Compression artifacts in perceptual audio coding. IEEE transactions on audio speech and language processing 16 4 (2008) 681\u2013695.","DOI":"10.1109\/TASL.2008.918979"},{"key":"e_1_3_3_3_68_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1257"},{"key":"e_1_3_3_3_69_2","first-page":"4765","volume-title":"Advances in neural information processing systems","author":"Lundberg Scott\u00a0M","year":"2017","unstructured":"Scott\u00a0M Lundberg and Su-In Lee. 2017. A unified approach to interpreting model predictions. In Advances in neural information processing systems. 4765\u20134774."},{"key":"e_1_3_3_3_70_2","doi-asserted-by":"crossref","unstructured":"Keenan\u00a0R May Brianna\u00a0J Tomlinson Xiaomeng Ma Phillip Roberts and Bruce\u00a0N Walker. 2020. Spotlights and soundscapes: On the design of mixed reality auditory environments for persons with visual impairment. ACM Transactions on Accessible Computing (TACCESS) 13 2 (2020) 1\u201347.","DOI":"10.1145\/3378576"},{"key":"e_1_3_3_3_71_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096607"},{"key":"e_1_3_3_3_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022601"},{"key":"e_1_3_3_3_73_2","doi-asserted-by":"crossref","unstructured":"Tim Miller. 2019. Explanation in artificial intelligence: Insights from the social sciences. Artificial intelligence 267 (2019) 1\u201338.","DOI":"10.1016\/j.artint.2018.07.007"},{"key":"e_1_3_3_3_74_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952552"},{"key":"e_1_3_3_3_75_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01208"},{"key":"e_1_3_3_3_76_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.675368"},{"key":"e_1_3_3_3_77_2","doi-asserted-by":"crossref","unstructured":"Katelyn Morrison Philipp Spitzer Violet Turri Michelle Feng Niklas K\u00fchl and Adam Perer. 2024. The Impact of Imperfect XAI on Human-AI Decision-Making. Proceedings of the ACM on Human-Computer Interaction 8 CSCW1 (2024) 1\u201339.","DOI":"10.1145\/3641022"},{"key":"e_1_3_3_3_78_2","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372850"},{"key":"e_1_3_3_3_79_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414721"},{"key":"e_1_3_3_3_80_2","doi-asserted-by":"crossref","unstructured":"Andreas Nautsch Abelino Jim\u00e9nez Amos Treiber Jascha Kolberg Catherine Jasserand Els Kindt H\u00e9ctor Delgado Massimiliano Todisco Mohamed\u00a0Amine Hmani Aymen Mtibaa et\u00a0al. 2019. Preserving privacy in speaker and speech characterisation. Computer Speech & Language 58 (2019) 441\u2013480.","DOI":"10.1016\/j.csl.2019.06.001"},{"key":"e_1_3_3_3_81_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2006-277"},{"key":"e_1_3_3_3_82_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581641.3584067"},{"key":"e_1_3_3_3_83_2","doi-asserted-by":"crossref","unstructured":"Chris Olah Alexander Mordvintsev and Ludwig Schubert. 2017. Feature visualization. Distill 2 11 (2017) e7.","DOI":"10.23915\/distill.00007"},{"key":"e_1_3_3_3_84_2","doi-asserted-by":"crossref","unstructured":"Jose Patino Natalia Tomashenko Massimiliano Todisco Andreas Nautsch and Nicholas Evans. 2020. Speaker anonymisation using the McAdams coefficient. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2011.01130 (2020).","DOI":"10.21437\/Interspeech.2021-1070"},{"key":"e_1_3_3_3_85_2","doi-asserted-by":"crossref","unstructured":"Rosalind\u00a0W. Picard Elias Vyzas and Jennifer Healey. 2001. Toward machine emotional intelligence: Analysis of affective physiological state. IEEE transactions on pattern analysis and machine intelligence 23 10 (2001) 1175\u20131191.","DOI":"10.1109\/34.954607"},{"key":"e_1_3_3_3_86_2","first-page":"983","volume-title":"proceedings of the IEEE\/CVF winter conference on applications of computer vision","author":"Ramaswamy Harish\u00a0Guruprasad","year":"2020","unstructured":"Harish\u00a0Guruprasad Ramaswamy et\u00a0al. 2020. Ablation-cam: Visual explanations for deep convolutional network via gradient-free localization. In proceedings of the IEEE\/CVF winter conference on applications of computer vision. 983\u2013991."},{"key":"e_1_3_3_3_87_2","unstructured":"Xuanchi Ren Tao Yang Yuwang Wang and Wenjun Zeng. 2021. Learning disentangled representation by exploiting pretrained generative models: A contrastive learning view. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2102.10543 (2021)."},{"key":"e_1_3_3_3_88_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462417"},{"key":"e_1_3_3_3_89_2","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939778"},{"key":"e_1_3_3_3_90_2","doi-asserted-by":"crossref","unstructured":"Colleen Richey Maria\u00a0A Barrios Zeb Armstrong Chris Bartels Horacio Franco Martin Graciarena Aaron Lawson Mahesh\u00a0Kumar Nandwana Allen Stauffer Julien van Hout et\u00a0al. 2018. Voices obscured in complex environmental settings (voices) corpus. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.05053 (2018).","DOI":"10.21437\/Interspeech.2018-1454"},{"key":"e_1_3_3_3_91_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/371"},{"key":"e_1_3_3_3_92_2","first-page":"366","volume-title":"INTERSPEECH","author":"Rozgic Viktor","year":"2012","unstructured":"Viktor Rozgic, Sankaranarayanan Ananthakrishnan, Shirin Saleem, Rohit Kumar, Aravind\u00a0Namandi Vembu, and Rohit Prasad. 2012. Emotion Recognition using Acoustic and Lexical Features.. In INTERSPEECH , Vol.\u00a02012. 366\u2013369."},{"key":"e_1_3_3_3_93_2","doi-asserted-by":"publisher","DOI":"10.1145\/3301275.3302308"},{"key":"e_1_3_3_3_94_2","doi-asserted-by":"crossref","unstructured":"Philipp Schmidt Felix Biessmann and Timm Teubner. 2020. Transparency and trust in artificial intelligence systems. Journal of Decision Systems 29 4 (2020) 260\u2013278.","DOI":"10.1080\/12460125.2020.1819094"},{"key":"e_1_3_3_3_95_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642621"},{"key":"e_1_3_3_3_96_2","doi-asserted-by":"publisher","DOI":"10.6028\/NIST.SP.1270"},{"key":"e_1_3_3_3_97_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_3_3_98_2","volume-title":"International Telecommunication Union Radiocommunication Assembly","author":"Series BS","year":"2011","unstructured":"BS Series. 2011. Algorithms to measure audio programme loudness and true-peak audio level. In International Telecommunication Union Radiocommunication Assembly."},{"key":"e_1_3_3_3_99_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461893"},{"key":"e_1_3_3_3_100_2","doi-asserted-by":"publisher","DOI":"10.1145\/3461702.3462533"},{"key":"e_1_3_3_3_101_2","unstructured":"Karen Simonyan Andrea Vedaldi and Andrew Zisserman. 2013. Deep inside convolutional networks: Visualising image classification models and saliency maps. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1312.6034 (2013)."},{"key":"e_1_3_3_3_102_2","unstructured":"Karen Simonyan Andrea Vedaldi and Andrew Zisserman. 2014. Deep inside convolutional networks: Visualising image classification models and saliency maps. (2014)."},{"key":"e_1_3_3_3_103_2","unstructured":"Dylan Slack Anna Hilgard Himabindu Lakkaraju and Sameer Singh. 2021. Counterfactual explanations can be manipulated. Advances in neural information processing systems 34 (2021) 62\u201375."},{"key":"e_1_3_3_3_104_2","doi-asserted-by":"publisher","DOI":"10.1145\/3375627.3375830"},{"key":"e_1_3_3_3_105_2","doi-asserted-by":"crossref","unstructured":"George\u00a0M Slavich Sara Taylor and Rosalind\u00a0W Picard. 2019. Stress measurement using speech: Recent advancements validation issues and ethical and privacy considerations. Stress 22 4 (2019) 408\u2013413.","DOI":"10.1080\/10253890.2019.1584180"},{"key":"e_1_3_3_3_106_2","unstructured":"Jiaqi Su Zeyu Jin and Adam Finkelstein. 2020. HiFi-GAN: High-fidelity denoising and dereverberation based on speech deep features in adversarial networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.05694 (2020)."},{"key":"e_1_3_3_3_107_2","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA52581.2021.9632770"},{"key":"e_1_3_3_3_108_2","unstructured":"Mukund Sundararajan Ankur Taly and Qiqi Yan. 2017. Axiomatic attribution for deep networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1703.01365 (2017)."},{"key":"e_1_3_3_3_109_2","first-page":"6056","volume-title":"International Conference on Machine Learning","author":"Suter Raphael","year":"2019","unstructured":"Raphael Suter, Djordje Miladinovic, Bernhard Sch\u00f6lkopf, and Stefan Bauer. 2019. Robustly disentangled causal mechanisms: Validating deep representations for interventional robustness. In International Conference on Machine Learning. PMLR, 6056\u20136065."},{"key":"e_1_3_3_3_110_2","doi-asserted-by":"publisher","DOI":"10.1145\/3397481.3450662"},{"key":"e_1_3_3_3_111_2","unstructured":"Rohan Taori Achal Dave Vaishaal Shankar Nicholas Carlini Benjamin Recht and Ludwig Schmidt. 2020. Measuring robustness to natural distribution shifts in image classification. Advances in Neural Information Processing Systems 33 (2020) 18583\u201318599."},{"key":"e_1_3_3_3_112_2","unstructured":"J Thiemann N Ito and E Vincent. 2013. Diverse environments multichannel acoustic noise database (demand)."},{"key":"e_1_3_3_3_113_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462677"},{"key":"e_1_3_3_3_114_2","doi-asserted-by":"crossref","unstructured":"Adam\u00a0P Vogel and Angela\u00a0T Morgan. 2009. Factors affecting the quality of sound recording for speech and voice analysis. International journal of speech-language pathology 11 6 (2009) 431\u2013437.","DOI":"10.3109\/17549500902822189"},{"key":"e_1_3_3_3_115_2","doi-asserted-by":"crossref","unstructured":"Sandra Wachter Brent Mittelstadt and Chris Russell. 2017. Counterfactual explanations without opening the black box: Automated decisions and the GDPR. Harv. JL & Tech. 31 (2017) 841.","DOI":"10.2139\/ssrn.3063289"},{"key":"e_1_3_3_3_116_2","doi-asserted-by":"crossref","unstructured":"Danding Wang Wencan Zhang and Brian\u00a0Y Lim. 2021. Show or suppress? Managing input uncertainty in machine learning model explanations. Artificial Intelligence 294 (2021) 103456.","DOI":"10.1016\/j.artint.2021.103456"},{"key":"e_1_3_3_3_117_2","volume-title":"Workshops at the thirty-second AAAI conference on artificial intelligence","author":"Wang Zhiguang","year":"2018","unstructured":"Zhiguang Wang and Jianbo Yang. 2018. Diabetic retinopathy detection via deep convolutional networks for discriminative localization and visual explanation. In Workshops at the thirty-second AAAI conference on artificial intelligence."},{"key":"e_1_3_3_3_118_2","unstructured":"Jiawei Wu Xiaoya Li Xiang Ao Yuxian Meng Fei Wu and Jiwei Li. 2020. Improving robustness and generality of NLP models using disentangled representations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2009.09587 (2020)."},{"key":"e_1_3_3_3_119_2","doi-asserted-by":"crossref","unstructured":"Huan Xu and Shie Mannor. 2012. Robustness and generalization. Machine learning 86 (2012) 391\u2013423.","DOI":"10.1007\/s10994-011-5268-1"},{"key":"e_1_3_3_3_120_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-05643-7_33"},{"key":"e_1_3_3_3_121_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/741"},{"key":"e_1_3_3_3_122_2","doi-asserted-by":"publisher","DOI":"10.1145\/3377325.3377480"},{"key":"e_1_3_3_3_123_2","unstructured":"Yao-Yuan Yang Moto Hira Zhaoheng Ni Anjali Chourdia Artyom Astafurov Caroline Chen Ching-Feng Yeh Christian Puhrsch David Pollack Dmitriy Genzel Donny Greenberg Edward\u00a0Z. Yang Jason Lian Jay Mahadeokar Jeff Hwang Ji Chen Peter Goldsborough Prabhat Roy Sean Narenthiran Shinji Watanabe Soumith Chintala Vincent Quenneville-B\u00e9lair and Yangyang Shi. 2021. TorchAudio: Building Blocks for Audio and Speech Processing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2110.15018 (2021)."},{"key":"e_1_3_3_3_124_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639583"},{"key":"e_1_3_3_3_125_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517522"},{"key":"e_1_3_3_3_126_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501826"},{"key":"e_1_3_3_3_127_2","volume-title":"29th { USENIX} security symposium ({ USENIX} security 20)","author":"Zhang Xinyang","year":"2020","unstructured":"Xinyang Zhang, Ningfei Wang, Hua Shen, Shouling Ji, Xiapu Luo, and Ting Wang. 2020. Interpretable deep learning under fire. In 29th { USENIX} security symposium ({ USENIX} security 20)."},{"key":"e_1_3_3_3_128_2","doi-asserted-by":"crossref","unstructured":"Jianfeng Zhao Xia Mao and Lijiang Chen. 2019. Speech emotion recognition using deep 1D & 2D CNN LSTM networks. Biomedical Signal Processing and Control 47 (2019) 312\u2013323.","DOI":"10.1016\/j.bspc.2018.08.035"},{"key":"e_1_3_3_3_129_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00072"},{"key":"e_1_3_3_3_130_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"e_1_3_3_3_131_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_8"},{"key":"e_1_3_3_3_132_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413391"},{"key":"e_1_3_3_3_133_2","doi-asserted-by":"crossref","unstructured":"Pablo Zinemanas Mart\u00edn Rocamora Marius Miron Frederic Font and Xavier Serra. 2021. An interpretable deep learning model for automatic sound classification. Electronics 10 7 (2021) 850.","DOI":"10.3390\/electronics10070850"}],"event":{"name":"IUI '25: 30th International Conference on Intelligent User Interfaces","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"location":"Cagliari Italy","acronym":"IUI '25"},"container-title":["Proceedings of the 30th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712105","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708359.3712105","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:09:46Z","timestamp":1750295386000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712105"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,24]]},"references-count":132,"alternative-id":["10.1145\/3708359.3712105","10.1145\/3708359"],"URL":"https:\/\/doi.org\/10.1145\/3708359.3712105","relation":{},"subject":[],"published":{"date-parts":[[2025,3,24]]},"assertion":[{"value":"2025-03-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}