{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T14:52:13Z","timestamp":1774709533183,"version":"3.50.1"},"reference-count":81,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100021851","name":"William Demant Fonden","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100021851","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Speech Communication"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.specom.2026.103377","type":"journal-article","created":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T00:25:32Z","timestamp":1773188732000},"page":"103377","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["TSIP-Net: No-reference speech intelligibility prediction in the presence of competing speech"],"prefix":"10.1016","volume":"179","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3532-3455","authenticated-orcid":false,"given":"Haolan","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5322-2449","authenticated-orcid":false,"given":"Wai-Yip","family":"Chan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1478-622X","authenticated-orcid":false,"given":"Jesper","family":"Jensen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.specom.2026.103377_b1","series-title":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"The 2nd clarity enhancement challenge for hearing aid speech intelligibility enhancement: Overview and outcomes","author":"Akeroyd","year":"2023"},{"key":"10.1016\/j.specom.2026.103377_b2","series-title":"2017 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5085","article-title":"A non-intrusive short-time objective intelligibility measure","author":"Andersen","year":"2017"},{"key":"10.1016\/j.specom.2026.103377_b3","doi-asserted-by":"crossref","first-page":"1925","DOI":"10.1109\/TASLP.2018.2847459","article-title":"Nonintrusive speech intelligibility prediction using convolutional neural networks","volume":"26","author":"Andersen","year":"2018","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b4","series-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"10.1016\/j.specom.2026.103377_b5","series-title":"Advances in Neural Information Processing Systems","first-page":"12449","article-title":"Wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations","author":"Baevski","year":"2020"},{"key":"10.1016\/j.specom.2026.103377_b6","series-title":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6469","article-title":"Within-sample variability-invariant loss for robust speaker recognition under noisy environments","author":"Cai","year":"2020"},{"key":"10.1016\/j.specom.2026.103377_b7","doi-asserted-by":"crossref","first-page":"311","DOI":"10.1016\/j.bspc.2012.11.007","article-title":"Predicting the intelligibility of reverberant speech for cochlear implant listeners with a non-intrusive intelligibility measure","volume":"8","author":"Chen","year":"2013","journal-title":"Biomed. Signal Process. Control."},{"key":"10.1016\/j.specom.2026.103377_b8","doi-asserted-by":"crossref","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","article-title":"WavLM: Large-scale self-supervised pre-training for full stack speech processing","volume":"16","author":"Chen","year":"2022","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"10.1016\/j.specom.2026.103377_b9","series-title":"Proc. Interspeech","first-page":"1373","article-title":"Evaluating near end listening enhancement algorithms in realistic environments","author":"Chermaz","year":"2019"},{"key":"10.1016\/j.specom.2026.103377_b10","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2025","first-page":"467","article-title":"Audio-aware large language models as judges for speaking styles","author":"Chiang","year":"2025"},{"key":"10.1016\/j.specom.2026.103377_b11","doi-asserted-by":"crossref","first-page":"1562","DOI":"10.1121\/1.2166600","article-title":"A glimpsing model of speech perception in noise","volume":"119","author":"Cooke","year":"2006","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103377_b12","series-title":"Proc. Interspeech","first-page":"3552","article-title":"Intelligibility-enhancing speech modifications: the hurricane challenge","author":"Cooke","year":"2013"},{"key":"10.1016\/j.specom.2026.103377_b13","doi-asserted-by":"crossref","first-page":"572","DOI":"10.1016\/j.specom.2013.01.001","article-title":"Evaluating the intelligibility benefit of speech modifications in known noise conditions","volume":"55","author":"Cooke","year":"2013","journal-title":"Speech Commun."},{"key":"10.1016\/j.specom.2026.103377_b14","series-title":"LibriMix: An open-source dataset for generalizable speech separation","author":"Cosentino","year":"2020"},{"key":"10.1016\/j.specom.2026.103377_b15","series-title":"Interspeech 2020","first-page":"3830","article-title":"ECAPA-TDNN: Emphasized channel attention, propagation and aggregation in tdnn based speaker verification","author":"Desplanques","year":"2020"},{"key":"10.1016\/j.specom.2026.103377_b16","series-title":"Proc. Interspeech","first-page":"206","article-title":"A spectro-temporal glimpsing index (STGI) for speech intelligibility prediction","author":"Edraki","year":"2021"},{"key":"10.1016\/j.specom.2026.103377_b17","doi-asserted-by":"crossref","first-page":"210","DOI":"10.1109\/TASLP.2020.3039929","article-title":"Speech intelligibility prediction using spectro-temporal modulation analysis","volume":"29","author":"Edraki","year":"2021","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b18","doi-asserted-by":"crossref","first-page":"978","DOI":"10.1109\/TIM.2009.2024697","article-title":"Temporal dynamics for blind measurement of room acoustical parameters","volume":"59","author":"Falk","year":"2010","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"10.1016\/j.specom.2026.103377_b19","doi-asserted-by":"crossref","first-page":"1766","DOI":"10.1109\/TASL.2010.2052247","article-title":"A non-intrusive quality and intelligibility measure of reverberant and dereverberated speech","volume":"18","author":"Falk","year":"2010","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b20","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2021.103204","article-title":"Nonintrusive objective measurement of speech intelligibility: A review of methodology","volume":"71","author":"Feng","year":"2022","journal-title":"Biomed. Signal Process. Control."},{"key":"10.1016\/j.specom.2026.103377_b21","doi-asserted-by":"crossref","first-page":"1725","DOI":"10.1121\/1.400247","article-title":"Effects of fluctuating noise and interfering speech on the speech-reception threshold for impaired and normal hearing","volume":"88","author":"Festen","year":"1990","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103377_b22","doi-asserted-by":"crossref","first-page":"3328","DOI":"10.1121\/10.0022445","article-title":"Sentence recognition with modulation-filtered speech segments for younger and older adults: Effects of hearing impairment and cognition","volume":"154","author":"Fogerty","year":"2023","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103377_b23","doi-asserted-by":"crossref","first-page":"EL396","DOI":"10.1121\/10.0001217","article-title":"The effect of simulated room acoustic parameters on the intelligibility and perceived reverberation of monosyllabic words and sentences","volume":"147","author":"Fogerty","year":"2020","journal-title":"J. Acoust. Soc. Amer"},{"key":"10.1016\/j.specom.2026.103377_b24","doi-asserted-by":"crossref","DOI":"10.1121\/1.5101302","article-title":"Effects of age, modulation rate, and modulation depth on sentence recognition in speech-modulated noise","volume":"145","author":"Fogerty","year":"2019","journal-title":"J. Acoust. Soc. Amer"},{"key":"10.1016\/j.specom.2026.103377_b25","doi-asserted-by":"crossref","first-page":"90","DOI":"10.1121\/1.1916407","article-title":"Factors governing the intelligibility of speech sounds","volume":"19","author":"French","year":"1947","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103377_b26","doi-asserted-by":"crossref","first-page":"EL449","DOI":"10.1121\/1.5041466","article-title":"Explaining intelligibility in speech-modulated maskers using acoustic glimpse analysis","volume":"143","author":"Gibbs","year":"2018","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103377_b27","series-title":"Proceedings of the 23rd International Conference on Machine Learning","first-page":"369","article-title":"Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks","author":"Graves","year":"2006"},{"key":"10.1016\/j.specom.2026.103377_b28","series-title":"Interspeech 2020","first-page":"5036","article-title":"Conformer: Convolution-augmented transformer for speech recognition","author":"Gulati","year":"2020"},{"key":"10.1016\/j.specom.2026.103377_b29","series-title":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"12702","article-title":"Audio deepfake detection with self-supervised WavLM and multi-fusion attentive classifier","author":"Guo","year":"2024"},{"key":"10.1016\/j.specom.2026.103377_b30","series-title":"2014 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6334","article-title":"I-vector-based speaker adaptation of deep neural networks for French broadcast audio transcription","author":"Gupta","year":"2014"},{"key":"10.1016\/j.specom.2026.103377_b31","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.specom.2026.103377_b32","first-page":"355","article-title":"Evaluation of speech transmission channels by using artificial signals","volume":"25","author":"Houtgast","year":"1971","journal-title":"Acta Acust. United Acust."},{"key":"10.1016\/j.specom.2026.103377_b33","doi-asserted-by":"crossref","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","article-title":"HuBERT: Self-supervised speech representation learning by masked prediction of hidden units","volume":"29","author":"Hsu","year":"2021","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b34","doi-asserted-by":"crossref","first-page":"1777","DOI":"10.1121\/1.2766778","article-title":"A comparative intelligibility study of single-microphone noise reduction algorithms","volume":"122","author":"Hu","year":"2007","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103377_b35","series-title":"International Conference on Learning Representations","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2022"},{"key":"10.1016\/j.specom.2026.103377_b36","series-title":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Adapting self-supervised models to multi-talker speech recognition using speaker embeddings","author":"Huang","year":"2023"},{"key":"10.1016\/j.specom.2026.103377_b37","series-title":"American National Standard: Methods for Calculation of the Speech Intelligibility Index","author":"Institute","year":"1997"},{"key":"10.1016\/j.specom.2026.103377_b38","doi-asserted-by":"crossref","first-page":"2009","DOI":"10.1109\/TASLP.2016.2585878","article-title":"An algorithm for predicting the intelligibility of speech masked by modulated noise maskers","volume":"24","author":"Jensen","year":"2016","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b39","series-title":"Interspeech 2019","first-page":"236","article-title":"Auxiliary interference speaker loss for target-speaker speech recognition","author":"Kanda","year":"2019"},{"key":"10.1016\/j.specom.2026.103377_b40","doi-asserted-by":"crossref","DOI":"10.1016\/j.heares.2022.108606","article-title":"ASR-based speech intelligibility prediction: A review","volume":"426","author":"Karbasi","year":"2022","journal-title":"Hear. Res."},{"key":"10.1016\/j.specom.2026.103377_b41","doi-asserted-by":"crossref","first-page":"2224","DOI":"10.1121\/1.1862575","article-title":"Coherence and the speech intelligibility index","volume":"117","author":"Kates","year":"2005","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103377_b42","series-title":"2017 25th European Signal Processing Conference","first-page":"216","article-title":"Non-intrusive intelligibility prediction using a codebook-based approach","author":"Kavalekalam","year":"2017"},{"key":"10.1016\/j.specom.2026.103377_b43","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1016\/S0167-6393(01)00041-3","article-title":"Testing the correlation of word error rate and perplexity","volume":"38","author":"Klakow","year":"2002","journal-title":"Speech Commun."},{"key":"10.1016\/j.specom.2026.103377_b44","series-title":"2017 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5220","article-title":"A study on data augmentation of reverberant speech for robust speech recognition","author":"Ko","year":"2017"},{"key":"10.1016\/j.specom.2026.103377_b45","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"22511","article-title":"GLIGEN: Open-set grounded text-to-image generation","author":"Li","year":"2023"},{"key":"10.1016\/j.specom.2026.103377_b46","series-title":"Interspeech 2024","first-page":"597","article-title":"SA-WavLM: Speaker-aware self-supervised pre-training for mixture speech","author":"Lin","year":"2024"},{"key":"10.1016\/j.specom.2026.103377_b47","series-title":"RoBERTa: A robustly optimized BERT pretraining approach","author":"Liu","year":"2019"},{"key":"10.1016\/j.specom.2026.103377_b48","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1016\/j.specom.2023.04.001","article-title":"On the deficiency of intelligibility metrics as proxies for subjective intelligibility","volume":"150","author":"L\u00f3pez-Espejo","year":"2023","journal-title":"Speech Commun."},{"key":"10.1016\/j.specom.2026.103377_b49","series-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"696","article-title":"TasNet: Time-domain audio separation network for real-time, single-channel speech separation","author":"Luo","year":"2018"},{"key":"10.1016\/j.specom.2026.103377_b50","series-title":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"12516","article-title":"Extending Whisper with prompt tuning to target-speaker ASR","author":"Ma","year":"2024"},{"key":"10.1016\/j.specom.2026.103377_b51","article-title":"Visualizing data using t-SNE","volume":"9","author":"Van der Maaten","year":"2008","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.specom.2026.103377_b52","series-title":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Speechlmscore: Evaluating speech generation using speech language model","author":"Maiti","year":"2023"},{"key":"10.1016\/j.specom.2026.103377_b53","series-title":"AudioJudge: Understanding what works in large audio model based speech evaluation","author":"Manakul","year":"2025"},{"key":"10.1016\/j.specom.2026.103377_b54","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"A sidecar separator can convert a single-talker speech recognition system to a multi-talker one","author":"Meng","year":"2023"},{"key":"10.1016\/j.specom.2026.103377_b55","series-title":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5206","article-title":"LibriSpeech: An ASR corpus based on public domain audio books","author":"Panayotov","year":"2015"},{"key":"10.1016\/j.specom.2026.103377_b56","series-title":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"336","article-title":"A neural network for monaural intrusive speech intelligibility prediction","author":"Pedersen","year":"2020"},{"key":"10.1016\/j.specom.2026.103377_b57","article-title":"Data-driven non-intrusive speech intelligibility prediction using speech presence probability","author":"Pedersen","year":"2023","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b58","series-title":"International Conference on Learning Representations","article-title":"Conditionally adaptive multi-task learning: Improving transfer learning in NLP using fewer parameters & less data","author":"Pilault","year":"2021"},{"key":"10.1016\/j.specom.2026.103377_b59","series-title":"Proceedings of the 40th International Conference on Machine Learning","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2023"},{"key":"10.1016\/j.specom.2026.103377_b60","doi-asserted-by":"crossref","DOI":"10.1121\/1.4780630","article-title":"An SII-based approach to predict the speech intelligibility in fluctuating noise for normal-hearing listeners","volume":"115","author":"Rhebergen","year":"2004","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103377_b61","doi-asserted-by":"crossref","first-page":"225","DOI":"10.1109\/TAU.1969.1162058","article-title":"IEEE recommended practice for speech quality measurements","volume":"17","author":"Rothauser","year":"1969","journal-title":"IEEE Trans. Audio Electroacoust."},{"key":"10.1016\/j.specom.2026.103377_b62","series-title":"2014 14th International Workshop on Acoustic Signal Enhancement","first-page":"55","article-title":"An improved non-intrusive intelligibility metric for noisy and reverberant speech","author":"Santos","year":"2014"},{"key":"10.1016\/j.specom.2026.103377_b63","series-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5329","article-title":"X-Vectors: Robust DNN embeddings for speaker recognition","author":"Snyder","year":"2018"},{"key":"10.1016\/j.specom.2026.103377_b64","series-title":"2017 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"386","article-title":"Pitch-based non-intrusive objective intelligibility prediction","author":"S\u00f8rensen","year":"2017"},{"key":"10.1016\/j.specom.2026.103377_b65","doi-asserted-by":"crossref","first-page":"51","DOI":"10.1016\/j.csl.2017.10.004","article-title":"Predicting speech intelligibility with deep neural networks","volume":"48","author":"Spille","year":"2018","journal-title":"Comput. Speech Lang."},{"key":"10.1016\/j.specom.2026.103377_b66","article-title":"Inception-v4, inception-resnet and the impact of residual connections on learning","volume":"31","author":"Szegedy","year":"2017","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.specom.2026.103377_b67","doi-asserted-by":"crossref","first-page":"2125","DOI":"10.1109\/TASL.2011.2114881","article-title":"An algorithm for intelligibility prediction of time\u2013frequency weighted noisy speech","volume":"19","author":"Taal","year":"2011","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b68","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.specom.2026.103377_b69","doi-asserted-by":"crossref","first-page":"1702","DOI":"10.1109\/TASLP.2018.2842159","article-title":"Supervised speech separation based on deep learning: An overview","volume":"26","author":"Wang","year":"2018","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b70","series-title":"Interspeech 2024","first-page":"3849","article-title":"No-reference speech intelligibility prediction leveraging a noisy-speech ASR pre-trained model","author":"Wang","year":"2024"},{"key":"10.1016\/j.specom.2026.103377_b71","doi-asserted-by":"crossref","first-page":"1849","DOI":"10.1109\/TASLP.2014.2352935","article-title":"On training targets for supervised speech separation","volume":"22","author":"Wang","year":"2014","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b72","series-title":"Interspeech 2023","first-page":"5301","article-title":"CAM++: A fast and efficient network for speaker verification using context-aware masking","author":"Wang","year":"2023"},{"key":"10.1016\/j.specom.2026.103377_b73","series-title":"Interspeech 2019","first-page":"1368","article-title":"WHAM!: Extending speech separation to noisy environments","author":"Wichern","year":"2019"},{"key":"10.1016\/j.specom.2026.103377_b74","doi-asserted-by":"crossref","first-page":"2884","DOI":"10.1109\/TASLP.2024.3389631","article-title":"A large-scale evaluation of speech foundation models","volume":"32","author":"Yang","year":"2024","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b75","series-title":"Interspeech 2021","first-page":"1194","article-title":"SUPERB: Speech processing universal performance benchmark","author":"Yang","year":"2021"},{"key":"10.1016\/j.specom.2026.103377_b76","series-title":"2017 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"241","article-title":"Permutation invariant training of deep models for speaker-independent multi-talker speech separation","author":"Yu","year":"2017"},{"key":"10.1016\/j.specom.2026.103377_b77","doi-asserted-by":"crossref","first-page":"1207","DOI":"10.1587\/transinf.2017EDL8225","article-title":"A deep learning-based approach to non-intrusive objective speech intelligibility estimation","volume":"101","author":"Yun","year":"2018","journal-title":"IEICE Transactions Inf. Syst."},{"key":"10.1016\/j.specom.2026.103377_b78","series-title":"2024 IEEE International Conference on Multimedia and Expo","first-page":"1","article-title":"A study on incorporating Whisper for robust speech assessment","author":"Zezario","year":"2024"},{"key":"10.1016\/j.specom.2026.103377_b79","doi-asserted-by":"crossref","first-page":"54","DOI":"10.1109\/TASLP.2022.3205757","article-title":"Deep learning-based non-intrusive multi-objective speech assessment model with cross-domain features","volume":"31","author":"Zezario","year":"2022","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103377_b80","series-title":"Asia-Pacific Signal and Information Processing Association Annual Summit and Conference","first-page":"482","article-title":"STOI-Net: A deep learning based non-intrusive speech intelligibility assessment model","author":"Zezario","year":"2020"},{"key":"10.1016\/j.specom.2026.103377_b81","series-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","first-page":"3901","article-title":"Paragraph-level neural question generation with maxout pointer and gated self-attention networks","author":"Zhao","year":"2018"}],"container-title":["Speech Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639326000257?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639326000257?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T14:12:31Z","timestamp":1774707151000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167639326000257"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":81,"alternative-id":["S0167639326000257"],"URL":"https:\/\/doi.org\/10.1016\/j.specom.2026.103377","relation":{},"ISSN":["0167-6393"],"issn-type":[{"value":"0167-6393","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"TSIP-Net: No-reference speech intelligibility prediction in the presence of competing speech","name":"articletitle","label":"Article Title"},{"value":"Speech Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.specom.2026.103377","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103377"}}