{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T02:42:12Z","timestamp":1776048132408,"version":"3.50.1"},"reference-count":51,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100010040","name":"Taishan Scholar Project of Shandong Province","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100010040","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010029","name":"Taishan Scholar Foundation of Shandong Province","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100010029","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171263"],"award-info":[{"award-number":["62171263"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Speech Communication"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.specom.2026.103378","type":"journal-article","created":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T16:23:11Z","timestamp":1773332591000},"page":"103378","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["AVFSNet: Audio-visual speech separation for flexible number of speakers with multi-scale and multi-task learning"],"prefix":"10.1016","volume":"179","author":[{"given":"Daning","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuanjie","family":"Deng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ying","family":"Wei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bing","family":"Ji","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"issue":"12","key":"10.1016\/j.specom.2026.103378_b1","doi-asserted-by":"crossref","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","article-title":"Deep audio-visual speech recognition","volume":"44","author":"Afouras","year":"2018","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.specom.2026.103378_b2","article-title":"Used: Universal speaker extraction and diarization","author":"Ao","year":"2024","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"1","key":"10.1016\/j.specom.2026.103378_b3","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1109\/TMM.2019.2925956","article-title":"Generative model driven representation learning in a hybrid framework for environmental audio scene and sound event recognition","volume":"22","author":"Chandrakala","year":"2019","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.specom.2026.103378_b4","doi-asserted-by":"crossref","unstructured":"Chen, J., Mao, Q., Liu, D., 2020. Dual-path transformer network: Direct context-aware modeling for end-to-end monaural speech separation. In: Proc. Interspeech 2020. pp. 2642\u20132646.","DOI":"10.21437\/Interspeech.2020-2205"},{"key":"10.1016\/j.specom.2026.103378_b5","series-title":"ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"7857","article-title":"Fullsubnet+: Channel attention fullsubnet with complex spectrograms for speech enhancement","author":"Chen","year":"2022"},{"key":"10.1016\/j.specom.2026.103378_b6","series-title":"INTERSPEECH","first-page":"5393","article-title":"Speech separation for an unknown number of speakers using transformers with encoder\u2013decoder attractors","author":"Chetupalli","year":"2022"},{"key":"10.1016\/j.specom.2026.103378_b7","doi-asserted-by":"crossref","first-page":"1681","DOI":"10.1109\/TASLP.2023.3268572","article-title":"Speaker counting and separation from single-channel noisy mixtures","volume":"31","author":"Chetupalli","year":"2023","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103378_b8","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A., 2018. Voxceleb2: Deep speaker recognition. In: Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH. Vol. 2018, pp. 1086\u20131090.","DOI":"10.21437\/Interspeech.2018-1929"},{"issue":"4","key":"10.1016\/j.specom.2026.103378_b9","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3197517.3201357","article-title":"Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation","volume":"37","author":"Ephrat","year":"2018","journal-title":"ACM Trans. Graph."},{"key":"10.1016\/j.specom.2026.103378_b10","series-title":"2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15490","article-title":"Visualvoice: Audio-visual speech separation with cross-modal consistency","author":"Gao","year":"2021"},{"key":"10.1016\/j.specom.2026.103378_b11","doi-asserted-by":"crossref","unstructured":"Gulati, A., Qin, J., Chiu, C.-C., Parmar, N., Zhang, Y., Yu, J., Han, W., Wang, S., Zhang, Z., Wu, Y., et al., 2020. Conformer: Convolution-augmented transformer for speech recognition. In: Proc. Interspeech 2020. pp. 5036\u20135040.","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"10.1016\/j.specom.2026.103378_b12","series-title":"2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"31","article-title":"Deep clustering: Discriminative embeddings for segmentation and separation","author":"Hershey","year":"2016"},{"key":"10.1016\/j.specom.2026.103378_b13","series-title":"21st Annual Conference of the International Speech Communication Association, INTERSPEECH 2020","first-page":"269","article-title":"End-to-end speaker diarization for an unknown number of speakers with encoder\u2013decoder based attractors","author":"Horiguchi","year":"2020"},{"key":"10.1016\/j.specom.2026.103378_b14","doi-asserted-by":"crossref","unstructured":"Kendall, A., Gal, Y., Cipolla, R., 2018. Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 7482\u20137491.","DOI":"10.1109\/CVPR.2018.00781"},{"key":"10.1016\/j.specom.2026.103378_b15","series-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"5064","article-title":"Listening to each speaker one by one with recurrent selective hearing networks","author":"Kinoshita","year":"2018"},{"key":"10.1016\/j.specom.2026.103378_b16","series-title":"2021 IEEE Spoken Language Technology Workshop","first-page":"801","article-title":"Effective low-cost time-domain audio separation using globally attentive locally recurrent networks","author":"Lam","year":"2021"},{"key":"10.1016\/j.specom.2026.103378_b17","doi-asserted-by":"crossref","unstructured":"Lee, J., Chung, S.-W., Kim, S., Kang, H.-G., Sohn, K., 2021. Looking into your speech: Learning cross-modal affinity for audio-visual speech separation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1336\u20131345.","DOI":"10.1109\/CVPR46437.2021.00139"},{"key":"10.1016\/j.specom.2026.103378_b18","doi-asserted-by":"crossref","DOI":"10.1016\/j.specom.2025.103229","article-title":"Mffn: Multi-level feature fusion network for monaural speech separation","volume":"171","author":"Lei","year":"2025","journal-title":"Speech Commun."},{"issue":"10","key":"10.1016\/j.specom.2026.103378_b19","doi-asserted-by":"crossref","first-page":"3623","DOI":"10.1093\/cercor\/bhx235","article-title":"The effects of audiovisual inputs on solving the cocktail party problem in the human brain: An fmri study","volume":"28","author":"Li","year":"2018","journal-title":"Cerebral Cortex"},{"key":"10.1016\/j.specom.2026.103378_b20","article-title":"An audio-visual speech separation model inspired by cortico-thalamo-cortical circuits","author":"Li","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.specom.2026.103378_b21","unstructured":"Li, K., Yang, R., Sun, F., Hu, X., 2024b. Iianet: An intra-and inter-modality attention network for audio-visual speech separation. In: International Conference on Machine Learning, PMLR. pp. 29181\u201329200."},{"key":"10.1016\/j.specom.2026.103378_b22","series-title":"ICASSP 2023\u20132023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"Av-sepformer: Cross-attention sepformer for audio-visual target speaker extraction","author":"Lin","year":"2023"},{"key":"10.1016\/j.specom.2026.103378_b23","series-title":"2020 IEEE International Conference on Robotics and Automation","first-page":"1588","article-title":"Self-supervised learning for alignment of objects and sound","author":"Liu","year":"2020"},{"key":"10.1016\/j.specom.2026.103378_b24","series-title":"2024 IEEE International Symposium on Circuits and Systems","first-page":"1","article-title":"Multi-kernel attention encoder for time-domain speech separation","author":"Liu","year":"2024"},{"issue":"12","key":"10.1016\/j.specom.2026.103378_b25","doi-asserted-by":"crossref","first-page":"2092","DOI":"10.1109\/TASLP.2019.2941148","article-title":"Divide and conquer: A deep casa approach to talker-independent monaural speaker separation","volume":"27","author":"Liu","year":"2019","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103378_b26","series-title":"ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"46","article-title":"Dual-path rnn: efficient long sequence modeling for time-domain single-channel speech separation","author":"Luo","year":"2020"},{"issue":"8","key":"10.1016\/j.specom.2026.103378_b27","doi-asserted-by":"crossref","first-page":"1256","DOI":"10.1109\/TASLP.2019.2915167","article-title":"Conv-tasnet: Surpassing ideal time\u2013frequency magnitude masking for speech separation","volume":"27","author":"Luo","year":"2019","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103378_b28","doi-asserted-by":"crossref","unstructured":"Ma, P., Wang, Y., Shen, J., Petridis, S., Pantic, M., 2021. Lip-reading with densely connected temporal convolutional networks. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 2857\u20132866.","DOI":"10.1109\/WACV48630.2021.00290"},{"key":"10.1016\/j.specom.2026.103378_b29","series-title":"2022 IEEE Spoken Language Technology Workshop","first-page":"480","article-title":"Eend-ss: Joint end-to-end neural speaker diarization and speech separation for flexible number of speakers","author":"Maiti","year":"2023"},{"key":"10.1016\/j.specom.2026.103378_b30","series-title":"INTERSPEECH","article-title":"Audio-visual speech separation in noisy environments with a lightweight iterative model","author":"Martel","year":"2023"},{"key":"10.1016\/j.specom.2026.103378_b31","unstructured":"Nachmani, E., Adi, Y., Wolf, L., 2020. Voice separation with an unknown number of multiple speakers. In: International Conference on Machine Learning, PMLR. pp. 7164\u20137175."},{"key":"10.1016\/j.specom.2026.103378_b32","doi-asserted-by":"crossref","first-page":"3032","DOI":"10.1109\/TASLP.2022.3205759","article-title":"Usev: Universal speaker extraction with visual cue","volume":"30","author":"Pan","year":"2022","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103378_b33","series-title":"ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"6678","article-title":"Muse: Multi-modal target speaker extraction with visual cues","author":"Pan","year":"2021"},{"key":"10.1016\/j.specom.2026.103378_b34","unstructured":"Peng, Y., Dalmia, S., Lane, I., Watanabe, S., 2022. Branchformer: Parallel mlp-attention architectures to capture local and global context for speech recognition and understanding. In: International Conference on Machine Learning, PMLR. pp. 17627\u201317643."},{"key":"10.1016\/j.specom.2026.103378_b35","series-title":"2001 IEEE international conference on acoustics, speech, and signal processing. Proceedings (Cat. No. 01CH37221)","first-page":"749","article-title":"Perceptual evaluation of speech quality (pesq)-a new method for speech quality assessment of telephone networks and codecs","volume":"Vol. 2","author":"Rix","year":"2001"},{"key":"10.1016\/j.specom.2026.103378_b36","series-title":"ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"626","article-title":"Sdr\u2013half-baked or well done?","author":"Roux","year":"2019"},{"key":"10.1016\/j.specom.2026.103378_b37","series-title":"Interspeech 2025 urgent speech enhancement challenge","author":"Saijo","year":"2025"},{"key":"10.1016\/j.specom.2026.103378_b38","first-page":"3735","article-title":"Sequence to multi-sequence learning via conditional chain mapping for mixture signals","volume":"33","author":"Shi","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.specom.2026.103378_b39","series-title":"Combining residual networks with lstms for lipreading","author":"Stafylakis","year":"2017"},{"key":"10.1016\/j.specom.2026.103378_b40","doi-asserted-by":"crossref","first-page":"451","DOI":"10.3389\/fnins.2019.00451","article-title":"Limits of perceived audio-visual spatial coherence as defined by reaction time measurements","volume":"13","author":"Stenzel","year":"2019","journal-title":"Front. Neurosci."},{"issue":"2","key":"10.1016\/j.specom.2026.103378_b41","doi-asserted-by":"crossref","first-page":"268","DOI":"10.1109\/TASLP.2018.2877892","article-title":"Countnet: Estimating the number of concurrent speakers using supervised learning","volume":"27","author":"St\u00f6ter","year":"2018","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103378_b42","series-title":"ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"21","article-title":"Attention is all you need in speech separation","author":"Subakan","year":"2021"},{"key":"10.1016\/j.specom.2026.103378_b43","series-title":"ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"6862","article-title":"Real-m: Towards speech separation on real mixtures","author":"Subakan","year":"2022"},{"key":"10.1016\/j.specom.2026.103378_b44","series-title":"2010 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4214","article-title":"A short-time objective intelligibility measure for time-frequency weighted noisy speech","author":"Taal","year":"2010"},{"key":"10.1016\/j.specom.2026.103378_b45","doi-asserted-by":"crossref","unstructured":"Takahashi, N., Parthasaarathy, S., Goswami, N., Mitsufuji, Y., 2019. Recursive speech separation for unknown number of speakers. In: Proc. Interspeech 2019. pp. 1348\u20131352.","DOI":"10.21437\/Interspeech.2019-1550"},{"key":"10.1016\/j.specom.2026.103378_b46","doi-asserted-by":"crossref","first-page":"26","DOI":"10.1109\/LSP.2020.3043977","article-title":"Sagrnn: Self-attentive gated rnn for binaural speaker separation with interaural cue preservation","volume":"28","author":"Tan","year":"2020","journal-title":"IEEE Signal Process. Lett."},{"key":"10.1016\/j.specom.2026.103378_b47","doi-asserted-by":"crossref","DOI":"10.1109\/TASLPRO.2025.3527766","article-title":"Audio-visual target speaker extraction with selective auditory attention","author":"Tao","year":"2025","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2026.103378_b48","series-title":"2023 2nd International Conference on Automation, Computing and Renewable Systems (ICACRS)","first-page":"1188","article-title":"Face and object detection algorithms for people counting applications","author":"Vasantha","year":"2023"},{"issue":"10","key":"10.1016\/j.specom.2026.103378_b49","doi-asserted-by":"crossref","first-page":"1702","DOI":"10.1109\/TASLP.2018.2842159","article-title":"Supervised speech separation based on deep learning: An overview","volume":"26","author":"Wang","year":"2018","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103378_b50","series-title":"2019 IEEE Automatic Speech Recognition and Understanding Workshop","first-page":"667","article-title":"Time domain audio visual speech separation","author":"Wu","year":"2019"},{"key":"10.1016\/j.specom.2026.103378_b51","series-title":"ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"3420","article-title":"Multi-decoder dprnn: Source separation for variable number of speakers","author":"Zhu","year":"2021"}],"container-title":["Speech Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639326000269?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639326000269?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T01:56:47Z","timestamp":1776045407000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167639326000269"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":51,"alternative-id":["S0167639326000269"],"URL":"https:\/\/doi.org\/10.1016\/j.specom.2026.103378","relation":{},"ISSN":["0167-6393"],"issn-type":[{"value":"0167-6393","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"AVFSNet: Audio-visual speech separation for flexible number of speakers with multi-scale and multi-task learning","name":"articletitle","label":"Article Title"},{"value":"Speech Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.specom.2026.103378","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103378"}}