{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T13:30:52Z","timestamp":1764250252792,"version":"build-2065373602"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"22","license":[{"start":{"date-parts":[[2023,8,25]],"date-time":"2023-08-25T00:00:00Z","timestamp":1692921600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,8,25]],"date-time":"2023-08-25T00:00:00Z","timestamp":1692921600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2023,11]]},"DOI":"10.1007\/s10489-023-04953-2","type":"journal-article","created":{"date-parts":[[2023,8,25]],"date-time":"2023-08-25T07:02:08Z","timestamp":1692946928000},"page":"26497-26517","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["TMS: Temporal multi-scale in time-delay neural network for speaker verification"],"prefix":"10.1007","volume":"53","author":[{"given":"Ruiteng","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Jianguo","family":"Wei","sequence":"additional","affiliation":[]},{"given":"Xugang","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Wenhuan","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Di","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Lin","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Junhai","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Jianwu","family":"Dang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,25]]},"reference":[{"key":"4953_CR1","doi-asserted-by":"crossref","unstructured":"Mittal A, Dua M (2022) Automatic speaker verification systems and spoof detection techniques: review and analysis. International Journal of Speech Technology, 1\u201330","DOI":"10.1007\/s10772-021-09876-2"},{"key":"4953_CR2","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1016\/j.neucom.2020.06.045","volume":"410","author":"J Xu","year":"2020","unstructured":"Xu J, Wang X, Feng B, Liu W (2020) Deep multi-metric learning for text-independent speaker verification. Neurocomputing 410:394\u2013400","journal-title":"Neurocomputing"},{"issue":"5","key":"4953_CR3","doi-asserted-by":"publisher","first-page":"1557","DOI":"10.1109\/TASL.2006.878256","volume":"14","author":"SE Tranter","year":"2006","unstructured":"Tranter SE, Reynolds DA (2006) An overview of automatic speaker diarization systems. IEEE Trans Audio Speech Lang Process 14(5):1557\u20131565","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"4953_CR4","doi-asserted-by":"publisher","first-page":"2645","DOI":"10.1109\/TASLP.2022.3196178","volume":"30","author":"W Wang","year":"2022","unstructured":"Wang W, Lin Q, Cai D, Li M (2022) Similarity measurement of segment-level speaker embeddings in speaker diarization. IEEE\/ACM Trans Audio Speech Lang Process 30:2645\u20132658","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"4953_CR5","doi-asserted-by":"crossref","unstructured":"Snyder D, Garcia-Romero D, Povey D, Khudanpur S (2017) Deep neural network embeddings for text-independent speaker verification. In: Proceedings interspeech, pp 999\u20131003","DOI":"10.21437\/Interspeech.2017-620"},{"key":"4953_CR6","doi-asserted-by":"publisher","first-page":"1243","DOI":"10.1109\/TASLP.2021.3065202","volume":"29","author":"X Chen","year":"2021","unstructured":"Chen X, Bao C (2021) Phoneme-unit-specific time-delay neural network for speaker verification. IEEE\/ACM Trans Audio Speech Lang Process 29:1243\u20131255. https:\/\/doi.org\/10.1109\/TASLP.2021.3065202","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"issue":"3","key":"4953_CR7","doi-asserted-by":"publisher","first-page":"328","DOI":"10.1109\/29.21701","volume":"37","author":"A Waibel","year":"1989","unstructured":"Waibel A, Hanazawa T, Hinton G, Shikano K, Lang KJ (1989) Phoneme recognition using time-delay neural networks. IEEE Trans Acoustics Speech Signal Process 37(3):328\u2013339","journal-title":"IEEE Trans Acoustics Speech Signal Process"},{"key":"4953_CR8","doi-asserted-by":"publisher","first-page":"1243","DOI":"10.1109\/TASLP.2021.3065202","volume":"29","author":"X Chen","year":"2021","unstructured":"Chen X, Bao C (2021) Phoneme-unit-specific time-delay neural network for speaker verification. IEEE\/ACM Trans Audio Speech Lang Process 29:1243\u20131255","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"4953_CR9","doi-asserted-by":"crossref","unstructured":"Snyder D, Garcia-Romero D, Sell G, McCree A, Povey D, Khudanpur S (2019) Speaker recognition for multi-speaker conversations using x-vectors. In: Proceedings ICASSP, pp 5796\u20135800","DOI":"10.1109\/ICASSP.2019.8683760"},{"key":"4953_CR10","doi-asserted-by":"crossref","unstructured":"Povey D, Cheng G, Wang Y, Li K, Xu H, Yarmohammadi M, Khudanpur S (2018) Semi-orthogonal low-rank matrix factorization for deep neural networks. In: Interspeech, pp 3743\u20133747","DOI":"10.21437\/Interspeech.2018-1417"},{"key":"4953_CR11","doi-asserted-by":"publisher","first-page":"1000","DOI":"10.1109\/TASLP.2023.3244502","volume":"31","author":"Y Zhu","year":"2023","unstructured":"Zhu Y, Mak B (2023) Bayesian Self-attentive speaker embeddings for text-independent speaker verification. IEEE\/ACM Trans Audio Speech Lang Process 31:1000\u20131012","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"4953_CR12","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1016\/j.specom.2022.09.003","volume":"144","author":"H Zhu","year":"2022","unstructured":"Zhu H, Lee KA, Li H (2022) Discriminative speaker embedding with serialized multi-layer multi-head attention. Speech Commun 144:89\u2013100","journal-title":"Speech Commun"},{"key":"4953_CR13","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1016\/j.neucom.2020.06.079","volume":"412","author":"Y Wu","year":"2020","unstructured":"Wu Y, Guo C, Gao H, Xu J, Bai G (2020) Dilated residual networks with multi-level attention for speaker verification. Neurocomputing 412:177\u2013186","journal-title":"Neurocomputing"},{"key":"4953_CR14","doi-asserted-by":"publisher","first-page":"643","DOI":"10.1109\/TASLP.2022.3231709","volume":"31","author":"B Gu","year":"2023","unstructured":"Gu B, Guo W, Zhang J (2023) Memory storable network based feature aggregation for speaker representation learning. IEEE\/ACM Trans Audio Speech Lang Process 31:643\u2013655","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"4953_CR15","doi-asserted-by":"crossref","unstructured":"Zhang R, Wei J, Lu W, Wang L, Liu M, Zhang L, Jin J, Xu J (2020) Aret: Aggregated residual extended time-delay neural networks for speaker verification. In: Proceedings interspeech, pp 946\u2013950","DOI":"10.21437\/Interspeech.2020-1626"},{"key":"4953_CR16","doi-asserted-by":"crossref","unstructured":"Shen H, Y Y, Sun G, Langman R, Han E, Droppo J, Stolcke A (2022) Improving fairness in speaker verification via Group-adapted Fusion Network. In: Proceedings ICASSP, pp 7077\u20137081. IEEE","DOI":"10.1109\/ICASSP43922.2022.9747384"},{"key":"4953_CR17","doi-asserted-by":"crossref","unstructured":"Liu W, Wen Y, Yu Z, Li M, Raj B, Song L (2017) Sphereface: Deep hypersphere embedding for face recognition. In: Proceedings CVPR, pp 212\u2013220","DOI":"10.1109\/CVPR.2017.713"},{"issue":"7","key":"4953_CR18","doi-asserted-by":"publisher","first-page":"926","DOI":"10.1109\/LSP.2018.2822810","volume":"25","author":"F Wang","year":"2018","unstructured":"Wang F, Cheng J, Liu W, Liu H (2018) Additive margin softmax for face verification. IEEE Signal Process Lett 25(7):926\u2013930","journal-title":"IEEE Signal Process Lett"},{"key":"4953_CR19","doi-asserted-by":"crossref","unstructured":"Deng J, Guo J, Xue N, Zafeiriou S (2019) Arcface: Additive angular margin loss for deep face recognition. In: Proceedings CVPR, pp 4690\u20134699","DOI":"10.1109\/CVPR.2019.00482"},{"key":"4953_CR20","unstructured":"Gao S, Cheng M-M, Zhao K, Zhang X-Y, Yang M-H, Torr PH (2019) Res2net: A new multi-scale backbone architecture. IEEE transactions on pattern analysis and machine intelligence"},{"key":"4953_CR21","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781139166621","volume-title":"Principles of Phonetics","author":"J Laver","year":"1994","unstructured":"Laver J (1994) Principles of Phonetics. Cambridge University Press"},{"issue":"1","key":"4953_CR22","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1250\/ast.26.16","volume":"26","author":"T Kitamura","year":"2005","unstructured":"Kitamura T, Honda K, Takemoto H (2005) Individual variation of the hypopharyngeal cavities and its acoustic effects. Acoust Sci Technol 26(1):16\u201326","journal-title":"Acoust Sci Technol"},{"issue":"4","key":"4953_CR23","doi-asserted-by":"publisher","first-page":"2228","DOI":"10.1121\/1.2261270","volume":"120","author":"H Takemoto","year":"2006","unstructured":"Takemoto H, Adachi S, Kitamura T, Mokhtari P, Honda K (2006) Acoustic roles of the laryngeal cavity in vocal tract resonance. J Acoust Soc Am 120(4):2228\u20132238","journal-title":"J Acoust Soc Am"},{"key":"4953_CR24","doi-asserted-by":"publisher","first-page":"101426","DOI":"10.1016\/j.csl.2022.101426","volume":"77","author":"Y Qin","year":"2023","unstructured":"Qin Y, Ren Q, Mao Q, Chen J (2023) Multi-branch feature aggregation based on multiple weighting for speaker verification. Comput Speech Lang 77:101426","journal-title":"Comput Speech Lang"},{"key":"4953_CR25","doi-asserted-by":"crossref","unstructured":"Desplanques B, Thienpondt J, Demuynck K (2020) Ecapa-tdnn: Emphasized channel attention, propagation and aggregation in tdnn based speaker verification. In: Proceedings Interspeech, pp 3830\u20133834","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"4953_CR26","doi-asserted-by":"crossref","unstructured":"Alenin A, Okhotnikov A, Makarov R, Torgashov N, Shigabeev I, Simonchik K (2021) The ID R &D System description for short-duration speaker verification challenge 2021. In: Proceedings interspeech, pp 2297\u20132301","DOI":"10.21437\/Interspeech.2021-1553"},{"key":"4953_CR27","doi-asserted-by":"crossref","unstructured":"Zeinali H, Lee KA, Alam J, Burget L (2020) Sdsv challenge 2020: Large-scale evaluation of short-duration speaker verification. In: Proceedings interspeech, pp 731\u2013735","DOI":"10.21437\/Interspeech.2020-1485"},{"key":"4953_CR28","doi-asserted-by":"crossref","unstructured":"Ding X, Zhang X, Ma N, Han J, Ding G, Sun J (2021) Repvgg: Making vgg-style convnets great again. In: Proceedings CVPR, pp 13733\u201313742","DOI":"10.1109\/CVPR46437.2021.01352"},{"key":"4953_CR29","doi-asserted-by":"crossref","unstructured":"Ma N, Zhang X, Zheng H-T, Sun J (2018) Shufflenet v2: Practical guidelines for efficient cnn architecture design. In: Proceedings ECCV, pp 116\u2013131","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"4953_CR30","unstructured":"Povey D, Ghoshal A, Boulianne G, Burget L, Glembek O, Goel N, Hannemann M, Motlicek P, Qian Y, Schwarz P, et al (2011) The kaldi speech recognition toolkit. In: IEEE 2011 Workshop on automatic speech recognition and understanding. IEEE Signal processing society"},{"key":"4953_CR31","doi-asserted-by":"crossref","unstructured":"Zhang R, Wei J, Lu W, Zhang L, Ji Y, Xu J, Lu X (2022) CS-REP: Making speaker verification networks embracing re-parameterization. In: Proceedings ICASSP, pp 7082\u20137086. IEEE","DOI":"10.1109\/ICASSP43922.2022.9746456"},{"key":"4953_CR32","doi-asserted-by":"crossref","unstructured":"Yu Y-Q, Zheng S, Suo H, Lei Y, Li W-J (2021) Cam: Context-aware masking for robust speaker verification. In: Proceedings ICASSP, pp 6703\u20136707","DOI":"10.1109\/ICASSP39728.2021.9414704"},{"key":"4953_CR33","doi-asserted-by":"crossref","unstructured":"Szegedy C, Ioffe S, Vanhoucke V, Alemi AA (2017) Inception-v4, inception-resnet and the impact of residual connections on learning. In: Proceedings AAAI, pp 4278\u20134284","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"4953_CR34","doi-asserted-by":"publisher","first-page":"101523","DOI":"10.1016\/j.csl.2023.101523","volume":"81","author":"Z Li","year":"2013","unstructured":"Li Z, Xiao R, Chen H, Zhao Z, Wang W, Zhang P (2013) How to make embeddings suitable for PLDA. Comput Speech Lang 81:101523","journal-title":"Comput Speech Lang"},{"key":"4953_CR35","doi-asserted-by":"crossref","unstructured":"Chollet F (2017) Xception: Deep learning with depthwise separable convolutions. In: Proceedings CVPR, pp 1251\u20131258","DOI":"10.1109\/CVPR.2017.195"},{"key":"4953_CR36","unstructured":"Koluguri NR, Li J, Lavrukhin V, Ginsburg B (2020) Speakernet: 1d depth-wise separable convolutional network for text-independent speaker recognition and verification. arXiv:2010.12653"},{"key":"4953_CR37","doi-asserted-by":"crossref","unstructured":"Hu J, Shen L, Sun G (2018) Squeeze-and-excitation networks. In: Proceedings CVPR, pp 7132\u20137141","DOI":"10.1109\/CVPR.2018.00745"},{"key":"4953_CR38","doi-asserted-by":"crossref","unstructured":"Joon Son\u00a0Chung AN, Zisserman A (2018) Voxceleb2: Deep speaker recognition. In: Proceedings interspeech, pp 1086\u20131090","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"4953_CR39","doi-asserted-by":"crossref","unstructured":"Arsha\u00a0Nagrani JSC, Zisserman A (2017) Voxceleb: a large-scale speaker identification dataset. In: Proceedings interspeech, pp 2616\u20132620","DOI":"10.21437\/Interspeech.2017-950"},{"key":"4953_CR40","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1016\/j.specom.2022.01.002","volume":"137","author":"L Li","year":"2022","unstructured":"Li L, Liu R, Kang J, Fan Y, Cui H, Cai Y, Vipperla R, Zheng TF, Wang D (2022) Cn-celeb: multi-genre speaker recognition. Speech Commun 137:77\u201391","journal-title":"Speech Commun"},{"key":"4953_CR41","doi-asserted-by":"crossref","unstructured":"Prince SJ, Elder JH (2007) Probabilistic linear discriminant analysis for inferences about identity. In: Proceedings ICCV, pp 1\u20138","DOI":"10.1109\/ICCV.2007.4409052"},{"key":"4953_CR42","doi-asserted-by":"publisher","first-page":"101027","DOI":"10.1016\/j.csl.2019.101027","volume":"60","author":"A Nagrani","year":"2020","unstructured":"Nagrani A, Chung JS, Xie W, Zisserman A (2020) Voxceleb: Large-scale speaker verification in the wild. Comput Speech Lang 60:101027","journal-title":"Comput Speech Lang"},{"key":"4953_CR43","doi-asserted-by":"crossref","unstructured":"Cumani S, Batzu PD, Colibro D, Vair C, Laface P, Vasilakakis V (2011) Comparison of speaker recognition approaches for real applications. In: Proceedings interspeech, pp 2365\u20132368","DOI":"10.21437\/Interspeech.2011-64"},{"key":"4953_CR44","doi-asserted-by":"crossref","unstructured":"Martin AF, Greenberg CS (2009) Nist 2008 speaker recognition evaluation: Performance across telephone and room microphone channels. In: Proceedings interspeech, pp 2579\u20132582","DOI":"10.21437\/Interspeech.2009-679"},{"key":"4953_CR45","doi-asserted-by":"publisher","first-page":"1079","DOI":"10.1109\/TASLP.2021.3057230","volume":"29","author":"Y Qian","year":"2021","unstructured":"Qian Y, Chen Z, Wang S (2021) Audio-visual deep neural network for robust person verification. IEEE\/ACM Trans Audio Speech Lang Process 29:1079\u20131092","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"4953_CR46","doi-asserted-by":"crossref","unstructured":"Zhou T, Zhao Y, Wu J (2021) Resnext and res2net structures for speaker verification. In: 2021 IEEE Spoken Language Technology Workshop (SLT), pp 301\u2013307. IEEE","DOI":"10.1109\/SLT48900.2021.9383531"},{"key":"4953_CR47","doi-asserted-by":"publisher","first-page":"1330","DOI":"10.1109\/TASLP.2022.3161155","volume":"30","author":"Z Bai","year":"2022","unstructured":"Bai Z, Wang J, Zhang X-L, Chen J (2022) End-to-end speaker verification via curriculum bipartite ranking weighted binary cross-entropy. IEEE\/ACM Trans Audio Speech Lang Process 30:1330\u20131344","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"4953_CR48","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1016\/j.neucom.2022.09.014","volume":"511","author":"Y Wu","year":"2022","unstructured":"Wu Y, Guo C, Zhao J, Jin X, Xu J (2022) RSKNet-MTSP: Effective and portable deep architecture for speaker verification. Neurocomputing 511:259\u2013272","journal-title":"Neurocomputing"},{"key":"4953_CR49","doi-asserted-by":"publisher","first-page":"733","DOI":"10.1109\/TASLP.2020.3039573","volume":"29","author":"Y Cai","year":"2021","unstructured":"Cai Y, Li L, Abel A, Zhu X, Wang D (2021) Deep normalization for speaker vectors. IEEE\/ACM Trans Audio Speech Lang Process 29:733\u2013744. https:\/\/doi.org\/10.1109\/TASLP.2020.3039573","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-023-04953-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-023-04953-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-023-04953-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,25]],"date-time":"2023-10-25T15:09:48Z","timestamp":1698246588000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-023-04953-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,25]]},"references-count":49,"journal-issue":{"issue":"22","published-print":{"date-parts":[[2023,11]]}},"alternative-id":["4953"],"URL":"https:\/\/doi.org\/10.1007\/s10489-023-04953-2","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"type":"print","value":"0924-669X"},{"type":"electronic","value":"1573-7497"}],"subject":[],"published":{"date-parts":[[2023,8,25]]},"assertion":[{"value":"5 August 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 August 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest to this work.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}