{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T23:34:43Z","timestamp":1772580883563,"version":"3.50.1"},"reference-count":31,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2022,1,13]],"date-time":"2022-01-13T00:00:00Z","timestamp":1642032000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,13]],"date-time":"2022-01-13T00:00:00Z","timestamp":1642032000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62071302"],"award-info":[{"award-number":["62071302"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2022,3]]},"DOI":"10.1007\/s10772-022-09959-8","type":"journal-article","created":{"date-parts":[[2022,1,13]],"date-time":"2022-01-13T07:09:13Z","timestamp":1642057753000},"page":"261-268","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Exploring single channel speech separation for short-time text-dependent speaker verification"],"prefix":"10.1007","volume":"25","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5390-8520","authenticated-orcid":false,"given":"Jiangyu","family":"Han","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yan","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0924-408X","authenticated-orcid":false,"given":"Yanhua","family":"Long","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiaen","family":"Liang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,1,13]]},"reference":[{"key":"9959_CR1","doi-asserted-by":"crossref","unstructured":"Bahmaninezhad, F., Wu, J., Gu, R., Zhang, S.-X., Xu, Y., Yu, M., & Yu, D. (2019). A comprehensive study of speech separation: Spectrogram vs waveform separation. In Proc. Interspeech, (pp. 4574\u20134578).","DOI":"10.21437\/Interspeech.2019-3181"},{"issue":"16","key":"9959_CR2","first-page":"359","volume":"10","author":"DJ Berndt","year":"1994","unstructured":"Berndt, D. J., & Clifford, J. (1994). Using dynamic time warping to find patterns in time series. in Proc.\u00a0KDD Workshop, 10(16), 359\u2013370.","journal-title":"KDD Workshop"},{"key":"9959_CR3","doi-asserted-by":"crossref","unstructured":"Bu, H., Du, J., Na, X., Wu, B., & Zheng, H. (2017). Aishell-1: An open-source mandarin speech corpus and a speech recognition baseline. In Proc. Oriental COCOSDA, (pp. 1\u20135).","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"9959_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Z., Luo, Y., & Mesgarani, N. (2017). Deep attractor network for single-microphone speaker separation. In Proc. ICASSP, (pp. 246\u2013250).","DOI":"10.1109\/ICASSP.2017.7952155"},{"key":"9959_CR5","doi-asserted-by":"crossref","unstructured":"Delcroix, M., Ochiai, T., Zmolikova, K., Kinoshita, K., Tawara, N., Nakatani, T., & Araki, S. (2020). Improving speaker discrimination of target speech extraction with time-domain speakerbeam. In Proc. ICASSP, (pp. 691\u2013695).","DOI":"10.1109\/ICASSP40776.2020.9054683"},{"key":"9959_CR6","doi-asserted-by":"crossref","unstructured":"Delcroix, M., Watanabe, S., Ochiai, T., & Kinoshita, K., et al. (2019). End-to-end SpeakerBeam for single channel target speech recognition. In Proc. Interspeech, (pp. 451\u2013455).","DOI":"10.21437\/Interspeech.2019-1856"},{"key":"9959_CR7","doi-asserted-by":"crossref","unstructured":"Ge, M., Xu, C., Wang, L., Chng, E.S., Dang, J., & Li, H. (2020). Spex+: A complete time domain speaker extraction network. In Proc. Interspeech, (pp. 1406\u20131410).","DOI":"10.21437\/Interspeech.2020-1397"},{"key":"9959_CR8","doi-asserted-by":"crossref","unstructured":"Han, J., Long, Y., & Liang, J. (2020). Attention-based scaling adaptation for target speech extraction. arXiv:2010.10923.","DOI":"10.1109\/ASRU51503.2021.9687903"},{"key":"9959_CR9","doi-asserted-by":"crossref","unstructured":"Han, J., Zhou, X., Long, Y., & Li, Y. (2021). Multi-channel target speech extraction with channel decorrelation and target speaker adaptation. In Proc. ICASSP.","DOI":"10.1109\/ICASSP39728.2021.9414244"},{"key":"9959_CR10","first-page":"545","volume":"73","author":"Y Isik","year":"2016","unstructured":"Isik, Y., Roux, J. L., Chen, Z., Watanabe, S., & Hershey, J. R. (2016). Single-channel multi-speaker separation using deep clustering. In Proc.\u00a0Interspeech,\u00a0 (pp. 545\u2013549).","journal-title":"Interspeech"},{"issue":"6","key":"9959_CR11","doi-asserted-by":"publisher","first-page":"168","DOI":"10.1109\/LSP.2003.811630","volume":"10","author":"G-J Jang","year":"2003","unstructured":"Jang, G.-J., Lee, T.-W., & Oh, Y.-H. (2003). Single-channel signal separation using time-domain basis functions. IEEE Signal Processing Letters, 10(6), 168\u2013171.","journal-title":"IEEE Signal Processing Letters"},{"key":"9959_CR12","doi-asserted-by":"crossref","unstructured":"Kanda, N., Boeddeker, C., Heitkaemper, J., & Fujita, Y., et al. (2019). Guided source separation meets a strong ASR Backend: Hitachi\/Paderborn University joint investigation for dinner party ASR. In Proc. Interspeech, (pp. 1248\u20131251).","DOI":"10.21437\/Interspeech.2019-1167"},{"issue":"10","key":"9959_CR13","doi-asserted-by":"publisher","first-page":"1901","DOI":"10.1109\/TASLP.2017.2726762","volume":"25","author":"M Kolb\u00e6k","year":"2017","unstructured":"Kolb\u00e6k, M., Yu, D., Tan, Z.-H., & Jensen, J. (2017). Multitalker speech separation with utterance-level permutation invariant training of deep recurrent neural networks. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (TASLP), 25(10), 1901\u20131913.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing (TASLP)"},{"key":"9959_CR14","doi-asserted-by":"crossref","unstructured":"Luo, Y., Chen, Z., & Yoshioka, T. (2020). Dual-path RNN: efficient long sequence modeling for time-domain single-channel speech separation. In proc. ICASSP, (pp. 46\u201350).","DOI":"10.1109\/ICASSP40776.2020.9054266"},{"key":"9959_CR15","doi-asserted-by":"crossref","unstructured":"Luo, Y., & Mesgarani, N. (2018). TasNet: Time-domain audio separation network for real-time, single-channel speech separation. In Proc. ICASSP, (pp. 696\u2013700).","DOI":"10.1109\/ICASSP.2018.8462116"},{"issue":"8","key":"9959_CR16","doi-asserted-by":"publisher","first-page":"1256","DOI":"10.1109\/TASLP.2019.2915167","volume":"27","author":"Y Luo","year":"2019","unstructured":"Luo, Y., & Mesgarani, N. (2019). Conv-tasnet: Surpassing ideal time-frequency magnitude masking for speech separation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 27(8), 1256\u20131266.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"9959_CR17","doi-asserted-by":"crossref","unstructured":"Ochiai, T., Delcroix, M., Kinoshita, K., Ogawa, A., & Nakatani, T. (2019). Multimodal SpeakerBeam: Single channel target speech extraction with audio-visual speaker clues. In Proc. Interspeech, (pp. 2718\u20132722).","DOI":"10.21437\/Interspeech.2019-1513"},{"key":"9959_CR18","doi-asserted-by":"crossref","unstructured":"Qin, X., Bu, H., & Li, M. (2019). HI-MIA: A far-field text-dependent speaker verification database and the baselines. arXiv:1912.01231.","DOI":"10.1109\/ICASSP40776.2020.9054423"},{"key":"9959_CR19","doi-asserted-by":"crossref","unstructured":"Rao, W., Xu, C., Chng, E.S., & Li, H. (2019). Target speaker extraction for multi-talker speaker verification. In Proc. Interspeech, (pp. 1273\u20131277).","DOI":"10.21437\/Interspeech.2019-1410"},{"issue":"5","key":"9959_CR20","doi-asserted-by":"publisher","first-page":"561","DOI":"10.3233\/IDA-2007-11508","volume":"11","author":"S Salvador","year":"2007","unstructured":"Salvador, S., & Chan, P. (2007). Toward accurate dynamic time warping in linear time and space. Intelligent Data Analysis, 11(5), 561\u2013580.","journal-title":"Intelligent Data Analysis"},{"key":"9959_CR21","doi-asserted-by":"crossref","unstructured":"Schmidt, M.N., & Olsson, R.K. (2006). Single-channel speech separation using sparse non-negative matrix factorization. In Proc. Ninth International Conference on Spoken Language Processing.","DOI":"10.21437\/Interspeech.2006-655"},{"key":"9959_CR22","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Povey, D., & Khudanpu, S. (2017). Deep neural network embeddings for text-independent speaker verification. In Proc. Interspeech, (pp. 999\u20131003).","DOI":"10.21437\/Interspeech.2017-620"},{"key":"9959_CR23","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., & Khudanpur, S. (2018). X-Vectors: Robust DNN embeddings for speaker recognition. In Proc. ICASSP, (pp. 5329\u20135333).","DOI":"10.1109\/ICASSP.2018.8461375"},{"issue":"2","key":"9959_CR24","doi-asserted-by":"publisher","first-page":"242","DOI":"10.1109\/TASL.2010.2047419","volume":"19","author":"M Stark","year":"2011","unstructured":"Stark, M., Wohlmayr, M., & Pernkopf, F. (2011). Source-filter-based single-channel speech separation using pitch information. IEEE Transactions on Audio, Speech and Language Processing, 19(2), 242\u2013255.","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"issue":"4","key":"9959_CR25","doi-asserted-by":"publisher","first-page":"1462","DOI":"10.1109\/TSA.2005.858005","volume":"14","author":"E Vincent","year":"2006","unstructured":"Vincent, E., Gribonval, R., & F\u00e9votte, C. (2006). Performance measurement in blind audio source separation. IEEE Transactions on Audio, Speech and Language Processing, 14(4), 1462\u20131469.","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"key":"9959_CR26","doi-asserted-by":"crossref","unstructured":"Wang, S., Naithani, G., & Virtanen, T. (2019). Low-latency deep clustering for speech separation. In Proc. ICASSP, (pp. 76\u201380).","DOI":"10.1109\/ICASSP.2019.8683437"},{"key":"9959_CR27","doi-asserted-by":"crossref","unstructured":"Wang, Q., Muckenhirn, H., Wilson, K., & Sridhar, P., et al. (2019). VoiceFilter: Targeted voice separation by speaker-conditioned spectrogram masking. In Proc. Interspeech, (pp. 2728\u20132732).","DOI":"10.21437\/Interspeech.2019-1101"},{"key":"9959_CR28","doi-asserted-by":"crossref","unstructured":"Wu, J., Xu, Y., Zhang, S.-X., Chen, L.-W., Yu, M., Xie, L., & Yu, D. (2019). Time domain audio visual speech separation. In Proc. ASRU, (pp. 667\u2013673).","DOI":"10.1109\/ASRU46091.2019.9003983"},{"key":"9959_CR29","doi-asserted-by":"crossref","unstructured":"Xiao, X., Chen, Z., Yoshioka, T., Erdogan, H., & Liu, C. et al. (2019). Single-channel speech extraction using speaker inventory and attention network. In Proc. ICASSP, (pp. 86\u201390).","DOI":"10.1109\/ICASSP.2019.8682245"},{"key":"9959_CR30","doi-asserted-by":"crossref","unstructured":"Yousefi, M., Khorram, S., & Hansen, J. (2019). Probabilistic permutation invariant training for speech separation. In Proc. Interspeech, (pp. 4604\u20134608).","DOI":"10.21437\/Interspeech.2019-1827"},{"issue":"5","key":"9959_CR31","doi-asserted-by":"publisher","first-page":"967","DOI":"10.1109\/TASLP.2016.2536478","volume":"24","author":"X-L Zhang","year":"2016","unstructured":"Zhang, X.-L., & Wang, D. (2016). A deep ensemble learning method for monaural speech separation. IEEE\/ACM Transactions on Audio, Speech and Language Processing (TASLP), 24(5), 967\u2013977.","journal-title":"IEEE\/ACM Transactions on Audio, Speech and Language Processing (TASLP)"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-022-09959-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-022-09959-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-022-09959-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,3,15]],"date-time":"2022-03-15T18:41:06Z","timestamp":1647369666000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-022-09959-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,13]]},"references-count":31,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2022,3]]}},"alternative-id":["9959"],"URL":"https:\/\/doi.org\/10.1007\/s10772-022-09959-8","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,1,13]]},"assertion":[{"value":"31 January 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 December 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 January 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}