{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:22:45Z","timestamp":1775229765997,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T00:00:00Z","timestamp":1715385600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,11]]},"DOI":"10.1145\/3613904.3642057","type":"proceedings-article","created":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T08:38:06Z","timestamp":1715416686000},"page":"1-16","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":21,"title":["Look Once to Hear: Target Speech Hearing with Noisy Examples"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5086-9092","authenticated-orcid":false,"given":"Bandhav","family":"Veluri","sequence":"first","affiliation":[{"name":"Paul G. Allen Center for Computer Science &amp; Engineering, University of Washington, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7249-2641","authenticated-orcid":false,"given":"Malek","family":"Itani","sequence":"additional","affiliation":[{"name":"Paul G. Allen Center for Computer Science and Engineering, University of Washington, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8031-5066","authenticated-orcid":false,"given":"Tuochao","family":"Chen","sequence":"additional","affiliation":[{"name":"Paul G. Allen Center for Computer Science and Engineering, University of Washington, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7791-3545","authenticated-orcid":false,"given":"Takuya","family":"Yoshioka","sequence":"additional","affiliation":[{"name":"AssemblyAI, United States and Microsoft, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9863-3054","authenticated-orcid":false,"given":"Shyamnath","family":"Gollakota","sequence":"additional","affiliation":[{"name":"Paul G. Allen Center for Computer Science and Engineering, University of Washington, United States"}]}],"member":"320","published-online":{"date-parts":[[2024,5,11]]},"reference":[{"key":"e_1_3_3_3_1_1","volume-title":"The Conversation: Deep Audio-Visual Speech Enhancement.","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon\u00a0Son Chung, and Andrew Zisserman. 2018. The Conversation: Deep Audio-Visual Speech Enhancement. (2018). arxiv:cs.CV\/1804.04121"},{"key":"e_1_3_3_3_2_1","doi-asserted-by":"crossref","unstructured":"Triantafyllos Afouras Joon\u00a0Son Chung and Andrew Zisserman. 2019. My lips are concealed: Audio-visual speech enhancement through obstructions. (2019). arxiv:cs.CV\/1907.04975","DOI":"10.21437\/Interspeech.2019-3114"},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"publisher","unstructured":"V.R. Algazi R.O. Duda D.M. Thompson and C. Avendano. 2001. The CIPIC HRTF database. (2001) 99-102\u00a0pages. https:\/\/doi.org\/10.1109\/ASPAA.2001.969552","DOI":"10.1109\/ASPAA.2001.969552"},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"crossref","unstructured":"Winko\u00a0W. An Barbara Shinn-Cunningham Hannes Gamper Dimitra Emmanouilidou David Johnston Mihai Jalobeanu Edward Cutrell Andrew Wilson Kuan-Jung Chiang and Ivan Tashev. 2021. Decoding Music Attention from \u201cEEG Headphones\u201d: A User-Friendly Auditory Brain-Computer Interface. (2021) 985-989\u00a0pages.","DOI":"10.1109\/ICASSP39728.2021.9414492"},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"crossref","unstructured":"Taichi Asami Ryo Masumura Yoshikazu Yamaguchi Hirokazu Masataki and Yushi Aono. 2017. Domain adaptation of DNN acoustic models using knowledge distillation. (2017) 5185-5189\u00a0pages.","DOI":"10.1109\/ICASSP.2017.7953145"},{"key":"e_1_3_3_3_6_1","doi-asserted-by":"publisher","DOI":"10.1080\/10447310802205776"},{"key":"e_1_3_3_3_7_1","unstructured":"Yuanyuan Bao Yanze Xu Na Xu Wenjing Yang Hongfeng Li Shicong Li Yongtao Jia Fei Xiang Jincheng He and Ming Li. 2021. Lightweight Dual-channel Target Speaker Separation for Mobile Voice Communication. (2021). arxiv:cs.SD\/2106.02934"},{"key":"e_1_3_3_3_8_1","volume-title":"Bleichner and Stefan Debener","author":"G.","year":"2017","unstructured":"Martin\u00a0G. Bleichner and Stefan Debener. 2017. Concealed, Unobtrusive Ear-Centered EEG Acquisition: cEEGrids for Transparent EEG. (2017). https:\/\/api.semanticscholar.org\/CorpusID:2859820"},{"key":"e_1_3_3_3_9_1","doi-asserted-by":"crossref","unstructured":"Luca Brayda Federico Traverso Luca Giuliani Francesco Diotalevi Stefania Repetto Sara Sansalone Andrea Trucco and Giulio Sandini. 2015. Spatially selective binaural hearing aids. (2015).","DOI":"10.1145\/2800835.2806207"},{"key":"e_1_3_3_3_10_1","volume-title":"SUS: A quick and dirty usability scale. Usability Eval. Ind. 189 (11","author":"Brooke John","year":"1995","unstructured":"John Brooke. 1995. SUS: A quick and dirty usability scale. Usability Eval. Ind. 189 (11 1995)."},{"key":"e_1_3_3_3_11_1","unstructured":"S\u00e9bastien Bubeck Varun Chandrasekaran Ronen Eldan Johannes Gehrke Eric Horvitz Ece Kamar Peter Lee Yin\u00a0Tat Lee Yuanzhi Li Scott Lundberg Harsha Nori Hamid Palangi Marco\u00a0Tulio Ribeiro and Yi Zhang. 2023. Sparks of Artificial General Intelligence: Early experiments with GPT-4. (2023). arxiv:cs.CL\/2303.12712"},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"crossref","unstructured":"Ishan Chatterjee Maruchi Kim Vivek Jayaram Shyamnath Gollakota Ira Kemelmacher Shwetak Patel and Steven\u00a0M Seitz. 2022. ClearBuds: wireless binaural earbuds for learning-based speech enhancement. (2022).","DOI":"10.1145\/3498361.3538654"},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"crossref","unstructured":"Samuele Cornell Zhong-Qiu Wang Yoshiki Masuyama Shinji Watanabe Manuel Pariente and Nobutaka Ono. 2023. Multi-Channel Target Speaker Extraction with Refinement: The WavLab Submission to the Second Clarity Enhancement Challenge. (2023). arxiv:eess.AS\/2302.07928","DOI":"10.1109\/ICASSP49357.2023.10095961"},{"key":"e_1_3_3_3_14_1","unstructured":"ONNX\u00a0Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/. (2021). Version: x.y.z."},{"key":"e_1_3_3_3_15_1","volume-title":"ICASSP 2023 Deep Noise Suppression Challenge.","author":"Dubey Harishchandra","year":"2023","unstructured":"Harishchandra Dubey, Ashkan Aazami, Vishak Gopal, Babak Naderi, Sebastian Braun, Ross Cutler, Hannes Gamper, Mehrsa Golestaneh, and Robert Aichner. 2023. ICASSP 2023 Deep Noise Suppression Challenge. (2023)."},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","unstructured":"Sefik\u00a0Emre Eskimez Takuya Yoshioka Huaming Wang Xiaofei Wang Zhuo Chen and Xuedong Huang. 2022. Personalized speech enhancement: new models and Comprehensive evaluation. (2022) 356-360\u00a0pages. https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746962","DOI":"10.1109\/ICASSP43922.2022.9746962"},{"key":"e_1_3_3_3_18_1","unstructured":"Meng Ge Chenglin Xu Longbiao Wang Chng\u00a0Eng Siong Jianwu Dang and Haizhou Li. 2020. SpEx+: A Complete Time Domain Speaker Extraction Network. (2020). https:\/\/api.semanticscholar.org\/CorpusID:218581824"},{"key":"e_1_3_3_3_19_1","doi-asserted-by":"crossref","unstructured":"Ritwik Giri Shrikant Venkataramani Jean-Marc Valin Umut Isik and Arvindh Krishnaswamy. 2021. Personalized percepnet: Real-time low-complexity target voice separation and enhancement. (2021).","DOI":"10.21437\/Interspeech.2021-694"},{"key":"e_1_3_3_3_20_1","unstructured":"Steven Goodman Dhruv Jain Jon Froehlich Brock Craft and Leah Findlater. 2019. Social Tensions with Head-Mounted Displays for Accessibility. (2019)."},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/jstsp.2020.2980956"},{"key":"e_1_3_3_3_22_1","doi-asserted-by":"crossref","unstructured":"Cong Han Yi Luo and Nima Mesgarani. 2020. Real-time binaural speech separation with preserved spatial cues. (2020). arXiv:eess.AS\/2002.06637","DOI":"10.1109\/ICASSP40776.2020.9053215"},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","unstructured":"Jiangyu Han Xinyuan Zhou Yanhua Long and Yijie Li. 2021. Multi-Channel Target Speech Extraction with Channel Decorrelation and Target Speaker Adaptation. (2021) 6094-6098\u00a0pages. https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9414244","DOI":"10.1109\/ICASSP39728.2021.9414244"},{"key":"e_1_3_3_3_24_1","unstructured":"Headphonesty. 2022. The Fascinating History of Noise-Cancelling Headphones. https:\/\/www.headphonesty.com\/2020\/10\/history-of-noise-cancelling-headphones\/. (2022)."},{"key":"e_1_3_3_3_25_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. (2015). arxiv:stat.ML\/1503.02531"},{"key":"e_1_3_3_3_26_1","volume-title":"DCCRN: Deep Complex Convolution Recurrent Network for Phase-Aware Speech Enhancement.","author":"Hu Yanxin","year":"2020","unstructured":"Yanxin Hu, Yun Liu, Shubo Lv, Mengtao Xing, Shimin Zhang, Yihui Fu, Jian Wu, Bihong Zhang, and Lei Xie. 2020. DCCRN: Deep Complex Convolution Recurrent Network for Phase-Aware Speech Enhancement. (2020). arxiv:eess.AS\/2008.00264"},{"key":"e_1_3_3_3_27_1","unstructured":"Apple Inc.2023. Apple AirPods. https:\/\/www.apple.com\/airpods\/. (2023)."},{"key":"e_1_3_3_3_28_1","unstructured":"IoSR-Surrey. 2016. IoSR-surrey\/realroombrirs: Binaural impulse responses captured in real rooms.https:\/\/github.com\/IoSR-Surrey\/RealRoomBRIRs. (2016)."},{"key":"e_1_3_3_3_29_1","unstructured":"IoSR-Surrey. 2023. Simulated Room Impulse Responses.https:\/\/iosr.uk\/software\/index.php. (2023)."},{"key":"e_1_3_3_3_30_1","unstructured":"Teerapat Jenrungrot Vivek Jayaram Steve Seitz and Ira Kemelmacher-Shlizerman. 2020. The Cone of Silence: Speech Separation by Localization. (2020)."},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-023-06160-y"},{"key":"e_1_3_3_3_32_1","doi-asserted-by":"crossref","unstructured":"J. Kahn M. Rivi\u00e8re W. Zheng E. Kharitonov Q. Xu P.\u00a0E. Mazar\u00e9 J. Karadayi V. Liptchinsky R. Collobert C. Fuegen T. Likhomanenko G. Synnaeve A. Joulin A. Mohamed and E. Dupoux. 2020. Libri-Light: A Benchmark for ASR with Limited or No Supervision. (2020) 7669-7673\u00a0pages. https:\/\/github.com\/facebookresearch\/libri-light.","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBCAS.2020.3001265"},{"key":"e_1_3_3_3_34_1","doi-asserted-by":"crossref","unstructured":"Nithin\u00a0Rao Koluguri Taejin Park and Boris Ginsburg. 2021. TitaNet: Neural Model for speaker representation with 1D Depth-wise separable convolutions and global context. (2021). arxiv:eess.AS\/2110.04410","DOI":"10.1109\/ICASSP43922.2022.9746806"},{"key":"e_1_3_3_3_35_1","volume-title":"Two decades of array signal processing research: the parametric approach","author":"Krim Hamid","year":"1996","unstructured":"Hamid Krim and Mats Viberg. 1996. Two decades of array signal processing research: the parametric approach. IEEE signal processing magazine 13, 4 (1996), 67\u201394."},{"key":"e_1_3_3_3_36_1","unstructured":"Chenxing Li Jiaming Xu Nima Mesgarani and Bo Xu. 2021. Speaker and Direction Inferred Dual-channel Speech Separation. (2021). arxiv:cs.SD\/2102.04056"},{"key":"e_1_3_3_3_37_1","unstructured":"Guanjun Li Shan Liang Shuai Nie Wenju Liu Meng Yu Lianwu Chen Shouye Peng and Changliang Li. 2019. Direction-Aware Speaker Beam for Multi-Channel Speaker Extraction. (2019). https:\/\/api.semanticscholar.org\/CorpusID:202754432"},{"key":"e_1_3_3_3_38_1","unstructured":"Yen-Ju Lu Xuankai Chang Chenda Li Wangyou Zhang Samuele Cornell Zhaoheng Ni Yoshiki Masuyama Brian Yan Robin Scheibler Zhongqiu Wang Yu Tsao Yanmin Qian and Shinji Watanabe. 2022. ESPnet-SE++: Speech Enhancement for Robust Speech Recognition Translation and Understanding. (2022). https:\/\/api.semanticscholar.org\/CorpusID:250698853"},{"key":"e_1_3_3_3_39_1","doi-asserted-by":"crossref","unstructured":"Yi Luo Zhuo Chen and Takuya Yoshioka. 2020. Dual-path RNN: efficient long sequence modeling for time-domain single-channel speech separation. (2020).","DOI":"10.1109\/ICASSP40776.2020.9054266"},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","unstructured":"Yi Luo and Nima Mesgarani. 2017. TasNet: time-domain audio separation network for real-time single-channel speech separation. (2017). https:\/\/doi.org\/10.48550\/ARXIV.1711.00541","DOI":"10.48550\/ARXIV.1711.00541"},{"key":"e_1_3_3_3_41_1","volume-title":"Conv-tasnet: Surpassing ideal time\u2013frequency magnitude masking for speech separation.","author":"Luo Yi","year":"2019","unstructured":"Yi Luo and Nima Mesgarani. 2019. Conv-tasnet: Surpassing ideal time\u2013frequency magnitude masking for speech separation. (2019)."},{"key":"e_1_3_3_3_42_1","doi-asserted-by":"crossref","unstructured":"Bojana Mirkovic Martin\u00a0G. Bleichner Maarten de Vos and Stefan Debener. 2016. Target Speaker Detection with Concealed EEG Around the Ear. (2016). https:\/\/api.semanticscholar.org\/CorpusID:5261720","DOI":"10.3389\/fnins.2016.00349"},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2019.8682061"},{"key":"e_1_3_3_3_44_1","doi-asserted-by":"crossref","unstructured":"Tsubasa Ochiai Marc Delcroix Keisuke Kinoshita Atsunori Ogawa and Tomohiro Nakatani. 2019. Multimodal SpeakerBeam: Single Channel Target Speech Extraction with Audio-Visual Speaker Clues. (2019). https:\/\/api.semanticscholar.org\/CorpusID:202740792","DOI":"10.21437\/Interspeech.2019-1513"},{"key":"e_1_3_3_3_45_1","doi-asserted-by":"crossref","unstructured":"Andrew Owens and Alexei\u00a0A. Efros. 2018. Audio-Visual Scene Analysis with Self-Supervised Multisensory Features. (2018). arxiv:cs.CV\/1804.03641","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"e_1_3_3_3_46_1","doi-asserted-by":"crossref","unstructured":"Vassil Panayotov Guoguo Chen Daniel Povey and Sanjeev Khudanpur. 2015. Librispeech: an ASR corpus based on public domain audio books. (2015) 5206\u20135210\u00a0pages.","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_3_3_47_1","doi-asserted-by":"crossref","unstructured":"Manuel Pariente Samuele Cornell Joris Cosentino Sunit Sivasankaran Efthymios Tzinis Jens Heitkaemper Michel Olvera Fabian-Robert St\u00f6ter Mathieu Hu Juan\u00a0M. Mart\u00edn-Do\u00f1as David Ditter Ariel Frank Antoine Deleforge and Emmanuel Vincent. 2020. Asteroid: the PyTorch-based audio source separation toolkit for researchers. (2020).","DOI":"10.21437\/Interspeech.2020-1673"},{"key":"e_1_3_3_3_48_1","unstructured":"Se\u00a0Rim Park and Jinwon Lee. 2016. A Fully Convolutional Neural Network for Speech Enhancement. (2016). arXiv:1609.07132http:\/\/arxiv.org\/abs\/1609.07132"},{"key":"e_1_3_3_3_49_1","unstructured":"Resemble-Ai. 2019. Resemble-ai\/resemblyzer: A python package to analyze and compare voices with Deep Learning. (2019). https:\/\/github.com\/resemble-ai\/Resemblyzer"},{"key":"e_1_3_3_3_50_1","volume-title":"INTERSPEECH 2021","author":"Suppression\u00a0Challenge Results Deep Noise","year":"2021","unstructured":"Deep Noise Suppression\u00a0Challenge Results. 2021. INTERSPEECH 2021. https:\/\/www.microsoft.com\/en-us\/research\/academic-program\/deep-noise-suppression-challenge-interspeech-2021\/results\/. (2021)."},{"key":"e_1_3_3_3_51_1","unstructured":"[51] Apple AirPods Max Wireless\u00a0Headphones Review. 2023. https:\/\/www.rtings.com\/headphones\/reviews\/apple\/airpods-max-wireless#page-test-results. (2023)."},{"key":"e_1_3_3_3_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2013.2296173"},{"key":"e_1_3_3_3_53_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2266"},{"key":"e_1_3_3_3_54_1","volume-title":"SDR - half-baked or well done?","author":"Roux Jonathan\u00a0Le","year":"2018","unstructured":"Jonathan\u00a0Le Roux, Scott Wisdom, Hakan Erdogan, and John\u00a0R. Hershey. 2018. SDR - half-baked or well done? (2018)."},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"crossref","unstructured":"Hiroshi Sato Tsubasa Ochiai Keisuke Kinoshita Marc Delcroix Tomohiro Nakatani and Shoko Araki. 2021. Multimodal Attention Fusion for Target Speaker Extraction. (2021). arxiv:eess.AS\/2102.01326","DOI":"10.1109\/SLT48900.2021.9383539"},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"crossref","unstructured":"Hendrik Schr\u00f6ter Alberto\u00a0N. Escalante-B. Tobias Rosenkranz and Andreas Maier. 2022. DeepFilterNet: A Low Complexity Speech Enhancement Framework for Full-Band Audio based on Deep Filtering. (2022).","DOI":"10.1109\/ICASSP43922.2022.9747055"},{"key":"e_1_3_3_3_57_1","unstructured":"SDK. 2023. Steam Audio. https:\/\/valvesoftware.github.io\/steam-audio\/. (2023)."},{"key":"e_1_3_3_3_58_1","unstructured":"ShanonPearce. 2022. Shanonpearce\/ash-listening-set: A dataset of filters for headphone correction and binaural synthesis of spatial audio systems on headphones. (2022). https:\/\/github.com\/ShanonPearce\/ASH-Listening-Set\/tree\/main"},{"key":"e_1_3_3_3_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230550"},{"key":"e_1_3_3_3_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2010.2051803"},{"key":"e_1_3_3_3_61_1","unstructured":"Cem Subakan Mirco Ravanelli Samuele Cornell Fr\u00e9d\u00e9ric Lepoutre and Fran\u00e7ois Grondin. 2022. Resource-Efficient Separation Transformer. (2022)."},{"key":"e_1_3_3_3_62_1","unstructured":"Masahiro Sunohara Chiho Haruta and Nobutaka Ono. 2017. Low-latency real-time blind source separation with binaural directional hearing aids. (2017)."},{"key":"e_1_3_3_3_63_1","doi-asserted-by":"crossref","unstructured":"Bandhav Veluri Justin Chan Malek Itani Tuochao Chen Takuya Yoshioka and Shyamnath Gollakota. 2023. Real-Time Target Sound Extraction. (2023).","DOI":"10.1109\/ICASSP49357.2023.10094573"},{"key":"e_1_3_3_3_64_1","doi-asserted-by":"crossref","unstructured":"Bandhav Veluri Malek Itani Justin Chan Takuya Yoshioka and Shyamnath Gollakota. 2023. Semantic hearing: Programming acoustic scenes with binaural hearables. (2023).","DOI":"10.1145\/3586183.3606779"},{"key":"e_1_3_3_3_65_1","unstructured":"Li Wan Quan Wang Alan Papir and Ignacio\u00a0Lopez Moreno. 2020. Generalized End-to-End Loss for Speaker Verification. (2020). arxiv:eess.AS\/1710.10467"},{"key":"e_1_3_3_3_66_1","doi-asserted-by":"crossref","unstructured":"Anran Wang Maruchi Kim Hao Zhang and Shyamnath Gollakota. 2022. Hybrid Neural Networks for On-Device Directional Hearing. (2022).","DOI":"10.1609\/aaai.v36i10.21394"},{"key":"e_1_3_3_3_67_1","doi-asserted-by":"crossref","unstructured":"Quan Wang Hannah Muckenhirn Kevin\u00a0W. Wilson Prashant Sridhar Zelin Wu John\u00a0R. Hershey Rif\u00a0A. Saurous Ron\u00a0J. Weiss Ye Jia and Ignacio Lopez-Moreno. 2018. VoiceFilter: Targeted Voice Separation by Speaker-Conditioned Spectrogram Masking. (2018).","DOI":"10.21437\/Interspeech.2019-1101"},{"key":"e_1_3_3_3_68_1","unstructured":"Zhong-Qiu Wang Samuele Cornell Shukjae Choi Younglo Lee Byeong-Yeol Kim and Shinji Watanabe. 2023. TF-GridNet: Making Time-Frequency Domain Models Great Again for Monaural Speaker Separation. (2023). arxiv:cs.SD\/2209.03952"},{"key":"e_1_3_3_3_69_1","doi-asserted-by":"crossref","unstructured":"Gordon Wichern Joe Antognini Michael Flynn Licheng\u00a0Richard Zhu Emmett McQuinn Dwight Crow Ethan Manilow and Jonathan\u00a0Le Roux. 2019. WHAM!: Extending Speech Separation to Noisy Environments. (2019). arxiv:cs.SD\/1907.01160","DOI":"10.21437\/Interspeech.2019-2821"},{"key":"e_1_3_3_3_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/msp.2023.3240008"},{"key":"e_1_3_3_3_71_1","doi-asserted-by":"crossref","unstructured":"Kate\u0159ina \u017dmol\u00edkov\u00e1 Marc Delcroix Keisuke Kinoshita Takuya Higuchi Atsunori Ogawa and Tomohiro Nakatani. 2017. Speaker-Aware Neural Network Based Beamformer for Speaker Extraction in Speech Mixtures. (2017). https:\/\/api.semanticscholar.org\/CorpusID:5587779","DOI":"10.21437\/Interspeech.2017-667"},{"key":"e_1_3_3_3_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2019.2922820"}],"event":{"name":"CHI '24: CHI Conference on Human Factors in Computing Systems","location":"Honolulu HI USA","acronym":"CHI '24","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGACCESS ACM Special Interest Group on Accessible Computing"]},"container-title":["Proceedings of the CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613904.3642057","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3613904.3642057","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:56:41Z","timestamp":1750291001000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613904.3642057"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,11]]},"references-count":72,"alternative-id":["10.1145\/3613904.3642057","10.1145\/3613904"],"URL":"https:\/\/doi.org\/10.1145\/3613904.3642057","relation":{},"subject":[],"published":{"date-parts":[[2024,5,11]]},"assertion":[{"value":"2024-05-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}