{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T16:26:33Z","timestamp":1768926393317,"version":"3.49.0"},"reference-count":50,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Grand R&amp;D Plan of China","award":["2016YFB1000101"],"award-info":[{"award-number":["2016YFB1000101"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["81973244"],"award-info":[{"award-number":["81973244"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Defense Science and Technology Innovation Special Zone Project"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2020]]},"DOI":"10.1109\/taslp.2020.3008832","type":"journal-article","created":{"date-parts":[[2020,7,13]],"date-time":"2020-07-13T22:20:35Z","timestamp":1594678835000},"page":"2073-2083","source":"Crossref","is-referenced-by-count":17,"title":["Audio Tagging by Cross Filtering Noisy Labels"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7867-2112","authenticated-orcid":false,"given":"Boqing","family":"Zhu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5997-5169","authenticated-orcid":false,"given":"Kele","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2864-0475","authenticated-orcid":false,"given":"Qiuqiang","family":"Kong","sequence":"additional","affiliation":[]},{"given":"Huaimin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yuxing","family":"Peng","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","first-page":"2694","article-title":"Self-paced curriculum learning","author":"jiang","year":"0","journal-title":"Proc 29th AAAI Conf Artif Intell"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.33682\/w13e-5v06"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref31","first-page":"1","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref30","first-page":"69","article-title":"General-purpose tagging of freesound audio with audioset labels: Task description, dataset, and baseline","author":"fonseca","year":"0","journal-title":"Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)"},{"key":"ref37","first-page":"2080","article-title":"Pitch and spectral analysis of speech based on an auditory synchrony model","volume":"32","author":"seneff","year":"1985","journal-title":"J Hepatology"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1121\/1.400476"},{"key":"ref35","author":"young","year":"1993","journal-title":"The HTK hidden Markov model toolkit Design and philosophy"},{"key":"ref34","first-page":"248","article-title":"SPECMIX: A simple data augmentation to leverage clean and noisy set for efficient audio tagging","author":"bouteillon","year":"0","journal-title":"Detection and Classification of Acoustic Scenes and Events 2019 Workshop"},{"key":"ref28","first-page":"44","article-title":"General-purpose audio tagging by ensembling convolutional neural networks based on multiple features","author":"wilkinghoff","year":"0","journal-title":"Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.33682\/0avf-bm61"},{"key":"ref29","first-page":"133","article-title":"Acoustic scene classification using deep convolutional neural network and multiple spectrograms fusion","author":"weiping","year":"0","journal-title":"Proceedings of the Detection and Classification of Acoustic Scenes and Events 2017 Workshop (DCASE 2017)"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2871183"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.857573"},{"key":"ref20","first-page":"2304","article-title":"MentorNet: Learning data-driven curriculum for very deep neural networks on corrupted labels","volume":"80","author":"jiang","year":"0","journal-title":"Proc 35th Int Conf Mach Learn"},{"key":"ref22","doi-asserted-by":"crossref","first-page":"206","DOI":"10.1109\/JSTSP.2019.2908700","article-title":"Deep learning for audio signal processing","volume":"13","author":"hendrik","year":"2019","journal-title":"J Sel Topics Signal Process"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-63450-0"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351090"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2690563"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2018.8489641"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-00764-5_2"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2018.2885636"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.696"},{"key":"ref11","first-page":"960","article-title":"Decoupling&#x201D; when to update&#x201D; from&#x201D; how to update","author":"malach","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref40","first-page":"1919","article-title":"Robust loss functions under label noise for deep neural networks","author":"ghosh","year":"0","journal-title":"Proc 31st AAAI Conf Artif Intell"},{"key":"ref12","first-page":"7164","article-title":"How does disagreement help generalization against label corruption?","author":"yu","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref13","first-page":"1","article-title":"Training deep neural networks on noisy labels with bootstrapping","author":"reed","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref14","first-page":"197","article-title":"Audio tagging system for dcase 2018: Focusing on label noise data augmentation and its efficient learning","author":"jeong","year":"0","journal-title":"Proc Detect Classif Acoust Sce Events 2018 Workshop"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683158"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.240"},{"key":"ref17","first-page":"5836","article-title":"Masking: A new perspective of noisy supervision","author":"han","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref18","first-page":"1","article-title":"Temporal ensembling for semi-supervised learning","author":"laine","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref19","first-page":"1195","article-title":"Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results","author":"tarvainen","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref4","first-page":"8527","article-title":"Co-teaching: Robust training of deep neural networks with extremely noisy labels","author":"han","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/279943.279962"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/384"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00906"},{"key":"ref7","first-page":"1196","article-title":"Learning with noisy labels","author":"natarajan","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2895254"},{"key":"ref9","first-page":"8778","article-title":"Generalized cross entropy loss for training deep neural networks with noisy labels","author":"zhang","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref46","first-page":"212","article-title":"General-purpose audio tagging from noisy labels using convolutional neural networks","author":"iqbal","year":"0","journal-title":"Proc Workshop Detection Classif Acoust Scenes Events"},{"key":"ref45","first-page":"54","article-title":"DCASE 2018 task 2: Iterative training, label smoothing, and background noise normalization for audio event tagging","author":"nguyen","year":"0","journal-title":"Proc Workshop Detection Classif Acoust Scenes Events"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref42","first-page":"1","article-title":"Label-efficient audio classification through multitask learning and self-supervision","author":"lee","year":"0","journal-title":"Proc Workshop Int Conf Learn Represent"},{"key":"ref41","first-page":"1705","article-title":"Clustering with Bregman divergences","volume":"6","author":"banerjee","year":"2005","journal-title":"J Mach Learn Res"},{"key":"ref44","first-page":"1","article-title":"Pseudo-label: The simple and efficient semi-supervised learning method for deep neural networks","author":"lee","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref43","first-page":"217","article-title":"DCASE 2018 challenge surrey cross-task convolutional neural network baseline","author":"kong","year":"0","journal-title":"Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/8938144\/09139374.pdf?arnumber=9139374","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T17:30:51Z","timestamp":1651080651000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9139374\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/taslp.2020.3008832","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]}}}