{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T14:31:14Z","timestamp":1767709874508,"version":"3.48.0"},"reference-count":66,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T00:00:00Z","timestamp":1764633600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T00:00:00Z","timestamp":1767657600000},"content-version":"vor","delay-in-days":35,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"name":"Italy NRRP - NextGenerationEU","award":["PE00000001"],"award-info":[{"award-number":["PE00000001"]}]},{"name":"Italy MUR - NextGenerationEU","award":["E13C22001060006"],"award-info":[{"award-number":["E13C22001060006"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J AUDIO SPEECH MUSIC PROC."],"DOI":"10.1186\/s13636-025-00436-z","type":"journal-article","created":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T08:04:54Z","timestamp":1764662694000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["AudioSet-tools: a Python framework for taxonomy-aware AudioSet curation and reproducible audio research"],"prefix":"10.1186","volume":"2026","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0438-1748","authenticated-orcid":false,"given":"Stefano","family":"Giacomelli","sequence":"first","affiliation":[]},{"given":"Marco","family":"Giordano","sequence":"additional","affiliation":[]},{"given":"Claudia","family":"Rinaldi","sequence":"additional","affiliation":[]},{"given":"Fabio","family":"Graziosi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,12,2]]},"reference":[{"key":"436_CR1","doi-asserted-by":"publisher","DOI":"10.3390\/app10062020","author":"J Abe\u00dfer","year":"2020","unstructured":"J. Abe\u00dfer, A review of deep learning based methods for acoustic scene classification. Appl. Sci. (2020). https:\/\/doi.org\/10.3390\/app10062020","journal-title":"Appl. Sci."},{"key":"436_CR2","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.121902","volume":"238","author":"B Ding","year":"2024","unstructured":"B. Ding et al., Acoustic scene classification: a comprehensive survey. Expert Syst. Appl. 238, 121902 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2023.121902","journal-title":"Expert Syst. Appl."},{"key":"436_CR3","doi-asserted-by":"publisher","unstructured":"S. Chandrakala, S.L. Jayalakshmi, Environmental audio scene and sound event recognition for autonomous surveillance: a survey and comparative studies. ACM Comput. Surv. 52(3) (2019). https:\/\/doi.org\/10.1145\/3322240","DOI":"10.1145\/3322240"},{"issue":"5","key":"436_CR4","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1109\/MSP.2021.3090678","volume":"38","author":"A Mesaros","year":"2021","unstructured":"A. Mesaros, T. Heittola, T. Virtanen, M.D. Plumbley, Sound event detection: a tutorial. IEEE Signal Process. Mag. 38(5), 67\u201383 (2021). https:\/\/doi.org\/10.1109\/MSP.2021.3090678","journal-title":"IEEE Signal Process. Mag."},{"key":"436_CR5","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2023.109719","volume":"215","author":"C Castorena","year":"2024","unstructured":"C. Castorena, M. Cobos, J. Lopez-Ballester, F.J. Ferri, A safety-oriented framework for sound event detection in driving scenarios. Appl. Acoust. 215, 109719 (2024). https:\/\/doi.org\/10.1016\/j.apacoust.2023.109719","journal-title":"Appl. Acoust."},{"issue":"4","key":"436_CR6","doi-asserted-by":"publisher","first-page":"5537","DOI":"10.1007\/s11042-021-11817-9","volume":"81","author":"Z Mnasri","year":"2022","unstructured":"Z. Mnasri, S. Rovetta, F. Masulli, Anomalous sound event detection: a survey of machine learning based methods and applications. Multimed. Tools Appl. 81(4), 5537\u20135586 (2022). https:\/\/doi.org\/10.1007\/s11042-021-11817-9","journal-title":"Multimed. Tools Appl."},{"issue":"1","key":"436_CR7","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1121\/10.0011809","volume":"152","author":"PA Grumiaux","year":"2022","unstructured":"P.A. Grumiaux, S. Kiti\u0107, L. Girin, A. Gu\u00e9rin, A survey of sound source localization with deep learning methods. J. Acoust. Soc. Am. 152(1), 107\u2013151 (2022). https:\/\/doi.org\/10.1121\/10.0011809","journal-title":"J. Acoust. Soc. Am."},{"key":"436_CR8","doi-asserted-by":"publisher","unstructured":"J.F. Gemmeke, D.P.W. Ellis, D.\u00a0Freedman, A.\u00a0Jansen, W.\u00a0Lawrence, R.C. Moore, M.\u00a0Plakal, M.\u00a0Ritter, in 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Audio Set: An ontology and human-labeled dataset for audio events (IEEE, New Orleans, 2017), pp. 776\u2013780. https:\/\/doi.org\/10.1109\/ICASSP.2017.7952261","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"436_CR9","doi-asserted-by":"publisher","unstructured":"S.\u00a0Roller, D.\u00a0Kiela, M.\u00a0Nickel. Hearst Patterns Revisited: Automatic Hypernym Detection from Large Text Corpora (2018). https:\/\/doi.org\/10.48550\/arXiv.1806.03191","DOI":"10.48550\/arXiv.1806.03191"},{"key":"436_CR10","doi-asserted-by":"publisher","unstructured":"T.\u00a0Pellissier\u00a0Tanon, D.\u00a0Vrande\u010di\u0107, S.\u00a0Schaffert, T.\u00a0Steiner, L.\u00a0Pintscher, in Proceedings of the 25th International Conference on World Wide Web, From Freebase to Wikidata: The Great Migration, WWW \u201916 (International World Wide Web Conferences Steering Committee, Republic and Canton of Geneva, CHE, 2016), pp. 1419\u20131428. https:\/\/doi.org\/10.1145\/2872427.2874809","DOI":"10.1145\/2872427.2874809"},{"key":"436_CR11","doi-asserted-by":"publisher","unstructured":"N.\u00a0Turpault, R.\u00a0Serizel, E.\u00a0Vincent, in 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2020), Limitations of Weak Labels for Embedding and Tagging (2020), pp. 131\u2013135. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053160","DOI":"10.1109\/ICASSP40776.2020.9053160"},{"key":"436_CR12","doi-asserted-by":"publisher","unstructured":"S.\u00a0Damiano, T.\u00a0Dietzen, T.v. Waterschoot. Frequency Tracking Features for Data-Efficient Deep Siren Identification (2024). https:\/\/doi.org\/10.48550\/arXiv.2409.08587","DOI":"10.48550\/arXiv.2409.08587"},{"key":"436_CR13","doi-asserted-by":"publisher","first-page":"75702","DOI":"10.1109\/ACCESS.2020.2988986","volume":"8","author":"VT Tran","year":"2020","unstructured":"V.T. Tran, W.H. Tsai, Acoustic-based emergency vehicle detection using convolutional neural networks. IEEE Access 8, 75702\u201375713 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.2988986","journal-title":"IEEE Access"},{"key":"436_CR14","doi-asserted-by":"publisher","unstructured":"A. Shah, A. Singh, A. Singh, Audio Classification of Emergency Vehicle Sirens Using Recurrent Neural Network Architectures, in Proceedings of International Conference on Paradigms of Communication, Computing and Data Analytics, ed. by A. Yadav, S.J. Nanda, M.H. Lim (Springer Nature, Singapore, 2023), pp.71\u201383. https:\/\/doi.org\/10.1007\/978-981-99-4626-6_6","DOI":"10.1007\/978-981-99-4626-6_6"},{"key":"436_CR15","doi-asserted-by":"publisher","unstructured":"S.\u00a0Hershey, D.P.W. Ellis, E.\u00a0Fonseca, A.\u00a0Jansen, C.\u00a0Liu, R.\u00a0Channing\u00a0Moore, M.\u00a0Plakal, in ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), The Benefit of Temporally-Strong Labels in Audio Event Classification (2021), pp. 366\u2013370.https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9414579","DOI":"10.1109\/ICASSP39728.2021.9414579"},{"key":"436_CR16","unstructured":"B.\u00a0Ahmed. audioset-utils GitHub (2024). https:\/\/github.com\/bilalhsp\/audioset-utils. Accessed 3 Nov 2025"},{"key":"436_CR17","unstructured":"A.\u00a0McDonagh. audioset-processing GitHub (2025). https:\/\/github.com\/aoifemcdonagh\/audioset-processing.\u00a0Accessed 3 Nov 2025"},{"key":"436_CR18","unstructured":"K.\u00a0Lee. Fast-Audioset-Download GitHub (2025). https:\/\/github.com\/dlrudco\/Fast-Audioset-Download. Accessed 3 Nov 2025"},{"key":"436_CR19","unstructured":"yt-dlp GitHub (2025). https:\/\/github.com\/yt-dlp\/yt-dlp.\u00a0Accessed 3 Nov 2025"},{"issue":"146","key":"436_CR20","first-page":"10","volume":"2006","author":"S Tomar","year":"2006","unstructured":"S. Tomar, Converting video formats with FFmpeg. Linux J. 2006(146), 10 (2006)","journal-title":"Linux J."},{"key":"436_CR21","doi-asserted-by":"publisher","unstructured":"J.\u00a0Salamon, C.\u00a0Jacoby, J.P. Bello, in Proceedings of the 22nd ACM international conference on Multimedia, A Dataset and Taxonomy for Urban Sound Research (ACM, Orlando Florida USA, 2014), pp. 1041\u20131044. https:\/\/doi.org\/10.1145\/2647868.2655045","DOI":"10.1145\/2647868.2655045"},{"key":"436_CR22","doi-asserted-by":"publisher","unstructured":"E.\u00a0Fonseca, X.\u00a0Favory, J.\u00a0Pons, F.\u00a0Font, X.\u00a0Serra. FSD50K: An Open Dataset of Human-Labeled Sound Events (2022). https:\/\/doi.org\/10.48550\/arXiv.2010.00475","DOI":"10.48550\/arXiv.2010.00475"},{"key":"436_CR23","doi-asserted-by":"publisher","unstructured":"P.\u00a0Stamatiadis, M.\u00a0Olvera, S.\u00a0Essid. SALT: Standardized Audio event Label Taxonomy (2024). https:\/\/doi.org\/10.48550\/arXiv.2409.11746","DOI":"10.48550\/arXiv.2409.11746"},{"key":"436_CR24","doi-asserted-by":"publisher","unstructured":"H. Liu et al., in Interspeech 2023, Ontology-aware Learning and Evaluation for Audio Tagging (2023), pp. 3799\u20133803. https:\/\/doi.org\/10.21437\/Interspeech.2023-979","DOI":"10.21437\/Interspeech.2023-979"},{"key":"436_CR25","doi-asserted-by":"publisher","unstructured":"J.\u00a0Liang, H.\u00a0Phan, E.\u00a0Benetos, in ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Learning from Taxonomy: Multi-Label Few-Shot Classification for Everyday Sound Recognition (2024), pp. 771\u2013775. https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10446908","DOI":"10.1109\/ICASSP48485.2024.10446908"},{"key":"436_CR26","unstructured":"py-salt GitHub. https:\/\/github.com\/tpt-adasp\/salt\/tree\/main\/py-salt.\u00a0Accessed 3 Nov 2025"},{"key":"436_CR27","doi-asserted-by":"publisher","unstructured":"A.\u00a0Shah, A.\u00a0Singh. sireNNet-Emergency Vehicle Siren Classification Dataset For Urban Applications (2023). https:\/\/doi.org\/10.17632\/j4ydzzv4kb.1","DOI":"10.17632\/j4ydzzv4kb.1"},{"key":"436_CR28","doi-asserted-by":"publisher","unstructured":"M.\u00a0Usaid, M.\u00a0Asif, t.\u00a0rajab, P.D.E.S. Hussain, P.D.s.M. munaf, S.\u00a0Wasi. Large-Scale Audio Dataset for Emergency Vehicle Sirens and Road Noises (2022). https:\/\/doi.org\/10.6084\/M9.FIGSHARE.19291472.V2","DOI":"10.6084\/M9.FIGSHARE.19291472.V2"},{"issue":"1","key":"436_CR29","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-022-01727-2","volume":"9","author":"M Asif","year":"2022","unstructured":"M. Asif, M. Usaid, M. Rashid, T. Rajab, S. Hussain, S. Wasi, Large-scale audio dataset for emergency vehicle sirens and road noises. Sci. Data 9(1), 599 (2022). https:\/\/doi.org\/10.1038\/s41597-022-01727-2","journal-title":"Sci. Data"},{"key":"436_CR30","doi-asserted-by":"publisher","unstructured":"K.J. Piczak, in Proceedings of the 23rd ACM international conference on Multimedia, ESC: Dataset for Environmental Sound Classification (Association for Computing Machinery, New York, 2015), MM \u201915, pp. 1015\u20131018. https:\/\/doi.org\/10.1145\/2733373.2806390","DOI":"10.1145\/2733373.2806390"},{"key":"436_CR31","doi-asserted-by":"publisher","unstructured":"S. Giacomelli et al., From Large-scale Audio Tagging to Real-Time Explainable Emergency Vehicle Sirens Detection (2025). https:\/\/doi.org\/10.48550\/arXiv.2506.23437","DOI":"10.48550\/arXiv.2506.23437"},{"key":"436_CR32","unstructured":"M.L. Quatra. audioset-download GitHub (2025). https:\/\/github.com\/MorenoLaQuatra\/audioset-download.\u00a0Accessed 3 Nov 2025"},{"issue":"7825","key":"436_CR33","doi-asserted-by":"publisher","first-page":"357","DOI":"10.1038\/s41586-020-2649-2","volume":"585","author":"CR Harris","year":"2020","unstructured":"C.R. Harris, K.J. Millman, S.J. Van Der Walt, R. Gommers, P. Virtanen, D. Cournapeau, E. Wieser, J. Taylor, S. Berg, N.J. Smith, R. Kern, M. Picus, S. Hoyer, M.H. Van Kerkwijk, M. Brett, A. Haldane, J.F. Del R\u00edo, M. Wiebe, P. Peterson, P. G\u00e9rard-Marchant, K. Sheppard, T. Reddy, W. Weckesser, H. Abbasi, C. Gohlke, T.E. Oliphant, Array programming with NumPy. Nature 585(7825), 357\u2013362 (2020). https:\/\/doi.org\/10.1038\/s41586-020-2649-2","journal-title":"Nature"},{"key":"436_CR34","unstructured":"J.O. Smith, Digital Audio Resampling Home Page (Center for Computer Research in Music and Acoustics, 2002). https:\/\/ccrma.stanford.edu\/~jos\/resample\/.\u00a0Accessed 3 Nov 2025"},{"key":"436_CR35","doi-asserted-by":"publisher","unstructured":"Y.Y. Yang, M.\u00a0Hira, Z.\u00a0Ni, A.\u00a0Chourdia, A.\u00a0Astafurov, C.\u00a0Chen, C.F. Yeh, C.\u00a0Puhrsch, D.\u00a0Pollack, D.\u00a0Genzel, D.\u00a0Greenberg, E.Z. Yang, J.\u00a0Lian, J.\u00a0Mahadeokar, J.\u00a0Hwang, J.\u00a0Chen, P.\u00a0Goldsborough, P.\u00a0Roy, S.\u00a0Narenthiran, S.\u00a0Watanabe, S.\u00a0Chintala, V.\u00a0Quenneville-B\u00e9lair, Y.\u00a0Shi. TorchAudio: Building Blocks for Audio and Speech Processing (2022). https:\/\/doi.org\/10.48550\/arXiv.2110.15018","DOI":"10.48550\/arXiv.2110.15018"},{"key":"436_CR36","doi-asserted-by":"publisher","unstructured":"J.\u00a0Hwang, M.\u00a0Hira, C.\u00a0Chen, X.\u00a0Zhang, Z.\u00a0Ni, G.\u00a0Sun, P.\u00a0Ma, R.\u00a0Huang, V.\u00a0Pratap, Y.\u00a0Zhang, A.\u00a0Kumar, C.Y. Yu, C.\u00a0Zhu, C.\u00a0Liu, J.\u00a0Kahn, M.\u00a0Ravanelli, P.\u00a0Sun, S.\u00a0Watanabe, Y.\u00a0Shi, Y.\u00a0Tao, R.\u00a0Scheibler, S.\u00a0Cornell, S.\u00a0Kim, S.\u00a0Petridis. TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch (2023). https:\/\/doi.org\/10.48550\/arXiv.2310.17864","DOI":"10.48550\/arXiv.2310.17864"},{"key":"436_CR37","doi-asserted-by":"publisher","unstructured":"B.\u00a0McFee, A.\u00a0Valentino, S.\u00a0Kruchinin, E.\u00a0Karpov, N.\u00a0Werner, W.\u00a0Pimenta, chris, endolith. bmcfee\/resampy: 0.4.3 (2024). https:\/\/doi.org\/10.5281\/ZENODO.596633","DOI":"10.5281\/ZENODO.596633"},{"issue":"4","key":"436_CR38","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1007\/s41403-023-00424-9","volume":"8","author":"K Choudhury","year":"2023","unstructured":"K. Choudhury, D. Nandi, Review of emergency vehicle detection techniques by acoustic signals. Trans. Indian Natl. Acad. Eng. 8(4), 535\u2013550 (2023). https:\/\/doi.org\/10.1007\/s41403-023-00424-9","journal-title":"Trans. Indian Natl. Acad. Eng."},{"key":"436_CR39","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2023.109719","volume":"215","author":"C Castorena","year":"2024","unstructured":"C. Castorena, M. Cobos, J. Lopez-Ballester, F.J. Ferri, A safety-oriented framework for sound event detection in driving scenarios. Appl. Acoust. 215, 109719 (2024). https:\/\/doi.org\/10.1016\/j.apacoust.2023.109719","journal-title":"Appl. Acoust."},{"key":"436_CR40","unstructured":"D&R Electronics. An overview of emergency vehicle sirens. (2021). https:\/\/www.dandrelectronics.com\/an-overview-of-emergency-vehicle-sirens.html.\u00a0Accessed 3 Nov 2025"},{"key":"436_CR41","unstructured":"LED Equipped. Different siren sounds used by first responders. (2022). https:\/\/www.ledequipped.com\/blogs\/news\/different-siren-sounds-used-by-first-responders.\u00a0Accessed 3 Nov 2025"},{"key":"436_CR42","unstructured":"BossHorn. Different fire truck siren sounds: An overview. (2021). https:\/\/bosshorn.com\/blogs\/blog\/different-fire-truck-siren-sounds.\u00a0Accessed 3 Nov 2025"},{"key":"436_CR43","unstructured":"Extreme Tactical Dynamics. What are the different sounds a police siren makes. (2020). https:\/\/www.extremetacticaldynamics.com\/blog\/what-are-the-different-sounds-a-police-siren-makes\/.\u00a0Accessed 3 Nov 2025"},{"key":"436_CR44","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2022.105000","volume":"114","author":"AE Ramirez","year":"2022","unstructured":"A.E. Ramirez, E. Donati, C. Chousidis, A siren identification system using deep learning to aid hearing-impaired people. Eng. Appl. Artif. Intell. 114, 105000 (2022). https:\/\/doi.org\/10.1016\/j.engappai.2022.105000","journal-title":"Eng. Appl. Artif. Intell."},{"key":"436_CR45","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.123608","volume":"249","author":"MY Shams","year":"2024","unstructured":"M.Y. Shams, T. Abd El-Hafeez, E. Hassan, Acoustic data detection in large-scale emergency vehicle sirens and road noise dataset. Expert Syst. Appl. 249, 123608 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2024.123608","journal-title":"Expert Syst. Appl."},{"key":"436_CR46","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1016\/j.procs.2023.01.005","volume":"218","author":"U Mittal","year":"2023","unstructured":"U. Mittal, P. Chawla, Acoustic based emergency vehicle detection using ensemble of deep learning models. Procedia Comput. Sci. 218, 227\u2013234 (2023). https:\/\/doi.org\/10.1016\/j.procs.2023.01.005","journal-title":"Procedia Comput. Sci."},{"issue":"12","key":"436_CR47","doi-asserted-by":"publisher","DOI":"10.3390\/s22124338","volume":"22","author":"M Cantarini","year":"2022","unstructured":"M. Cantarini, L. Gabrielli, S. Squartini, Few-shot emergency siren detection. Sensors 22(12), 4338 (2022). https:\/\/doi.org\/10.3390\/s22124338","journal-title":"Sensors"},{"issue":"10","key":"436_CR48","doi-asserted-by":"publisher","first-page":"17087","DOI":"10.1109\/TITS.2022.3158076","volume":"23","author":"L Marchegiani","year":"2022","unstructured":"L. Marchegiani, P. Newman, Listening for sirens: locating and classifying acoustic alarms in city scenes. IEEE Trans. Intell. Transp. Syst. 23(10), 17087\u201317096 (2022). https:\/\/doi.org\/10.1109\/TITS.2022.3158076","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"436_CR49","doi-asserted-by":"publisher","unstructured":"A.\u00a0Paszke, S.\u00a0Gross, F.\u00a0Massa, A.\u00a0Lerer, J.\u00a0Bradbury, G.\u00a0Chanan, T.\u00a0Killeen, Z.\u00a0Lin, N.\u00a0Gimelshein, L.\u00a0Antiga, A.\u00a0Desmaison, A.\u00a0K\u00f6pf, E.\u00a0Yang, Z.\u00a0DeVito, M.\u00a0Raison, A.\u00a0Tejani, S.\u00a0Chilamkurthy, B.\u00a0Steiner, L.\u00a0Fang, J.\u00a0Bai, S.\u00a0Chintala. PyTorch: An Imperative Style, High-Performance Deep Learning Library (2019). https:\/\/doi.org\/10.48550\/arXiv.1912.01703","DOI":"10.48550\/arXiv.1912.01703"},{"key":"436_CR50","doi-asserted-by":"publisher","unstructured":"W.\u00a0Falcon, J.\u00a0Borovec, A.\u00a0W\u00e4lchli, N.\u00a0Eggert, J.\u00a0Schock, J.\u00a0Jordan, N.\u00a0Skafte, Ir1dXD, V.\u00a0Bereznyuk, E.\u00a0Harris, T.\u00a0Murrell, P.\u00a0Yu, S.\u00a0Pr\u00e6sius, T.\u00a0Addair, J.\u00a0Zhong, D.\u00a0Lipin, S.\u00a0Uchida, S.\u00a0Bapat, H.\u00a0Schr\u00f6ter, B.\u00a0Dayma, A.\u00a0Karnachev, A.\u00a0Kulkarni, S.\u00a0Komatsu, Martin.B, J.B. Schiratti, H.\u00a0Mary, D.\u00a0Byrne, C.\u00a0Eyzaguirre, cinjon, A.\u00a0Bakhtin. PyTorchLightning\/pytorch-lightning: 0.7.6 release (2020). https:\/\/doi.org\/10.5281\/zenodo.3828935","DOI":"10.5281\/zenodo.3828935"},{"key":"436_CR51","doi-asserted-by":"publisher","unstructured":"D.S. Park, W.\u00a0Chan, Y.\u00a0Zhang, C.C. Chiu, B.\u00a0Zoph, E.D. Cubuk, Q.V. Le, in Interspeech 2019, SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition (ISCA, 2019), pp. 2613\u20132617. https:\/\/doi.org\/10.21437\/Interspeech.2019-2680","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"436_CR52","doi-asserted-by":"publisher","unstructured":"H.\u00a0Zhang, M.\u00a0Cisse, Y.N. Dauphin, D.\u00a0Lopez-Paz. mixup: Beyond Empirical Risk Minimization (2018). https:\/\/doi.org\/10.48550\/arXiv.1710.09412","DOI":"10.48550\/arXiv.1710.09412"},{"key":"436_CR53","doi-asserted-by":"publisher","unstructured":"P.Y. Huang, H.\u00a0Xu, J.\u00a0Li, A.\u00a0Baevski, M.\u00a0Auli, W.\u00a0Galuba, F.\u00a0Metze, C.\u00a0Feichtenhofer. Masked Autoencoders that Listen (2023). https:\/\/doi.org\/10.48550\/arXiv.2207.06405","DOI":"10.48550\/arXiv.2207.06405"},{"key":"436_CR54","doi-asserted-by":"publisher","unstructured":"K.\u00a0Koutini, J.\u00a0Schl\u00fcter, H.\u00a0Eghbal-zadeh, G.\u00a0Widmer, in Interspeech 2022, Efficient Training of Audio Transformers with Patchout (ISCA, 2022), pp. 2753\u20132757. https:\/\/doi.org\/10.21437\/Interspeech.2022-227","DOI":"10.21437\/Interspeech.2022-227"},{"key":"436_CR55","doi-asserted-by":"publisher","unstructured":"A.\u00a0Singh, H.\u00a0Liu, M.D. Plumbley. E-PANNs: Sound Recognition Using Efficient Pre-trained Audio Neural Networks (2023). https:\/\/doi.org\/10.48550\/arXiv.2305.18665","DOI":"10.48550\/arXiv.2305.18665"},{"key":"436_CR56","doi-asserted-by":"publisher","first-page":"2880","DOI":"10.1109\/TASLP.2020.3030497","volume":"28","author":"Q Kong","year":"2020","unstructured":"Q. Kong, Y. Cao, T. Iqbal, Y. Wang, W. Wang, M.D. Plumbley, PANNs: large-scale pretrained audio neural networks for audio pattern recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 28, 2880\u20132894 (2020). https:\/\/doi.org\/10.1109\/TASLP.2020.3030497","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"436_CR57","doi-asserted-by":"publisher","first-page":"383","DOI":"10.1016\/j.patrec.2020.02.004","volume":"131","author":"A Singh","year":"2020","unstructured":"A. Singh, P. Rajan, A. Bhavsar, SVD-based redundancy removal in 1-D CNNs for acoustic scene classification. Pattern Recogn. Lett. 131, 383\u2013389 (2020). https:\/\/doi.org\/10.1016\/j.patrec.2020.02.004","journal-title":"Pattern Recogn. Lett."},{"key":"436_CR58","doi-asserted-by":"publisher","unstructured":"A.\u00a0Singh, M.D. Plumbley, in ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Efficient Similarity-Based Passive Filter Pruning for Compressing CNNS (2023), pp. 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10095560. ISSN: 2379-190X","DOI":"10.1109\/ICASSP49357.2023.10095560"},{"key":"436_CR59","doi-asserted-by":"publisher","unstructured":"S. Giacomelli, M. Giordano, C. Rinaldi, in 2024 IEEE Symposium on Computers and Communications (ISCC), The OCON model: an old but gold solution for distributable supervised classification (2024), pp. 1\u20137. https:\/\/doi.org\/10.1109\/ISCC61673.2024.10733621","DOI":"10.1109\/ISCC61673.2024.10733621"},{"key":"436_CR60","doi-asserted-by":"publisher","unstructured":"S. Giacomelli, M. Giordano, C. Rinaldi, in 2024 IEEE 5th International Symposium on the Internet of Sounds (IS2), The OCON Model: An Old but Green Solution for Distributable Supervised Classification for Acoustic Monitoring in Smart. Cities (2024), pp. 1\u201310. https:\/\/doi.org\/10.1109\/IS262782.2024.10704155","DOI":"10.1109\/IS262782.2024.10704155"},{"issue":"98","key":"436_CR61","doi-asserted-by":"publisher","DOI":"10.21105\/joss.06634","volume":"9","author":"M Fuentes","year":"2024","unstructured":"M. Fuentes et al., Soundata: reproducible use of audio datasets. J. Open Source Softw. 9(98), 6634 (2024). https:\/\/doi.org\/10.21105\/joss.06634","journal-title":"J. Open Source Softw."},{"key":"436_CR62","doi-asserted-by":"publisher","unstructured":"H.\u00a0Chen, W.\u00a0Xie, A.\u00a0Vedaldi, A.\u00a0Zisserman, in ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Vggsound: A Large-Scale Audio-Visual Dataset (2020), pp. 721\u2013725. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053174","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"436_CR63","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.7244360","author":"IM Morato","year":"2023","unstructured":"I.M. Morato, M. Harju, A. Mesaros, MAESTRO Real - Multi-Annotator Estimated Strong Labels (2023). https:\/\/doi.org\/10.5281\/zenodo.7244360","journal-title":"MAESTRO Real - Multi-Annotator Estimated Strong Labels"},{"key":"436_CR64","doi-asserted-by":"publisher","unstructured":"M.\u00a0Fuentes, B.\u00a0Steers, P.\u00a0Zinemanas, M.\u00a0Rocamora, L.\u00a0Bondi, J.\u00a0Wilkins, Q.\u00a0Shi, Y.\u00a0Hou, S.\u00a0Das, X.\u00a0Serra, J.P. Bello, in ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Urban Sound & Sight: Dataset And Benchmark For Audio-Visual Urban Scene Understanding (2022), pp. 141\u2013145. https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747644","DOI":"10.1109\/ICASSP43922.2022.9747644"},{"key":"436_CR65","doi-asserted-by":"publisher","unstructured":"M.\u00a0Cartwright, A.E.M. Mendez, A.\u00a0Cramer, V.\u00a0Lostanlen, G.\u00a0Dove, H.H. Wu, J.\u00a0Salamon, O.\u00a0Nov, J.\u00a0Bello. SONYC Urban Sound Tagging (SONYC-UST): A Multilabel Dataset from an Urban Acoustic Sensor Network (2019). https:\/\/doi.org\/10.33682\/j5zw-2t88","DOI":"10.33682\/j5zw-2t88"},{"key":"436_CR66","doi-asserted-by":"publisher","unstructured":"M.\u00a0Cartwright, J.\u00a0Cramer, A.E.M. Mendez, Y.\u00a0Wang, H.H. Wu, V.\u00a0Lostanlen, M.\u00a0Fuentes, G.\u00a0Dove, C.\u00a0Mydlarz, J.\u00a0Salamon, O.\u00a0Nov, J.P. Bello. SONYC-UST-V2: an urban sound tagging dataset with spatiotemporal context (2020). https:\/\/doi.org\/10.48550\/arXiv.2009.05188","DOI":"10.48550\/arXiv.2009.05188"}],"container-title":["EURASIP Journal on Audio, Speech, and Music Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-025-00436-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s13636-025-00436-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-025-00436-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T14:28:23Z","timestamp":1767709703000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1186\/s13636-025-00436-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,2]]},"references-count":66,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2026,12]]}},"alternative-id":["436"],"URL":"https:\/\/doi.org\/10.1186\/s13636-025-00436-z","relation":{},"ISSN":["1687-4722"],"issn-type":[{"type":"electronic","value":"1687-4722"}],"subject":[],"published":{"date-parts":[[2025,12,2]]},"assertion":[{"value":"5 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"2"}}