{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:59:44Z","timestamp":1730249984909,"version":"3.28.0"},"reference-count":36,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,7,15]]},"DOI":"10.1109\/icme57554.2024.10687821","type":"proceedings-article","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T17:24:16Z","timestamp":1727717056000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["Self-Supervised Learning-Based General Fine-tuning Framework For Audio Classification and Event Detection"],"prefix":"10.1109","author":[{"given":"Yanjie","family":"Sun","sequence":"first","affiliation":[{"name":"National University of Defense Technology,Changsha,China"}]},{"given":"Kele","family":"Xu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,Changsha,China"}]},{"given":"Yong","family":"Dou","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,Changsha,China"}]},{"given":"Tian","family":"Gao","sequence":"additional","affiliation":[{"name":"iFlytek Research,Hefei,China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"5178","article-title":"BEATs: Audio pre-training with acoustic tokenizers","volume-title":"ICML","author":"S"},{"key":"ref2","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref3","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"ICML","author":"Alexei"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1016\/j.patter.2022.100616"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1609\/aaai.v36i10.21315"},{"key":"ref6","first-page":"2073","article-title":"Audio tagging by cross filtering noisy labels","volume-title":"TASLP","author":"Kong"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/ICASSP40776.2020.9054224"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref9","first-page":"28708","article-title":"Masked autoencoders that listen","volume":"35","author":"Po-Yao","year":"2022","journal-title":"NeurIPS"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"issue":"4","key":"ref11","doi-asserted-by":"crossref","first-page":"664","DOI":"10.1038\/s41593-023-01285-9","article-title":"Intermediate acoustic-to-semantic representations link behavioral and neural responses to natural sounds","volume":"26","author":"L","year":"2023","journal-title":"Nat. Neurosci"},{"key":"ref12","article-title":"Sound event detection with weak prediction for dcase 2023 challenge task4a","author":"S","year":"2023","journal-title":"tech. rep., DCASE"},{"key":"ref13","article-title":"Semi-supervised learning-based sound event detection using frequency dynamic convolution with large kernel attention for DCASE challenge 2023 task 4","author":"J.-W","year":"2023","journal-title":"tech. rep., DCASE"},{"key":"ref14","first-page":"3875","article-title":"Contrastive learning of general-purpose audio representations","volume-title":"ICASSP","author":"Aaqib"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1109\/TASLP.2022.3221007"},{"volume-title":"ICLR","author":"Dosovitskiy","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","key":"ref16"},{"year":"2021","author":"Yuan","article-title":"Ast: Audio spectrogram transformer","key":"ref17"},{"key":"ref18","first-page":"16000","article-title":"Masked autoencoders are scalable vision learners","volume-title":"CVPR","author":"Kaiming"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.21437\/Interspeech.2022-10961"},{"key":"ref20","first-page":"4308","article-title":"Filteraugment: An acoustic environmental data augmentation method","volume-title":"ICASSP","author":"H"},{"volume-title":"ICLR","author":"H","article-title":"mixup: Beyond empirical risk minimization","key":"ref21"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1109\/TGRS.2021.3121765"},{"key":"ref23","article-title":"Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results","volume":"30","author":"A","year":"2017","journal-title":"NeurIPS"},{"key":"ref24","first-page":"3635","article-title":"Interpolation consistency training for semi-supervised learning","volume-title":"IJCAI","author":"V"},{"key":"ref25","first-page":"376","article-title":"Sound event detection by consistency training and pseudo-labeling with feature-pyramid convolutional recurrent neural networks","volume-title":"ICASSP","author":"C.-Y"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.21437\/Interspeech.2022-10037"},{"key":"ref27","article-title":"Fine-tune the pretrained ATST model for sound event detection","volume":"abs\/2309.08153","author":"Shao","year":"2023","journal-title":"CoRR"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.21437\/Interspeech.2022-10127"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1109\/ICCV48922.2021.00288"},{"key":"ref30","first-page":"777","article-title":"Sound event detection and time-frequency segmentation from weakly labelled data","volume-title":"TASLP","author":"Q"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1145\/3581783.3612849"},{"key":"ref32","first-page":"776","article-title":"Audio set: An ontology and human-labeled dataset for audio events","volume-title":"ICASSP","author":"F"},{"key":"ref33","first-page":"1015","article-title":"Esc: Dataset for environmental sound classification","author":"J","year":"2015","journal-title":"ACM MM"},{"key":"ref34","first-page":"253","article-title":"Sound event detection in domestic environments with weakly labeled data and soundscape synthesis","volume-title":"DCASE","author":"N"},{"key":"ref35","first-page":"1021","article-title":"Threshold independent evaluation of sound event detection scores","volume-title":"ICASSP","author":"J"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.1145\/3503161.3551579"}],"event":{"name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2024,7,15]]},"location":"Niagara Falls, ON, Canada","end":{"date-parts":[[2024,7,19]]}},"container-title":["2024 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10685847\/10687354\/10687821.pdf?arnumber=10687821","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T05:56:47Z","timestamp":1727762207000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10687821\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,15]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/icme57554.2024.10687821","relation":{},"subject":[],"published":{"date-parts":[[2024,7,15]]}}}