{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:02:09Z","timestamp":1772323329112,"version":"3.50.1"},"reference-count":24,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,4,14]]},"DOI":"10.1109\/icassp48485.2024.10446348","type":"proceedings-article","created":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T18:56:31Z","timestamp":1710788191000},"page":"291-295","source":"Crossref","is-referenced-by-count":18,"title":["CED: Consistent Ensemble Distillation for Audio Tagging"],"prefix":"10.1109","author":[{"given":"Heinrich","family":"Dinkel","sequence":"first","affiliation":[{"name":"Xiaomi Corporation,Beijing,China"}]},{"given":"Yongqing","family":"Wang","sequence":"additional","affiliation":[{"name":"Xiaomi Corporation,Beijing,China"}]},{"given":"Zhiyong","family":"Yan","sequence":"additional","affiliation":[{"name":"Xiaomi Corporation,Beijing,China"}]},{"given":"Junbo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xiaomi Corporation,Beijing,China"}]},{"given":"Yujun","family":"Wang","sequence":"additional","affiliation":[{"name":"Xiaomi Corporation,Beijing,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.3390\/buildings12111947"},{"key":"ref2","author":"Dinkel","year":"2023","journal-title":"Streaming audio transformers for online audio tagging"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-607"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095534"},{"key":"ref5","author":"Chen","year":"2022","journal-title":"Beats: Audio pre-training with acoustic tokenizers"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746431"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref9","article-title":"Mavil: Masked audio-video learners","author":"Huang","year":"2022"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref11","first-page":"28708","article-title":"Masked autoencoders that listen","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Huang"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01065"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096110"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19803-8_5"},{"key":"ref16","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4899-7687-1_79"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095255"},{"key":"ref20","article-title":"8-bit optimizers via block-wise quantization","volume-title":"9th International Conference on Learning Representations, ICLR","author":"Dettmers"},{"key":"ref21","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095691"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1564"},{"key":"ref24","first-page":"125","article-title":"Hear: Holistic evaluation of audio representations","volume-title":"NeurIPS 2021 Competitions and Demonstrations Track","author":"Turian"}],"event":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Seoul, Korea, Republic of","start":{"date-parts":[[2024,4,14]]},"end":{"date-parts":[[2024,4,19]]}},"container-title":["ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10445798\/10445803\/10446348.pdf?arnumber=10446348","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,2]],"date-time":"2024-08-02T05:11:24Z","timestamp":1722575484000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10446348\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,14]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/icassp48485.2024.10446348","relation":{},"subject":[],"published":{"date-parts":[[2024,4,14]]}}}