{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:56:15Z","timestamp":1781535375842,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100004921","name":"Shanghai Jiao Tong University","doi-asserted-by":"publisher","award":["25X010506040"],"award-info":[{"award-number":["25X010506040"]}],"id":[{"id":"10.13039\/501100004921","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004921","name":"Shanghai Jiao Tong University","doi-asserted-by":"publisher","award":["T541PRP49003"],"award-info":[{"award-number":["T541PRP49003"]}],"id":[{"id":"10.13039\/501100004921","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004921","name":"National College Students' Innovation and Entrepreneurship Training Program","doi-asserted-by":"publisher","award":["202610269116G"],"award-info":[{"award-number":["202610269116G"]}],"id":[{"id":"10.13039\/501100004921","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810789","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"595-604","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["From Physics to Representation: Audio Learning with Synthetic Pre-training via Procedural Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5646-990X","authenticated-orcid":false,"given":"Fengrui","family":"Liu","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, East China Normal University, shanghai, shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0178-8572","authenticated-orcid":false,"given":"Ruiyang","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Southeast University, nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6462-5400","authenticated-orcid":false,"given":"Qijian","family":"Zheng","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Fudan University, shanghai, shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8774-4732","authenticated-orcid":false,"given":"Yuanfang","family":"Wang","sequence":"additional","affiliation":[{"name":"Global College, Shanghai Jiao Tong University, shanghai, shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5289-5761","authenticated-orcid":false,"given":"Feng","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Psychology, Shanghai Jiao Tong University, shanghai, shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01751"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10961"},{"key":"e_1_3_3_1_4_2","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Baevski Alexei","year":"2022","unstructured":"Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, et\u00a0al. 2022. Data2vec: A General Framework for Self-Supervised Learning in Speech, Vision and Language. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_1_5_2","first-page":"1681","volume-title":"Proc. Int. Conf. Learn. Representations (ICLR)","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Steffen Schneider, and Michael Auli. 2020. vq-wav2vec: Self-Supervised Learning of Discrete Speech Representations. In Proc. Int. Conf. Learn. Representations (ICLR) , Vol.\u00a03. 1681\u20131693."},{"key":"e_1_3_3_1_6_2","first-page":"12449","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS)","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdel-rahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. In Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS). 12449\u201312460."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Yoshua Bengio Aaron Courville and Pascal Vincent. 2013. Representation Learning: A Review and New Perspectives. IEEE Trans. Pattern Anal. Mach. Intell. 35 8 (2013) 1798\u20131828.","DOI":"10.1109\/TPAMI.2013.50"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Shikhar Bharadwaj Samuele Cornell Kwanghee Choi et\u00a0al. 2025. OpenBEATs: A Fully Open-Source General-Purpose Audio Encoder. arXiv:https:\/\/arXiv.org\/abs\/2507.14129.","DOI":"10.1109\/WASPAA66052.2025.11230965"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao Jian Wu Long Zhou Shuo Ren Yanmin Qian Yao Qian Jian Wu Michael Zeng Xiangzhan Yu and Furu Wei. 2022. WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing. IEEE J. Sel. Top. Signal Process. 16 6 (2022) 1505\u20131518. doi:10.1109\/JSTSP.2022.3188113","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_3_1_10_2","first-page":"5178","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Chen Sanyuan","year":"2023","unstructured":"Sanyuan Chen, Yu Wu, Chengyi Wang, et\u00a0al. 2023. BEATs: Audio Pre-Training with Acoustic Tokenizers. In Proc. Int. Conf. Mach. Learn. (ICML) , Vol.\u00a0202. PMLR, 5178\u20135193."},{"key":"e_1_3_3_1_11_2","first-page":"3807","volume-title":"Proc. Int. Joint Conf. Artif. Intell. (IJCAI)","author":"Chen Wenxi","year":"2024","unstructured":"Wenxi Chen, Yuzhe Liang, Ziyang Ma, Zhisheng Zheng, and Xie Chen. 2024. EAT: Self-Supervised Pre-Training with Efficient Audio Transformer. In Proc. Int. Joint Conf. Artif. Intell. (IJCAI). International Joint Conferences on Artificial Intelligence Organization, 3807\u20133815. doi:10.24963\/ijcai.2024\/421Main Track."},{"key":"e_1_3_3_1_12_2","first-page":"47704","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS)","author":"Copet Jade","year":"2023","unstructured":"Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, et\u00a0al. 2023. Simple and Controllable Music Generation. In Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS) , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 47704\u201347720."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Mingyu Cui et\u00a0al. 2025. Exploring Cross-Utterance Speech Contexts for Conformer-Transducer Speech Recognition Systems. IEEE\/ACM Transactions on Audio Speech and Language Processing 33 (2025) 4168\u20134183. doi:10.1109\/TASLPRO.2025.3606235","DOI":"10.1109\/TASLPRO.2025.3606235"},{"key":"e_1_3_3_1_14_2","volume-title":"Proc. Conf. North American Chapter Assoc. Comput. Linguistics: Hum. Lang. Technol. (NAACL-HLT)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proc. Conf. North American Chapter Assoc. Comput. Linguistics: Hum. Lang. Technol. (NAACL-HLT)."},{"key":"e_1_3_3_1_15_2","first-page":"12010","volume-title":"Proc. Int. Conf. Learn. Representations (ICLR)","author":"Engel Jesse","year":"2020","unstructured":"Jesse Engel, Lamtharn Hantrakul, Chenjie Gu, and Adam Roberts. 2020. DDSP: Differentiable Digital Signal Processing. In Proc. Int. Conf. Learn. Representations (ICLR) , Vol.\u00a016. 12010\u201312028."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.5555\/1941897"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Eduardo Fonseca Xavier Favory Jordi Pons Frederic Font and Xavier Serra. 2022. FSD50K: An Open Dataset of Human-Labeled Sound Events. IEEE\/ACM Trans. Audio Speech Lang. Process. 30 (2022) 829\u2013852.","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Robert Geirhos J\u00f6rn-Henrik Jacobsen Claudio Michaelis Richard Zemel et\u00a0al. 2020. Shortcut Learning in Deep Neural Networks. Nat. Mach. Intell. 2 11 (2020) 665\u2013673.","DOI":"10.1038\/s42256-020-00257-z"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21315"},{"key":"e_1_3_3_1_24_2","first-page":"31068","volume-title":"Proc. Int. Conf. Learn. Representations (ICLR)","author":"Gong Yuan","year":"2023","unstructured":"Yuan Gong, Andrew Rouditchenko, Alexander\u00a0H. Liu, et\u00a0al. 2023. Contrastive Audio-Visual Masked Autoencoder. In Proc. Int. Conf. Learn. Representations (ICLR). 31068\u201331096."},{"key":"e_1_3_3_1_25_2","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS))","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill, Florian Strub, Florent Altch\u00e9, et\u00a0al. 2020. Bootstrap Your Own Latent: A New Approach to Self-Supervised Learning. In Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS))."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_3_1_27_2","first-page":"60","volume-title":"Proc. Int. Conf. Learn. Representations (ICLR)","author":"Higgins Irina","year":"2017","unstructured":"Irina Higgins, Loic Matthey, Arka Pal, et\u00a0al. 2017. beta-VAE: Learning Basic Visual Concepts with a Constrained Variational Framework. In Proc. Int. Conf. Learn. Representations (ICLR) , Vol.\u00a01. 60\u201381."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdelrahman Mohamed. 2021. HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29 (2021) 3451\u20133465.","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_3_1_29_2","first-page":"3103","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS)","author":"Huang Po-Yao","year":"2022","unstructured":"Po-Yao Huang, Hu Xu, Juncheng Li, Alexei Baevski, Michael Auli, Wei-Ning Wang, Luke Zettlemoyer, and Mathilde Caron. 2022. Masked autoencoders that listen. In Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS) , Vol.\u00a035. 3103\u20133116."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890881"},{"key":"e_1_3_3_1_31_2","unstructured":"Hirokatsu Kataoka Sora Takashima Ryo Hayamizu et\u00a0al. 2022. Pre-training Vision Transformers with Formula-driven Supervised Learning. arXiv:https:\/\/arXiv.org\/abs\/2206.09132."},{"key":"e_1_3_3_1_32_2","first-page":"2649","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Kim Hyunjik","year":"2018","unstructured":"Hyunjik Kim and Andriy Mnih. 2018. Disentangling by Factorising. In Proc. Int. Conf. Mach. Learn. (ICML) , Vol.\u00a080. PMLR, 2649\u20132658."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Qiuqiang Kong Yong Xu Mark\u00a0D Plumbley and Wenwu Wang. 2020. PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 28 (2020) 2880\u20132894.","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-227"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Alexander Kraskov Harald St\u00f6gbauer and Peter Grassberger. 2004. Estimating mutual information. Phys. Rev. E 69 6 (2004) 066138.","DOI":"10.1103\/PhysRevE.69.066138"},{"key":"e_1_3_3_1_36_2","first-page":"17409","volume-title":"Proc. Int. Conf. Learn. Representations (ICLR)","author":"Kreuk Felix","year":"2023","unstructured":"Felix Kreuk, Gabriel Synnaeve, Adam Polyak, Uriel Singer, et\u00a0al. 2023. AudioGen: Textually Guided Audio Generation. In Proc. Int. Conf. Learn. Representations (ICLR). 17409\u201317424."},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Xian Li Nian Shao and Xiaofei Li. 2024. Self-Supervised Audio Teacher-Student Transformer for Both Clip-Level and Frame-Level Tasks. IEEE\/ACM Trans. Audio Speech Lang. Process. 32 (2024) 1336\u20131351.","DOI":"10.1109\/TASLP.2024.3352248"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i38.40455"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1582"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Andy\u00a0T. Liu Shang-Wen Li and Hung-yi Lee. 2021. TERA: Self-Supervised Learning of Transformer Encoder Representation for Speech. IEEE\/ACM Trans. Audio Speech Lang. Process. 29 (2021) 2351\u20132366.","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"e_1_3_3_1_41_2","first-page":"21450","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, et\u00a0al. 2023. AudioLDM: Text-to-Audio Generation with Latent Diffusion Models. In Proc. Int. Conf. Mach. Learn. (ICML) , Vol.\u00a0202. PMLR, 21450\u201321474."},{"key":"e_1_3_3_1_42_2","first-page":"4114","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Locatello Francesco","year":"2019","unstructured":"Francesco Locatello, Stefan Bauer, Mario Lucic, et\u00a0al. 2019. Challenging common assumptions in the unsupervised learning of disentangled representations. In Proc. Int. Conf. Mach. Learn. (ICML) , Vol.\u00a097. PMLR, 4114\u20134124."},{"key":"e_1_3_3_1_43_2","first-page":"4061","volume-title":"Proc. Int. Conf. Learn. Representations (ICLR)","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Proc. Int. Conf. Learn. Representations (ICLR) , Vol.\u00a06. 4061\u20134078."},{"key":"e_1_3_3_1_44_2","series-title":"Proceedings of Machine Learning Research","first-page":"1","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS)","volume":"166","author":"Niizumi Daisuke","year":"2022","unstructured":"Daisuke Niizumi, Daiki Takeuchi, Yasunori Ohishi, et\u00a0al. 2022. Masked Spectrogram Modeling using Masked Autoencoders for Learning General-purpose Audio Representation. In Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS)(Proceedings of Machine Learning Research, Vol.\u00a0166). PMLR, 1\u201324."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Daisuke Niizumi Daiki Takeuchi Yasunori Ohishi et\u00a0al. 2023. BYOL for Audio: Exploring Pre-Trained General-Purpose Audio Representations. IEEE\/ACM Trans. Audio Speech Lang. Process. 31 (2023) 137\u2013151.","DOI":"10.1109\/TASLP.2022.3221007"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Julius Richter Simon Welker Jean-Marie Lemercier et\u00a0al. 2023. Speech Enhancement and Dereverberation With Diffusion-Based Generative Models. IEEE\/ACM Trans. Audio Speech Lang. Process. 31 (2023) 2351\u20132364.","DOI":"10.1109\/TASLP.2023.3285241"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413528"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"crossref","unstructured":"Justin Salamon and Juan\u00a0Pablo Bello. 2017. Deep Convolutional Neural Networks and Data Augmentation for Environmental Sound Classification. IEEE Signal Process Lett. 24 3 (2017) 279\u2013283. doi:10.1109\/LSP.2017.2657381","DOI":"10.1109\/LSP.2017.2657381"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2017.8170052"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01782"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2017.8202133"},{"key":"e_1_3_3_1_57_2","unstructured":"Aaron van\u00a0den Oord Yazhe Li and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. arXiv:https:\/\/arXiv.org\/abs\/1807.03748."},{"key":"e_1_3_3_1_58_2","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS)","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, et\u00a0al. 2017. Attention is all you need. In Proc. Int. Conf. Neural Inf. Process. Syst. (NeurIPS)."},{"key":"e_1_3_3_1_59_2","first-page":"1","volume-title":"Proc. IEEE Int. Conf. Acoust. Speech Signal Process. (ICASSP)","author":"Wang Helin","year":"2023","unstructured":"Helin Wang, Yuexian Zou, and Wenwu Wang. 2023. Masked Spectrogram Prediction for Self-Supervised Audio Pre-Training. In Proc. IEEE Int. Conf. Acoust. Speech Signal Process. (ICASSP). IEEE, 1\u20135. doi:10.1109\/ICASSP49357.2023.10095691"},{"key":"e_1_3_3_1_60_2","unstructured":"Pete Warden. 2018. Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition. arXiv:https:\/\/arXiv.org\/abs\/1804.03209."},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10653"},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"crossref","unstructured":"Ryan Whetten Titouan Parcollet Marco Dinarelli and Yannick Est\u00e8ve. 2026. A Study of Data Selection Strategies for Pre-training Self-Supervised Speech Models. arXiv:https:\/\/arXiv.org\/abs\/2601.20896.","DOI":"10.1109\/ICASSP55912.2026.11463465"},{"key":"e_1_3_3_1_63_2","first-page":"10524","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Xiong Ruibin","year":"2020","unstructured":"Ruibin Xiong, Yunchang Yang, Di He, et\u00a0al. 2020. On layer normalization in the transformer architecture. In Proc. Int. Conf. Mach. Learn. (ICML) , Vol.\u00a0119. PMLR, 10524\u201310533."},{"key":"e_1_3_3_1_64_2","first-page":"2866","volume-title":"Proc. Int. Conf. Learn. Representations (ICLR)","author":"Zhang Hongyi","year":"2018","unstructured":"Hongyi Zhang, Moustapha Cisse, Yann\u00a0N. Dauphin, and David Lopez-Paz. 2018. mixup: Beyond Empirical Risk Minimization. In Proc. Int. Conf. Learn. Representations (ICLR) , Vol.\u00a04. 2866\u20132878."},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"crossref","unstructured":"Ziqiang Zhang Sanyuan Chen Long Zhou Yu Wu Shuo Ren Shujie Liu Zhuoyuan Yao Xun Gong Lirong Dai Jinyu Li and Furu Wei. 2024. SpeechLM: Enhanced Speech Pre-Training With Unpaired Textual Data. IEEE\/ACM Transactions on Audio Speech and Language Processing 32 (2024) 2177\u20132187. doi:10.1109\/TASLP.2024.3379877","DOI":"10.1109\/TASLP.2024.3379877"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:49:52Z","timestamp":1781534992000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810789"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":64,"alternative-id":["10.1145\/3805622.3810789","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810789","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}