{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:05:21Z","timestamp":1775199921594,"version":"3.50.1"},"reference-count":25,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434692","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-4","source":"Crossref","is-referenced-by-count":0,"title":["The T12 System for AudioMOS Challenge 2025: Audio Aesthetics Score Prediction System Using KAN- and VERSA-based Models"],"prefix":"10.1109","author":[{"given":"Katsuhiko","family":"Yamamoto","sequence":"first","affiliation":[{"name":"CyberAgent,Tokyo,Japan"}]},{"given":"Koichi","family":"Miyazaki","sequence":"additional","affiliation":[{"name":"CyberAgent,Tokyo,Japan"}]},{"given":"Shogo","family":"Seki","sequence":"additional","affiliation":[{"name":"CyberAgent,Tokyo,Japan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-970"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389763"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832295"},{"key":"ref4","article-title":"Meta audiobox aesthetics: Unified automatic quality assessment for speech, music, and sound","author":"Tjandra","year":"2025","journal-title":"arXiv:2502.05139[cs.SD]"},{"key":"ref5","article-title":"KAN: Kolmogorov-arnold networks","volume-title":"The Thirteenth International Conference on Learning Representations (ICLR)","author":"Liu"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-demo.19"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref8","article-title":"Kolmogorov-arnold transformer","volume-title":"The Thirteenth International Conference on Learning Representations (ICLR)","author":"Yang"},{"key":"ref9","article-title":"An image is worth 16 \u00d7 16 words: Transformers for image recognition at scale","volume-title":"The Ninth International Conference on Learning Representations (ICLR)","author":"Dosovitskiy"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414878"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-299"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832315"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096680"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-325"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref17","first-page":"4218","article-title":"Common voice: A massively-multilingual speech corpus","volume-title":"The Twelfth Language Resources and Evaluation Conference (LREC)","author":"Ardila"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-153"},{"key":"ref19","article-title":"Musdb18-hq - an uncompressed version of musdb18","author":"Rafii","year":"2019"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref21","article-title":"Musiclm: Generating music from text","author":"Agostinelli","year":"2023","journal-title":"arXiv:2301.11325[cs.SD]"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0320"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330701"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1800"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11034"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434692.pdf?arnumber=11434692","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:58:19Z","timestamp":1775192299000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434692\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":25,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434692","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}