{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T03:47:21Z","timestamp":1770695241307,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":35,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819569564","type":"print"},{"value":"9789819569571","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-6957-1_7","type":"book-chapter","created":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T10:44:52Z","timestamp":1770633892000},"page":"89-103","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Integrating Symbolic and\u00a0Waveform Music Into Large Language Models"],"prefix":"10.1007","author":[{"given":"Teng","family":"Tu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaohao","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunshan","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ji","family":"Qi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,10]]},"reference":[{"key":"7_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"7_CR2","unstructured":"Agostinelli, A., Denk, T.I., Borsos, Z., Engel, J.H., Verzetti, M., Caillon, A., et\u00a0al.: MusiCLM: generating music from text. CoRR abs\/2301.11325 (2023)"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Borsos, Z., Marinier, R., Vincent, D., Kharitonov, E., Pietquin, O., Sharifi, M., et\u00a0al.: AudioLM: a language modeling approach to audio generation. IEEE ACM Trans. Audio Speech Lang. Process. 31, 2523\u20132533 (2023)","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"7_CR4","unstructured":"Chou, Y., Chen, I., Chang, C., Ching, J., Yang, Y.: MidiBERT-piano: large-scale pre-training for symbolic music understanding. CoRR abs\/2107.05223 (2021)"},{"key":"7_CR5","unstructured":"Chu, Y., Xu, J., Zhou, X., Yang, Q., Zhang, S., Yan, Z., et\u00a0al.: QWEN-audio: advancing universal audio understanding via unified large-scale audio-language models. CoRR abs\/2311.07919 (2023)"},{"key":"7_CR6","unstructured":"Copet, J., Kreuk, F., Gat, I., Remez, T., Kant, D., Synnaeve, G., et\u00a0al.: Simple and controllable music generation. In: NeurIPS (2023)"},{"key":"7_CR7","unstructured":"D\u00e9fossez, A., Copet, J., Synnaeve, G., Adi, Y.: High fidelity neural audio compression. Trans. Mach. Learn. Res. (2023)"},{"key":"7_CR8","unstructured":"Dhariwal, P., Jun, H., Payne, C., Kim, J.W., Radford, A., Sutskever, I.: Jukebox: a generative model for music. CoRR abs\/2005.00341 (2020)"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Elizalde, B., Deshmukh, S., Ismail, M.A., Wang, H.: CLAP learning audio concepts from natural language supervision. In: IEEE ICASSP, pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Deng, Z., Ma, Y., Liu, Y., Guo, R., Zhang, G., Chen, W., et\u00a0al.: Musilingo: bridging music and text with pre-trained language models for music captioning and query response. In: NAACL-HLT (Findings), pp. 3643\u20133655. Association for Computational Linguistics (2024)","DOI":"10.18653\/v1\/2024.findings-naacl.231"},{"key":"7_CR11","unstructured":"Gardner, J., Durand, S., Stoller, D., Bittner, R.M.: LLARK: a multimodal foundation model for music. In: ICML (2024)"},{"key":"7_CR12","doi-asserted-by":"crossref","unstructured":"Gong, Y., Chung, Y., Glass, J.R.: AST: audio spectrogram transformer. In: Interspeech, pp. 571\u2013575. ISCA (2021)","DOI":"10.21437\/Interspeech.2021-698"},{"key":"7_CR13","unstructured":"Gong, Y., Luo, H., Liu, A.H., Karlinsky, L., Glass, J.R.: Listen, think, and understand. In: ICLR (2024)"},{"key":"7_CR14","unstructured":"Gutmann, M., Hyv\u00e4rinen, A.: Noise-contrastive estimation: a new estimation principle for unnormalized statistical models. In: AISTATS. JMLR Proceedings, vol.\u00a09, pp. 297\u2013304. JMLR.org (2010)"},{"key":"7_CR15","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: ImageBind: one embedding space to bind them all. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Hsiao, W., Liu, J., Yeh, Y., Yang, Y.: Compound word transformer: learning to compose full-song music over dynamic directed hypergraphs. In: AAAI, pp. 178\u2013186. AAAI Press (2021)","DOI":"10.1609\/aaai.v35i1.16091"},{"key":"7_CR17","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., et\u00a0al.: LoRA: low-rank adaptation of large language models. In: ICLR (2022)"},{"key":"7_CR18","unstructured":"Huang, C.A., Vaswani, A., Uszkoreit, J., Simon, I., Hawthorne, C., Shazeer, N., et\u00a0al.: Music transformer: generating music with long-term structure. In: ICLR (2019)"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Huang, Y., Yang, Y.: Pop music transformer: beat-based modeling and generation of expressive pop piano compositions. In: ACM Multimedia, pp. 1180\u20131188. ACM (2020)","DOI":"10.1145\/3394171.3413671"},{"key":"7_CR20","unstructured":"Kong, Z., Goel, A., Badlani, R., Ping, W., Valle, R., Catanzaro, B.: Audio flamingo: a novel audio language model with few-shot learning and dialogue abilities. In: ICML (2024)"},{"key":"7_CR21","unstructured":"Li, Y., Yuan, R., Zhang, G., Ma, Y., Chen, X., Yin, H., et\u00a0al.: MERT: acoustic music understanding model with large-scale self-supervised training. In: ICLR (2024)"},{"key":"7_CR22","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"7_CR23","doi-asserted-by":"crossref","unstructured":"Liu, S., Hussain, A.S., Sun, C., Shan, Y.: Music understanding llama: advancing text-to-music generation with question answering and captioning. In: ICASSP, pp. 286\u2013290. IEEE (2024)","DOI":"10.1109\/ICASSP48485.2024.10447027"},{"key":"7_CR24","unstructured":"Liu, X., Xia, X., Huang, Z., Chua, T.-S.: Towards modality generalization: a benchmark and prospective analysis. CoRR abs\/2412.18277 (2024)"},{"key":"7_CR25","unstructured":"Liu, X., Tu, T., Ma, Y., Chua, T.-S.: Extending visual dynamics for video-to-music generation. CoRR abs\/2504.07594 (2025)"},{"key":"7_CR26","unstructured":"Manco, I., Weck, B., Doh, S., Won, M., Zhang, Y., Bogdanov, D., et\u00a0al.: The song describer dataset: a corpus of audio captions for music-and-language evaluation. CoRR abs\/2311.10057 (2023)"},{"key":"7_CR27","unstructured":"Melechovsk\u00fd, J., Roy, A., Herremans, D.: MidiCaps - a large-scale MIDI dataset with text captions. In: ISMIR, pp.\u00a0858\u2013865. ISMIR (2024)"},{"key":"7_CR28","unstructured":"van\u00a0den Oord, A., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., et\u00a0al.: WaveNet: a generative model for raw audio. In: SSW, p.\u00a0125. ISCA (2016)"},{"key":"7_CR29","unstructured":"Qu, X., Bai, Y., Ma, Y., Zhou, Z., Lo, K.M., Liu, J., et\u00a0al.: MUPT: a generative symbolic music pretrained transformer. In: ICLR (2025)"},{"key":"7_CR30","doi-asserted-by":"crossref","unstructured":"Wu, S., Guo, Z., Yuan, R., Jiang, J., Doh, S., Xia, G., et\u00a0al.: Clamp 3: universal music information retrieval across unaligned modalities and unseen languages. In: ACL (Findings), pp.\u00a02605\u20132625. Association for Computational Linguistics (2025)","DOI":"10.18653\/v1\/2025.findings-acl.133"},{"key":"7_CR31","unstructured":"Wu, S., Wang, Y., Li, X., Yu, F., Sun, M.: MeloDyt5: a unified score-to-score transformer for symbolic music processing. In: ISMIR, pp.\u00a0642\u2013650. ISMIR (2024)"},{"key":"7_CR32","doi-asserted-by":"crossref","unstructured":"Wu, S., Wang, Y., Yuan, R., Guo, Z., Tan, X., Zhang, G., et\u00a0al.: Clamp 2: multimodal music information retrieval across 101 languages using large language models. In: NAACL (Findings), pp. 435\u2013451. Association for Computational Linguistics (2025)","DOI":"10.18653\/v1\/2025.findings-naacl.27"},{"key":"7_CR33","doi-asserted-by":"crossref","unstructured":"Yuan, R., Lin, H., Wang, Y., Tian, Z., Wu, S., Shen, T., et\u00a0al.: ChatMusician: understanding and generating music intrinsically with LLM. In: ACL, pp. 6252\u20136271. Association for Computational Linguistics (2024)","DOI":"10.18653\/v1\/2024.findings-acl.373"},{"key":"7_CR34","doi-asserted-by":"crossref","unstructured":"Zeghidour, N., Luebs, A., Omran, A., Skoglund, J., Tagliasacchi, M.: SoundStream: an end-to-end neural audio codec. IEEE ACM Trans. Audio Speech Lang. Process. 30, 495\u2013507 (2022)","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"7_CR35","doi-asserted-by":"crossref","unstructured":"Zeng, M., Tan, X., Wang, R., Ju, Z., Qin, T., Liu, T.: MusicBERT: symbolic music understanding with large-scale pre-training. In: ACL\/IJCNLP (Findings), pp. 791\u2013800. Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2021.findings-acl.70"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-6957-1_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T10:45:06Z","timestamp":1770633906000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-6957-1_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819569564","9789819569571"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-6957-1_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"10 February 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Prague","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Czech Republic","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 January 2026","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"32","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2026.cz\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}