{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,22]],"date-time":"2026-02-22T07:01:02Z","timestamp":1771743662610,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":27,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819570805","type":"print"},{"value":"9789819570812","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-7081-2_28","type":"book-chapter","created":{"date-parts":[[2026,2,22]],"date-time":"2026-02-22T06:45:47Z","timestamp":1771742747000},"page":"430-445","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Text2Omni: A Text-Only Training Strategy for\u00a0MLLMs"],"prefix":"10.1007","author":[{"given":"Junxin","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifu","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zishan","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fengyu","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siyue","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siyan","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lihua","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,23]]},"reference":[{"key":"28_CR1","unstructured":"Agostinelli, A., et\u00a0al.: MusicLM: generating music from text. arXiv preprint arXiv:2301.11325 (2023)"},{"key":"28_CR2","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: ShareGPT4V: improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"28_CR3","unstructured":"Chu, Y., et al.: Qwen-audio: advancing universal audio understanding via unified large-scale audio-language pre-training. arXiv preprint arXiv:2311.07919 (2023)"},{"key":"28_CR4","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: VirTex: learning visual representations from textual annotations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11162\u201311173 (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"28_CR5","unstructured":"Donahue, C., McAuley, J., Puckette, M.: Adversarial audio synthesis. arXiv preprint arXiv:1802.04208 (2019)"},{"key":"28_CR6","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2022)"},{"key":"28_CR7","first-page":"6827","volume":"34","author":"X Huang","year":"2021","unstructured":"Huang, X., Zhang, Y., Liang, P.: What makes for good views for contrastive learning? Adv. Neural. Inf. Process. Syst. 34, 6827\u20136839 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"28_CR8","unstructured":"Li, J., Li, D., Silvio, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"28_CR9","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: Vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"28_CR10","first-page":"17612","volume":"35","author":"W Liang","year":"2022","unstructured":"Liang, W., Zhang, Y., Kwon, Y., Yeung, S., Zou, J.: Mind the gap: understanding the modality gap in multi-modal contrastive representation learning. Adv. Neural. Inf. Process. Syst. 35, 17612\u201317625 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"28_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"28_CR12","unstructured":"Liu, H., et al.: AudioLDM: text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503 (2023)"},{"key":"28_CR13","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"28_CR14","unstructured":"OpenAI: GPT-4V(ision) system card. arXiv preprint arXiv:2310.12809 (2023)"},{"key":"28_CR15","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. arXiv preprint arXiv:2103.00020 (2021)"},{"key":"28_CR16","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. arXiv preprint arXiv:2212.04356 (2023)"},{"key":"28_CR17","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"28_CR18","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. arXiv preprint arXiv:2102.12092 (2021)"},{"key":"28_CR19","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"28_CR20","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics, vol. 1, pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"28_CR21","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: FLAVA: a foundational language and vision alignment model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15638\u201315650 (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"28_CR22","unstructured":"Tang, C., et al.: SALMONN: towards generic hearing abilities for large language models. arXiv preprint arXiv:2310.13289 (2023)"},{"key":"28_CR23","unstructured":"Wang, P., et al.: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. arXiv preprint arXiv:2202.03052 (2022)"},{"key":"28_CR24","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: BEIT pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"28_CR25","doi-asserted-by":"crossref","unstructured":"Wang, Y., et\u00a0al.: Tacotron: towards end-to-end speech synthesis. arXiv preprint arXiv:1703.10135 (2017)","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"28_CR26","unstructured":"Zeng, Y., Zhang, X., Li, H.: Multi-grained vision language pre-training: aligning texts with visual concepts. arXiv preprint arXiv:2111.08276 (2022)"},{"key":"28_CR27","unstructured":"Zhang, Y., HaoChen, J.Z., Huang, S.C., Wang, K.C., Zou, J., Yeung, S.: Diagnosing and rectifying vision models using language. arXiv preprint arXiv:2302.04269 (2023)"}],"container-title":["Lecture Notes in Computer Science","PRICAI 2025: Trends in Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-7081-2_28","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,22]],"date-time":"2026-02-22T06:45:54Z","timestamp":1771742754000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-7081-2_28"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819570805","9789819570812"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-7081-2_28","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"23 February 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRICAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific Rim International Conference on Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wellington","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"New Zealand","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 November 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pricai2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.pricai.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}