{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T00:06:57Z","timestamp":1765498017862,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761264","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T23:55:33Z","timestamp":1762559733000},"page":"1249-1259","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Rethinking the Training Paradigm of Discrete Token-Based Multimodal LLMs: An Analysis of Text-Centric Bias"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8957-0223","authenticated-orcid":false,"given":"Wansik","family":"Jo","sequence":"first","affiliation":[{"name":"Meritzfire, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7870-8681","authenticated-orcid":false,"given":"Jooyeong","family":"Na","sequence":"additional","affiliation":[{"name":"Ajou University, Suwon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8766-3879","authenticated-orcid":false,"given":"Soyeon","family":"Hong","sequence":"additional","affiliation":[{"name":"Ajou University, Suwon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3570-0907","authenticated-orcid":false,"given":"Seungtaek","family":"Choi","sequence":"additional","affiliation":[{"name":"Hankuk University of Foreign Studies, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9134-1921","authenticated-orcid":false,"given":"Hyunsouk","family":"Cho","sequence":"additional","affiliation":[{"name":"Ajou University, Suwon, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325","author":"Agostinelli Andrea","year":"2023","unstructured":"Andrea Agostinelli, Timo I Denk, Zal\u00e1n Borsos, Jesse Engel, Mauro Verzetti, Antoine Caillon, Qingqing Huang, Aren Jansen, Adam Roberts, Marco Tagliasacchi, et al., 2023. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_3_1","volume-title":"See it from my perspective: Diagnosing the western cultural bias of large vision-language models in image understanding. arXiv preprint arXiv:2406.11665","author":"Ananthram Amith","year":"2024","unstructured":"Amith Ananthram, Elias Stengel-Eskin, Carl Vondrick, Mohit Bansal, and Kathleen McKeown. 2024. See it from my perspective: Diagnosing the western cultural bias of large vision-language models in image understanding. arXiv preprint arXiv:2406.11665 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Jae Sung Park, et al","author":"Awadalla Anas","year":"2024","unstructured":"Anas Awadalla, Le Xue, Manli Shu, An Yan, Jun Wang, Senthil Purushwalkam, Sheng Shen, Hannah Lee, Oscar Lo, Jae Sung Park, et al., 2024. BLIP3-KALE: Knowledge Augmented Large-Scale Dense Captions. arXiv preprint arXiv:2411.07461 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"International conference on machine learning. PMLR, 528-539","author":"Bahng Hyojin","year":"2020","unstructured":"Hyojin Bahng, Sanghyuk Chun, Sangdoo Yun, Jaegul Choo, and Seong Joon Oh. 2020. Learning de-biased representations with biased representations. In International conference on machine learning. PMLR, 528-539."},{"key":"e_1_3_2_1_6_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966, Vol. 1, 2 (2023), 3."},{"key":"e_1_3_2_1_7_1","volume-title":"A note on the inception score. arXiv preprint arXiv:1801.01973","author":"Barratt Shane","year":"2018","unstructured":"Shane Barratt and Rishi Sharma. 2018. A note on the inception score. arXiv preprint arXiv:1801.01973 (2018)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00254"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445922"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.5555\/146680.146685"},{"key":"e_1_3_2_1_11_1","volume-title":"Conference on fairness, accountability and transparency. PMLR, 77-91","author":"Buolamwini Joy","year":"2018","unstructured":"Joy Buolamwini and Timnit Gebru. 2018. Gender shades: Intersectional accuracy disparities in commercial gender classification. In Conference on fairness, accountability and transparency. PMLR, 77-91."},{"key":"e_1_3_2_1_12_1","volume-title":"Rubi: Reducing unimodal biases for visual question answering. Advances in neural information processing systems","author":"Cadene Remi","year":"2019","unstructured":"Remi Cadene, Corentin Dancette, Matthieu Cord, Devi Parikh, et al., 2019. Rubi: Reducing unimodal biases for visual question answering. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_13_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01124"},{"key":"e_1_3_2_1_15_1","unstructured":"Michael Han Daniel Han and Unsloth team. 2023. Unsloth. http:\/\/github.com\/unslothai\/unsloth"},{"key":"e_1_3_2_1_16_1","volume-title":"FMA: A dataset for music analysis. arXiv preprint arXiv:1612.01840","author":"Defferrard Micha\u00ebl","year":"2016","unstructured":"Micha\u00ebl Defferrard, Kirell Benzi, Pierre Vandergheynst, and Xavier Bresson. 2016. FMA: A dataset for music analysis. arXiv preprint arXiv:1612.01840 (2016)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_18_1","volume-title":"Lp-musiccaps: Llm-based pseudo music captioning. arXiv preprint arXiv:2307.16372","author":"Doh SeungHeon","year":"2023","unstructured":"SeungHeon Doh, Keunwoo Choi, Jongpil Lee, and Juhan Nam. 2023. Lp-musiccaps: Llm-based pseudo music captioning. arXiv preprint arXiv:2307.16372 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_1_20_1","volume-title":"Keith Achorn, Anjali Gopi, David Kanter, Maximilian Lam, Mark Mazumder, and Vijay Janapa Reddi.","author":"Galvez Daniel","year":"2021","unstructured":"Daniel Galvez, Greg Diamos, Juan Ciro, Juan Felipe Cer\u00f3n, Keith Achorn, Anjali Gopi, David Kanter, Maximilian Lam, Mark Mazumder, and Vijay Janapa Reddi. 2021. The people's speech: A large-scale diverse english speech recognition dataset for commercial usage. arXiv preprint arXiv:2111.09344 (2021)."},{"key":"e_1_3_2_1_21_1","volume-title":"Making llama see and draw with seed tokenizer. arXiv preprint arXiv:2310.01218","author":"Ge Yuying","year":"2023","unstructured":"Yuying Ge, Sijie Zhao, Ziyun Zeng, Yixiao Ge, Chen Li, Xintao Wang, and Ying Shan. 2023. Making llama see and draw with seed tokenizer. arXiv preprint arXiv:2310.01218 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Transformer feed-forward layers are key-value memories. arXiv preprint arXiv:2012.14913","author":"Geva Mor","year":"2020","unstructured":"Mor Geva, Roei Schuster, Jonathan Berant, and Omer Levy. 2020. Transformer feed-forward layers are key-value memories. arXiv preprint arXiv:2012.14913 (2020)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_24_1","volume-title":"Infinity-mm: Scaling multimodal performance with large-scale and high-quality instruction data. arXiv preprint arXiv:2410.18558","author":"Gu Shuhao","year":"2024","unstructured":"Shuhao Gu, Jialing Zhang, Siyuan Zhou, Kevin Yu, Zhaohu Xing, Liangdong Wang, Zhou Cao, Jintao Jia, Zhuoyi Zhang, Yixuan Wang, et al., 2024. Infinity-mm: Scaling multimodal performance with large-scale and high-quality instruction data. arXiv preprint arXiv:2410.18558 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Bias in large language models: Origin, evaluation, and mitigation. arXiv preprint arXiv:2411.10915","author":"Guo Yufei","year":"2024","unstructured":"Yufei Guo, Muzhe Guo, Juntao Su, Zhou Yang, Mengqiu Zhu, Hongfei Li, Mengyang Qiu, and Shuo Shuo Liu. 2024. Bias in large language models: Origin, evaluation, and mitigation. arXiv preprint arXiv:2411.10915 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Miner: Mining the underlying pattern of modality-specific neurons in multimodal large language models. arXiv preprint arXiv:2410.04819","author":"Huang Kaichen","year":"2024","unstructured":"Kaichen Huang, Jiahao Huo, Yibo Yan, Kun Wang, Yutao Yue, and Xuming Hu. 2024. Miner: Mining the underlying pattern of modality-specific neurons in multimodal large language models. arXiv preprint arXiv:2410.04819 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. PMLR, 13916-13932","author":"Huang Rongjie","year":"2023","unstructured":"Rongjie Huang, Jiawei Huang, Dongchao Yang, Yi Ren, Luping Liu, Mingze Li, Zhenhui Ye, Jinglin Liu, Xiang Yin, and Zhou Zhao. 2023. Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models. In International Conference on Machine Learning. PMLR, 13916-13932."},{"key":"e_1_3_2_1_28_1","volume-title":"How large language models encode context knowledge? a layer-wise probing study. arXiv preprint arXiv:2402.16061","author":"Ju Tianjie","year":"2024","unstructured":"Tianjie Ju, Weiwei Sun, Wei Du, Xinwei Yuan, Zhaochun Ren, and Gongshen Liu. 2024. How large language models encode context knowledge? a layer-wise probing study. arXiv preprint arXiv:2402.16061 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054721"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00159"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02528"},{"key":"e_1_3_2_1_32_1","volume-title":"Last layer re-training is sufficient for robustness to spurious correlations. arXiv preprint arXiv:2204.02937","author":"Kirichenko Polina","year":"2022","unstructured":"Polina Kirichenko, Pavel Izmailov, and Andrew Gordon Wilson. 2022. Last layer re-training is sufficient for robustness to spurious correlations. arXiv preprint arXiv:2204.02937 (2022)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1915768117"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Hao Li Changyao Tian Jie Shao Xizhou Zhu Zhaokai Wang Jinguo Zhu Wenhan Dou Xiaogang Wang Hongsheng Li Lewei Lu et al. 2024a. SynerGen-VL: Towards Synergistic Image Understanding and Generation with Vision Experts and Token Folding. arXiv preprint arXiv:2412.09604 (2024).","DOI":"10.1109\/CVPR52734.2025.02771"},{"key":"e_1_3_2_1_35_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.265"},{"key":"e_1_3_2_1_38_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447027"},{"key":"e_1_3_2_1_40_1","volume-title":"Unified-io: A unified model for vision, language, and multi-modal tasks. arXiv preprint arXiv:2206.08916","author":"Lu Jiasen","year":"2022","unstructured":"Jiasen Lu, Christopher Clark, Rowan Zellers, Roozbeh Mottaghi, and Aniruddha Kembhavi. 2022. Unified-io: A unified model for vision, language, and multi-modal tasks. arXiv preprint arXiv:2206.08916 (2022)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.nuse-1.5"},{"key":"e_1_3_2_1_42_1","volume-title":"Locating and editing factual associations in gpt. Advances in neural information processing systems","author":"Meng Kevin","year":"2022","unstructured":"Kevin Meng, David Bau, Alex Andonian, and Yonatan Belinkov. 2022. Locating and editing factual associations in gpt. Advances in neural information processing systems, Vol. 35 (2022), 17359-17372."},{"key":"e_1_3_2_1_43_1","volume-title":"NISQA: A deep CNN-self-attention model for multidimensional speech quality prediction with crowdsourced datasets. arXiv preprint arXiv:2104.09494","author":"Mittag Gabriel","year":"2021","unstructured":"Gabriel Mittag, Babak Naderi, Assmaa Chehadi, and Sebastian Moller. 2021. NISQA: A deep CNN-self-attention model for multidimensional speech quality prediction with crowdsourced datasets. arXiv preprint arXiv:2104.09494 (2021)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1057\/s41599-024-03609-x"},{"key":"e_1_3_2_1_47_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_48_1","volume-title":"International conference on machine learning. PMLR, 28492-28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492-28518."},{"key":"e_1_3_2_1_49_1","volume-title":"Overcoming language priors in visual question answering with adversarial regularization. Advances in neural information processing systems","author":"Ramakrishnan Sainandan","year":"2018","unstructured":"Sainandan Ramakrishnan, Aishwarya Agrawal, and Stefan Lee. 2018. Overcoming language priors in visual question answering with adversarial regularization. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_50_1","volume-title":"COMET: A neural framework for MT evaluation. arXiv preprint arXiv:2009.09025","author":"Rei Ricardo","year":"2020","unstructured":"Ricardo Rei, Craig Stewart, Ana C Farinha, and Alon Lavie. 2020. COMET: A neural framework for MT evaluation. arXiv preprint arXiv:2009.09025 (2020)."},{"key":"e_1_3_2_1_51_1","volume-title":"Language-specific neurons: The key to multilingual capabilities in large language models. arXiv preprint arXiv:2402.16438","author":"Tang Tianyi","year":"2024","unstructured":"Tianyi Tang, Wenyang Luo, Haoyang Huang, Dongdong Zhang, Xiaolei Wang, Xin Zhao, Furu Wei, and Ji-Rong Wen. 2024. Language-specific neurons: The key to multilingual capabilities in large language models. arXiv preprint arXiv:2402.16438 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"Chameleon: Mixed-modal early-fusion foundation models. arXiv preprint arXiv:2405.09818","author":"Team Chameleon","year":"2024","unstructured":"Chameleon Team. 2024. Chameleon: Mixed-modal early-fusion foundation models. arXiv preprint arXiv:2405.09818 (2024)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"e_1_3_2_1_54_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems Vol. 30 (2017)."},{"key":"e_1_3_2_1_55_1","unstructured":"Xinlong Wang Xiaosong Zhang Zhengxiong Luo Quan Sun Yufeng Cui Jinsheng Wang Fan Zhang Yueze Wang Zhen Li Qiying Yu et al. 2024a. Emu3: Next-token prediction is all you need. arXiv preprint arXiv:2409.18869 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"Mio: A foundation model on multimodal tokens. arXiv preprint arXiv:2409.17692","author":"Wang Zekun","year":"2024","unstructured":"Zekun Wang, King Zhu, Chunpu Xu, Wangchunshu Zhou, Jiaheng Liu, Yibo Zhang, Jiashuo Wang, Ning Shi, Siyu Li, Yizhi Li, et al., 2024b. Mio: A foundation model on multimodal tokens. arXiv preprint arXiv:2409.17692 (2024)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Maurice Weber Dan Fu Quentin Anthony Yonatan Oren Shane Adams Anton Alexandrov Xiaozhong Lyu Huu Nguyen Xiaozhe Yao Virginia Adams et al. 2024. Redpajama: an open dataset for training large language models. Advances in neural information processing systems Vol. 37 (2024) 116462-116492.","DOI":"10.52202\/079017-3697"},{"key":"e_1_3_2_1_58_1","unstructured":"Laura Weidinger John Mellor Maribeth Rauh Conor Griffin Jonathan Uesato Po-Sen Huang Myra Cheng Mia Glaese Borja Balle Atoosa Kasirzadeh et al. 2021. Ethical and social risks of harm from language models. arXiv preprint arXiv:2112.04359 (2021)."},{"key":"e_1_3_2_1_59_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024. Next-gpt: Any-to-any multimodal llm. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_61_1","volume-title":"Towards Unifying Understanding and Generation in the Era of Vision Foundation Models: A Survey from the Autoregression Perspective. arXiv preprint arXiv:2410.22217","author":"Xie Shenghao","year":"2024","unstructured":"Shenghao Xie, Wenqiang Zu, Mingyang Zhao, Duo Su, Shilong Liu, Ruohua Shi, Guoqi Li, Shanghang Zhang, and Lei Ma. 2024. Towards Unifying Understanding and Generation in the Era of Vision Foundation Models: A Survey from the Autoregression Perspective. arXiv preprint arXiv:2410.22217 (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"Vision: A Survey. arXiv preprint arXiv:2411.05902","author":"Xiong Jing","year":"2024","unstructured":"Jing Xiong, Gongye Liu, Lun Huang, Chengyue Wu, Taiqiang Wu, Yao Mu, Yuan Yao, Hui Shen, Zhongwei Wan, Jinfa Huang, et al., 2024. Autoregressive Models in Vision: A Survey. arXiv preprint arXiv:2411.05902 (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the ieee\/cvf conference on computer vision and pattern recognition. 13040-13051","author":"Ye Qinghao","year":"2024","unstructured":"Qinghao Ye, Haiyang Xu, Jiabo Ye, Ming Yan, Anwen Hu, Haowei Liu, Qi Qian, Ji Zhang, and Fei Huang. 2024. mplug-owl2: Revolutionizing multi-modal large language model with modality collaboration. In Proceedings of the ieee\/cvf conference on computer vision and pattern recognition. 13040-13051."},{"key":"e_1_3_2_1_64_1","volume-title":"Neuron-level knowledge attribution in large language models. arXiv preprint arXiv:2312.12141","author":"Yu Zeping","year":"2023","unstructured":"Zeping Yu and Sophia Ananiadou. 2023. Neuron-level knowledge attribution in large language models. arXiv preprint arXiv:2312.12141 (2023)."},{"key":"e_1_3_2_1_65_1","volume-title":"Anygpt: Unified multimodal llm with discrete sequence modeling. arXiv preprint arXiv:2402.12226","author":"Zhan Jun","year":"2024","unstructured":"Jun Zhan, Junqi Dai, Jiasheng Ye, Yunhua Zhou, Dong Zhang, Zhigeng Liu, Xin Zhang, Ruibin Yuan, Ge Zhang, Linyang Li, et al., 2024. Anygpt: Unified multimodal llm with discrete sequence modeling. arXiv preprint arXiv:2402.12226 (2024)."},{"key":"e_1_3_2_1_66_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Jiahua Dong, Chenxing Li, Dan Su, Chenhui Chu, and Dong Yu. 2024. Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601 (2024)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00533"},{"key":"e_1_3_2_1_68_1","volume-title":"Speechtokenizer: Unified speech tokenizer for speech large language models. arXiv preprint arXiv:2308.16692","author":"Zhang Xin","year":"2023","unstructured":"Xin Zhang, Dong Zhang, Shimin Li, Yaqian Zhou, and Xipeng Qiu. 2023. Speechtokenizer: Unified speech tokenizer for speech large language models. arXiv preprint arXiv:2308.16692 (2023)."},{"key":"e_1_3_2_1_69_1","volume-title":"Gender bias in coreference resolution: Evaluation and debiasing methods. arXiv preprint arXiv:1804.06876","author":"Zhao Jieyu","year":"2018","unstructured":"Jieyu Zhao, Tianlu Wang, Mark Yatskar, Vicente Ordonez, and Kai-Wei Chang. 2018. Gender bias in coreference resolution: Evaluation and debiasing methods. arXiv preprint arXiv:1804.06876 (2018)."},{"key":"e_1_3_2_1_70_1","volume-title":"LibriSQA: A Novel Dataset and Framework for Spoken Question Answering with Large Language Models","author":"Zhao Zihan","year":"2024","unstructured":"Zihan Zhao, Yiyang Jiang, Heyang Liu, Yu Wang, and Yanfeng Wang. 2024. LibriSQA: A Novel Dataset and Framework for Spoken Question Answering with Large Language Models. IEEE Transactions on Artificial Intelligence (2024)."}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761264","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T00:03:58Z","timestamp":1765497838000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761264"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":70,"alternative-id":["10.1145\/3746252.3761264","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761264","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}